3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "simple_idct.h"
40 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
43 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
46 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
49 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
52 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
55 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
57 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
58 uint32_t ff_squareTbl[512] = {0, };
60 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
61 #define pb_7f (~0UL/255 * 0x7f)
62 #define pb_80 (~0UL/255 * 0x80)
64 const uint8_t ff_zigzag_direct[64] = {
65 0, 1, 8, 16, 9, 2, 3, 10,
66 17, 24, 32, 25, 18, 11, 4, 5,
67 12, 19, 26, 33, 40, 48, 41, 34,
68 27, 20, 13, 6, 7, 14, 21, 28,
69 35, 42, 49, 56, 57, 50, 43, 36,
70 29, 22, 15, 23, 30, 37, 44, 51,
71 58, 59, 52, 45, 38, 31, 39, 46,
72 53, 60, 61, 54, 47, 55, 62, 63
75 /* Specific zigzag scan for 248 idct. NOTE that unlike the
76 specification, we interleave the fields */
77 const uint8_t ff_zigzag248_direct[64] = {
78 0, 8, 1, 9, 16, 24, 2, 10,
79 17, 25, 32, 40, 48, 56, 33, 41,
80 18, 26, 3, 11, 4, 12, 19, 27,
81 34, 42, 49, 57, 50, 58, 35, 43,
82 20, 28, 5, 13, 6, 14, 21, 29,
83 36, 44, 51, 59, 52, 60, 37, 45,
84 22, 30, 7, 15, 23, 31, 38, 46,
85 53, 61, 54, 62, 39, 47, 55, 63,
88 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
89 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
91 const uint8_t ff_alternate_horizontal_scan[64] = {
92 0, 1, 2, 3, 8, 9, 16, 17,
93 10, 11, 4, 5, 6, 7, 15, 14,
94 13, 12, 19, 18, 24, 25, 32, 33,
95 26, 27, 20, 21, 22, 23, 28, 29,
96 30, 31, 34, 35, 40, 41, 48, 49,
97 42, 43, 36, 37, 38, 39, 44, 45,
98 46, 47, 50, 51, 56, 57, 58, 59,
99 52, 53, 54, 55, 60, 61, 62, 63,
102 const uint8_t ff_alternate_vertical_scan[64] = {
103 0, 8, 16, 24, 1, 9, 2, 10,
104 17, 25, 32, 40, 48, 56, 57, 49,
105 41, 33, 26, 18, 3, 11, 4, 12,
106 19, 27, 34, 42, 50, 58, 35, 43,
107 51, 59, 20, 28, 5, 13, 6, 14,
108 21, 29, 36, 44, 52, 60, 37, 45,
109 53, 61, 22, 30, 7, 15, 23, 31,
110 38, 46, 54, 62, 39, 47, 55, 63,
113 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
114 const uint32_t ff_inverse[256]={
115 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
116 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
117 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
118 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
119 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
120 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
121 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
122 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
123 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
124 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
125 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
126 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
127 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
128 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
129 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
130 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
131 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
132 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
133 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
134 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
135 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
136 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
137 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
138 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
139 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
140 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
141 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
142 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
143 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
144 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
145 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
146 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
149 /* Input permutation for the simple_idct_mmx */
150 static const uint8_t simple_mmx_permutation[64]={
151 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
152 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
153 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
154 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
155 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
156 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
157 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
158 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
161 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/*
 * Fills a ScanTable from a source scan order and a coefficient permutation.
 * Only part of the body is visible in this view; the notes below describe
 * the visible statements only.
 *   permutation   - maps a raster position to its permuted position
 *   st            - table being filled (scantable, permutated, raster_end)
 *   src_scantable - source scan order; the pointer is stored, not copied
 */
163 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
/* keep a reference to the unpermuted scan order */
167 st->scantable= src_scantable;
/* permutated[i] is scan position i routed through the permutation */
171 j = src_scantable[i];
172 st->permutated[i] = permutation[j];
/* the permuted position is read back; `end` is maintained on lines not visible here */
181 j = st->permutated[i];
183 st->raster_end[i]= end;
/*
 * Walks a block of 16 rows by 16 columns of pixels. The name suggests the
 * result is the sum of the pixels; the accumulation statements are not
 * visible in this view.
 *   pix       - first pixel of the block
 *   line_size - used as the step from one row to the next (see row advance below)
 */
187 static int pix_sum_c(uint8_t * pix, int line_size)
/* 16 rows */
192 for (i = 0; i < 16; i++) {
/* 16 columns, taken 8 at a time */
193 for (j = 0; j < 16; j += 8) {
/* move to the next row; the "- 16" presumably undoes the advance made while walking the row (not shown) */
204 pix += line_size - 16;
/*
 * Walks a 16x16 pixel block and accumulates sq[] of each pixel byte into s.
 * Pixels are fetched a machine word at a time and split into bytes by
 * shifting and masking.
 *   pix       - first pixel of the block
 *   line_size - used as the step from one row to the next
 */
209 static int pix_norm1_c(uint8_t * pix, int line_size)
/* sq points at entry 256 of the 512-entry ff_squareTbl */
212 uint32_t *sq = ff_squareTbl + 256;
215 for (i = 0; i < 16; i++) {
216 for (j = 0; j < 16; j += 8) {
/* long wider than 32 bits: fetch all 8 pixels with one 64-bit load */
/* NOTE(review): pix is read through a uint64_t pointer; this assumes the access is acceptable for alignment/aliasing on the target - confirm */
227 #if LONG_MAX > 2147483647
228 register uint64_t x=*(uint64_t*)pix;
230 s += sq[(x>>8)&0xff];
231 s += sq[(x>>16)&0xff];
232 s += sq[(x>>24)&0xff];
233 s += sq[(x>>32)&0xff];
234 s += sq[(x>>40)&0xff];
235 s += sq[(x>>48)&0xff];
236 s += sq[(x>>56)&0xff];
/* otherwise: two 32-bit loads, 4 pixels each */
238 register uint32_t x=*(uint32_t*)pix;
240 s += sq[(x>>8)&0xff];
241 s += sq[(x>>16)&0xff];
242 s += sq[(x>>24)&0xff];
243 x=*(uint32_t*)(pix+4);
245 s += sq[(x>>8)&0xff];
246 s += sq[(x>>16)&0xff];
247 s += sq[(x>>24)&0xff];
/* move to the next row; presumably 16 bytes were consumed on this one (advance not shown) */
252 pix += line_size - 16;
/*
 * Writes bswap_32() of each 32-bit word of src into dst.
 *   dst - destination words
 *   src - source words
 *   w   - bounds the index i in the loop below, i.e. a count of 32-bit words
 */
257 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
/* main loop: 8 words per iteration while at least 8 remain */
260 for(i=0; i+8<=w; i+=8){
261 dst[i+0]= bswap_32(src[i+0]);
262 dst[i+1]= bswap_32(src[i+1]);
263 dst[i+2]= bswap_32(src[i+2]);
264 dst[i+3]= bswap_32(src[i+3]);
265 dst[i+4]= bswap_32(src[i+4]);
266 dst[i+5]= bswap_32(src[i+5]);
267 dst[i+6]= bswap_32(src[i+6]);
268 dst[i+7]= bswap_32(src[i+7]);
/* single-word form, presumably the tail loop for the remaining words (loop header not visible) */
271 dst[i+0]= bswap_32(src[i+0]);
/*
 * Accumulates sq[pix1[k] - pix2[k]] for the first 4 pixels of each of h rows.
 *   v         - not referenced by the visible statements
 *   pix1/pix2 - the two pixel rows being compared
 *   line_size - not used on the visible lines; presumably the row stride
 *   h         - number of rows
 */
275 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
/* bias by 256 so differences in -255..255 index inside the 512-entry ff_squareTbl */
278 uint32_t *sq = ff_squareTbl + 256;
281 for (i = 0; i < h; i++) {
282 s += sq[pix1[0] - pix2[0]];
283 s += sq[pix1[1] - pix2[1]];
284 s += sq[pix1[2] - pix2[2]];
285 s += sq[pix1[3] - pix2[3]];
/*
 * Accumulates sq[pix1[k] - pix2[k]] for the first 8 pixels of each of h rows.
 *   v         - not referenced by the visible statements
 *   pix1/pix2 - the two pixel rows being compared
 *   line_size - not used on the visible lines; presumably the row stride
 *   h         - number of rows
 */
292 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
/* bias by 256 so differences in -255..255 index inside the 512-entry ff_squareTbl */
295 uint32_t *sq = ff_squareTbl + 256;
298 for (i = 0; i < h; i++) {
299 s += sq[pix1[0] - pix2[0]];
300 s += sq[pix1[1] - pix2[1]];
301 s += sq[pix1[2] - pix2[2]];
302 s += sq[pix1[3] - pix2[3]];
303 s += sq[pix1[4] - pix2[4]];
304 s += sq[pix1[5] - pix2[5]];
305 s += sq[pix1[6] - pix2[6]];
306 s += sq[pix1[7] - pix2[7]];
/*
 * Accumulates sq[pix1[k] - pix2[k]] for the first 16 pixels of each of h rows.
 *   v         - not referenced by the visible statements
 *   pix1/pix2 - the two pixel rows being compared
 *   line_size - not used on the visible lines; presumably the row stride
 *   h         - number of rows
 */
313 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
/* bias by 256 so differences in -255..255 index inside the 512-entry ff_squareTbl */
316 uint32_t *sq = ff_squareTbl + 256;
319 for (i = 0; i < h; i++) {
320 s += sq[pix1[ 0] - pix2[ 0]];
321 s += sq[pix1[ 1] - pix2[ 1]];
322 s += sq[pix1[ 2] - pix2[ 2]];
323 s += sq[pix1[ 3] - pix2[ 3]];
324 s += sq[pix1[ 4] - pix2[ 4]];
325 s += sq[pix1[ 5] - pix2[ 5]];
326 s += sq[pix1[ 6] - pix2[ 6]];
327 s += sq[pix1[ 7] - pix2[ 7]];
328 s += sq[pix1[ 8] - pix2[ 8]];
329 s += sq[pix1[ 9] - pix2[ 9]];
330 s += sq[pix1[10] - pix2[10]];
331 s += sq[pix1[11] - pix2[11]];
332 s += sq[pix1[12] - pix2[12]];
333 s += sq[pix1[13] - pix2[13]];
334 s += sq[pix1[14] - pix2[14]];
335 s += sq[pix1[15] - pix2[15]];
344 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
/*
 * Wavelet-domain comparison of two pixel blocks: the scaled pixel difference
 * is transformed with ff_spatial_dwt() and each coefficient is multiplied by
 * a per-subband weight. How the weighted values are combined into the return
 * value is not visible in this view.
 *   v         - not referenced by the visible statements (and shadowed below)
 *   pix1/pix2 - the two blocks
 *   line_size - not used on the visible lines; presumably the row stride
 *   w, h      - block width and height
 *   type      - first index into scale[]; also passed on to ff_spatial_dwt()
 */
345 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
/* 3 decomposition levels for 8-wide blocks, 4 otherwise */
347 const int dec_count= w==8 ? 3 : 4;
/* weights, indexed below as scale[type][dec_count-3][level][ori] */
350 static const int scale[2][2][4][4]={
354 {268, 239, 239, 213},
358 // 9/7 16x16 or 32x32 dec=4
359 {344, 310, 310, 280},
367 {275, 245, 245, 218},
371 // 5/3 16x16 or 32x32 dec=4
372 {352, 317, 317, 286},
/* tmp holds the difference scaled by 16, with a fixed row stride of 32 ints */
/* NOTE(review): the difference can be negative and left-shifting a negative int is undefined in ISO C - confirm the supported compilers tolerate it */
380 for (i = 0; i < h; i++) {
381 for (j = 0; j < w; j+=4) {
382 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
383 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
384 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
385 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
/* in-place transform of tmp (stride 32) */
391 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* visit every subband: orientation 0 only at level 0, orientations 1..3 at every level */
395 for(level=0; level<dec_count; level++){
396 for(ori= level ? 1 : 0; ori<4; ori++){
/* subband edge length, and the row step between its coefficients inside tmp */
397 int size= w>>(dec_count-level);
398 int sx= (ori&1) ? size : 0;
399 int stride= 32<<(dec_count-level);
400 int sy= (ori&2) ? stride>>1 : 0;
402 for(i=0; i<size; i++){
403 for(j=0; j<size; j++){
/* NOTE(review): this local `v` shadows the function parameter `void *v` */
404 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/*
 * Fixed-size entry points for w_c(). Each one forwards its arguments and adds
 * the block width (8, 16 or 32) and the type: the w53_* functions pass
 * type 1 and the w97_* functions pass type 0.
 */
414 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
415 return w_c(v, pix1, pix2, line_size, 8, h, 1);
418 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
419 return w_c(v, pix1, pix2, line_size, 8, h, 0);
422 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
423 return w_c(v, pix1, pix2, line_size, 16, h, 1);
426 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
427 return w_c(v, pix1, pix2, line_size, 16, h, 0);
/* the two 32-wide variants have external linkage, unlike the static ones above */
430 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
431 return w_c(v, pix1, pix2, line_size, 32, h, 1);
434 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
435 return w_c(v, pix1, pix2, line_size, 32, h, 0);
439 /* draw the edges of width 'w' of an image of size width, height */
440 //FIXME check that this is ok for mpeg4 interlaced
/*
 * Pads an image by replicating its border pixels outward.
 *   buf    - top-left pixel of the image; memory around the image is written
 *   wrap   - step in bytes from one row to the next
 *   width  - image width in pixels
 *   height - image height in rows
 *   w      - number of pixels filled to the left and right of each row
 */
441 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
443 uint8_t *ptr, *last_line;
446 last_line = buf + (height - 1) * wrap;
/* rows above the image copy the first row; rows below copy the last row */
449 memcpy(buf - (i + 1) * wrap, buf, width);
450 memcpy(last_line + (i + 1) * wrap, last_line, width);
/* each row: extend its first pixel to the left and its last pixel to the right */
454 for(i=0;i<height;i++) {
455 memset(ptr - w, ptr[0], w);
456 memset(ptr + width, ptr[width-1], w);
/* corners: fill with the nearest corner pixel of the image */
461 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
462 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
463 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
464 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
469 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
470 * @param buf destination buffer
471 * @param src source buffer
472 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
473 * @param block_w width of block
474 * @param block_h height of block
475 * @param src_x x coordinate of the top left sample of the block in the source buffer
476 * @param src_y y coordinate of the top left sample of the block in the source buffer
477 * @param w width of the source buffer
478 * @param h height of the source buffer
480 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
481 int src_x, int src_y, int w, int h){
/* [start, end) is the part of the block that overlaps the source picture */
483 int start_y, start_x, end_y, end_x;
/* vertical adjustment of src; the condition guarding this first case is not visible here */
486 src+= (h-1-src_y)*linesize;
/* block lies entirely above the picture: shift src down */
488 }else if(src_y<=-block_h){
489 src+= (1-block_h-src_y)*linesize;
/* block lies entirely left of the picture: shift src right */
495 }else if(src_x<=-block_w){
496 src+= (1-block_w-src_x);
/* clip the block rectangle against the picture bounds */
500 start_y= FFMAX(0, -src_y);
501 start_x= FFMAX(0, -src_x);
502 end_y= FFMIN(block_h, h-src_y);
503 end_x= FFMIN(block_w, w-src_x);
505 // copy existing part
506 for(y=start_y; y<end_y; y++){
507 for(x=start_x; x<end_x; x++){
508 buf[x + y*linesize]= src[x + y*linesize];
/* rows above the copied area repeat its first row */
513 for(y=0; y<start_y; y++){
514 for(x=start_x; x<end_x; x++){
515 buf[x + y*linesize]= buf[x + start_y*linesize];
/* rows below the copied area repeat its last row */
520 for(y=end_y; y<block_h; y++){
521 for(x=start_x; x<end_x; x++){
522 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
/* finally, every row is extended sideways */
526 for(y=0; y<block_h; y++){
/* columns left of the copied area repeat its first column */
528 for(x=0; x<start_x; x++){
529 buf[x + y*linesize]= buf[start_x + y*linesize];
/* columns right of the copied area repeat its last column */
533 for(x=end_x; x<block_w; x++){
534 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
/*
 * Copies 8 pixels per row from pixels into block, widening each uint8_t to
 * DCTELEM. The row loop and pointer advances are not visible in this view.
 *   block     - destination coefficients
 *   pixels    - source pixels
 *   line_size - not used on the visible lines; presumably the source row stride
 */
539 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
543 /* read the pixels */
545 block[0] = pixels[0];
546 block[1] = pixels[1];
547 block[2] = pixels[2];
548 block[3] = pixels[3];
549 block[4] = pixels[4];
550 block[5] = pixels[5];
551 block[6] = pixels[6];
552 block[7] = pixels[7];
/*
 * Stores s1[k] - s2[k] into block for 8 pixels per row. The row loop and
 * pointer advances are not visible in this view.
 *   block  - destination; receives the signed differences
 *   s1, s2 - the two pixel sources
 *   stride - not used on the visible lines; presumably the row stride of s1 and s2
 */
558 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
559 const uint8_t *s2, int stride){
562 /* store the per-pixel difference s1 - s2 */
564 block[0] = s1[0] - s2[0];
565 block[1] = s1[1] - s2[1];
566 block[2] = s1[2] - s2[2];
567 block[3] = s1[3] - s2[3];
568 block[4] = s1[4] - s2[4];
569 block[5] = s1[5] - s2[5];
570 block[6] = s1[6] - s2[6];
571 block[7] = s1[7] - s2[7];
/*
 * Writes 8 coefficients per row from block into pixels, passing each value
 * through the crop table. The rest of the parameter list, the row loop and
 * the pointer advances are not visible in this view.
 */
579 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
/* cm is offset by MAX_NEG_CROP so that negative coefficients index inside ff_cropTbl */
583 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
585 /* write the pixels: each coefficient is mapped through the crop table */
587 pixels[0] = cm[block[0]];
588 pixels[1] = cm[block[1]];
589 pixels[2] = cm[block[2]];
590 pixels[3] = cm[block[3]];
591 pixels[4] = cm[block[4]];
592 pixels[5] = cm[block[5]];
593 pixels[6] = cm[block[6]];
594 pixels[7] = cm[block[7]];
/*
 * 4-pixel-wide variant: writes 4 coefficients per row from block into pixels,
 * passing each value through the crop table. The rest of the parameter list
 * and the row loop are not visible in this view.
 */
601 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
/* cm is offset by MAX_NEG_CROP so that negative coefficients index inside ff_cropTbl */
605 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
607 /* write the pixels: each coefficient is mapped through the crop table */
609 pixels[0] = cm[block[0]];
610 pixels[1] = cm[block[1]];
611 pixels[2] = cm[block[2]];
612 pixels[3] = cm[block[3]];
/*
 * 2-pixel-wide variant: writes 2 coefficients per row from block into pixels,
 * passing each value through the crop table. The rest of the parameter list
 * and the row loop are not visible in this view.
 */
619 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
/* cm is offset by MAX_NEG_CROP so that negative coefficients index inside ff_cropTbl */
623 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
625 /* write the pixels: each coefficient is mapped through the crop table */
627 pixels[0] = cm[block[0]];
628 pixels[1] = cm[block[1]];
/*
 * Converts an 8x8 block of signed coefficients to unsigned pixels by adding a
 * bias of 128. Out-of-range values take separate branches whose assignments
 * are not visible in this view.
 */
635 static void put_signed_pixels_clamped_c(const DCTELEM *block,
636 uint8_t *restrict pixels,
641 for (i = 0; i < 8; i++) {
642 for (j = 0; j < 8; j++) {
/* upper range check; the matching lower check and both stores are not shown */
645 else if (*block > 127)
/* in-range value: shift by +128 into the uint8_t range */
648 *pixels = (uint8_t)(*block + 128);
/* move to the next row; presumably 8 pixels were consumed on this one (advance not shown) */
652 pixels += (line_size - 8);
/*
 * Adds 8 coefficients per row from block to the existing pixels and writes
 * the sum back through the crop table. The rest of the parameter list, the
 * row loop and the pointer advances are not visible in this view.
 */
656 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
/* cm is offset by MAX_NEG_CROP so that negative sums index inside ff_cropTbl */
660 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
662 /* update the pixels in place: pixel + coefficient, mapped through the crop table */
664 pixels[0] = cm[pixels[0] + block[0]];
665 pixels[1] = cm[pixels[1] + block[1]];
666 pixels[2] = cm[pixels[2] + block[2]];
667 pixels[3] = cm[pixels[3] + block[3]];
668 pixels[4] = cm[pixels[4] + block[4]];
669 pixels[5] = cm[pixels[5] + block[5]];
670 pixels[6] = cm[pixels[6] + block[6]];
671 pixels[7] = cm[pixels[7] + block[7]];
/*
 * 4-pixel-wide variant: adds 4 coefficients per row from block to the
 * existing pixels and writes the sum back through the crop table. The rest of
 * the parameter list and the row loop are not visible in this view.
 */
677 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
/* cm is offset by MAX_NEG_CROP so that negative sums index inside ff_cropTbl */
681 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
683 /* update the pixels in place: pixel + coefficient, mapped through the crop table */
685 pixels[0] = cm[pixels[0] + block[0]];
686 pixels[1] = cm[pixels[1] + block[1]];
687 pixels[2] = cm[pixels[2] + block[2]];
688 pixels[3] = cm[pixels[3] + block[3]];
/*
 * 2-pixel-wide variant: adds 2 coefficients per row from block to the
 * existing pixels and writes the sum back through the crop table. The rest of
 * the parameter list and the row loop are not visible in this view.
 */
694 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
/* cm is offset by MAX_NEG_CROP so that negative sums index inside ff_cropTbl */
698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
700 /* update the pixels in place: pixel + coefficient, mapped through the crop table */
702 pixels[0] = cm[pixels[0] + block[0]];
703 pixels[1] = cm[pixels[1] + block[1]];
/*
 * Adds 8 coefficients per row from block to pixels in place. No crop table is
 * used here: the sum is converted straight back to uint8_t by the assignment.
 * The row loop and pointer advances are not visible in this view.
 *   pixels    - pixels updated in place
 *   block     - coefficients to add
 *   line_size - not used on the visible lines; presumably the row stride of pixels
 */
709 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
713 pixels[0] += block[0];
714 pixels[1] += block[1];
715 pixels[2] += block[2];
716 pixels[3] += block[3];
717 pixels[4] += block[4];
718 pixels[5] += block[5];
719 pixels[6] += block[6];
720 pixels[7] += block[7];
/*
 * 4-pixel-wide variant: adds 4 coefficients per row from block to pixels in
 * place. No crop table is used here: the sum is converted straight back to
 * uint8_t by the assignment. The row loop is not visible in this view.
 *   pixels    - pixels updated in place
 *   block     - coefficients to add
 *   line_size - not used on the visible lines; presumably the row stride of pixels
 */
726 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
730 pixels[0] += block[0];
731 pixels[1] += block[1];
732 pixels[2] += block[2];
733 pixels[3] += block[3];
/*
 * Accumulates the absolute values of the coefficients in block into sum.
 * The loop bounds and the return statement are not visible in this view.
 */
739 static int sum_abs_dctelem_c(DCTELEM *block)
743 sum+= FFABS(block[i]);
749 #define PIXOP2(OPNAME, OP) \
750 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
754 OP(*((uint64_t*)block), AV_RN64(pixels));\
760 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
764 const uint64_t a= AV_RN64(pixels );\
765 const uint64_t b= AV_RN64(pixels+1);\
766 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
772 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
776 const uint64_t a= AV_RN64(pixels );\
777 const uint64_t b= AV_RN64(pixels+1);\
778 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
784 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
788 const uint64_t a= AV_RN64(pixels );\
789 const uint64_t b= AV_RN64(pixels+line_size);\
790 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
796 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
800 const uint64_t a= AV_RN64(pixels );\
801 const uint64_t b= AV_RN64(pixels+line_size);\
802 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
808 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
811 const uint64_t a= AV_RN64(pixels );\
812 const uint64_t b= AV_RN64(pixels+1);\
813 uint64_t l0= (a&0x0303030303030303ULL)\
814 + (b&0x0303030303030303ULL)\
815 + 0x0202020202020202ULL;\
816 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
817 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
821 for(i=0; i<h; i+=2){\
822 uint64_t a= AV_RN64(pixels );\
823 uint64_t b= AV_RN64(pixels+1);\
824 l1= (a&0x0303030303030303ULL)\
825 + (b&0x0303030303030303ULL);\
826 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
827 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
828 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
831 a= AV_RN64(pixels );\
832 b= AV_RN64(pixels+1);\
833 l0= (a&0x0303030303030303ULL)\
834 + (b&0x0303030303030303ULL)\
835 + 0x0202020202020202ULL;\
836 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
837 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
838 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
844 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
847 const uint64_t a= AV_RN64(pixels );\
848 const uint64_t b= AV_RN64(pixels+1);\
849 uint64_t l0= (a&0x0303030303030303ULL)\
850 + (b&0x0303030303030303ULL)\
851 + 0x0101010101010101ULL;\
852 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
853 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
857 for(i=0; i<h; i+=2){\
858 uint64_t a= AV_RN64(pixels );\
859 uint64_t b= AV_RN64(pixels+1);\
860 l1= (a&0x0303030303030303ULL)\
861 + (b&0x0303030303030303ULL);\
862 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
863 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
864 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
867 a= AV_RN64(pixels );\
868 b= AV_RN64(pixels+1);\
869 l0= (a&0x0303030303030303ULL)\
870 + (b&0x0303030303030303ULL)\
871 + 0x0101010101010101ULL;\
872 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
873 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
874 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
880 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
881 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
882 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
883 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
884 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
885 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
886 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* a = per-byte average of a and b, rounded up; the 0xFE mask stops the shift from carrying bits between byte lanes */
888 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
889 #else // end of the 64 bit variant above (uint64_t / AV_RN64); the 16/32 bit code follows
891 #define PIXOP2(OPNAME, OP) \
892 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
895 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
900 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
903 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
908 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
911 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
912 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
917 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
918 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
921 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
922 int src_stride1, int src_stride2, int h){\
926 a= AV_RN32(&src1[i*src_stride1 ]);\
927 b= AV_RN32(&src2[i*src_stride2 ]);\
928 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
929 a= AV_RN32(&src1[i*src_stride1+4]);\
930 b= AV_RN32(&src2[i*src_stride2+4]);\
931 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
935 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
936 int src_stride1, int src_stride2, int h){\
940 a= AV_RN32(&src1[i*src_stride1 ]);\
941 b= AV_RN32(&src2[i*src_stride2 ]);\
942 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
943 a= AV_RN32(&src1[i*src_stride1+4]);\
944 b= AV_RN32(&src2[i*src_stride2+4]);\
945 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
949 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
950 int src_stride1, int src_stride2, int h){\
954 a= AV_RN32(&src1[i*src_stride1 ]);\
955 b= AV_RN32(&src2[i*src_stride2 ]);\
956 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
960 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
961 int src_stride1, int src_stride2, int h){\
965 a= AV_RN16(&src1[i*src_stride1 ]);\
966 b= AV_RN16(&src2[i*src_stride2 ]);\
967 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
971 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
972 int src_stride1, int src_stride2, int h){\
973 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
974 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
977 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
978 int src_stride1, int src_stride2, int h){\
979 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
980 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
983 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
984 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
987 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
988 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
991 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
992 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
995 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
996 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
999 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1000 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1002 for(i=0; i<h; i++){\
1003 uint32_t a, b, c, d, l0, l1, h0, h1;\
1004 a= AV_RN32(&src1[i*src_stride1]);\
1005 b= AV_RN32(&src2[i*src_stride2]);\
1006 c= AV_RN32(&src3[i*src_stride3]);\
1007 d= AV_RN32(&src4[i*src_stride4]);\
1008 l0= (a&0x03030303UL)\
1011 h0= ((a&0xFCFCFCFCUL)>>2)\
1012 + ((b&0xFCFCFCFCUL)>>2);\
1013 l1= (c&0x03030303UL)\
1014 + (d&0x03030303UL);\
1015 h1= ((c&0xFCFCFCFCUL)>>2)\
1016 + ((d&0xFCFCFCFCUL)>>2);\
1017 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1018 a= AV_RN32(&src1[i*src_stride1+4]);\
1019 b= AV_RN32(&src2[i*src_stride2+4]);\
1020 c= AV_RN32(&src3[i*src_stride3+4]);\
1021 d= AV_RN32(&src4[i*src_stride4+4]);\
1022 l0= (a&0x03030303UL)\
1025 h0= ((a&0xFCFCFCFCUL)>>2)\
1026 + ((b&0xFCFCFCFCUL)>>2);\
1027 l1= (c&0x03030303UL)\
1028 + (d&0x03030303UL);\
1029 h1= ((c&0xFCFCFCFCUL)>>2)\
1030 + ((d&0xFCFCFCFCUL)>>2);\
1031 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1035 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1036 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1039 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1040 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1043 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1044 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1047 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1048 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1051 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1052 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1054 for(i=0; i<h; i++){\
1055 uint32_t a, b, c, d, l0, l1, h0, h1;\
1056 a= AV_RN32(&src1[i*src_stride1]);\
1057 b= AV_RN32(&src2[i*src_stride2]);\
1058 c= AV_RN32(&src3[i*src_stride3]);\
1059 d= AV_RN32(&src4[i*src_stride4]);\
1060 l0= (a&0x03030303UL)\
1063 h0= ((a&0xFCFCFCFCUL)>>2)\
1064 + ((b&0xFCFCFCFCUL)>>2);\
1065 l1= (c&0x03030303UL)\
1066 + (d&0x03030303UL);\
1067 h1= ((c&0xFCFCFCFCUL)>>2)\
1068 + ((d&0xFCFCFCFCUL)>>2);\
1069 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1070 a= AV_RN32(&src1[i*src_stride1+4]);\
1071 b= AV_RN32(&src2[i*src_stride2+4]);\
1072 c= AV_RN32(&src3[i*src_stride3+4]);\
1073 d= AV_RN32(&src4[i*src_stride4+4]);\
1074 l0= (a&0x03030303UL)\
1077 h0= ((a&0xFCFCFCFCUL)>>2)\
1078 + ((b&0xFCFCFCFCUL)>>2);\
1079 l1= (c&0x03030303UL)\
1080 + (d&0x03030303UL);\
1081 h1= ((c&0xFCFCFCFCUL)>>2)\
1082 + ((d&0xFCFCFCFCUL)>>2);\
1083 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1086 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1087 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1088 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1089 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1091 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1092 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1093 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1094 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1097 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1099 int i, a0, b0, a1, b1;\
1106 for(i=0; i<h; i+=2){\
1112 block[0]= (a1+a0)>>2; /* FIXME non put */\
1113 block[1]= (b1+b0)>>2;\
1123 block[0]= (a1+a0)>>2;\
1124 block[1]= (b1+b0)>>2;\
1130 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1133 const uint32_t a= AV_RN32(pixels );\
1134 const uint32_t b= AV_RN32(pixels+1);\
1135 uint32_t l0= (a&0x03030303UL)\
1138 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1139 + ((b&0xFCFCFCFCUL)>>2);\
1143 for(i=0; i<h; i+=2){\
1144 uint32_t a= AV_RN32(pixels );\
1145 uint32_t b= AV_RN32(pixels+1);\
1146 l1= (a&0x03030303UL)\
1147 + (b&0x03030303UL);\
1148 h1= ((a&0xFCFCFCFCUL)>>2)\
1149 + ((b&0xFCFCFCFCUL)>>2);\
1150 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1153 a= AV_RN32(pixels );\
1154 b= AV_RN32(pixels+1);\
1155 l0= (a&0x03030303UL)\
1158 h0= ((a&0xFCFCFCFCUL)>>2)\
1159 + ((b&0xFCFCFCFCUL)>>2);\
1160 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1166 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1169 for(j=0; j<2; j++){\
1171 const uint32_t a= AV_RN32(pixels );\
1172 const uint32_t b= AV_RN32(pixels+1);\
1173 uint32_t l0= (a&0x03030303UL)\
1176 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1177 + ((b&0xFCFCFCFCUL)>>2);\
1181 for(i=0; i<h; i+=2){\
1182 uint32_t a= AV_RN32(pixels );\
1183 uint32_t b= AV_RN32(pixels+1);\
1184 l1= (a&0x03030303UL)\
1185 + (b&0x03030303UL);\
1186 h1= ((a&0xFCFCFCFCUL)>>2)\
1187 + ((b&0xFCFCFCFCUL)>>2);\
1188 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1191 a= AV_RN32(pixels );\
1192 b= AV_RN32(pixels+1);\
1193 l0= (a&0x03030303UL)\
1196 h0= ((a&0xFCFCFCFCUL)>>2)\
1197 + ((b&0xFCFCFCFCUL)>>2);\
1198 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1202 pixels+=4-line_size*(h+1);\
1203 block +=4-line_size*h;\
1207 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1210 for(j=0; j<2; j++){\
1212 const uint32_t a= AV_RN32(pixels );\
1213 const uint32_t b= AV_RN32(pixels+1);\
1214 uint32_t l0= (a&0x03030303UL)\
1217 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1218 + ((b&0xFCFCFCFCUL)>>2);\
1222 for(i=0; i<h; i+=2){\
1223 uint32_t a= AV_RN32(pixels );\
1224 uint32_t b= AV_RN32(pixels+1);\
1225 l1= (a&0x03030303UL)\
1226 + (b&0x03030303UL);\
1227 h1= ((a&0xFCFCFCFCUL)>>2)\
1228 + ((b&0xFCFCFCFCUL)>>2);\
1229 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1232 a= AV_RN32(pixels );\
1233 b= AV_RN32(pixels+1);\
1234 l0= (a&0x03030303UL)\
1237 h0= ((a&0xFCFCFCFCUL)>>2)\
1238 + ((b&0xFCFCFCFCUL)>>2);\
1239 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1243 pixels+=4-line_size*(h+1);\
1244 block +=4-line_size*h;\
1248 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1249 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1250 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1252 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1254 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Read-modify-write store: `dst` becomes rnd_avg32(dst, val). */
#define op_avg(dst, val) dst = rnd_avg32(dst, val)
/* Plain store: `dst` is overwritten with `val`. */
#define op_put(dst, val) dst = val
/*
 * Rounded integer means: avg2 is (a + b + 1) / 2, avg4 is (a + b + c + d + 2) / 4,
 * both computed with a right shift.
 * Every argument is parenthesized so that expression arguments containing
 * operators of lower precedence than '+' (|, &, ^, ?:, ...) group as the
 * caller wrote them. Each argument is evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * Single-stride adapter around put_no_rnd_pixels16_l2(): the one stride given
 * here is forwarded for all three stride arguments of the wrapped function.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int common_stride = stride;

    put_no_rnd_pixels16_l2(dst, a, b, common_stride, common_stride, common_stride, h);
}
/**
 * Single-stride adapter around put_no_rnd_pixels8_l2(): the one stride given
 * here is forwarded for all three stride arguments of the wrapped function.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    const int common_stride = stride;

    put_no_rnd_pixels8_l2(dst, a, b, common_stride, common_stride, common_stride, h);
}
/**
 * Bilinear interpolation of an 8-pixel-wide block with 1/16 sample weights.
 *
 * Each output sample is a weighted sum of the source sample, its right
 * neighbour and the two samples one line below, plus `rounder`, shifted
 * right by 8 (the four weights always add up to 256).
 *
 * @param dst     destination; 8 bytes are written per line
 * @param src     source; 9 bytes are read from each of h+1 lines
 * @param stride  distance in bytes between lines, used for both dst and src
 * @param h       number of lines to produce
 * @param x16     horizontal weight of the right-hand samples, in 1/16 units
 * @param y16     vertical weight of the lower samples, in 1/16 units
 * @param rounder constant added before the final shift
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int w_top_left  = (16 - x16) * (16 - y16);
    const int w_top_right =       x16  * (16 - y16);
    const int w_bot_left  = (16 - x16) *       y16;
    const int w_bot_right =       x16  *       y16;
    int row, col;

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (w_top_left * src[col]   + w_top_right * src[col + 1] +
                        w_bot_left * below[col] + w_bot_right * below[col + 1] + rounder) >> 8;
        }
        dst += stride;
        src += stride;
    }
}
/**
 * Motion compensation of an 8-pixel-wide block along an affine trajectory.
 *
 * For every output pixel a source position is tracked in fixed point: the
 * top 16 bits are dropped first, the next `shift` bits are the sub-sample
 * fraction and the rest is the integer sample coordinate. Positions inside
 * the picture are interpolated bilinearly; a coordinate outside
 * [0, width-1) or [0, height-1) is clamped with av_clip() and the
 * interpolation along that axis is skipped.
 *
 * @param dst      destination; 8 bytes are written per line
 * @param src      source picture
 * @param stride   distance in bytes between lines, used for both dst and src
 * @param h        number of lines to produce
 * @param ox, oy   source position of the first pixel
 * @param dxx, dyx position increment applied per output pixel (x and y part)
 * @param dxy, dyy position increment applied per output line (x and y part)
 * @param shift    number of fractional bits of the sub-sample position
 * @param r        constant added before the final shift by 2*shift
 * @param width    picture width used for the bounds test and clamping
 * @param height   picture height used for the bounds test and clamping
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                  int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    const int s     = 1 << shift;
    const int max_x = width  - 1;
    const int max_y = height - 1;
    int row;

    for (row = 0; row < h; row++) {
        uint8_t *out = dst + row*stride;
        int vx = ox;
        int vy = oy;
        int col;

        for (col = 0; col < 8; col++) { //XXX FIXME optimize
            int src_x = vx >> 16;
            int src_y = vy >> 16;
            const int frac_x = src_x & (s - 1);
            const int frac_y = src_y & (s - 1);
            int x_inside, y_inside, index;

            src_x >>= shift;
            src_y >>= shift;

            /* the unsigned compare rejects negative coordinates as well */
            x_inside = (unsigned)src_x < max_x;
            y_inside = (unsigned)src_y < max_y;

            if (x_inside && y_inside) {
                index = src_x + src_y*stride;
                out[col] = ( ( src[index         ]*(s - frac_x)
                             + src[index       +1]*     frac_x )*(s - frac_y)
                           + ( src[index+stride  ]*(s - frac_x)
                             + src[index+stride+1]*     frac_x )*     frac_y
                           + r) >> (shift*2);
            } else if (x_inside) {
                /* vertically outside: clamp the line, filter horizontally only */
                index = src_x + av_clip(src_y, 0, max_y)*stride;
                out[col] = ( ( src[index  ]*(s - frac_x)
                             + src[index+1]*     frac_x )*s
                           + r) >> (shift*2);
            } else if (y_inside) {
                /* horizontally outside: clamp the column, filter vertically only */
                index = av_clip(src_x, 0, max_x) + src_y*stride;
                out[col] = ( ( src[index       ]*(s - frac_y)
                             + src[index+stride]*     frac_y )*s
                           + r) >> (shift*2);
            } else {
                /* outside on both axes: copy the clamped sample */
                index = av_clip(src_x, 0, max_x) + av_clip(src_y, 0, max_y)*stride;
                out[col] = src[index];
            }

            vx += dxx;
            vy += dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/**
 * Full-sample (no interpolation) case: forwards to the put_pixelsN_c routine
 * matching `width`. Widths other than 2, 4, 8 and 16 do nothing.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
/**
 * Writes (2*cur + right + 1) * 683 >> 11 for every sample of a
 * width x height block; 683/2048 approximates a division by 3.
 * The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
/**
 * Writes (cur + 2*right + 1) * 683 >> 11 for every sample of a
 * width x height block; 683/2048 approximates a division by 3.
 * The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
/**
 * Writes (2*cur + below + 1) * 683 >> 11 for every sample of a
 * width x height block; 683/2048 approximates a division by 3.
 * Reads one line beyond the block. The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + below[col] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
/**
 * Writes (4*cur + 3*right + 3*below + 2*below_right + 6) * 2731 >> 15 for
 * every sample of a width x height block; 2731/32768 approximates a
 * division by 12, the sum of the weights.
 * The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
/**
 * Writes (3*cur + 2*right + 4*below + 3*below_right + 6) * 2731 >> 15 for
 * every sample of a width x height block; 2731/32768 approximates a
 * division by 12, the sum of the weights.
 * The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
/**
 * Writes (cur + 2*below + 1) * 683 >> 11 for every sample of a
 * width x height block; 683/2048 approximates a division by 3.
 * Reads one line beyond the block. The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*below[col] + 1;
            dst[col] = (683*weighted) >> 11;
        }
    }
}
/**
 * Writes (3*cur + 4*right + 2*below + 3*below_right + 6) * 2731 >> 15 for
 * every sample of a width x height block; 2731/32768 approximates a
 * division by 12, the sum of the weights.
 * The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
/**
 * Writes (2*cur + 3*right + 3*below + 4*below_right + 6) * 2731 >> 15 for
 * every sample of a width x height block; 2731/32768 approximates a
 * division by 12, the sum of the weights.
 * The same stride is used for src and dst.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (2731*weighted) >> 15;
        }
    }
}
/**
 * Full-sample (no interpolation) case of the averaging variant: forwards to
 * the avg_pixelsN_c routine matching `width`. Widths other than 2, 4, 8 and
 * 16 do nothing.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
/**
 * Averaging variant: the interpolated value (2*cur + right + 1) * 683 >> 11
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + src[col+1] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
/**
 * Averaging variant: the interpolated value (cur + 2*right + 1) * 683 >> 11
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*src[col+1] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
/**
 * Averaging variant: the interpolated value (2*cur + below + 1) * 683 >> 11
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * Reads one line beyond the block. The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + below[col] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
/**
 * Averaging variant: the interpolated value
 * (4*cur + 3*right + 3*below + 2*below_right + 6) * 2731 >> 15
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 4*src[col] + 3*src[col+1] + 3*below[col] + 2*below[col+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
/**
 * Averaging variant: the interpolated value
 * (3*cur + 2*right + 4*below + 3*below_right + 6) * 2731 >> 15
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 2*src[col+1] + 4*below[col] + 3*below[col+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
/**
 * Averaging variant: the interpolated value (cur + 2*below + 1) * 683 >> 11
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * Reads one line beyond the block. The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = src[col] + 2*below[col] + 1;
            dst[col] = (dst[col] + ((683*weighted) >> 11) + 1) >> 1;
        }
    }
}
/**
 * Averaging variant: the interpolated value
 * (3*cur + 4*right + 2*below + 3*below_right + 6) * 2731 >> 15
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 3*src[col] + 4*src[col+1] + 2*below[col] + 3*below[col+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
/**
 * Averaging variant: the interpolated value
 * (2*cur + 3*right + 3*below + 4*below_right + 6) * 2731 >> 15
 * is combined with the existing destination sample as (dst + value + 1) >> 1.
 * The same stride is used for src and dst.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;

    for (row = 0; row < height; row++, src += stride, dst += stride) {
        const uint8_t *below = src + stride;

        for (col = 0; col < width; col++) {
            const int weighted = 2*src[col] + 3*src[col+1] + 3*below[col] + 4*below[col+1] + 6;
            dst[col] = (dst[col] + ((2731*weighted) >> 15) + 1) >> 1;
        }
    }
}
/*
 * TPEL_WIDTH(width) defines nine fixed-width entry points,
 * put_tpel_pixels<width>_mcXY_c(dst, src, stride, height), each of which
 * forwards to the generic put_tpel_pixels_mcXY_c() with `width` filled in.
 * The bodies are plain calls; a leading `void` in front of the call would
 * turn the statement into an ill-formed declaration and break every
 * expansion of the macro.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/*
 * Chroma motion compensation with 1/8 sample weights.
 *
 * H264_CHROMA_MC_FUNC(OPNAME, OP, W) defines
 *   static void <OPNAME>h264_chroma_mc<W>_c(dst, src, stride, h, x, y)
 * which produces h lines of W samples. x and y must be in 0..7 (asserted).
 * Each result is the weighted sum A*cur + B*right + C*below + D*below_right
 * (weights add up to 64), handed unscaled to OP(destination, sum).
 * When D is zero (x == 0 or y == 0) at most one of B and C is non-zero, so
 * a two-tap form with a single neighbour (right or below) is used instead.
 *
 * H264_CHROMA_MC(OPNAME, OP) defines the 2-, 4- and 8-wide variants.
 */
#define H264_CHROMA_MC_FUNC(OPNAME, OP, W)\
static void OPNAME ## h264_chroma_mc ## W ## _c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int row, col;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(!D){\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(row=0; row<h; row++){\
            for(col=0; col<W; col++){\
                OP(dst[col], (A*src[col] + E*src[step+col]));\
            }\
            dst+= stride;\
            src+= stride;\
        }\
        return;\
    }\
    for(row=0; row<h; row++){\
        for(col=0; col<W; col++){\
            OP(dst[col], (A*src[col] + B*src[col+1] + C*src[stride+col] + D*src[stride+col+1]));\
        }\
        dst+= stride;\
        src+= stride;\
    }\
}

#define H264_CHROMA_MC(OPNAME, OP)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 2)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 4)\
H264_CHROMA_MC_FUNC(OPNAME, OP, 8)

/* Scale the weighted sum back by 64 with rounding; op_avg then takes the rounded mean with the existing destination. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
/**
 * 8-wide chroma motion compensation with 1/8 sample weights and a reduced
 * rounding bias.
 *
 * Each result is (A*cur + B*right + C*below + D*below_right + 32 - 4) >> 6,
 * where the four weights are derived from x and y and add up to 64.
 *
 * @param dst    destination; 8 bytes are written per line
 * @param src    source; 9 bytes are read from each of h+1 lines
 * @param stride distance in bytes between lines, used for both dst and src
 * @param h      number of lines to produce
 * @param x      horizontal weight of the right-hand samples, 0..7 (asserted)
 * @param y      vertical weight of the lower samples, 0..7 (asserted)
 */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    const int bias = 32 - 4;
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        const uint8_t *below = src + stride;

        for (col = 0; col < 8; col++) {
            dst[col] = (A*src[col] + B*src[col+1] + C*below[col] + D*below[col+1] + bias) >> 6;
        }
        dst += stride;
        src += stride;
    }
}
1706 #define QPEL_MC(r, OPNAME, RND, OP) \
1707 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1708 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1712 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1713 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1714 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1715 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1716 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1717 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1718 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1719 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1725 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1727 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1731 const int src0= src[0*srcStride];\
1732 const int src1= src[1*srcStride];\
1733 const int src2= src[2*srcStride];\
1734 const int src3= src[3*srcStride];\
1735 const int src4= src[4*srcStride];\
1736 const int src5= src[5*srcStride];\
1737 const int src6= src[6*srcStride];\
1738 const int src7= src[7*srcStride];\
1739 const int src8= src[8*srcStride];\
1740 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1741 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1742 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1743 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1744 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1745 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1746 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1747 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1753 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1754 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1759 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1760 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1761 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1762 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1763 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1764 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1765 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1766 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1767 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1768 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1769 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1770 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1771 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1772 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1773 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1774 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1780 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1781 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1786 const int src0= src[0*srcStride];\
1787 const int src1= src[1*srcStride];\
1788 const int src2= src[2*srcStride];\
1789 const int src3= src[3*srcStride];\
1790 const int src4= src[4*srcStride];\
1791 const int src5= src[5*srcStride];\
1792 const int src6= src[6*srcStride];\
1793 const int src7= src[7*srcStride];\
1794 const int src8= src[8*srcStride];\
1795 const int src9= src[9*srcStride];\
1796 const int src10= src[10*srcStride];\
1797 const int src11= src[11*srcStride];\
1798 const int src12= src[12*srcStride];\
1799 const int src13= src[13*srcStride];\
1800 const int src14= src[14*srcStride];\
1801 const int src15= src[15*srcStride];\
1802 const int src16= src[16*srcStride];\
1803 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1804 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1805 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1806 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1807 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1808 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1809 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1810 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1811 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1812 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1813 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1814 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1815 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1816 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1817 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1818 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1824 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1825 OPNAME ## pixels8_c(dst, src, stride, 8);\
1828 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1830 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1831 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1834 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1835 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1838 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1840 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1841 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1844 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1845 uint8_t full[16*9];\
1847 copy_block9(full, src, 16, stride, 9);\
1848 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1849 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1852 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1853 uint8_t full[16*9];\
1854 copy_block9(full, src, 16, stride, 9);\
1855 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1858 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1859 uint8_t full[16*9];\
1861 copy_block9(full, src, 16, stride, 9);\
1862 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1863 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1865 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1866 uint8_t full[16*9];\
1869 uint8_t halfHV[64];\
1870 copy_block9(full, src, 16, stride, 9);\
1871 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1873 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1874 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1876 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1877 uint8_t full[16*9];\
1879 uint8_t halfHV[64];\
1880 copy_block9(full, src, 16, stride, 9);\
1881 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1882 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1883 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1884 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1886 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1887 uint8_t full[16*9];\
1890 uint8_t halfHV[64];\
1891 copy_block9(full, src, 16, stride, 9);\
1892 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1894 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1895 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1897 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1898 uint8_t full[16*9];\
1900 uint8_t halfHV[64];\
1901 copy_block9(full, src, 16, stride, 9);\
1902 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1903 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1904 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1905 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1907 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1908 uint8_t full[16*9];\
1911 uint8_t halfHV[64];\
1912 copy_block9(full, src, 16, stride, 9);\
1913 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1915 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1916 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1918 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1919 uint8_t full[16*9];\
1921 uint8_t halfHV[64];\
1922 copy_block9(full, src, 16, stride, 9);\
1923 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1924 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1925 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1926 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1928 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[16*9];\
1932 uint8_t halfHV[64];\
1933 copy_block9(full, src, 16, stride, 9);\
1934 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1936 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1937 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1939 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1940 uint8_t full[16*9];\
1942 uint8_t halfHV[64];\
1943 copy_block9(full, src, 16, stride, 9);\
1944 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1945 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1946 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1947 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1949 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1951 uint8_t halfHV[64];\
1952 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1954 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1956 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1958 uint8_t halfHV[64];\
1959 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1960 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1961 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1963 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1964 uint8_t full[16*9];\
1967 uint8_t halfHV[64];\
1968 copy_block9(full, src, 16, stride, 9);\
1969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1971 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1972 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1974 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1975 uint8_t full[16*9];\
1977 copy_block9(full, src, 16, stride, 9);\
1978 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1979 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1980 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1982 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1983 uint8_t full[16*9];\
1986 uint8_t halfHV[64];\
1987 copy_block9(full, src, 16, stride, 9);\
1988 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1990 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1991 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1993 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1994 uint8_t full[16*9];\
1996 copy_block9(full, src, 16, stride, 9);\
1997 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1998 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1999 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2001 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2003 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2004 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2006 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2007 OPNAME ## pixels16_c(dst, src, stride, 16);\
2010 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2012 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2013 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2016 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2017 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2020 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2022 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2023 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2026 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2027 uint8_t full[24*17];\
2029 copy_block17(full, src, 24, stride, 17);\
2030 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2031 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2034 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2035 uint8_t full[24*17];\
2036 copy_block17(full, src, 24, stride, 17);\
2037 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2040 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2041 uint8_t full[24*17];\
2043 copy_block17(full, src, 24, stride, 17);\
2044 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2045 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2047 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2048 uint8_t full[24*17];\
2049 uint8_t halfH[272];\
2050 uint8_t halfV[256];\
2051 uint8_t halfHV[256];\
2052 copy_block17(full, src, 24, stride, 17);\
2053 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2055 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2056 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2058 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2059 uint8_t full[24*17];\
2060 uint8_t halfH[272];\
2061 uint8_t halfHV[256];\
2062 copy_block17(full, src, 24, stride, 17);\
2063 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2064 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2065 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2066 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2068 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2069 uint8_t full[24*17];\
2070 uint8_t halfH[272];\
2071 uint8_t halfV[256];\
2072 uint8_t halfHV[256];\
2073 copy_block17(full, src, 24, stride, 17);\
2074 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2076 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2077 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2079 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2080 uint8_t full[24*17];\
2081 uint8_t halfH[272];\
2082 uint8_t halfHV[256];\
2083 copy_block17(full, src, 24, stride, 17);\
2084 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2085 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2086 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2087 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2089 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){ /* exported (non-static) mc13 variant: merges four planes with pixels16_l4 */\
2090 uint8_t full[24*17];\
2091 uint8_t halfH[272];\
2092 uint8_t halfV[256];\
2093 uint8_t halfHV[256];\
2094 copy_block17(full, src, 24, stride, 17);\
2095 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2097 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2098 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* full+24 and halfH+16 skip the first 24 / 16 bytes, presumably one row of each buffer - TODO confirm */\
2100 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){ /* mc13 case: same chain as mc11 but the final step reads halfH+16 */\
2101 uint8_t full[24*17];\
2102 uint8_t halfH[272];\
2103 uint8_t halfHV[256];\
2104 copy_block17(full, src, 24, stride, 17);\
2105 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2106 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place on halfH */\
2107 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2108 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips the first 16 bytes of halfH, presumably its first row - TODO confirm */\
2110 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){ /* exported (non-static) mc33 variant: merges four planes with pixels16_l4 */\
2111 uint8_t full[24*17];\
2112 uint8_t halfH[272];\
2113 uint8_t halfV[256];\
2114 uint8_t halfHV[256];\
2115 copy_block17(full, src, 24, stride, 17);\
2116 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* vertical pass reads the copy starting one byte in */\
2118 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2119 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16); /* full+25 = 24+1 bytes in; halfH+16 skips its first 16 bytes */\
2121 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){ /* mc33 case: halfH is combined with full+1 and the final step reads halfH+16 */\
2122 uint8_t full[24*17];\
2123 uint8_t halfH[272];\
2124 uint8_t halfHV[256];\
2125 copy_block17(full, src, 24, stride, 17);\
2126 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2127 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place on halfH; second input is the copy offset by one byte */\
2128 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2129 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* final store through the OPNAME variant */\
2131 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){ /* mc21 case: no local copy, the horizontal pass reads src directly */\
2132 uint8_t halfH[272];\
2133 uint8_t halfHV[256];\
2134 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* last argument 17, matching the 16*17-byte halfH */\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2136 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16); /* combine the H and HV planes into dst */\
2138 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){ /* mc23 case: as mc21 but the final step reads halfH+16 */\
2139 uint8_t halfH[272];\
2140 uint8_t halfHV[256];\
2141 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2142 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2143 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16); /* halfH+16 skips the first 16 bytes of halfH */\
2145 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){ /* exported (non-static) mc12 variant: combines the V and HV planes */\
2146 uint8_t full[24*17];\
2147 uint8_t halfH[272];\
2148 uint8_t halfV[256];\
2149 uint8_t halfHV[256];\
2150 copy_block17(full, src, 24, stride, 17);\
2151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17); /* halfH is only an intermediate for halfHV here */\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2153 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2154 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2156 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){ /* mc12 case: the vertical pass writes straight to dst, no halfHV buffer */\
2157 uint8_t full[24*17];\
2158 uint8_t halfH[272];\
2159 copy_block17(full, src, 24, stride, 17);\
2160 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2161 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17); /* in place on halfH */\
2162 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* OPNAME variant of the vertical pass produces the final output */\
2164 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){ /* exported (non-static) mc32 variant: combines the V and HV planes */\
2165 uint8_t full[24*17];\
2166 uint8_t halfH[272];\
2167 uint8_t halfV[256];\
2168 uint8_t halfHV[256];\
2169 copy_block17(full, src, 24, stride, 17);\
2170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24); /* vertical pass reads the copy starting one byte in */\
2172 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2173 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2175 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){ /* mc32 case: as mc12 but halfH is combined with full+1 */\
2176 uint8_t full[24*17];\
2177 uint8_t halfH[272];\
2178 copy_block17(full, src, 24, stride, 17);\
2179 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2180 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17); /* in place on halfH; second input is the copy offset by one byte */\
2181 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16); /* OPNAME variant of the vertical pass produces the final output */\
2183 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){ /* mc22 case: horizontal pass into halfH, vertical pass straight to dst */\
2184 uint8_t halfH[272];\
2185 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17); /* reads src directly, last argument 17 */\
2186 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store operators passed to QPEL_MC as its last argument.
 * Each takes an lvalue a and a filter sum b, adds a bias (16, or 15 in the
 * no_rnd forms), shifts right by 5 and maps the result through cm[], so a
 * pointer named cm must be in scope wherever they expand.
 * op_put* overwrite a; op_avg* average the new value with the old a
 * (op_avg adds 1 before halving, op_avg_no_rnd does not). */
2189 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2190 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2191 #define op_put(a, b) a = cm[((b) + 16)>>5]
2192 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the qpel functions for the put_, put_no_rnd_ and avg_ prefixes.
 * The third argument is presumably RND, the token pasted between "put" and
 * the helper name - TODO confirm against the QPEL_MC parameter list. */
2194 QPEL_MC(0, put_ , _ , op_put)
2195 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2196 QPEL_MC(0, avg_ , _ , op_avg)
/* avg_no_rnd instantiation is disabled */
2197 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2199 #undef op_avg_no_rnd
2201 #undef op_put_no_rnd
/* H264_LOWPASS(OPNAME, OP, OP2) emits the h264_qpel{2,4,8,16} low-pass helpers
 * under the prefix OPNAME.
 *
 * h / v helpers: every output sample is the 6-tap sum
 *     (x[0]+x[1])*20 - (x[-1]+x[2])*5 + (x[-2]+x[3])
 * taken along a row (h) or a column (v) and stored through OP.
 * hv helpers: the same horizontal sum is first written, without
 * normalisation, into the int16_t buffer tmp for h+5 rows beginning two rows
 * above src; the taps are then applied down the columns of tmp and the result
 * is stored through OP2.
 * 16-wide helpers: four calls to the 8-wide helper - left and right halves,
 * then the same pair after advancing src and dst by eight rows.
 * Each function sets cm = ff_cropTbl + MAX_NEG_CROP before using OP / OP2.
 * The qpel2 helpers are marked av_unused. */
2204 #define H264_LOWPASS(OPNAME, OP, OP2) \
2205 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2207 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2211 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2212 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2218 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2220 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2224 const int srcB= src[-2*srcStride];\
2225 const int srcA= src[-1*srcStride];\
2226 const int src0= src[0 *srcStride];\
2227 const int src1= src[1 *srcStride];\
2228 const int src2= src[2 *srcStride];\
2229 const int src3= src[3 *srcStride];\
2230 const int src4= src[4 *srcStride];\
2231 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2232 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2238 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2241 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2243 src -= 2*srcStride;\
2244 for(i=0; i<h+5; i++)\
2246 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2247 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2251 tmp -= tmpStride*(h+5-2);\
2254 const int tmpB= tmp[-2*tmpStride];\
2255 const int tmpA= tmp[-1*tmpStride];\
2256 const int tmp0= tmp[0 *tmpStride];\
2257 const int tmp1= tmp[1 *tmpStride];\
2258 const int tmp2= tmp[2 *tmpStride];\
2259 const int tmp3= tmp[3 *tmpStride];\
2260 const int tmp4= tmp[4 *tmpStride];\
2261 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2262 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2267 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2269 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2273 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2274 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2275 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2276 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2282 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2284 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2288 const int srcB= src[-2*srcStride];\
2289 const int srcA= src[-1*srcStride];\
2290 const int src0= src[0 *srcStride];\
2291 const int src1= src[1 *srcStride];\
2292 const int src2= src[2 *srcStride];\
2293 const int src3= src[3 *srcStride];\
2294 const int src4= src[4 *srcStride];\
2295 const int src5= src[5 *srcStride];\
2296 const int src6= src[6 *srcStride];\
2297 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2298 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2299 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2300 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2306 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2309 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2311 src -= 2*srcStride;\
2312 for(i=0; i<h+5; i++)\
2314 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2315 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2316 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2317 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2321 tmp -= tmpStride*(h+5-2);\
2324 const int tmpB= tmp[-2*tmpStride];\
2325 const int tmpA= tmp[-1*tmpStride];\
2326 const int tmp0= tmp[0 *tmpStride];\
2327 const int tmp1= tmp[1 *tmpStride];\
2328 const int tmp2= tmp[2 *tmpStride];\
2329 const int tmp3= tmp[3 *tmpStride];\
2330 const int tmp4= tmp[4 *tmpStride];\
2331 const int tmp5= tmp[5 *tmpStride];\
2332 const int tmp6= tmp[6 *tmpStride];\
2333 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2334 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2335 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2336 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2342 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2344 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2348 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2349 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2350 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2351 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2352 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2353 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2354 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2355 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2361 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2363 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2367 const int srcB= src[-2*srcStride];\
2368 const int srcA= src[-1*srcStride];\
2369 const int src0= src[0 *srcStride];\
2370 const int src1= src[1 *srcStride];\
2371 const int src2= src[2 *srcStride];\
2372 const int src3= src[3 *srcStride];\
2373 const int src4= src[4 *srcStride];\
2374 const int src5= src[5 *srcStride];\
2375 const int src6= src[6 *srcStride];\
2376 const int src7= src[7 *srcStride];\
2377 const int src8= src[8 *srcStride];\
2378 const int src9= src[9 *srcStride];\
2379 const int src10=src[10*srcStride];\
2380 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2381 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2382 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2383 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2384 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2385 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2386 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2387 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2393 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2396 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2398 src -= 2*srcStride;\
2399 for(i=0; i<h+5; i++)\
2401 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2402 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2403 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2404 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2405 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2406 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2407 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2408 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2412 tmp -= tmpStride*(h+5-2);\
2415 const int tmpB= tmp[-2*tmpStride];\
2416 const int tmpA= tmp[-1*tmpStride];\
2417 const int tmp0= tmp[0 *tmpStride];\
2418 const int tmp1= tmp[1 *tmpStride];\
2419 const int tmp2= tmp[2 *tmpStride];\
2420 const int tmp3= tmp[3 *tmpStride];\
2421 const int tmp4= tmp[4 *tmpStride];\
2422 const int tmp5= tmp[5 *tmpStride];\
2423 const int tmp6= tmp[6 *tmpStride];\
2424 const int tmp7= tmp[7 *tmpStride];\
2425 const int tmp8= tmp[8 *tmpStride];\
2426 const int tmp9= tmp[9 *tmpStride];\
2427 const int tmp10=tmp[10*tmpStride];\
2428 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2429 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2430 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2431 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2432 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2433 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2434 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2435 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2441 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2442 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2443 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2444 src += 8*srcStride;\
2445 dst += 8*dstStride;\
2446 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2447 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2450 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2451 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2452 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2453 src += 8*srcStride;\
2454 dst += 8*dstStride;\
2455 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2456 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2459 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2461 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2462 src += 8*srcStride;\
2463 dst += 8*dstStride;\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2465 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC(OPNAME, SIZE) emits the sixteen OPNAME h264_qpel<SIZE>_mcXY_c entry
 * points, all taking (dst, src, stride).
 *
 *   mc00           OPNAME pixels<SIZE>_c called with SIZE as its last argument
 *   mc20 / mc02    one h / v low-pass written directly through OPNAME
 *   mc22           one hv low-pass written directly through OPNAME
 *   mc10 / mc30    h low-pass into half[], then _l2 with src / src+1
 *   mc01 / mc03    v low-pass into half[], then _l2 with full_mid / full_mid+SIZE
 *   mc11/31/13/33  h low-pass (from src or src+stride) and v low-pass (on a
 *                  copy taken at src or src+1), merged by _l2
 *   mc21 / mc23    h low-pass (from src or src+stride) merged with the hv low-pass
 *   mc12 / mc32    v low-pass (copy taken at src or src+1) merged with the hv low-pass
 *
 * The vertical passes work on `full`, filled by copy_block<SIZE> from
 * src - stride*2 with SIZE + 5 as the last argument; full_mid = full + SIZE*2
 * skips the first 2*SIZE bytes of that copy.
 * Scratch planes are always produced with the put_ helpers; only the final
 * step of each function uses OPNAME. */
2468 #define H264_MC(OPNAME, SIZE) \
2469 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2470 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2473 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2474 uint8_t half[SIZE*SIZE];\
2475 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2476 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2479 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2480 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2483 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2484 uint8_t half[SIZE*SIZE];\
2485 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2486 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2489 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2490 uint8_t full[SIZE*(SIZE+5)];\
2491 uint8_t * const full_mid= full + SIZE*2;\
2492 uint8_t half[SIZE*SIZE];\
2493 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2494 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2495 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2498 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2499 uint8_t full[SIZE*(SIZE+5)];\
2500 uint8_t * const full_mid= full + SIZE*2;\
2501 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2502 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2505 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2506 uint8_t full[SIZE*(SIZE+5)];\
2507 uint8_t * const full_mid= full + SIZE*2;\
2508 uint8_t half[SIZE*SIZE];\
2509 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2510 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2511 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2514 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2515 uint8_t full[SIZE*(SIZE+5)];\
2516 uint8_t * const full_mid= full + SIZE*2;\
2517 uint8_t halfH[SIZE*SIZE];\
2518 uint8_t halfV[SIZE*SIZE];\
2519 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2520 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2521 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2522 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2525 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2526 uint8_t full[SIZE*(SIZE+5)];\
2527 uint8_t * const full_mid= full + SIZE*2;\
2528 uint8_t halfH[SIZE*SIZE];\
2529 uint8_t halfV[SIZE*SIZE];\
2530 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2531 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2532 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2533 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2536 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2537 uint8_t full[SIZE*(SIZE+5)];\
2538 uint8_t * const full_mid= full + SIZE*2;\
2539 uint8_t halfH[SIZE*SIZE];\
2540 uint8_t halfV[SIZE*SIZE];\
2541 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2542 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2543 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2544 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2547 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2548 uint8_t full[SIZE*(SIZE+5)];\
2549 uint8_t * const full_mid= full + SIZE*2;\
2550 uint8_t halfH[SIZE*SIZE];\
2551 uint8_t halfV[SIZE*SIZE];\
2552 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2553 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2554 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2555 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2558 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2559 int16_t tmp[SIZE*(SIZE+5)];\
2560 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2563 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2564 int16_t tmp[SIZE*(SIZE+5)];\
2565 uint8_t halfH[SIZE*SIZE];\
2566 uint8_t halfHV[SIZE*SIZE];\
2567 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2568 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2569 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2572 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2573 int16_t tmp[SIZE*(SIZE+5)];\
2574 uint8_t halfH[SIZE*SIZE];\
2575 uint8_t halfHV[SIZE*SIZE];\
2576 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2577 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2578 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2581 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2582 uint8_t full[SIZE*(SIZE+5)];\
2583 uint8_t * const full_mid= full + SIZE*2;\
2584 int16_t tmp[SIZE*(SIZE+5)];\
2585 uint8_t halfV[SIZE*SIZE];\
2586 uint8_t halfHV[SIZE*SIZE];\
2587 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2588 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2589 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2590 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2593 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2594 uint8_t full[SIZE*(SIZE+5)];\
2595 uint8_t * const full_mid= full + SIZE*2;\
2596 int16_t tmp[SIZE*(SIZE+5)];\
2597 uint8_t halfV[SIZE*SIZE];\
2598 uint8_t halfHV[SIZE*SIZE];\
2599 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2600 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2601 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2602 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for the H.264 low-pass helpers; all of them index cm[], so
 * a pointer named cm must be in scope where they expand.
 * op_*  : single-pass sum, normalised as (b + 16) >> 5.
 * op2_* : two-pass (hv) sum, normalised as (b + 512) >> 10.
 * The put forms overwrite a; the avg forms store (a + new + 1) >> 1. */
2605 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
/* weighted variant, disabled */
2606 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2607 #define op_put(a, b) a = cm[((b) + 16)>>5]
2608 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2609 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Generate the put_ and avg_ families of low-pass helpers. */
2611 H264_LOWPASS(put_ , op_put, op2_put)
2612 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* op_scale1(x): scale block[x] in place by weight, add offset, shift right by
 * log2_denom and clip to 8 bits.
 * op_scale2(x): blend src[x]*weights + dst[x]*weightd + offset into dst[x];
 * the shift is one bit larger (log2_denom+1).
 * Both rely on the locals of the functions generated below. */
2627 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2628 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
/* H264_WEIGHT(W,H) emits weight_h264_pixels<W>x<H>_c and
 * biweight_h264_pixels<W>x<H>_c, each looping over H rows.
 * W is a macro argument, so every W==2 / W==4 / W==8 test is constant for a
 * given instantiation; a true test moves on to the next row. */
2629 #define H264_WEIGHT(W,H) \
2630 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2632 offset <<= log2_denom; /* bring offset to the scale of block[x]*weight */ \
2633 if(log2_denom) offset += 1<<(log2_denom-1); /* add half of 1<<log2_denom so the shift in op_scale1 rounds */ \
2634 for(y=0; y<H; y++, block += stride){ \
2637 if(W==2) continue; \
2640 if(W==4) continue; \
2645 if(W==8) continue; \
2656 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2658 offset = ((offset + 1) | 1) << log2_denom; /* (offset+1) forced odd, then scaled by 1<<log2_denom */ \
2659 for(y=0; y<H; y++, dst += stride, src += stride){ \
2662 if(W==2) continue; \
2665 if(W==4) continue; \
2670 if(W==8) continue; \
/**
 * Horizontal 4-tap filter producing eight output samples:
 *   dst[n] = cm[(9*(src[n] + src[n+1]) - (src[n-1] + src[n+2]) + 8) >> 4]
 * for n = 0..7, so src[-1] .. src[9] are read.
 * @param h presumably the number of rows to process - TODO confirm
 */
2697 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2698 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2702 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2703 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2704 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2705 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2706 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2707 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2708 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2709 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS mc00 cases, compiled only when CONFIG_CAVS_DECODER is set: no
 * filtering is done here, each function forwards to the generic put/avg
 * pixel routine of the same width with 8 or 16 as its last argument. */
2715 #if CONFIG_CAVS_DECODER
2717 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2719 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2720 put_pixels8_c(dst, src, stride, 8);
2722 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2723 avg_pixels8_c(dst, src, stride, 8);
2725 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2726 put_pixels16_c(dst, src, stride, 16);
2728 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2729 avg_pixels16_c(dst, src, stride, 16);
2731 #endif /* CONFIG_CAVS_DECODER */
/* VC-1 / WMV3 mc00 case, compiled when either decoder is enabled: forwards
 * to put_pixels8_c with 8 as its last argument. The rnd parameter is not
 * used by the call. */
2733 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
2735 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2737 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2738 put_pixels8_c(dst, src, stride, 8);
2740 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2742 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2745 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2747 #if CONFIG_RV30_DECODER
2748 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2749 #endif /* CONFIG_RV30_DECODER */
/* RV40 mc33 cases, compiled only when CONFIG_RV40_DECODER is set: each one
 * forwards to the generic put/avg pixels*_xy2_c routine of the same width,
 * passing 16 or 8 as the last argument. */
2751 #if CONFIG_RV40_DECODER
2752 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2753 put_pixels16_xy2_c(dst, src, stride, 16);
2755 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2756 avg_pixels16_xy2_c(dst, src, stride, 16);
2758 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2759 put_pixels8_xy2_c(dst, src, stride, 8);
2761 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2762 avg_pixels8_xy2_c(dst, src, stride, 8);
2765 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2766 #endif /* CONFIG_RV40_DECODER */
/**
 * Vertical counterpart of wmv2_mspel8_h_lowpass: the same 4-tap filter
 *   cm[(9*(a + b) - (before + after) + 8) >> 4]
 * applied down one column, producing eight output rows from the source rows
 * -1 .. 9.
 * @param w presumably the number of columns to process - TODO confirm
 */
2768 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2769 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
/* load the eleven source rows of this column once */
2773 const int src_1= src[ -srcStride];
2774 const int src0 = src[0 ];
2775 const int src1 = src[ srcStride];
2776 const int src2 = src[2*srcStride];
2777 const int src3 = src[3*srcStride];
2778 const int src4 = src[4*srcStride];
2779 const int src5 = src[5*srcStride];
2780 const int src6 = src[6*srcStride];
2781 const int src7 = src[7*srcStride];
2782 const int src8 = src[8*srcStride];
2783 const int src9 = src[9*srcStride];
2784 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2785 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2786 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2787 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2788 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2789 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2790 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2791 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/** mspel mc00 case: no filtering, forwards to put_pixels8_c with 8 as its last argument. */
2797 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2798 put_pixels8_c(dst, src, stride, 8);
/** mspel mc10 case: horizontal low-pass into half, then put_pixels8_l2 combines src and half into dst. */
2801 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2803 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2804 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/** mspel mc20 case: the horizontal low-pass writes straight into dst. */
2807 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2808 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/** mspel mc30 case: as mc10, but put_pixels8_l2 combines half with src+1 instead of src. */
2811 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2813 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2814 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/** mspel mc02 case: the vertical low-pass writes straight into dst. */
2817 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2818 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/**
 * mspel mc12 case: combines a vertical-only plane (halfV) with a
 * horizontal-then-vertical plane (halfHV).
 */
2821 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* horizontal pass starts one row above src and is given 11 as its h argument */
2825 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2826 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
/* halfH+8 skips the first 8 bytes of halfH, i.e. the extra leading row (halfH is written with stride 8) */
2827 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2828 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/**
 * mspel mc32 case: as mc12, except that the vertical-only plane is computed
 * from src+1 instead of src.
 */
2830 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
/* horizontal pass starts one row above src and is given 11 as its h argument */
2834 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2835 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
/* halfH+8 skips the first 8 bytes of halfH (halfH is written with stride 8) */
2836 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2837 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/**
 * mspel mc22 case: horizontal low-pass into halfH (starting one row above
 * src, h argument 11), then the vertical low-pass over halfH+8 writes
 * straight into dst.
 */
2839 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2841 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2842 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/**
 * H.263 loop filter over vertically adjacent samples: for each column x it
 * works on p0..p3 = src[x-2*stride] .. src[x+stride], i.e. the edge lies
 * between the rows at -stride and 0.
 * The filter strength is read from ff_h263_loop_filter_strength[qscale].
 * The body is guarded by CONFIG_ANY_H263.
 */
2845 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2846 if(CONFIG_ANY_H263) {
2848 const int strength= ff_h263_loop_filter_strength[qscale];
2852 int p0= src[x-2*stride];
2853 int p1= src[x-1*stride];
2854 int p2= src[x+0*stride];
2855 int p3= src[x+1*stride];
/* edge measure: the inner pair (p1,p2) weighs four times the outer pair */
2856 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* d1 as a piecewise-linear function of d: equal to d inside +-strength,
 * folding back towards 0 between strength and 2*strength */
2858 if (d<-2*strength) d1= 0;
2859 else if(d<- strength) d1=-2*strength - d;
2860 else if(d< strength) d1= d;
2861 else if(d< 2*strength) d1= 2*strength - d;
/* bit 8 set means the value left 0..255: a negative value becomes 0, a value
 * above 255 becomes ~0, which is 255 once stored into the uint8_t sample */
2866 if(p1&256) p1= ~(p1>>31);
2867 if(p2&256) p2= ~(p2>>31);
2869 src[x-1*stride] = p1;
2870 src[x+0*stride] = p2;
/* correction for the outer pair, limited to +-ad1 */
2874 d2= av_clip((p0-p3)/4, -ad1, ad1);
2876 src[x-2*stride] = p0 - d2;
2877 src[x+ stride] = p3 + d2;
/**
 * H.263 loop filter over horizontally adjacent samples: for each row y it
 * works on p0..p3 = src[y*stride-2] .. src[y*stride+1], i.e. the edge lies
 * between the columns at -1 and 0.
 * The filter strength is read from ff_h263_loop_filter_strength[qscale].
 * The body is guarded by CONFIG_ANY_H263.
 */
2882 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2883 if(CONFIG_ANY_H263) {
2885 const int strength= ff_h263_loop_filter_strength[qscale];
2889 int p0= src[y*stride-2];
2890 int p1= src[y*stride-1];
2891 int p2= src[y*stride+0];
2892 int p3= src[y*stride+1];
/* edge measure: the inner pair (p1,p2) weighs four times the outer pair */
2893 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
/* d1 as a piecewise-linear function of d: equal to d inside +-strength,
 * folding back towards 0 between strength and 2*strength */
2895 if (d<-2*strength) d1= 0;
2896 else if(d<- strength) d1=-2*strength - d;
2897 else if(d< strength) d1= d;
2898 else if(d< 2*strength) d1= 2*strength - d;
/* bit 8 set means the value left 0..255: a negative value becomes 0, a value
 * above 255 becomes ~0, which is 255 once stored into the uint8_t sample */
2903 if(p1&256) p1= ~(p1>>31);
2904 if(p2&256) p2= ~(p2>>31);
2906 src[y*stride-1] = p1;
2907 src[y*stride+0] = p2;
/* correction for the outer pair, limited to +-ad1 */
2911 d2= av_clip((p0-p3)/4, -ad1, ad1);
2913 src[y*stride-2] = p0 - d2;
2914 src[y*stride+1] = p3 + d2;
/**
 * H.261 loop filter, applied in place on src using the scratch array temp,
 * which is addressed with 8 entries per row.
 * First pass (vertical): temp holds src[above] + 2*src[here] + src[below];
 * rows 0 and 7 are only scaled by 4.
 * Second pass (horizontal): src gets (left + 2*centre + right + 8) >> 4 of
 * temp; columns 0 and 7 only undo the factor 4 with (temp + 2) >> 2.
 */
2919 static void h261_loop_filter_c(uint8_t *src, int stride){
/* top and bottom rows: no vertical neighbours are used, scale by 4 to match the 1-2-1 sum */
2924 temp[x ] = 4*src[x ];
2925 temp[x + 7*8] = 4*src[x + 7*stride];
2929 xy = y * stride + x;
/* vertical 1-2-1 sum */
2931 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* left and right columns: normalise the first pass only */
2936 src[ y*stride] = (temp[ y*8] + 2)>>2;
2937 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2939 xy = y * stride + x;
/* horizontal 1-2-1 sum over the first-pass values, total weight 16 */
2941 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/**
 * H.264 luma edge filter driven by tc0, shared by the vertical and
 * horizontal wrappers. It handles four groups (one tc0 entry each) of four
 * samples.
 * xstride steps across the edge: p0, p1, p2 are at pix[-1*xstride],
 * pix[-2*xstride], pix[-3*xstride]; q0, q1, q2 at pix[0], pix[1*xstride],
 * pix[2*xstride].
 * NOTE(review): ystride is presumably the step between successive samples
 * along the edge - confirm.
 */
2946 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2949 for( i = 0; i < 4; i++ ) {
2954 for( d = 0; d < 4; d++ ) {
2955 const int p0 = pix[-1*xstride];
2956 const int p1 = pix[-2*xstride];
2957 const int p2 = pix[-3*xstride];
2958 const int q0 = pix[0];
2959 const int q1 = pix[1*xstride];
2960 const int q2 = pix[2*xstride];
/* filter only if the step across the edge is below alpha and each side is flat within beta */
2962 if( FFABS( p0 - q0 ) < alpha &&
2963 FFABS( p1 - p0 ) < beta &&
2964 FFABS( q1 - q0 ) < beta ) {
/* p side smooth: move p1 towards the mean of p2 and avg(p0,q0), by at most tc0[i] */
2969 if( FFABS( p2 - p0 ) < beta ) {
2970 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
/* q side smooth: same adjustment for q1 */
2973 if( FFABS( q2 - q0 ) < beta ) {
2974 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* correction of the two samples next to the edge, limited to +-tc */
2978 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2979 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2980 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/** Vertical luma wrapper: calls the shared filter with xstride = stride and ystride = 1. */
2986 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2988 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
/** Horizontal luma wrapper: calls the shared filter with xstride = 1 and ystride = stride. */
2990 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2992 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/**
 * H.264 luma edge filter for the intra case (no tc0 table), processing 16
 * samples along the edge.
 * xstride steps across the edge: p0..p3 are at pix[-1*xstride] ..
 * pix[-4*xstride], q0..q3 at pix[0*xstride] .. pix[3*xstride].
 * NOTE(review): ystride is presumably the step between successive samples
 * along the edge - confirm.
 */
2995 static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2998 for( d = 0; d < 16; d++ ) {
2999 const int p2 = pix[-3*xstride];
3000 const int p1 = pix[-2*xstride];
3001 const int p0 = pix[-1*xstride];
3003 const int q0 = pix[ 0*xstride];
3004 const int q1 = pix[ 1*xstride];
3005 const int q2 = pix[ 2*xstride];
/* filter only if the step across the edge is below alpha and each side is flat within beta */
3007 if( FFABS( p0 - q0 ) < alpha &&
3008 FFABS( p1 - p0 ) < beta &&
3009 FFABS( q1 - q0 ) < beta ) {
/* small step across the edge: the longer filters below become eligible */
3011 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
/* p side smooth: rewrite p0, p1 and p2, reading p3 as well */
3012 if( FFABS( p2 - p0 ) < beta)
3014 const int p3 = pix[-4*xstride];
3016 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3017 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3018 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
/* short 3-tap form: p0 only */
3021 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
/* q side smooth: rewrite q0, q1 and q2, reading q3 as well */
3023 if( FFABS( q2 - q0 ) < beta)
3025 const int q3 = pix[3*xstride];
3027 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3028 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3029 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
/* short 3-tap form: q0 only */
3032 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* short 3-tap form applied to both p0 and q0 */
3036 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3037 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3043 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3045 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3047 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3049 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
/**
 * tc0-clipped chroma edge filter: 4 groups of 2 positions, each group
 * using its own clipping bound tc0[i]. Groups with tc0[i] <= 0 are skipped.
 *
 * @param pix     first q0 sample; p0/p1 are at pix[-1/-2 * xstride],
 *                q1 at pix[xstride]
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 * @param tc0     four clipping bounds, one per pair of positions
 */
static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;

    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];

        if( tc <= 0 ) {
            /* nothing to do for this pair, just step over it */
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                /* q0 - p0 may be negative; multiply instead of shifting
                 * left, which is undefined for negative operands */
                int delta = av_clip( ((q0 - p0) * 4 + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}

/** tc0-clipped chroma filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}

/** tc0-clipped chroma filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/**
 * Intra chroma edge filter applied to 8 consecutive positions of one edge.
 * Only p0 and q0 are modified.
 *
 * @param pix     first q0 sample; p0/p1 are at pix[-1/-2 * xstride],
 *                q1 at pix[xstride]
 * @param xstride distance between neighbouring samples across the edge
 * @param ystride distance between consecutive filtered positions
 * @param alpha   threshold for |p0 - q0|
 * @param beta    threshold for |p1 - p0| and |q1 - q0|
 */
static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int line;

    for (line = 0; line < 8; line++, pix += ystride) {
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if (abs(p0 - q0) >= alpha || abs(p1 - p0) >= beta || abs(q1 - q0) >= beta)
            continue;

        pix[-xstride] = (2*p1 + p0 + q1 + 2) >> 2;   /* p0' */
        pix[0]        = (2*q1 + q0 + p1 + 2) >> 2;   /* q0' */
    }
}

/** Intra chroma filter with xstride = stride and ystride = 1. */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}

/** Intra chroma filter with xstride = 1 and ystride = stride. */
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences between two blocks 16 pixels wide and
 * h rows high; both blocks use line_size as row pitch. v is unused.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * SAD of a 16xh block against the reference averaged with its right-hand
 * neighbour, (a + b + 1) >> 1. Reads 17 pixels per reference row.
 * v is unused.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int pred = (pix2[col] + pix2[col + 1] + 1) >> 1;
            sad += abs(pix1[col] - pred);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * SAD of a 16xh block against the reference averaged with the row below
 * it, (a + b + 1) >> 1. Reads h + 1 reference rows. v is unused.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int pred = (pix2[col] + below[col] + 1) >> 1;
            sad += abs(pix1[col] - pred);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
/**
 * SAD of a 16xh block against the reference averaged over each 2x2
 * neighbourhood, (a + b + c + d + 2) >> 2. Reads 17 pixels from each of
 * h + 1 reference rows. v is unused.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int pred = (pix2[col] + pix2[col + 1] +
                              below[col] + below[col + 1] + 2) >> 2;
            sad += abs(pix1[col] - pred);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences between two blocks 8 pixels wide and
 * h rows high; both blocks use line_size as row pitch. v is unused.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * SAD of an 8xh block against the reference averaged with its right-hand
 * neighbour, (a + b + 1) >> 1. Reads 9 pixels per reference row.
 * v is unused.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            const int pred = (pix2[col] + pix2[col + 1] + 1) >> 1;
            sad += abs(pix1[col] - pred);
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * SAD of an 8xh block against the reference averaged with the row below
 * it, (a + b + 1) >> 1. Reads h + 1 reference rows. v is unused.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            const int pred = (pix2[col] + below[col] + 1) >> 1;
            sad += abs(pix1[col] - pred);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
/**
 * SAD of an 8xh block against the reference averaged over each 2x2
 * neighbourhood, (a + b + c + d + 2) >> 2. Reads 9 pixels from each of
 * h + 1 reference rows. v is unused.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    const uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++) {
            const int pred = (pix2[col] + pix2[col + 1] +
                              below[col] + below[col + 1] + 2) >> 2;
            sad += abs(pix1[col] - pred);
        }
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
3317 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3318 MpegEncContext *c = v;
3324 for(x=0; x<16; x++){
3325 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3328 for(x=0; x<15; x++){
3329 score2+= FFABS( s1[x ] - s1[x +stride]
3330 - s1[x+1] + s1[x+1+stride])
3331 -FFABS( s2[x ] - s2[x +stride]
3332 - s2[x+1] + s2[x+1+stride]);
3339 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3340 else return score1 + FFABS(score2)*8;
3343 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3344 MpegEncContext *c = v;
3351 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3355 score2+= FFABS( s1[x ] - s1[x +stride]
3356 - s1[x+1] + s1[x+1+stride])
3357 -FFABS( s2[x ] - s2[x +stride]
3358 - s2[x+1] + s2[x+1+stride]);
3365 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3366 else return score1 + FFABS(score2)*8;
3369 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3373 for(i=0; i<8*8; i++){
3374 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3377 assert(-512<b && b<512);
3379 sum += (w*b)*(w*b)>>4;
3384 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3387 for(i=0; i<8*8; i++){
3388 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3393 * permutes an 8x8 block.
3394 * @param block the block which will be permuted according to the given permutation vector
3395 * @param permutation the permutation vector
3396 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3397 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3398 * (inverse) permutated to scantable order!
3400 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3406 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3408 for(i=0; i<=last; i++){
3409 const int j= scantable[i];
3414 for(i=0; i<=last; i++){
3415 const int j= scantable[i];
3416 const int perm_j= permutation[j];
3417 block[perm_j]= temp[j];
/** Comparison function that ignores its arguments and always scores 0. */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    (void)s;
    (void)a;
    (void)b;
    (void)stride;
    (void)h;

    return 0;
}
3425 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3428 memset(cmp, 0, sizeof(void*)*5);
3436 cmp[i]= c->hadamard8_diff[i];
3442 cmp[i]= c->dct_sad[i];
3445 cmp[i]= c->dct264_sad[i];
3448 cmp[i]= c->dct_max[i];
3451 cmp[i]= c->quant_psnr[i];
3471 #if CONFIG_SNOW_ENCODER
3480 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3485 static void clear_block_c(DCTELEM *block)
3487 memset(block, 0, sizeof(DCTELEM)*64);
3491 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3493 static void clear_blocks_c(DCTELEM *blocks)
3495 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/**
 * dst[i] += src[i] (modulo 256) for i in [0, w).
 *
 * Whole machine words are processed with a carry-free byte-parallel add;
 * the remaining bytes are handled one at a time. The word bound is
 * computed in signed arithmetic so that w smaller than a word (or <= 0)
 * never enters the word loop.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    /* 0x7f / 0x80 in every byte of a word, same values as pb_7f / pb_80 */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    const int word = (int)sizeof(unsigned long);
    int i;

    for (i = 0; i <= w - word; i += word) {
        unsigned long a, b, sum;

        /* memcpy keeps the access valid for any alignment */
        memcpy(&a, src + i, sizeof(a));
        memcpy(&b, dst + i, sizeof(b));
        /* add the low 7 bits of every byte, then patch in the top bits
         * without letting a carry cross into the neighbouring byte */
        sum = ((a & low7) + (b & low7)) ^ ((a ^ b) & top1);
        memcpy(dst + i, &sum, sizeof(sum));
    }
    for (; i < w; i++)
        dst[i] += src[i];
}
/**
 * dst[i] = src1[i] + src2[i] (modulo 256) for i in [0, w).
 *
 * Whole machine words are processed with a carry-free byte-parallel add;
 * the remaining bytes are handled one at a time. The word bound is
 * computed in signed arithmetic so that w smaller than a word (or <= 0)
 * never enters the word loop.
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f / 0x80 in every byte of a word, same values as pb_7f / pb_80 */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    const int word = (int)sizeof(unsigned long);
    int i;

    for (i = 0; i <= w - word; i += word) {
        unsigned long a, b, sum;

        /* memcpy keeps the access valid for any alignment */
        memcpy(&a, src1 + i, sizeof(a));
        memcpy(&b, src2 + i, sizeof(b));
        sum = ((a & low7) + (b & low7)) ^ ((a ^ b) & top1);
        memcpy(dst + i, &sum, sizeof(sum));
    }
    for (; i < w; i++)
        dst[i] = src1[i] + src2[i];
}
/**
 * dst[i] = src1[i] - src2[i] (modulo 256) for i in [0, w).
 *
 * Whole machine words are processed with a borrow-free byte-parallel
 * subtraction; the remaining bytes are handled one at a time. Without
 * fast unaligned access the word path is only taken when src2 is word
 * aligned, otherwise everything goes through the byte loop. The word
 * bound is computed in signed arithmetic so that w smaller than a word
 * (or <= 0) never enters the word loop.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    /* 0x7f / 0x80 in every byte of a word, same values as pb_7f / pb_80 */
    const unsigned long low7 = ~0UL/255 * 0x7f;
    const unsigned long top1 = ~0UL/255 * 0x80;
    const int word = (int)sizeof(unsigned long);
    int i = 0;

#if !HAVE_FAST_UNALIGNED
    if (!((uintptr_t)src2 & (sizeof(long) - 1)))
#endif
    for (; i <= w - word; i += word) {
        unsigned long a, b, diff;

        /* memcpy keeps the access valid for any alignment */
        memcpy(&a, src1 + i, sizeof(a));
        memcpy(&b, src2 + i, sizeof(b));
        /* force the top bit of every minuend byte so no borrow can leave
         * the byte, then restore the correct top bits */
        diff = ((a | top1) - (b & low7)) ^ ((a ^ b ^ top1) & top1);
        memcpy(dst + i, &diff, sizeof(diff));
    }
    for (; i < w; i++)
        dst[i] = src1[i] - src2[i];
}
/**
 * Median-prediction residual: for every i, predict src2[i] as the
 * median of (left, src1[i], (left + src1[i] - left_top) & 0xFF) and
 * store src2[i] - prediction in dst[i].
 *
 * @param src1     row supplying the "top" samples
 * @param src2     row being predicted
 * @param left     in: sample left of src2[0]; out: src2[w-1]
 * @param left_top in: sample left of src1[0]; out: src1[w-1]
 */
static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
    uint8_t cur_left     = *left;
    uint8_t cur_left_top = *left_top;
    int i;

    for (i = 0; i < w; i++) {
        const int top  = src1[i];
        const int pred = mid_pred(cur_left, top, (cur_left + top - cur_left_top) & 0xFF);

        cur_left_top = top;
        cur_left     = src2[i];
        dst[i]       = cur_left - pred;
    }

    *left     = cur_left;
    *left_top = cur_left_top;
}
/* o1 = i1 + i2, o2 = i1 - i2. Wrapped in do/while so that the macro
 * behaves as a single statement (e.g. in an unbraced if/else). */
#define BUTTERFLY2(o1,o2,i1,i2) \
do {\
    (o1)= (i1)+(i2);\
    (o2)= (i1)-(i2);\
} while (0)

/* In-place butterfly: (x, y) <- (x + y, x - y). */
#define BUTTERFLY1(x,y) \
do {\
    const int bfly_a_= (x);\
    const int bfly_b_= (y);\
    (x)= bfly_a_+bfly_b_;\
    (y)= bfly_a_-bfly_b_;\
} while (0)

/* |x + y| + |x - y|: last butterfly stage folded into the absolute sum. */
#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/**
 * Sum of absolute values of the 8x8 Hadamard transform of (src - dst).
 * h must be 8; s is unused.
 */
static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int temp[64];
    int sum = 0;
    int row, col, step, k;

    assert(h==8);

    /* horizontal pass: full 8-point transform of every difference row */
    for (row = 0; row < 8; row++) {
        int *line = temp + 8*row;

        for (k = 0; k < 8; k++)
            line[k] = src[stride*row + k] - dst[stride*row + k];

        for (step = 1; step < 8; step <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & step)) {
                    const int a = line[k];
                    const int b = line[k + step];

                    line[k]        = a + b;
                    line[k + step] = a - b;
                }
            }
        }
    }

    /* vertical pass: the first two stages are done in place, the last
     * one is folded into the absolute sum */
    for (col = 0; col < 8; col++) {
        for (step = 1; step < 4; step <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & step)) {
                    const int a = temp[8*k + col];
                    const int b = temp[8*(k + step) + col];

                    temp[8*k + col]          = a + b;
                    temp[8*(k + step) + col] = a - b;
                }
            }
        }
        for (k = 0; k < 4; k++) {
            const int a = temp[8*k + col];
            const int b = temp[8*(k + 4) + col];

            sum += abs(a + b) + abs(a - b);
        }
    }
    return sum;
}
/**
 * Sum of absolute values of the 8x8 Hadamard transform of src, with the
 * magnitude of the DC term subtracted again ("-mean").
 * h must be 8; s and dummy are unused.
 */
static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int temp[64];
    int sum = 0;
    int row, col, step, k;

    assert(h==8);

    /* horizontal pass: full 8-point transform of every row */
    for (row = 0; row < 8; row++) {
        int *line = temp + 8*row;

        for (k = 0; k < 8; k++)
            line[k] = src[stride*row + k];

        for (step = 1; step < 8; step <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & step)) {
                    const int a = line[k];
                    const int b = line[k + step];

                    line[k]        = a + b;
                    line[k + step] = a - b;
                }
            }
        }
    }

    /* vertical pass: the first two stages are done in place, the last
     * one is folded into the absolute sum */
    for (col = 0; col < 8; col++) {
        for (step = 1; step < 4; step <<= 1) {
            for (k = 0; k < 8; k++) {
                if (!(k & step)) {
                    const int a = temp[8*k + col];
                    const int b = temp[8*(k + step) + col];

                    temp[8*k + col]          = a + b;
                    temp[8*(k + step) + col] = a - b;
                }
            }
        }
        for (k = 0; k < 4; k++) {
            const int a = temp[8*k + col];
            const int b = temp[8*(k + 4) + col];

            sum += abs(a + b) + abs(a - b);
        }
    }

    sum -= abs(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
3678 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3679 MpegEncContext * const s= (MpegEncContext *)c;
3680 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3681 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3685 s->dsp.diff_pixels(temp, src1, src2, stride);
3687 return s->dsp.sum_abs_dctelem(temp);
#if CONFIG_GPL
/* One 8-point integer transform. The caller defines SRC(x) to read input
 * x and DST(x,v) to consume output x; all inputs are read before the
 * first DST is issued. */
#define DCT8_1D {\
    const int s07 = SRC(0) + SRC(7);\
    const int s16 = SRC(1) + SRC(6);\
    const int s25 = SRC(2) + SRC(5);\
    const int s34 = SRC(3) + SRC(4);\
    const int a0 = s07 + s34;\
    const int a1 = s16 + s25;\
    const int a2 = s07 - s34;\
    const int a3 = s16 - s25;\
    const int d07 = SRC(0) - SRC(7);\
    const int d16 = SRC(1) - SRC(6);\
    const int d25 = SRC(2) - SRC(5);\
    const int d34 = SRC(3) - SRC(4);\
    const int a4 = d16 + d25 + (d07 + (d07>>1));\
    const int a5 = d07 - d34 - (d25 + (d25>>1));\
    const int a6 = d07 + d34 - (d16 + (d16>>1));\
    const int a7 = d16 - d25 + (d34 + (d34>>1));\
    DST(0,  a0 + a1     ) ;\
    DST(1,  a4 + (a7>>2)) ;\
    DST(2,  a2 + (a3>>1)) ;\
    DST(3,  a5 + (a6>>2)) ;\
    DST(4,  a0 - a1     ) ;\
    DST(5,  a6 - (a5>>2)) ;\
    DST(6, (a2>>1) - a3 ) ;\
    DST(7, (a4>>2) - a7 ) ;\
}

/**
 * Sum of absolute coefficients of the DCT8_1D transform applied to the
 * rows and then the columns of the 8x8 difference src1 - src2.
 *
 * @param c MpegEncContext whose dsp.diff_pixels is used
 */
static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const ctx = (MpegEncContext *)c;
    DCTELEM dct[8][8];
    int sum = 0;
    int i;

    ctx->dsp.diff_pixels(dct[0], src1, src2, stride);

    /* first pass: transform row i in place */
#define SRC(x) dct[i][x]
#define DST(x,v) dct[i][x]= v
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    /* second pass: transform column i and accumulate |output| directly */
#define SRC(x) dct[x][i]
#define DST(x,v) sum += FFABS(v)
    for( i = 0; i < 8; i++ )
        DCT8_1D
#undef SRC
#undef DST

    return sum;
}
#endif
3743 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3744 MpegEncContext * const s= (MpegEncContext *)c;
3745 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3746 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3751 s->dsp.diff_pixels(temp, src1, src2, stride);
3755 sum= FFMAX(sum, FFABS(temp[i]));
3760 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3761 MpegEncContext * const s= (MpegEncContext *)c;
3762 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3763 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3764 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3770 s->dsp.diff_pixels(temp, src1, src2, stride);
3772 memcpy(bak, temp, 64*sizeof(DCTELEM));
3774 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3775 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3776 ff_simple_idct(temp); //FIXME
3779 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3784 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3785 MpegEncContext * const s= (MpegEncContext *)c;
3786 const uint8_t *scantable= s->intra_scantable.permutated;
3787 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3788 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3789 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3790 uint8_t * const bak= (uint8_t*)aligned_bak;
3791 int i, last, run, bits, level, distortion, start_i;
3792 const int esc_length= s->ac_esc_length;
3794 uint8_t * last_length;
3799 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3800 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3803 s->dsp.diff_pixels(temp, src1, src2, stride);
3805 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3811 length = s->intra_ac_vlc_length;
3812 last_length= s->intra_ac_vlc_last_length;
3813 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3816 length = s->inter_ac_vlc_length;
3817 last_length= s->inter_ac_vlc_last_length;
3822 for(i=start_i; i<last; i++){
3823 int j= scantable[i];
3828 if((level&(~127)) == 0){
3829 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3838 level= temp[i] + 64;
3842 if((level&(~127)) == 0){
3843 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3851 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3853 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3856 s->dsp.idct_add(bak, stride, temp);
3858 distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
3860 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
3863 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3864 MpegEncContext * const s= (MpegEncContext *)c;
3865 const uint8_t *scantable= s->intra_scantable.permutated;
3866 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3867 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3868 int i, last, run, bits, level, start_i;
3869 const int esc_length= s->ac_esc_length;
3871 uint8_t * last_length;
3875 s->dsp.diff_pixels(temp, src1, src2, stride);
3877 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3883 length = s->intra_ac_vlc_length;
3884 last_length= s->intra_ac_vlc_last_length;
3885 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3888 length = s->inter_ac_vlc_length;
3889 last_length= s->inter_ac_vlc_last_length;
3894 for(i=start_i; i<last; i++){
3895 int j= scantable[i];
3900 if((level&(~127)) == 0){
3901 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3910 level= temp[i] + 64;
3914 if((level&(~127)) == 0){
3915 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/**
 * Sum of absolute differences between vertically adjacent pixels of a
 * block 16 pixels wide and h rows high (h - 1 row pairs).
 * c and dummy are unused.
 */
static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below = s + stride;

        for (col = 0; col < 16; col++)
            score += abs(s[col] - below[col]);
        s += stride;
    }

    return score;
}
/**
 * Sum of absolute vertical differences of the residual s1 - s2 for a
 * block 16 pixels wide and h rows high (h - 1 row pairs). c is unused.
 */
static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++)
            score += abs(s1[col] - s2[col] - s1[col + stride] + s2[col + stride]);
        s1 += stride;
        s2 += stride;
    }

    return score;
}
#define SQ(a) ((a)*(a))
/**
 * Sum of squared differences between vertically adjacent pixels of a
 * block 16 pixels wide and h rows high (h - 1 row pairs).
 * c and dummy are unused.
 */
static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        const uint8_t *below = s + stride;

        for (col = 0; col < 16; col++)
            score += SQ(s[col] - below[col]);
        s += stride;
    }

    return score;
}
/**
 * Sum of squared vertical differences of the residual s1 - s2 for a
 * block 16 pixels wide and h rows high (h - 1 row pairs). c is unused.
 */
static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
    int score = 0;
    int row, col;

    for (row = 1; row < h; row++) {
        for (col = 0; col < 16; col++) {
            const int d = s1[col] - s2[col] - s1[col + stride] + s2[col + stride];

            score += d * d;
        }
        s1 += stride;
        s2 += stride;
    }

    return score;
}
/**
 * Sum of squared differences between an int8_t and an int16_t vector
 * of `size` elements.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size){
    int score = 0;
    int i;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];

        score += d * d;
    }
    return score;
}
3993 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3994 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3995 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3997 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3999 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4000 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4001 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4002 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/** In-place element-wise product: dst[i] = dst[i] * src[i] for i in [0, len). */
static void vector_fmul_c(float *dst, const float *src, int len){
    const float *end = src + len;

    while (src < end) {
        *dst *= *src;
        dst++;
        src++;
    }
}
/**
 * dst[i] = src0[i] * src1[len - 1 - i] for i in [0, len):
 * product of src0 with src1 read back to front.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    int i;

    for (i = 0; i < len; i++)
        dst[i] = src0[i] * src1[len - 1 - i];
}
/**
 * dst[i * step] = src0[i] * src1[i] + src2[i] + src3 for i in [0, len).
 * Only every step-th element of dst is written.
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;

    for (i = 0; i < len; i++, dst += step)
        *dst = src0[i] * src1[i] + src2[i] + src3;
}
/**
 * Windowed overlap of two len-sample inputs into 2*len outputs.
 * For k in [0, len), with hi = 2*len - 1 - k:
 *   dst[k]  = src0[k]*win[hi] - src1[len-1-k]*win[k]  + add_bias
 *   dst[hi] = src0[k]*win[k]  + src1[len-1-k]*win[hi] + add_bias
 *
 * @param win window with 2*len coefficients
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
    int k;

    for (k = 0; k < len; k++) {
        const int hi = 2*len - 1 - k;
        const float s0 = src0[k];
        const float s1 = src1[len - 1 - k];
        const float wi = win[k];
        const float wj = win[hi];

        dst[k]  = s0*wj - s1*wi + add_bias;
        dst[hi] = s0*wi + s1*wj + add_bias;
    }
}
/** dst[i] = src[i] * mul for i in [0, len): convert ints to scaled floats. */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    const int *end = src + len;

    while (src < end)
        *dst++ = *src++ * mul;
}
4044 static av_always_inline int float_to_int16_one(const float *src){
4045 int_fast32_t tmp = *(const int32_t*)src;
4047 tmp = (0x43c0ffff - tmp)>>31;
4048 // is this faster on some gcc/cpu combinations?
4049 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4052 return tmp - 0x8000;
4055 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4057 for(i=0; i<len; i++)
4058 dst[i] = float_to_int16_one(src+i);
4061 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4064 for(i=0; i<len; i++){
4065 dst[2*i] = float_to_int16_one(src[0]+i);
4066 dst[2*i+1] = float_to_int16_one(src[1]+i);
4069 for(c=0; c<channels; c++)
4070 for(i=0, j=c; i<len; i++, j+=channels)
4071 dst[j] = float_to_int16_one(src[c]+i);
/** v1[i] += v2[i] for the first `order` elements. */
static void add_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int i;

    for (i = 0; order--; i++)
        v1[i] += v2[i];
}
/** v1[i] -= v2[i] for the first `order` elements. */
static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
{
    int i;

    for (i = 0; order--; i++)
        v1[i] -= v2[i];
}
/**
 * Dot product of two int16 vectors of `order` elements, with every
 * individual product shifted right by `shift` before accumulation.
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int acc = 0;
    int i;

    for (i = 0; order--; i++)
        acc += (v1[i] * v2[i]) >> shift;

    return acc;
}
#define W0 2048
#define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
#define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
#define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
#define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
#define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
#define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
#define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */

/** 8-point inverse transform of one row (b[0..7]), in place. */
static void wmv2_idct_row(short * b)
{
    const int round = 1 << 7;
    /* step 1: rotations; every input is read before anything is written */
    const int a1 = W1*b[1] + W7*b[7];
    const int a7 = W7*b[1] - W1*b[7];
    const int a5 = W5*b[5] + W3*b[3];
    const int a3 = W3*b[5] - W5*b[3];
    const int a2 = W2*b[2] + W6*b[6];
    const int a6 = W6*b[2] - W2*b[6];
    const int a0 = W0*b[0] + W0*b[4];
    const int a4 = W0*b[0] - W0*b[4];
    /* step 2: the two odd terms scaled by 181/256 */
    const int s1 = (181*(a1 - a5 + a7 - a3) + 128) >> 8; //1,3,5,7,
    const int s2 = (181*(a1 - a5 - a7 + a3) + 128) >> 8;
    /* step 3: combine even and odd halves */
    const int even_outer = a0 + a2;
    const int even_inner = a0 - a2;
    const int even_sum   = a4 + a6;
    const int even_diff  = a4 - a6;
    const int odd_outer  = a1 + a5;
    const int odd_inner  = a7 + a3;

    b[0] = (even_outer + odd_outer + round) >> 8;
    b[1] = (even_sum   + s1        + round) >> 8;
    b[2] = (even_diff  + s2        + round) >> 8;
    b[3] = (even_inner + odd_inner + round) >> 8;
    b[4] = (even_inner - odd_inner + round) >> 8;
    b[5] = (even_diff  - s2        + round) >> 8;
    b[6] = (even_sum   - s1        + round) >> 8;
    b[7] = (even_outer - odd_outer + round) >> 8;
}

/** 8-point inverse transform of one column (b[0], b[8], ..., b[56]), in place. */
static void wmv2_idct_col(short * b)
{
    const int round = 1 << 13;
    /*step 1, with extended precision*/
    const int a1 = (W1*b[8*1] + W7*b[8*7] + 4) >> 3;
    const int a7 = (W7*b[8*1] - W1*b[8*7] + 4) >> 3;
    const int a5 = (W5*b[8*5] + W3*b[8*3] + 4) >> 3;
    const int a3 = (W3*b[8*5] - W5*b[8*3] + 4) >> 3;
    const int a2 = (W2*b[8*2] + W6*b[8*6] + 4) >> 3;
    const int a6 = (W6*b[8*2] - W2*b[8*6] + 4) >> 3;
    const int a0 = (W0*b[8*0] + W0*b[8*4]    ) >> 3;
    const int a4 = (W0*b[8*0] - W0*b[8*4]    ) >> 3;
    /* step 2: the two odd terms scaled by 181/256 */
    const int s1 = (181*(a1 - a5 + a7 - a3) + 128) >> 8;
    const int s2 = (181*(a1 - a5 - a7 + a3) + 128) >> 8;
    /* step 3: combine even and odd halves */
    const int even_outer = a0 + a2;
    const int even_inner = a0 - a2;
    const int even_sum   = a4 + a6;
    const int even_diff  = a4 - a6;
    const int odd_outer  = a1 + a5;
    const int odd_inner  = a7 + a3;

    b[8*0] = (even_outer + odd_outer + round) >> 14;
    b[8*1] = (even_sum   + s1        + round) >> 14;
    b[8*2] = (even_diff  + s2        + round) >> 14;
    b[8*3] = (even_inner + odd_inner + round) >> 14;

    b[8*4] = (even_inner - odd_inner + round) >> 14;
    b[8*5] = (even_diff  - s2        + round) >> 14;
    b[8*6] = (even_sum   - s1        + round) >> 14;
    b[8*7] = (even_outer - odd_outer + round) >> 14;
}

/** In-place 8x8 inverse transform: all rows first, then all columns. */
void ff_wmv2_idct_c(short * block){
    int row, col;

    for (row = 0; row < 8; row++)
        wmv2_idct_row(block + 8*row);

    for (col = 0; col < 8; col++)
        wmv2_idct_col(block + col);
}
4169 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4171 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4173 ff_wmv2_idct_c(block);
4174 put_pixels_clamped_c(block, dest, line_size);
4176 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4178 ff_wmv2_idct_c(block);
4179 add_pixels_clamped_c(block, dest, line_size);
4181 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4184 put_pixels_clamped_c(block, dest, line_size);
4186 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4189 add_pixels_clamped_c(block, dest, line_size);
4192 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4195 put_pixels_clamped4_c(block, dest, line_size);
4197 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4200 add_pixels_clamped4_c(block, dest, line_size);
4203 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4206 put_pixels_clamped2_c(block, dest, line_size);
4208 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4211 add_pixels_clamped2_c(block, dest, line_size);
4214 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4216 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4218 dest[0] = cm[(block[0] + 4)>>3];
4220 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4222 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4224 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4227 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4229 /* init static data */
4230 void dsputil_static_init(void)
4234 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4235 for(i=0;i<MAX_NEG_CROP;i++) {
4237 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4240 for(i=0;i<512;i++) {
4241 ff_squareTbl[i] = (i - 256) * (i - 256);
4244 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4247 int ff_check_alignment(void){
4248 static int did_fail=0;
4249 DECLARE_ALIGNED_16(int, aligned);
4251 if((long)&aligned & 15){
4253 #if HAVE_MMX || HAVE_ALTIVEC
4254 av_log(NULL, AV_LOG_ERROR,
4255 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4256 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4257 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4258 "Do not report crashes to FFmpeg developers.\n");
4267 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4271 ff_check_alignment();
4274 if(avctx->dct_algo==FF_DCT_FASTINT) {
4275 c->fdct = fdct_ifast;
4276 c->fdct248 = fdct_ifast248;
4278 else if(avctx->dct_algo==FF_DCT_FAAN) {
4279 c->fdct = ff_faandct;
4280 c->fdct248 = ff_faandct248;
4283 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4284 c->fdct248 = ff_fdct248_islow;
4286 #endif //CONFIG_ENCODERS
4288 if(avctx->lowres==1){
4289 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4290 c->idct_put= ff_jref_idct4_put;
4291 c->idct_add= ff_jref_idct4_add;
4293 c->idct_put= ff_h264_lowres_idct_put_c;
4294 c->idct_add= ff_h264_lowres_idct_add_c;
4296 c->idct = j_rev_dct4;
4297 c->idct_permutation_type= FF_NO_IDCT_PERM;
4298 }else if(avctx->lowres==2){
4299 c->idct_put= ff_jref_idct2_put;
4300 c->idct_add= ff_jref_idct2_add;
4301 c->idct = j_rev_dct2;
4302 c->idct_permutation_type= FF_NO_IDCT_PERM;
4303 }else if(avctx->lowres==3){
4304 c->idct_put= ff_jref_idct1_put;
4305 c->idct_add= ff_jref_idct1_add;
4306 c->idct = j_rev_dct1;
4307 c->idct_permutation_type= FF_NO_IDCT_PERM;
4309 if(avctx->idct_algo==FF_IDCT_INT){
4310 c->idct_put= ff_jref_idct_put;
4311 c->idct_add= ff_jref_idct_add;
4312 c->idct = j_rev_dct;
4313 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4314 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER || CONFIG_THEORA_DECODER ) &&
4315 avctx->idct_algo==FF_IDCT_VP3){
4316 c->idct_put= ff_vp3_idct_put_c;
4317 c->idct_add= ff_vp3_idct_add_c;
4318 c->idct = ff_vp3_idct_c;
4319 c->idct_permutation_type= FF_NO_IDCT_PERM;
4320 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4321 c->idct_put= ff_wmv2_idct_put_c;
4322 c->idct_add= ff_wmv2_idct_add_c;
4323 c->idct = ff_wmv2_idct_c;
4324 c->idct_permutation_type= FF_NO_IDCT_PERM;
4325 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4326 c->idct_put= ff_faanidct_put;
4327 c->idct_add= ff_faanidct_add;
4328 c->idct = ff_faanidct;
4329 c->idct_permutation_type= FF_NO_IDCT_PERM;
4330 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4331 c->idct_put= ff_ea_idct_put_c;
4332 c->idct_permutation_type= FF_NO_IDCT_PERM;
4333 }else{ //accurate/default
4334 c->idct_put= ff_simple_idct_put;
4335 c->idct_add= ff_simple_idct_add;
4336 c->idct = ff_simple_idct;
4337 c->idct_permutation_type= FF_NO_IDCT_PERM;
4341 if (CONFIG_H264_DECODER) {
4342 c->h264_idct_add= ff_h264_idct_add_c;
4343 c->h264_idct8_add= ff_h264_idct8_add_c;
4344 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4345 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4346 c->h264_idct_add16 = ff_h264_idct_add16_c;
4347 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4348 c->h264_idct_add8 = ff_h264_idct_add8_c;
4349 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4352 c->get_pixels = get_pixels_c;
4353 c->diff_pixels = diff_pixels_c;
4354 c->put_pixels_clamped = put_pixels_clamped_c;
4355 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4356 c->add_pixels_clamped = add_pixels_clamped_c;
4357 c->add_pixels8 = add_pixels8_c;
4358 c->add_pixels4 = add_pixels4_c;
4359 c->sum_abs_dctelem = sum_abs_dctelem_c;
4362 c->clear_block = clear_block_c;
4363 c->clear_blocks = clear_blocks_c;
4364 c->pix_sum = pix_sum_c;
4365 c->pix_norm1 = pix_norm1_c;
4367 /* TODO [0] 16 [1] 8 */
4368 c->pix_abs[0][0] = pix_abs16_c;
4369 c->pix_abs[0][1] = pix_abs16_x2_c;
4370 c->pix_abs[0][2] = pix_abs16_y2_c;
4371 c->pix_abs[0][3] = pix_abs16_xy2_c;
4372 c->pix_abs[1][0] = pix_abs8_c;
4373 c->pix_abs[1][1] = pix_abs8_x2_c;
4374 c->pix_abs[1][2] = pix_abs8_y2_c;
4375 c->pix_abs[1][3] = pix_abs8_xy2_c;
4377 #define dspfunc(PFX, IDX, NUM) \
4378 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4379 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4380 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4381 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4383 dspfunc(put, 0, 16);
4384 dspfunc(put_no_rnd, 0, 16);
4386 dspfunc(put_no_rnd, 1, 8);
4390 dspfunc(avg, 0, 16);
4391 dspfunc(avg_no_rnd, 0, 16);
4393 dspfunc(avg_no_rnd, 1, 8);
4398 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4399 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4401 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4402 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4403 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4404 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4405 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4406 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4407 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4408 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4409 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4411 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4412 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4413 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4414 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4415 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4416 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4417 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4418 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4419 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4421 #define dspfunc(PFX, IDX, NUM) \
4422 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4423 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4424 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4425 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4426 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4427 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4428 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4429 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4430 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4431 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4432 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4433 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4434 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4435 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4436 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4437 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4439 dspfunc(put_qpel, 0, 16);
4440 dspfunc(put_no_rnd_qpel, 0, 16);
4442 dspfunc(avg_qpel, 0, 16);
4443 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4445 dspfunc(put_qpel, 1, 8);
4446 dspfunc(put_no_rnd_qpel, 1, 8);
4448 dspfunc(avg_qpel, 1, 8);
4449 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4451 dspfunc(put_h264_qpel, 0, 16);
4452 dspfunc(put_h264_qpel, 1, 8);
4453 dspfunc(put_h264_qpel, 2, 4);
4454 dspfunc(put_h264_qpel, 3, 2);
4455 dspfunc(avg_h264_qpel, 0, 16);
4456 dspfunc(avg_h264_qpel, 1, 8);
4457 dspfunc(avg_h264_qpel, 2, 4);
4460 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4461 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4462 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4463 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4464 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4465 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4466 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
4468 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4469 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4470 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4471 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4472 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4473 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4474 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4475 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4476 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4477 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4478 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4479 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4480 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4481 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4482 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4483 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4484 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4485 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4486 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4487 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4489 c->draw_edges = draw_edges_c;
4491 #if CONFIG_CAVS_DECODER
4492 ff_cavsdsp_init(c,avctx);
4494 #if CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4495 ff_vc1dsp_init(c,avctx);
4497 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER || CONFIG_WMV3_DECODER
4498 ff_intrax8dsp_init(c,avctx);
4500 #if CONFIG_H264_ENCODER
4501 ff_h264dspenc_init(c,avctx);
4503 #if CONFIG_RV30_DECODER
4504 ff_rv30dsp_init(c,avctx);
4506 #if CONFIG_RV40_DECODER
4507 ff_rv40dsp_init(c,avctx);
4508 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4509 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4510 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4511 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4514 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4515 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4516 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4517 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4518 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4519 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4520 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4521 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4523 #define SET_CMP_FUNC(name) \
4524 c->name[0]= name ## 16_c;\
4525 c->name[1]= name ## 8x8_c;
4527 SET_CMP_FUNC(hadamard8_diff)
4528 c->hadamard8_diff[4]= hadamard8_intra16_c;
4529 SET_CMP_FUNC(dct_sad)
4530 SET_CMP_FUNC(dct_max)
4532 SET_CMP_FUNC(dct264_sad)
4534 c->sad[0]= pix_abs16_c;
4535 c->sad[1]= pix_abs8_c;
4539 SET_CMP_FUNC(quant_psnr)
4542 c->vsad[0]= vsad16_c;
4543 c->vsad[4]= vsad_intra16_c;
4544 c->vsse[0]= vsse16_c;
4545 c->vsse[4]= vsse_intra16_c;
4546 c->nsse[0]= nsse16_c;
4547 c->nsse[1]= nsse8_c;
4548 #if CONFIG_SNOW_ENCODER
4549 c->w53[0]= w53_16_c;
4551 c->w97[0]= w97_16_c;
4555 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4557 c->add_bytes= add_bytes_c;
4558 c->add_bytes_l2= add_bytes_l2_c;
4559 c->diff_bytes= diff_bytes_c;
4560 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4561 c->bswap_buf= bswap_buf;
4562 #if CONFIG_PNG_DECODER
4563 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4566 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4567 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4568 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4569 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4570 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4571 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4572 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4573 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4574 c->h264_loop_filter_strength= NULL;
4576 if (CONFIG_ANY_H263) {
4577 c->h263_h_loop_filter= h263_h_loop_filter_c;
4578 c->h263_v_loop_filter= h263_v_loop_filter_c;
4581 if (CONFIG_VP3_DECODER || CONFIG_THEORA_DECODER) {
4582 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4583 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4586 c->h261_loop_filter= h261_loop_filter_c;
4588 c->try_8x8basis= try_8x8basis_c;
4589 c->add_8x8basis= add_8x8basis_c;
4591 #if CONFIG_SNOW_DECODER
4592 c->vertical_compose97i = ff_snow_vertical_compose97i;
4593 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4594 c->inner_add_yblock = ff_snow_inner_add_yblock;
4597 #if CONFIG_VORBIS_DECODER
4598 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4600 #if CONFIG_AC3_DECODER
4601 c->ac3_downmix = ff_ac3_downmix_c;
4603 #if CONFIG_FLAC_ENCODER
4604 c->flac_compute_autocorr = ff_flac_compute_autocorr;
4606 c->vector_fmul = vector_fmul_c;
4607 c->vector_fmul_reverse = vector_fmul_reverse_c;
4608 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4609 c->vector_fmul_window = ff_vector_fmul_window_c;
4610 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4611 c->float_to_int16 = ff_float_to_int16_c;
4612 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4613 c->add_int16 = add_int16_c;
4614 c->sub_int16 = sub_int16_c;
4615 c->scalarproduct_int16 = scalarproduct_int16_c;
4617 c->shrink[0]= ff_img_copy_plane;
4618 c->shrink[1]= ff_shrink22;
4619 c->shrink[2]= ff_shrink44;
4620 c->shrink[3]= ff_shrink88;
4622 c->prefetch= just_return;
4624 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4625 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4627 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4628 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4629 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4630 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4631 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4632 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4633 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4634 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4635 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4637 for(i=0; i<64; i++){
4638 if(!c->put_2tap_qpel_pixels_tab[0][i])
4639 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4640 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4641 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4644 switch(c->idct_permutation_type){
4645 case FF_NO_IDCT_PERM:
4647 c->idct_permutation[i]= i;
4649 case FF_LIBMPEG2_IDCT_PERM:
4651 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4653 case FF_SIMPLE_IDCT_PERM:
4655 c->idct_permutation[i]= simple_mmx_permutation[i];
4657 case FF_TRANSPOSE_IDCT_PERM:
4659 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4661 case FF_PARTTRANS_IDCT_PERM:
4663 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4665 case FF_SSE2_IDCT_PERM:
4667 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4670 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");