/*
 * DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

/**
 * @file libavcodec/dsputil.c
 * DSP utils
 */
#include "avcodec.h"
#include "dsputil.h"
#include "simple_idct.h"
#include "mpegvideo.h"

/* snow.c */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis.c */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* ac3dec.c */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* flacenc.c */
void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* pngdec.c */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* eaidct.c */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
uint32_t ff_squareTbl[512] = {0, };

// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
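/* Worked example, for illustration only (not in the original source):
 * ~0UL/255 is 0x01010101 on a 32-bit target and 0x0101010101010101 on a
 * 64-bit one, so pb_7f/pb_80 replicate the byte 0x7f/0x80 into every byte
 * lane of the native word. */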
const uint8_t ff_zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
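/* Illustrative sketch, not part of the original source: the table maps scan
 * position to raster index inside an 8x8 block, so de-zigzagging is just
 *
 *     for (i = 0; i < 64; i++)
 *         block[ff_zigzag_direct[i]] = coeff_in_scan_order[i];
 *
 * e.g. scan position 2 lands on raster index 8 (row 1, column 0); the scan
 * walks anti-diagonals so low-frequency coefficients come first. */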
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
};
/* not permuted inverse zigzag_direct + 1 for MMX quantizer */
DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16)[64];
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
/* a*inverse[b]>>32 == a/b for all 0 <= a <= 16909558 && 2 <= b <= 256
 * for a > 16909558, this is an overestimate by less than 1 part in 1<<24 */
const uint32_t ff_inverse[257]={
            0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
    536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
    268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
    178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
    134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
    107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
     89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
     76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
     67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
     59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
     53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
     48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
     44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
     41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
     38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
     35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
     33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
     31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
     29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
     28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
     26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
     25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
     24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
     23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
     22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
     21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
     20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
     19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
     19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
     18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
     17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
     17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
     16777216,
};
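/* Illustrative sketch (not part of the original source) of how ff_inverse
 * replaces an integer division by a multiply and a shift:
 *
 *     // q = a / b without a division instruction, valid in the range
 *     // documented above (2 <= b <= 256)
 *     uint32_t fast_div(uint32_t a, unsigned b) {
 *         return ((uint64_t)a * ff_inverse[b]) >> 32;
 *     }
 *
 * e.g. a=100, b=3: 100 * 1431655766 = 143165576600; >>32 gives 33. */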
/* Input permutation for the simple_idct_mmx */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};

static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    int i, j, end;

    st->scantable= src_scantable;

    for(i=0; i<64; i++){
        j = src_scantable[i];
        st->permutated[i] = permutation[j];
    }

    end= -1;
    for(i=0; i<64; i++){
        j = st->permutated[i];
        if(j > end) end= j;
        st->raster_end[i]= end;
    }
}
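/* Illustrative note (an assumption, not stated in this excerpt): after init,
 * st->permutated[] is the scan reordered for the active IDCT's input
 * permutation, and st->raster_end[i] is the highest permuted index seen in
 * the first i+1 scan positions, which lets decoders bound the nonzero area
 * of a partially coded block. */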
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            s += pix[0];
            s += pix[1];
            s += pix[2];
            s += pix[3];
            s += pix[4];
            s += pix[5];
            s += pix[6];
            s += pix[7];
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
            register uint64_t x=*(uint64_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
#else
            register uint32_t x=*(uint32_t*)pix;
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[x&0xff];
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
#endif
            pix += 8;
        }
        pix += line_size - 16;
    }
    return s;
}
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    for(;i<w; i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint32_t *sq = ff_squareTbl + 256;

    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];

        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
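/* Illustrative note (not from the original excerpt): the sse{4,8,16}
 * functions return the sum of squared differences over a block of the given
 * width and h rows. sq points 256 entries into ff_squareTbl, which is
 * expected to be filled (in the init code, outside this excerpt) so that
 * sq[d] == d*d for d in [-255, 255], turning each squared error into a
 * table lookup. */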
#if CONFIG_SNOW_ENCODER //dwt is in snow.c
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    int s, i, j;
    const int dec_count= w==8 ? 3 : 4;
    int tmp[32*32];
    int level, ori;
    static const int scale[2][2][4][4]={
        // 9/7 8x8 dec=3
        {268, 239, 239, 213},
        // 9/7 16x16 or 32x32 dec=4
        {344, 310, 310, 280},
        // 5/3 8x8 dec=3
        {275, 245, 245, 218},
        // 5/3 16x16 or 32x32 dec=4
        {352, 317, 317, 286},
    };

    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
        }
        pix1 += line_size;
        pix2 += line_size;
    }

    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);

    s=0;
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;

            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
                    s += FFABS(v);
                }
            }
        }
    }
    return s>>9;
}
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
}

static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
}

static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
}

static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
}

int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
}

int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
}
#endif
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    /* top and bottom */
    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }
    /* left and right */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }
    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
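/* Illustrative note (not from the original excerpt): padding the picture
 * with replicated border pixels lets unrestricted motion vectors read
 * "outside" the frame without per-pixel clipping in the inner MC loops. */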
/**
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
 */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                         int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
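/* Usage sketch (illustrative only; the parameter values are hypothetical):
 * when a motion vector points partly outside the decoded frame, first build
 * a padded copy of the needed area, then run the normal MC code on it:
 *
 *     ff_emulated_edge_mc(edge_buf, src_ptr, linesize, 17, 17,
 *                         src_x, src_y, h_edge_pos, v_edge_pos);
 *     src_ptr = edge_buf;
 */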
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
{
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = pixels[0];
        block[1] = pixels[1];
        block[2] = pixels[2];
        block[3] = pixels[3];
        block[4] = pixels[4];
        block[5] = pixels[5];
        block[6] = pixels[6];
        block[7] = pixels[7];
        pixels += line_size;
        block += 8;
    }
}
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    int i;

    /* read the pixels */
    for(i=0;i<8;i++) {
        block[0] = s1[0] - s2[0];
        block[1] = s1[1] - s2[1];
        block[2] = s1[2] - s2[2];
        block[3] = s1[3] - s2[3];
        block[4] = s1[4] - s2[4];
        block[5] = s1[5] - s2[5];
        block[6] = s1[6] - s2[6];
        block[7] = s1[7] - s2[7];
        s1 += stride;
        s2 += stride;
        block += 8;
    }
}
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];
        pixels[4] = cm[block[4]];
        pixels[5] = cm[block[5]];
        pixels[6] = cm[block[6]];
        pixels[7] = cm[block[7]];

        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];
        pixels[2] = cm[block[2]];
        pixels[3] = cm[block[3]];

        pixels += line_size;
        block += 8;
    }
}
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[block[0]];
        pixels[1] = cm[block[1]];

        pixels += line_size;
        block += 8;
    }
}
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
                                        int line_size)
{
    int i, j;

    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            if (*block < -128)
                *pixels = 0;
            else if (*block > 127)
                *pixels = 255;
            else
                *pixels = (uint8_t)(*block + 128);
            block++;
            pixels++;
        }
        pixels += (line_size - 8);
    }
}
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
                                 int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<8;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels[4] = cm[pixels[4] + block[4]];
        pixels[5] = cm[pixels[5] + block[5]];
        pixels[6] = cm[pixels[6] + block[6]];
        pixels[7] = cm[pixels[7] + block[7]];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<4;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels[2] = cm[pixels[2] + block[2]];
        pixels[3] = cm[pixels[3] + block[3]];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
                                  int line_size)
{
    int i;
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;

    /* read the pixels */
    for(i=0;i<2;i++) {
        pixels[0] = cm[pixels[0] + block[0]];
        pixels[1] = cm[pixels[1] + block[1]];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<8;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels[4] += block[4];
        pixels[5] += block[5];
        pixels[6] += block[6];
        pixels[7] += block[7];
        pixels += line_size;
        block += 8;
    }
}
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
{
    int i;

    for(i=0;i<4;i++) {
        pixels[0] += block[0];
        pixels[1] += block[1];
        pixels[2] += block[2];
        pixels[3] += block[3];
        pixels += line_size;
        block += 4;
    }
}
static int sum_abs_dctelem_c(DCTELEM *block)
{
    int sum=0, i;

    for(i=0; i<64; i++)
        sum+= FFABS(block[i]);
    return sum;
}
#if 0
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels          );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        }\
\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels  );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0=  (a&0x0303030303030303ULL)\
                    + (b&0x0303030303030303ULL)\
                    + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels  );\
            uint64_t b= AV_RN64(pixels+1);\
            l1=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            a= AV_RN64(pixels  );\
            b= AV_RN64(pixels+1);\
            l0=  (a&0x0303030303030303ULL)\
               + (b&0x0303030303030303ULL)\
               + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
        }\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)

#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint16_t*)(block  )), AV_RN16(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    int i;\
    for(i=0; i<h; i++){\
        OP(*((uint32_t*)(block  )), AV_RN32(pixels  ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
        pixels+=line_size;\
        block +=line_size;\
    }\
}\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN32(&src1[i*src_stride1  ]);\
        b= AV_RN32(&src2[i*src_stride2  ]);\
        OP(*((uint32_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                         int src_stride1, int src_stride2, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a,b;\
        a= AV_RN16(&src1[i*src_stride1  ]);\
        b= AV_RN16(&src2[i*src_stride2  ]);\
        OP(*((uint16_t*)&dst[i*dst_stride  ]), rnd_avg32(a, b));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                          int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                 int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x02020202UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    int i;\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0=  (a&0x03030303UL)\
           + (b&0x03030303UL)\
           + 0x01010101UL;\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1=  (c&0x03030303UL)\
           + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
    }\
}\
\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
}\
\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i, a0, b0, a1, b1;\
        a0= pixels[0];\
        b0= pixels[1] + 2;\
        a0 += b0;\
        b0 += pixels[2];\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            a1= pixels[0];\
            b1= pixels[1];\
            a1 += b1;\
            b1 += pixels[2];\
\
            block[0]= (a1+a0)>>2; /* FIXME non put */\
            block[1]= (b1+b0)>>2;\
\
            pixels+=line_size;\
            block +=line_size;\
\
            a0= pixels[0];\
            b0= pixels[1] + 2;\
            a0 += b0;\
            b0 += pixels[2];\
\
            block[0]= (a1+a0)>>2;\
            block[1]= (b1+b0)>>2;\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
}\
\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x02020202UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x02020202UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int j;\
    for(j=0; j<2; j++){\
        int i;\
        const uint32_t a= AV_RN32(pixels  );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0=  (a&0x03030303UL)\
                    + (b&0x03030303UL)\
                    + 0x01010101UL;\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        uint32_t l1,h1;\
\
        pixels+=line_size;\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels  );\
            uint32_t b= AV_RN32(pixels+1);\
            l1=  (a&0x03030303UL)\
               + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
            a= AV_RN32(pixels  );\
            b= AV_RN32(pixels+1);\
            l0=  (a&0x03030303UL)\
               + (b&0x03030303UL)\
               + 0x01010101UL;\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            pixels+=line_size;\
            block +=line_size;\
        }\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
    }\
}\
\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels8_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c    , OPNAME ## _pixels8_c        , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)
#define op_avg(a, b) a = rnd_avg32(a, b)
#endif
#define op_put(a, b) a = b

PIXOP2(avg, op_avg)
PIXOP2(put, op_put)
#undef op_avg
#undef op_put
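/* Illustrative note (not from the original excerpt): rnd_avg32 averages four
 * bytes at once without unpacking, using the identity
 *     (a+b+1)>>1 == (a|b) - (((a^b) & 0xfefefefe)>>1)
 * per byte lane; masking the xor with 0xfefefefe stops a carry in one lane
 * from leaking into the next. The no_rnd form (a&b) + (((a^b)&0xfefefefe)>>1)
 * computes (a+b)>>1, i.e. rounding down. */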
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}

static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
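/* Illustrative note (not from the original excerpt): gmc1 does bilinear
 * interpolation at a 1/16-pel offset (x16,y16); the weights always satisfy
 * A+B+C+D == 256, hence the >>8. E.g. x16==y16==8 gives A=B=C=D=64, the
 * plain four-sample average. */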
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
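/* Illustrative note (not from the original excerpt): ff_gmc_c walks an
 * affine motion field -- (vx,vy) advances by (dxx,dyx) per output pixel and
 * (ox,oy) by (dxy,dyy) per line -- sampling with bilinear weights and
 * clipping against the picture edges when the transformed position falls
 * outside. */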
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}

static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
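/* Illustrative note (not from the original excerpt): these are thirdpel
 * interpolation filters. 683 ~= 2^11/3 and 2731 ~= 2^15/12, so e.g.
 * (683*(2*a + b + 1))>>11 approximates (2*a + b)/3 with rounding, and the
 * mc11..mc22 cases weight the four neighbours by (4,3,3,2)/12 and its
 * rotations. */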
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}

static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}

static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}

#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)

H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
#undef op_avg
#undef op_put
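/* Illustrative note (not from the original excerpt): A, B, C and D are the
 * bilinear weights for an eighth-pel offset (x,y); they always sum to 64,
 * which op_put undoes with (b+32)>>6. When x==0 or y==0, D is zero and the
 * 2-D filter degenerates to the 1-D form in the else branch, with E=B+C and
 * step selecting the vertical or horizontal neighbour. */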
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        src += stride;
        dst += stride;
    }
}

static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        src += stride;
        dst += stride;
    }
}
#define QPEL_MC(r, OPNAME, RND, OP) \
static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
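/* Illustrative note (not from the original excerpt): this is the MPEG-4     \
 * half-pel FIR (-1, 3, -6, 20, 20, -6, 3, -1)/32 written as paired taps;    \
 * near the block edges the outer taps fold back (e.g. src[8] appears        \
 * twice) instead of reading past the 8+1 available samples. The OP macro    \
 * is expected to add the rounding constant and clamp via cm[] (defined      \
 * where QPEL_MC is instantiated, outside this excerpt). */\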
static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<h; i++)\
    {\
        OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
        OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
        OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
        OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
        OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
        OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
        OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
        OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
        OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
        OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
        OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
        OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
        OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
        OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
        OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
        OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=16;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    \
    for(i=0; i<w; i++)\
    {\
        const int src0= src[0*srcStride];\
        const int src1= src[1*srcStride];\
        const int src2= src[2*srcStride];\
        const int src3= src[3*srcStride];\
        const int src4= src[4*srcStride];\
        const int src5= src[5*srcStride];\
        const int src6= src[6*srcStride];\
        const int src7= src[7*srcStride];\
        const int src8= src[8*srcStride];\
        const int src9= src[9*srcStride];\
        const int src10= src[10*srcStride];\
        const int src11= src[11*srcStride];\
        const int src12= src[12*srcStride];\
        const int src13= src[13*srcStride];\
        const int src14= src[14*srcStride];\
        const int src15= src[15*srcStride];\
        const int src16= src[16*srcStride];\
        OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
        OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
        OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
        OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
        OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
        OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
        OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
        OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
        OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
        OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
        OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
        OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
        OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
        OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
        OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
        OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
        dst++;\
        src++;\
    }\
}\
static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_c(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    copy_block9(full, src, 16, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
}\
\
static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t half[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
    OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
}\
void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
}\
static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
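/* Illustrative walk-through (not in the original source) of the mc11        \
 * quarter-pel case above: copy_block9 grabs a 9x9 source patch into full,   \
 * the h_lowpass plus pixels8_l2 pair builds the horizontal half-pel plane   \
 * averaged with the source, v_lowpass filters that vertically into halfHV,  \
 * and the final pixels8_l2 averages the two planes into dst. The            \
 * ff_*_old_c variants instead average four intermediate planes at once      \
 * with pixels8_l4. */\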
1913 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t full[16*9];\
1917 uint8_t halfHV[64];\
1918 copy_block9(full, src, 16, stride, 9);\
1919 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1920 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1921 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1922 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1924 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1925 uint8_t full[16*9];\
1927 uint8_t halfHV[64];\
1928 copy_block9(full, src, 16, stride, 9);\
1929 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1930 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1931 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1932 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1934 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[16*9];\
1938 uint8_t halfHV[64];\
1939 copy_block9(full, src, 16, stride, 9);\
1940 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1941 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1942 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1943 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1945 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1946 uint8_t full[16*9];\
1948 uint8_t halfHV[64];\
1949 copy_block9(full, src, 16, stride, 9);\
1950 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1951 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1952 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1955 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[16*9];\
1959 uint8_t halfHV[64];\
1960 copy_block9(full, src, 16, stride, 9);\
1961 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1962 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1963 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1964 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1966 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[16*9];\
1969 uint8_t halfHV[64];\
1970 copy_block9(full, src, 16, stride, 9);\
1971 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1972 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1973 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1974 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    uint8_t halfHV[64];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    uint8_t halfV[64];\
    uint8_t halfHV[64];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
    put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
}\
\
static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[16*9];\
    uint8_t halfH[72];\
    copy_block9(full, src, 16, stride, 9);\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
    put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[72];\
    put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
}\
\
static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_c(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    copy_block17(full, src, 24, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
}\
\
static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t half[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
    OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    uint8_t halfHV[256];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
\
void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    uint8_t halfV[256];\
    uint8_t halfHV[256];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
    put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
}\
\
static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[24*17];\
    uint8_t halfH[272];\
    copy_block17(full, src, 24, stride, 17);\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
    put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t halfH[272];\
    put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
}\

#define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
#define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
#define op_put(a, b) a = cm[((b) + 16)>>5]
#define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]

QPEL_MC(0, put_       , _       , op_put)
QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
QPEL_MC(0, avg_       , _       , op_avg)
//QPEL_MC(1, avg_no_rnd , _       , op_avg)
#undef op_avg
#undef op_avg_no_rnd
#undef op_put
#undef op_put_no_rnd
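
/* The MPEG-4 quarter-pel functions above are built from the 8-tap half-pel
 * filter with coefficients (20, -6, 3, -1) on each side of the edge; the taps
 * sum to 32, so op_put normalizes with round-to-nearest, ((b) + 16) >> 5,
 * while op_put_no_rnd rounds down with ((b) + 15) >> 5 for the bitstream's
 * no-rounding mode.  Worked example on a flat row of value 100:
 * b = (100+100)*20 - (100+100)*6 + (100+100)*3 - (100+100) = 3200 and
 * (3200 + 16) >> 5 = 100, so flat areas pass through unchanged. */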

#define H264_LOWPASS(OPNAME, OP, OP2) \
static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        dst++;\
        src++;\
    }\
}\
\
static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=2;\
    const int w=2;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        dst++;\
        tmp++;\
    }\
}\
static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=4;\
    const int w=4;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int h=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<h; i++)\
    {\
        OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
        OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
        OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
        OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
        OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
        OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
        OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
        OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
\
static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    for(i=0; i<w; i++)\
    {\
        const int srcB= src[-2*srcStride];\
        const int srcA= src[-1*srcStride];\
        const int src0= src[0 *srcStride];\
        const int src1= src[1 *srcStride];\
        const int src2= src[2 *srcStride];\
        const int src3= src[3 *srcStride];\
        const int src4= src[4 *srcStride];\
        const int src5= src[5 *srcStride];\
        const int src6= src[6 *srcStride];\
        const int src7= src[7 *srcStride];\
        const int src8= src[8 *srcStride];\
        const int src9= src[9 *srcStride];\
        const int src10=src[10*srcStride];\
        OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
        OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
        OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
        OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
        OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
        OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
        OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
        OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
        dst++;\
        src++;\
    }\
}\
\
static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    const int h=8;\
    const int w=8;\
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
    int i;\
    src -= 2*srcStride;\
    for(i=0; i<h+5; i++)\
    {\
        tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
        tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
        tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
        tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
        tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
        tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
        tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
        tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
        tmp+=tmpStride;\
        src+=srcStride;\
    }\
    tmp -= tmpStride*(h+5-2);\
    for(i=0; i<w; i++)\
    {\
        const int tmpB= tmp[-2*tmpStride];\
        const int tmpA= tmp[-1*tmpStride];\
        const int tmp0= tmp[0 *tmpStride];\
        const int tmp1= tmp[1 *tmpStride];\
        const int tmp2= tmp[2 *tmpStride];\
        const int tmp3= tmp[3 *tmpStride];\
        const int tmp4= tmp[4 *tmpStride];\
        const int tmp5= tmp[5 *tmpStride];\
        const int tmp6= tmp[6 *tmpStride];\
        const int tmp7= tmp[7 *tmpStride];\
        const int tmp8= tmp[8 *tmpStride];\
        const int tmp9= tmp[9 *tmpStride];\
        const int tmp10=tmp[10*tmpStride];\
        OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
        OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
        OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
        OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
        OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
        OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
        OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
        OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
        dst++;\
        tmp++;\
    }\
}\
\
static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_v_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_h_lowpass(dst  , src  , dstStride, srcStride);\
    OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
}\
\
static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## h264_qpel8_hv_lowpass(dst  , tmp  , src  , dstStride, tmpStride, srcStride);\
    OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
}\

#define H264_MC(OPNAME, SIZE) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t half[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t half[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfH[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
    uint8_t full[SIZE*(SIZE+5)];\
    uint8_t * const full_mid= full + SIZE*2;\
    int16_t tmp[SIZE*(SIZE+5)];\
    uint8_t halfV[SIZE*SIZE];\
    uint8_t halfHV[SIZE*SIZE];\
    copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
    put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
    put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
    OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
}\

#define op_avg(a, b)  a = (((a)+cm[((b) + 16)>>5]+1)>>1)
//#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
#define op_put(a, b)  a = cm[((b) + 16)>>5]
#define op2_avg(a, b)  a = (((a)+cm[((b) + 512)>>10]+1)>>1)
#define op2_put(a, b)  a = cm[((b) + 512)>>10]

H264_LOWPASS(put_       , op_put, op2_put)
H264_LOWPASS(avg_       , op_avg, op2_avg)
H264_MC(put_, 4)
H264_MC(put_, 8)
H264_MC(put_, 16)
H264_MC(avg_, 4)
H264_MC(avg_, 8)
H264_MC(avg_, 16)

#undef op_avg
#undef op_put
#undef op2_avg
#undef op2_put
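
/* H264_LOWPASS above implements H.264's 6-tap half-pel filter
 * (1, -5, 20, 20, -5, 1)/32.  The one-dimensional passes normalize with
 * op_put's ((b) + 16) >> 5; the two-dimensional hv case keeps the first pass
 * unscaled in the int16_t tmp buffer and divides only once at the end with
 * op2_put's ((b) + 512) >> 10 (i.e. by 32*32), avoiding a double rounding
 * error.  Worked example on a flat area of value 100: the first pass yields
 * tmp values of 3200, the second pass 16*6400 = 102400, and
 * (102400 + 512) >> 10 = 100.  In the _mcXY_c names generated by H264_MC,
 * X and Y are the quarter-pel phases (0..3) of the motion vector in the
 * horizontal and vertical direction; mc00 is the full-pel copy. */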
#define op_scale1(x)  block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x)  dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
#define H264_WEIGHT(W,H) \
static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
    int y; \
    offset <<= log2_denom; \
    if(log2_denom) offset += 1<<(log2_denom-1); \
    for(y=0; y<H; y++, block += stride){ \
        op_scale1(0); \
        op_scale1(1); \
        if(W==2) continue; \
        op_scale1(2); \
        op_scale1(3); \
        if(W==4) continue; \
        op_scale1(4); \
        op_scale1(5); \
        op_scale1(6); \
        op_scale1(7); \
        if(W==8) continue; \
        op_scale1(8); \
        op_scale1(9); \
        op_scale1(10); \
        op_scale1(11); \
        op_scale1(12); \
        op_scale1(13); \
        op_scale1(14); \
        op_scale1(15); \
    } \
} \
static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
    int y; \
    offset = ((offset + 1) | 1) << log2_denom; \
    for(y=0; y<H; y++, dst += stride, src += stride){ \
        op_scale2(0); \
        op_scale2(1); \
        if(W==2) continue; \
        op_scale2(2); \
        op_scale2(3); \
        if(W==4) continue; \
        op_scale2(4); \
        op_scale2(5); \
        op_scale2(6); \
        op_scale2(7); \
        if(W==8) continue; \
        op_scale2(8); \
        op_scale2(9); \
        op_scale2(10); \
        op_scale2(11); \
        op_scale2(12); \
        op_scale2(13); \
        op_scale2(14); \
        op_scale2(15); \
    } \
}

H264_WEIGHT(16,16)
H264_WEIGHT(16,8)
H264_WEIGHT(8,16)
H264_WEIGHT(8,8)
H264_WEIGHT(8,4)
H264_WEIGHT(4,8)
H264_WEIGHT(4,4)
H264_WEIGHT(4,2)
H264_WEIGHT(2,4)
H264_WEIGHT(2,2)

#undef op_scale1
#undef op_scale2
#undef H264_WEIGHT
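
/* H264_WEIGHT implements H.264 weighted prediction: op_scale1 computes
 * clip((sample*weight + offset') >> log2_denom) with the rounding constant
 * folded into offset' by the prologue, and op_scale2 blends two references
 * as clip((src*weights + dst*weightd + offset') >> (log2_denom+1)).
 * Worked example for op_scale1 with sample 100, weight 3, offset 4,
 * log2_denom 2: offset' = (4<<2) + (1<<1) = 18 and (100*3 + 18) >> 2 = 79. */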
static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<h; i++){
        dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
        dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
        dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
        dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
        dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
        dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
        dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
        dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
        dst+=dstStride;
        src+=srcStride;
    }
}
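
/* WMV2 half-pel interpolation uses the 4-tap filter (-1, 9, 9, -1)/16 with
 * round-to-nearest: dst = (9*(a+b) - (c+d) + 8) >> 4.  On a flat row of
 * value 100 this gives (9*200 - 200 + 8) >> 4 = 100, so DC is preserved. */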

#if CONFIG_CAVS_DECODER
/* AVS specific */
void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
#endif /* CONFIG_CAVS_DECODER */

void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_VC1_DECODER
void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
#endif /* CONFIG_VC1_DECODER */

void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);

void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);

#if CONFIG_RV30_DECODER
void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV30_DECODER */

#if CONFIG_RV40_DECODER
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}

void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
#endif /* CONFIG_RV40_DECODER */

static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    int i;

    for(i=0; i<w; i++){
        const int src_1= src[ -srcStride];
        const int src0 = src[0          ];
        const int src1 = src[  srcStride];
        const int src2 = src[2*srcStride];
        const int src3 = src[3*srcStride];
        const int src4 = src[4*srcStride];
        const int src5 = src[5*srcStride];
        const int src6 = src[6*srcStride];
        const int src7 = src[7*srcStride];
        const int src8 = src[8*srcStride];
        const int src9 = src[9*srcStride];
        dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
        dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
        dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
        dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
        dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
        dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
        dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
        dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
        src++;
        dst++;
    }
}

static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}

static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int x;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(x=0; x<8; x++){
        int d1, d2, ad1;
        int p0= src[x-2*stride];
        int p1= src[x-1*stride];
        int p2= src[x+0*stride];
        int p3= src[x+1*stride];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[x-1*stride] = p1;
        src[x+0*stride] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[x-2*stride] = p0 - d2;
        src[x+  stride] = p3 + d2;
    }
    }
}

static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    int y;
    const int strength= ff_h263_loop_filter_strength[qscale];

    for(y=0; y<8; y++){
        int d1, d2, ad1;
        int p0= src[y*stride-2];
        int p1= src[y*stride-1];
        int p2= src[y*stride+0];
        int p3= src[y*stride+1];
        int d = (p0 - p3 + 4*(p2 - p1)) / 8;

        if     (d<-2*strength) d1= 0;
        else if(d<-  strength) d1=-2*strength - d;
        else if(d<   strength) d1= d;
        else if(d< 2*strength) d1= 2*strength - d;
        else                   d1= 0;

        p1 += d1;
        p2 -= d1;
        if(p1&256) p1= ~(p1>>31);
        if(p2&256) p2= ~(p2>>31);

        src[y*stride-1] = p1;
        src[y*stride+0] = p2;

        ad1= FFABS(d1)>>1;

        d2= av_clip((p0-p3)/4, -ad1, ad1);

        src[y*stride-2] = p0 - d2;
        src[y*stride+1] = p3 + d2;
    }
    }
}
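
/* The H.263 loop filter maps the step d across the block edge through a
 * dead-zone ramp: the correction d1 follows d up to 'strength', ramps back
 * down to zero at 2*strength and is zero beyond it, so small blocking
 * artifacts are smoothed while strong (likely real) edges are left alone.
 * Worked example with strength 4: d=3 -> d1=3, d=6 -> d1=8-6=2, d=9 -> d1=0. */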

static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}

static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        if( tc0[i] < 0 ) {
            pix += 4*ystride;
            continue;
        }
        for( d = 0; d < 4; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int p2 = pix[-3*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];
            const int q2 = pix[2*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int tc = tc0[i];
                int i_delta;

                if( FFABS( p2 - p0 ) < beta ) {
                    pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
                    tc++;
                }
                if( FFABS( q2 - q0 ) < beta ) {
                    pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
                    tc++;
                }

                i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
                pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
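
/* Normal (bS < 4) H.264 deblocking, as in h264_loop_filter_luma_c above:
 * an edge line is touched only where the local gradients pass the
 * alpha/beta thresholds, and the core correction is
 *   delta = av_clip((((q0-p0)<<2) + (p1-q1) + 4) >> 3, -tc, tc),
 * roughly (4*(q0-p0) + (p1-q1))/8 clamped to the tc0 table value, applied
 * with opposite signs to p0 and q0.  p1/q1 are additionally corrected (and
 * tc widened by one) only where the second-row gradient |p2-p0| resp.
 * |q2-q0| is below beta. */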

static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 16; d++ ) {
        const int p2 = pix[-3*xstride];
        const int p1 = pix[-2*xstride];
        const int p0 = pix[-1*xstride];

        const int q0 = pix[ 0*xstride];
        const int q1 = pix[ 1*xstride];
        const int q2 = pix[ 2*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
                if( FFABS( p2 - p0 ) < beta)
                {
                    const int p3 = pix[-4*xstride];
                    /* p0', p1', p2' */
                    pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
                    pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
                    pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
                } else {
                    /* p0' */
                    pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                }
                if( FFABS( q2 - q0 ) < beta)
                {
                    const int q3 = pix[3*xstride];
                    /* q0', q1', q2' */
                    pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
                    pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
                    pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
                } else {
                    /* q0' */
                    pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
                }
            }else{
                /* p0', q0' */
                pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
                pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
            }
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}

static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
{
    int i, d;
    for( i = 0; i < 4; i++ ) {
        const int tc = tc0[i];
        if( tc <= 0 ) {
            pix += 2*ystride;
            continue;
        }
        for( d = 0; d < 2; d++ ) {
            const int p0 = pix[-1*xstride];
            const int p1 = pix[-2*xstride];
            const int q0 = pix[0];
            const int q1 = pix[1*xstride];

            if( FFABS( p0 - q0 ) < alpha &&
                FFABS( p1 - p0 ) < beta &&
                FFABS( q1 - q0 ) < beta ) {

                int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );

                pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
                pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
            }
            pix += ystride;
        }
    }
}
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}

static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
{
    int d;
    for( d = 0; d < 8; d++ ) {
        const int p0 = pix[-1*xstride];
        const int p1 = pix[-2*xstride];
        const int q0 = pix[0];
        const int q1 = pix[1*xstride];

        if( FFABS( p0 - q0 ) < alpha &&
            FFABS( p1 - p0 ) < beta &&
            FFABS( q1 - q0 ) < beta ) {

            pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
            pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
        }
        pix += ystride;
    }
}
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
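
/* Chroma intra edges get only the weak 2-tap smoothing
 * p0' = (2*p1 + p0 + q1 + 2) >> 2 (and symmetrically for q0'); unlike the
 * inter path there is no tc clamp, the filter is applied wherever the
 * alpha/beta checks pass. */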

static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        s += abs(pix1[8] - pix2[8]);
        s += abs(pix1[9] - pix2[9]);
        s += abs(pix1[10] - pix2[10]);
        s += abs(pix1[11] - pix2[11]);
        s += abs(pix1[12] - pix2[12]);
        s += abs(pix1[13] - pix2[13]);
        s += abs(pix1[14] - pix2[14]);
        s += abs(pix1[15] - pix2[15]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
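
/* pix_abs16_c is the plain 16-wide SAD (sum of absolute differences), the
 * cheapest motion-estimation metric; the _x2/_y2/_xy2 variants below score
 * against the half-pel interpolated reference by averaging two (or four)
 * neighboring reference pixels on the fly.  A minimal usage sketch, with a
 * hypothetical caller owning two 16x16 blocks of equal line size (kept out
 * of the build on purpose):
 */
#if 0
static int sad_of_two_blocks(uint8_t *cur, uint8_t *ref, int line_size)
{
    /* the last argument is the block height in lines */
    return pix_abs16_c(NULL, cur, ref, line_size, 16);
}
#endif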

static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}

static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s, i;
    uint8_t *pix3 = pix2 + line_size;

    s = 0;
    for(i=0;i<h;i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}

static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<16; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<15; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}

static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
    MpegEncContext *c = v;
    int score1=0;
    int score2=0;
    int x,y;

    for(y=0; y<h; y++){
        for(x=0; x<8; x++){
            score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
        }
        if(y+1<h){
            for(x=0; x<7; x++){
                score2+= FFABS(  s1[x  ] - s1[x  +stride]
                               - s1[x+1] + s1[x+1+stride])
                        -FFABS(  s2[x  ] - s2[x  +stride]
                               - s2[x+1] + s2[x+1+stride]);
            }
        }
        s1+= stride;
        s2+= stride;
    }

    if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
    else  return score1 + FFABS(score2)*8;
}
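
/* nsse ("noise preserving" SSE) mixes the plain SSE (score1) with the
 * difference in local 2x2 gradient energy between source and reconstruction
 * (score2): a reconstruction that keeps roughly the texture of the source is
 * penalized less than an overly smooth one with the same SSE.  score2 is
 * scaled by avctx->nsse_weight; 8 is the fallback used here when no context
 * is available. */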

static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
    int i;
    unsigned int sum=0;

    for(i=0; i<8*8; i++){
        int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
        int w= weight[i];
        b>>= RECON_SHIFT;
        assert(-512<b && b<512);

        sum += (w*b)*(w*b)>>4;
    }
    return sum>>2;
}

static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
    int i;

    for(i=0; i<8*8; i++){
        rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
    }
}

/**
 * Permutes an 8x8 block.
 * @param block the block which will be permuted according to the given permutation vector
 * @param permutation the permutation vector
 * @param last the last non-zero coefficient in scantable order, used to speed the permutation up
 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
 *                  (inverse) permuted to scantable order!
 */
void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
{
    int i;
    DCTELEM temp[64];

    if(last<=0) return;
    //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        temp[j]= block[j];
        block[j]=0;
    }

    for(i=0; i<=last; i++){
        const int j= scantable[i];
        const int perm_j= permutation[j];
        block[perm_j]= temp[j];
    }
}

static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
    return 0;
}

void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
    int i;

    memset(cmp, 0, sizeof(void*)*6);

    for(i=0; i<6; i++){
        switch(type&0xFF){
        case FF_CMP_SAD:
            cmp[i]= c->sad[i];
            break;
        case FF_CMP_SATD:
            cmp[i]= c->hadamard8_diff[i];
            break;
        case FF_CMP_SSE:
            cmp[i]= c->sse[i];
            break;
        case FF_CMP_DCT:
            cmp[i]= c->dct_sad[i];
            break;
        case FF_CMP_DCT264:
            cmp[i]= c->dct264_sad[i];
            break;
        case FF_CMP_DCTMAX:
            cmp[i]= c->dct_max[i];
            break;
        case FF_CMP_PSNR:
            cmp[i]= c->quant_psnr[i];
            break;
        case FF_CMP_BIT:
            cmp[i]= c->bit[i];
            break;
        case FF_CMP_RD:
            cmp[i]= c->rd[i];
            break;
        case FF_CMP_VSAD:
            cmp[i]= c->vsad[i];
            break;
        case FF_CMP_VSSE:
            cmp[i]= c->vsse[i];
            break;
        case FF_CMP_ZERO:
            cmp[i]= zero_cmp;
            break;
        case FF_CMP_NSSE:
            cmp[i]= c->nsse[i];
            break;
#if CONFIG_SNOW_ENCODER
        case FF_CMP_W53:
            cmp[i]= c->w53[i];
            break;
        case FF_CMP_W97:
            cmp[i]= c->w97[i];
            break;
#endif
        default:
            av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
        }
    }
}

static void clear_block_c(DCTELEM *block)
{
    memset(block, 0, sizeof(DCTELEM)*64);
}

/**
 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
 */
static void clear_blocks_c(DCTELEM *blocks)
{
    memset(blocks, 0, sizeof(DCTELEM)*6*64);
}

static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}

static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
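
/* add_bytes_c/add_bytes_l2_c add packed bytes one machine word at a time
 * (SWAR): masking both operands with pb_7f clears every byte's MSB so the
 * 7-bit partial sums cannot carry into the neighboring lane, then the true
 * MSBs are patched back in with ((a^b) & pb_80).  Worked single-byte example
 * for 0x90 + 0x90: (0x10 + 0x10) ^ ((0x90^0x90) & 0x80) = 0x20, the correct
 * mod-256 wraparound of 0x120. */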

static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    long i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}

static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
        lt= src1[i];
        dst[i]= l;
    }

    *left= l;
    *left_top= lt;
}

static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
    int i;
    uint8_t l, lt;

    l= *left;
    lt= *left_top;

    for(i=0; i<w; i++){
        const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
        lt= src1[i];
        l= src2[i];
        dst[i]= l - pred;
    }

    *left= l;
    *left_top= lt;
}
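
/* HuffYUV median prediction: each sample is predicted as
 * mid_pred(left, top, left + top - topleft), the median of the two causal
 * neighbors and the gradient predictor, and only the residual is coded.
 * Example: left=10, top=14, topleft=8 gives gradient 16, the median of
 * (10, 14, 16) is 14, so an actual sample of 15 codes as residual +1. */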

static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
    int i;

    for(i=0; i<w-1; i++){
        acc+= src[i];
        dst[i]= acc;
        i++;
        acc+= src[i];
        dst[i]= acc;
    }

    for(; i<w; i++){
        acc+= src[i];
        dst[i]= acc;
    }

    return acc;
}

#if HAVE_BIGENDIAN
#define B 3
#define G 2
#define R 1
#define A 0
#else
#define B 0
#define G 1
#define R 2
#define A 3
#endif
static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
    int i;
    int r,g,b,a;
    r= *red;
    g= *green;
    b= *blue;
    a= *alpha;

    for(i=0; i<w; i++){
        b+= src[4*i+B];
        g+= src[4*i+G];
        r+= src[4*i+R];
        a+= src[4*i+A];

        dst[4*i+B]= b;
        dst[4*i+G]= g;
        dst[4*i+R]= r;
        dst[4*i+A]= a;
    }

    *red= r;
    *green= g;
    *blue= b;
    *alpha= a;
}
#undef B
#undef G
#undef R
#undef A

#define BUTTERFLY2(o1,o2,i1,i2) \
o1= (i1)+(i2);\
o2= (i1)-(i2);

#define BUTTERFLY1(x,y) \
{\
    int a,b;\
    a= x;\
    b= y;\
    x= a+b;\
    y= a-b;\
}

#define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))

static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    return sum;
}

static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
    int i;
    int temp[64];
    int sum=0;

    assert(h==8);

    for(i=0; i<8; i++){
        //FIXME try pointer walks
        BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
        BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
        BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
        BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
        BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+5], temp[8*i+7]);

        BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
        BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
        BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
        BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
    }

    for(i=0; i<8; i++){
        BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
        BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
        BUTTERFLY1(temp[8*6+i], temp[8*7+i]);

        BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
        BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
        BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
        BUTTERFLY1(temp[8*5+i], temp[8*7+i]);

        sum +=
             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
            +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
            +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
            +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
    }

    sum -= FFABS(temp[8*0] + temp[8*4]); // -mean

    return sum;
}
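
/* hadamard8_diff8x8_c is the SATD metric: the difference block is run
 * through an 8x8 Hadamard transform (rows then columns, via the BUTTERFLY
 * macros) and the absolute transform coefficients are summed.  Since the
 * transform compacts energy much like a coarse DCT, this tracks the real
 * coding cost far better than SAD at moderate extra cost.  The _intra
 * variant transforms the source block itself and subtracts the DC term
 * (the "-mean" line above) so that flat blocks score zero. */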

static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
    MpegEncContext * const s= (MpegEncContext *)c;
    DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
    DCTELEM * const temp= (DCTELEM*)aligned_temp;

    assert(h==8);

    s->dsp.diff_pixels(temp, src1, src2, stride);
    s->dsp.fdct(temp);
    return s->dsp.sum_abs_dctelem(temp);
}
3803 const int s07 = SRC(0) + SRC(7);\
3804 const int s16 = SRC(1) + SRC(6);\
3805 const int s25 = SRC(2) + SRC(5);\
3806 const int s34 = SRC(3) + SRC(4);\
3807 const int a0 = s07 + s34;\
3808 const int a1 = s16 + s25;\
3809 const int a2 = s07 - s34;\
3810 const int a3 = s16 - s25;\
3811 const int d07 = SRC(0) - SRC(7);\
3812 const int d16 = SRC(1) - SRC(6);\
3813 const int d25 = SRC(2) - SRC(5);\
3814 const int d34 = SRC(3) - SRC(4);\
3815 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3816 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3817 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3818 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3820 DST(1, a4 + (a7>>2)) ;\
3821 DST(2, a2 + (a3>>1)) ;\
3822 DST(3, a5 + (a6>>2)) ;\
3824 DST(5, a6 - (a5>>2)) ;\
3825 DST(6, (a2>>1) - a3 ) ;\
3826 DST(7, (a4>>2) - a7 ) ;\
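/* The butterfly network above is one 1-D pass of the 8-point integer
 * transform from H.264 (adds, subtracts and shifts only, no
 * multiplies).  dct264_sad8x8_c below runs it over the rows of the
 * pixel difference, then redefines SRC/DST and runs it over the
 * columns, accumulating the absolute value of every coefficient. */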
3829 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3830 MpegEncContext * const s= (MpegEncContext *)c;
3835 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3837 #define SRC(x) dct[i][x]
3838 #define DST(x,v) dct[i][x]= v
3839 for( i = 0; i < 8; i++ )
3844 #define SRC(x) dct[x][i]
3845 #define DST(x,v) sum += FFABS(v)
3846 for( i = 0; i < 8; i++ )
3854 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3855 MpegEncContext * const s= (MpegEncContext *)c;
3856 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
3857 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3862 s->dsp.diff_pixels(temp, src1, src2, stride);
3866 sum= FFMAX(sum, FFABS(temp[i]));
3871 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3872 MpegEncContext * const s= (MpegEncContext *)c;
3873 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64*2/8];
3874 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3875 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3881 s->dsp.diff_pixels(temp, src1, src2, stride);
3883 memcpy(bak, temp, 64*sizeof(DCTELEM));
3885 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3886 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3887 ff_simple_idct(temp); //FIXME
3890 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
3895 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3896 MpegEncContext * const s= (MpegEncContext *)c;
3897 const uint8_t *scantable= s->intra_scantable.permutated;
3898 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
3899 DECLARE_ALIGNED_16(uint64_t, aligned_src1)[8];
3900 DECLARE_ALIGNED_16(uint64_t, aligned_src2)[8];
3901 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3902 uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
3903 uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
3904 int i, last, run, bits, level, distortion, start_i;
3905 const int esc_length= s->ac_esc_length;
3907 uint8_t * last_length;
3911 copy_block8(lsrc1, src1, 8, stride, 8);
3912 copy_block8(lsrc2, src2, 8, stride, 8);
3914 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3916 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3922 length = s->intra_ac_vlc_length;
3923 last_length= s->intra_ac_vlc_last_length;
3924 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3927 length = s->inter_ac_vlc_length;
3928 last_length= s->inter_ac_vlc_last_length;
3933 for(i=start_i; i<last; i++){
3934 int j= scantable[i];
3939 if((level&(~127)) == 0){
3940 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3949 level= temp[i] + 64;
3953 if((level&(~127)) == 0){
3954 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3962 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3964 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3967 s->dsp.idct_add(lsrc2, 8, temp);
3969 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
3971 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
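/* The value returned above is an approximate rate-distortion cost:
 * SSE between the source block and its quantize/dequantize/IDCT
 * reconstruction, plus the estimated bit count weighted by
 * qscale^2 * 109/128, an integer approximation of the encoder's
 * lambda scaling. */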
3974 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3975 MpegEncContext * const s= (MpegEncContext *)c;
3976 const uint8_t *scantable= s->intra_scantable.permutated;
3977 DECLARE_ALIGNED_16(uint64_t, aligned_temp)[sizeof(DCTELEM)*64/8];
3978 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3979 int i, last, run, bits, level, start_i;
3980 const int esc_length= s->ac_esc_length;
3982 uint8_t * last_length;
3986 s->dsp.diff_pixels(temp, src1, src2, stride);
3988 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3994 length = s->intra_ac_vlc_length;
3995 last_length= s->intra_ac_vlc_last_length;
3996 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3999 length = s->inter_ac_vlc_length;
4000 last_length= s->inter_ac_vlc_last_length;
4005 for(i=start_i; i<last; i++){
4006 int j= scantable[i];
4011 if((level&(~127)) == 0){
4012 bits+= length[UNI_AC_ENC_INDEX(run, level)];
4021 level= temp[i] + 64;
4025 if((level&(~127)) == 0){
4026 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
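/* In the rate estimators above, level is biased by +64 before the
 * lookup, so (level & ~127) == 0 tests that the coefficient lies in
 * [-64, 63]: such run/level pairs have an entry in the VLC length
 * tables, while anything larger is charged esc_length instead. */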
4034 #define VSAD_INTRA(size) \
4035 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4039 for(y=1; y<h; y++){ \
4040 for(x=0; x<size; x+=4){ \
4041 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
4042 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
4052 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4057 for(x=0; x<16; x++){
4058 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
4067 #define SQ(a) ((a)*(a))
4068 #define VSSE_INTRA(size) \
4069 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4073 for(y=1; y<h; y++){ \
4074 for(x=0; x<size; x+=4){ \
4075 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
4076 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
4086 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4091 for(x=0; x<16; x++){
4092 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
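/* The vsad / vsse comparators score vertical activity: the absolute
 * (vsad) or squared (vsse) difference between each sample and the one
 * directly below it, computed either within a single block (the
 * _intra variants) or on the error signal s1 - s2. */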
4101 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
4105 for(i=0; i<size; i++)
4106 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
4110 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4111 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4112 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4114 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4116 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4117 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4118 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4119 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
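/* WRAPPER8_16_SQ, defined earlier in this file, derives each *16_c
 * comparator from its 8x8 kernel by summing the kernel over the 8x8
 * quadrants of the block, roughly:
 *     score  = name8(s, dst,     src,     stride, 8);
 *     score += name8(s, dst + 8, src + 8, stride, 8);
 * plus the same two calls on the bottom half when h == 16. */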
4121 static void vector_fmul_c(float *dst, const float *src, int len){
4123 for(i=0; i<len; i++)
4127 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4130 for(i=0; i<len; i++)
4131 dst[i] = src0[i] * src1[-i];
4134 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
4136 for(i=0; i<len; i++)
4137 dst[i] = src0[i] * src1[i] + src2[i];
4140 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4145 for(i=-len, j=len-1; i<0; i++, j--) {
4150 dst[i] = s0*wj - s1*wi + add_bias;
4151 dst[j] = s0*wi + s1*wj + add_bias;
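/* Windowed overlap-add as used by MDCT audio decoders (AC-3, AAC):
 * src0 and src1 each hold len samples, win is a symmetric 2*len-tap
 * window, and the 2*len outputs in dst cross-fade the two inputs
 * under the rising and falling window halves.  add_bias lets callers
 * fold in the float_to_int16 bias at no extra cost. */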
4155 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
4159 for (i = 0; i < len; i++)
4160 dst[i] = src[i] * mul;
4163 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
4164 const float **sv, float mul, int len)
4167 for (i = 0; i < len; i += 2, sv++) {
4168 dst[i ] = src[i ] * sv[0][0] * mul;
4169 dst[i+1] = src[i+1] * sv[0][1] * mul;
4173 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
4174 const float **sv, float mul, int len)
4177 for (i = 0; i < len; i += 4, sv++) {
4178 dst[i ] = src[i ] * sv[0][0] * mul;
4179 dst[i+1] = src[i+1] * sv[0][1] * mul;
4180 dst[i+2] = src[i+2] * sv[0][2] * mul;
4181 dst[i+3] = src[i+3] * sv[0][3] * mul;
4185 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
4189 for (i = 0; i < len; i += 2, sv++) {
4190 dst[i ] = sv[0][0] * mul;
4191 dst[i+1] = sv[0][1] * mul;
4195 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
4199 for (i = 0; i < len; i += 4, sv++) {
4200 dst[i ] = sv[0][0] * mul;
4201 dst[i+1] = sv[0][1] * mul;
4202 dst[i+2] = sv[0][2] * mul;
4203 dst[i+3] = sv[0][3] * mul;
4207 static void butterflies_float_c(float *restrict v1, float *restrict v2,
4211 for (i = 0; i < len; i++) {
4212 float t = v1[i] - v2[i];
4218 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
4223 for (i = 0; i < len; i++)
4229 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
4231 for(i=0; i<len; i++)
4232 dst[i] = src[i] * mul;
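/* IEEE-754 floats compare like sign-magnitude integers, so when
 * min < 0 < max a value can be clamped with two unsigned integer
 * compares and no FPU work: negative floats sort in reverse order as
 * unsigned ints (hence "a > mini" means below min), and XORing in the
 * sign bit makes the upper-bound test work for positive inputs. */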
4235 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
4236 uint32_t maxi, uint32_t maxisign)
4239 if(a > mini) return mini;
4240 else if((a^(1<<31)) > maxisign) return maxi;
4244 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
4246 uint32_t mini = *(uint32_t*)min;
4247 uint32_t maxi = *(uint32_t*)max;
4248 uint32_t maxisign = maxi ^ (1<<31);
4249 uint32_t *dsti = (uint32_t*)dst;
4250 const uint32_t *srci = (const uint32_t*)src;
4251 for(i=0; i<len; i+=8) {
4252 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
4253 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
4254 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
4255 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
4256 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
4257 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
4258 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
4259 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
4262 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
4264 if(min < 0 && max > 0) {
4265 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
4267 for(i=0; i < len; i+=8) {
4268 dst[i ] = av_clipf(src[i ], min, max);
4269 dst[i + 1] = av_clipf(src[i + 1], min, max);
4270 dst[i + 2] = av_clipf(src[i + 2], min, max);
4271 dst[i + 3] = av_clipf(src[i + 3], min, max);
4272 dst[i + 4] = av_clipf(src[i + 4], min, max);
4273 dst[i + 5] = av_clipf(src[i + 5], min, max);
4274 dst[i + 6] = av_clipf(src[i + 6], min, max);
4275 dst[i + 7] = av_clipf(src[i + 7], min, max);
4280 static av_always_inline int float_to_int16_one(const float *src){
4281 int_fast32_t tmp = *(const int32_t*)src;
4283 tmp = (0x43c0ffff - tmp)>>31;
4284 // is this faster on some gcc/cpu combinations?
4285 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4288 return tmp - 0x8000;
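/* float_to_int16_one assumes the decoder has pre-scaled and biased
 * each sample so that an in-range value, reinterpreted as IEEE-754
 * bits, falls in [0x43c00000, 0x43c0ffff] (the float 385.0 +
 * sample/32768): the low 16 bits are then sample + 0x8000, and the
 * final int16_t store truncates the rest.  The (0x43c0ffff - tmp)>>31
 * branch maps out-of-range inputs to 0 or -1, which after the -0x8000
 * correction saturate to -32768 and 32767. */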
4291 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4293 for(i=0; i<len; i++)
4294 dst[i] = float_to_int16_one(src+i);
4297 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4300 for(i=0; i<len; i++){
4301 dst[2*i] = float_to_int16_one(src[0]+i);
4302 dst[2*i+1] = float_to_int16_one(src[1]+i);
4305 for(c=0; c<channels; c++)
4306 for(i=0, j=c; i<len; i++, j+=channels)
4307 dst[j] = float_to_int16_one(src[c]+i);
4311 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4316 res += (*v1++ * *v2++) >> shift;
4321 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
4326 *v1++ += mul * *v3++;
4332 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4333 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4334 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4335 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4336 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4337 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4338 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
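/* Fixed-point cosine constants for the WMV2 inverse DCT below,
 * round(2048*sqrt(2)*cos(n*pi/16)).  The transform runs as two 1-D
 * passes (rows, then columns); the column pass keeps extra
 * intermediate precision (the +4 >> 3 pre-rounding) and compensates
 * with a larger final shift (>>14 instead of the row pass's >>8). */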
4340 static void wmv2_idct_row(short * b)
4343 int a0,a1,a2,a3,a4,a5,a6,a7;
4345 a1 = W1*b[1]+W7*b[7];
4346 a7 = W7*b[1]-W1*b[7];
4347 a5 = W5*b[5]+W3*b[3];
4348 a3 = W3*b[5]-W5*b[3];
4349 a2 = W2*b[2]+W6*b[6];
4350 a6 = W6*b[2]-W2*b[6];
4351 a0 = W0*b[0]+W0*b[4];
4352 a4 = W0*b[0]-W0*b[4];
4354 s1 = (181*(a1-a5+a7-a3)+128)>>8; // 181/256 ~= 1/sqrt(2), mixing the odd (1,3,5,7) terms
4355 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4357 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4358 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4359 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4360 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4361 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4362 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4363 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4364 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
4366 static void wmv2_idct_col(short * b)
4369 int a0,a1,a2,a3,a4,a5,a6,a7;
4370 /*step 1, with extended precision*/
4371 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4372 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4373 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4374 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4375 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4376 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4377 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4378 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4380 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4381 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4383 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4384 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4385 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4386 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4388 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4389 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4390 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4391 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
4393 void ff_wmv2_idct_c(short * block){
4397 wmv2_idct_row(block+i);
4400 wmv2_idct_col(block+i);
4403 /* XXX: these functions should be removed as soon as all IDCTs are converted to the new API */
4405 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4407 ff_wmv2_idct_c(block);
4408 put_pixels_clamped_c(block, dest, line_size);
4410 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4412 ff_wmv2_idct_c(block);
4413 add_pixels_clamped_c(block, dest, line_size);
4415 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4418 put_pixels_clamped_c(block, dest, line_size);
4420 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4423 add_pixels_clamped_c(block, dest, line_size);
4426 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4429 put_pixels_clamped4_c(block, dest, line_size);
4431 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4434 add_pixels_clamped4_c(block, dest, line_size);
4437 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4440 put_pixels_clamped2_c(block, dest, line_size);
4442 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4445 add_pixels_clamped2_c(block, dest, line_size);
4448 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4450 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4452 dest[0] = cm[(block[0] + 4)>>3];
4454 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4456 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4458 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
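/* The jref_idct4/2/1 pairs back the lowres decoding modes selected in
 * dsputil_init below: they reconstruct only a 4x4, 2x2 or 1x1 corner
 * of each block, the 1-point case reducing to scaling the DC
 * coefficient into a single output pixel. */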
4461 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; } /* no-op stub, installed below as the default prefetch */
4463 /* init static data */
4464 av_cold void dsputil_static_init(void)
4468 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4469 for(i=0;i<MAX_NEG_CROP;i++) {
4471 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4474 for(i=0;i<512;i++) {
4475 ff_squareTbl[i] = (i - 256) * (i - 256);
4478 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
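/* ff_cropTbl is a clamp-to-[0,255] lookup table: the MAX_NEG_CROP
 * entries on either side of the central identity range saturate to 0
 * and 255, so the common idiom cm[x] (with cm = ff_cropTbl +
 * MAX_NEG_CROP) clips an out-of-range intermediate to a valid pixel.
 * ff_squareTbl likewise caches (i - 256)^2 for pixel-difference
 * metrics. */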
4481 int ff_check_alignment(void){
4482 static int did_fail=0;
4483 DECLARE_ALIGNED_16(int, aligned);
4485 if((intptr_t)&aligned & 15){
4487 #if HAVE_MMX || HAVE_ALTIVEC
4488 av_log(NULL, AV_LOG_ERROR,
4489 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4490 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4491 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4492 "Do not report crashes to FFmpeg developers.\n");
4501 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4505 ff_check_alignment();
4508 if(avctx->dct_algo==FF_DCT_FASTINT) {
4509 c->fdct = fdct_ifast;
4510 c->fdct248 = fdct_ifast248;
4512 else if(avctx->dct_algo==FF_DCT_FAAN) {
4513 c->fdct = ff_faandct;
4514 c->fdct248 = ff_faandct248;
4517 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4518 c->fdct248 = ff_fdct248_islow;
4520 #endif //CONFIG_ENCODERS
4522 if(avctx->lowres==1){
4523 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4524 c->idct_put= ff_jref_idct4_put;
4525 c->idct_add= ff_jref_idct4_add;
4527 c->idct_put= ff_h264_lowres_idct_put_c;
4528 c->idct_add= ff_h264_lowres_idct_add_c;
4530 c->idct = j_rev_dct4;
4531 c->idct_permutation_type= FF_NO_IDCT_PERM;
4532 }else if(avctx->lowres==2){
4533 c->idct_put= ff_jref_idct2_put;
4534 c->idct_add= ff_jref_idct2_add;
4535 c->idct = j_rev_dct2;
4536 c->idct_permutation_type= FF_NO_IDCT_PERM;
4537 }else if(avctx->lowres==3){
4538 c->idct_put= ff_jref_idct1_put;
4539 c->idct_add= ff_jref_idct1_add;
4540 c->idct = j_rev_dct1;
4541 c->idct_permutation_type= FF_NO_IDCT_PERM;
4543 if(avctx->idct_algo==FF_IDCT_INT){
4544 c->idct_put= ff_jref_idct_put;
4545 c->idct_add= ff_jref_idct_add;
4546 c->idct = j_rev_dct;
4547 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4548 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4549 avctx->idct_algo==FF_IDCT_VP3){
4550 c->idct_put= ff_vp3_idct_put_c;
4551 c->idct_add= ff_vp3_idct_add_c;
4552 c->idct = ff_vp3_idct_c;
4553 c->idct_permutation_type= FF_NO_IDCT_PERM;
4554 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4555 c->idct_put= ff_wmv2_idct_put_c;
4556 c->idct_add= ff_wmv2_idct_add_c;
4557 c->idct = ff_wmv2_idct_c;
4558 c->idct_permutation_type= FF_NO_IDCT_PERM;
4559 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4560 c->idct_put= ff_faanidct_put;
4561 c->idct_add= ff_faanidct_add;
4562 c->idct = ff_faanidct;
4563 c->idct_permutation_type= FF_NO_IDCT_PERM;
4564 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4565 c->idct_put= ff_ea_idct_put_c;
4566 c->idct_permutation_type= FF_NO_IDCT_PERM;
4567 }else{ //accurate/default
4568 c->idct_put= ff_simple_idct_put;
4569 c->idct_add= ff_simple_idct_add;
4570 c->idct = ff_simple_idct;
4571 c->idct_permutation_type= FF_NO_IDCT_PERM;
4575 if (CONFIG_H264_DECODER) {
4576 c->h264_idct_add= ff_h264_idct_add_c;
4577 c->h264_idct8_add= ff_h264_idct8_add_c;
4578 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4579 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4580 c->h264_idct_add16 = ff_h264_idct_add16_c;
4581 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4582 c->h264_idct_add8 = ff_h264_idct_add8_c;
4583 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4586 c->get_pixels = get_pixels_c;
4587 c->diff_pixels = diff_pixels_c;
4588 c->put_pixels_clamped = put_pixels_clamped_c;
4589 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4590 c->add_pixels_clamped = add_pixels_clamped_c;
4591 c->add_pixels8 = add_pixels8_c;
4592 c->add_pixels4 = add_pixels4_c;
4593 c->sum_abs_dctelem = sum_abs_dctelem_c;
4596 c->clear_block = clear_block_c;
4597 c->clear_blocks = clear_blocks_c;
4598 c->pix_sum = pix_sum_c;
4599 c->pix_norm1 = pix_norm1_c;
4601 /* pix_abs first index: [0] = 16-pixel-wide blocks, [1] = 8-pixel-wide blocks */
4602 c->pix_abs[0][0] = pix_abs16_c;
4603 c->pix_abs[0][1] = pix_abs16_x2_c;
4604 c->pix_abs[0][2] = pix_abs16_y2_c;
4605 c->pix_abs[0][3] = pix_abs16_xy2_c;
4606 c->pix_abs[1][0] = pix_abs8_c;
4607 c->pix_abs[1][1] = pix_abs8_x2_c;
4608 c->pix_abs[1][2] = pix_abs8_y2_c;
4609 c->pix_abs[1][3] = pix_abs8_xy2_c;
4611 #define dspfunc(PFX, IDX, NUM) \
4612 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4613 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4614 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4615 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
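/* Half-pel motion compensation table layout: the second index selects
 * the interpolation, [0] plain copy, [1] horizontal half-pel,
 * [2] vertical half-pel, [3] diagonal; IDX selects the block width
 * (0 = 16 pixels, 1 = 8). */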
4617 dspfunc(put, 0, 16);
4618 dspfunc(put_no_rnd, 0, 16);
4620 dspfunc(put_no_rnd, 1, 8);
4624 dspfunc(avg, 0, 16);
4625 dspfunc(avg_no_rnd, 0, 16);
4627 dspfunc(avg_no_rnd, 1, 8);
4632 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4633 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4635 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4636 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4637 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4638 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4639 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4640 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4641 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4642 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4643 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4645 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4646 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4647 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4648 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4649 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4650 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4651 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4652 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4653 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4655 #define dspfunc(PFX, IDX, NUM) \
4656 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4657 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4658 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4659 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4660 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4661 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4662 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4663 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4664 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4665 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4666 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4667 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4668 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4669 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4670 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4671 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
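/* Quarter-pel tables hold 16 entries addressed as x + 4*y, and the
 * _mcXY suffix names the (x, y) quarter-pel offset, so _mc21 is the
 * filter for a (2/4, 1/4) displacement.  The same macro fills both
 * the MPEG-4 qpel and the H.264 qpel tables below. */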
4673 dspfunc(put_qpel, 0, 16);
4674 dspfunc(put_no_rnd_qpel, 0, 16);
4676 dspfunc(avg_qpel, 0, 16);
4677 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4679 dspfunc(put_qpel, 1, 8);
4680 dspfunc(put_no_rnd_qpel, 1, 8);
4682 dspfunc(avg_qpel, 1, 8);
4683 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4685 dspfunc(put_h264_qpel, 0, 16);
4686 dspfunc(put_h264_qpel, 1, 8);
4687 dspfunc(put_h264_qpel, 2, 4);
4688 dspfunc(put_h264_qpel, 3, 2);
4689 dspfunc(avg_h264_qpel, 0, 16);
4690 dspfunc(avg_h264_qpel, 1, 8);
4691 dspfunc(avg_h264_qpel, 2, 4);
4694 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4695 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4696 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4697 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4698 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4699 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4700 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4701 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4703 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4704 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4705 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4706 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4707 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4708 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4709 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4710 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4711 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4712 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4713 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4714 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4715 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4716 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4717 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4718 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4719 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4720 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4721 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4722 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4724 c->draw_edges = draw_edges_c;
4726 #if CONFIG_CAVS_DECODER
4727 ff_cavsdsp_init(c,avctx);
4730 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4731 ff_mlp_init(c, avctx);
4733 #if CONFIG_VC1_DECODER
4734 ff_vc1dsp_init(c,avctx);
4736 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4737 ff_intrax8dsp_init(c,avctx);
4739 #if CONFIG_RV30_DECODER
4740 ff_rv30dsp_init(c,avctx);
4742 #if CONFIG_RV40_DECODER
4743 ff_rv40dsp_init(c,avctx);
4744 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4745 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4746 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4747 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4750 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4751 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4752 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4753 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4754 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4755 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4756 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4757 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4759 #define SET_CMP_FUNC(name) \
4760 c->name[0]= name ## 16_c;\
4761 c->name[1]= name ## 8x8_c;
4763 SET_CMP_FUNC(hadamard8_diff)
4764 c->hadamard8_diff[4]= hadamard8_intra16_c;
4765 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4766 SET_CMP_FUNC(dct_sad)
4767 SET_CMP_FUNC(dct_max)
4769 SET_CMP_FUNC(dct264_sad)
4771 c->sad[0]= pix_abs16_c;
4772 c->sad[1]= pix_abs8_c;
4776 SET_CMP_FUNC(quant_psnr)
4779 c->vsad[0]= vsad16_c;
4780 c->vsad[4]= vsad_intra16_c;
4781 c->vsad[5]= vsad_intra8_c;
4782 c->vsse[0]= vsse16_c;
4783 c->vsse[4]= vsse_intra16_c;
4784 c->vsse[5]= vsse_intra8_c;
4785 c->nsse[0]= nsse16_c;
4786 c->nsse[1]= nsse8_c;
4787 #if CONFIG_SNOW_ENCODER
4788 c->w53[0]= w53_16_c;
4790 c->w97[0]= w97_16_c;
4794 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4796 c->add_bytes= add_bytes_c;
4797 c->add_bytes_l2= add_bytes_l2_c;
4798 c->diff_bytes= diff_bytes_c;
4799 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4800 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4801 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4802 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4803 c->bswap_buf= bswap_buf;
4804 #if CONFIG_PNG_DECODER
4805 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4808 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4809 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4810 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4811 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4812 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4813 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4814 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4815 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4816 c->h264_loop_filter_strength= NULL;
4818 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4819 c->h263_h_loop_filter= h263_h_loop_filter_c;
4820 c->h263_v_loop_filter= h263_v_loop_filter_c;
4823 if (CONFIG_VP3_DECODER) {
4824 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4825 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4827 if (CONFIG_VP6_DECODER) {
4828 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4831 c->h261_loop_filter= h261_loop_filter_c;
4833 c->try_8x8basis= try_8x8basis_c;
4834 c->add_8x8basis= add_8x8basis_c;
4836 #if CONFIG_SNOW_DECODER
4837 c->vertical_compose97i = ff_snow_vertical_compose97i;
4838 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4839 c->inner_add_yblock = ff_snow_inner_add_yblock;
4842 #if CONFIG_VORBIS_DECODER
4843 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4845 #if CONFIG_AC3_DECODER
4846 c->ac3_downmix = ff_ac3_downmix_c;
4849 c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4851 c->vector_fmul = vector_fmul_c;
4852 c->vector_fmul_reverse = vector_fmul_reverse_c;
4853 c->vector_fmul_add = vector_fmul_add_c;
4854 c->vector_fmul_window = ff_vector_fmul_window_c;
4855 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4856 c->vector_clipf = vector_clipf_c;
4857 c->float_to_int16 = ff_float_to_int16_c;
4858 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4859 c->scalarproduct_int16 = scalarproduct_int16_c;
4860 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4861 c->scalarproduct_float = scalarproduct_float_c;
4862 c->butterflies_float = butterflies_float_c;
4863 c->vector_fmul_scalar = vector_fmul_scalar_c;
4865 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4866 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4868 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4869 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4871 c->shrink[0]= ff_img_copy_plane;
4872 c->shrink[1]= ff_shrink22;
4873 c->shrink[2]= ff_shrink44;
4874 c->shrink[3]= ff_shrink88;
4876 c->prefetch= just_return;
4878 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4879 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4881 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4882 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4883 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4884 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4885 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4886 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4887 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4888 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4889 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4891 for(i=0; i<64; i++){
4892 if(!c->put_2tap_qpel_pixels_tab[0][i])
4893 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4894 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4895 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
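/* Optimized IDCTs often consume coefficients in a permuted order that
 * suits their register layout, so rather than reordering every block
 * at runtime, scan tables and quantizer matrices are permuted once
 * using idct_permutation, built here from the permutation type the
 * selected IDCT declared above. */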
4898 switch(c->idct_permutation_type){
4899 case FF_NO_IDCT_PERM:
4901 c->idct_permutation[i]= i;
4903 case FF_LIBMPEG2_IDCT_PERM:
4905 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4907 case FF_SIMPLE_IDCT_PERM:
4909 c->idct_permutation[i]= simple_mmx_permutation[i];
4911 case FF_TRANSPOSE_IDCT_PERM:
4913 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4915 case FF_PARTTRANS_IDCT_PERM:
4917 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4919 case FF_SSE2_IDCT_PERM:
4921 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4924 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");