3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * @file libavcodec/dsputil.c
32 #include "simple_idct.h"
37 #include "mpegvideo.h"
41 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
44 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
47 void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
50 void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
53 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
56 void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
58 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
59 uint32_t ff_squareTbl[512] = {0, };
61 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
62 #define pb_7f (~0UL/255 * 0x7f)
63 #define pb_80 (~0UL/255 * 0x80)
65 const uint8_t ff_zigzag_direct[64] = {
66 0, 1, 8, 16, 9, 2, 3, 10,
67 17, 24, 32, 25, 18, 11, 4, 5,
68 12, 19, 26, 33, 40, 48, 41, 34,
69 27, 20, 13, 6, 7, 14, 21, 28,
70 35, 42, 49, 56, 57, 50, 43, 36,
71 29, 22, 15, 23, 30, 37, 44, 51,
72 58, 59, 52, 45, 38, 31, 39, 46,
73 53, 60, 61, 54, 47, 55, 62, 63
76 /* Specific zigzag scan for 248 idct. NOTE that unlike the
77 specification, we interleave the fields */
78 const uint8_t ff_zigzag248_direct[64] = {
79 0, 8, 1, 9, 16, 24, 2, 10,
80 17, 25, 32, 40, 48, 56, 33, 41,
81 18, 26, 3, 11, 4, 12, 19, 27,
82 34, 42, 49, 57, 50, 58, 35, 43,
83 20, 28, 5, 13, 6, 14, 21, 29,
84 36, 44, 51, 59, 52, 60, 37, 45,
85 22, 30, 7, 15, 23, 31, 38, 46,
86 53, 61, 54, 62, 39, 47, 55, 63,
89 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
90 DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16[64]);
92 const uint8_t ff_alternate_horizontal_scan[64] = {
93 0, 1, 2, 3, 8, 9, 16, 17,
94 10, 11, 4, 5, 6, 7, 15, 14,
95 13, 12, 19, 18, 24, 25, 32, 33,
96 26, 27, 20, 21, 22, 23, 28, 29,
97 30, 31, 34, 35, 40, 41, 48, 49,
98 42, 43, 36, 37, 38, 39, 44, 45,
99 46, 47, 50, 51, 56, 57, 58, 59,
100 52, 53, 54, 55, 60, 61, 62, 63,
103 const uint8_t ff_alternate_vertical_scan[64] = {
104 0, 8, 16, 24, 1, 9, 2, 10,
105 17, 25, 32, 40, 48, 56, 57, 49,
106 41, 33, 26, 18, 3, 11, 4, 12,
107 19, 27, 34, 42, 50, 58, 35, 43,
108 51, 59, 20, 28, 5, 13, 6, 14,
109 21, 29, 36, 44, 52, 60, 37, 45,
110 53, 61, 22, 30, 7, 15, 23, 31,
111 38, 46, 54, 62, 39, 47, 55, 63,
114 /* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
115 * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
116 const uint32_t ff_inverse[257]={
117 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
118 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
119 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
120 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
121 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
122 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
123 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
124 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
125 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
126 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
127 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
128 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
129 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
130 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
131 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
132 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
133 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
134 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
135 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
136 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
137 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
138 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
139 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
140 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
141 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
142 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
143 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
144 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
145 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
146 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
147 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
148 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
152 /* Input permutation for the simple_idct_mmx */
153 static const uint8_t simple_mmx_permutation[64]={
154 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
155 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
156 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
157 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
158 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
159 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
160 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
161 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
164 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
166 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
170 st->scantable= src_scantable;
174 j = src_scantable[i];
175 st->permutated[i] = permutation[j];
184 j = st->permutated[i];
186 st->raster_end[i]= end;
190 static int pix_sum_c(uint8_t * pix, int line_size)
195 for (i = 0; i < 16; i++) {
196 for (j = 0; j < 16; j += 8) {
207 pix += line_size - 16;
212 static int pix_norm1_c(uint8_t * pix, int line_size)
215 uint32_t *sq = ff_squareTbl + 256;
218 for (i = 0; i < 16; i++) {
219 for (j = 0; j < 16; j += 8) {
230 #if LONG_MAX > 2147483647
231 register uint64_t x=*(uint64_t*)pix;
233 s += sq[(x>>8)&0xff];
234 s += sq[(x>>16)&0xff];
235 s += sq[(x>>24)&0xff];
236 s += sq[(x>>32)&0xff];
237 s += sq[(x>>40)&0xff];
238 s += sq[(x>>48)&0xff];
239 s += sq[(x>>56)&0xff];
241 register uint32_t x=*(uint32_t*)pix;
243 s += sq[(x>>8)&0xff];
244 s += sq[(x>>16)&0xff];
245 s += sq[(x>>24)&0xff];
246 x=*(uint32_t*)(pix+4);
248 s += sq[(x>>8)&0xff];
249 s += sq[(x>>16)&0xff];
250 s += sq[(x>>24)&0xff];
255 pix += line_size - 16;
260 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
263 for(i=0; i+8<=w; i+=8){
264 dst[i+0]= bswap_32(src[i+0]);
265 dst[i+1]= bswap_32(src[i+1]);
266 dst[i+2]= bswap_32(src[i+2]);
267 dst[i+3]= bswap_32(src[i+3]);
268 dst[i+4]= bswap_32(src[i+4]);
269 dst[i+5]= bswap_32(src[i+5]);
270 dst[i+6]= bswap_32(src[i+6]);
271 dst[i+7]= bswap_32(src[i+7]);
274 dst[i+0]= bswap_32(src[i+0]);
278 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
281 uint32_t *sq = ff_squareTbl + 256;
284 for (i = 0; i < h; i++) {
285 s += sq[pix1[0] - pix2[0]];
286 s += sq[pix1[1] - pix2[1]];
287 s += sq[pix1[2] - pix2[2]];
288 s += sq[pix1[3] - pix2[3]];
295 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
298 uint32_t *sq = ff_squareTbl + 256;
301 for (i = 0; i < h; i++) {
302 s += sq[pix1[0] - pix2[0]];
303 s += sq[pix1[1] - pix2[1]];
304 s += sq[pix1[2] - pix2[2]];
305 s += sq[pix1[3] - pix2[3]];
306 s += sq[pix1[4] - pix2[4]];
307 s += sq[pix1[5] - pix2[5]];
308 s += sq[pix1[6] - pix2[6]];
309 s += sq[pix1[7] - pix2[7]];
316 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
319 uint32_t *sq = ff_squareTbl + 256;
322 for (i = 0; i < h; i++) {
323 s += sq[pix1[ 0] - pix2[ 0]];
324 s += sq[pix1[ 1] - pix2[ 1]];
325 s += sq[pix1[ 2] - pix2[ 2]];
326 s += sq[pix1[ 3] - pix2[ 3]];
327 s += sq[pix1[ 4] - pix2[ 4]];
328 s += sq[pix1[ 5] - pix2[ 5]];
329 s += sq[pix1[ 6] - pix2[ 6]];
330 s += sq[pix1[ 7] - pix2[ 7]];
331 s += sq[pix1[ 8] - pix2[ 8]];
332 s += sq[pix1[ 9] - pix2[ 9]];
333 s += sq[pix1[10] - pix2[10]];
334 s += sq[pix1[11] - pix2[11]];
335 s += sq[pix1[12] - pix2[12]];
336 s += sq[pix1[13] - pix2[13]];
337 s += sq[pix1[14] - pix2[14]];
338 s += sq[pix1[15] - pix2[15]];
347 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
348 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
350 const int dec_count= w==8 ? 3 : 4;
353 static const int scale[2][2][4][4]={
357 {268, 239, 239, 213},
361 // 9/7 16x16 or 32x32 dec=4
362 {344, 310, 310, 280},
370 {275, 245, 245, 218},
374 // 5/3 16x16 or 32x32 dec=4
375 {352, 317, 317, 286},
383 for (i = 0; i < h; i++) {
384 for (j = 0; j < w; j+=4) {
385 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
386 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
387 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
388 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
394 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
398 for(level=0; level<dec_count; level++){
399 for(ori= level ? 1 : 0; ori<4; ori++){
400 int size= w>>(dec_count-level);
401 int sx= (ori&1) ? size : 0;
402 int stride= 32<<(dec_count-level);
403 int sy= (ori&2) ? stride>>1 : 0;
405 for(i=0; i<size; i++){
406 for(j=0; j<size; j++){
407 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
417 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
418 return w_c(v, pix1, pix2, line_size, 8, h, 1);
421 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
422 return w_c(v, pix1, pix2, line_size, 8, h, 0);
425 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
426 return w_c(v, pix1, pix2, line_size, 16, h, 1);
429 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
430 return w_c(v, pix1, pix2, line_size, 16, h, 0);
433 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
434 return w_c(v, pix1, pix2, line_size, 32, h, 1);
437 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
438 return w_c(v, pix1, pix2, line_size, 32, h, 0);
442 /* draw the edges of width 'w' of an image of size width, height */
443 //FIXME check that this is ok for mpeg4 interlaced
444 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
446 uint8_t *ptr, *last_line;
449 last_line = buf + (height - 1) * wrap;
452 memcpy(buf - (i + 1) * wrap, buf, width);
453 memcpy(last_line + (i + 1) * wrap, last_line, width);
457 for(i=0;i<height;i++) {
458 memset(ptr - w, ptr[0], w);
459 memset(ptr + width, ptr[width-1], w);
464 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
465 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
466 memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* top left */
467 memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* top right */
472 * Copies a rectangular area of samples to a temporary buffer and replicates the boarder samples.
473 * @param buf destination buffer
474 * @param src source buffer
475 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
476 * @param block_w width of block
477 * @param block_h height of block
478 * @param src_x x coordinate of the top left sample of the block in the source buffer
479 * @param src_y y coordinate of the top left sample of the block in the source buffer
480 * @param w width of the source buffer
481 * @param h height of the source buffer
483 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
484 int src_x, int src_y, int w, int h){
486 int start_y, start_x, end_y, end_x;
489 src+= (h-1-src_y)*linesize;
491 }else if(src_y<=-block_h){
492 src+= (1-block_h-src_y)*linesize;
498 }else if(src_x<=-block_w){
499 src+= (1-block_w-src_x);
503 start_y= FFMAX(0, -src_y);
504 start_x= FFMAX(0, -src_x);
505 end_y= FFMIN(block_h, h-src_y);
506 end_x= FFMIN(block_w, w-src_x);
508 // copy existing part
509 for(y=start_y; y<end_y; y++){
510 for(x=start_x; x<end_x; x++){
511 buf[x + y*linesize]= src[x + y*linesize];
516 for(y=0; y<start_y; y++){
517 for(x=start_x; x<end_x; x++){
518 buf[x + y*linesize]= buf[x + start_y*linesize];
523 for(y=end_y; y<block_h; y++){
524 for(x=start_x; x<end_x; x++){
525 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
529 for(y=0; y<block_h; y++){
531 for(x=0; x<start_x; x++){
532 buf[x + y*linesize]= buf[start_x + y*linesize];
536 for(x=end_x; x<block_w; x++){
537 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
542 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
546 /* read the pixels */
548 block[0] = pixels[0];
549 block[1] = pixels[1];
550 block[2] = pixels[2];
551 block[3] = pixels[3];
552 block[4] = pixels[4];
553 block[5] = pixels[5];
554 block[6] = pixels[6];
555 block[7] = pixels[7];
561 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
562 const uint8_t *s2, int stride){
565 /* read the pixels */
567 block[0] = s1[0] - s2[0];
568 block[1] = s1[1] - s2[1];
569 block[2] = s1[2] - s2[2];
570 block[3] = s1[3] - s2[3];
571 block[4] = s1[4] - s2[4];
572 block[5] = s1[5] - s2[5];
573 block[6] = s1[6] - s2[6];
574 block[7] = s1[7] - s2[7];
582 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
586 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
588 /* read the pixels */
590 pixels[0] = cm[block[0]];
591 pixels[1] = cm[block[1]];
592 pixels[2] = cm[block[2]];
593 pixels[3] = cm[block[3]];
594 pixels[4] = cm[block[4]];
595 pixels[5] = cm[block[5]];
596 pixels[6] = cm[block[6]];
597 pixels[7] = cm[block[7]];
604 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
608 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
610 /* read the pixels */
612 pixels[0] = cm[block[0]];
613 pixels[1] = cm[block[1]];
614 pixels[2] = cm[block[2]];
615 pixels[3] = cm[block[3]];
622 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
626 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
628 /* read the pixels */
630 pixels[0] = cm[block[0]];
631 pixels[1] = cm[block[1]];
638 static void put_signed_pixels_clamped_c(const DCTELEM *block,
639 uint8_t *restrict pixels,
644 for (i = 0; i < 8; i++) {
645 for (j = 0; j < 8; j++) {
648 else if (*block > 127)
651 *pixels = (uint8_t)(*block + 128);
655 pixels += (line_size - 8);
659 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
663 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
665 /* read the pixels */
667 pixels[0] = cm[pixels[0] + block[0]];
668 pixels[1] = cm[pixels[1] + block[1]];
669 pixels[2] = cm[pixels[2] + block[2]];
670 pixels[3] = cm[pixels[3] + block[3]];
671 pixels[4] = cm[pixels[4] + block[4]];
672 pixels[5] = cm[pixels[5] + block[5]];
673 pixels[6] = cm[pixels[6] + block[6]];
674 pixels[7] = cm[pixels[7] + block[7]];
680 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
684 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
686 /* read the pixels */
688 pixels[0] = cm[pixels[0] + block[0]];
689 pixels[1] = cm[pixels[1] + block[1]];
690 pixels[2] = cm[pixels[2] + block[2]];
691 pixels[3] = cm[pixels[3] + block[3]];
697 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
701 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
703 /* read the pixels */
705 pixels[0] = cm[pixels[0] + block[0]];
706 pixels[1] = cm[pixels[1] + block[1]];
712 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
716 pixels[0] += block[0];
717 pixels[1] += block[1];
718 pixels[2] += block[2];
719 pixels[3] += block[3];
720 pixels[4] += block[4];
721 pixels[5] += block[5];
722 pixels[6] += block[6];
723 pixels[7] += block[7];
729 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
733 pixels[0] += block[0];
734 pixels[1] += block[1];
735 pixels[2] += block[2];
736 pixels[3] += block[3];
742 static int sum_abs_dctelem_c(DCTELEM *block)
746 sum+= FFABS(block[i]);
752 #define PIXOP2(OPNAME, OP) \
753 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
757 OP(*((uint64_t*)block), AV_RN64(pixels));\
763 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
767 const uint64_t a= AV_RN64(pixels );\
768 const uint64_t b= AV_RN64(pixels+1);\
769 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
775 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
779 const uint64_t a= AV_RN64(pixels );\
780 const uint64_t b= AV_RN64(pixels+1);\
781 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
787 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
791 const uint64_t a= AV_RN64(pixels );\
792 const uint64_t b= AV_RN64(pixels+line_size);\
793 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
799 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
803 const uint64_t a= AV_RN64(pixels );\
804 const uint64_t b= AV_RN64(pixels+line_size);\
805 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
811 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
814 const uint64_t a= AV_RN64(pixels );\
815 const uint64_t b= AV_RN64(pixels+1);\
816 uint64_t l0= (a&0x0303030303030303ULL)\
817 + (b&0x0303030303030303ULL)\
818 + 0x0202020202020202ULL;\
819 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
820 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
824 for(i=0; i<h; i+=2){\
825 uint64_t a= AV_RN64(pixels );\
826 uint64_t b= AV_RN64(pixels+1);\
827 l1= (a&0x0303030303030303ULL)\
828 + (b&0x0303030303030303ULL);\
829 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
830 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
831 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
834 a= AV_RN64(pixels );\
835 b= AV_RN64(pixels+1);\
836 l0= (a&0x0303030303030303ULL)\
837 + (b&0x0303030303030303ULL)\
838 + 0x0202020202020202ULL;\
839 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
840 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
841 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
847 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
850 const uint64_t a= AV_RN64(pixels );\
851 const uint64_t b= AV_RN64(pixels+1);\
852 uint64_t l0= (a&0x0303030303030303ULL)\
853 + (b&0x0303030303030303ULL)\
854 + 0x0101010101010101ULL;\
855 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
856 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
860 for(i=0; i<h; i+=2){\
861 uint64_t a= AV_RN64(pixels );\
862 uint64_t b= AV_RN64(pixels+1);\
863 l1= (a&0x0303030303030303ULL)\
864 + (b&0x0303030303030303ULL);\
865 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
866 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
867 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
870 a= AV_RN64(pixels );\
871 b= AV_RN64(pixels+1);\
872 l0= (a&0x0303030303030303ULL)\
873 + (b&0x0303030303030303ULL)\
874 + 0x0101010101010101ULL;\
875 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
876 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
877 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
883 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
884 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
885 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
886 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
887 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
888 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
889 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
891 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
892 #else // 64 bit variant
894 #define PIXOP2(OPNAME, OP) \
895 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
898 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
903 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
906 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
911 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
914 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
915 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
920 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
921 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
924 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
925 int src_stride1, int src_stride2, int h){\
929 a= AV_RN32(&src1[i*src_stride1 ]);\
930 b= AV_RN32(&src2[i*src_stride2 ]);\
931 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
932 a= AV_RN32(&src1[i*src_stride1+4]);\
933 b= AV_RN32(&src2[i*src_stride2+4]);\
934 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
938 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
939 int src_stride1, int src_stride2, int h){\
943 a= AV_RN32(&src1[i*src_stride1 ]);\
944 b= AV_RN32(&src2[i*src_stride2 ]);\
945 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
946 a= AV_RN32(&src1[i*src_stride1+4]);\
947 b= AV_RN32(&src2[i*src_stride2+4]);\
948 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
952 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
953 int src_stride1, int src_stride2, int h){\
957 a= AV_RN32(&src1[i*src_stride1 ]);\
958 b= AV_RN32(&src2[i*src_stride2 ]);\
959 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
963 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
964 int src_stride1, int src_stride2, int h){\
968 a= AV_RN16(&src1[i*src_stride1 ]);\
969 b= AV_RN16(&src2[i*src_stride2 ]);\
970 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
974 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
975 int src_stride1, int src_stride2, int h){\
976 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
977 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
980 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
981 int src_stride1, int src_stride2, int h){\
982 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
983 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
986 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
987 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
990 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
991 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
994 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
995 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
998 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
999 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1002 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1003 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1005 for(i=0; i<h; i++){\
1006 uint32_t a, b, c, d, l0, l1, h0, h1;\
1007 a= AV_RN32(&src1[i*src_stride1]);\
1008 b= AV_RN32(&src2[i*src_stride2]);\
1009 c= AV_RN32(&src3[i*src_stride3]);\
1010 d= AV_RN32(&src4[i*src_stride4]);\
1011 l0= (a&0x03030303UL)\
1014 h0= ((a&0xFCFCFCFCUL)>>2)\
1015 + ((b&0xFCFCFCFCUL)>>2);\
1016 l1= (c&0x03030303UL)\
1017 + (d&0x03030303UL);\
1018 h1= ((c&0xFCFCFCFCUL)>>2)\
1019 + ((d&0xFCFCFCFCUL)>>2);\
1020 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1021 a= AV_RN32(&src1[i*src_stride1+4]);\
1022 b= AV_RN32(&src2[i*src_stride2+4]);\
1023 c= AV_RN32(&src3[i*src_stride3+4]);\
1024 d= AV_RN32(&src4[i*src_stride4+4]);\
1025 l0= (a&0x03030303UL)\
1028 h0= ((a&0xFCFCFCFCUL)>>2)\
1029 + ((b&0xFCFCFCFCUL)>>2);\
1030 l1= (c&0x03030303UL)\
1031 + (d&0x03030303UL);\
1032 h1= ((c&0xFCFCFCFCUL)>>2)\
1033 + ((d&0xFCFCFCFCUL)>>2);\
1034 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1038 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1039 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1042 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1043 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1046 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1047 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1050 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1051 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1054 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1055 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1057 for(i=0; i<h; i++){\
1058 uint32_t a, b, c, d, l0, l1, h0, h1;\
1059 a= AV_RN32(&src1[i*src_stride1]);\
1060 b= AV_RN32(&src2[i*src_stride2]);\
1061 c= AV_RN32(&src3[i*src_stride3]);\
1062 d= AV_RN32(&src4[i*src_stride4]);\
1063 l0= (a&0x03030303UL)\
1066 h0= ((a&0xFCFCFCFCUL)>>2)\
1067 + ((b&0xFCFCFCFCUL)>>2);\
1068 l1= (c&0x03030303UL)\
1069 + (d&0x03030303UL);\
1070 h1= ((c&0xFCFCFCFCUL)>>2)\
1071 + ((d&0xFCFCFCFCUL)>>2);\
1072 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1073 a= AV_RN32(&src1[i*src_stride1+4]);\
1074 b= AV_RN32(&src2[i*src_stride2+4]);\
1075 c= AV_RN32(&src3[i*src_stride3+4]);\
1076 d= AV_RN32(&src4[i*src_stride4+4]);\
1077 l0= (a&0x03030303UL)\
1080 h0= ((a&0xFCFCFCFCUL)>>2)\
1081 + ((b&0xFCFCFCFCUL)>>2);\
1082 l1= (c&0x03030303UL)\
1083 + (d&0x03030303UL);\
1084 h1= ((c&0xFCFCFCFCUL)>>2)\
1085 + ((d&0xFCFCFCFCUL)>>2);\
1086 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1089 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1090 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1091 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1092 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1094 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1095 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1096 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1097 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1100 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1102 int i, a0, b0, a1, b1;\
1109 for(i=0; i<h; i+=2){\
1115 block[0]= (a1+a0)>>2; /* FIXME non put */\
1116 block[1]= (b1+b0)>>2;\
1126 block[0]= (a1+a0)>>2;\
1127 block[1]= (b1+b0)>>2;\
1133 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1136 const uint32_t a= AV_RN32(pixels );\
1137 const uint32_t b= AV_RN32(pixels+1);\
1138 uint32_t l0= (a&0x03030303UL)\
1141 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1142 + ((b&0xFCFCFCFCUL)>>2);\
1146 for(i=0; i<h; i+=2){\
1147 uint32_t a= AV_RN32(pixels );\
1148 uint32_t b= AV_RN32(pixels+1);\
1149 l1= (a&0x03030303UL)\
1150 + (b&0x03030303UL);\
1151 h1= ((a&0xFCFCFCFCUL)>>2)\
1152 + ((b&0xFCFCFCFCUL)>>2);\
1153 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1156 a= AV_RN32(pixels );\
1157 b= AV_RN32(pixels+1);\
1158 l0= (a&0x03030303UL)\
1161 h0= ((a&0xFCFCFCFCUL)>>2)\
1162 + ((b&0xFCFCFCFCUL)>>2);\
1163 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1169 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1172 for(j=0; j<2; j++){\
1174 const uint32_t a= AV_RN32(pixels );\
1175 const uint32_t b= AV_RN32(pixels+1);\
1176 uint32_t l0= (a&0x03030303UL)\
1179 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1180 + ((b&0xFCFCFCFCUL)>>2);\
1184 for(i=0; i<h; i+=2){\
1185 uint32_t a= AV_RN32(pixels );\
1186 uint32_t b= AV_RN32(pixels+1);\
1187 l1= (a&0x03030303UL)\
1188 + (b&0x03030303UL);\
1189 h1= ((a&0xFCFCFCFCUL)>>2)\
1190 + ((b&0xFCFCFCFCUL)>>2);\
1191 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1194 a= AV_RN32(pixels );\
1195 b= AV_RN32(pixels+1);\
1196 l0= (a&0x03030303UL)\
1199 h0= ((a&0xFCFCFCFCUL)>>2)\
1200 + ((b&0xFCFCFCFCUL)>>2);\
1201 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1205 pixels+=4-line_size*(h+1);\
1206 block +=4-line_size*h;\
1210 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1213 for(j=0; j<2; j++){\
1215 const uint32_t a= AV_RN32(pixels );\
1216 const uint32_t b= AV_RN32(pixels+1);\
1217 uint32_t l0= (a&0x03030303UL)\
1220 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1221 + ((b&0xFCFCFCFCUL)>>2);\
1225 for(i=0; i<h; i+=2){\
1226 uint32_t a= AV_RN32(pixels );\
1227 uint32_t b= AV_RN32(pixels+1);\
1228 l1= (a&0x03030303UL)\
1229 + (b&0x03030303UL);\
1230 h1= ((a&0xFCFCFCFCUL)>>2)\
1231 + ((b&0xFCFCFCFCUL)>>2);\
1232 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1235 a= AV_RN32(pixels );\
1236 b= AV_RN32(pixels+1);\
1237 l0= (a&0x03030303UL)\
1240 h0= ((a&0xFCFCFCFCUL)>>2)\
1241 + ((b&0xFCFCFCFCUL)>>2);\
1242 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1246 pixels+=4-line_size*(h+1);\
1247 block +=4-line_size*h;\
1251 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1252 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1253 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1254 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1255 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1256 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1257 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1258 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Per-pixel store operations plugged into the pixel-copy macros as OP:
 * op_avg folds the new 4-pixel word into the destination with rounding
 * (rnd_avg32 is defined elsewhere in this file), op_put overwrites it. */
#define op_avg(a, b) a = rnd_avg32(a, b)
#define op_put(a, b) a = b
/* Rounded 2- and 4-way byte averages used by the halfpel helpers below.
 * Arguments are fully parenthesized so expressions with lower-precedence
 * operators expand correctly (macro hygiene; behavior for all existing
 * call sites is unchanged). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/**
 * No-rounding average of two 16-pixel-wide source blocks a and b into dst.
 * Thin wrapper that binds all three strides of the generic
 * put_no_rnd_pixels16_l2() helper (generated elsewhere in this file)
 * to the same value. Restores the closing brace missing from the
 * sampled text.
 */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * No-rounding average of two 8-pixel-wide source blocks a and b into dst.
 * Wrapper binding all strides of the generic put_no_rnd_pixels8_l2()
 * helper (generated elsewhere in this file) to the same value.
 * Restores the closing brace missing from the sampled text.
 */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * 1/16-pel bilinear interpolation of an 8-pixel-wide block
 * (apparently the single-warp-point GMC case — confirm against callers).
 * x16/y16 are the fractional positions in sixteenths; the four corner
 * weights A..D sum to 256, so each output is an 8.8 fixed-point weighted
 * mean rounded via 'rounder' and shifted back down.
 * Restores the declaration of i and the per-row loop that were missing
 * from the sampled text.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Global motion compensation (affine warp) of an 8-pixel-wide block.
 * (ox,oy) appear to be 16.16 fixed-point source coordinates advanced by
 * the affine deltas dxx/dyx per column and dxy/dyy per row — TODO confirm
 * against callers. 'shift' selects the sub-pel precision s = 1<<shift,
 * 'r' is the rounding constant, width/height bound the valid source area.
 * Out-of-range coordinates fall back to edge-clamped 1-D or 0-D taps.
 * Restores the coordinate-stepping scaffolding missing from the sampled
 * text; the per-pixel interpolation expressions are unchanged.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    /* convert to inclusive maxima for the (unsigned) range tests below */
    width--;
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside: 2-D bilinear tap */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, horizontal-only tap */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, vertical-only tap */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* both outside: nearest clamped sample */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]=    src[index         ];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/**
 * Thirdpel MC, no motion: dispatch a plain block copy by width
 * (copy helpers are generated elsewhere in this file).
 * Restores the switch scaffolding missing from the sampled text.
 */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Thirdpel MC at horizontal offset 1/3: dst = (2*a + b)/3 with rounding,
 * where 683 = round(2048/3). Restores the loop scaffolding missing from
 * the sampled text.
 */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at horizontal offset 2/3: dst = (a + 2*b)/3 with rounding
 * (683 = round(2048/3)). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at vertical offset 1/3: dst = (2*top + bottom)/3 with
 * rounding (683 = round(2048/3)). Restores the loop scaffolding missing
 * from the sampled text.
 */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (1/3, 1/3): weighted 2x2 bilinear tap with weights
 * 4/3/3/2 scaled by 2731 = round(32768/12). Restores the loop
 * scaffolding missing from the sampled text.
 */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (1/3, 2/3): weighted 2x2 bilinear tap with weights
 * 3/2/4/3 scaled by 2731 = round(32768/12). Restores the loop
 * scaffolding missing from the sampled text.
 */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at vertical offset 2/3: dst = (top + 2*bottom)/3 with
 * rounding (683 = round(2048/3)). Restores the loop scaffolding missing
 * from the sampled text.
 */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (2/3, 1/3): weighted 2x2 bilinear tap with weights
 * 3/4/2/3 scaled by 2731 = round(32768/12). Restores the loop
 * scaffolding missing from the sampled text.
 */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (2/3, 2/3): weighted 2x2 bilinear tap with weights
 * 2/3/3/4 scaled by 2731 = round(32768/12). Restores the loop
 * scaffolding missing from the sampled text.
 */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC, no motion, averaging into dst: dispatch by width to the
 * averaging copy helpers (generated elsewhere in this file).
 * Restores the switch scaffolding missing from the sampled text.
 */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/**
 * Thirdpel MC at horizontal offset 1/3, averaged with the existing dst
 * pixel (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at horizontal offset 2/3, averaged with the existing dst
 * pixel (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at vertical offset 1/3, averaged with the existing dst
 * pixel (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (1/3, 1/3), averaged with the existing dst pixel
 * (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (1/3, 2/3), averaged with the existing dst pixel
 * (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at vertical offset 2/3, averaged with the existing dst
 * pixel (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (2/3, 1/3), averaged with the existing dst pixel
 * (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Thirdpel MC at (2/3, 2/3), averaged with the existing dst pixel
 * (rounding up). Restores the loop scaffolding missing from the
 * sampled text.
 */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/**
 * Generate fixed-width thirdpel MC wrappers around the generic
 * put_tpel_pixels_mcXY_c() helpers above.
 * Fix: each forwarding line previously began with a stray "void",
 * which would make the expansion an invalid old-style function
 * declaration instead of a call; removed so the wrappers actually
 * invoke the generic helpers when the macro is instantiated.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * Generate 2/4/8-wide H.264 chroma MC functions. A..D are the 2x2
 * bilinear weights from the eighth-pel offsets (x,y), summing to 64;
 * OP applies the final rounding/store (and averaging for the avg_
 * variants). When D==0 the tap degenerates to a 1-D filter along either
 * the row (C==0) or the column (C!=0), selected via 'step'.
 * Restores the if(D)/else skeleton and the per-row pointer advances
 * missing from the sampled text.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
/* H.264 chroma rounding ops: op_put stores (b + 32) >> 6 (the weights in
 * H264_CHROMA_MC sum to 64); op_avg additionally averages that value
 * with the existing destination pixel, rounding up. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate the put_ and avg_ chroma MC function families. */
H264_CHROMA_MC(put_       , op_put)
H264_CHROMA_MC(avg_       , op_avg)
/**
 * VC-1 8-wide chroma MC with downward-biased rounding: same 2x2 bilinear
 * weights A..D as the H.264 version, but the rounding constant is
 * 32 - 4 = 28 instead of 32 ("no_rnd" variant).
 * Restores the declaration of i and the per-row loop that were missing
 * from the sampled text.
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
/**
 * VC-1 8-wide chroma MC, "no_rnd" rounding (constant 32 - 4 = 28),
 * averaged into the existing destination pixels via the file-local
 * avg2() macro. Restores the declaration of i and the per-row loop
 * that were missing from the sampled text.
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst += stride;
        src += stride;
    }
}
1733 #define QPEL_MC(r, OPNAME, RND, OP) \
1734 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1735 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1739 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1740 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1741 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1742 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1743 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1744 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1745 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1746 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1752 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1754 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1758 const int src0= src[0*srcStride];\
1759 const int src1= src[1*srcStride];\
1760 const int src2= src[2*srcStride];\
1761 const int src3= src[3*srcStride];\
1762 const int src4= src[4*srcStride];\
1763 const int src5= src[5*srcStride];\
1764 const int src6= src[6*srcStride];\
1765 const int src7= src[7*srcStride];\
1766 const int src8= src[8*srcStride];\
1767 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1768 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1769 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1770 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1771 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1772 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1773 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1774 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1780 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1781 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1786 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1787 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1788 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1789 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1790 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1791 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1792 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1793 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1794 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1795 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1796 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1797 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1798 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1799 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1800 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1801 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1807 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1808 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1813 const int src0= src[0*srcStride];\
1814 const int src1= src[1*srcStride];\
1815 const int src2= src[2*srcStride];\
1816 const int src3= src[3*srcStride];\
1817 const int src4= src[4*srcStride];\
1818 const int src5= src[5*srcStride];\
1819 const int src6= src[6*srcStride];\
1820 const int src7= src[7*srcStride];\
1821 const int src8= src[8*srcStride];\
1822 const int src9= src[9*srcStride];\
1823 const int src10= src[10*srcStride];\
1824 const int src11= src[11*srcStride];\
1825 const int src12= src[12*srcStride];\
1826 const int src13= src[13*srcStride];\
1827 const int src14= src[14*srcStride];\
1828 const int src15= src[15*srcStride];\
1829 const int src16= src[16*srcStride];\
1830 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1831 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1832 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1833 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1834 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1835 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1836 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1837 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1838 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1839 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1840 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1841 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1842 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1843 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1844 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1845 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1851 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1852 OPNAME ## pixels8_c(dst, src, stride, 8);\
1855 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1857 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1858 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1861 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1862 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1865 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1867 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1868 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1871 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1872 uint8_t full[16*9];\
1874 copy_block9(full, src, 16, stride, 9);\
1875 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1876 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1879 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1880 uint8_t full[16*9];\
1881 copy_block9(full, src, 16, stride, 9);\
1882 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1885 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t full[16*9];\
1888 copy_block9(full, src, 16, stride, 9);\
1889 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1890 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1892 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1893 uint8_t full[16*9];\
1896 uint8_t halfHV[64];\
1897 copy_block9(full, src, 16, stride, 9);\
1898 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1899 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1900 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1901 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1903 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1904 uint8_t full[16*9];\
1906 uint8_t halfHV[64];\
1907 copy_block9(full, src, 16, stride, 9);\
1908 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1909 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1910 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1911 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1913 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1914 uint8_t full[16*9];\
1917 uint8_t halfHV[64];\
1918 copy_block9(full, src, 16, stride, 9);\
1919 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1920 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1921 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1922 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1924 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1925 uint8_t full[16*9];\
1927 uint8_t halfHV[64];\
1928 copy_block9(full, src, 16, stride, 9);\
1929 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1930 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1931 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1932 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1934 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1935 uint8_t full[16*9];\
1938 uint8_t halfHV[64];\
1939 copy_block9(full, src, 16, stride, 9);\
1940 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1941 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1942 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1943 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1945 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1946 uint8_t full[16*9];\
1948 uint8_t halfHV[64];\
1949 copy_block9(full, src, 16, stride, 9);\
1950 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1951 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1952 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1955 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1956 uint8_t full[16*9];\
1959 uint8_t halfHV[64];\
1960 copy_block9(full, src, 16, stride, 9);\
1961 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1962 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1963 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1964 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1966 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1967 uint8_t full[16*9];\
1969 uint8_t halfHV[64];\
1970 copy_block9(full, src, 16, stride, 9);\
1971 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1972 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1973 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1974 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1976 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1978 uint8_t halfHV[64];\
1979 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1980 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1981 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1983 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1985 uint8_t halfHV[64];\
1986 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1987 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1988 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1990 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1991 uint8_t full[16*9];\
1994 uint8_t halfHV[64];\
1995 copy_block9(full, src, 16, stride, 9);\
1996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1998 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1999 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2001 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2002 uint8_t full[16*9];\
2004 copy_block9(full, src, 16, stride, 9);\
2005 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2006 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2007 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2009 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t full[16*9];\
2013 uint8_t halfHV[64];\
2014 copy_block9(full, src, 16, stride, 9);\
2015 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2016 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2017 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2018 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2020 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t full[16*9];\
2023 copy_block9(full, src, 16, stride, 9);\
2024 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2025 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2026 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2028 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2030 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2031 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2033 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2034 OPNAME ## pixels16_c(dst, src, stride, 16);\
2037 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2039 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2040 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2043 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2044 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2047 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2049 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2050 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2053 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2054 uint8_t full[24*17];\
2056 copy_block17(full, src, 24, stride, 17);\
2057 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2058 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2061 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2062 uint8_t full[24*17];\
2063 copy_block17(full, src, 24, stride, 17);\
2064 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2067 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2068 uint8_t full[24*17];\
2070 copy_block17(full, src, 24, stride, 17);\
2071 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2072 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2074 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2075 uint8_t full[24*17];\
2076 uint8_t halfH[272];\
2077 uint8_t halfV[256];\
2078 uint8_t halfHV[256];\
2079 copy_block17(full, src, 24, stride, 17);\
2080 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2081 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2082 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2083 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2085 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2086 uint8_t full[24*17];\
2087 uint8_t halfH[272];\
2088 uint8_t halfHV[256];\
2089 copy_block17(full, src, 24, stride, 17);\
2090 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2091 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2092 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2093 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2095 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2096 uint8_t full[24*17];\
2097 uint8_t halfH[272];\
2098 uint8_t halfV[256];\
2099 uint8_t halfHV[256];\
2100 copy_block17(full, src, 24, stride, 17);\
2101 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2102 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2103 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2104 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2106 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2107 uint8_t full[24*17];\
2108 uint8_t halfH[272];\
2109 uint8_t halfHV[256];\
2110 copy_block17(full, src, 24, stride, 17);\
2111 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2112 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2113 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2114 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2116 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2117 uint8_t full[24*17];\
2118 uint8_t halfH[272];\
2119 uint8_t halfV[256];\
2120 uint8_t halfHV[256];\
2121 copy_block17(full, src, 24, stride, 17);\
2122 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2123 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2124 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2125 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2127 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2128 uint8_t full[24*17];\
2129 uint8_t halfH[272];\
2130 uint8_t halfHV[256];\
2131 copy_block17(full, src, 24, stride, 17);\
2132 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2133 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2134 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2135 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2137 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2138 uint8_t full[24*17];\
2139 uint8_t halfH[272];\
2140 uint8_t halfV[256];\
2141 uint8_t halfHV[256];\
2142 copy_block17(full, src, 24, stride, 17);\
2143 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2144 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2145 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2146 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2148 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2149 uint8_t full[24*17];\
2150 uint8_t halfH[272];\
2151 uint8_t halfHV[256];\
2152 copy_block17(full, src, 24, stride, 17);\
2153 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2154 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2155 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2156 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2158 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2159 uint8_t halfH[272];\
2160 uint8_t halfHV[256];\
2161 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2162 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2163 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2165 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2166 uint8_t halfH[272];\
2167 uint8_t halfHV[256];\
2168 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2169 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2170 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2172 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2173 uint8_t full[24*17];\
2174 uint8_t halfH[272];\
2175 uint8_t halfV[256];\
2176 uint8_t halfHV[256];\
2177 copy_block17(full, src, 24, stride, 17);\
2178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2180 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2181 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2183 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2184 uint8_t full[24*17];\
2185 uint8_t halfH[272];\
2186 copy_block17(full, src, 24, stride, 17);\
2187 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2188 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2189 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2191 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2192 uint8_t full[24*17];\
2193 uint8_t halfH[272];\
2194 uint8_t halfV[256];\
2195 uint8_t halfHV[256];\
2196 copy_block17(full, src, 24, stride, 17);\
2197 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2198 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2199 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2200 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2202 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2203 uint8_t full[24*17];\
2204 uint8_t halfH[272];\
2205 copy_block17(full, src, 24, stride, 17);\
2206 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2207 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2208 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2210 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2211 uint8_t halfH[272];\
2212 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2213 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store ops used to instantiate QPEL_MC: 'b' is a 6-tap filter sum scaled
 * by 32; "(b+16)>>5" rounds to nearest, "(b+15)>>5" is the no-rounding
 * variant required by MPEG-4; cm clips the result to 0..255.
 * avg variants average the clipped value with the existing pixel. */
2216 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2217 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2218 #define op_put(a, b) a = cm[((b) + 16)>>5]
2219 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the MPEG-4 quarter-pel MC function families for
 * put / put_no_rnd / avg, then drop the op_* helper macros.
 * avg_no_rnd is intentionally not generated (commented out).
 * NOTE(review): the matching "#undef op_avg" / "#undef op_put" lines
 * appear elided in this extract — verify against the full file. */
2221 QPEL_MC(0, put_ , _ , op_put)
2222 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2223 QPEL_MC(0, avg_ , _ , op_avg)
2224 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2226 #undef op_avg_no_rnd
2228 #undef op_put_no_rnd
/*
 * H264_LOWPASS(OPNAME, OP, OP2): generates the H.264 six-tap
 * (1,-5,20,20,-5,1) luma half-sample interpolation primitives —
 * horizontal (_h_lowpass), vertical (_v_lowpass) and separable
 * horizontal-then-vertical (_hv_lowpass) — for block widths 2, 4 and 8,
 * plus 16-wide wrappers composed from four 8-wide calls.
 * OP stores a sum scaled by 32 ("(x+16)>>5"); OP2 stores the doubly
 * filtered hv sum scaled by 1024 ("(x+512)>>10").  The hv variants first
 * fill an int16_t 'tmp' plane (h+5 rows, starting 2 rows above the block)
 * with horizontal sums, then filter that plane vertically.
 * NOTE(review): numerous body lines (loop headers, 'int i/h'
 * declarations, src/dst stride advances, closing braces) are elided in
 * this extract; the comments describe only what the visible lines show —
 * verify against the complete file before editing.
 */
2231 #define H264_LOWPASS(OPNAME, OP, OP2) \
2232 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2234 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2238 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2239 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2245 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2247 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2251 const int srcB= src[-2*srcStride];\
2252 const int srcA= src[-1*srcStride];\
2253 const int src0= src[0 *srcStride];\
2254 const int src1= src[1 *srcStride];\
2255 const int src2= src[2 *srcStride];\
2256 const int src3= src[3 *srcStride];\
2257 const int src4= src[4 *srcStride];\
2258 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2259 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2265 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2270 src -= 2*srcStride;\
2271 for(i=0; i<h+5; i++)\
2273 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2274 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2278 tmp -= tmpStride*(h+5-2);\
2281 const int tmpB= tmp[-2*tmpStride];\
2282 const int tmpA= tmp[-1*tmpStride];\
2283 const int tmp0= tmp[0 *tmpStride];\
2284 const int tmp1= tmp[1 *tmpStride];\
2285 const int tmp2= tmp[2 *tmpStride];\
2286 const int tmp3= tmp[3 *tmpStride];\
2287 const int tmp4= tmp[4 *tmpStride];\
2288 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2289 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2294 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2296 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2300 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2301 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2302 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2303 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2309 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2311 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2315 const int srcB= src[-2*srcStride];\
2316 const int srcA= src[-1*srcStride];\
2317 const int src0= src[0 *srcStride];\
2318 const int src1= src[1 *srcStride];\
2319 const int src2= src[2 *srcStride];\
2320 const int src3= src[3 *srcStride];\
2321 const int src4= src[4 *srcStride];\
2322 const int src5= src[5 *srcStride];\
2323 const int src6= src[6 *srcStride];\
2324 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2325 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2326 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2327 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2333 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2336 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2338 src -= 2*srcStride;\
2339 for(i=0; i<h+5; i++)\
2341 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2342 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2343 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2344 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2348 tmp -= tmpStride*(h+5-2);\
2351 const int tmpB= tmp[-2*tmpStride];\
2352 const int tmpA= tmp[-1*tmpStride];\
2353 const int tmp0= tmp[0 *tmpStride];\
2354 const int tmp1= tmp[1 *tmpStride];\
2355 const int tmp2= tmp[2 *tmpStride];\
2356 const int tmp3= tmp[3 *tmpStride];\
2357 const int tmp4= tmp[4 *tmpStride];\
2358 const int tmp5= tmp[5 *tmpStride];\
2359 const int tmp6= tmp[6 *tmpStride];\
2360 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2361 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2362 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2363 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2369 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2371 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2375 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2376 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2377 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2378 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2379 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2380 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2381 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2382 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2388 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2390 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2394 const int srcB= src[-2*srcStride];\
2395 const int srcA= src[-1*srcStride];\
2396 const int src0= src[0 *srcStride];\
2397 const int src1= src[1 *srcStride];\
2398 const int src2= src[2 *srcStride];\
2399 const int src3= src[3 *srcStride];\
2400 const int src4= src[4 *srcStride];\
2401 const int src5= src[5 *srcStride];\
2402 const int src6= src[6 *srcStride];\
2403 const int src7= src[7 *srcStride];\
2404 const int src8= src[8 *srcStride];\
2405 const int src9= src[9 *srcStride];\
2406 const int src10=src[10*srcStride];\
2407 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2408 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2409 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2410 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2411 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2412 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2413 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2414 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2420 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2423 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2425 src -= 2*srcStride;\
2426 for(i=0; i<h+5; i++)\
2428 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2429 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2430 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2431 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2432 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2433 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2434 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2435 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2439 tmp -= tmpStride*(h+5-2);\
2442 const int tmpB= tmp[-2*tmpStride];\
2443 const int tmpA= tmp[-1*tmpStride];\
2444 const int tmp0= tmp[0 *tmpStride];\
2445 const int tmp1= tmp[1 *tmpStride];\
2446 const int tmp2= tmp[2 *tmpStride];\
2447 const int tmp3= tmp[3 *tmpStride];\
2448 const int tmp4= tmp[4 *tmpStride];\
2449 const int tmp5= tmp[5 *tmpStride];\
2450 const int tmp6= tmp[6 *tmpStride];\
2451 const int tmp7= tmp[7 *tmpStride];\
2452 const int tmp8= tmp[8 *tmpStride];\
2453 const int tmp9= tmp[9 *tmpStride];\
2454 const int tmp10=tmp[10*tmpStride];\
2455 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2456 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2457 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2458 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2459 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2460 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2461 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2462 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2468 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2469 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2470 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2471 src += 8*srcStride;\
2472 dst += 8*dstStride;\
2473 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2474 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2477 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2478 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2479 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2480 src += 8*srcStride;\
2481 dst += 8*dstStride;\
2482 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2483 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2486 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2487 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2488 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2489 src += 8*srcStride;\
2490 dst += 8*dstStride;\
2491 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2492 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H264_MC(OPNAME, SIZE): generates the 16 H.264 quarter-sample motion
 * compensation entry points (mc00..mc33) for one block size.  Naming is
 * mc<dx><dy> with dx/dy in quarter samples: mc00 is a plain pixel copy;
 * mc20/mc02 are pure horizontal/vertical half-sample filters; mc22 is
 * the separable hv filter; the remaining positions average two
 * half-sample planes with pixels##SIZE##_l2.  'full' holds a copy of the
 * source with 2 rows above and 3 below (SIZE+5 rows) so the vertical
 * 6-tap filter has its margins; full_mid points at the block's own
 * first row within it.
 * NOTE(review): closing braces and some blank separators are elided in
 * this extract — verify against the complete file before editing.
 */
2495 #define H264_MC(OPNAME, SIZE) \
2496 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2497 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2500 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2501 uint8_t half[SIZE*SIZE];\
2502 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2503 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2506 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2507 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2510 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2511 uint8_t half[SIZE*SIZE];\
2512 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2513 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2516 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2517 uint8_t full[SIZE*(SIZE+5)];\
2518 uint8_t * const full_mid= full + SIZE*2;\
2519 uint8_t half[SIZE*SIZE];\
2520 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2521 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2522 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2525 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2526 uint8_t full[SIZE*(SIZE+5)];\
2527 uint8_t * const full_mid= full + SIZE*2;\
2528 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2529 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2532 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2533 uint8_t full[SIZE*(SIZE+5)];\
2534 uint8_t * const full_mid= full + SIZE*2;\
2535 uint8_t half[SIZE*SIZE];\
2536 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2537 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2538 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2541 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2542 uint8_t full[SIZE*(SIZE+5)];\
2543 uint8_t * const full_mid= full + SIZE*2;\
2544 uint8_t halfH[SIZE*SIZE];\
2545 uint8_t halfV[SIZE*SIZE];\
2546 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2547 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2548 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2549 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2552 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2553 uint8_t full[SIZE*(SIZE+5)];\
2554 uint8_t * const full_mid= full + SIZE*2;\
2555 uint8_t halfH[SIZE*SIZE];\
2556 uint8_t halfV[SIZE*SIZE];\
2557 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2558 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2559 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2560 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2563 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2564 uint8_t full[SIZE*(SIZE+5)];\
2565 uint8_t * const full_mid= full + SIZE*2;\
2566 uint8_t halfH[SIZE*SIZE];\
2567 uint8_t halfV[SIZE*SIZE];\
2568 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2569 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2570 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2571 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2574 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2575 uint8_t full[SIZE*(SIZE+5)];\
2576 uint8_t * const full_mid= full + SIZE*2;\
2577 uint8_t halfH[SIZE*SIZE];\
2578 uint8_t halfV[SIZE*SIZE];\
2579 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2580 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2581 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2582 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2585 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2586 int16_t tmp[SIZE*(SIZE+5)];\
2587 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2590 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2591 int16_t tmp[SIZE*(SIZE+5)];\
2592 uint8_t halfH[SIZE*SIZE];\
2593 uint8_t halfHV[SIZE*SIZE];\
2594 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2595 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2596 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2599 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2600 int16_t tmp[SIZE*(SIZE+5)];\
2601 uint8_t halfH[SIZE*SIZE];\
2602 uint8_t halfHV[SIZE*SIZE];\
2603 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2604 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2605 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2608 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2609 uint8_t full[SIZE*(SIZE+5)];\
2610 uint8_t * const full_mid= full + SIZE*2;\
2611 int16_t tmp[SIZE*(SIZE+5)];\
2612 uint8_t halfV[SIZE*SIZE];\
2613 uint8_t halfHV[SIZE*SIZE];\
2614 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2615 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2616 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2617 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2620 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2621 uint8_t full[SIZE*(SIZE+5)];\
2622 uint8_t * const full_mid= full + SIZE*2;\
2623 int16_t tmp[SIZE*(SIZE+5)];\
2624 uint8_t halfV[SIZE*SIZE];\
2625 uint8_t halfHV[SIZE*SIZE];\
2626 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2627 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2628 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2629 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store ops for H264_LOWPASS: op_*/op_avg take a 32-scaled single-pass
 * sum ("(b+16)>>5"); op2_* take the 1024-scaled two-pass hv sum
 * ("(b+512)>>10").  cm clips to 0..255; avg rounds up when averaging.
 * op_avg2 (weighted average) is kept only as a commented-out reference. */
2632 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2633 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2634 #define op_put(a, b) a = cm[((b) + 16)>>5]
2635 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2636 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the put_ and avg_ H.264 lowpass filter families.
 * NOTE(review): the matching H264_MC(put_, ...) / H264_MC(avg_, ...)
 * instantiations and #undef lines appear elided in this extract. */
2638 H264_LOWPASS(put_ , op_put, op2_put)
2639 H264_LOWPASS(avg_ , op_avg, op2_avg)
/*
 * H264_WEIGHT(W,H): generates the explicit (weight_) and implicit/bi
 * (biweight_) weighted-prediction functions for a WxH block.
 * op_scale1 rescales one plane in place: (pix*weight + offset) >>
 * log2_denom, clipped to uint8; op_scale2 blends src into dst with two
 * weights and (log2_denom+1) shift.  The generated bodies process the
 * widest case and 'continue' early for narrower W.
 * NOTE(review): the per-pixel op_scale* invocation lines and closing
 * braces are elided in this extract — verify against the full file.
 */
2654 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2655 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2656 #define H264_WEIGHT(W,H) \
2657 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2659 offset <<= log2_denom; \
2660 if(log2_denom) offset += 1<<(log2_denom-1); \
2661 for(y=0; y<H; y++, block += stride){ \
2664 if(W==2) continue; \
2667 if(W==4) continue; \
2672 if(W==8) continue; \
2683 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2685 offset = ((offset + 1) | 1) << log2_denom; \
2686 for(y=0; y<H; y++, dst += stride, src += stride){ \
2689 if(W==2) continue; \
2692 if(W==4) continue; \
2697 if(W==8) continue; \
/*
 * WMV2/MSpel horizontal 4-tap filter (-1,9,9,-1)/16 over an 8-wide row,
 * rounded with +8 and clipped through the ff_cropTbl lookup.
 * dst/src advance by their own strides per row; 'h' rows are produced.
 * NOTE(review): the loop header, stride advances and closing braces are
 * elided in this extract — verify against the full file.
 */
2724 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2725 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2729 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2730 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2731 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2732 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2733 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2734 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2735 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2736 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS decoder support: the full-pel (mc00) qpel positions are plain
 * pixel copies, so they are thin wrappers around the generic
 * put/avg_pixels helpers.  ff_cavsdsp_init fills in the rest.
 * NOTE(review): closing braces of the wrappers are elided in this
 * extract. */
2742 #if CONFIG_CAVS_DECODER
2744 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2746 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2747 put_pixels8_c(dst, src, stride, 8);
2749 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2750 avg_pixels8_c(dst, src, stride, 8);
2752 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2753 put_pixels16_c(dst, src, stride, 16);
2755 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2756 avg_pixels16_c(dst, src, stride, 16);
2758 #endif /* CONFIG_CAVS_DECODER */
/* Forward declarations for per-codec DSP init functions, plus the VC-1
 * full-pel (mc00) wrappers: the rnd parameter is unused for the
 * full-pel position, which is a plain 8x8 pixel copy/average.
 * NOTE(review): closing braces of the wrappers are elided in this
 * extract. */
2760 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
2762 #if CONFIG_VC1_DECODER
2764 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2766 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2767 put_pixels8_c(dst, src, stride, 8);
2769 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2770 avg_pixels8_c(dst, src, stride, 8);
2772 #endif /* CONFIG_VC1_DECODER */
2774 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2777 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
/* RV30/RV40 decoder support.  For RV40 the (3,3) qpel position maps to
 * the center half-pel interpolation, so it reuses the generic
 * put/avg_pixels*_xy2 bilinear-average helpers.
 * NOTE(review): closing braces of the wrappers are elided in this
 * extract. */
2779 #if CONFIG_RV30_DECODER
2780 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2781 #endif /* CONFIG_RV30_DECODER */
2783 #if CONFIG_RV40_DECODER
2784 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2785 put_pixels16_xy2_c(dst, src, stride, 16);
2787 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2788 avg_pixels16_xy2_c(dst, src, stride, 16);
2790 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2791 put_pixels8_xy2_c(dst, src, stride, 8);
2793 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2794 avg_pixels8_xy2_c(dst, src, stride, 8);
2797 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2798 #endif /* CONFIG_RV40_DECODER */
/*
 * WMV2/MSpel vertical 4-tap filter (-1,9,9,-1)/16: produces 8 output
 * rows per column (reading rows -1..9), rounded with +8 and clipped via
 * ff_cropTbl.  'w' columns are processed.
 * NOTE(review): the column loop header and closing braces are elided in
 * this extract — verify against the full file.
 */
2800 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2801 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2805 const int src_1= src[ -srcStride];
2806 const int src0 = src[0 ];
2807 const int src1 = src[ srcStride];
2808 const int src2 = src[2*srcStride];
2809 const int src3 = src[3*srcStride];
2810 const int src4 = src[4*srcStride];
2811 const int src5 = src[5*srcStride];
2812 const int src6 = src[6*srcStride];
2813 const int src7 = src[7*srcStride];
2814 const int src8 = src[8*srcStride];
2815 const int src9 = src[9*srcStride];
2816 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2817 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2818 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2819 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2820 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2821 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2822 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2823 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/*
 * WMV2 MSpel motion compensation positions (mc<dx><dy>), built from the
 * h/v lowpass filters above: mc00 is a copy; mc10/mc30 average the
 * source (or source+1) with the horizontal half-pel plane; mc20/mc02
 * are pure h/v filters; mc12/mc32/mc22 filter horizontally first into
 * halfH (11 rows, starting one row above) and then vertically.
 * NOTE(review): the local half/halfH/halfV/halfHV array declarations
 * and closing braces are elided in this extract — verify against the
 * full file.
 */
2829 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2830 put_pixels8_c(dst, src, stride, 8);
2833 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2835 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2836 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2839 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2840 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2843 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2845 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2846 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2849 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2850 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2853 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2857 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2858 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2859 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2860 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2862 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2866 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2867 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2868 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2869 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2871 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2873 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2874 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/*
 * H.263 deblocking across a horizontal block edge (filters vertically).
 * For each column x: p0,p1 sit above the edge, p2,p3 below.  d is the
 * edge gradient; d1 is the piecewise "hat" correction bounded by the
 * qscale-dependent strength table, applied to p1/p2 which are then
 * clamped to 0..255 via the "(v & 256) -> ~(v>>31)" trick (negative
 * becomes 0, >255 becomes 255).  A secondary, ad1-limited correction d2
 * smooths p0/p3.  Body is compiled only when an H.263 codec is enabled.
 * NOTE(review): the loop header and intermediate lines are elided in
 * this extract — verify against the full file.
 */
2877 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2878 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2880 const int strength= ff_h263_loop_filter_strength[qscale];
2884 int p0= src[x-2*stride];
2885 int p1= src[x-1*stride];
2886 int p2= src[x+0*stride];
2887 int p3= src[x+1*stride];
2888 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2890 if (d<-2*strength) d1= 0;
2891 else if(d<- strength) d1=-2*strength - d;
2892 else if(d< strength) d1= d;
2893 else if(d< 2*strength) d1= 2*strength - d;
2898 if(p1&256) p1= ~(p1>>31);
2899 if(p2&256) p2= ~(p2>>31);
2901 src[x-1*stride] = p1;
2902 src[x+0*stride] = p2;
2906 d2= av_clip((p0-p3)/4, -ad1, ad1);
2908 src[x-2*stride] = p0 - d2;
2909 src[x+ stride] = p3 + d2;
/*
 * H.263 deblocking across a vertical block edge (filters horizontally).
 * Mirror image of h263_v_loop_filter_c: per row y, p0..p3 span the edge
 * left-to-right; same piecewise d1 correction, same 0..255 clamp trick,
 * same ad1-limited d2 smoothing of the outer pixels.
 * NOTE(review): the loop header and intermediate lines are elided in
 * this extract — verify against the full file.
 */
2914 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2915 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2917 const int strength= ff_h263_loop_filter_strength[qscale];
2921 int p0= src[y*stride-2];
2922 int p1= src[y*stride-1];
2923 int p2= src[y*stride+0];
2924 int p3= src[y*stride+1];
2925 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2927 if (d<-2*strength) d1= 0;
2928 else if(d<- strength) d1=-2*strength - d;
2929 else if(d< strength) d1= d;
2930 else if(d< 2*strength) d1= 2*strength - d;
2935 if(p1&256) p1= ~(p1>>31);
2936 if(p2&256) p2= ~(p2>>31);
2938 src[y*stride-1] = p1;
2939 src[y*stride+0] = p2;
2943 d2= av_clip((p0-p3)/4, -ad1, ad1);
2945 src[y*stride-2] = p0 - d2;
2946 src[y*stride+1] = p3 + d2;
/*
 * H.261 in-loop filter for one 8x8 block: a separable (1,2,1)/4 smooth.
 * Vertical pass into temp[] (edge rows copied through scaled by 4),
 * then horizontal (1,2,1) pass with +8 rounding and >>4 back into src;
 * edge columns get only the vertical result ((t+2)>>2).
 * NOTE(review): loop headers, temp[] declaration and the yz index
 * computation are elided in this extract — verify against the full file.
 */
2951 static void h261_loop_filter_c(uint8_t *src, int stride){
2956 temp[x ] = 4*src[x ];
2957 temp[x + 7*8] = 4*src[x + 7*stride];
2961 xy = y * stride + x;
2963 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2968 src[ y*stride] = (temp[ y*8] + 2)>>2;
2969 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2971 xy = y * stride + x;
2973 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (bS<4) luma deblocking across one 4-sample edge segment per
 * tc0[i].  xstride steps across the edge, ystride along it.  Thresholds
 * alpha/beta and clip table tc0 follow the spec (ITU-T H.264 clause 8.7).
 * NOTE(review): tc/i_delta declarations, tc0<0 skip logic, pointer
 * advancement and braces appear elided from this listing. */
2978 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2981 for( i = 0; i < 4; i++ ) {
2986 for( d = 0; d < 4; d++ ) {
2987 const int p0 = pix[-1*xstride];
2988 const int p1 = pix[-2*xstride];
2989 const int p2 = pix[-3*xstride];
2990 const int q0 = pix[0];
2991 const int q1 = pix[1*xstride];
2992 const int q2 = pix[2*xstride];
/* filter only where the edge looks like a blocking artifact, not a real edge */
2994 if( FFABS( p0 - q0 ) < alpha &&
2995 FFABS( p1 - p0 ) < beta &&
2996 FFABS( q1 - q0 ) < beta ) {
/* optional p1 correction when the p-side is smooth */
3001 if( FFABS( p2 - p0 ) < beta ) {
3002 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
/* optional q1 correction when the q-side is smooth */
3005 if( FFABS( q2 - q0 ) < beta ) {
3006 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* main delta applied symmetrically to p0/q0, clipped to +-tc */
3010 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3011 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
3012 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    /* Horizontal edge: samples across the edge are one row (stride) apart,
     * samples along the edge are adjacent bytes. */
    h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    /* Vertical edge: samples across the edge are adjacent bytes, samples
     * along the edge are one row (stride) apart. */
    h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
}
/* H.264 strong (bS==4, intra) luma deblocking over a 16-sample edge.
 * Uses the stronger 4/5-tap filters when the edge is very flat, otherwise a
 * weak 3-tap filter.  NOTE(review): the per-sample pointer advance, braces
 * and else keywords appear elided from this listing. */
3027 static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3030 for( d = 0; d < 16; d++ ) {
3031 const int p2 = pix[-3*xstride];
3032 const int p1 = pix[-2*xstride];
3033 const int p0 = pix[-1*xstride];
3035 const int q0 = pix[ 0*xstride];
3036 const int q1 = pix[ 1*xstride];
3037 const int q2 = pix[ 2*xstride];
3039 if( FFABS( p0 - q0 ) < alpha &&
3040 FFABS( p1 - p0 ) < beta &&
3041 FFABS( q1 - q0 ) < beta ) {
/* very small step across the edge -> strong filtering allowed */
3043 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3044 if( FFABS( p2 - p0 ) < beta)
3046 const int p3 = pix[-4*xstride];
/* strong p-side filter: modifies p0, p1, p2 */
3048 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3049 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3050 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
/* weak p-side fallback */
3053 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3055 if( FFABS( q2 - q0 ) < beta)
3057 const int q3 = pix[3*xstride];
/* strong q-side filter: modifies q0, q1, q2 */
3059 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3060 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3061 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
/* weak q-side fallback */
3064 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* not flat enough for strong mode: weak filter both sides */
3068 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3069 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    /* Horizontal-edge dispatch of the intra (strong) luma filter. */
    h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    /* Vertical-edge dispatch of the intra (strong) luma filter. */
    h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
}
/* H.264 normal chroma deblocking: 2 samples per tc0[i] segment, only p0/q0
 * are modified.  NOTE(review): the tc increment (chroma uses tc0[i]+1),
 * pointer advancement and braces appear elided from this listing. */
3084 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3087 for( i = 0; i < 4; i++ ) {
3088 const int tc = tc0[i];
3093 for( d = 0; d < 2; d++ ) {
3094 const int p0 = pix[-1*xstride];
3095 const int p1 = pix[-2*xstride];
3096 const int q0 = pix[0];
3097 const int q1 = pix[1*xstride];
3099 if( FFABS( p0 - q0 ) < alpha &&
3100 FFABS( p1 - p0 ) < beta &&
3101 FFABS( q1 - q0 ) < beta ) {
/* same delta formula as luma, clipped to +-tc */
3103 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3105 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3106 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    /* Horizontal-edge dispatch of the normal chroma filter. */
    h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
}
static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
{
    /* Vertical-edge dispatch of the normal chroma filter. */
    h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
}
/* H.264 intra (bS==4) chroma deblocking over an 8-sample edge: unclipped
 * 3-tap filter of p0/q0 only.  NOTE(review): the loop variable declaration,
 * pointer advancement and braces appear elided from this listing. */
3121 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3124 for( d = 0; d < 8; d++ ) {
3125 const int p0 = pix[-1*xstride];
3126 const int p1 = pix[-2*xstride];
3127 const int q0 = pix[0];
3128 const int q1 = pix[1*xstride];
3130 if( FFABS( p0 - q0 ) < alpha &&
3131 FFABS( p1 - p0 ) < beta &&
3132 FFABS( q1 - q0 ) < beta ) {
3134 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3135 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    /* Horizontal-edge dispatch of the intra chroma filter. */
    h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
}
static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
{
    /* Vertical-edge dispatch of the intra chroma filter. */
    h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
}
/**
 * Sum of absolute differences of a 16-pixel-wide block, h rows.
 * The first (void*) argument is an unused context pointer required by the
 * me_cmp_func signature.
 */
static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * SAD of a 16-wide block against the half-pel horizontally interpolated
 * reference (rounding average of each pixel and its right neighbour).
 * avg2() is the rounding byte-average helper defined earlier in this file.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * SAD of a 16-wide block against the half-pel vertically interpolated
 * reference (rounding average of each pixel and the one below it).
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
/**
 * SAD of a 16-wide block against the half-pel diagonally interpolated
 * reference (rounding average of the 2x2 neighbourhood, via avg4()).
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 16; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
/**
 * Sum of absolute differences of an 8-pixel-wide block, h rows.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - pix2[col]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * 8-wide SAD against the horizontally half-pel interpolated reference.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], pix2[col + 1]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return sad;
}
/**
 * 8-wide SAD against the vertically half-pel interpolated reference.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg2(pix2[col], below[col]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
/**
 * 8-wide SAD against the diagonally half-pel interpolated reference.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    uint8_t *below = pix2 + line_size;
    int sad = 0;
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            sad += abs(pix1[col] - avg4(pix2[col], pix2[col + 1],
                                        below[col], below[col + 1]));
        pix1  += line_size;
        pix2  += line_size;
        below += line_size;
    }
    return sad;
}
/* Noise-preserving SSE, 16-wide: score1 is plain SSE, score2 compares the
 * local 2x2 gradient texture of both images so preserved noise is penalized
 * less.  nsse_weight comes from the codec context when available.
 * NOTE(review): loop headers, score declarations and the per-row pointer
 * advance appear elided from this listing. */
3349 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3350 MpegEncContext *c = v;
3356 for(x=0; x<16; x++){
3357 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3360 for(x=0; x<15; x++){
3361 score2+= FFABS( s1[x ] - s1[x +stride]
3362 - s1[x+1] + s1[x+1+stride])
3363 -FFABS( s2[x ] - s2[x +stride]
3364 - s2[x+1] + s2[x+1+stride]);
/* weight defaults to 8 when called without a context */
3371 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3372 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c above; identical scoring scheme.
 * NOTE(review): loop headers and declarations appear elided. */
3375 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3376 MpegEncContext *c = v;
3383 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3387 score2+= FFABS( s1[x ] - s1[x +stride]
3388 - s1[x+1] + s1[x+1+stride])
3389 -FFABS( s2[x ] - s2[x +stride]
3390 - s2[x+1] + s2[x+1+stride]);
3397 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3398 else return score1 + FFABS(score2)*8;
/* Trellis helper: weighted squared error of the residual after adding
 * `scale` times a basis function, without modifying rem[].
 * NOTE(review): the declarations of i/sum and the weight load (w) appear
 * elided from this listing. */
3401 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3405 for(i=0; i<8*8; i++){
/* reconstruct coefficient with rounding at the BASIS->RECON shift */
3406 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3409 assert(-512<b && b<512);
3411 sum += (w*b)*(w*b)>>4;
3416 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3419 for(i=0; i<8*8; i++){
3420 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3425 * permutes an 8x8 block.
3426 * @param block the block which will be permuted according to the given permutation vector
3427 * @param permutation the permutation vector
3428 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3429 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3430 * (inverse) permutated to scantable order!
3432 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
/* NOTE(review): the temp[] declaration and the copy/clear of the touched
 * coefficients appear elided from this listing. */
3438 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: save the coefficients that will be moved */
3440 for(i=0; i<=last; i++){
3441 const int j= scantable[i];
/* second pass: scatter them to their permuted positions */
3446 for(i=0; i<=last; i++){
3447 const int j= scantable[i];
3448 const int perm_j= permutation[j];
3449 block[perm_j]= temp[j];
/**
 * Dummy me_cmp_func that always reports a perfect (zero-cost) match;
 * used to disable a comparison stage.
 */
static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h)
{
    return 0;
}
/* Fill cmp[0..5] with the comparison functions selected by `type`
 * (FF_CMP_* style selector; each case copies one family of function
 * pointers out of the DSPContext).  NOTE(review): the switch statement,
 * most case labels and the loop over block sizes appear elided from this
 * listing. */
3457 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3460 memset(cmp, 0, sizeof(void*)*6);
3468 cmp[i]= c->hadamard8_diff[i];
3474 cmp[i]= c->dct_sad[i];
3477 cmp[i]= c->dct264_sad[i];
3480 cmp[i]= c->dct_max[i];
3483 cmp[i]= c->quant_psnr[i];
3503 #if CONFIG_SNOW_ENCODER
/* unknown selector: report instead of silently leaving cmp[] zeroed */
3512 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3517 static void clear_block_c(DCTELEM *block)
3519 memset(block, 0, sizeof(DCTELEM)*64);
3523 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3525 static void clear_blocks_c(DCTELEM *blocks)
3527 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/**
 * dst[i] += src[i] for w bytes, with per-byte wraparound (mod 256).
 * Processes sizeof(long) bytes at a time using the SWAR carry-isolation
 * trick, then finishes the tail byte by byte.
 */
static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
    const unsigned long mask7f = (~0UL / 255) * 0x7f; /* 0x7f in every byte */
    const unsigned long mask80 = (~0UL / 255) * 0x80; /* 0x80 in every byte */
    int i;
    /* Cast sizeof to int: otherwise w - sizeof(long) is evaluated as
     * size_t, so any w < sizeof(long) wrapped to a huge value and the
     * loop overran both buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src+i);
        long b = *(long*)(dst+i);
        /* add low 7 bits of each byte, then patch in the top-bit carries */
        *(long*)(dst+i) = ((a&mask7f) + (b&mask7f)) ^ ((a^b)&mask80);
    }
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
/**
 * dst[i] = src1[i] + src2[i] (mod 256) for w bytes, word-at-a-time with a
 * byte-wise tail.
 */
static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    const unsigned long mask7f = (~0UL / 255) * 0x7f; /* 0x7f in every byte */
    const unsigned long mask80 = (~0UL / 255) * 0x80; /* 0x80 in every byte */
    int i;
    /* (int) cast: w - sizeof(long) as size_t wrapped for small w and made
     * the vector loop overrun the buffers. */
    for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
        long a = *(long*)(src1+i);
        long b = *(long*)(src2+i);
        /* SWAR per-byte addition: low 7 bits added, top-bit carries XORed in */
        *(long*)(dst+i) = ((a&mask7f) + (b&mask7f)) ^ ((a^b)&mask80);
    }
    for(; i<w; i++)
        dst[i] = src1[i]+src2[i];
}
/**
 * dst[i] = src1[i] - src2[i] (mod 256) for w bytes.
 * On targets without fast unaligned access, falls back to a byte-wise loop
 * when src2 is misaligned; otherwise subtracts one long at a time with a
 * SWAR borrow-isolation trick, then handles the tail byte by byte.
 */
static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
    int i;
#if !HAVE_FAST_UNALIGNED
    if((long)src2 & (sizeof(long)-1)){
        for(i=0; i+7<w; i+=8){
            dst[i+0] = src1[i+0]-src2[i+0];
            dst[i+1] = src1[i+1]-src2[i+1];
            dst[i+2] = src1[i+2]-src2[i+2];
            dst[i+3] = src1[i+3]-src2[i+3];
            dst[i+4] = src1[i+4]-src2[i+4];
            dst[i+5] = src1[i+5]-src2[i+5];
            dst[i+6] = src1[i+6]-src2[i+6];
            dst[i+7] = src1[i+7]-src2[i+7];
        }
    }else
#endif
    {
        const unsigned long mask7f = (~0UL / 255) * 0x7f; /* 0x7f per byte */
        const unsigned long mask80 = (~0UL / 255) * 0x80; /* 0x80 per byte */
        /* (int) cast: w - sizeof(long) as size_t wrapped for small w and
         * made this loop overrun the buffers. */
        for(i=0; i<=w-(int)sizeof(long); i+=sizeof(long)){
            long a = *(long*)(src1+i);
            long b = *(long*)(src2+i);
            /* per-byte subtraction: force borrows local to each byte */
            *(long*)(dst+i) = ((a|mask80) - (b&mask7f)) ^ ((a^b^mask80)&mask80);
        }
    }
    for(; i<w; i++)
        dst[i+0] = src1[i+0]-src2[i+0];
}
/* HuffYUV median-prediction decode: reconstruct each byte as
 * mid_pred(left, top, left+top-topleft) + residual, tracking left/left_top
 * across the row via the in/out int pointers.  NOTE(review): the l/lt
 * initialisation, loop, and final write-back of *left/*left_top appear
 * elided from this listing. */
3577 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3585 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* HuffYUV median-prediction encode: emit src2[i] - mid_pred(...) residuals;
 * inverse of add_hfyu_median_prediction_c above.  NOTE(review): most of the
 * body (loop, state updates) appears elided from this listing. */
3594 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3602 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* HuffYUV left-prediction decode: running byte accumulator over the row;
 * returns the final accumulator so the caller can carry it to the next
 * call.  NOTE(review): the loop body and return appear elided. */
3612 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3615 for(i=0; i<w-1; i++){
/* BGR32 variant of left prediction: one running accumulator per channel
 * (r/g/b/a in/out pointers).  NOTE(review): the body is elided from this
 * listing. */
3642 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers used by hadamard8_diff/intra below:
 * BUTTERFLY2 writes sum/difference of two inputs to two outputs,
 * BUTTERFLY1 does it in place, BUTTERFLYA returns |x+y| + |x-y|.
 * NOTE(review): the backslash-continued macro bodies of BUTTERFLY2/1 are
 * not visible in this listing. */
3672 #define BUTTERFLY2(o1,o2,i1,i2) \
3676 #define BUTTERFLY1(x,y) \
3685 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of an 8x8 difference block: 2-D 8-point Hadamard transform of
 * src-dst (rows then columns, via the BUTTERFLY macros), summing absolute
 * transformed coefficients.  NOTE(review): loop headers, temp[64]/sum
 * declarations and braces appear elided from this listing. */
3687 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
/* horizontal (row) transform of the pixel differences */
3695 //FIXME try pointer walks
3696 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3697 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3698 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3699 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3701 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3702 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3703 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3704 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3706 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3707 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3708 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3709 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical (column) transform, accumulating |coeff| via BUTTERFLYA */
3713 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3714 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3715 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3716 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3718 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3719 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3720 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3721 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3724 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3725 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3726 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3727 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug max tracking, presumably under an #if 0/statistics block */
3733 printf("MAX:%d\n", maxi);
/* Intra SATD of an 8x8 block: same 2-D Hadamard as hadamard8_diff8x8_c but
 * on raw pixels, with the DC term |temp[0]+temp[32]| removed at the end so
 * the mean does not dominate.  NOTE(review): loop headers, declarations and
 * braces appear elided from this listing. */
3739 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3747 //FIXME try pointer walks
3748 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3749 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3750 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3751 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3753 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3754 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3755 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3756 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3758 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3759 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3760 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3761 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3765 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3766 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3767 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3768 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3770 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3771 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3772 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3773 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3776 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3777 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3778 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3779 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3782 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the 8x8 pixel difference and sum the
 * absolute coefficient values via the DSPContext helpers.
 * NOTE(review): the assert/fdct calls between diff_pixels and
 * sum_abs_dctelem appear elided from this listing. */
3787 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3788 MpegEncContext * const s= (MpegEncContext *)c;
/* 16-byte aligned scratch for the DCT coefficients */
3789 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3790 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3794 s->dsp.diff_pixels(temp, src1, src2, stride);
3796 return s->dsp.sum_abs_dctelem(temp);
3801 const int s07 = SRC(0) + SRC(7);\
3802 const int s16 = SRC(1) + SRC(6);\
3803 const int s25 = SRC(2) + SRC(5);\
3804 const int s34 = SRC(3) + SRC(4);\
3805 const int a0 = s07 + s34;\
3806 const int a1 = s16 + s25;\
3807 const int a2 = s07 - s34;\
3808 const int a3 = s16 - s25;\
3809 const int d07 = SRC(0) - SRC(7);\
3810 const int d16 = SRC(1) - SRC(6);\
3811 const int d25 = SRC(2) - SRC(5);\
3812 const int d34 = SRC(3) - SRC(4);\
3813 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3814 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3815 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3816 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3818 DST(1, a4 + (a7>>2)) ;\
3819 DST(2, a2 + (a3>>1)) ;\
3820 DST(3, a5 + (a6>>2)) ;\
3822 DST(5, a6 - (a5>>2)) ;\
3823 DST(6, (a2>>1) - a3 ) ;\
3824 DST(7, (a4>>2) - a7 ) ;\
/* H.264-style transform SAD: applies the integer DCT8_1D defined above to
 * rows then columns of the pixel difference, accumulating |coeff| through
 * the DST macro in the second pass.  NOTE(review): the dct[][] declaration,
 * #undef lines and return appear elided from this listing. */
3827 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3828 MpegEncContext * const s= (MpegEncContext *)c;
3833 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: transform in place */
3835 #define SRC(x) dct[i][x]
3836 #define DST(x,v) dct[i][x]= v
3837 for( i = 0; i < 8; i++ )
/* column pass: DST redefined to accumulate the absolute values */
3842 #define SRC(x) dct[x][i]
3843 #define DST(x,v) sum += FFABS(v)
3844 for( i = 0; i < 8; i++ )
/* Maximum absolute DCT coefficient of the 8x8 pixel difference.
 * NOTE(review): the fdct call and the i/sum declarations and loop header
 * appear elided from this listing. */
3852 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3853 MpegEncContext * const s= (MpegEncContext *)c;
3854 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3855 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3860 s->dsp.diff_pixels(temp, src1, src2, stride);
3864 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: forward DCT + quantize + dequantize + IDCT the
 * difference block and return the squared error versus the unquantized
 * coefficients.  NOTE(review): the fdct call, scale handling and loop
 * headers appear elided from this listing. */
3869 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3870 MpegEncContext * const s= (MpegEncContext *)c;
/* one aligned buffer holds both temp (first 64) and bak (next 64) */
3871 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3872 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3873 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3879 s->dsp.diff_pixels(temp, src1, src2, stride);
3881 memcpy(bak, temp, 64*sizeof(DCTELEM));
3883 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3884 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3885 ff_simple_idct(temp); //FIXME
3888 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion cost of coding one 8x8 block: quantizes the difference,
 * counts the VLC bits of the resulting run/level pairs, reconstructs the
 * block and returns distortion + lambda-weighted bits.
 * NOTE(review): numerous lines (dct/quant calls, run/level bookkeeping,
 * start_i selection, loop bodies) appear elided from this listing. */
3893 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3894 MpegEncContext * const s= (MpegEncContext *)c;
3895 const uint8_t *scantable= s->intra_scantable.permutated;
3896 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3897 DECLARE_ALIGNED_16(uint64_t, aligned_src1[8]);
3898 DECLARE_ALIGNED_16(uint64_t, aligned_src2[8]);
3899 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3900 uint8_t * const lsrc1 = (uint8_t*)aligned_src1;
3901 uint8_t * const lsrc2 = (uint8_t*)aligned_src2;
3902 int i, last, run, bits, level, distortion, start_i;
3903 const int esc_length= s->ac_esc_length;
3905 uint8_t * last_length;
/* work on aligned local copies of both 8x8 blocks */
3909 copy_block8(lsrc1, src1, 8, stride, 8);
3910 copy_block8(lsrc2, src2, 8, stride, 8);
3912 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3914 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra blocks use the DC VLC plus intra AC tables */
3920 length = s->intra_ac_vlc_length;
3921 last_length= s->intra_ac_vlc_last_length;
3922 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3925 length = s->inter_ac_vlc_length;
3926 last_length= s->inter_ac_vlc_last_length;
/* count bits for all coefficients before the last one */
3931 for(i=start_i; i<last; i++){
3932 int j= scantable[i];
/* |level| <= 63 -> regular VLC, otherwise escape code */
3937 if((level&(~127)) == 0){
3938 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the last coefficient uses the "last" VLC table */
3947 level= temp[i] + 64;
3951 if((level&(~127)) == 0){
3952 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* reconstruct and measure distortion against the aligned original */
3960 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3962 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3965 s->dsp.idct_add(lsrc2, 8, temp);
3967 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
/* lambda scaling: 109/128 approximates the rd constant used elsewhere */
3969 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost metric: like rd8x8_c but returns only the VLC bit count of the
 * quantized 8x8 difference block, without reconstruction/distortion.
 * NOTE(review): dct/quant calls, run/level bookkeeping and the return
 * appear elided from this listing. */
3972 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3973 MpegEncContext * const s= (MpegEncContext *)c;
3974 const uint8_t *scantable= s->intra_scantable.permutated;
3975 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3976 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3977 int i, last, run, bits, level, start_i;
3978 const int esc_length= s->ac_esc_length;
3980 uint8_t * last_length;
3984 s->dsp.diff_pixels(temp, src1, src2, stride);
3986 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3992 length = s->intra_ac_vlc_length;
3993 last_length= s->intra_ac_vlc_last_length;
3994 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3997 length = s->inter_ac_vlc_length;
3998 last_length= s->inter_ac_vlc_last_length;
4003 for(i=start_i; i<last; i++){
4004 int j= scantable[i];
4009 if((level&(~127)) == 0){
4010 bits+= length[UNI_AC_ENC_INDEX(run, level)];
4019 level= temp[i] + 64;
4023 if((level&(~127)) == 0){
4024 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Template generating vsad_intra8_c / vsad_intra16_c: vertical SAD of a
 * block against itself one row down (a blockiness/activity measure). */
4032 #define VSAD_INTRA(size) \
4033 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4037 for(y=1; y<h; y++){ \
4038 for(x=0; x<size; x+=4){ \
4039 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
4040 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Vertical SAD of the 16-wide difference signal: measures how much the
 * residual changes from one row to the next.  NOTE(review): the outer row
 * loop, score declaration, pointer advance and return appear elided. */
4050 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4055 for(x=0; x<16; x++){
4056 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* SQ squares its argument; VSSE_INTRA generates vsse_intra8_c /
 * vsse_intra16_c, the squared-error counterpart of VSAD_INTRA above. */
4065 #define SQ(a) ((a)*(a))
4066 #define VSSE_INTRA(size) \
4067 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4071 for(y=1; y<h; y++){ \
4072 for(x=0; x<size; x+=4){ \
4073 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
4074 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
/* Vertical SSE of the 16-wide difference signal (squared-error version of
 * vsad16_c).  NOTE(review): outer loop, declarations and return elided. */
4084 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4089 for(x=0; x<16; x++){
4090 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/**
 * Sum of squared differences between an int8 and an int16 vector of
 * `size` elements.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score = 0;
    int i;

    for (i = 0; i < size; i++) {
        const int d = pix1[i] - pix2[i];
        score += d * d;
    }
    return score;
}
/* Instantiate 16x16 versions of the 8x8 comparison functions: the
 * WRAPPER8_16_SQ macro (defined earlier in the file) calls the 8x8 routine
 * on each quadrant and sums the results. */
4108 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4109 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4110 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4112 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4114 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4115 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4116 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4117 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * In-place element-wise multiply: dst[i] *= src[i] for len elements.
 */
static void vector_fmul_c(float *dst, const float *src, int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] *= src[k];
}
/**
 * dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len)
{
    int k;

    /* point at the last element so src1 can be indexed with -k */
    src1 += len - 1;
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[-k];
}
/**
 * Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i].
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src0[k] * src1[k] + src2[k];
}
/**
 * Overlap-add windowing used by MDCT-based audio codecs:
 *   dst[len+i] and dst[len+j] (j = -1-i mirrored) are built from src0/src1
 *   weighted by the symmetric window, plus an optional DC bias.
 */
void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len)
{
    int i, j;

    /* re-base so the loop can run over i in [-len, 0) and j in [len-1, 0] */
    dst  += len;
    win  += len;
    src0 += len;
    for (i = -len, j = len - 1; i < 0; i++, j--) {
        const float s0 = src0[i];
        const float s1 = src1[j];
        const float wi = win[i];
        const float wj = win[j];
        dst[i] = s0 * wj - s1 * wi + add_bias;
        dst[j] = s0 * wi + s1 * wj + add_bias;
    }
}
/**
 * Scale a float vector by a scalar: dst[i] = src[i] * mul.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src[k] * mul;
}
/**
 * Multiply src by a scalar and by short vectors of length 2:
 * each pair of outputs uses the next 2-element vector from sv[].
 */
static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 2, sv++) {
        const float *vec = sv[0];
        dst[k]     = src[k]     * vec[0] * mul;
        dst[k + 1] = src[k + 1] * vec[1] * mul;
    }
}
/**
 * Multiply src by a scalar and by short vectors of length 4:
 * each group of four outputs uses the next 4-element vector from sv[].
 */
static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
                                      const float **sv, float mul, int len)
{
    int k;

    for (k = 0; k < len; k += 4, sv++) {
        const float *vec = sv[0];
        dst[k]     = src[k]     * vec[0] * mul;
        dst[k + 1] = src[k + 1] * vec[1] * mul;
        dst[k + 2] = src[k + 2] * vec[2] * mul;
        dst[k + 3] = src[k + 3] * vec[3] * mul;
    }
}
/**
 * Expand 2-element short vectors scaled by mul: dst pairs come straight
 * from successive sv[] entries (no src operand).
 */
static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 2, sv++) {
        const float *vec = sv[0];
        dst[k]     = vec[0] * mul;
        dst[k + 1] = vec[1] * mul;
    }
}
/**
 * Expand 4-element short vectors scaled by mul into dst.
 */
static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
                               int len)
{
    int k;

    for (k = 0; k < len; k += 4, sv++) {
        const float *vec = sv[0];
        dst[k]     = vec[0] * mul;
        dst[k + 1] = vec[1] * mul;
        dst[k + 2] = vec[2] * mul;
        dst[k + 3] = vec[3] * mul;
    }
}
/**
 * In-place butterfly: (v1[i], v2[i]) becomes (v1[i]+v2[i], v1[i]-v2[i]).
 * The restrict qualifiers promise the two vectors do not alias.
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;

    for (k = 0; k < len; k++) {
        const float diff = v1[k] - v2[k];
        v1[k] = v1[k] + v2[k];
        v2[k] = diff;
    }
}
/**
 * Dot product of two float vectors of length len.
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0f;
    int k;

    for (k = 0; k < len; k++)
        acc += v1[k] * v2[k];
    return acc;
}
/**
 * Convert an int vector to float while scaling: dst[i] = src[i] * mul.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len)
{
    int k;

    for (k = 0; k < len; k++)
        dst[k] = src[k] * mul;
}
/**
 * Clip one float (passed as its raw bit pattern) when min and max share the
 * same sign: `mini`/`maxi` are the raw bounds, `maxisign` is maxi with its
 * sign bit flipped.  Works because same-sign IEEE floats order like their
 * bit patterns.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                   uint32_t maxi, uint32_t maxisign)
{
    if (a > mini)
        return mini;
    if ((a ^ 0x80000000U) > maxisign)
        return maxi;
    return a;
}
/**
 * Bit-trick float clipping path used when min < 0 < max: reinterpret the
 * floats as uint32 and clip eight elements per iteration via clipf_c_one.
 * len is assumed to be a multiple of 8 (matching the caller's contract).
 */
static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len)
{
    const uint32_t mini     = *(uint32_t *)min;
    const uint32_t maxi     = *(uint32_t *)max;
    const uint32_t maxisign = maxi ^ (1 << 31);
    uint32_t *out           = (uint32_t *)dst;
    const uint32_t *in      = (const uint32_t *)src;
    int i, j;

    for (i = 0; i < len; i += 8)
        for (j = 0; j < 8; j++)
            out[i + j] = clipf_c_one(in[i + j], mini, maxi, maxisign);
}
/**
 * Clip every element of src into [min, max], eight at a time.
 * When the range straddles zero the bit-pattern fast path cannot be used
 * directly, so the opposite-sign helper takes over.
 */
static void vector_clipf_c(float *dst, const float *src, float min, float max, int len)
{
    int i, j;

    if (min < 0 && max > 0) {
        vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
    } else {
        for (i = 0; i < len; i += 8)
            for (j = 0; j < 8; j++)
                dst[i + j] = av_clipf(src[i + j], min, max);
    }
}
/* Convert one float (biased into the 0x43c0xxxx range by the caller's
 * pre-scaling convention) to int16 via bit manipulation, avoiding an FPU
 * round trip.  NOTE(review): the range-check branch between the load and
 * the clamp expression appears elided from this listing. */
4278 static av_always_inline int float_to_int16_one(const float *src){
/* reinterpret the float bits as an integer (type pun) */
4279 int_fast32_t tmp = *(const int32_t*)src;
/* saturate: sign of (0x43c0ffff - tmp) spread to all bits */
4281 tmp = (0x43c0ffff - tmp)>>31;
4282 // is this faster on some gcc/cpu combinations?
4283 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
/* remove the bias to recenter around 0 */
4286 return tmp - 0x8000;
/**
 * Convert a float buffer to int16 samples using float_to_int16_one for
 * each element.
 */
void ff_float_to_int16_c(int16_t *dst, const float *src, long len)
{
    long idx;

    for (idx = 0; idx < len; idx++)
        dst[idx] = float_to_int16_one(src + idx);
}
/**
 * Convert planar float channels to interleaved int16.  Stereo gets a
 * dedicated tight loop; other channel counts use the generic strided copy.
 */
void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels)
{
    int i, j, c;

    if (channels == 2) {
        for (i = 0; i < len; i++) {
            dst[2 * i]     = float_to_int16_one(src[0] + i);
            dst[2 * i + 1] = float_to_int16_one(src[1] + i);
        }
    } else {
        for (c = 0; c < channels; c++)
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = float_to_int16_one(src[c] + i);
    }
}
/**
 * Dot product of two int16 vectors, with each partial product shifted
 * right by `shift` before accumulation.
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int32_t acc = 0;
    int k;

    for (k = 0; k < order; k++)
        acc += (v1[k] * v2[k]) >> shift;
    return acc;
}
/**
 * Returns the dot product of v1 and v2 while simultaneously updating
 * v1[i] += mul * v3[i] (the product uses the pre-update v1 value).
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int32_t acc = 0;
    int k;

    for (k = 0; k < order; k++) {
        acc   += v1[k] * v2[k];
        v1[k] += mul * v3[k];
    }
    return acc;
}
/* Fixed-point cosine constants for the WMV2 8-point IDCT below.
 * NOTE(review): W0 is used by wmv2_idct_row/col but its #define is not
 * visible in this listing — presumably defined nearby; verify. */
4330 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4331 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4332 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4333 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4334 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4335 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4336 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row of the WMV2 inverse DCT: even/odd butterfly decomposition with
 * the W* constants, output rounded back to 8 fractional bits.
 * NOTE(review): the s1/s2 declaration line appears elided. */
4338 static void wmv2_idct_row(short * b)
4341 int a0,a1,a2,a3,a4,a5,a6,a7;
/* odd-part products */
4343 a1 = W1*b[1]+W7*b[7];
4344 a7 = W7*b[1]-W1*b[7];
4345 a5 = W5*b[5]+W3*b[3];
4346 a3 = W3*b[5]-W5*b[3];
/* even-part products */
4347 a2 = W2*b[2]+W6*b[6];
4348 a6 = W6*b[2]-W2*b[6];
4349 a0 = W0*b[0]+W0*b[4];
4350 a4 = W0*b[0]-W0*b[4];
/* 181/256 ~= 1/sqrt(2) rotation of the middle odd terms */
4352 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4353 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* recombine with rounding (+1<<7) and drop 8 fractional bits */
4355 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4356 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4357 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4358 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4359 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4360 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4361 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4362 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* One column (stride 8) of the WMV2 inverse DCT; same structure as the row
 * pass but with extended intermediate precision (>>3 on the products) and a
 * final >>14 normalisation.  NOTE(review): the s1/s2 declaration appears
 * elided. */
4364 static void wmv2_idct_col(short * b)
4367 int a0,a1,a2,a3,a4,a5,a6,a7;
4368 /*step 1, with extended precision*/
4369 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4370 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4371 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4372 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4373 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4374 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4375 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4376 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4378 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4379 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final recombination with rounding, normalising the full 2-D scale */
4381 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4382 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4383 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4384 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4386 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4387 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4388 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4389 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/**
 * Full 8x8 WMV2 inverse DCT: row pass over each of the 8 rows, then a
 * column pass over each of the 8 columns.
 */
void ff_wmv2_idct_c(short * block)
{
    int i;

    for (i = 0; i < 64; i += 8)
        wmv2_idct_row(block + i);
    for (i = 0; i < 8; i++)
        wmv2_idct_col(block + i);
}
4401 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4403 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4405 ff_wmv2_idct_c(block);
4406 put_pixels_clamped_c(block, dest, line_size);
4408 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4410 ff_wmv2_idct_c(block);
4411 add_pixels_clamped_c(block, dest, line_size);
4413 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4416 put_pixels_clamped_c(block, dest, line_size);
4418 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4421 add_pixels_clamped_c(block, dest, line_size);
/*
 * 4x4 (lowres==1) variant: reduced IDCT, then store a clamped 4x4 block.
 * NOTE(review): the j_rev_dct4(block) call line is not visible here.
 */
4424 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4427 put_pixels_clamped4_c(block, dest, line_size);
/*
 * 4x4 (lowres==1) variant: reduced IDCT, then add a clamped 4x4 residual.
 * NOTE(review): the j_rev_dct4(block) call line is not visible here.
 */
4429 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4432 add_pixels_clamped4_c(block, dest, line_size);
/*
 * 2x2 (lowres==2) variant: reduced IDCT, then store a clamped 2x2 block.
 * NOTE(review): the j_rev_dct2(block) call line is not visible here.
 */
4435 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4438 put_pixels_clamped2_c(block, dest, line_size);
/*
 * 2x2 (lowres==2) variant: reduced IDCT, then add a clamped 2x2 residual.
 * NOTE(review): the j_rev_dct2(block) call line is not visible here.
 */
4440 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4443 add_pixels_clamped2_c(block, dest, line_size);
/*
 * 1x1 (lowres==3) variant: only the DC coefficient survives; scale it with
 * rounding and write a single clamped pixel via the crop table.
 */
4446 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4448 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4450 dest[0] = cm[(block[0] + 4)>>3];
/*
 * 1x1 (lowres==3) variant: add the rounded DC contribution to the single
 * existing pixel, clamped through the crop table.
 */
4452 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4454 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4456 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4459 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4461 /* init static data */
/*
 * One-time initialization of the shared lookup tables:
 *  - ff_cropTbl: clamps MAX_NEG_CROP-biased indices into 0..255 (identity in
 *    the middle, saturated to 255 above; the below-zero half is filled by a
 *    line not visible in this chunk).
 *  - ff_squareTbl: (i-256)^2 for biased difference values.
 *  - inv_zigzag_direct16: 1-based inverse of ff_zigzag_direct.
 * NOTE(review): loop braces and the "int i;" declaration are not visible in
 * this chunk -- confirm against the complete file.
 */
4462 av_cold void dsputil_static_init(void)
4466 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4467 for(i=0;i<MAX_NEG_CROP;i++) {
4469 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4472 for(i=0;i<512;i++) {
4473 ff_squareTbl[i] = (i - 256) * (i - 256);
4476 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/*
 * Runtime sanity check that the compiler honors 16-byte stack alignment for
 * DECLARE_ALIGNED_16 locals. On MMX/AltiVec builds a miscompiled stack is
 * reported loudly (SIMD code would be slow or crash).
 * NOTE(review): the matching #endif, the did_fail update and the return
 * statement are not visible in this chunk -- confirm against the full file.
 */
4479 int ff_check_alignment(void){
4480 static int did_fail=0;
4481 DECLARE_ALIGNED_16(int, aligned);
4483 if((intptr_t)&aligned & 15){
4485 #if HAVE_MMX || HAVE_ALTIVEC
4486 av_log(NULL, AV_LOG_ERROR,
4487 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4488 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4489 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4490 "Do not report crashes to FFmpeg developers.\n");
4499 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4503 ff_check_alignment();
4506 if(avctx->dct_algo==FF_DCT_FASTINT) {
4507 c->fdct = fdct_ifast;
4508 c->fdct248 = fdct_ifast248;
4510 else if(avctx->dct_algo==FF_DCT_FAAN) {
4511 c->fdct = ff_faandct;
4512 c->fdct248 = ff_faandct248;
4515 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4516 c->fdct248 = ff_fdct248_islow;
4518 #endif //CONFIG_ENCODERS
4520 if(avctx->lowres==1){
4521 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4522 c->idct_put= ff_jref_idct4_put;
4523 c->idct_add= ff_jref_idct4_add;
4525 c->idct_put= ff_h264_lowres_idct_put_c;
4526 c->idct_add= ff_h264_lowres_idct_add_c;
4528 c->idct = j_rev_dct4;
4529 c->idct_permutation_type= FF_NO_IDCT_PERM;
4530 }else if(avctx->lowres==2){
4531 c->idct_put= ff_jref_idct2_put;
4532 c->idct_add= ff_jref_idct2_add;
4533 c->idct = j_rev_dct2;
4534 c->idct_permutation_type= FF_NO_IDCT_PERM;
4535 }else if(avctx->lowres==3){
4536 c->idct_put= ff_jref_idct1_put;
4537 c->idct_add= ff_jref_idct1_add;
4538 c->idct = j_rev_dct1;
4539 c->idct_permutation_type= FF_NO_IDCT_PERM;
4541 if(avctx->idct_algo==FF_IDCT_INT){
4542 c->idct_put= ff_jref_idct_put;
4543 c->idct_add= ff_jref_idct_add;
4544 c->idct = j_rev_dct;
4545 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4546 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4547 avctx->idct_algo==FF_IDCT_VP3){
4548 c->idct_put= ff_vp3_idct_put_c;
4549 c->idct_add= ff_vp3_idct_add_c;
4550 c->idct = ff_vp3_idct_c;
4551 c->idct_permutation_type= FF_NO_IDCT_PERM;
4552 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4553 c->idct_put= ff_wmv2_idct_put_c;
4554 c->idct_add= ff_wmv2_idct_add_c;
4555 c->idct = ff_wmv2_idct_c;
4556 c->idct_permutation_type= FF_NO_IDCT_PERM;
4557 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4558 c->idct_put= ff_faanidct_put;
4559 c->idct_add= ff_faanidct_add;
4560 c->idct = ff_faanidct;
4561 c->idct_permutation_type= FF_NO_IDCT_PERM;
4562 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4563 c->idct_put= ff_ea_idct_put_c;
4564 c->idct_permutation_type= FF_NO_IDCT_PERM;
4565 }else{ //accurate/default
4566 c->idct_put= ff_simple_idct_put;
4567 c->idct_add= ff_simple_idct_add;
4568 c->idct = ff_simple_idct;
4569 c->idct_permutation_type= FF_NO_IDCT_PERM;
4573 if (CONFIG_H264_DECODER) {
4574 c->h264_idct_add= ff_h264_idct_add_c;
4575 c->h264_idct8_add= ff_h264_idct8_add_c;
4576 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4577 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4578 c->h264_idct_add16 = ff_h264_idct_add16_c;
4579 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4580 c->h264_idct_add8 = ff_h264_idct_add8_c;
4581 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4584 c->get_pixels = get_pixels_c;
4585 c->diff_pixels = diff_pixels_c;
4586 c->put_pixels_clamped = put_pixels_clamped_c;
4587 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4588 c->add_pixels_clamped = add_pixels_clamped_c;
4589 c->add_pixels8 = add_pixels8_c;
4590 c->add_pixels4 = add_pixels4_c;
4591 c->sum_abs_dctelem = sum_abs_dctelem_c;
4594 c->clear_block = clear_block_c;
4595 c->clear_blocks = clear_blocks_c;
4596 c->pix_sum = pix_sum_c;
4597 c->pix_norm1 = pix_norm1_c;
4599 /* TODO [0] 16 [1] 8 */
4600 c->pix_abs[0][0] = pix_abs16_c;
4601 c->pix_abs[0][1] = pix_abs16_x2_c;
4602 c->pix_abs[0][2] = pix_abs16_y2_c;
4603 c->pix_abs[0][3] = pix_abs16_xy2_c;
4604 c->pix_abs[1][0] = pix_abs8_c;
4605 c->pix_abs[1][1] = pix_abs8_x2_c;
4606 c->pix_abs[1][2] = pix_abs8_y2_c;
4607 c->pix_abs[1][3] = pix_abs8_xy2_c;
4609 #define dspfunc(PFX, IDX, NUM) \
4610 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4611 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4612 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4613 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4615 dspfunc(put, 0, 16);
4616 dspfunc(put_no_rnd, 0, 16);
4618 dspfunc(put_no_rnd, 1, 8);
4622 dspfunc(avg, 0, 16);
4623 dspfunc(avg_no_rnd, 0, 16);
4625 dspfunc(avg_no_rnd, 1, 8);
4630 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4631 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4633 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4634 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4635 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4636 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4637 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4638 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4639 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4640 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4641 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4643 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4644 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4645 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4646 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4647 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4648 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4649 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4650 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4651 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4653 #define dspfunc(PFX, IDX, NUM) \
4654 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4655 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4656 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4657 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4658 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4659 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4660 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4661 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4662 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4663 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4664 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4665 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4666 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4667 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4668 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4669 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4671 dspfunc(put_qpel, 0, 16);
4672 dspfunc(put_no_rnd_qpel, 0, 16);
4674 dspfunc(avg_qpel, 0, 16);
4675 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4677 dspfunc(put_qpel, 1, 8);
4678 dspfunc(put_no_rnd_qpel, 1, 8);
4680 dspfunc(avg_qpel, 1, 8);
4681 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4683 dspfunc(put_h264_qpel, 0, 16);
4684 dspfunc(put_h264_qpel, 1, 8);
4685 dspfunc(put_h264_qpel, 2, 4);
4686 dspfunc(put_h264_qpel, 3, 2);
4687 dspfunc(avg_h264_qpel, 0, 16);
4688 dspfunc(avg_h264_qpel, 1, 8);
4689 dspfunc(avg_h264_qpel, 2, 4);
4692 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4693 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4694 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4695 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4696 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4697 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4698 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4699 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4701 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4702 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4703 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4704 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4705 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4706 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4707 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4708 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4709 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4710 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4711 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4712 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4713 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4714 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4715 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4716 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4717 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4718 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4719 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4720 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4722 c->draw_edges = draw_edges_c;
4724 #if CONFIG_CAVS_DECODER
4725 ff_cavsdsp_init(c,avctx);
4728 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4729 ff_mlp_init(c, avctx);
4731 #if CONFIG_VC1_DECODER
4732 ff_vc1dsp_init(c,avctx);
4734 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4735 ff_intrax8dsp_init(c,avctx);
4737 #if CONFIG_RV30_DECODER
4738 ff_rv30dsp_init(c,avctx);
4740 #if CONFIG_RV40_DECODER
4741 ff_rv40dsp_init(c,avctx);
4742 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4743 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4744 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4745 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4748 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4749 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4750 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4751 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4752 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4753 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4754 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4755 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4757 #define SET_CMP_FUNC(name) \
4758 c->name[0]= name ## 16_c;\
4759 c->name[1]= name ## 8x8_c;
4761 SET_CMP_FUNC(hadamard8_diff)
4762 c->hadamard8_diff[4]= hadamard8_intra16_c;
4763 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4764 SET_CMP_FUNC(dct_sad)
4765 SET_CMP_FUNC(dct_max)
4767 SET_CMP_FUNC(dct264_sad)
4769 c->sad[0]= pix_abs16_c;
4770 c->sad[1]= pix_abs8_c;
4774 SET_CMP_FUNC(quant_psnr)
4777 c->vsad[0]= vsad16_c;
4778 c->vsad[4]= vsad_intra16_c;
4779 c->vsad[5]= vsad_intra8_c;
4780 c->vsse[0]= vsse16_c;
4781 c->vsse[4]= vsse_intra16_c;
4782 c->vsse[5]= vsse_intra8_c;
4783 c->nsse[0]= nsse16_c;
4784 c->nsse[1]= nsse8_c;
4785 #if CONFIG_SNOW_ENCODER
4786 c->w53[0]= w53_16_c;
4788 c->w97[0]= w97_16_c;
4792 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4794 c->add_bytes= add_bytes_c;
4795 c->add_bytes_l2= add_bytes_l2_c;
4796 c->diff_bytes= diff_bytes_c;
4797 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4798 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4799 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4800 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4801 c->bswap_buf= bswap_buf;
4802 #if CONFIG_PNG_DECODER
4803 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4806 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4807 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4808 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4809 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4810 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4811 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4812 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4813 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4814 c->h264_loop_filter_strength= NULL;
4816 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4817 c->h263_h_loop_filter= h263_h_loop_filter_c;
4818 c->h263_v_loop_filter= h263_v_loop_filter_c;
4821 if (CONFIG_VP3_DECODER) {
4822 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4823 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4825 if (CONFIG_VP6_DECODER) {
4826 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4829 c->h261_loop_filter= h261_loop_filter_c;
4831 c->try_8x8basis= try_8x8basis_c;
4832 c->add_8x8basis= add_8x8basis_c;
4834 #if CONFIG_SNOW_DECODER
4835 c->vertical_compose97i = ff_snow_vertical_compose97i;
4836 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4837 c->inner_add_yblock = ff_snow_inner_add_yblock;
4840 #if CONFIG_VORBIS_DECODER
4841 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4843 #if CONFIG_AC3_DECODER
4844 c->ac3_downmix = ff_ac3_downmix_c;
4847 c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4849 c->vector_fmul = vector_fmul_c;
4850 c->vector_fmul_reverse = vector_fmul_reverse_c;
4851 c->vector_fmul_add = vector_fmul_add_c;
4852 c->vector_fmul_window = ff_vector_fmul_window_c;
4853 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4854 c->vector_clipf = vector_clipf_c;
4855 c->float_to_int16 = ff_float_to_int16_c;
4856 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4857 c->scalarproduct_int16 = scalarproduct_int16_c;
4858 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4859 c->scalarproduct_float = scalarproduct_float_c;
4860 c->butterflies_float = butterflies_float_c;
4861 c->vector_fmul_scalar = vector_fmul_scalar_c;
4863 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4864 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4866 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4867 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4869 c->shrink[0]= ff_img_copy_plane;
4870 c->shrink[1]= ff_shrink22;
4871 c->shrink[2]= ff_shrink44;
4872 c->shrink[3]= ff_shrink88;
4874 c->prefetch= just_return;
4876 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4877 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4879 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4880 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4881 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4882 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4883 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4884 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4885 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4886 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4887 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4889 for(i=0; i<64; i++){
4890 if(!c->put_2tap_qpel_pixels_tab[0][i])
4891 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4892 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4893 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4896 switch(c->idct_permutation_type){
4897 case FF_NO_IDCT_PERM:
4899 c->idct_permutation[i]= i;
4901 case FF_LIBMPEG2_IDCT_PERM:
4903 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4905 case FF_SIMPLE_IDCT_PERM:
4907 c->idct_permutation[i]= simple_mmx_permutation[i];
4909 case FF_TRANSPOSE_IDCT_PERM:
4911 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4913 case FF_PARTTRANS_IDCT_PERM:
4915 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4917 case FF_SSE2_IDCT_PERM:
4919 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4922 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");