3 * Copyright (c) 2000, 2001 Fabrice Bellard.
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "simple_idct.h"
/* Prototypes for DSP routines implemented in other compilation units;
 * ff_spatial_dwt lives in snow.c (see the CONFIG_SNOW_ENCODER block below),
 * the others presumably live in their codec's own file — TODO confirm. */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);
void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
/* 0..255 clamping table, used via (ff_cropTbl + MAX_NEG_CROP)[v] so that
 * out-of-range signed values clamp with a single lookup.
 * Zero-filled here; presumably populated by an init routine — TODO confirm. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Squares table, used via (ff_squareTbl + 256)[d] with d in -255..255 so a
 * signed byte difference can be squared with a single lookup (see sse*_c).
 * Zero-filled here; presumably populated by an init routine — TODO confirm. */
uint32_t ff_squareTbl[512] = {0, };
// 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
// (~0UL/255 evaluates to 0x0101...01, so multiplying by a byte value
//  replicates that byte into every byte of an unsigned long)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
/* Standard (progressive) 8x8 zigzag scan order: maps scan position to
 * raster index within the block. */
const uint8_t ff_zigzag_direct[64] = {
    0,   1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
/* Specific zigzag scan for 248 idct. NOTE that unlike the
   specification, we interleave the fields */
const uint8_t ff_zigzag248_direct[64] = {
     0,  8,  1,  9, 16, 24,  2, 10,
    17, 25, 32, 40, 48, 56, 33, 41,
    18, 26,  3, 11,  4, 12, 19, 27,
    34, 42, 49, 57, 50, 58, 35, 43,
    20, 28,  5, 13,  6, 14, 21, 29,
    36, 44, 51, 59, 52, 60, 37, 45,
    22, 30,  7, 15, 23, 31, 38, 46,
    53, 61, 54, 62, 39, 47, 55, 63,
/* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* (zero-filled here; presumably built at init time — TODO confirm) */
DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate horizontal 8x8 scan order (interlaced coding). */
const uint8_t ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical 8x8 scan order (interlaced coding). */
const uint8_t ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
/* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
/* (reciprocal table for division-by-multiplication; entry i is
 *  roughly 2^32/i, entries 0 and 1 are the saturated special cases) */
const uint32_t ff_inverse[256]={
         0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
 536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
 268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
 178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
 134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
 107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
  89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
  76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
  67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
  59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
  53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
  48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
  44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
  41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
  38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
  35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
  33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
  31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
  29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
  28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
  26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
  25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
  24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
  23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
  22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
  21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
  20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
  19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
  19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
  18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
  17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
  17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
/* Input permutation for the simple_idct_mmx */
/* (values are 6-bit block indices encoded as row/column nibbles) */
static const uint8_t simple_mmx_permutation[64]={
        0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
        0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
        0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
        0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
        0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
        0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
        0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
        0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row interleave pattern used by the SSE2 IDCT permutation. */
static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: keep a pointer to the source scan order, build the
 * permutated scan by remapping every entry through permutation[], and fill
 * raster_end[] (last significant raster position per scan position). */
void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
    st->scantable= src_scantable;
    /* permutated[i] is the IDCT-permuted position of scan step i */
    j = src_scantable[i];
    st->permutated[i] = permutation[j];
    j = st->permutated[i];
    st->raster_end[i]= end;
/**
 * Sum of all pixel values of a 16x16 block.
 * Walks 16 rows, 8 pixels per inner step; pix advances by line_size per row.
 */
static int pix_sum_c(uint8_t * pix, int line_size)
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
    /* inner loop already advanced pix by 16; step to the next row */
    pix += line_size - 16;
/**
 * Sum of squared pixel values of a 16x16 block.
 * sq points at the center of ff_squareTbl; each byte is extracted from a
 * word load and squared with one table lookup.
 * NOTE(review): the *(uint64_t*)/*(uint32_t*) loads assume suitable
 * alignment and tolerate aliasing — long-standing pattern here, left as-is.
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
#if LONG_MAX > 2147483647
            /* 64-bit path: one 8-byte load per 8 pixels */
            register uint64_t x=*(uint64_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            s += sq[(x>>32)&0xff];
            s += sq[(x>>40)&0xff];
            s += sq[(x>>48)&0xff];
            s += sq[(x>>56)&0xff];
            /* 32-bit path: two 4-byte loads per 8 pixels */
            register uint32_t x=*(uint32_t*)pix;
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
            x=*(uint32_t*)(pix+4);
            s += sq[(x>>8)&0xff];
            s += sq[(x>>16)&0xff];
            s += sq[(x>>24)&0xff];
    pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; the main loop is unrolled
 * 8x, the tail loop handles the remaining w%8 words. */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    /* tail: one word at a time */
    dst[i+0]= bswap_32(src[i+0]);
/**
 * Sum of squared errors of a 4-wide block over h rows.
 * Squares each signed byte difference via the centered ff_squareTbl lookup.
 */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
/**
 * Sum of squared errors of an 8-wide block over h rows (see sse4_c).
 */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[0] - pix2[0]];
        s += sq[pix1[1] - pix2[1]];
        s += sq[pix1[2] - pix2[2]];
        s += sq[pix1[3] - pix2[3]];
        s += sq[pix1[4] - pix2[4]];
        s += sq[pix1[5] - pix2[5]];
        s += sq[pix1[6] - pix2[6]];
        s += sq[pix1[7] - pix2[7]];
/**
 * Sum of squared errors of a 16-wide block over h rows (see sse4_c).
 */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
    uint32_t *sq = ff_squareTbl + 256;
    for (i = 0; i < h; i++) {
        s += sq[pix1[ 0] - pix2[ 0]];
        s += sq[pix1[ 1] - pix2[ 1]];
        s += sq[pix1[ 2] - pix2[ 2]];
        s += sq[pix1[ 3] - pix2[ 3]];
        s += sq[pix1[ 4] - pix2[ 4]];
        s += sq[pix1[ 5] - pix2[ 5]];
        s += sq[pix1[ 6] - pix2[ 6]];
        s += sq[pix1[ 7] - pix2[ 7]];
        s += sq[pix1[ 8] - pix2[ 8]];
        s += sq[pix1[ 9] - pix2[ 9]];
        s += sq[pix1[10] - pix2[10]];
        s += sq[pix1[11] - pix2[11]];
        s += sq[pix1[12] - pix2[12]];
        s += sq[pix1[13] - pix2[13]];
        s += sq[pix1[14] - pix2[14]];
        s += sq[pix1[15] - pix2[15]];
#ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
/**
 * Wavelet-domain comparison metric: transforms the (pix1 - pix2) difference
 * with ff_spatial_dwt and accumulates scaled subband coefficients.
 * @param w    block width; 8 selects 3 decomposition levels, otherwise 4
 * @param type wavelet selector passed to ff_spatial_dwt
 *             (per the scale[] comments: 0 = 9/7, 1 = 5/3)
 */
static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
    const int dec_count= w==8 ? 3 : 4;
    /* per-[type][dec_count-3][level][ori] coefficient weights */
    static const int scale[2][2][4][4]={
    {268, 239, 239, 213},
    // 9/7 16x16 or 32x32 dec=4
    {344, 310, 310, 280},
    {275, 245, 245, 218},
    // 5/3 16x16 or 32x32 dec=4
    {352, 317, 317, 286},
    /* load the difference, pre-scaled by 16 (<<4), into a 32-wide scratch */
    for (i = 0; i < h; i++) {
        for (j = 0; j < w; j+=4) {
            tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
            tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
            tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
            tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
    ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
    /* walk each orientation (ori) of each level; level 0 also includes the
     * LL band (ori 0), deeper levels only LH/HL/HH */
    for(level=0; level<dec_count; level++){
        for(ori= level ? 1 : 0; ori<4; ori++){
            int size= w>>(dec_count-level);
            int sx= (ori&1) ? size : 0;
            int stride= 32<<(dec_count-level);
            int sy= (ori&2) ? stride>>1 : 0;
            for(i=0; i<size; i++){
                for(j=0; j<size; j++){
                    int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* 5/3 wavelet metric, 8-wide block (see w_c). */
static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 1);
/* 9/7 wavelet metric, 8-wide block (see w_c). */
static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 8, h, 0);
/* 5/3 wavelet metric, 16-wide block (see w_c). */
static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 1);
/* 9/7 wavelet metric, 16-wide block (see w_c). */
static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 16, h, 0);
/* 5/3 wavelet metric, 32-wide block; non-static, used outside this file. */
int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 1);
/* 9/7 wavelet metric, 32-wide block; non-static, used outside this file. */
int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
    return w_c(v, pix1, pix2, line_size, 32, h, 0);
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
    uint8_t *ptr, *last_line;
    last_line = buf + (height - 1) * wrap;
    /* replicate the first and last rows upward/downward */
    memcpy(buf - (i + 1) * wrap, buf, width);
    memcpy(last_line + (i + 1) * wrap, last_line, width);
    /* replicate the left/right columns of every row */
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
    /* fill the four corners from the nearest corner pixel */
    memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
    memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
    memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
    memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
 * @param buf destination buffer
 * @param src source buffer
 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
 * @param block_w width of block
 * @param block_h height of block
 * @param src_x x coordinate of the top left sample of the block in the source buffer
 * @param src_y y coordinate of the top left sample of the block in the source buffer
 * @param w width of the source buffer
 * @param h height of the source buffer
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int start_y, start_x, end_y, end_x;
    /* clamp src so the block at least touches the picture */
    src+= (h-1-src_y)*linesize;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
    /* intersection of the block with the picture */
    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
    // replicate top rows
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
    // replicate bottom rows
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
    // replicate left and right columns (corners included, rows done above)
    for(y=0; y<block_h; y++){
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
/* Copy an 8-wide row of pixels into a DCT coefficient block (widening
 * uint8_t -> DCTELEM), one row per outer iteration. */
static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
    /* read the pixels */
    block[0] = pixels[0];
    block[1] = pixels[1];
    block[2] = pixels[2];
    block[3] = pixels[3];
    block[4] = pixels[4];
    block[5] = pixels[5];
    block[6] = pixels[6];
    block[7] = pixels[7];
/* Store the per-pixel difference s1 - s2 of an 8-wide row into a DCT block. */
static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
                          const uint8_t *s2, int stride){
    /* read the pixels */
    block[0] = s1[0] - s2[0];
    block[1] = s1[1] - s2[1];
    block[2] = s1[2] - s2[2];
    block[3] = s1[3] - s2[3];
    block[4] = s1[4] - s2[4];
    block[5] = s1[5] - s2[5];
    block[6] = s1[6] - s2[6];
    block[7] = s1[7] - s2[7];
/* Store an 8-wide row of DCT coefficients as pixels, clamped to 0..255
 * via the centered crop table. */
static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
    pixels[0] = cm[block[0]];
    pixels[1] = cm[block[1]];
    pixels[2] = cm[block[2]];
    pixels[3] = cm[block[3]];
    pixels[4] = cm[block[4]];
    pixels[5] = cm[block[5]];
    pixels[6] = cm[block[6]];
    pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c. */
static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
    pixels[0] = cm[block[0]];
    pixels[1] = cm[block[1]];
    pixels[2] = cm[block[2]];
    pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c. */
static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
    pixels[0] = cm[block[0]];
    pixels[1] = cm[block[1]];
/* Store an 8x8 block of signed DCT coefficients as pixels: each value is
 * clamped to -128..127 and then biased by +128 into the 0..255 range. */
static void put_signed_pixels_clamped_c(const DCTELEM *block,
                                        uint8_t *restrict pixels,
    for (i = 0; i < 8; i++) {
        for (j = 0; j < 8; j++) {
            else if (*block > 127)
                /* in-range value: bias into unsigned pixel space */
                *pixels = (uint8_t)(*block + 128);
    pixels += (line_size - 8);
/* Add an 8-wide row of DCT coefficients onto existing pixels, clamping the
 * result to 0..255 via the centered crop table. */
static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
    pixels[0] = cm[pixels[0] + block[0]];
    pixels[1] = cm[pixels[1] + block[1]];
    pixels[2] = cm[pixels[2] + block[2]];
    pixels[3] = cm[pixels[3] + block[3]];
    pixels[4] = cm[pixels[4] + block[4]];
    pixels[5] = cm[pixels[5] + block[5]];
    pixels[6] = cm[pixels[6] + block[6]];
    pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c. */
static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
    pixels[0] = cm[pixels[0] + block[0]];
    pixels[1] = cm[pixels[1] + block[1]];
    pixels[2] = cm[pixels[2] + block[2]];
    pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c. */
static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
    uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
    /* read the pixels */
    pixels[0] = cm[pixels[0] + block[0]];
    pixels[1] = cm[pixels[1] + block[1]];
/* Add an 8-wide row of DCT coefficients onto pixels WITHOUT clamping
 * (caller must guarantee the result stays in range). */
static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
    pixels[0] += block[0];
    pixels[1] += block[1];
    pixels[2] += block[2];
    pixels[3] += block[3];
    pixels[4] += block[4];
    pixels[5] += block[5];
    pixels[6] += block[6];
    pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c (no clamping). */
static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
    pixels[0] += block[0];
    pixels[1] += block[1];
    pixels[2] += block[2];
    pixels[3] += block[3];
/* Sum of absolute values of the coefficients of a DCT block. */
static int sum_abs_dctelem_c(DCTELEM *block)
    sum+= FFABS(block[i]);
/* 64-bit PIXOP2 variant: generates the put/avg pixel copy and half-pel
 * interpolation helpers, processing 8 bytes per step through uint64_t loads
 * (AV_RN64) and SWAR byte masks (0xFEFE.. for /2 averages, 0x0303../0xFCFC..
 * to split low/high bits for the 4-tap xy2 averages; the 0x0202.. vs 0x0101..
 * addend selects rounding vs no-rounding).
 * Comments must stay outside the macro: a // before a trailing backslash
 * would swallow the line continuation. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        OP(*((uint64_t*)block), AV_RN64(pixels));\
static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels );\
        const uint64_t b= AV_RN64(pixels+1);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels );\
        const uint64_t b= AV_RN64(pixels+line_size);\
        OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0= (a&0x0303030303030303ULL)\
                   + (b&0x0303030303030303ULL)\
                   + 0x0202020202020202ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels );\
            uint64_t b= AV_RN64(pixels+1);\
            l1= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            a= AV_RN64(pixels );\
            b= AV_RN64(pixels+1);\
            l0= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL)\
              + 0x0202020202020202ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint64_t a= AV_RN64(pixels );\
        const uint64_t b= AV_RN64(pixels+1);\
        uint64_t l0= (a&0x0303030303030303ULL)\
                   + (b&0x0303030303030303ULL)\
                   + 0x0101010101010101ULL;\
        uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
                   + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
        for(i=0; i<h; i+=2){\
            uint64_t a= AV_RN64(pixels );\
            uint64_t b= AV_RN64(pixels+1);\
            l1= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL);\
            h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
            a= AV_RN64(pixels );\
            b= AV_RN64(pixels+1);\
            l0= (a&0x0303030303030303ULL)\
              + (b&0x0303030303030303ULL)\
              + 0x0101010101010101ULL;\
            h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
              + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
            OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* rounding-down packed byte average: (a|b) - ((a^b)&0xFE..)>>1 */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
#else // 64 bit variant
/* 32-bit PIXOP2 variant: same family of put/avg copy and half-pel
 * interpolation helpers as the 64-bit version above, but built on 32-bit
 * loads (AV_RN32/AV_RN16) and the rnd_avg32/no_rnd_avg32 helpers; the xy2
 * routines use the 0x0303../0xFCFC.. low/high SWAR split, with 0x0202..
 * (rounding) or 0x0101.. (no rounding) as the bias.
 * Comments must stay outside the macro: a // before a trailing backslash
 * would swallow the line continuation. */
#define PIXOP2(OPNAME, OP) \
static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
        OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
        OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_c(block, pixels, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
        a= AV_RN32(&src1[i*src_stride1 ]);\
        b= AV_RN32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
        a= AV_RN32(&src1[i*src_stride1 ]);\
        b= AV_RN32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
        a= AV_RN32(&src1[i*src_stride1 ]);\
        b= AV_RN32(&src2[i*src_stride2 ]);\
        OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
        a= AV_RN16(&src1[i*src_stride1 ]);\
        b= AV_RN16(&src2[i*src_stride2 ]);\
        OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
                                                int src_stride1, int src_stride2, int h){\
    OPNAME ## _no_rnd_pixels8_l2(dst  , src1  , src2  , dst_stride, src_stride1, src_stride2, h);\
    OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
    OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    for(i=0; i<h; i++){\
        uint32_t a, b, c, d, l0, l1, h0, h1;\
        a= AV_RN32(&src1[i*src_stride1]);\
        b= AV_RN32(&src2[i*src_stride2]);\
        c= AV_RN32(&src3[i*src_stride3]);\
        d= AV_RN32(&src4[i*src_stride4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        a= AV_RN32(&src1[i*src_stride1+4]);\
        b= AV_RN32(&src2[i*src_stride2+4]);\
        c= AV_RN32(&src3[i*src_stride3+4]);\
        d= AV_RN32(&src4[i*src_stride4+4]);\
        l0= (a&0x03030303UL)\
        h0= ((a&0xFCFCFCFCUL)>>2)\
          + ((b&0xFCFCFCFCUL)>>2);\
        l1= (c&0x03030303UL)\
          + (d&0x03030303UL);\
        h1= ((c&0xFCFCFCFCUL)>>2)\
          + ((d&0xFCFCFCFCUL)>>2);\
        OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
                 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
    OPNAME ## _no_rnd_pixels8_l4(dst  , src1  , src2  , src3  , src4  , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
    OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    int i, a0, b0, a1, b1;\
    for(i=0; i<h; i+=2){\
        block[0]= (a1+a0)>>2; /* FIXME non put */\
        block[1]= (b1+b0)>>2;\
        block[0]= (a1+a0)>>2;\
        block[1]= (b1+b0)>>2;\
static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
        const uint32_t a= AV_RN32(pixels );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels );\
            uint32_t b= AV_RN32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            a= AV_RN32(pixels );\
            b= AV_RN32(pixels+1);\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    for(j=0; j<2; j++){\
        const uint32_t a= AV_RN32(pixels );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels );\
            uint32_t b= AV_RN32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            a= AV_RN32(pixels );\
            b= AV_RN32(pixels+1);\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
    for(j=0; j<2; j++){\
        const uint32_t a= AV_RN32(pixels );\
        const uint32_t b= AV_RN32(pixels+1);\
        uint32_t l0= (a&0x03030303UL)\
        uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
                   + ((b&0xFCFCFCFCUL)>>2);\
        for(i=0; i<h; i+=2){\
            uint32_t a= AV_RN32(pixels );\
            uint32_t b= AV_RN32(pixels+1);\
            l1= (a&0x03030303UL)\
              + (b&0x03030303UL);\
            h1= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
            a= AV_RN32(pixels );\
            b= AV_RN32(pixels+1);\
            l0= (a&0x03030303UL)\
            h0= ((a&0xFCFCFCFCUL)>>2)\
              + ((b&0xFCFCFCFCUL)>>2);\
            OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
        pixels+=4-line_size*(h+1);\
        block +=4-line_size*h;\
CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Word-wide pixel operations used to instantiate PIXOP2():
 * op_avg — rounded average of the existing destination word and the new
 *          4-pixel word (via rnd_avg32), i.e. motion-comp "avg" mode. */
#define op_avg(a, b) a = rnd_avg32(a, b)
/* op_put — plain store of the new 4-pixel word ("put" mode). */
#define op_put(a, b) a = b
/* Rounded (half-up) averages of two / four pixel values.
 * Arguments are fully parenthesized so expressions such as avg2(x|y, z)
 * expand correctly (the original expansion broke under operator precedence). */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/* Adapter with the common dsputil signature: blends two 16-wide sources
 * into dst with the no-rounding average, all three buffers sharing one stride. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int stride, int h){
    put_no_rnd_pixels16_l2(dst, src1, src2, stride, stride, stride, h);
}
/* Adapter with the common dsputil signature: blends two 8-wide sources
 * into dst with the no-rounding average, all three buffers sharing one stride. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int stride, int h){
    put_no_rnd_pixels8_l2(dst, src1, src2, stride, stride, stride, h);
}
/* One-vector GMC (MPEG-4 "gmc1"): bilinear interpolation of an 8-pixel-wide,
 * h-tall block at the 1/16-pel fractional offset (x16, y16).
 * A..D are the four bilinear corner weights (they sum to 256); each output
 * pixel is the weighted corner sum plus `rounder`, scaled back by >>8. */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A = (16 - x16) * (16 - y16);
    const int B = (     x16) * (16 - y16);
    const int C = (16 - x16) * (     y16);
    const int D = (     x16) * (     y16);
    int row, col;

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1]
                      + C * src[stride + col] + D * src[stride + col + 1]
                      + rounder) >> 8;
        dst += stride;
        src += stride;
    }
}
/**
 * General global motion compensation for one 8-pixel-wide block column.
 * (ox, oy) is the sub-pel start position and (dxx, dxy, dyx, dyy) the affine
 * per-pixel / per-line increments; `shift` gives the number of fractional
 * position bits (s = 1<<shift), `r` the rounding constant added before the
 * final shift, and width/height bound the valid source area.
 * NOTE(review): several scaffolding lines (loop headers, position update
 * statements, closing rounding terms) were lost in extraction — the bodies
 * below are the surviving fragments; restore from the upstream file.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
    /* s = sub-pel denominator; frac_x/frac_y below are taken modulo s */
    const int s= 1<<shift;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            /* fully inside the picture: plain bilinear interpolation */
            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*(s-frac_y)
                                       + ( src[index+stride ]*(s-frac_x)
                                         + src[index+stride+1]* frac_x )* frac_y
                    /* y outside: clamp the row, interpolate horizontally only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
                                         + src[index +1]* frac_x )*s
                /* x outside: clamp the column, interpolate vertically only */
                if((unsigned)src_y < height){
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
                                         + src[index+stride ]* frac_y )*s
                    /* both outside: clamp to the nearest edge pixel, no filtering */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index ];
/* Thirdpel MC, full-pel position (0,0): a plain block copy.
 * Dispatches on the block width; unknown widths are a deliberate no-op,
 * exactly as the original switch (which had no default) behaved. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        put_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        put_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        put_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        put_pixels16_c(dst, src, stride, height);
}
/* Thirdpel MC, horizontal 1/3 position: blend (2/3, 1/3) of the two
 * horizontal neighbours (683/2048 approximates 1/3, +1 for rounding). */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + 1] + 1)) >> 11;
    }
}
/* Thirdpel MC, horizontal 2/3 position: blend (1/3, 2/3) of the two
 * horizontal neighbours. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + 1] + 1)) >> 11;
    }
}
/* Thirdpel MC, vertical 1/3 position: blend (2/3, 1/3) of the two
 * vertical neighbours. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (2 * s[col] + s[col + stride] + 1)) >> 11;
    }
}
/* Thirdpel MC, (1/3, 1/3) position: 2-D blend of the four neighbours with
 * weights 4:3:3:2 (2731/32768 approximates 1/12, +6 for rounding). */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (4 * s[col] + 3 * s[col + 1]
                            + 3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15;
    }
}
/* Thirdpel MC, (1/3, 2/3) position: 2-D blend of the four neighbours with
 * weights 3:2:4:3. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 2 * s[col + 1]
                            + 4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
/* Thirdpel MC, vertical 2/3 position: blend (1/3, 2/3) of the two
 * vertical neighbours. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (683 * (s[col] + 2 * s[col + stride] + 1)) >> 11;
    }
}
/* Thirdpel MC, (2/3, 1/3) position: 2-D blend of the four neighbours with
 * weights 3:4:2:3. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (3 * s[col] + 4 * s[col + 1]
                            + 2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15;
    }
}
/* Thirdpel MC, (2/3, 2/3) position: 2-D blend of the four neighbours with
 * weights 2:3:3:4. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (2731 * (2 * s[col] + 3 * s[col + 1]
                            + 3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15;
    }
}
/* Thirdpel MC, full-pel position (0,0), averaging mode: dispatch to the
 * plain round-averaging copy for the given block width; unknown widths are
 * a no-op, exactly as the original switch (no default) behaved. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    if (width == 2)
        avg_pixels2_c (dst, src, stride, height);
    else if (width == 4)
        avg_pixels4_c (dst, src, stride, height);
    else if (width == 8)
        avg_pixels8_c (dst, src, stride, height);
    else if (width == 16)
        avg_pixels16_c(dst, src, stride, height);
}
/* Thirdpel MC, horizontal 1/3 position, averaging mode: interpolate as in
 * the put variant, then round-average with the existing destination. */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
/* Thirdpel MC, horizontal 2/3 position, averaging mode. */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + 1] + 1)) >> 11) + 1) >> 1;
    }
}
/* Thirdpel MC, vertical 1/3 position, averaging mode. */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (2 * s[col] + s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
/* Thirdpel MC, (1/3, 1/3) position, averaging mode (weights 4:3:3:2). */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (4 * s[col] + 3 * s[col + 1]
                                       + 3 * s[col + stride] + 2 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
/* Thirdpel MC, (1/3, 2/3) position, averaging mode (weights 3:2:4:3). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 2 * s[col + 1]
                                       + 4 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
/* Thirdpel MC, vertical 2/3 position, averaging mode. */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((683 * (s[col] + 2 * s[col + stride] + 1)) >> 11) + 1) >> 1;
    }
}
/* Thirdpel MC, (2/3, 1/3) position, averaging mode (weights 3:4:2:3). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (3 * s[col] + 4 * s[col + 1]
                                       + 2 * s[col + stride] + 3 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
/* Thirdpel MC, (2/3, 2/3) position, averaging mode (weights 2:3:3:4). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int row, col;
    for (row = 0; row < height; row++) {
        const uint8_t *s = src + row * stride;
        uint8_t       *d = dst + row * stride;
        for (col = 0; col < width; col++)
            d[col] = (d[col] + ((2731 * (2 * s[col] + 3 * s[col + 1]
                                       + 3 * s[col + stride] + 4 * s[col + stride + 1] + 6)) >> 15) + 1) >> 1;
    }
}
/**
 * TPEL_WIDTH(width): instantiate fixed-width thirdpel "put" wrappers for all
 * nine fractional positions (mc00..mc22) by binding the generic variable-width
 * routines to a compile-time width.
 * Fix: each wrapper body previously read "void put_tpel_pixels_mcXX_c(...);",
 * which is an invalid declaration, not a call — the spurious "void" is removed
 * so the generic routine is actually invoked.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * H264_CHROMA_MC(OPNAME, OP): instantiate H.264 chroma motion compensation
 * for 2-, 4- and 8-pixel-wide blocks. (x, y) is the 1/8-pel fractional
 * offset; A..D are the four bilinear weights (summing to 64) and OP folds
 * the 6-bit-weighted sum into the destination (put or avg).
 * When D == 0 the offset is purely horizontal or vertical, so only a 1-D
 * two-tap blend with weight E = B + C along `step` is needed.
 * NOTE(review): branch scaffolding (if(D){...}else{...}, row-advance
 * statements) was lost in extraction — only the arithmetic lines survive.
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        /* full 2-D bilinear case */\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
        /* 1-D case: step picks the vertical (stride) or horizontal (1) neighbour */\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=( x)*(8-y);\
    const int C=(8-x)*( y);\
    const int D=( x)*( y);\
    assert(x<8 && y<8 && x>=0 && y>=0);\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
/* H.264 chroma pixel ops: `b` is the raw 64-weighted bilinear sum, so add 32
 * and shift by 6 to round; op_avg additionally round-averages with the
 * existing destination pixel. */
#define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
#define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_/avg_ h264_chroma_mc{2,4,8}_c from the template above. */
H264_CHROMA_MC(put_ , op_put)
H264_CHROMA_MC(avg_ , op_avg)
/* No-rounding 8-wide chroma MC (VC-1 style): bilinear blend with 1/8-pel
 * weights A..D (summing to 64), using the reduced rounding constant 32-4=28
 * instead of the standard 32 before the >>6 normalisation. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A = (8 - x) * (8 - y);
    const int B = (    x) * (8 - y);
    const int C = (8 - x) * (    y);
    const int D = (    x) * (    y);
    int row, col;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for (row = 0; row < h; row++) {
        for (col = 0; col < 8; col++)
            dst[col] = (A * src[col]          + B * src[col + 1]
                      + C * src[stride + col] + D * src[stride + col + 1]
                      + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
1705 #define QPEL_MC(r, OPNAME, RND, OP) \
1706 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1707 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1711 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1712 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1713 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1714 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1715 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1716 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1717 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1718 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1724 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1726 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1730 const int src0= src[0*srcStride];\
1731 const int src1= src[1*srcStride];\
1732 const int src2= src[2*srcStride];\
1733 const int src3= src[3*srcStride];\
1734 const int src4= src[4*srcStride];\
1735 const int src5= src[5*srcStride];\
1736 const int src6= src[6*srcStride];\
1737 const int src7= src[7*srcStride];\
1738 const int src8= src[8*srcStride];\
1739 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1740 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1741 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1742 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1743 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1744 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1745 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1746 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1752 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1753 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1758 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1759 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1760 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1761 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1762 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1763 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1764 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1765 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1766 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1767 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1768 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1769 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1770 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1771 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1772 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1773 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1779 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1780 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1785 const int src0= src[0*srcStride];\
1786 const int src1= src[1*srcStride];\
1787 const int src2= src[2*srcStride];\
1788 const int src3= src[3*srcStride];\
1789 const int src4= src[4*srcStride];\
1790 const int src5= src[5*srcStride];\
1791 const int src6= src[6*srcStride];\
1792 const int src7= src[7*srcStride];\
1793 const int src8= src[8*srcStride];\
1794 const int src9= src[9*srcStride];\
1795 const int src10= src[10*srcStride];\
1796 const int src11= src[11*srcStride];\
1797 const int src12= src[12*srcStride];\
1798 const int src13= src[13*srcStride];\
1799 const int src14= src[14*srcStride];\
1800 const int src15= src[15*srcStride];\
1801 const int src16= src[16*srcStride];\
1802 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1803 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1804 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1805 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1806 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1807 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1808 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1809 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1810 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1811 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1812 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1813 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1814 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1815 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1816 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1817 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1823 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1824 OPNAME ## pixels8_c(dst, src, stride, 8);\
1827 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1829 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1830 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1833 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1834 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1837 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1839 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1840 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1843 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1846 copy_block9(full, src, 16, stride, 9);\
1847 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1848 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1851 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1852 uint8_t full[16*9];\
1853 copy_block9(full, src, 16, stride, 9);\
1854 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1857 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1858 uint8_t full[16*9];\
1860 copy_block9(full, src, 16, stride, 9);\
1861 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1862 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1864 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1865 uint8_t full[16*9];\
1868 uint8_t halfHV[64];\
1869 copy_block9(full, src, 16, stride, 9);\
1870 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1871 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1872 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1873 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1875 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1876 uint8_t full[16*9];\
1878 uint8_t halfHV[64];\
1879 copy_block9(full, src, 16, stride, 9);\
1880 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1881 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1882 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1883 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1885 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1886 uint8_t full[16*9];\
1889 uint8_t halfHV[64];\
1890 copy_block9(full, src, 16, stride, 9);\
1891 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1892 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1893 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1894 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1896 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1897 uint8_t full[16*9];\
1899 uint8_t halfHV[64];\
1900 copy_block9(full, src, 16, stride, 9);\
1901 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1902 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1903 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1904 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1906 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1907 uint8_t full[16*9];\
1910 uint8_t halfHV[64];\
1911 copy_block9(full, src, 16, stride, 9);\
1912 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1913 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1914 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1915 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1917 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1918 uint8_t full[16*9];\
1920 uint8_t halfHV[64];\
1921 copy_block9(full, src, 16, stride, 9);\
1922 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1923 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1924 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1925 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1927 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1928 uint8_t full[16*9];\
1931 uint8_t halfHV[64];\
1932 copy_block9(full, src, 16, stride, 9);\
1933 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1934 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1935 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1936 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1938 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1939 uint8_t full[16*9];\
1941 uint8_t halfHV[64];\
1942 copy_block9(full, src, 16, stride, 9);\
1943 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1944 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1945 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1946 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1948 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1950 uint8_t halfHV[64];\
1951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1952 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1953 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1955 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t halfHV[64];\
1958 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1959 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1960 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1962 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1963 uint8_t full[16*9];\
1966 uint8_t halfHV[64];\
1967 copy_block9(full, src, 16, stride, 9);\
1968 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1969 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1970 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1971 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1973 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t full[16*9];\
1976 copy_block9(full, src, 16, stride, 9);\
1977 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1978 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1979 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1981 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1982 uint8_t full[16*9];\
1985 uint8_t halfHV[64];\
1986 copy_block9(full, src, 16, stride, 9);\
1987 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1988 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1989 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1990 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1992 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1993 uint8_t full[16*9];\
1995 copy_block9(full, src, 16, stride, 9);\
1996 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1997 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1998 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2000 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2002 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2003 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2005 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2006 OPNAME ## pixels16_c(dst, src, stride, 16);\
2009 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2011 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2012 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2015 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2016 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2019 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2021 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2022 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2025 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2028 copy_block17(full, src, 24, stride, 17);\
2029 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2030 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2033 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[24*17];\
2035 copy_block17(full, src, 24, stride, 17);\
2036 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2039 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2040 uint8_t full[24*17];\
2042 copy_block17(full, src, 24, stride, 17);\
2043 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2044 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2046 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t full[24*17];\
2048 uint8_t halfH[272];\
2049 uint8_t halfV[256];\
2050 uint8_t halfHV[256];\
2051 copy_block17(full, src, 24, stride, 17);\
2052 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2053 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2054 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2055 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2057 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2058 uint8_t full[24*17];\
2059 uint8_t halfH[272];\
2060 uint8_t halfHV[256];\
2061 copy_block17(full, src, 24, stride, 17);\
2062 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2063 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2064 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2065 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2067 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2068 uint8_t full[24*17];\
2069 uint8_t halfH[272];\
2070 uint8_t halfV[256];\
2071 uint8_t halfHV[256];\
2072 copy_block17(full, src, 24, stride, 17);\
2073 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2074 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2075 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2076 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2078 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2079 uint8_t full[24*17];\
2080 uint8_t halfH[272];\
2081 uint8_t halfHV[256];\
2082 copy_block17(full, src, 24, stride, 17);\
2083 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2084 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2085 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2086 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2088 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2089 uint8_t full[24*17];\
2090 uint8_t halfH[272];\
2091 uint8_t halfV[256];\
2092 uint8_t halfHV[256];\
2093 copy_block17(full, src, 24, stride, 17);\
2094 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2095 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2096 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2097 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2099 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2100 uint8_t full[24*17];\
2101 uint8_t halfH[272];\
2102 uint8_t halfHV[256];\
2103 copy_block17(full, src, 24, stride, 17);\
2104 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2105 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2106 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2107 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2109 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2110 uint8_t full[24*17];\
2111 uint8_t halfH[272];\
2112 uint8_t halfV[256];\
2113 uint8_t halfHV[256];\
2114 copy_block17(full, src, 24, stride, 17);\
2115 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2116 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2117 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2118 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2120 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2121 uint8_t full[24*17];\
2122 uint8_t halfH[272];\
2123 uint8_t halfHV[256];\
2124 copy_block17(full, src, 24, stride, 17);\
2125 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2126 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2127 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2128 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2130 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2131 uint8_t halfH[272];\
2132 uint8_t halfHV[256];\
2133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2134 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2135 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2137 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2138 uint8_t halfH[272];\
2139 uint8_t halfHV[256];\
2140 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2141 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2142 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2144 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2145 uint8_t full[24*17];\
2146 uint8_t halfH[272];\
2147 uint8_t halfV[256];\
2148 uint8_t halfHV[256];\
2149 copy_block17(full, src, 24, stride, 17);\
2150 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2151 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2152 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2153 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2155 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2156 uint8_t full[24*17];\
2157 uint8_t halfH[272];\
2158 copy_block17(full, src, 24, stride, 17);\
2159 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2160 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2161 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2163 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2164 uint8_t full[24*17];\
2165 uint8_t halfH[272];\
2166 uint8_t halfV[256];\
2167 uint8_t halfHV[256];\
2168 copy_block17(full, src, 24, stride, 17);\
2169 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2170 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2171 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2172 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2174 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2175 uint8_t full[24*17];\
2176 uint8_t halfH[272];\
2177 copy_block17(full, src, 24, stride, 17);\
2178 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2179 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2180 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2182 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2183 uint8_t halfH[272];\
2184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2185 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store operators for the MPEG-4 qpel filters: the filter sum has a
 * gain of 32, so (+16)>>5 rounds and (+15)>>5 is the no-rounding variant;
 * cm[] clips the result to 0..255. op_avg additionally averages with the
 * destination (motion-compensated averaging). */
2188 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2189 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2190 #define op_put(a, b) a = cm[((b) + 16)>>5]
2191 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate put, no-rounding put and avg flavours of the qpel MC set. */
2193 QPEL_MC(0, put_ , _ , op_put)
2194 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2195 QPEL_MC(0, avg_ , _ , op_avg)
2196 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2198 #undef op_avg_no_rnd
2200 #undef op_put_no_rnd
2203 #define H264_LOWPASS(OPNAME, OP, OP2) \
/* Generates the H.264 6-tap (1,-5,20,20,-5,1) half-pel interpolation        */\
/* primitives for 2-, 4-, 8- and 16-wide blocks: _h (horizontal), _v         */\
/* (vertical) and _hv (horizontal pass into a 16-bit tmp plane, then         */\
/* vertical).  OP stores a one-pass result (normalised by >>5 inside OP),    */\
/* OP2 a two-pass result (normalised by >>10); both clip via cm[].           */\
2204 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2206 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2210 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2211 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
/* Vertical pass: taps reach 2 rows above and 3 rows below each output row. */\
2217 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2219 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2223 const int srcB= src[-2*srcStride];\
2224 const int srcA= src[-1*srcStride];\
2225 const int src0= src[0 *srcStride];\
2226 const int src1= src[1 *srcStride];\
2227 const int src2= src[2 *srcStride];\
2228 const int src3= src[3 *srcStride];\
2229 const int src4= src[4 *srcStride];\
2230 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2231 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
/* h+5 filtered rows are produced so the vertical pass has its 5 extra taps. */\
2237 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2240 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2242 src -= 2*srcStride;\
2243 for(i=0; i<h+5; i++)\
2245 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2246 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
/* Rewind tmp so it points 2 rows into the filtered plane for pass 2.       */\
2250 tmp -= tmpStride*(h+5-2);\
2253 const int tmpB= tmp[-2*tmpStride];\
2254 const int tmpA= tmp[-1*tmpStride];\
2255 const int tmp0= tmp[0 *tmpStride];\
2256 const int tmp1= tmp[1 *tmpStride];\
2257 const int tmp2= tmp[2 *tmpStride];\
2258 const int tmp3= tmp[3 *tmpStride];\
2259 const int tmp4= tmp[4 *tmpStride];\
2260 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2261 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
/* 4-wide variants: same filter, four outputs per row/column.               */\
2266 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2268 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2272 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2273 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2274 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2275 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2281 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2283 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2287 const int srcB= src[-2*srcStride];\
2288 const int srcA= src[-1*srcStride];\
2289 const int src0= src[0 *srcStride];\
2290 const int src1= src[1 *srcStride];\
2291 const int src2= src[2 *srcStride];\
2292 const int src3= src[3 *srcStride];\
2293 const int src4= src[4 *srcStride];\
2294 const int src5= src[5 *srcStride];\
2295 const int src6= src[6 *srcStride];\
2296 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2297 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2298 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2299 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2305 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2308 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2310 src -= 2*srcStride;\
2311 for(i=0; i<h+5; i++)\
2313 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2314 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2315 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2316 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2320 tmp -= tmpStride*(h+5-2);\
2323 const int tmpB= tmp[-2*tmpStride];\
2324 const int tmpA= tmp[-1*tmpStride];\
2325 const int tmp0= tmp[0 *tmpStride];\
2326 const int tmp1= tmp[1 *tmpStride];\
2327 const int tmp2= tmp[2 *tmpStride];\
2328 const int tmp3= tmp[3 *tmpStride];\
2329 const int tmp4= tmp[4 *tmpStride];\
2330 const int tmp5= tmp[5 *tmpStride];\
2331 const int tmp6= tmp[6 *tmpStride];\
2332 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2333 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2334 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2335 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide variants.                                                          */\
2341 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2343 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2347 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2348 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2349 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2350 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2351 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2352 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2353 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2354 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2360 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2362 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2366 const int srcB= src[-2*srcStride];\
2367 const int srcA= src[-1*srcStride];\
2368 const int src0= src[0 *srcStride];\
2369 const int src1= src[1 *srcStride];\
2370 const int src2= src[2 *srcStride];\
2371 const int src3= src[3 *srcStride];\
2372 const int src4= src[4 *srcStride];\
2373 const int src5= src[5 *srcStride];\
2374 const int src6= src[6 *srcStride];\
2375 const int src7= src[7 *srcStride];\
2376 const int src8= src[8 *srcStride];\
2377 const int src9= src[9 *srcStride];\
2378 const int src10=src[10*srcStride];\
2379 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2380 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2381 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2382 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2383 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2384 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2385 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2386 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2392 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2395 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2397 src -= 2*srcStride;\
2398 for(i=0; i<h+5; i++)\
2400 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2401 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2402 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2403 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2404 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2405 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2406 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2407 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2411 tmp -= tmpStride*(h+5-2);\
2414 const int tmpB= tmp[-2*tmpStride];\
2415 const int tmpA= tmp[-1*tmpStride];\
2416 const int tmp0= tmp[0 *tmpStride];\
2417 const int tmp1= tmp[1 *tmpStride];\
2418 const int tmp2= tmp[2 *tmpStride];\
2419 const int tmp3= tmp[3 *tmpStride];\
2420 const int tmp4= tmp[4 *tmpStride];\
2421 const int tmp5= tmp[5 *tmpStride];\
2422 const int tmp6= tmp[6 *tmpStride];\
2423 const int tmp7= tmp[7 *tmpStride];\
2424 const int tmp8= tmp[8 *tmpStride];\
2425 const int tmp9= tmp[9 *tmpStride];\
2426 const int tmp10=tmp[10*tmpStride];\
2427 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2428 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2429 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2430 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2431 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2432 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2433 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2434 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide versions tile the 8-wide ones in a 2x2 pattern.                   */\
2440 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2441 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2442 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2443 src += 8*srcStride;\
2444 dst += 8*dstStride;\
2445 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2446 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2449 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2450 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2451 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2452 src += 8*srcStride;\
2453 dst += 8*dstStride;\
2454 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2455 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2458 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2459 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2460 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2461 src += 8*srcStride;\
2462 dst += 8*dstStride;\
2463 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2464 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2467 #define H264_MC(OPNAME, SIZE) \
/* Generates the 16 H.264 quarter-pel motion-compensation functions         */\
/* _mcXY for one block SIZE, where X,Y in 0..3 are the quarter-pel          */\
/* fractional offsets.  Half-pel planes come from the _h/_v/_hv lowpass     */\
/* primitives; quarter positions average two planes with pixels_l2.  The    */\
/* 'full' buffer holds a copy with 2 rows of context above and 3 below      */\
/* (SIZE+5 rows) for the vertical filter; full_mid points at the block.     */\
2468 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2469 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2472 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2473 uint8_t half[SIZE*SIZE];\
2474 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2475 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2478 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2479 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2482 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2483 uint8_t half[SIZE*SIZE];\
2484 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2485 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2488 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2489 uint8_t full[SIZE*(SIZE+5)];\
2490 uint8_t * const full_mid= full + SIZE*2;\
2491 uint8_t half[SIZE*SIZE];\
2492 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2493 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2494 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2497 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2498 uint8_t full[SIZE*(SIZE+5)];\
2499 uint8_t * const full_mid= full + SIZE*2;\
2500 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2501 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2504 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2505 uint8_t full[SIZE*(SIZE+5)];\
2506 uint8_t * const full_mid= full + SIZE*2;\
2507 uint8_t half[SIZE*SIZE];\
2508 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2509 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2510 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
/* Diagonal quarter positions: average the horizontal and vertical          */\
/* half-pel planes (mc11/mc31 shift the vertical source by +1 column,       */\
/* mc13/mc33 shift the horizontal source down by one row).                  */\
2513 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2514 uint8_t full[SIZE*(SIZE+5)];\
2515 uint8_t * const full_mid= full + SIZE*2;\
2516 uint8_t halfH[SIZE*SIZE];\
2517 uint8_t halfV[SIZE*SIZE];\
2518 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2519 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2520 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2521 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2524 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2525 uint8_t full[SIZE*(SIZE+5)];\
2526 uint8_t * const full_mid= full + SIZE*2;\
2527 uint8_t halfH[SIZE*SIZE];\
2528 uint8_t halfV[SIZE*SIZE];\
2529 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2530 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2531 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2532 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2535 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2536 uint8_t full[SIZE*(SIZE+5)];\
2537 uint8_t * const full_mid= full + SIZE*2;\
2538 uint8_t halfH[SIZE*SIZE];\
2539 uint8_t halfV[SIZE*SIZE];\
2540 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2541 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2542 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2543 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2546 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2547 uint8_t full[SIZE*(SIZE+5)];\
2548 uint8_t * const full_mid= full + SIZE*2;\
2549 uint8_t halfH[SIZE*SIZE];\
2550 uint8_t halfV[SIZE*SIZE];\
2551 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2552 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2553 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2554 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
/* Centre and half-diagonal positions use the 2D (hv) filter.               */\
2557 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2558 int16_t tmp[SIZE*(SIZE+5)];\
2559 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2562 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2563 int16_t tmp[SIZE*(SIZE+5)];\
2564 uint8_t halfH[SIZE*SIZE];\
2565 uint8_t halfHV[SIZE*SIZE];\
2566 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2567 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2568 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2571 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2572 int16_t tmp[SIZE*(SIZE+5)];\
2573 uint8_t halfH[SIZE*SIZE];\
2574 uint8_t halfHV[SIZE*SIZE];\
2575 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2576 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2577 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2580 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2581 uint8_t full[SIZE*(SIZE+5)];\
2582 uint8_t * const full_mid= full + SIZE*2;\
2583 int16_t tmp[SIZE*(SIZE+5)];\
2584 uint8_t halfV[SIZE*SIZE];\
2585 uint8_t halfHV[SIZE*SIZE];\
2586 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2587 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2588 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2589 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2592 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2593 uint8_t full[SIZE*(SIZE+5)];\
2594 uint8_t * const full_mid= full + SIZE*2;\
2595 int16_t tmp[SIZE*(SIZE+5)];\
2596 uint8_t halfV[SIZE*SIZE];\
2597 uint8_t halfHV[SIZE*SIZE];\
2598 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2599 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2600 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2601 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for the H.264 lowpass filters.  One-pass results carry a
 * gain of 32 ((+16)>>5 rounds); two-pass (hv) results carry a gain of 1024
 * ((+512)>>10).  cm[] clips to 0..255; the avg variants round-average with
 * the existing destination pixel. */
2604 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2605 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2606 #define op_put(a, b) a = cm[((b) + 16)>>5]
2607 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2608 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2610 H264_LOWPASS(put_ , op_put, op2_put)
2611 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 weighted prediction.  op_scale1: explicit single-direction weight,
 * p' = clip((p*weight + offset) >> log2_denom).  op_scale2: bidirectional
 * weighting of src into dst with an extra bit of denominator. */
2626 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2627 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2628 #define H264_WEIGHT(W,H) \
/* In-place weighting of a WxH block; offset is pre-scaled and biased by    */\
/* half the denominator so the shift below rounds to nearest.               */\
2629 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2631 offset <<= log2_denom; \
2632 if(log2_denom) offset += 1<<(log2_denom-1); \
2633 for(y=0; y<H; y++, block += stride){ \
2636 if(W==2) continue; \
2639 if(W==4) continue; \
2644 if(W==8) continue; \
/* Bidirectional weighting of src into dst.  ((offset+1)|1) forces an odd   */\
/* rounding offset as required by the spec's biprediction formula.          */\
2655 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2657 offset = ((offset + 1) | 1) << log2_denom; \
2658 for(y=0; y<H; y++, dst += stride, src += stride){ \
2661 if(W==2) continue; \
2664 if(W==4) continue; \
2669 if(W==8) continue; \
/* WMV2 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with rounding,
 * 8 pixels per row for h rows; output clipped through cm[]. */
2696 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2697 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2701 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2702 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2703 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2704 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2705 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2706 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2707 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2708 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2714 #ifdef CONFIG_CAVS_DECODER
/* CAVS: the integer-pel (mc00) positions are plain block copies/averages,
 * shared with the generic pixel primitives; the real CAVS qpel code lives
 * in cavsdsp. */
2716 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2718 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2719 put_pixels8_c(dst, src, stride, 8);
2721 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2722 avg_pixels8_c(dst, src, stride, 8);
2724 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2725 put_pixels16_c(dst, src, stride, 16);
2727 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2728 avg_pixels16_c(dst, src, stride, 16);
2730 #endif /* CONFIG_CAVS_DECODER */
2732 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2734 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
/* VC-1 integer-pel position is a plain 8x8 copy; rnd has no effect here. */
2736 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2737 put_pixels8_c(dst, src, stride, 8);
2739 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
/* Init entry points implemented in other translation units. */
2741 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2744 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2746 #if defined(CONFIG_RV40_DECODER)
/* RV40: the (3,3) luma position is defined as the plain half-pel xy2
 * average rather than the RV40 filter, so reuse the generic primitives. */
2747 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2748 put_pixels16_xy2_c(dst, src, stride, 16);
2750 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2751 avg_pixels16_xy2_c(dst, src, stride, 16);
2753 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2754 put_pixels8_xy2_c(dst, src, stride, 8);
2756 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2757 avg_pixels8_xy2_c(dst, src, stride, 8);
2760 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2761 #endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical half-pel filter: same (-1,9,9,-1)/16 kernel as the
 * horizontal pass, applied down one column of 8 outputs; w columns total. */
2763 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2764 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2768 const int src_1= src[ -srcStride];
2769 const int src0 = src[0 ];
2770 const int src1 = src[ srcStride];
2771 const int src2 = src[2*srcStride];
2772 const int src3 = src[3*srcStride];
2773 const int src4 = src[4*srcStride];
2774 const int src5 = src[5*srcStride];
2775 const int src6 = src[6*srcStride];
2776 const int src7 = src[7*srcStride];
2777 const int src8 = src[8*srcStride];
2778 const int src9 = src[9*srcStride];
2779 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2780 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2781 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2782 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2783 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2784 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2785 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2786 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion compensation for the positions the codec uses.
 * mcXY: X is the horizontal, Y the vertical half-pel offset (0..3 grid as
 * in the qpel naming).  Combined positions filter into intermediate
 * buffers and blend with put_pixels8_l2. */
2792 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2793 put_pixels8_c(dst, src, stride, 8);
2796 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2798 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2799 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2802 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2803 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2806 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2808 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2809 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2812 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2813 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* The diagonal cases filter 11 rows horizontally (one above, two below)
 * so the vertical pass over halfH has the context rows it needs. */
2816 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2820 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2821 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2822 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2823 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2825 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2829 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2830 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2831 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2832 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2834 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2836 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2837 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 Annex J deblocking, vertical edge filter (filters across a
 * horizontal block boundary).  p0,p1 are above the edge, p2,p3 below;
 * d is the edge gradient, mapped to the correction d1 by the Annex J
 * ramp function of 'strength' (derived from qscale). */
2840 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2841 if(ENABLE_ANY_H263) {
2843 const int strength= ff_h263_loop_filter_strength[qscale];
2847 int p0= src[x-2*stride];
2848 int p1= src[x-1*stride];
2849 int p2= src[x+0*stride];
2850 int p3= src[x+1*stride];
2851 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2853 if (d<-2*strength) d1= 0;
2854 else if(d<- strength) d1=-2*strength - d;
2855 else if(d< strength) d1= d;
2856 else if(d< 2*strength) d1= 2*strength - d;
/* Branch-free clip of the corrected p1/p2 to 0..255: bit 8 set means the
 * value left the byte range; ~(x>>31) is then 0 or 255. */
2861 if(p1&256) p1= ~(p1>>31);
2862 if(p2&256) p2= ~(p2>>31);
2864 src[x-1*stride] = p1;
2865 src[x+0*stride] = p2;
/* Secondary correction d2 applied to the outer pair, limited by |d1|. */
2869 d2= av_clip((p0-p3)/4, -ad1, ad1);
2871 src[x-2*stride] = p0 - d2;
2872 src[x+ stride] = p3 + d2;
/* H.263 Annex J deblocking, horizontal edge filter (filters across a
 * vertical block boundary).  Mirror of h263_v_loop_filter_c with the
 * pixel taps taken along a row instead of a column. */
2877 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2878 if(ENABLE_ANY_H263) {
2880 const int strength= ff_h263_loop_filter_strength[qscale];
2884 int p0= src[y*stride-2];
2885 int p1= src[y*stride-1];
2886 int p2= src[y*stride+0];
2887 int p3= src[y*stride+1];
2888 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2890 if (d<-2*strength) d1= 0;
2891 else if(d<- strength) d1=-2*strength - d;
2892 else if(d< strength) d1= d;
2893 else if(d< 2*strength) d1= 2*strength - d;
/* Branch-free clip of corrected p1/p2 to 0..255 (see vertical variant). */
2898 if(p1&256) p1= ~(p1>>31);
2899 if(p2&256) p2= ~(p2>>31);
2901 src[y*stride-1] = p1;
2902 src[y*stride+0] = p2;
2906 d2= av_clip((p0-p3)/4, -ad1, ad1);
2908 src[y*stride-2] = p0 - d2;
2909 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable (1,2,1)/4 smoothing of an 8x8 block.
 * A 4x-scaled vertical pass is accumulated into temp[] (edge rows are
 * copied scaled), then the horizontal pass normalises with the combined
 * rounding (+8)>>4; edge columns only undo the vertical scaling. */
2914 static void h261_loop_filter_c(uint8_t *src, int stride){
2919 temp[x ] = 4*src[x ];
2920 temp[x + 7*8] = 4*src[x + 7*stride];
2924 xy = y * stride + x;
2926 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2931 src[ y*stride] = (temp[ y*8] + 2)>>2;
2932 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2934 xy = y * stride + x;
2936 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (bS < 4) luma deblocking along one edge.  xstride steps
 * across the edge, ystride along it; tc0[i] gives the clipping threshold
 * for each group of 4 samples.  Filters p0/q0 always (when the alpha/beta
 * activity checks pass) and p1/q1 conditionally on the p2/q2 flatness
 * test, widening the clip range tc accordingly (elided lines). */
2941 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2944 for( i = 0; i < 4; i++ ) {
2949 for( d = 0; d < 4; d++ ) {
2950 const int p0 = pix[-1*xstride];
2951 const int p1 = pix[-2*xstride];
2952 const int p2 = pix[-3*xstride];
2953 const int q0 = pix[0];
2954 const int q1 = pix[1*xstride];
2955 const int q2 = pix[2*xstride];
2957 if( FFABS( p0 - q0 ) < alpha &&
2958 FFABS( p1 - p0 ) < beta &&
2959 FFABS( q1 - q0 ) < beta ) {
2964 if( FFABS( p2 - p0 ) < beta ) {
2965 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2968 if( FFABS( q2 - q0 ) < beta ) {
2969 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2973 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2974 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2975 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Edge-direction wrappers: vertical filtering steps by stride across the
 * edge, horizontal filtering steps by 1 (the strides are swapped). */
2981 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2983 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2985 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2987 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (bS == 4, intra) luma deblocking along one 16-sample edge.
 * Inside the activity window, a close p0/q0 pair (|p0-q0| < alpha/4 + 2)
 * selects the strong 3-pixel filter per side (gated on p2/q2 flatness);
 * otherwise only p0/q0 are smoothed with the weak formula. */
2990 static inline void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
2993 for( d = 0; d < 16; d++ ) {
2994 const int p2 = pix[-3*xstride];
2995 const int p1 = pix[-2*xstride];
2996 const int p0 = pix[-1*xstride];
2998 const int q0 = pix[ 0*xstride];
2999 const int q1 = pix[ 1*xstride];
3000 const int q2 = pix[ 2*xstride];
3002 if( FFABS( p0 - q0 ) < alpha &&
3003 FFABS( p1 - p0 ) < beta &&
3004 FFABS( q1 - q0 ) < beta ) {
3006 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3007 if( FFABS( p2 - p0 ) < beta)
3009 const int p3 = pix[-4*xstride];
/* Strong filter, p side: p0', p1', p2' */
3011 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3012 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3013 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3016 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3018 if( FFABS( q2 - q0 ) < beta)
3020 const int q3 = pix[3*xstride];
/* Strong filter, q side: q0', q1', q2' */
3022 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3023 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3024 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3027 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Weak fallback: only p0/q0 are modified. */
3031 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3032 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Vertical / horizontal dispatch wrappers for the intra luma filter above;
 * same (xstride, ystride) swap convention as the non-intra wrappers. */
3038 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3040     h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3042 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3044     h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
/* H.264 chroma deblocking filter (tc0-clipped, normal strength).
 * 4 groups x 2 lines = 8 chroma samples per edge; each group uses its own
 * tc0[i] clip value.  Only p0/q0 are modified, by the standard
 * ((q0-p0)*4 + (p1-q1) + 4) >> 3 delta clipped to [-tc, tc].
 * (Skip-on-tc<0 branch and inner-loop advance are missing from this view.) */
3047 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3050     for( i = 0; i < 4; i++ ) {
3051         const int tc = tc0[i];
3056         for( d = 0; d < 2; d++ ) {
3057             const int p0 = pix[-1*xstride];
3058             const int p1 = pix[-2*xstride];
3059             const int q0 = pix[0];
3060             const int q1 = pix[1*xstride];
3062             if( FFABS( p0 - q0 ) < alpha &&
3063                 FFABS( p1 - p0 ) < beta &&
3064                 FFABS( q1 - q0 ) < beta ) {
3066                 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3068                 pix[-xstride] = av_clip_uint8( p0 + delta );    /* p0' */
3069                 pix[0]        = av_clip_uint8( q0 - delta );    /* q0' */
/* Vertical / horizontal dispatch wrappers for the chroma filter above. */
3075 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3077     h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3079 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3081     h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 intra chroma deblocking: 8 samples per edge, no tc clipping —
 * p0/q0 are replaced by fixed 3-tap averages when the alpha/beta
 * thresholds pass. */
3084 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3087     for( d = 0; d < 8; d++ ) {
3088         const int p0 = pix[-1*xstride];
3089         const int p1 = pix[-2*xstride];
3090         const int q0 = pix[0];
3091         const int q1 = pix[1*xstride];
3093         if( FFABS( p0 - q0 ) < alpha &&
3094             FFABS( p1 - p0 ) < beta &&
3095             FFABS( q1 - q0 ) < beta ) {
3097             pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
3098             pix[0]        = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
/* Vertical / horizontal dispatch wrappers for the intra chroma filter. */
3103 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3105     h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3107 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3109     h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* Sum of absolute differences (SAD) over a 16-wide block, h rows.
 * One fully unrolled row shown; the enclosing per-row loop, the pointer
 * advances by line_size, and the `return s;` are missing from this
 * extracted view.  `v` is an unused context pointer kept for the
 * me_cmp_func signature. */
3112 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3118         s += abs(pix1[0] - pix2[0]);
3119         s += abs(pix1[1] - pix2[1]);
3120         s += abs(pix1[2] - pix2[2]);
3121         s += abs(pix1[3] - pix2[3]);
3122         s += abs(pix1[4] - pix2[4]);
3123         s += abs(pix1[5] - pix2[5]);
3124         s += abs(pix1[6] - pix2[6]);
3125         s += abs(pix1[7] - pix2[7]);
3126         s += abs(pix1[8] - pix2[8]);
3127         s += abs(pix1[9] - pix2[9]);
3128         s += abs(pix1[10] - pix2[10]);
3129         s += abs(pix1[11] - pix2[11]);
3130         s += abs(pix1[12] - pix2[12]);
3131         s += abs(pix1[13] - pix2[13]);
3132         s += abs(pix1[14] - pix2[14]);
3133         s += abs(pix1[15] - pix2[15]);
/* SAD vs. the half-pel horizontally interpolated reference:
 * pix2 is averaged with its right neighbour via avg2() before comparing.
 * Note the row reads pix2[16] (one past the 16-wide block) for the last
 * average — callers must provide that extra column. */
3140 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3146         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3147         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3148         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3149         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3150         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3151         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3152         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3153         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3154         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3155         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3156         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3157         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3158         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3159         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3160         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3161         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD vs. the half-pel vertically interpolated reference: each pix2 sample
 * is averaged with the sample directly below it (pix3 = pix2 + line_size). */
3168 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3171     uint8_t *pix3 = pix2 + line_size;
3175         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3176         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3177         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3178         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3179         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3180         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3181         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3182         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3183         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3184         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3185         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3186         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3187         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3188         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3189         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3190         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD vs. the half-pel diagonally interpolated reference: each comparison
 * uses the 4-sample average avg4() of the 2x2 neighbourhood spanning the
 * current row (pix2) and the next row (pix3).  Reads one extra column
 * (index 16) like the _x2 variant. */
3198 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3201     uint8_t *pix3 = pix2 + line_size;
3205         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3206         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3207         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3208         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3209         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3210         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3211         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3212         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3213         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3214         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3215         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3216         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3217         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3218         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3219         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3220         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD, one unrolled row; same structure as pix_abs16_c. */
3228 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3234         s += abs(pix1[0] - pix2[0]);
3235         s += abs(pix1[1] - pix2[1]);
3236         s += abs(pix1[2] - pix2[2]);
3237         s += abs(pix1[3] - pix2[3]);
3238         s += abs(pix1[4] - pix2[4]);
3239         s += abs(pix1[5] - pix2[5]);
3240         s += abs(pix1[6] - pix2[6]);
3241         s += abs(pix1[7] - pix2[7]);
/* 8-wide SAD vs. horizontally half-pel interpolated reference;
 * reads pix2[8] (one past the block) for the last average. */
3248 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3254         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3255         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3256         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3257         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3258         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3259         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3260         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3261         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* 8-wide SAD vs. vertically half-pel interpolated reference. */
3268 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3271     uint8_t *pix3 = pix2 + line_size;
3275         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3276         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3277         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3278         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3279         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3280         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3281         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3282         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* 8-wide SAD vs. diagonally half-pel interpolated reference (2x2 avg4). */
3290 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3293     uint8_t *pix3 = pix2 + line_size;
3297         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3298         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3299         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3300         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3301         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3302         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3303         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3304         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-preserving SSE, 16-wide: score1 is plain SSE between s1 and s2,
 * score2 accumulates the difference of 2x2 second-order gradients (a
 * texture/noise term).  The final metric weights |score2| by
 * avctx->nsse_weight, falling back to 8 when no context is given. */
3312 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3313     MpegEncContext *c = v;
3319         for(x=0; x<16; x++){
3320             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3323         for(x=0; x<15; x++){
3324             score2+= FFABS(  s1[x  ] - s1[x  +stride]
3325                              - s1[x+1] + s1[x+1+stride])
3326                     -FFABS(  s2[x  ] - s2[x  +stride]
3327                              - s2[x+1] + s2[x+1+stride]);
3334     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3335     else  return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; same two-term metric (SSE + weighted
 * gradient-difference).  The inner x-loops are missing from this view —
 * presumably x<8 and x<7 to mirror the 16-wide version; confirm upstream. */
3338 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3339     MpegEncContext *c = v;
3346             score1+= (s1[x  ] - s2[x ])*(s1[x  ] - s2[x ]);
3350             score2+= FFABS(  s1[x  ] - s1[x  +stride]
3351                              - s1[x+1] + s1[x+1+stride])
3352                     -FFABS(  s2[x  ] - s2[x  +stride]
3353                              - s2[x+1] + s2[x+1+stride]);
3360     if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3361     else  return score1 + FFABS(score2)*8;
/* Evaluates the weighted squared error that would remain if `basis`
 * scaled by `scale` were subtracted from `rem` (8x8 = 64 coefficients).
 * The basis term is rounded back from BASIS_SHIFT to RECON_SHIFT
 * precision; each error is weighted by w and scaled down by >>4. */
3364 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3368     for(i=0; i<8*8; i++){
3369         int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3372         assert(-512<b && b<512);
3374         sum += (w*b)*(w*b)>>4;
/* Adds `basis` scaled by `scale` into `rem` in place, with the same
 * BASIS_SHIFT -> RECON_SHIFT rounding as try_8x8basis_c. */
3379 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3382     for(i=0; i<8*8; i++){
3383         rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3388 * permutes an 8x8 block.
3389 * @param block the block which will be permuted according to the given permutation vector
3390 * @param permutation the permutation vector
3391 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3392 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3393  * (inverse) permuted to scantable order!
/* In-place permutation of a DCT block: the first pass (visible only as its
 * loop header here) evidently copies coefficients into `temp` via the
 * scantable; the second pass writes them back at their permuted positions.
 * Only coefficients up to `last` (in scan order) are touched. */
3395 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3401 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
3403     for(i=0; i<=last; i++){
3404         const int j= scantable[i];
3409     for(i=0; i<=last; i++){
3410         const int j= scantable[i];
3411         const int perm_j= permutation[j];
3412         block[perm_j]= temp[j];
/* zero_cmp: trivial compare function (body not visible; presumably returns
 * 0) used as the FF_CMP_ZERO entry.  ff_set_cmp fills the 5-slot me_cmp
 * table `cmp` by dispatching on `type`; only a few branch bodies survive
 * in this extracted view (hadamard/dct/dct264/dct_max/quant_psnr). */
3416 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
3420 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3423     memset(cmp, 0, sizeof(void*)*5);
3431             cmp[i]= c->hadamard8_diff[i];
3437             cmp[i]= c->dct_sad[i];
3440             cmp[i]= c->dct264_sad[i];
3443             cmp[i]= c->dct_max[i];
3446             cmp[i]= c->quant_psnr[i];
3466 #ifdef CONFIG_SNOW_ENCODER
3475             av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zero one 64-coefficient DCT block / six consecutive blocks. */
3480 static void clear_block_c(DCTELEM *block)
3482     memset(block, 0, sizeof(DCTELEM)*64);
3486  * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3488 static void clear_blocks_c(DCTELEM *blocks)
3490     memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes, done a machine word at a time using the
 * classic SWAR trick: add the low 7 bits of each byte, then patch the top
 * bit with XOR so no carry crosses byte lanes (pb_7f/pb_80 masks).
 * The scalar tail loop handles the remaining w % sizeof(long) bytes. */
3493 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3495     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3496         long a = *(long*)(src+i);
3497         long b = *(long*)(dst+i);
3498         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3501         dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i] for w bytes; same word-wide SWAR byte add
 * as add_bytes_c, with a scalar tail. */
3504 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3506     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3507         long a = *(long*)(src1+i);
3508         long b = *(long*)(src2+i);
3509         *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3512         dst[i] = src1[i]+src2[i];
/* dst[i] = src1[i] - src2[i] for w bytes.  On targets without fast
 * unaligned loads, a byte-wise 8x-unrolled loop is used when src2 is
 * misaligned; otherwise the word-wide SWAR subtraction (borrow confined
 * to byte lanes via pb_7f/pb_80) runs, followed by a scalar tail. */
3515 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3517 #ifndef HAVE_FAST_UNALIGNED
3518     if((long)src2 & (sizeof(long)-1)){
3519         for(i=0; i+7<w; i+=8){
3520             dst[i+0] = src1[i+0]-src2[i+0];
3521             dst[i+1] = src1[i+1]-src2[i+1];
3522             dst[i+2] = src1[i+2]-src2[i+2];
3523             dst[i+3] = src1[i+3]-src2[i+3];
3524             dst[i+4] = src1[i+4]-src2[i+4];
3525             dst[i+5] = src1[i+5]-src2[i+5];
3526             dst[i+6] = src1[i+6]-src2[i+6];
3527             dst[i+7] = src1[i+7]-src2[i+7];
3531     for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3532         long a = *(long*)(src1+i);
3533         long b = *(long*)(src2+i);
3534         *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3537         dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median-prediction residual: subtracts the median predictor
 * mid_pred(left, top, left+top-topleft) from each sample.  Only the
 * predictor line survives in this extracted view; the loop, the
 * subtraction into dst, and the *left/*left_top updates are missing. */
3540 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3548         const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Hadamard butterfly helpers (macro bodies are missing from this view):
 * BUTTERFLY2 writes sum/difference of i1,i2 to o1,o2; BUTTERFLY1 does the
 * same in place on x,y; BUTTERFLYA returns |x+y| + |x-y|. */
3558 #define BUTTERFLY2(o1,o2,i1,i2) \
3562 #define BUTTERFLY1(x,y) \
3571 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 difference src-dst: an 8x8 Hadamard transform is applied
 * to the residual (row butterflies first, then column butterflies), and
 * the sum of absolute transform coefficients is returned.  The debug
 * printf of `maxi` is dead unless a surrounding (not visible) debug
 * #ifdef enables it. */
3573 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3581         //FIXME try pointer walks
3582         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3583         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3584         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3585         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3587         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3588         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3589         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3590         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3592         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3593         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3594         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3595         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
     /* second pass: butterflies down each column */
3599         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3600         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3601         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3602         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3604         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3605         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3606         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3607         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3610             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3611             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3612             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3613             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3619         printf("MAX:%d\n", maxi);
/* Intra SATD: identical Hadamard pipeline to hadamard8_diff8x8_c but run
 * on raw src samples (no reference subtraction); at the end the DC term
 * |temp[0]+temp[8*4]| ... actually FFABS(temp[8*0] + temp[8*4]) is
 * subtracted to remove the mean's contribution from the score. */
3625 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3633         //FIXME try pointer walks
3634         BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3635         BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3636         BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3637         BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3639         BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3640         BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3641         BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3642         BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3644         BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3645         BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3646         BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3647         BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
     /* second pass: butterflies down each column */
3651         BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3652         BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3653         BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3654         BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3656         BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3657         BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3658         BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3659         BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3662             BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3663             +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3664             +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3665             +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3668     sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: residual = src1-src2 (via dsp.diff_pixels), forward DCT
 * (call not visible in this extract), then sum of |coefficients|.
 * `aligned_temp` gives the 16-byte alignment the fdct implementations need. */
3673 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3674     MpegEncContext * const s= (MpegEncContext *)c;
3675     DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3676     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3680     s->dsp.diff_pixels(temp, src1, src2, stride);
3682     return s->dsp.sum_abs_dctelem(temp);
/* Body of a 1-D 8-point H.264-style integer DCT macro (its #define line is
 * missing from this extracted view).  Even outputs come from the s*-sums
 * (a0..a3), odd outputs from the d*-differences with the characteristic
 * x + (x>>1) = 1.5x multipliers (a4..a7).  DST(0)/DST(4) lines are also
 * missing here. */
3687     const int s07 = SRC(0) + SRC(7);\
3688     const int s16 = SRC(1) + SRC(6);\
3689     const int s25 = SRC(2) + SRC(5);\
3690     const int s34 = SRC(3) + SRC(4);\
3691     const int a0 = s07 + s34;\
3692     const int a1 = s16 + s25;\
3693     const int a2 = s07 - s34;\
3694     const int a3 = s16 - s25;\
3695     const int d07 = SRC(0) - SRC(7);\
3696     const int d16 = SRC(1) - SRC(6);\
3697     const int d25 = SRC(2) - SRC(5);\
3698     const int d34 = SRC(3) - SRC(4);\
3699     const int a4 = d16 + d25 + (d07 + (d07>>1));\
3700     const int a5 = d07 - d34 - (d25 + (d25>>1));\
3701     const int a6 = d07 + d34 - (d16 + (d16>>1));\
3702     const int a7 = d16 - d25 + (d34 + (d34>>1));\
3704     DST(1,  a4 + (a7>>2)) ;\
3705     DST(2,  a2 + (a3>>1)) ;\
3706     DST(3,  a5 + (a6>>2)) ;\
3708     DST(5,  a6 - (a5>>2)) ;\
3709     DST(6, (a2>>1) - a3 ) ;\
3710     DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: applies the DCT8_1D macro first over rows
 * (SRC/DST index dct[i][x]) and then over columns, where the column-pass
 * DST accumulates |v| directly into `sum` instead of storing. */
3713 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3714     MpegEncContext * const s= (MpegEncContext *)c;
3719     s->dsp.diff_pixels(dct[0], src1, src2, stride);
3721 #define SRC(x) dct[i][x]
3722 #define DST(x,v) dct[i][x]= v
3723     for( i = 0; i < 8; i++ )
3728 #define SRC(x) dct[x][i]
3729 #define DST(x,v) sum += FFABS(v)
3730     for( i = 0; i < 8; i++ )
/* DCT-max metric: forward-DCT the residual (fdct call not visible here)
 * and return the largest absolute coefficient. */
3738 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3739     MpegEncContext * const s= (MpegEncContext *)c;
3740     DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3741     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3746     s->dsp.diff_pixels(temp, src1, src2, stride);
3750         sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-PSNR metric: DCT the residual, keep an unquantized copy in
 * `bak`, quantize + dequantize + (simple) IDCT the working copy, and
 * return the squared error introduced by quantization.  Note the output
 * of ff_simple_idct is compared in the transform domain against the
 * untouched DCT copy per the visible loop. */
3755 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3756     MpegEncContext * const s= (MpegEncContext *)c;
3757     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3758     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3759     DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3765     s->dsp.diff_pixels(temp, src1, src2, stride);
3767     memcpy(bak, temp, 64*sizeof(DCTELEM));
3769     s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3770     s->dct_unquantize_inter(s, temp, 0, s->qscale);
3771     ff_simple_idct(temp); //FIXME
3774         sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric for one 8x8 block: backs up the reference rows,
 * DCTs + quantizes the residual, counts the VLC bits of the resulting
 * coefficients (escape length for out-of-range levels), dequantizes,
 * IDCT-adds onto the backup, measures SSE against src1, and combines
 * distortion with a lambda-scaled bit cost. */
3779 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3780     MpegEncContext * const s= (MpegEncContext *)c;
3781     const uint8_t *scantable= s->intra_scantable.permutated;
3782     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3783     DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3784     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3785     uint8_t * const bak= (uint8_t*)aligned_bak;
3786     int i, last, run, bits, level, distortion, start_i;
3787     const int esc_length= s->ac_esc_length;
3789     uint8_t * last_length;
     /* save the 8 reference rows, two 32-bit words per row */
3794         ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3795         ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3798     s->dsp.diff_pixels(temp, src1, src2, stride);
3800     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
     /* intra blocks use the intra AC tables and add the luma DC cost */
3806         length     = s->intra_ac_vlc_length;
3807         last_length= s->intra_ac_vlc_last_length;
3808         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3811         length     = s->inter_ac_vlc_length;
3812         last_length= s->inter_ac_vlc_last_length;
     /* bit cost of all coefficients before the last one */
3817         for(i=start_i; i<last; i++){
3818             int j= scantable[i];
             /* level+64 fits the 128-entry table iff |level|<=63; else escape */
3823                 if((level&(~127)) == 0){
3824                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
         /* the last coefficient uses the "last" VLC table */
3833         level= temp[i] + 64;
3837         if((level&(~127)) == 0){
3838             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3846             s->dct_unquantize_intra(s, temp, 0, s->qscale);
3848             s->dct_unquantize_inter(s, temp, 0, s->qscale);
3851     s->dsp.idct_add(bak, stride, temp);
3853     distortion= s->dsp.sse[1](NULL, bak, src1, stride, 8);
     /* 109/128 ~= 0.85 lambda tuning factor applied to qscale^2 * bits */
3855     return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-cost metric: identical VLC bit counting to rd8x8_c but without the
 * reconstruction/distortion part — returns (presumably) just the bit
 * count; the return statement is missing from this extracted view. */
3858 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3859     MpegEncContext * const s= (MpegEncContext *)c;
3860     const uint8_t *scantable= s->intra_scantable.permutated;
3861     DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3862     DCTELEM * const temp= (DCTELEM*)aligned_temp;
3863     int i, last, run, bits, level, start_i;
3864     const int esc_length= s->ac_esc_length;
3866     uint8_t * last_length;
3870     s->dsp.diff_pixels(temp, src1, src2, stride);
3872     s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3878         length     = s->intra_ac_vlc_length;
3879         last_length= s->intra_ac_vlc_last_length;
3880         bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3883         length     = s->inter_ac_vlc_length;
3884         last_length= s->inter_ac_vlc_last_length;
3889         for(i=start_i; i<last; i++){
3890             int j= scantable[i];
3895                 if((level&(~127)) == 0){
3896                     bits+= length[UNI_AC_ENC_INDEX(run, level)];
3905         level= temp[i] + 64;
3909         if((level&(~127)) == 0){
3910             bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-gradient SAD (intra): sums |s[x] - s[x+stride]| over a
 * 16-wide block, 4 columns per statement. */
3918 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3923         for(x=0; x<16; x+=4){
3924             score+= FFABS(s[x  ] - s[x  +stride]) + FFABS(s[x+1] - s[x+1+stride])
3925                    +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* Vertical-gradient SAD between two blocks: sums the absolute difference
 * of the per-column vertical deltas of s1 and s2. */
3933 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3938         for(x=0; x<16; x++){
3939             score+= FFABS(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* Squared variants of the vsad metrics above: SQ(a) = a*a replaces FFABS. */
3948 #define SQ(a) ((a)*(a))
3949 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3954         for(x=0; x<16; x+=4){
3955             score+= SQ(s[x  ] - s[x  +stride]) + SQ(s[x+1] - s[x+1+stride])
3956                    +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3964 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3969         for(x=0; x<16; x++){
3970             score+= SQ(s1[x  ] - s2[x ] - s1[x  +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 array.
 * The WRAPPER8_16_SQ instantiations below synthesize 16x16 versions of
 * each 8x8 compare function (macro defined outside this view). */
3979 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
3983     for(i=0; i<size; i++)
3984         score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
3988 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3989 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3990 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3992 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3994 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3995 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3996 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3997 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Float vector primitives.
 * vector_fmul_c: elementwise in-place multiply (the dst[i]*=src[i] body
 * line is missing from this extracted view).
 * vector_fmul_reverse_c: dst[i] = src0[i] * src1[-i], i.e. src1 is walked
 * backwards — callers pass src1 pointing at the end of the window.
 * ff_vector_fmul_add_add_c: strided multiply-accumulate plus the integer
 * bias src3. */
3999 static void vector_fmul_c(float *dst, const float *src, int len){
4001     for(i=0; i<len; i++)
4005 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4008     for(i=0; i<len; i++)
4009         dst[i] = src0[i] * src1[-i];
4012 void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
4014     for(i=0; i<len; i++)
4015         dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
/* Windowed overlap-add used by MDCT-based codecs: i runs over [-len,0),
 * j mirrors it from the top; s0/s1/wi/wj loads are missing from this
 * extracted view but the rotate-by-window outputs survive.
 * int32_to_float_fmul_scalar_c converts int samples to float with a gain. */
4018 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4023     for(i=-len, j=len-1; i<0; i++, j--) {
4028         dst[i] = s0*wj - s1*wi + add_bias;
4029         dst[j] = s0*wi + s1*wj + add_bias;
4033 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
4035     for(i=0; i<len; i++)
4036         dst[i] = src[i] * mul;
/* Float->int16 via bit tricks: reinterprets the float's bits as int32
 * (assumes IEEE-754 input pre-biased so in-range values map linearly;
 * the clamping lines between the visible ones are missing from this
 * view) and removes the 0x8000 bias at the end.
 * NOTE(review): the *(const int32_t*)src cast is a strict-aliasing
 * violation by modern standards; FFmpeg relies on compiler flags here. */
4039 static av_always_inline int float_to_int16_one(const float *src){
4040     int_fast32_t tmp = *(const int32_t*)src;
4042         tmp = (0x43c0ffff - tmp)>>31;
4043         // is this faster on some gcc/cpu combinations?
4044         // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4047     return tmp - 0x8000;
/* Bulk float->int16 conversion; the interleave variant has a fast path
 * for 2 channels (stereo) and a generic strided loop for any channel
 * count. */
4050 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4052     for(i=0; i<len; i++)
4053         dst[i] = float_to_int16_one(src+i);
4056 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4059         for(i=0; i<len; i++){
4060             dst[2*i]   = float_to_int16_one(src[0]+i);
4061             dst[2*i+1] = float_to_int16_one(src[1]+i);
4064         for(c=0; c<channels; c++)
4065             for(i=0, j=c; i<len; i++, j+=channels)
4066                 dst[j] = float_to_int16_one(src[c]+i);
/* int16 vector helpers.  add_int16_c / sub_int16_c bodies are entirely
 * missing from this extracted view (presumably elementwise v1 +=/-= v2
 * over `order` samples — confirm upstream).  scalarproduct_int16_c
 * accumulates (v1[i]*v2[i]) >> shift. */
4070 static void add_int16_c(int16_t * v1, int16_t * v2, int order)
4076 static void sub_int16_c(int16_t * v1, int16_t * v2, int order)
4082 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4087         res += (*v1++ * *v2++) >> shift;
/* Fixed-point DCT basis constants: round(2048*sqrt(2)*cos(k*pi/16)).
 * Used by the WMV2 IDCT below.  Note wmv2_idct_row/col also reference W0,
 * whose #define is not visible in this extracted chunk. */
4093 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4094 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4095 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4096 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4097 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4098 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4099 #define W7 565  /* 2048*sqrt (2)*cos (7*pi/16) */
/* One row pass of the WMV2 8-point inverse DCT (fixed-point butterfly
 * structure).  Odd coefficients feed a1/a3/a5/a7, even feed a0/a2/a4/a6;
 * s1/s2 carry the rotated odd terms (181/256 ~= 1/sqrt(2)); the final
 * stage combines with rounding and a >>8 descale. */
4101 static void wmv2_idct_row(short * b)
4104     int a0,a1,a2,a3,a4,a5,a6,a7;
4106     a1 = W1*b[1]+W7*b[7];
4107     a7 = W7*b[1]-W1*b[7];
4108     a5 = W5*b[5]+W3*b[3];
4109     a3 = W3*b[5]-W5*b[3];
4110     a2 = W2*b[2]+W6*b[6];
4111     a6 = W6*b[2]-W2*b[6];
4112     a0 = W0*b[0]+W0*b[4];
4113     a4 = W0*b[0]-W0*b[4];
4115     s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4116     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4118     b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4119     b[1] = (a4+a6 +s1   + (1<<7))>>8;
4120     b[2] = (a4-a6 +s2   + (1<<7))>>8;
4121     b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4122     b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4123     b[5] = (a4-a6 -s2   + (1<<7))>>8;
4124     b[6] = (a4+a6 -s1   + (1<<7))>>8;
4125     b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* Column pass of the WMV2 IDCT: same butterfly as the row pass but with
 * +4 >> 3 pre-rounding on each product (extended precision, per the
 * comment) and a final (1<<13) >> 14 descale.  Column elements are at
 * stride 8 within the 8x8 block. */
4127 static void wmv2_idct_col(short * b)
4130     int a0,a1,a2,a3,a4,a5,a6,a7;
4131     /*step 1, with extended precision*/
4132     a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4133     a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4134     a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4135     a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4136     a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4137     a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4138     a0 = (W0*b[8*0]+W0*b[8*4]    )>>3;
4139     a4 = (W0*b[8*0]-W0*b[8*4]    )>>3;
4141     s1 = (181*(a1-a5+a7-a3)+128)>>8;
4142     s2 = (181*(a1-a5-a7+a3)+128)>>8;
4144     b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4145     b[8*1] = (a4+a6 +s1   + (1<<13))>>14;
4146     b[8*2] = (a4-a6 +s2   + (1<<13))>>14;
4147     b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4149     b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4150     b[8*5] = (a4-a6 -s2   + (1<<13))>>14;
4151     b[8*6] = (a4+a6 -s1   + (1<<13))>>14;
4152     b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 2-D WMV2 IDCT: row pass over each of the 8 rows, then column pass
 * over each of the 8 columns (loop headers missing from this view; the
 * visible calls show the row/col offsets i). */
4154 void ff_wmv2_idct_c(short * block){
4158         wmv2_idct_row(block+i);
4161         wmv2_idct_col(block+i);
/* IDCT + store wrappers: each runs an IDCT over `block` then either
 * stores (put: clamped overwrite) or accumulates (add: clamped add) the
 * 8x8 result into `dest`.  The jref4/2/1 variants pair with the reduced
 * j_rev_dct4/2/1 transforms used for lowres decoding; idct1 degenerates
 * to a single DC sample with (x+4)>>3 descale through the crop table. */
4164 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4166 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4168     ff_wmv2_idct_c(block);
4169     put_pixels_clamped_c(block, dest, line_size);
4171 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4173     ff_wmv2_idct_c(block);
4174     add_pixels_clamped_c(block, dest, line_size);
4176 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4179     put_pixels_clamped_c(block, dest, line_size);
4181 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4184     add_pixels_clamped_c(block, dest, line_size);
4187 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4190     put_pixels_clamped4_c(block, dest, line_size);
4192 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4195     add_pixels_clamped4_c(block, dest, line_size);
4198 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4201     put_pixels_clamped2_c(block, dest, line_size);
4203 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4206     add_pixels_clamped2_c(block, dest, line_size);
4209 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4211     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4213     dest[0] = cm[(block[0] + 4)>>3];
4215 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4217     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4219     dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
/* No-op stub matching the pixel-function signature; used as a placeholder
 * slot in function tables. */
4222 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
/* One-time init of the module's lookup tables:
 * - ff_cropTbl: identity for [0,255], clamped to 0 below and 255 above
 *   (the 0-fill line for the low pad is missing from this view);
 * - ff_squareTbl: (i-256)^2 so squareTbl[x+256] == x*x for x in [-256,255];
 * - inv_zigzag_direct16: inverse zigzag with a +1 bias. */
4224 /* init static data */
4225 void dsputil_static_init(void)
4229     for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4230     for(i=0;i<MAX_NEG_CROP;i++) {
4232         ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4235     for(i=0;i<512;i++) {
4236         ff_squareTbl[i] = (i - 256) * (i - 256);
4239     for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Runtime sanity check that the compiler honours 16-byte stack alignment
 * (required by SIMD code paths); logs a one-time diagnostic on MMX/AltiVec
 * builds when a DECLARE_ALIGNED_16 local is misaligned.  `did_fail`
 * presumably gates the message to once per process — the set/return
 * lines are missing from this extracted view. */
4242 int ff_check_alignment(void){
4243     static int did_fail=0;
4244     DECLARE_ALIGNED_16(int, aligned);
4246     if((long)&aligned & 15){
4248 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4249             av_log(NULL, AV_LOG_ERROR,
4250                 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4251                 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4252                 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4253                 "Do not report crashes to FFmpeg developers.\n");
4262 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4266 ff_check_alignment();
4268 #ifdef CONFIG_ENCODERS
4269 if(avctx->dct_algo==FF_DCT_FASTINT) {
4270 c->fdct = fdct_ifast;
4271 c->fdct248 = fdct_ifast248;
4273 else if(avctx->dct_algo==FF_DCT_FAAN) {
4274 c->fdct = ff_faandct;
4275 c->fdct248 = ff_faandct248;
4278 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4279 c->fdct248 = ff_fdct248_islow;
4281 #endif //CONFIG_ENCODERS
4283 if(avctx->lowres==1){
4284 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4285 c->idct_put= ff_jref_idct4_put;
4286 c->idct_add= ff_jref_idct4_add;
4288 c->idct_put= ff_h264_lowres_idct_put_c;
4289 c->idct_add= ff_h264_lowres_idct_add_c;
4291 c->idct = j_rev_dct4;
4292 c->idct_permutation_type= FF_NO_IDCT_PERM;
4293 }else if(avctx->lowres==2){
4294 c->idct_put= ff_jref_idct2_put;
4295 c->idct_add= ff_jref_idct2_add;
4296 c->idct = j_rev_dct2;
4297 c->idct_permutation_type= FF_NO_IDCT_PERM;
4298 }else if(avctx->lowres==3){
4299 c->idct_put= ff_jref_idct1_put;
4300 c->idct_add= ff_jref_idct1_add;
4301 c->idct = j_rev_dct1;
4302 c->idct_permutation_type= FF_NO_IDCT_PERM;
4304 if(avctx->idct_algo==FF_IDCT_INT){
4305 c->idct_put= ff_jref_idct_put;
4306 c->idct_add= ff_jref_idct_add;
4307 c->idct = j_rev_dct;
4308 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
// NOTE(review): this is the tail of the DSP-context initializer (dsputil_init);
// the function opening is outside this excerpt. The leading 43xx/46xx numbers on
// each line appear to be line-number residue from extraction, and several
// interleaved lines (for-loop headers, break statements, #endif lines) seem to
// have been dropped -- confirm against the upstream file. Only comments are
// added here; the code itself is untouched.

// --- IDCT selection: continue the if/else chain keyed on avctx->idct_algo,
// installing put/add/plain IDCT entry points plus the coefficient-permutation
// type that matches the chosen implementation.
4309 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4310 avctx->idct_algo==FF_IDCT_VP3){
4311 c->idct_put= ff_vp3_idct_put_c;
4312 c->idct_add= ff_vp3_idct_add_c;
4313 c->idct = ff_vp3_idct_c;
4314 c->idct_permutation_type= FF_NO_IDCT_PERM;
4315 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4316 c->idct_put= ff_wmv2_idct_put_c;
4317 c->idct_add= ff_wmv2_idct_add_c;
4318 c->idct = ff_wmv2_idct_c;
4319 c->idct_permutation_type= FF_NO_IDCT_PERM;
4320 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4321 c->idct_put= ff_faanidct_put;
4322 c->idct_add= ff_faanidct_add;
4323 c->idct = ff_faanidct;
4324 c->idct_permutation_type= FF_NO_IDCT_PERM;
// NOTE(review): the EA branch installs only idct_put -- idct_add and idct are
// left at whatever value they held before this chain; confirm that the EA TGQ
// decoder never calls them.
4325 }else if(ENABLE_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4326 c->idct_put= ff_ea_idct_put_c;
4327 c->idct_permutation_type= FF_NO_IDCT_PERM;
4328 }else{ //accurate/default
4329 c->idct_put= ff_simple_idct_put;
4330 c->idct_add= ff_simple_idct_add;
4331 c->idct = ff_simple_idct;
4332 c->idct_permutation_type= FF_NO_IDCT_PERM;

// H.264-specific IDCT entry points (4x4, 8x8, DC-only and multi-block
// variants), C reference implementations.
4336 if (ENABLE_H264_DECODER) {
4337 c->h264_idct_add= ff_h264_idct_add_c;
4338 c->h264_idct8_add= ff_h264_idct8_add_c;
4339 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4340 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4341 c->h264_idct_add16 = ff_h264_idct_add16_c;
4342 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4343 c->h264_idct_add8 = ff_h264_idct_add8_c;
4344 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;

// Basic block/pixel transfer primitives (DCT block <-> pixel conversions,
// clamped adds, block clearing, pixel sums).
4347 c->get_pixels = get_pixels_c;
4348 c->diff_pixels = diff_pixels_c;
4349 c->put_pixels_clamped = put_pixels_clamped_c;
4350 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4351 c->add_pixels_clamped = add_pixels_clamped_c;
4352 c->add_pixels8 = add_pixels8_c;
4353 c->add_pixels4 = add_pixels4_c;
4354 c->sum_abs_dctelem = sum_abs_dctelem_c;
4357 c->clear_block = clear_block_c;
4358 c->clear_blocks = clear_blocks_c;
4359 c->pix_sum = pix_sum_c;
4360 c->pix_norm1 = pix_norm1_c;

// SAD functions: pix_abs[0][*] = 16-pixel wide, pix_abs[1][*] = 8-pixel wide;
// second index selects full-pel / half-pel x / half-pel y / half-pel xy.
4362 /* TODO [0] 16 [1] 8 */
4363 c->pix_abs[0][0] = pix_abs16_c;
4364 c->pix_abs[0][1] = pix_abs16_x2_c;
4365 c->pix_abs[0][2] = pix_abs16_y2_c;
4366 c->pix_abs[0][3] = pix_abs16_xy2_c;
4367 c->pix_abs[1][0] = pix_abs8_c;
4368 c->pix_abs[1][1] = pix_abs8_x2_c;
4369 c->pix_abs[1][2] = pix_abs8_y2_c;
4370 c->pix_abs[1][3] = pix_abs8_xy2_c;

// Helper macro: fills one row of a half-pel pixels table with the four
// C implementations (copy, x-half, y-half, xy-half) for block size NUM.
4372 #define dspfunc(PFX, IDX, NUM) \
4373 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4374 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4375 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4376 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c

4378 dspfunc(put, 0, 16);
4379 dspfunc(put_no_rnd, 0, 16);
4381 dspfunc(put_no_rnd, 1, 8);
4385 dspfunc(avg, 0, 16);
4386 dspfunc(avg_no_rnd, 0, 16);
4388 dspfunc(avg_no_rnd, 1, 8);

// Two-source averaging copies used by no-rounding prediction paths.
4393 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4394 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;

// Third-pel motion compensation table (presumably for SVQ3 -- confirm);
// index is y*4+x with x,y in {0,1,2}, so slots 3, 7 and 11+ stay unset.
4396 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4397 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4398 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4399 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4400 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4401 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4402 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4403 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4404 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4406 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4407 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4408 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4409 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4410 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4411 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4412 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4413 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4414 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;

// Redefine the helper macro for quarter-pel tables: all 16 mcXY positions
// (X = horizontal quarter-pel phase, Y = vertical quarter-pel phase).
4416 #define dspfunc(PFX, IDX, NUM) \
4417 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4418 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4419 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4420 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4421 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4422 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4423 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4424 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4425 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4426 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4427 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4428 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4429 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4430 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4431 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4432 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c

4434 dspfunc(put_qpel, 0, 16);
4435 dspfunc(put_no_rnd_qpel, 0, 16);
4437 dspfunc(avg_qpel, 0, 16);
4438 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4440 dspfunc(put_qpel, 1, 8);
4441 dspfunc(put_no_rnd_qpel, 1, 8);
4443 dspfunc(avg_qpel, 1, 8);
4444 /* dspfunc(avg_no_rnd_qpel, 1, 8); */

// H.264 quarter-pel luma MC for 16/8/4/2 pixel block widths.
4446 dspfunc(put_h264_qpel, 0, 16);
4447 dspfunc(put_h264_qpel, 1, 8);
4448 dspfunc(put_h264_qpel, 2, 4);
4449 dspfunc(put_h264_qpel, 3, 2);
4450 dspfunc(avg_h264_qpel, 0, 16);
4451 dspfunc(avg_h264_qpel, 1, 8);
4452 dspfunc(avg_h264_qpel, 2, 4);

// H.264 chroma MC (8/4/2 wide); only the 8-wide no-rounding variant exists.
4455 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4456 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4457 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4458 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4459 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4460 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4461 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;

// H.264 weighted (uni- and bi-directional) prediction, one entry per block
// geometry from 16x16 down to 2x2.
4463 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4464 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4465 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4466 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4467 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4468 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4469 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4470 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4471 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4472 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4473 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4474 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4475 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4476 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4477 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4478 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4479 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4480 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4481 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4482 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;

4484 c->draw_edges = draw_edges_c;

// Per-codec DSP sub-initializers, compiled in only when the matching
// decoder/encoder is enabled. NOTE(review): the matching #endif lines are
// missing from this excerpt -- confirm against upstream.
4486 #ifdef CONFIG_CAVS_DECODER
4487 ff_cavsdsp_init(c,avctx);
4489 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4490 ff_vc1dsp_init(c,avctx);
4492 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4493 ff_intrax8dsp_init(c,avctx);
4495 #if defined(CONFIG_H264_ENCODER)
4496 ff_h264dspenc_init(c,avctx);
4498 #if defined(CONFIG_RV40_DECODER)
4499 ff_rv40dsp_init(c,avctx);
4500 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4501 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4502 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4503 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;

// WMV2 "mspel" 8-pixel MC variants (note: no y-phase-1 entries in this table).
4506 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4507 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4508 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4509 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4510 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4511 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4512 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4513 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;

// Comparison functions used by motion estimation / rate-distortion decisions:
// slot [0] = 16-wide, slot [1] = 8x8 variant.
4515 #define SET_CMP_FUNC(name) \
4516 c->name[0]= name ## 16_c;\
4517 c->name[1]= name ## 8x8_c;
4519 SET_CMP_FUNC(hadamard8_diff)
4520 c->hadamard8_diff[4]= hadamard8_intra16_c;
4521 SET_CMP_FUNC(dct_sad)
4522 SET_CMP_FUNC(dct_max)
4524 SET_CMP_FUNC(dct264_sad)
4526 c->sad[0]= pix_abs16_c;
4527 c->sad[1]= pix_abs8_c;
4531 SET_CMP_FUNC(quant_psnr)
4534 c->vsad[0]= vsad16_c;
4535 c->vsad[4]= vsad_intra16_c;
4536 c->vsse[0]= vsse16_c;
4537 c->vsse[4]= vsse_intra16_c;
4538 c->nsse[0]= nsse16_c;
4539 c->nsse[1]= nsse8_c;
// Wavelet-based comparison metrics, only built with the Snow encoder.
4540 #ifdef CONFIG_SNOW_ENCODER
4541 c->w53[0]= w53_16_c;
4543 c->w97[0]= w97_16_c;

4547 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;

// Byte-wise helpers (HuffYUV-style prediction, byte swapping, PNG Paeth).
4549 c->add_bytes= add_bytes_c;
4550 c->add_bytes_l2= add_bytes_l2_c;
4551 c->diff_bytes= diff_bytes_c;
4552 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4553 c->bswap_buf= bswap_buf;
4554 #ifdef CONFIG_PNG_DECODER
4555 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;

// In-loop deblocking filters: H.264 luma/chroma (vertical & horizontal,
// inter and intra variants), then H.263, VP3/Theora and H.261.
4558 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4559 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4560 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4561 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4562 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4563 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4564 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4565 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
// No C fallback for this one; only arch-specific versions may set it later.
4566 c->h264_loop_filter_strength= NULL;
4568 if (ENABLE_ANY_H263) {
4569 c->h263_h_loop_filter= h263_h_loop_filter_c;
4570 c->h263_v_loop_filter= h263_v_loop_filter_c;
4573 if (ENABLE_VP3_DECODER || ENABLE_THEORA_DECODER) {
4574 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4575 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4578 c->h261_loop_filter= h261_loop_filter_c;

// Basis-vector helpers (trellis quantization support).
4580 c->try_8x8basis= try_8x8basis_c;
4581 c->add_8x8basis= add_8x8basis_c;

// Codec-specific DSP hooks for Snow, Vorbis, AC-3 and FLAC.
4583 #ifdef CONFIG_SNOW_DECODER
4584 c->vertical_compose97i = ff_snow_vertical_compose97i;
4585 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4586 c->inner_add_yblock = ff_snow_inner_add_yblock;
4589 #ifdef CONFIG_VORBIS_DECODER
4590 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4592 #ifdef CONFIG_AC3_DECODER
4593 c->ac3_downmix = ff_ac3_downmix_c;
4595 #ifdef CONFIG_FLAC_ENCODER
4596 c->flac_compute_autocorr = ff_flac_compute_autocorr;

// Float-vector and integer-vector audio primitives.
4598 c->vector_fmul = vector_fmul_c;
4599 c->vector_fmul_reverse = vector_fmul_reverse_c;
4600 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4601 c->vector_fmul_window = ff_vector_fmul_window_c;
4602 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4603 c->float_to_int16 = ff_float_to_int16_c;
4604 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4605 c->add_int16 = add_int16_c;
4606 c->sub_int16 = sub_int16_c;
4607 c->scalarproduct_int16 = scalarproduct_int16_c;

// Plane shrinkers: [0] is a plain copy, [1]..[3] downscale by 2/4/8
// (judging by the ff_shrinkNN names -- confirm).
4609 c->shrink[0]= ff_img_copy_plane;
4610 c->shrink[1]= ff_shrink22;
4611 c->shrink[2]= ff_shrink44;
4612 c->shrink[3]= ff_shrink88;

// Default prefetch is a no-op stub; arch inits may install a real one.
4614 c->prefetch= just_return;

// Clear the 2-tap qpel tables so we can detect below which entries the
// arch-specific initializers filled in.
4616 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4617 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));

// Architecture-specific initializers run last so they can override any of
// the C implementations installed above.
4619 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4620 if (ENABLE_ARM) dsputil_init_arm (c, avctx);
4621 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4622 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4623 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4624 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4625 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4626 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4627 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);

// Any 2-tap entry still NULL falls back to the H.264 qpel equivalent.
4629 for(i=0; i<64; i++){
4630 if(!c->put_2tap_qpel_pixels_tab[0][i])
4631 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4632 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4633 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];

// Build the 64-entry coefficient permutation table from the permutation type
// chosen with the IDCT above. NOTE(review): the per-case for(i=0;i<64;i++)
// loop headers and break statements appear to be missing from this excerpt
// (the embedded numbering skips lines) -- confirm against upstream.
4636 switch(c->idct_permutation_type){
4637 case FF_NO_IDCT_PERM:
4639 c->idct_permutation[i]= i;
4641 case FF_LIBMPEG2_IDCT_PERM:
4643 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4645 case FF_SIMPLE_IDCT_PERM:
4647 c->idct_permutation[i]= simple_mmx_permutation[i];
4649 case FF_TRANSPOSE_IDCT_PERM:
4651 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4653 case FF_PARTTRANS_IDCT_PERM:
4655 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4657 case FF_SSE2_IDCT_PERM:
4659 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
// Unknown permutation type: internal inconsistency between the IDCT chosen
// above and the arch-specific overrides.
4662 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");