3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * @file libavcodec/dsputil.c
32 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* ff_cropTbl: byte-clipping lookup table with MAX_NEG_CROP guard bands on
 * each side; ff_squareTbl: x*x lookup addressed through its center (+256).
 * Both are zero here and presumably populated during DSP init — confirm. */
44 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
45 uint32_t ff_squareTbl[512] = {0, };
47 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
/* ~0UL/255 yields 0x0101...01 for the native unsigned long width, so these
 * replicate the byte 0x7f / 0x80 into every byte of an unsigned long. */
48 #define pb_7f (~0UL/255 * 0x7f)
49 #define pb_80 (~0UL/255 * 0x80)
/* Classic zigzag scan order: entry k is the raster-order index of the
 * coefficient visited at scan position k of an 8x8 block. */
51 const uint8_t ff_zigzag_direct[64] = {
52     0,   1,  8, 16,  9,  2,  3, 10,
53    17,  24, 32, 25, 18, 11,  4,  5,
54    12,  19, 26, 33, 40, 48, 41, 34,
55    27,  20, 13,  6,  7, 14, 21, 28,
56    35,  42, 49, 56, 57, 50, 43, 36,
57    29,  22, 15, 23, 30, 37, 44, 51,
58    58,  59, 52, 45, 38, 31, 39, 46,
59    53,  60, 61, 54, 47, 55, 62, 63
62 /* Specific zigzag scan for 248 idct. NOTE that unlike the
63    specification, we interleave the fields */
/* See ff_zigzag_direct above for the progressive (frame) variant. */
64 const uint8_t ff_zigzag248_direct[64] = {
65      0,  8,  1,  9, 16, 24,  2, 10,
66     17, 25, 32, 40, 48, 56, 33, 41,
67     18, 26,  3, 11,  4, 12, 19, 27,
68     34, 42, 49, 57, 50, 58, 35, 43,
69     20, 28,  5, 13,  6, 14, 21, 29,
70     36, 44, 51, 59, 52, 60, 37, 45,
71     22, 30,  7, 15, 23, 31, 38, 46,
72     53, 61, 54, 62, 39, 47, 55, 63,
75 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
/* 16-byte aligned storage only; contents are filled elsewhere —
 * presumably during dsputil/MMX quantizer init (confirm). */
76 DECLARE_ALIGNED(16, uint16_t, inv_zigzag_direct16)[64];
/* Alternate horizontal scan pattern (scan position -> raster index). */
78 const uint8_t ff_alternate_horizontal_scan[64] = {
79     0,  1,   2,  3,  8,  9, 16, 17,
80     10, 11,  4,  5,  6,  7, 15, 14,
81     13, 12, 19, 18, 24, 25, 32, 33,
82     26, 27, 20, 21, 22, 23, 28, 29,
83     30, 31, 34, 35, 40, 41, 48, 49,
84     42, 43, 36, 37, 38, 39, 44, 45,
85     46, 47, 50, 51, 56, 57, 58, 59,
86     52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate vertical scan pattern (scan position -> raster index),
 * the column-oriented counterpart of ff_alternate_horizontal_scan. */
89 const uint8_t ff_alternate_vertical_scan[64] = {
90     0,  8,  16, 24,  1,  9,  2, 10,
91     17, 25, 32, 40, 48, 56, 57, 49,
92     41, 33, 26, 18,  3, 11,  4, 12,
93     19, 27, 34, 42, 50, 58, 35, 43,
94     51, 59, 20, 28,  5, 13,  6, 14,
95     21, 29, 36, 44, 52, 60, 37, 45,
96     53, 61, 22, 30,  7, 15, 23, 31,
97     38, 46, 54, 62, 39, 47, 55, 63,
100 /* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
101  * for a>16909558, is an overestimate by less than 1 part in 1<<24 */
/* Entry b holds ceil(2^32/b) (entry 1 is clamped to 2^32-1); entry 0 is
 * unused — enables division by multiply + shift. */
102 const uint32_t ff_inverse[257]={
103          0, 4294967295U,2147483648U,1431655766, 1073741824,  858993460,  715827883,  613566757,
104  536870912,  477218589,  429496730,  390451573,  357913942,  330382100,  306783379,  286331154,
105  268435456,  252645136,  238609295,  226050911,  214748365,  204522253,  195225787,  186737709,
106  178956971,  171798692,  165191050,  159072863,  153391690,  148102321,  143165577,  138547333,
107  134217728,  130150525,  126322568,  122713352,  119304648,  116080198,  113025456,  110127367,
108  107374183,  104755300,  102261127,   99882961,   97612894,   95443718,   93368855,   91382283,
109   89478486,   87652394,   85899346,   84215046,   82595525,   81037119,   79536432,   78090315,
110   76695845,   75350304,   74051161,   72796056,   71582789,   70409300,   69273667,   68174085,
111   67108864,   66076420,   65075263,   64103990,   63161284,   62245903,   61356676,   60492498,
112   59652324,   58835169,   58040099,   57266231,   56512728,   55778797,   55063684,   54366675,
113   53687092,   53024288,   52377650,   51746594,   51130564,   50529028,   49941481,   49367441,
114   48806447,   48258060,   47721859,   47197443,   46684428,   46182445,   45691142,   45210183,
115   44739243,   44278014,   43826197,   43383509,   42949673,   42524429,   42107523,   41698712,
116   41297763,   40904451,   40518560,   40139882,   39768216,   39403370,   39045158,   38693400,
117   38347923,   38008561,   37675152,   37347542,   37025581,   36709123,   36398028,   36092163,
118   35791395,   35495598,   35204650,   34918434,   34636834,   34359739,   34087043,   33818641,
119   33554432,   33294321,   33038210,   32786010,   32537632,   32292988,   32051995,   31814573,
120   31580642,   31350127,   31122952,   30899046,   30678338,   30460761,   30246249,   30034737,
121   29826162,   29620465,   29417585,   29217465,   29020050,   28825284,   28633116,   28443493,
122   28256364,   28071682,   27889399,   27709467,   27531842,   27356480,   27183338,   27012373,
123   26843546,   26676816,   26512144,   26349493,   26188825,   26030105,   25873297,   25718368,
124   25565282,   25414008,   25264514,   25116768,   24970741,   24826401,   24683721,   24542671,
125   24403224,   24265352,   24129030,   23994231,   23860930,   23729102,   23598722,   23469767,
126   23342214,   23216040,   23091223,   22967740,   22845571,   22724695,   22605092,   22486740,
127   22369622,   22253717,   22139007,   22025474,   21913099,   21801865,   21691755,   21582751,
128   21474837,   21367997,   21262215,   21157475,   21053762,   20951060,   20849356,   20748635,
129   20648882,   20550083,   20452226,   20355296,   20259280,   20164166,   20069941,   19976593,
130   19884108,   19792477,   19701685,   19611723,   19522579,   19434242,   19346700,   19259944,
131   19173962,   19088744,   19004281,   18920561,   18837576,   18755316,   18673771,   18592933,
132   18512791,   18433337,   18354562,   18276457,   18199014,   18122225,   18046082,   17970575,
133   17895698,   17821442,   17747799,   17674763,   17602325,   17530479,   17459217,   17388532,
134   17318417,   17248865,   17179870,   17111424,   17043522,   16976156,   16909321,   16843010,
138 /* Input permutation for the simple_idct_mmx */
/* Values are coefficient indices pre-permuted for the MMX simple IDCT's
 * in-register data layout — NOTE(review): exact layout is defined by
 * simple_idct_mmx; confirm against that implementation. */
139 static const uint8_t simple_mmx_permutation[64]={
140         0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
141         0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
142         0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
143         0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
144         0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
145         0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
146         0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
147         0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Row permutation for the SSE2 IDCT: interleaves the two row halves
 * (0..3 with 4..7). */
150 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
/* Initialize a ScanTable: record the source scan order and build the
 * CPU-specific permuted scan (permutated[]) plus the raster_end[] markers.
 * NOTE(review): interior lines (loop headers, computation of 'end') are
 * elided in this extract — behavior beyond the visible stores is assumed. */
152 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
156     st->scantable= src_scantable;
/* permuted scan: route each scan position through the CPU permutation */
160         j = src_scantable[i];
161         st->permutated[i] = permutation[j];
170         j = st->permutated[i];
172         st->raster_end[i]= end;
/* Sum of all pixel values in a 16x16 block; rows are line_size bytes apart.
 * The inner loop steps by 8 — presumably an 8-pixel unrolled accumulation
 * whose body is elided in this extract. */
176 static int pix_sum_c(uint8_t * pix, int line_size)
181     for (i = 0; i < 16; i++) {
182         for (j = 0; j < 16; j += 8) {
/* rewind to the start of the next row (16 bytes were consumed) */
193         pix += line_size - 16;
/* Sum of squared pixel values over a 16x16 block via the ff_squareTbl
 * lookup (sq points at the table center). Eight bytes are read per step,
 * as one 64-bit load or two 32-bit loads depending on the native long
 * width. NOTE(review): the pointer casts on pix assume alignment and
 * tolerate type-punning — confirm callers guarantee this. */
198 static int pix_norm1_c(uint8_t * pix, int line_size)
201     uint32_t *sq = ff_squareTbl + 256;
204     for (i = 0; i < 16; i++) {
205         for (j = 0; j < 16; j += 8) {
216 #if LONG_MAX > 2147483647
/* 64-bit path: one load, square each of the 8 bytes via the table */
217             register uint64_t x=*(uint64_t*)pix;
219             s += sq[(x>>8)&0xff];
220             s += sq[(x>>16)&0xff];
221             s += sq[(x>>24)&0xff];
222             s += sq[(x>>32)&0xff];
223             s += sq[(x>>40)&0xff];
224             s += sq[(x>>48)&0xff];
225             s += sq[(x>>56)&0xff];
/* 32-bit path: two loads of 4 bytes each */
227             register uint32_t x=*(uint32_t*)pix;
229             s += sq[(x>>8)&0xff];
230             s += sq[(x>>16)&0xff];
231             s += sq[(x>>24)&0xff];
232             x=*(uint32_t*)(pix+4);
234             s += sq[(x>>8)&0xff];
235             s += sq[(x>>16)&0xff];
236             s += sq[(x>>24)&0xff];
/* rewind to the start of the next row */
241         pix += line_size - 16;
/* Byte-swap w 32-bit words from src into dst; the main loop is unrolled
 * 8x, and the tail loop (elided here) handles the remaining words. */
246 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
249     for(i=0; i+8<=w; i+=8){
250         dst[i+0]= bswap_32(src[i+0]);
251         dst[i+1]= bswap_32(src[i+1]);
252         dst[i+2]= bswap_32(src[i+2]);
253         dst[i+3]= bswap_32(src[i+3]);
254         dst[i+4]= bswap_32(src[i+4]);
255         dst[i+5]= bswap_32(src[i+5]);
256         dst[i+6]= bswap_32(src[i+6]);
257         dst[i+7]= bswap_32(src[i+7]);
260         dst[i+0]= bswap_32(src[i+0]);
/* Sum of squared errors over a 4-pixel-wide block of h rows, using the
 * centered square table (sq) so negative differences index correctly.
 * Pointer advances and the return are elided in this extract. */
264 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
267     uint32_t *sq = ff_squareTbl + 256;
270     for (i = 0; i < h; i++) {
271         s += sq[pix1[0] - pix2[0]];
272         s += sq[pix1[1] - pix2[1]];
273         s += sq[pix1[2] - pix2[2]];
274         s += sq[pix1[3] - pix2[3]];
/* Sum of squared errors over an 8-pixel-wide block of h rows; 8-wide
 * unrolled variant of sse4_c. */
281 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
284     uint32_t *sq = ff_squareTbl + 256;
287     for (i = 0; i < h; i++) {
288         s += sq[pix1[0] - pix2[0]];
289         s += sq[pix1[1] - pix2[1]];
290         s += sq[pix1[2] - pix2[2]];
291         s += sq[pix1[3] - pix2[3]];
292         s += sq[pix1[4] - pix2[4]];
293         s += sq[pix1[5] - pix2[5]];
294         s += sq[pix1[6] - pix2[6]];
295         s += sq[pix1[7] - pix2[7]];
/* Sum of squared errors over a 16-pixel-wide block of h rows; 16-wide
 * unrolled variant of sse8_c. */
302 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
305     uint32_t *sq = ff_squareTbl + 256;
308     for (i = 0; i < h; i++) {
309         s += sq[pix1[ 0] - pix2[ 0]];
310         s += sq[pix1[ 1] - pix2[ 1]];
311         s += sq[pix1[ 2] - pix2[ 2]];
312         s += sq[pix1[ 3] - pix2[ 3]];
313         s += sq[pix1[ 4] - pix2[ 4]];
314         s += sq[pix1[ 5] - pix2[ 5]];
315         s += sq[pix1[ 6] - pix2[ 6]];
316         s += sq[pix1[ 7] - pix2[ 7]];
317         s += sq[pix1[ 8] - pix2[ 8]];
318         s += sq[pix1[ 9] - pix2[ 9]];
319         s += sq[pix1[10] - pix2[10]];
320         s += sq[pix1[11] - pix2[11]];
321         s += sq[pix1[12] - pix2[12]];
322         s += sq[pix1[13] - pix2[13]];
323         s += sq[pix1[14] - pix2[14]];
324         s += sq[pix1[15] - pix2[15]];
333 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
/* Wavelet-domain distortion metric for the Snow encoder: takes the pixel
 * difference (scaled by 16), runs a forward spatial DWT, then accumulates
 * per-subband weighted magnitudes. 'type' selects the wavelet (scale[] is
 * indexed by it — see the 9/7 vs 5/3 rows); w chooses 8/16/32 block width
 * and with it the decomposition depth. NOTE(review): scale[] weights are
 * presumably tuned subband norms — confirm against snow.c. */
334 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
336     const int dec_count= w==8 ? 3 : 4;
339     static const int scale[2][2][4][4]={
343           {268, 239, 239, 213},
347           // 9/7 16x16 or 32x32 dec=4
348           {344, 310, 310, 280},
356           {275, 245, 245, 218},
360           // 5/3 16x16 or 32x32 dec=4
361           {352, 317, 317, 286},
/* difference image, left-shifted for fixed-point headroom in the DWT */
369     for (i = 0; i < h; i++) {
370         for (j = 0; j < w; j+=4) {
371             tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
372             tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
373             tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
374             tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
380     ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
/* walk each decomposition level and orientation (LL only at level 0) */
384     for(level=0; level<dec_count; level++){
385         for(ori= level ? 1 : 0; ori<4; ori++){
386             int size= w>>(dec_count-level);
387             int sx= (ori&1) ? size : 0;
388             int stride= 32<<(dec_count-level);
389             int sy= (ori&2) ? stride>>1 : 0;
391             for(i=0; i<size; i++){
392                 for(j=0; j<size; j++){
393                     int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
/* Thin wrappers over w_c() fixing the block width (8/16/32) and the
 * wavelet type argument (1 for the 5/3 "w53" metric, 0 for 9/7 "w97"). */
403 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
404     return w_c(v, pix1, pix2, line_size,  8, h, 1);
407 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
408     return w_c(v, pix1, pix2, line_size,  8, h, 0);
411 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
412     return w_c(v, pix1, pix2, line_size, 16, h, 1);
415 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
416     return w_c(v, pix1, pix2, line_size, 16, h, 0);
419 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
420     return w_c(v, pix1, pix2, line_size, 32, h, 1);
423 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
424     return w_c(v, pix1, pix2, line_size, 32, h, 0);
428 /* draw the edges of width 'w' of an image of size width, height */
429 //FIXME check that this is ok for mpeg4 interlaced
430 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
432     uint8_t *ptr, *last_line;
435     last_line = buf + (height - 1) * wrap;
/* replicate first/last rows into the w rows above/below the image */
438         memcpy(buf - (i + 1) * wrap, buf, width);
439         memcpy(last_line + (i + 1) * wrap, last_line, width);
/* replicate first/last columns into the w columns left/right of each row */
443     for(i=0;i<height;i++) {
444         memset(ptr - w, ptr[0], w);
445         memset(ptr + width, ptr[width-1], w);
/* fill the four corner areas from the nearest corner pixel */
450         memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
451         memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
452         memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
453         memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
458  * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
459  * @param buf destination buffer
460  * @param src source buffer
461  * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
462  * @param block_w width of block
463  * @param block_h height of block
464  * @param src_x x coordinate of the top left sample of the block in the source buffer
465  * @param src_y y coordinate of the top left sample of the block in the source buffer
466  * @param w width of the source buffer
467  * @param h height of the source buffer
469 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
470                                     int src_x, int src_y, int w, int h){
472     int start_y, start_x, end_y, end_x;
/* clamp src so the block overlaps the picture by at least one row/column */
475         src+= (h-1-src_y)*linesize;
477     }else if(src_y<=-block_h){
478         src+= (1-block_h-src_y)*linesize;
484     }else if(src_x<=-block_w){
485         src+= (1-block_w-src_x);
/* portion of the block that lies inside the picture */
489     start_y= FFMAX(0, -src_y);
490     start_x= FFMAX(0, -src_x);
491     end_y= FFMIN(block_h, h-src_y);
492     end_x= FFMIN(block_w, w-src_x);
494     //copy existing part
495     for(y=start_y; y<end_y; y++){
496         for(x=start_x; x<end_x; x++){
497             buf[x + y*linesize]= src[x + y*linesize];
/* replicate the first valid row upward */
502     for(y=0; y<start_y; y++){
503         for(x=start_x; x<end_x; x++){
504             buf[x + y*linesize]= buf[x + start_y*linesize];
/* replicate the last valid row downward */
509     for(y=end_y; y<block_h; y++){
510         for(x=start_x; x<end_x; x++){
511             buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
/* replicate left/right columns across the full block height */
515     for(y=0; y<block_h; y++){
517        for(x=0; x<start_x; x++){
518            buf[x + y*linesize]= buf[start_x + y*linesize];
522        for(x=end_x; x<block_w; x++){
523            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
/* Widen one 8-pixel row per iteration from uint8_t into DCTELEM; the loop
 * header and pointer advances are elided in this extract. */
528 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
532     /* read the pixels */
534         block[0] = pixels[0];
535         block[1] = pixels[1];
536         block[2] = pixels[2];
537         block[3] = pixels[3];
538         block[4] = pixels[4];
539         block[5] = pixels[5];
540         block[6] = pixels[6];
541         block[7] = pixels[7];
/* Store the per-pixel difference s1 - s2 of 8-pixel rows into a DCTELEM
 * block (loop header and pointer advances elided in this extract). */
547 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
548                           const uint8_t *s2, int stride){
551     /* read the pixels */
553         block[0] = s1[0] - s2[0];
554         block[1] = s1[1] - s2[1];
555         block[2] = s1[2] - s2[2];
556         block[3] = s1[3] - s2[3];
557         block[4] = s1[4] - s2[4];
558         block[5] = s1[5] - s2[5];
559         block[6] = s1[6] - s2[6];
560         block[7] = s1[7] - s2[7];
/* Write an 8-wide DCTELEM row to pixels, clamping each value to 0..255
 * via the ff_cropTbl lookup (cm is centered at MAX_NEG_CROP). */
568 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
572     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
574     /* read the pixels */
576         pixels[0] = cm[block[0]];
577         pixels[1] = cm[block[1]];
578         pixels[2] = cm[block[2]];
579         pixels[3] = cm[block[3]];
580         pixels[4] = cm[block[4]];
581         pixels[5] = cm[block[5]];
582         pixels[6] = cm[block[6]];
583         pixels[7] = cm[block[7]];
/* 4-wide variant of put_pixels_clamped_c. */
590 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
594     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
596     /* read the pixels */
598         pixels[0] = cm[block[0]];
599         pixels[1] = cm[block[1]];
600         pixels[2] = cm[block[2]];
601         pixels[3] = cm[block[3]];
/* 2-wide variant of put_pixels_clamped_c. */
608 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
612     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
614     /* read the pixels */
616         pixels[0] = cm[block[0]];
617         pixels[1] = cm[block[1]];
/* Write an 8x8 block of signed DCTELEMs, clamped to [-128,127], then
 * biased by +128 into unsigned pixels. */
624 static void put_signed_pixels_clamped_c(const DCTELEM *block,
625                                         uint8_t *restrict pixels,
630     for (i = 0; i < 8; i++) {
631         for (j = 0; j < 8; j++) {
/* values below -128 / above 127 are clamped (elided branches) */
634             else if (*block > 127)
637                 *pixels = (uint8_t)(*block + 128);
/* advance to the next destination row */
641         pixels += (line_size - 8);
/* Write an 8-wide DCTELEM row to pixels with a plain truncating store —
 * caller must guarantee values already fit in a byte. */
645 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
650     /* read the pixels */
652         pixels[0] = block[0];
653         pixels[1] = block[1];
654         pixels[2] = block[2];
655         pixels[3] = block[3];
656         pixels[4] = block[4];
657         pixels[5] = block[5];
658         pixels[6] = block[6];
659         pixels[7] = block[7];
/* Add an 8-wide DCTELEM row onto existing pixels, clamping the sums to
 * 0..255 via the centered ff_cropTbl lookup. */
666 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
670     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
672     /* read the pixels */
674         pixels[0] = cm[pixels[0] + block[0]];
675         pixels[1] = cm[pixels[1] + block[1]];
676         pixels[2] = cm[pixels[2] + block[2]];
677         pixels[3] = cm[pixels[3] + block[3]];
678         pixels[4] = cm[pixels[4] + block[4]];
679         pixels[5] = cm[pixels[5] + block[5]];
680         pixels[6] = cm[pixels[6] + block[6]];
681         pixels[7] = cm[pixels[7] + block[7]];
/* 4-wide variant of add_pixels_clamped_c. */
687 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
691     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
693     /* read the pixels */
695         pixels[0] = cm[pixels[0] + block[0]];
696         pixels[1] = cm[pixels[1] + block[1]];
697         pixels[2] = cm[pixels[2] + block[2]];
698         pixels[3] = cm[pixels[3] + block[3]];
/* 2-wide variant of add_pixels_clamped_c. */
704 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
708     uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
710     /* read the pixels */
712         pixels[0] = cm[pixels[0] + block[0]];
713         pixels[1] = cm[pixels[1] + block[1]];
/* Add an 8-wide DCTELEM row onto pixels with no clamping (wraps on
 * overflow by normal unsigned-char arithmetic). */
719 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
723         pixels[0] += block[0];
724         pixels[1] += block[1];
725         pixels[2] += block[2];
726         pixels[3] += block[3];
727         pixels[4] += block[4];
728         pixels[5] += block[5];
729         pixels[6] += block[6];
730         pixels[7] += block[7];
/* 4-wide variant of add_pixels8_c (no clamping). */
736 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
740         pixels[0] += block[0];
741         pixels[1] += block[1];
742         pixels[2] += block[2];
743         pixels[3] += block[3];
/* Sum of absolute values of a DCTELEM block (loop header elided). */
749 static int sum_abs_dctelem_c(DCTELEM *block)
753         sum+= FFABS(block[i]);
/* Fill h rows of a 16-wide block with a constant byte value. */
757 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
761     for (i = 0; i < h; i++) {
762         memset(block, value, 16);
/* Fill h rows of an 8-wide block with a constant byte value. */
767 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
771     for (i = 0; i < h; i++) {
772         memset(block, value, 8);
/* 2x upscale of an 8x8 block: each source byte is doubled horizontally by
 * the 0x0101 multiply (byte replicated into a 16-bit word) and doubled
 * vertically by writing the same row to dst1 and dst2.
 * NOTE(review): dst is accessed as uint16_t* — assumes suitable alignment. */
777 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
780     uint16_t *dst1 = dst;
781     uint16_t *dst2 = dst + linesize;
783     for (j = 0; j < 8; j++) {
784         for (i = 0; i < 8; i++) {
785             dst1[i] = dst2[i] = src[i] * 0x0101;
/* PIXOP2 (64-bit build variant): expands OPNAME-prefixed 8-pixel put/avg
 * primitives that process one 64-bit word per row. *_x2 / *_y2 average a
 * row with its right / lower neighbor using the (a&b)+(((a^b)&0xFE..)>>1)
 * (round-down) or (a|b)-(((a^b)&0xFE..)>>1) (round-up) byte-parallel
 * averages; *_xy2 does a 2x2 average, splitting each byte into low 2 bits
 * (l0/l1, with the rounding constant) and high 6 bits (h0/h1) to avoid
 * cross-byte carry. "no_rnd" variants use the smaller rounding constant.
 * NOTE(review): many macro lines (loop setup, pointer advances, closing
 * braces) are elided in this extract. */
795 #define PIXOP2(OPNAME, OP) \
796 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
800         OP(*((uint64_t*)block), AV_RN64(pixels));\
806 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
810         const uint64_t a= AV_RN64(pixels  );\
811         const uint64_t b= AV_RN64(pixels+1);\
812         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
818 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
822         const uint64_t a= AV_RN64(pixels  );\
823         const uint64_t b= AV_RN64(pixels+1);\
824         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
830 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
834         const uint64_t a= AV_RN64(pixels          );\
835         const uint64_t b= AV_RN64(pixels+line_size);\
836         OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
842 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
846         const uint64_t a= AV_RN64(pixels          );\
847         const uint64_t b= AV_RN64(pixels+line_size);\
848         OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
854 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
857         const uint64_t a= AV_RN64(pixels  );\
858         const uint64_t b= AV_RN64(pixels+1);\
859         uint64_t l0=  (a&0x0303030303030303ULL)\
860                     + (b&0x0303030303030303ULL)\
861                     + 0x0202020202020202ULL;\
862         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
863                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
867         for(i=0; i<h; i+=2){\
868             uint64_t a= AV_RN64(pixels  );\
869             uint64_t b= AV_RN64(pixels+1);\
870             l1=  (a&0x0303030303030303ULL)\
871                + (b&0x0303030303030303ULL);\
872             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
873               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
874             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
877             a= AV_RN64(pixels  );\
878             b= AV_RN64(pixels+1);\
879             l0=  (a&0x0303030303030303ULL)\
880                + (b&0x0303030303030303ULL)\
881                + 0x0202020202020202ULL;\
882             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
883               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
884             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
890 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
893         const uint64_t a= AV_RN64(pixels  );\
894         const uint64_t b= AV_RN64(pixels+1);\
895         uint64_t l0=  (a&0x0303030303030303ULL)\
896                     + (b&0x0303030303030303ULL)\
897                     + 0x0101010101010101ULL;\
898         uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
899                    + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
903         for(i=0; i<h; i+=2){\
904             uint64_t a= AV_RN64(pixels  );\
905             uint64_t b= AV_RN64(pixels+1);\
906             l1=  (a&0x0303030303030303ULL)\
907                + (b&0x0303030303030303ULL);\
908             h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
909               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
910             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
913             a= AV_RN64(pixels  );\
914             b= AV_RN64(pixels+1);\
915             l0=  (a&0x0303030303030303ULL)\
916                + (b&0x0303030303030303ULL)\
917                + 0x0101010101010101ULL;\
918             h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
919               + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
920             OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
926 CALL_2X_PIXELS(OPNAME ## _pixels16_c    , OPNAME ## _pixels_c    , 8)\
927 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
928 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
929 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
930 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
931 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
932 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
/* op_avg: byte-parallel rounded-up average, (a|b) - floor((a^b)/2). */
934 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
935 #else // 64 bit variant
937 #define PIXOP2(OPNAME, OP) \
938 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
941 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
946 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
949 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
954 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
957 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
958 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
963 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
964 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
967 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
968 int src_stride1, int src_stride2, int h){\
972 a= AV_RN32(&src1[i*src_stride1 ]);\
973 b= AV_RN32(&src2[i*src_stride2 ]);\
974 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
975 a= AV_RN32(&src1[i*src_stride1+4]);\
976 b= AV_RN32(&src2[i*src_stride2+4]);\
977 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
981 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
982 int src_stride1, int src_stride2, int h){\
986 a= AV_RN32(&src1[i*src_stride1 ]);\
987 b= AV_RN32(&src2[i*src_stride2 ]);\
988 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
989 a= AV_RN32(&src1[i*src_stride1+4]);\
990 b= AV_RN32(&src2[i*src_stride2+4]);\
991 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
995 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
996 int src_stride1, int src_stride2, int h){\
1000 a= AV_RN32(&src1[i*src_stride1 ]);\
1001 b= AV_RN32(&src2[i*src_stride2 ]);\
1002 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
1006 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1007 int src_stride1, int src_stride2, int h){\
1009 for(i=0; i<h; i++){\
1011 a= AV_RN16(&src1[i*src_stride1 ]);\
1012 b= AV_RN16(&src2[i*src_stride2 ]);\
1013 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
1017 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1018 int src_stride1, int src_stride2, int h){\
1019 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
1020 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
1023 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1024 int src_stride1, int src_stride2, int h){\
1025 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
1026 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
1029 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1030 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1033 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1034 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1037 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1038 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1041 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1042 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1045 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1046 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1048 for(i=0; i<h; i++){\
1049 uint32_t a, b, c, d, l0, l1, h0, h1;\
1050 a= AV_RN32(&src1[i*src_stride1]);\
1051 b= AV_RN32(&src2[i*src_stride2]);\
1052 c= AV_RN32(&src3[i*src_stride3]);\
1053 d= AV_RN32(&src4[i*src_stride4]);\
1054 l0= (a&0x03030303UL)\
1057 h0= ((a&0xFCFCFCFCUL)>>2)\
1058 + ((b&0xFCFCFCFCUL)>>2);\
1059 l1= (c&0x03030303UL)\
1060 + (d&0x03030303UL);\
1061 h1= ((c&0xFCFCFCFCUL)>>2)\
1062 + ((d&0xFCFCFCFCUL)>>2);\
1063 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1064 a= AV_RN32(&src1[i*src_stride1+4]);\
1065 b= AV_RN32(&src2[i*src_stride2+4]);\
1066 c= AV_RN32(&src3[i*src_stride3+4]);\
1067 d= AV_RN32(&src4[i*src_stride4+4]);\
1068 l0= (a&0x03030303UL)\
1071 h0= ((a&0xFCFCFCFCUL)>>2)\
1072 + ((b&0xFCFCFCFCUL)>>2);\
1073 l1= (c&0x03030303UL)\
1074 + (d&0x03030303UL);\
1075 h1= ((c&0xFCFCFCFCUL)>>2)\
1076 + ((d&0xFCFCFCFCUL)>>2);\
1077 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1081 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1082 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1085 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1086 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1089 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1090 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1093 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1094 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1097 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1098 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1100 for(i=0; i<h; i++){\
1101 uint32_t a, b, c, d, l0, l1, h0, h1;\
1102 a= AV_RN32(&src1[i*src_stride1]);\
1103 b= AV_RN32(&src2[i*src_stride2]);\
1104 c= AV_RN32(&src3[i*src_stride3]);\
1105 d= AV_RN32(&src4[i*src_stride4]);\
1106 l0= (a&0x03030303UL)\
1109 h0= ((a&0xFCFCFCFCUL)>>2)\
1110 + ((b&0xFCFCFCFCUL)>>2);\
1111 l1= (c&0x03030303UL)\
1112 + (d&0x03030303UL);\
1113 h1= ((c&0xFCFCFCFCUL)>>2)\
1114 + ((d&0xFCFCFCFCUL)>>2);\
1115 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1116 a= AV_RN32(&src1[i*src_stride1+4]);\
1117 b= AV_RN32(&src2[i*src_stride2+4]);\
1118 c= AV_RN32(&src3[i*src_stride3+4]);\
1119 d= AV_RN32(&src4[i*src_stride4+4]);\
1120 l0= (a&0x03030303UL)\
1123 h0= ((a&0xFCFCFCFCUL)>>2)\
1124 + ((b&0xFCFCFCFCUL)>>2);\
1125 l1= (c&0x03030303UL)\
1126 + (d&0x03030303UL);\
1127 h1= ((c&0xFCFCFCFCUL)>>2)\
1128 + ((d&0xFCFCFCFCUL)>>2);\
1129 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1132 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1133 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1134 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1135 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1137 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1138 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1139 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1140 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1143 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1145 int i, a0, b0, a1, b1;\
1152 for(i=0; i<h; i+=2){\
1158 block[0]= (a1+a0)>>2; /* FIXME non put */\
1159 block[1]= (b1+b0)>>2;\
1169 block[0]= (a1+a0)>>2;\
1170 block[1]= (b1+b0)>>2;\
1176 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1179 const uint32_t a= AV_RN32(pixels );\
1180 const uint32_t b= AV_RN32(pixels+1);\
1181 uint32_t l0= (a&0x03030303UL)\
1184 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1185 + ((b&0xFCFCFCFCUL)>>2);\
1189 for(i=0; i<h; i+=2){\
1190 uint32_t a= AV_RN32(pixels );\
1191 uint32_t b= AV_RN32(pixels+1);\
1192 l1= (a&0x03030303UL)\
1193 + (b&0x03030303UL);\
1194 h1= ((a&0xFCFCFCFCUL)>>2)\
1195 + ((b&0xFCFCFCFCUL)>>2);\
1196 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1199 a= AV_RN32(pixels );\
1200 b= AV_RN32(pixels+1);\
1201 l0= (a&0x03030303UL)\
1204 h0= ((a&0xFCFCFCFCUL)>>2)\
1205 + ((b&0xFCFCFCFCUL)>>2);\
1206 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1212 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1215 for(j=0; j<2; j++){\
1217 const uint32_t a= AV_RN32(pixels );\
1218 const uint32_t b= AV_RN32(pixels+1);\
1219 uint32_t l0= (a&0x03030303UL)\
1222 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1223 + ((b&0xFCFCFCFCUL)>>2);\
1227 for(i=0; i<h; i+=2){\
1228 uint32_t a= AV_RN32(pixels );\
1229 uint32_t b= AV_RN32(pixels+1);\
1230 l1= (a&0x03030303UL)\
1231 + (b&0x03030303UL);\
1232 h1= ((a&0xFCFCFCFCUL)>>2)\
1233 + ((b&0xFCFCFCFCUL)>>2);\
1234 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1237 a= AV_RN32(pixels );\
1238 b= AV_RN32(pixels+1);\
1239 l0= (a&0x03030303UL)\
1242 h0= ((a&0xFCFCFCFCUL)>>2)\
1243 + ((b&0xFCFCFCFCUL)>>2);\
1244 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1248 pixels+=4-line_size*(h+1);\
1249 block +=4-line_size*h;\
/* No-rounding twin of _pixels8_xy2_c: the visible structure is        */\
/* identical; presumably only the rounding bias folded into the        */\
/* low-bit sums differs -- verify against the rounding variant above.  */\
1253 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1256 for(j=0; j<2; j++){\
1258 const uint32_t a= AV_RN32(pixels );\
1259 const uint32_t b= AV_RN32(pixels+1);\
1260 uint32_t l0= (a&0x03030303UL)\
1263 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1264 + ((b&0xFCFCFCFCUL)>>2);\
1268 for(i=0; i<h; i+=2){\
1269 uint32_t a= AV_RN32(pixels );\
1270 uint32_t b= AV_RN32(pixels+1);\
1271 l1= (a&0x03030303UL)\
1272 + (b&0x03030303UL);\
1273 h1= ((a&0xFCFCFCFCUL)>>2)\
1274 + ((b&0xFCFCFCFCUL)>>2);\
1275 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1278 a= AV_RN32(pixels );\
1279 b= AV_RN32(pixels+1);\
1280 l0= (a&0x03030303UL)\
1283 h0= ((a&0xFCFCFCFCUL)>>2)\
1284 + ((b&0xFCFCFCFCUL)>>2);\
1285 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
/* back to the top, 4 columns right, for the second half of the 8 */\
1289 pixels+=4-line_size*(h+1);\
1290 block +=4-line_size*h;\
/* Build the 16-wide variants as two side-by-side 8-wide calls.        */\
1294 CALL_2X_PIXELS(OPNAME ## _pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1295 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1296 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1297 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
/* NOTE(review): no_rnd_pixels16_c deliberately reuses the rounding    */\
/* pixels8_c -- a full-pel copy has nothing to round, so the two       */\
/* variants are identical (presumably intentional, not a typo).        */\
1298 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c  , OPNAME ## _pixels8_c  , 8)\
1299 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1300 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1301 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* OP plug-ins for the pixel-op macros above: rounded 32-bit SWAR
 * average vs. plain store. */
1303 #define op_avg(a, b) a = rnd_avg32(a, b)
1305 #define op_put(a, b) a = b
/* Rounded 2- and 4-sample scalar averages.
 * NOTE(review): the arguments are not parenthesised, so callers must
 * pass only simple expressions (all current uses below do). */
1312 #define avg2(a,b) ((a+b+1)>>1)
1313 #define avg4(a,b,c,d) ((a+b+c+d+2)>>2)
/* Convenience wrapper: two-source 16-wide no-rounding average where
 * dst and both sources share a single stride. */
1315 static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1316 put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
/* Same as the 16-wide wrapper above, for 8-wide blocks. */
1319 static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
1320 put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
/* One-vector global motion compensation: bilinear interpolation at a
 * 1/16-pel fractional position (x16, y16).  A..D are the bilinear
 * weights of the 2x2 source neighbourhood; they sum to 16*16 = 256,
 * hence the >>8 normalisation.  'rounder' is the rounding bias added
 * before the shift. */
1323 static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
1325 const int A=(16-x16)*(16-y16);
1326 const int B=(   x16)*(16-y16);
1327 const int C=(16-x16)*(   y16);
1328 const int D=(   x16)*(   y16);
/* one 8-pixel row: each output mixes the four surrounding samples */
1333 dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
1334 dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
1335 dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
1336 dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
1337 dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
1338 dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
1339 dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
1340 dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
/* Affine global motion compensation for an 8-wide block.
 * (ox, oy) is the fixed-point start position; dxx/dxy/dyx/dyy are the
 * per-pixel coordinate deltas; 'shift' is the fixed-point precision,
 * 'r' a rounding term, and width/height bound the valid source area.
 * Out-of-range coordinates are clamped to the edge (edge emulation). */
1346 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1347 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
/* s = one full pel in the fixed-point coordinate space */
1350 const int s= 1<<shift;
1360 for(x=0; x<8; x++){ //XXX FIXME optimize
1361 int src_x, src_y, frac_x, frac_y, index;
/* fractional part of the source position, in 1/s units */
1365 frac_x= src_x&(s-1);
1366 frac_y= src_y&(s-1);
/* the unsigned compare folds 0 <= v && v < limit into one test */
1370 if((unsigned)src_x < width){
1371 if((unsigned)src_y < height){
/* both coordinates in range: full bilinear interpolation */
1372 index= src_x + src_y*stride;
1373 dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1374 + src[index       +1]*   frac_x )*(s-frac_y)
1375 + (  src[index+stride  ]*(s-frac_x)
1376 + src[index+stride+1]*   frac_x )*   frac_y
/* y out of range: clamp it, interpolate horizontally only */
1379 index= src_x + av_clip(src_y, 0, height)*stride;
1380 dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
1381 + src[index       +1]*   frac_x )*s
/* x out of range: clamp it, interpolate vertically only */
1385 if((unsigned)src_y < height){
1386 index= av_clip(src_x, 0, width) + src_y*stride;
1387 dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
1388 + src[index+stride  ]*   frac_y )*s
/* both out of range: nearest clamped sample, no interpolation */
1391 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1392 dst[y*stride + x]=    src[index         ];
/* Third-pel ("tpel") motion compensation, put variants.  mcXY means a
 * fractional offset of X/3 horizontally and Y/3 vertically (presumably
 * SVQ3-style thirdpel MC -- confirm against callers).  The multipliers
 * implement rounded division by the total weight in fixed point:
 * 683 = round(2^11/3) and 2731 = round(2^15/12). */
/* mc00: full-pel copy, dispatched on block width */
1404 static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1406 case 2: put_pixels2_c (dst, src, stride, height); break;
1407 case 4: put_pixels4_c (dst, src, stride, height); break;
1408 case 8: put_pixels8_c (dst, src, stride, height); break;
1409 case 16:put_pixels16_c(dst, src, stride, height); break;
/* mc10: x = 1/3 (weights 2:1 on the horizontal pair) */
1413 static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1415 for (i=0; i < height; i++) {
1416 for (j=0; j < width; j++) {
1417 dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
/* mc20: x = 2/3 */
1424 static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1426 for (i=0; i < height; i++) {
1427 for (j=0; j < width; j++) {
1428 dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
/* mc01: y = 1/3 */
1435 static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1437 for (i=0; i < height; i++) {
1438 for (j=0; j < width; j++) {
1439 dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
/* mc11: x = y = 1/3, bilinear weights 4/3/3/2 summing to 12 */
1446 static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1448 for (i=0; i < height; i++) {
1449 for (j=0; j < width; j++) {
1450 dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
/* mc12: x = 1/3, y = 2/3 (vertical neighbour weighted heavier) */
1457 static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1459 for (i=0; i < height; i++) {
1460 for (j=0; j < width; j++) {
1461 dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* mc02: y = 2/3 */
1468 static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1470 for (i=0; i < height; i++) {
1471 for (j=0; j < width; j++) {
1472 dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
/* mc21: x = 2/3, y = 1/3 (horizontal neighbour weighted heavier) */
1479 static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1481 for (i=0; i < height; i++) {
1482 for (j=0; j < width; j++) {
1483 dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
/* mc22: x = y = 2/3 */
1490 static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1492 for (i=0; i < height; i++) {
1493 for (j=0; j < width; j++) {
1494 dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
/* Third-pel motion compensation, avg variants: identical interpolation
 * to the put_tpel_* family above, but the result is then averaged with
 * the existing dst sample (rounded: +1 >> 1). */
/* mc00: full-pel averaging copy, dispatched on block width */
1501 static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1503 case 2: avg_pixels2_c (dst, src, stride, height); break;
1504 case 4: avg_pixels4_c (dst, src, stride, height); break;
1505 case 8: avg_pixels8_c (dst, src, stride, height); break;
1506 case 16:avg_pixels16_c(dst, src, stride, height); break;
/* mc10: x = 1/3 */
1510 static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1512 for (i=0; i < height; i++) {
1513 for (j=0; j < width; j++) {
1514 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
/* mc20: x = 2/3 */
1521 static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1523 for (i=0; i < height; i++) {
1524 for (j=0; j < width; j++) {
1525 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
/* mc01: y = 1/3 */
1532 static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1534 for (i=0; i < height; i++) {
1535 for (j=0; j < width; j++) {
1536 dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
/* mc11: x = y = 1/3 */
1543 static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1545 for (i=0; i < height; i++) {
1546 for (j=0; j < width; j++) {
1547 dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* mc12: x = 1/3, y = 2/3 */
1554 static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1556 for (i=0; i < height; i++) {
1557 for (j=0; j < width; j++) {
1558 dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* mc02: y = 2/3 */
1565 static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1567 for (i=0; i < height; i++) {
1568 for (j=0; j < width; j++) {
1569 dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
/* mc21: x = 2/3, y = 1/3 */
1576 static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1578 for (i=0; i < height; i++) {
1579 for (j=0; j < width; j++) {
1580 dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/* mc22: x = y = 2/3 */
1587 static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
1589 for (i=0; i < height; i++) {
1590 for (j=0; j < width; j++) {
1591 dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
/**
 * Generate fixed-width wrappers (put_tpel_pixels<width>_mcXY_c) around
 * the variable-width third-pel functions above, binding 'width' at
 * instantiation time.
 *
 * Fix: each forwarding statement used to read
 *     void put_tpel_pixels_mcXY_c(dst, src, stride, width, height);
 * The leading "void" turns the intended call into a K&R-style
 * (identifier-list) function *declaration*, which is invalid at block
 * scope and, at best, makes every wrapper an empty function.  The
 * stray "void" is removed so the wrappers actually forward their
 * arguments.  The macro's interface and the generated function names
 * are unchanged.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H.264-style chroma motion compensation: bilinear filter at a 1/8-pel */
/* position.  A..D = (8-x)(8-y), x(8-y), (8-x)y, xy always sum to 64;   */
/* OP supplies the (v+32)>>6 normalisation (see op_put/op_avg below).   */
1619 #define H264_CHROMA_MC(OPNAME, OP)\
1620 static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1621 const int A=(8-x)*(8-y);\
1622 const int B=(  x)*(8-y);\
1623 const int C=(8-x)*(  y);\
1624 const int D=(  x)*(  y);\
1627 assert(x<8 && y<8 && x>=0 && y>=0);\
/* 2-D path: full 2x2 bilinear per output pixel */\
1630 for(i=0; i<h; i++){\
1631 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1632 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
/* 1-D fast path (x==0 or y==0): a single neighbour contributes; E is */\
/* its weight (presumably B+C -- verify) and step selects the         */\
/* vertical (stride) or horizontal (1) neighbour.                     */\
1638 const int step= C ? stride : 1;\
1639 for(i=0; i<h; i++){\
1640 OP(dst[0], (A*src[0] + E*src[step+0]));\
1641 OP(dst[1], (A*src[1] + E*src[step+1]));\
/* 4-wide variant: same weights, four pixels per row */\
1648 static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1649 const int A=(8-x)*(8-y);\
1650 const int B=(  x)*(8-y);\
1651 const int C=(8-x)*(  y);\
1652 const int D=(  x)*(  y);\
1655 assert(x<8 && y<8 && x>=0 && y>=0);\
1658 for(i=0; i<h; i++){\
1659 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1660 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1661 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1662 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1668 const int step= C ? stride : 1;\
1669 for(i=0; i<h; i++){\
1670 OP(dst[0], (A*src[0] + E*src[step+0]));\
1671 OP(dst[1], (A*src[1] + E*src[step+1]));\
1672 OP(dst[2], (A*src[2] + E*src[step+2]));\
1673 OP(dst[3], (A*src[3] + E*src[step+3]));\
/* 8-wide variant: same weights, eight pixels per row */\
1680 static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
1681 const int A=(8-x)*(8-y);\
1682 const int B=(  x)*(8-y);\
1683 const int C=(8-x)*(  y);\
1684 const int D=(  x)*(  y);\
1687 assert(x<8 && y<8 && x>=0 && y>=0);\
1690 for(i=0; i<h; i++){\
1691 OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
1692 OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
1693 OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
1694 OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
1695 OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
1696 OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
1697 OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
1698 OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
1704 const int step= C ? stride : 1;\
1705 for(i=0; i<h; i++){\
1706 OP(dst[0], (A*src[0] + E*src[step+0]));\
1707 OP(dst[1], (A*src[1] + E*src[step+1]));\
1708 OP(dst[2], (A*src[2] + E*src[step+2]));\
1709 OP(dst[3], (A*src[3] + E*src[step+3]));\
1710 OP(dst[4], (A*src[4] + E*src[step+4]));\
1711 OP(dst[5], (A*src[5] + E*src[step+5]));\
1712 OP(dst[6], (A*src[6] + E*src[step+6]));\
1713 OP(dst[7], (A*src[7] + E*src[step+7]));\
/* OP plug-ins for H264_CHROMA_MC: op_put normalises the 6-bit
 * fixed-point sum with rounding ((b+32)>>6); op_avg additionally takes
 * a rounded average with the existing dst value. */
1720 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1721 #define op_put(a, b) a = (((b) + 32)>>6)
/* Instantiate put_/avg_ h264_chroma_mc2/4/8_c. */
1723 H264_CHROMA_MC(put_       , op_put)
1724 H264_CHROMA_MC(avg_       , op_avg)
/* VC-1 8x8 chroma MC, "no rounding" mode: the same 1/8-pel bilinear
 * weights as the H.264 version (A..D sum to 64), but with a 32-4 = 28
 * bias before the >>6 so results round downward more often. */
1728 static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1729 const int A=(8-x)*(8-y);
1730 const int B=(  x)*(8-y);
1731 const int C=(8-x)*(  y);
1732 const int D=(  x)*(  y);
1735 assert(x<8 && y<8 && x>=0 && y>=0);
/* one 8-pixel row per loop pass */
1739 dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
1740 dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
1741 dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
1742 dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
1743 dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
1744 dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
1745 dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
1746 dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
/* Averaging variant of the VC-1 no-rounding chroma filter above: the
 * interpolated value is combined with the existing dst sample via the
 * rounded avg2() helper. */
1752 static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
1753 const int A=(8-x)*(8-y);
1754 const int B=(  x)*(8-y);
1755 const int C=(8-x)*(  y);
1756 const int D=(  x)*(  y);
1759 assert(x<8 && y<8 && x>=0 && y>=0);
/* one 8-pixel row per loop pass, averaged into dst */
1763 dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
1764 dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
1765 dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
1766 dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
1767 dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
1768 dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
1769 dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
1770 dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
1776 #define QPEL_MC(r, OPNAME, RND, OP) \
1777 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1778 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1782 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1783 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1784 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1785 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1786 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1787 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1788 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1789 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1795 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1797 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1801 const int src0= src[0*srcStride];\
1802 const int src1= src[1*srcStride];\
1803 const int src2= src[2*srcStride];\
1804 const int src3= src[3*srcStride];\
1805 const int src4= src[4*srcStride];\
1806 const int src5= src[5*srcStride];\
1807 const int src6= src[6*srcStride];\
1808 const int src7= src[7*srcStride];\
1809 const int src8= src[8*srcStride];\
1810 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1811 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1812 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1813 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1814 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1815 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1816 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1817 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1823 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1824 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1829 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1830 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1831 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1832 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1833 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1834 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1835 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1836 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1837 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1838 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1839 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1840 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1841 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1842 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1843 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1844 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1850 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1851 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1856 const int src0= src[0*srcStride];\
1857 const int src1= src[1*srcStride];\
1858 const int src2= src[2*srcStride];\
1859 const int src3= src[3*srcStride];\
1860 const int src4= src[4*srcStride];\
1861 const int src5= src[5*srcStride];\
1862 const int src6= src[6*srcStride];\
1863 const int src7= src[7*srcStride];\
1864 const int src8= src[8*srcStride];\
1865 const int src9= src[9*srcStride];\
1866 const int src10= src[10*srcStride];\
1867 const int src11= src[11*srcStride];\
1868 const int src12= src[12*srcStride];\
1869 const int src13= src[13*srcStride];\
1870 const int src14= src[14*srcStride];\
1871 const int src15= src[15*srcStride];\
1872 const int src16= src[16*srcStride];\
1873 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1874 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1875 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1876 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1877 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1878 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1879 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1880 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1881 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1882 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1883 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1884 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1885 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1886 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1887 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1888 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1894 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1895 OPNAME ## pixels8_c(dst, src, stride, 8);\
1898 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1900 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1901 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1904 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1905 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1908 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1910 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1911 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1914 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1915 uint8_t full[16*9];\
1917 copy_block9(full, src, 16, stride, 9);\
1918 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1919 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1922 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1923 uint8_t full[16*9];\
1924 copy_block9(full, src, 16, stride, 9);\
1925 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1928 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1929 uint8_t full[16*9];\
1931 copy_block9(full, src, 16, stride, 9);\
1932 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1933 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1935 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1936 uint8_t full[16*9];\
1939 uint8_t halfHV[64];\
1940 copy_block9(full, src, 16, stride, 9);\
1941 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1942 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1943 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1944 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1946 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1947 uint8_t full[16*9];\
1949 uint8_t halfHV[64];\
1950 copy_block9(full, src, 16, stride, 9);\
1951 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1952 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1953 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1954 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1956 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1957 uint8_t full[16*9];\
1960 uint8_t halfHV[64];\
1961 copy_block9(full, src, 16, stride, 9);\
1962 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1963 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1964 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1965 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1967 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1968 uint8_t full[16*9];\
1970 uint8_t halfHV[64];\
1971 copy_block9(full, src, 16, stride, 9);\
1972 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1973 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1974 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1975 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1977 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1978 uint8_t full[16*9];\
1981 uint8_t halfHV[64];\
1982 copy_block9(full, src, 16, stride, 9);\
1983 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1984 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1985 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1986 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1988 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1989 uint8_t full[16*9];\
1991 uint8_t halfHV[64];\
1992 copy_block9(full, src, 16, stride, 9);\
1993 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1994 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1995 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1996 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1998 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1999 uint8_t full[16*9];\
2002 uint8_t halfHV[64];\
2003 copy_block9(full, src, 16, stride, 9);\
2004 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
2005 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2006 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2007 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2009 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2010 uint8_t full[16*9];\
2012 uint8_t halfHV[64];\
2013 copy_block9(full, src, 16, stride, 9);\
2014 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2015 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2016 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2017 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2019 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2021 uint8_t halfHV[64];\
2022 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2023 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2024 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
2026 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2028 uint8_t halfHV[64];\
2029 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2030 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2031 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2033 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2034 uint8_t full[16*9];\
2037 uint8_t halfHV[64];\
2038 copy_block9(full, src, 16, stride, 9);\
2039 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2040 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2041 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2042 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2044 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2045 uint8_t full[16*9];\
2047 copy_block9(full, src, 16, stride, 9);\
2048 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2049 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2050 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2052 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2053 uint8_t full[16*9];\
2056 uint8_t halfHV[64];\
2057 copy_block9(full, src, 16, stride, 9);\
2058 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2059 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2060 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2061 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2063 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2064 uint8_t full[16*9];\
2066 copy_block9(full, src, 16, stride, 9);\
2067 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2068 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2069 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2071 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2073 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2074 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2076 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2077 OPNAME ## pixels16_c(dst, src, stride, 16);\
2080 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
/* NOTE(review): this is the TAIL of the QPEL_MC(RND, OPNAME, OP) macro body; \
 * the macro header lies above this extract, and the embedded original line   \
 * numbers skip values, so closing braces and some declarations are missing   \
 * from this view — do not treat the span as compilable.                      \
 * What is visible builds the 16x16 MPEG-4 quarter-pel motion-compensation   \
 * functions OPNAME##qpel16_mcXY_c (X = horizontal qpel phase, Y = vertical)  \
 * by combining the 8-tap h/v lowpass filters with pixel averaging            \
 * (pixels16_l2 = average of 2 planes, pixels16_l4 = average of 4).           \
 * `full` is a 24x17 copy with borders; halfH/halfV/halfHV hold the           \
 * intermediate half-pel planes. The *_old_c variants average 4 planes        \
 * (older, presumably less accurate scheme — kept for reference).             \
 */                                                                           \
2082 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2083 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2086 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2087 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2090 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2092 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2093 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2096 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2097 uint8_t full[24*17];\
2099 copy_block17(full, src, 24, stride, 17);\
2100 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2101 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2104 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2105 uint8_t full[24*17];\
2106 copy_block17(full, src, 24, stride, 17);\
2107 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2110 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2111 uint8_t full[24*17];\
2113 copy_block17(full, src, 24, stride, 17);\
2114 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2115 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2117 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2118 uint8_t full[24*17];\
2119 uint8_t halfH[272];\
2120 uint8_t halfV[256];\
2121 uint8_t halfHV[256];\
2122 copy_block17(full, src, 24, stride, 17);\
2123 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2124 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2125 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2126 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2128 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2129 uint8_t full[24*17];\
2130 uint8_t halfH[272];\
2131 uint8_t halfHV[256];\
2132 copy_block17(full, src, 24, stride, 17);\
2133 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2134 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2135 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2136 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2138 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2139 uint8_t full[24*17];\
2140 uint8_t halfH[272];\
2141 uint8_t halfV[256];\
2142 uint8_t halfHV[256];\
2143 copy_block17(full, src, 24, stride, 17);\
2144 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2145 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2146 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2147 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2149 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2150 uint8_t full[24*17];\
2151 uint8_t halfH[272];\
2152 uint8_t halfHV[256];\
2153 copy_block17(full, src, 24, stride, 17);\
2154 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2155 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2156 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2157 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2159 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2160 uint8_t full[24*17];\
2161 uint8_t halfH[272];\
2162 uint8_t halfV[256];\
2163 uint8_t halfHV[256];\
2164 copy_block17(full, src, 24, stride, 17);\
2165 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2166 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2167 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2168 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2170 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2171 uint8_t full[24*17];\
2172 uint8_t halfH[272];\
2173 uint8_t halfHV[256];\
2174 copy_block17(full, src, 24, stride, 17);\
2175 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2176 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2177 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2178 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2180 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2181 uint8_t full[24*17];\
2182 uint8_t halfH[272];\
2183 uint8_t halfV[256];\
2184 uint8_t halfHV[256];\
2185 copy_block17(full, src, 24, stride, 17);\
2186 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full  , 16, 24, 17);\
2187 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2188 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2189 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2191 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2192 uint8_t full[24*17];\
2193 uint8_t halfH[272];\
2194 uint8_t halfHV[256];\
2195 copy_block17(full, src, 24, stride, 17);\
2196 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2197 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2198 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2199 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2201 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2202 uint8_t halfH[272];\
2203 uint8_t halfHV[256];\
2204 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2205 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2206 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2208 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2209 uint8_t halfH[272];\
2210 uint8_t halfHV[256];\
2211 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2212 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2213 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2215 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2216 uint8_t full[24*17];\
2217 uint8_t halfH[272];\
2218 uint8_t halfV[256];\
2219 uint8_t halfHV[256];\
2220 copy_block17(full, src, 24, stride, 17);\
2221 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2222 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2223 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2224 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2226 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2227 uint8_t full[24*17];\
2228 uint8_t halfH[272];\
2229 copy_block17(full, src, 24, stride, 17);\
2230 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2231 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2232 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2234 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2235 uint8_t full[24*17];\
2236 uint8_t halfH[272];\
2237 uint8_t halfV[256];\
2238 uint8_t halfHV[256];\
2239 copy_block17(full, src, 24, stride, 17);\
2240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2241 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2242 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2243 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2245 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2246 uint8_t full[24*17];\
2247 uint8_t halfH[272];\
2248 copy_block17(full, src, 24, stride, 17);\
2249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2250 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2251 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2253 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2254 uint8_t halfH[272];\
2255 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2256 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Pixel-store ops plugged into QPEL_MC: (b) is an unscaled filter sum, so
 * "+16 >> 5" rounds and "+15 >> 5" truncates (no_rnd); cm[] clips to 0..255.
 * op_avg additionally averages with the existing dst pixel (rounded). */
2259 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2260 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2261 #define op_put(a, b) a = cm[((b) + 16)>>5]
2262 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the MPEG-4 qpel MC function families: put, put_no_rnd, avg.
 * NOTE(review): the avg_no_rnd line is deliberately commented out upstream. */
2264 QPEL_MC(0, put_ , _ , op_put)
2265 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2266 QPEL_MC(0, avg_ , _ , op_avg)
2267 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
/* Undefine the ops so later sections can redefine them.
 * NOTE(review): the #undef op_avg / #undef op_put lines appear to be missing
 * from this extract (original numbering jumps 2267 -> 2269 -> 2271). */
2269 #undef op_avg_no_rnd
2271 #undef op_put_no_rnd
/*
 * H264_LOWPASS(OPNAME, OP, OP2): generates the H.264 6-tap half-pel filters
 * with coefficients (1,-5,20,20,-5,1) for 2x2, 4x4 and 8x8 blocks in
 * horizontal (h), vertical (v) and 2-D (hv) variants, plus 16x16 versions
 * built from four 8x8 calls. The hv variant filters horizontally into the
 * int16_t tmp[] plane (extra 5 rows of context), then vertically with OP2,
 * which rescales by >>10 instead of >>5 since two filter passes are summed.
 * cm (ff_cropTbl + MAX_NEG_CROP) clips results to 0..255.
 * NOTE(review): the embedded original line numbers skip values here — the
 * "int h=N;" declarations, for-loop headers, src/dst/tmp increments and
 * closing braces are missing from this extract; do not compile as-is.
 */
2274 #define H264_LOWPASS(OPNAME, OP, OP2) \
2275 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2277 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2281 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2282 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2288 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2290 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2294 const int srcB= src[-2*srcStride];\
2295 const int srcA= src[-1*srcStride];\
2296 const int src0= src[0 *srcStride];\
2297 const int src1= src[1 *srcStride];\
2298 const int src2= src[2 *srcStride];\
2299 const int src3= src[3 *srcStride];\
2300 const int src4= src[4 *srcStride];\
2301 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2302 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2308 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2311 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2313 src -= 2*srcStride;\
2314 for(i=0; i<h+5; i++)\
2316 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2317 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2321 tmp -= tmpStride*(h+5-2);\
2324 const int tmpB= tmp[-2*tmpStride];\
2325 const int tmpA= tmp[-1*tmpStride];\
2326 const int tmp0= tmp[0 *tmpStride];\
2327 const int tmp1= tmp[1 *tmpStride];\
2328 const int tmp2= tmp[2 *tmpStride];\
2329 const int tmp3= tmp[3 *tmpStride];\
2330 const int tmp4= tmp[4 *tmpStride];\
2331 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2332 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2337 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2339 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2343 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2344 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2345 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2346 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2352 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2358 const int srcB= src[-2*srcStride];\
2359 const int srcA= src[-1*srcStride];\
2360 const int src0= src[0 *srcStride];\
2361 const int src1= src[1 *srcStride];\
2362 const int src2= src[2 *srcStride];\
2363 const int src3= src[3 *srcStride];\
2364 const int src4= src[4 *srcStride];\
2365 const int src5= src[5 *srcStride];\
2366 const int src6= src[6 *srcStride];\
2367 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2368 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2369 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2370 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2376 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2379 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2381 src -= 2*srcStride;\
2382 for(i=0; i<h+5; i++)\
2384 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2385 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2386 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2387 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2391 tmp -= tmpStride*(h+5-2);\
2394 const int tmpB= tmp[-2*tmpStride];\
2395 const int tmpA= tmp[-1*tmpStride];\
2396 const int tmp0= tmp[0 *tmpStride];\
2397 const int tmp1= tmp[1 *tmpStride];\
2398 const int tmp2= tmp[2 *tmpStride];\
2399 const int tmp3= tmp[3 *tmpStride];\
2400 const int tmp4= tmp[4 *tmpStride];\
2401 const int tmp5= tmp[5 *tmpStride];\
2402 const int tmp6= tmp[6 *tmpStride];\
2403 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2404 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2405 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2406 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2412 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2414 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2418 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2419 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2420 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2421 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2422 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2423 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2424 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2425 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2431 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2433 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2437 const int srcB= src[-2*srcStride];\
2438 const int srcA= src[-1*srcStride];\
2439 const int src0= src[0 *srcStride];\
2440 const int src1= src[1 *srcStride];\
2441 const int src2= src[2 *srcStride];\
2442 const int src3= src[3 *srcStride];\
2443 const int src4= src[4 *srcStride];\
2444 const int src5= src[5 *srcStride];\
2445 const int src6= src[6 *srcStride];\
2446 const int src7= src[7 *srcStride];\
2447 const int src8= src[8 *srcStride];\
2448 const int src9= src[9 *srcStride];\
2449 const int src10=src[10*srcStride];\
2450 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2451 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2452 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2453 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2454 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2455 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2456 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2457 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2463 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2466 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2468 src -= 2*srcStride;\
2469 for(i=0; i<h+5; i++)\
2471 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2472 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2473 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2474 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2475 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2476 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2477 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2478 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2482 tmp -= tmpStride*(h+5-2);\
2485 const int tmpB= tmp[-2*tmpStride];\
2486 const int tmpA= tmp[-1*tmpStride];\
2487 const int tmp0= tmp[0 *tmpStride];\
2488 const int tmp1= tmp[1 *tmpStride];\
2489 const int tmp2= tmp[2 *tmpStride];\
2490 const int tmp3= tmp[3 *tmpStride];\
2491 const int tmp4= tmp[4 *tmpStride];\
2492 const int tmp5= tmp[5 *tmpStride];\
2493 const int tmp6= tmp[6 *tmpStride];\
2494 const int tmp7= tmp[7 *tmpStride];\
2495 const int tmp8= tmp[8 *tmpStride];\
2496 const int tmp9= tmp[9 *tmpStride];\
2497 const int tmp10=tmp[10*tmpStride];\
2498 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2499 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2500 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2501 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2502 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2503 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2504 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2505 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2511 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2512 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2513 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2514 src += 8*srcStride;\
2515 dst += 8*dstStride;\
2516 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2517 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2520 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2521 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2522 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2523 src += 8*srcStride;\
2524 dst += 8*dstStride;\
2525 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2526 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2529 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2530 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2531 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2532 src += 8*srcStride;\
2533 dst += 8*dstStride;\
2534 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2535 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/*
 * H264_MC(OPNAME, SIZE): generates the 16 quarter-pel motion-compensation
 * entry points OPNAME##h264_qpel<SIZE>_mcXY_c (X = horizontal qpel phase 0..3,
 * Y = vertical). mc00 is a plain copy; the other positions combine the
 * half-pel lowpass planes (h, v, hv) with pixels##SIZE##_l2 averaging per the
 * H.264 interpolation rules. `full` holds a copy with 2 rows of top context
 * and 3 of bottom (SIZE+5 rows) so the 6-tap vertical filter has its borders.
 * NOTE(review): the embedded original line numbers skip values — closing
 * braces and blank continuation lines are missing from this extract.
 */
2538 #define H264_MC(OPNAME, SIZE) \
2539 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2540 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2543 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2544 uint8_t half[SIZE*SIZE];\
2545 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2546 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2549 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2550 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2553 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2554 uint8_t half[SIZE*SIZE];\
2555 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2556 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2559 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2560 uint8_t full[SIZE*(SIZE+5)];\
2561 uint8_t * const full_mid= full + SIZE*2;\
2562 uint8_t half[SIZE*SIZE];\
2563 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2564 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2565 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2568 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2569 uint8_t full[SIZE*(SIZE+5)];\
2570 uint8_t * const full_mid= full + SIZE*2;\
2571 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2572 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2575 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2576 uint8_t full[SIZE*(SIZE+5)];\
2577 uint8_t * const full_mid= full + SIZE*2;\
2578 uint8_t half[SIZE*SIZE];\
2579 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2580 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2581 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2584 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2585 uint8_t full[SIZE*(SIZE+5)];\
2586 uint8_t * const full_mid= full + SIZE*2;\
2587 uint8_t halfH[SIZE*SIZE];\
2588 uint8_t halfV[SIZE*SIZE];\
2589 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2590 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2591 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2592 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2595 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2596 uint8_t full[SIZE*(SIZE+5)];\
2597 uint8_t * const full_mid= full + SIZE*2;\
2598 uint8_t halfH[SIZE*SIZE];\
2599 uint8_t halfV[SIZE*SIZE];\
2600 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2601 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2602 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2603 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2606 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2607 uint8_t full[SIZE*(SIZE+5)];\
2608 uint8_t * const full_mid= full + SIZE*2;\
2609 uint8_t halfH[SIZE*SIZE];\
2610 uint8_t halfV[SIZE*SIZE];\
2611 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2612 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2613 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2614 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2617 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2618 uint8_t full[SIZE*(SIZE+5)];\
2619 uint8_t * const full_mid= full + SIZE*2;\
2620 uint8_t halfH[SIZE*SIZE];\
2621 uint8_t halfV[SIZE*SIZE];\
2622 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2623 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2624 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2625 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2628 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2629 int16_t tmp[SIZE*(SIZE+5)];\
2630 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2633 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2634 int16_t tmp[SIZE*(SIZE+5)];\
2635 uint8_t halfH[SIZE*SIZE];\
2636 uint8_t halfHV[SIZE*SIZE];\
2637 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2638 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2639 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2642 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2643 int16_t tmp[SIZE*(SIZE+5)];\
2644 uint8_t halfH[SIZE*SIZE];\
2645 uint8_t halfHV[SIZE*SIZE];\
2646 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2647 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2648 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2651 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2652 uint8_t full[SIZE*(SIZE+5)];\
2653 uint8_t * const full_mid= full + SIZE*2;\
2654 int16_t tmp[SIZE*(SIZE+5)];\
2655 uint8_t halfV[SIZE*SIZE];\
2656 uint8_t halfHV[SIZE*SIZE];\
2657 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2658 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2659 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2660 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2663 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2664 uint8_t full[SIZE*(SIZE+5)];\
2665 uint8_t * const full_mid= full + SIZE*2;\
2666 int16_t tmp[SIZE*(SIZE+5)];\
2667 uint8_t halfV[SIZE*SIZE];\
2668 uint8_t halfHV[SIZE*SIZE];\
2669 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2670 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2671 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2672 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store ops for H264_LOWPASS: single-pass sums rescale with ">>5" (OP),
 * two-pass hv sums with ">>10" (OP2); both round and clip through cm[]. */
2675 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2676 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2677 #define op_put(a, b) a = cm[((b) + 16)>>5]
2678 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2679 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate the H.264 lowpass filter families for put and avg.
 * NOTE(review): original numbering jumps 2682 -> 2697 here — the H264_MC
 * instantiations and #undefs appear to be missing from this extract. */
2681 H264_LOWPASS(put_ , op_put, op2_put)
2682 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 explicit weighted prediction (clause on weighted sample prediction):
 * op_scale1 scales in-place with one weight, op_scale2 blends src and dst
 * with two weights (bi-prediction); both shift by log2_denom and clip.
 * H264_WEIGHT(W,H) unrolls per-row stores with early `continue` once the
 * row width W has been covered (W==2/4/8 guards).
 * NOTE(review): the per-pixel op_scale1/op_scale2 invocation lines and the
 * closing braces are missing from this extract (numbering gaps). */
2697 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2698 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2699 #define H264_WEIGHT(W,H) \
2700 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2702 offset <<= log2_denom; \
2703 if(log2_denom) offset += 1<<(log2_denom-1); \
2704 for(y=0; y<H; y++, block += stride){ \
2707 if(W==2) continue; \
2710 if(W==4) continue; \
2715 if(W==8) continue; \
2726 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2728 offset = ((offset + 1) | 1) << log2_denom; \
2729 for(y=0; y<H; y++, dst += stride, src += stride){ \
2732 if(W==2) continue; \
2735 if(W==4) continue; \
2740 if(W==8) continue; \
/* WMV2 horizontal half-pel filter: 4-tap (-1,9,9,-1)/16 with rounding (+8),
 * clipped through cm[]; writes an 8-wide row per iteration.
 * NOTE(review): the loop header over `h` rows and the trailing
 * dst/src stride increments are missing from this extract (numbering gaps). */
2767 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2768 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2772 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2773 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2774 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2775 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2776 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2777 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2778 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2779 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS full-pel (mc00) copy/average wrappers over the generic pixel loops;
 * exported (ff_ prefix) for use by the CAVS decoder. */
2785 #if CONFIG_CAVS_DECODER
2787 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2788 put_pixels8_c(dst, src, stride, 8);
2790 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2791 avg_pixels8_c(dst, src, stride, 8);
2793 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2794 put_pixels16_c(dst, src, stride, 16);
2796 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2797 avg_pixels16_c(dst, src, stride, 16);
2799 #endif /* CONFIG_CAVS_DECODER */
/* VC-1 full-pel (mc00) wrappers; the `rnd` parameter is unused here since a
 * straight copy/average needs no rounding control. */
2801 #if CONFIG_VC1_DECODER
2803 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2804 put_pixels8_c(dst, src, stride, 8);
2806 void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
2807 avg_pixels8_c(dst, src, stride, 8);
2809 #endif /* CONFIG_VC1_DECODER */
/* Forward declaration for the H.264 encoder DSP init (defined elsewhere). */
2812 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
/* RV40 mc33 (3/4, 3/4 position) maps onto the plain xy2 (center half-pel)
 * averaging kernels rather than a dedicated filter. */
2814 #if CONFIG_RV40_DECODER
2815 static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2816 put_pixels16_xy2_c(dst, src, stride, 16);
2818 static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2819 avg_pixels16_xy2_c(dst, src, stride, 16);
2821 static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2822 put_pixels8_xy2_c(dst, src, stride, 8);
2824 static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
2825 avg_pixels8_xy2_c(dst, src, stride, 8);
2827 #endif /* CONFIG_RV40_DECODER */
/* WMV2 vertical half-pel filter: same 4-tap (-1,9,9,-1)/16 kernel as the
 * horizontal variant, applied down a column; writes 8 rows per iteration.
 * NOTE(review): the loop header over `w` columns and the trailing src/dst
 * increments are missing from this extract (numbering gaps). */
2829 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2830 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2834 const int src_1= src[ -srcStride];
2835 const int src0 = src[0 ];
2836 const int src1 = src[ srcStride];
2837 const int src2 = src[2*srcStride];
2838 const int src3 = src[3*srcStride];
2839 const int src4 = src[4*srcStride];
2840 const int src5 = src[5*srcStride];
2841 const int src6 = src[6*srcStride];
2842 const int src7 = src[7*srcStride];
2843 const int src8 = src[8*srcStride];
2844 const int src9 = src[9*srcStride];
2845 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2846 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0  + src3) + 8)>>4];
2847 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1  + src4) + 8)>>4];
2848 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2  + src5) + 8)>>4];
2849 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3  + src6) + 8)>>4];
2850 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4  + src7) + 8)>>4];
2851 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5  + src8) + 8)>>4];
2852 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6  + src9) + 8)>>4];
/* WMV2 (mspel) 8x8 motion-compensation entry points, built from the h/v
 * lowpass filters above plus put_pixels8_l2 plane averaging.
 * NOTE(review): each function's local `half*` buffer declarations and
 * closing braces are missing from this extract (numbering gaps). */
2858 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2859 put_pixels8_c(dst, src, stride, 8);
/* (1/2, 0): average source with h-filtered half-pel plane. */
2862 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2864 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2865 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
/* (1, 0): pure horizontal half-pel filter straight into dst. */
2868 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2869 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
/* (3/2, 0): average src+1 with the h-filtered plane. */
2872 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2874 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2875 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
/* (0, 1): pure vertical half-pel filter. */
2878 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2879 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
/* (1/2, 1): h-filter 11 rows starting one above, v-filter both planes,
 * then average the V and HV planes. */
2882 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
2886 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2887 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2888 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2889 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (3/2, 1): same as mc12 but with the vertical plane taken at src+1. */
2891 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2895 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2896 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2897 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2898 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
/* (1, 1): h-filter then v-filter — the full 2-D half-pel position. */
2900 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2902 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2903 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter across a horizontal edge (filters vertically).
 * For each column x: p0,p1 above the edge, p2,p3 below; d is the gradient
 * estimate, d1 the strength-limited correction applied to p1/p2, and d2 a
 * smaller correction (clipped to |d1|) applied to the outer pixels p0/p3.
 * The compile-time if() lets the body be discarded when no H.263 codec is
 * enabled. NOTE(review): loop header over x and several assignment lines
 * are missing from this extract (numbering gaps). */
2906 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2907 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2909 const int strength= ff_h263_loop_filter_strength[qscale];
2913 int p0= src[x-2*stride];
2914 int p1= src[x-1*stride];
2915 int p2= src[x+0*stride];
2916 int p3= src[x+1*stride];
2917 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2919 if (d<-2*strength) d1= 0;
2920 else if(d<-   strength) d1=-2*strength - d;
2921 else if(d<    strength) d1= d;
2922 else if(d< 2* strength) d1= 2*strength - d;
/* Branchless clip of p1/p2 to 0..255: if bit 8 is set the value over- or
 * under-flowed, and ~(p>>31) yields 255 or 0 respectively. */
2927 if(p1&256) p1= ~(p1>>31);
2928 if(p2&256) p2= ~(p2>>31);
2930 src[x-1*stride] = p1;
2931 src[x+0*stride] = p2;
2935 d2= av_clip((p0-p3)/4, -ad1, ad1);
2937 src[x-2*stride] = p0 - d2;
2938 src[x+  stride] = p3 + d2;
/* H.263 deblocking filter across a vertical edge (filters horizontally) —
 * same arithmetic as h263_v_loop_filter_c but addressing pixels along rows.
 * NOTE(review): loop header over y and several assignment lines are missing
 * from this extract (numbering gaps). */
2943 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2944 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2946 const int strength= ff_h263_loop_filter_strength[qscale];
2950 int p0= src[y*stride-2];
2951 int p1= src[y*stride-1];
2952 int p2= src[y*stride+0];
2953 int p3= src[y*stride+1];
2954 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2956 if (d<-2*strength) d1= 0;
2957 else if(d<-   strength) d1=-2*strength - d;
2958 else if(d<    strength) d1= d;
2959 else if(d< 2* strength) d1= 2*strength - d;
/* Branchless clip of p1/p2 to 0..255 via the bit-8 overflow test. */
2964 if(p1&256) p1= ~(p1>>31);
2965 if(p2&256) p2= ~(p2>>31);
2967 src[y*stride-1] = p1;
2968 src[y*stride+0] = p2;
2972 d2= av_clip((p0-p3)/4, -ad1, ad1);
2974 src[y*stride-2] = p0 - d2;
2975 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter: separable [1 2 1]/4 smoothing of an 8x8 block using
 * a temp[8*8] working plane; the first/last rows are passed through scaled
 * by 4 so the final >>2 leaves them unchanged, then the horizontal pass
 * applies [1 2 1] with +8 rounding and >>4 normalization.
 * NOTE(review): temp[] declaration, loop headers and closing braces are
 * missing from this extract (numbering gaps). */
2980 static void h261_loop_filter_c(uint8_t *src, int stride){
2985 temp[x      ] = 4*src[x           ];
2986 temp[x + 7*8] = 4*src[x + 7*stride];
2990 xy = y * stride + x;
2992 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
2997 src[  y*stride] = (temp[  y*8] + 2)>>2;
2998 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
3000 xy = y * stride + x;
3002 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 normal (non-intra) luma deblocking (spec clause 8.7): for each of
 * 4 groups of 4 pixels across the edge, filter only when the |p0-q0|,
 * |p1-p0|, |q1-q0| thresholds pass; p1/q1 get a tc0-clipped update when the
 * secondary |p2-p0| / |q2-q0| test passes, and p0/q0 get the tc-clipped
 * delta. xstride steps across the edge, ystride along it; tc0[i] < 0 is
 * presumably the "skip this group" marker — confirm against callers.
 * NOTE(review): the tc accumulation lines and loop/closing braces are
 * missing from this extract (numbering gaps). */
3007 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3010 for( i = 0; i < 4; i++ ) {
3015 for( d = 0; d < 4; d++ ) {
3016 const int p0 = pix[-1*xstride];
3017 const int p1 = pix[-2*xstride];
3018 const int p2 = pix[-3*xstride];
3019 const int q0 = pix[0];
3020 const int q1 = pix[1*xstride];
3021 const int q2 = pix[2*xstride];
3023 if( FFABS( p0 - q0 ) < alpha &&
3024 FFABS( p1 - p0 ) < beta &&
3025 FFABS( q1 - q0 ) < beta ) {
3030 if( FFABS( p2 - p0 ) < beta ) {
3032 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3035 if( FFABS( q2 - q0 ) < beta ) {
3037 pix[   xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
3041 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3042 pix[-xstride] = av_clip_uint8( p0 + i_delta );    /* p0' */
3043 pix[0]        = av_clip_uint8( q0 - i_delta );    /* q0' */
3049 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3051 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
3053 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3055 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 luma deblocking, strong (intra, bS = 4) mode over 16 lines.
 * Chooses between the strong 4/5-tap smoothing and the weak 3-tap filter
 * depending on how flat the edge neighborhood is.
 * NOTE(review): loop-body close braces and pix advancement are elided. */
3058 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3061 for( d = 0; d < 16; d++ ) {
3062 const int p2 = pix[-3*xstride];
3063 const int p1 = pix[-2*xstride];
3064 const int p0 = pix[-1*xstride];
3066 const int q0 = pix[ 0*xstride];
3067 const int q1 = pix[ 1*xstride];
3068 const int q2 = pix[ 2*xstride];
3070 if( FFABS( p0 - q0 ) < alpha &&
3071 FFABS( p1 - p0 ) < beta &&
3072 FFABS( q1 - q0 ) < beta ) {
/* very flat edge: apply the strong filter on each side independently */
3074 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3075 if( FFABS( p2 - p0 ) < beta)
3077 const int p3 = pix[-4*xstride];
/* strong p-side: modify p0, p1, p2 with long taps */
3079 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3080 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3081 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
/* p-side not flat enough: fall back to the weak filter on p0 only */
3084 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3086 if( FFABS( q2 - q0 ) < beta)
3088 const int q3 = pix[3*xstride];
/* strong q-side: modify q0, q1, q2 (mirror of the p side) */
3090 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3091 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3092 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3095 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* edge not very flat: weak filter touches only p0/q0 */
3099 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3100 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3106 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3108 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3110 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3112 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
/* H.264 chroma deblocking, normal mode: 4 groups of 2 lines, only p0/q0 are
 * modified (chroma never touches p1/q1), clipped to the per-group tc.
 * NOTE(review): group-skip check and pix advancement lines are elided. */
3115 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3118 for( i = 0; i < 4; i++ ) {
3119 const int tc = tc0[i];
3124 for( d = 0; d < 2; d++ ) {
3125 const int p0 = pix[-1*xstride];
3126 const int p1 = pix[-2*xstride];
3127 const int q0 = pix[0];
3128 const int q1 = pix[1*xstride];
3130 if( FFABS( p0 - q0 ) < alpha &&
3131 FFABS( p1 - p0 ) < beta &&
3132 FFABS( q1 - q0 ) < beta ) {
/* same delta formula as the luma filter, clipped to +/-tc */
3134 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3136 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3137 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
3143 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3145 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3147 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3149 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 chroma deblocking, intra (strong) mode over 8 lines: p0/q0 are
 * replaced by fixed 3-tap averages, no tc clipping. */
3152 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3155 for( d = 0; d < 8; d++ ) {
3156 const int p0 = pix[-1*xstride];
3157 const int p1 = pix[-2*xstride];
3158 const int q0 = pix[0];
3159 const int q1 = pix[1*xstride];
3161 if( FFABS( p0 - q0 ) < alpha &&
3162 FFABS( p1 - p0 ) < beta &&
3163 FFABS( q1 - q0 ) < beta ) {
3165 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3166 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3171 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3173 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3175 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3177 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* 16-wide SAD (sum of absolute differences) motion-estimation comparators,
 * fully unrolled per row.  The _x2/_y2/_xy2 variants compare against
 * half-pel interpolations of pix2 using avg2/avg4 (rounding averages).
 * NOTE(review): the per-row h loop headers, pointer advancement and returns
 * are elided from this listing. */
3180 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3186 s += abs(pix1[0] - pix2[0]);
3187 s += abs(pix1[1] - pix2[1]);
3188 s += abs(pix1[2] - pix2[2]);
3189 s += abs(pix1[3] - pix2[3]);
3190 s += abs(pix1[4] - pix2[4]);
3191 s += abs(pix1[5] - pix2[5]);
3192 s += abs(pix1[6] - pix2[6]);
3193 s += abs(pix1[7] - pix2[7]);
3194 s += abs(pix1[8] - pix2[8]);
3195 s += abs(pix1[9] - pix2[9]);
3196 s += abs(pix1[10] - pix2[10]);
3197 s += abs(pix1[11] - pix2[11]);
3198 s += abs(pix1[12] - pix2[12]);
3199 s += abs(pix1[13] - pix2[13]);
3200 s += abs(pix1[14] - pix2[14]);
3201 s += abs(pix1[15] - pix2[15]);
/* SAD vs. horizontal half-pel: reference is avg of each pixel and its
 * right neighbor (reads pix2[16], i.e. one past the 16-wide row) */
3208 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3214 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3215 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3216 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3217 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3218 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3219 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3220 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3221 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3222 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3223 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3224 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3225 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3226 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3227 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3228 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3229 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD vs. vertical half-pel: reference is avg of this row and the next */
3236 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3239 uint8_t *pix3 = pix2 + line_size;
3243 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3244 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3245 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3246 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3247 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3248 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3249 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3250 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3251 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3252 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3253 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3254 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3255 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3256 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3257 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3258 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD vs. diagonal half-pel: 4-pixel rounding average of a 2x2 window */
3266 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3269 uint8_t *pix3 = pix2 + line_size;
3273 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3274 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3275 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3276 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3277 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3278 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3279 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3280 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3281 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3282 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3283 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3284 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3285 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3286 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3287 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3288 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide SAD comparators: same structure as the 16-wide family above but
 * unrolled over 8 columns.  _x2/_y2/_xy2 compare against half-pel
 * interpolations via avg2/avg4.  Loop headers/returns are elided here. */
3296 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3302 s += abs(pix1[0] - pix2[0]);
3303 s += abs(pix1[1] - pix2[1]);
3304 s += abs(pix1[2] - pix2[2]);
3305 s += abs(pix1[3] - pix2[3]);
3306 s += abs(pix1[4] - pix2[4]);
3307 s += abs(pix1[5] - pix2[5]);
3308 s += abs(pix1[6] - pix2[6]);
3309 s += abs(pix1[7] - pix2[7]);
/* horizontal half-pel (reads one past the row: pix2[8]) */
3316 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3322 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3323 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3324 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3325 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3326 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3327 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3328 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3329 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
/* vertical half-pel (averages with the next row, pix3) */
3336 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3339 uint8_t *pix3 = pix2 + line_size;
3343 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3344 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3345 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3346 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3347 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3348 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3349 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3350 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
/* diagonal half-pel (2x2 average window) */
3358 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3361 uint8_t *pix3 = pix2 + line_size;
3365 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3366 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3367 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3368 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3369 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3370 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3371 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3372 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-shaped SSE comparators: score1 is the plain sum of squared errors,
 * score2 accumulates the difference in local 2x2 gradients between the two
 * blocks (a cheap "noise/texture mismatch" term).  The final score weights
 * score2 by avctx->nsse_weight (8 when no context is supplied). */
3380 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3381 MpegEncContext *c = v;
3387 for(x=0; x<16; x++){
3388 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
/* 15 columns: each term is a 2x2 second difference (gradient energy) */
3391 for(x=0; x<15; x++){
3392 score2+= FFABS( s1[x ] - s1[x +stride]
3393 - s1[x+1] + s1[x+1+stride])
3394 -FFABS( s2[x ] - s2[x +stride]
3395 - s2[x+1] + s2[x+1+stride]);
3402 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3403 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c; the x loop headers are elided here */
3406 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3407 MpegEncContext *c = v;
3414 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3418 score2+= FFABS( s1[x ] - s1[x +stride]
3419 - s1[x+1] + s1[x+1+stride])
3420 -FFABS( s2[x ] - s2[x +stride]
3421 - s2[x+1] + s2[x+1+stride]);
3428 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3429 else return score1 + FFABS(score2)*8;
/* Trellis/rate-distortion helpers: try_8x8basis_c evaluates the weighted
 * squared error of rem[] after adding a scaled basis function (rounded
 * fixed-point shift from BASIS_SHIFT to RECON_SHIFT precision);
 * add_8x8basis_c applies that same scaled basis in place. */
3432 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3436 for(i=0; i<8*8; i++){
/* rounded right-shift: + (1<<(shift-1)) then >> shift */
3437 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3440 assert(-512<b && b<512);
/* weighted squared error, scaled down by 4 bits to keep the sum small */
3442 sum += (w*b)*(w*b)>>4;
3447 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3450 for(i=0; i<8*8; i++){
3451 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3456 * permutes an 8x8 block
3457 * @param block the block which will be permuted according to the given permutation vector
3458 * @param permutation the permutation vector
3459 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3460 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3461 * (inverse) permuted to scantable order!
3463 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3469 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: gather the nonzero coefficients (in scantable order) into a
 * temp buffer — the copy/clear lines are elided from this listing */
3471 for(i=0; i<=last; i++){
3472 const int j= scantable[i];
/* second pass: scatter each saved coefficient to its permuted position */
3477 for(i=0; i<=last; i++){
3478 const int j= scantable[i];
3479 const int perm_j= permutation[j];
3480 block[perm_j]= temp[j];
/* zero_cmp: trivial comparator that scores everything equally (body elided;
 * presumably returns 0 — confirm against the full source). */
3484 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* ff_set_cmp: fills cmp[0..5] with the comparison functions selected by
 * 'type' from the DSPContext tables.  The switch statement and remaining
 * cases are elided from this listing. */
3488 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3491 memset(cmp, 0, sizeof(void*)*6);
3499 cmp[i]= c->hadamard8_diff[i];
3505 cmp[i]= c->dct_sad[i];
3508 cmp[i]= c->dct264_sad[i];
3511 cmp[i]= c->dct_max[i];
3514 cmp[i]= c->quant_psnr[i];
3534 #if CONFIG_SNOW_ENCODER
/* unknown 'type' falls through to this diagnostic */
3543 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* zero one 8x8 DCT coefficient block */
3548 static void clear_block_c(DCTELEM *block)
3550 memset(block, 0, sizeof(DCTELEM)*64);
3554 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* zero six consecutive 8x8 blocks (one macroblock's worth) */
3556 static void clear_blocks_c(DCTELEM *blocks)
3558 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* Byte-wise addition, vectorized with SWAR: ((a&7f)+(b&7f)) ^ ((a^b)&80)
 * adds each byte lane modulo 256 without letting carries cross lanes
 * (low 7 bits added normally, the top bit recombined via XOR).
 * A scalar tail loop handles the last w % sizeof(long) bytes. */
3561 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3563 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3564 long a = *(long*)(src+i);
3565 long b = *(long*)(dst+i);
3566 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3569 dst[i+0] += src[i+0];
/* same SWAR trick, but summing two sources into dst */
3572 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3574 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3575 long a = *(long*)(src1+i);
3576 long b = *(long*)(src2+i);
3577 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3580 dst[i] = src1[i]+src2[i];
/* Byte-wise subtraction dst = src1 - src2 (mod 256 per byte) using the SWAR
 * complement of add_bytes_c: ((a|80)-(b&7f)) ^ ((a^b^80)&80) keeps borrows
 * from crossing byte lanes.  On targets without fast unaligned loads, an
 * unaligned src2 falls back to a scalar 8-at-a-time loop. */
3583 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3585 #if !HAVE_FAST_UNALIGNED
3586 if((long)src2 & (sizeof(long)-1)){
3587 for(i=0; i+7<w; i+=8){
3588 dst[i+0] = src1[i+0]-src2[i+0];
3589 dst[i+1] = src1[i+1]-src2[i+1];
3590 dst[i+2] = src1[i+2]-src2[i+2];
3591 dst[i+3] = src1[i+3]-src2[i+3];
3592 dst[i+4] = src1[i+4]-src2[i+4];
3593 dst[i+5] = src1[i+5]-src2[i+5];
3594 dst[i+6] = src1[i+6]-src2[i+6];
3595 dst[i+7] = src1[i+7]-src2[i+7];
3599 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3600 long a = *(long*)(src1+i);
3601 long b = *(long*)(src2+i);
3602 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
/* scalar tail for the remaining bytes */
3605 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV predictors.  Median prediction reconstructs/encodes each byte from
 * the median of left (l), above (src1[i]) and left+above-aboveleft (lt);
 * the left-prediction variants accumulate a running left value.
 * NOTE(review): these functions are almost entirely elided from this listing
 * (loop bodies, l/lt bookkeeping and returns are missing), so comments are
 * limited to the visible prediction expressions. */
3608 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
/* decode: predicted value + transmitted residual */
3616 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
3625 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
/* encode: same median predictor, residual written elsewhere (elided) */
3633 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
3643 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3646 for(i=0; i<w-1; i++){
3673 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard-transform building blocks used by the hadamard8_* comparators:
 * BUTTERFLY2 writes sum/difference of two inputs into two outputs,
 * BUTTERFLY1 does the same in place, BUTTERFLYA returns |x+y| + |x-y|.
 * NOTE(review): the backslash-continued bodies of BUTTERFLY2/BUTTERFLY1 are
 * elided from this listing; no comments are inserted between continuation
 * lines to avoid altering the macro bodies. */
3703 #define BUTTERFLY2(o1,o2,i1,i2) \
3707 #define BUTTERFLY1(x,y) \
3716 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD comparator: 8x8 Hadamard transform of the pixel differences
 * (src - dst), then sum of absolute transform coefficients.  Rows are
 * transformed first (horizontal butterflies), then columns, with the last
 * butterfly stage folded into the absolute-value accumulation. */
3718 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3726 //FIXME try pointer walks
/* stage 1: pairwise sums/differences of the row's difference values */
3727 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3728 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3729 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3730 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3732 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3733 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3734 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3735 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3737 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3738 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3739 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3740 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform over the row-transformed coefficients */
3744 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3745 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3746 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3747 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3749 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3750 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3751 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3752 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* final stage fused with |.| accumulation via BUTTERFLYA */
3755 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3756 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3757 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3758 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug print (inside a disabled block in the full source — confirm) */
3764 printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard transform as hadamard8_diff8x8_c but applied
 * to raw pixels (no reference subtraction); the DC term is subtracted at the
 * end so flat blocks score ~0. */
3770 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3778 //FIXME try pointer walks
/* row transform */
3779 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3780 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3781 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3782 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3784 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3785 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3786 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3787 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3789 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3790 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3791 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3792 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* column transform */
3796 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3797 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3798 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3799 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3801 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3802 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3803 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3804 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3807 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3808 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3809 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3810 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* remove the DC contribution so the score measures AC energy only */
3813 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the pixel differences (fdct call elided in
 * this listing) and sum the absolute coefficient values. */
3818 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3819 MpegEncContext * const s= (MpegEncContext *)c;
3820 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3824 s->dsp.diff_pixels(temp, src1, src2, stride);
3826 return s->dsp.sum_abs_dctelem(temp);
/* NOTE(review): the following backslash-continued lines are the body of the
 * DCT8_1D macro — an 8-point integer 1-D DCT (H.264-style butterflies with
 * >>1 / >>2 scaled taps) — whose '#define DCT8_1D' line is elided from this
 * listing.  No comments are inserted between the continuation lines. */
3831 const int s07 = SRC(0) + SRC(7);\
3832 const int s16 = SRC(1) + SRC(6);\
3833 const int s25 = SRC(2) + SRC(5);\
3834 const int s34 = SRC(3) + SRC(4);\
3835 const int a0 = s07 + s34;\
3836 const int a1 = s16 + s25;\
3837 const int a2 = s07 - s34;\
3838 const int a3 = s16 - s25;\
3839 const int d07 = SRC(0) - SRC(7);\
3840 const int d16 = SRC(1) - SRC(6);\
3841 const int d25 = SRC(2) - SRC(5);\
3842 const int d34 = SRC(3) - SRC(4);\
3843 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3844 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3845 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3846 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3848 DST(1, a4 + (a7>>2)) ;\
3849 DST(2, a2 + (a3>>1)) ;\
3850 DST(3, a5 + (a6>>2)) ;\
3852 DST(5, a6 - (a5>>2)) ;\
3853 DST(6, (a2>>1) - a3 ) ;\
3854 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: 1-D DCT8_1D over rows (SRC/DST map to dct[i][x]),
 * then over columns with DST redefined to accumulate |coefficient|. */
3857 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3858 MpegEncContext * const s= (MpegEncContext *)c;
3863 s->dsp.diff_pixels(dct[0], src1, src2, stride);
3865 #define SRC(x) dct[i][x]
3866 #define DST(x,v) dct[i][x]= v
3867 for( i = 0; i < 8; i++ )
3872 #define SRC(x) dct[x][i]
3873 #define DST(x,v) sum += FFABS(v)
3874 for( i = 0; i < 8; i++ )
/* DCT-max comparator: forward DCT of the differences, score is the largest
 * absolute coefficient (fdct call and loop header elided in this listing) */
3882 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3883 MpegEncContext * const s= (MpegEncContext *)c;
3884 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3889 s->dsp.diff_pixels(temp, src1, src2, stride);
3893 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise comparator: DCT the differences, keep an unquantized
 * copy (bak), quantize + dequantize + IDCT the working copy, and score the
 * squared error between the round-tripped and original coefficients. */
3898 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3899 MpegEncContext * const s= (MpegEncContext *)c;
3900 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3901 DCTELEM * const bak = temp+64;
3907 s->dsp.diff_pixels(temp, src1, src2, stride);
3909 memcpy(bak, temp, 64*sizeof(DCTELEM));
3911 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3912 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3913 ff_simple_idct(temp); //FIXME
3916 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion comparator: quantizes the DCT of the differences, counts
 * the VLC bits needed to code the coefficients (run/level in scantable
 * order), reconstructs via dequant + IDCT, measures SSE distortion, and
 * returns distortion + lambda-weighted bits.
 * NOTE(review): intra/inter branch headers, run/level bookkeeping and the
 * escape-coding paths are elided from this listing. */
3921 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3922 MpegEncContext * const s= (MpegEncContext *)c;
3923 const uint8_t *scantable= s->intra_scantable.permutated;
3924 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3925 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3926 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3927 int i, last, run, bits, level, distortion, start_i;
3928 const int esc_length= s->ac_esc_length;
3930 uint8_t * last_length;
/* copy both 8x8 blocks into aligned local buffers */
3934 copy_block8(lsrc1, src1, 8, stride, 8);
3935 copy_block8(lsrc2, src2, 8, stride, 8);
3937 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3939 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra path: separate DC VLC table */
3945 length = s->intra_ac_vlc_length;
3946 last_length= s->intra_ac_vlc_last_length;
3947 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3950 length = s->inter_ac_vlc_length;
3951 last_length= s->inter_ac_vlc_last_length;
/* count bits for every nonzero AC coefficient up to 'last' */
3956 for(i=start_i; i<last; i++){
3957 int j= scantable[i];
/* level fits the unified run/level table iff it is within +/-127 of bias */
3962 if((level&(~127)) == 0){
3963 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the final coefficient uses the "last" VLC table */
3972 level= temp[i] + 64;
3976 if((level&(~127)) == 0){
3977 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
3985 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3987 s->dct_unquantize_inter(s, temp, 0, s->qscale);
/* reconstruct into lsrc2 and measure 8x8 SSE against the source */
3990 s->dsp.idct_add(lsrc2, 8, temp);
3992 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
/* RD cost: distortion + bits * qscale^2 * 109/128 (lambda approximation) */
3994 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count comparator: same quantize + run/level VLC bit counting as
 * rd8x8_c, but returns only the bit cost (no reconstruction/distortion).
 * NOTE(review): branch headers, run/level bookkeeping and escape paths are
 * elided from this listing. */
3997 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3998 MpegEncContext * const s= (MpegEncContext *)c;
3999 const uint8_t *scantable= s->intra_scantable.permutated;
4000 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
4001 int i, last, run, bits, level, start_i;
4002 const int esc_length= s->ac_esc_length;
4004 uint8_t * last_length;
4008 s->dsp.diff_pixels(temp, src1, src2, stride);
4010 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
4016 length = s->intra_ac_vlc_length;
4017 last_length= s->intra_ac_vlc_last_length;
4018 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
4021 length = s->inter_ac_vlc_length;
4022 last_length= s->inter_ac_vlc_last_length;
4027 for(i=start_i; i<last; i++){
4028 int j= scantable[i];
4033 if((level&(~127)) == 0){
4034 bits+= length[UNI_AC_ENC_INDEX(run, level)];
4043 level= temp[i] + 64;
4047 if((level&(~127)) == 0){
4048 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* VSAD_INTRA(size): generates vsad_intra{8,16}_c — vertical SAD of a block
 * against its own next row (a cheap vertical-activity measure).  The macro's
 * closing lines and instantiations are elided; no comments are inserted
 * between the backslash-continued lines.  vsad16_c below compares the
 * vertical gradients of two blocks instead. */
4056 #define VSAD_INTRA(size) \
4057 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4061 for(y=1; y<h; y++){ \
4062 for(x=0; x<size; x+=4){ \
4063 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
4064 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
4074 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4079 for(x=0; x<16; x++){
4080 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Squared-error counterparts of the VSAD comparators: SQ squares its
 * argument, VSSE_INTRA(size) generates vsse_intra{8,16}_c (vertical SSE of a
 * block against its next row), and vsse16_c scores the squared vertical
 * gradient mismatch between two blocks.  Macro tails are elided; no comments
 * are inserted between the backslash-continued lines. */
4089 #define SQ(a) ((a)*(a))
4090 #define VSSE_INTRA(size) \
4091 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4095 for(y=1; y<h; y++){ \
4096 for(x=0; x<size; x+=4){ \
4097 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
4098 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
4108 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4113 for(x=0; x<16; x++){
4114 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Sum of squared differences between an int8 and an int16 vector. */
4123 static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
4127 for(i=0; i<size; i++)
4128 score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
/* WRAPPER8_16_SQ instantiations: build the 16x16 comparator from each 8x8
 * one by summing the four 8x8 quadrant scores (macro defined elsewhere). */
4132 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4133 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4134 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4136 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4138 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4139 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4140 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4141 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/* Elementwise float vector kernels.  vector_fmul_c multiplies dst by src in
 * place (the assignment line is elided in this listing); _reverse multiplies
 * against src1 walked backwards; _add fuses multiply and add. */
4143 static void vector_fmul_c(float *dst, const float *src, int len){
4145 for(i=0; i<len; i++)
4149 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
4152 for(i=0; i<len; i++)
/* src1 is expected to point at the end of its buffer (negative indexing) */
4153 dst[i] = src0[i] * src1[-i];
4156 static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
4158 for(i=0; i<len; i++)
4159 dst[i] = src0[i] * src1[i] + src2[i];
/* MDCT overlap-add windowing: walks i from -len and j from len-1 inward,
 * producing the symmetric windowed sum/difference pair (s0/s1/wi/wj loads
 * are elided in this listing). */
4162 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4167 for(i=-len, j=len-1; i<0; i++, j--) {
4172 dst[i] = s0*wj - s1*wi + add_bias;
4173 dst[j] = s0*wi + s1*wj + add_bias;
/* scale a float vector by a scalar */
4177 static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
4181 for (i = 0; i < len; i++)
4182 dst[i] = src[i] * mul;
/* Multiply src by per-group scale vectors (sv advances one entry per group
 * of 2 or 4 samples) and a global scalar 'mul'. */
4185 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
4186 const float **sv, float mul, int len)
4189 for (i = 0; i < len; i += 2, sv++) {
4190 dst[i ] = src[i ] * sv[0][0] * mul;
4191 dst[i+1] = src[i+1] * sv[0][1] * mul;
4195 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
4196 const float **sv, float mul, int len)
4199 for (i = 0; i < len; i += 4, sv++) {
4200 dst[i ] = src[i ] * sv[0][0] * mul;
4201 dst[i+1] = src[i+1] * sv[0][1] * mul;
4202 dst[i+2] = src[i+2] * sv[0][2] * mul;
4203 dst[i+3] = src[i+3] * sv[0][3] * mul;
/* same, but without a src vector: dst is the scaled sv entries directly */
4207 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
4211 for (i = 0; i < len; i += 2, sv++) {
4212 dst[i ] = sv[0][0] * mul;
4213 dst[i+1] = sv[0][1] * mul;
4217 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
4221 for (i = 0; i < len; i += 4, sv++) {
4222 dst[i ] = sv[0][0] * mul;
4223 dst[i+1] = sv[0][1] * mul;
4224 dst[i+2] = sv[0][2] * mul;
4225 dst[i+3] = sv[0][3] * mul;
/* In-place butterfly: replaces (v1[i], v2[i]) with their sum and difference
 * (the sum assignment line is elided in this listing). */
4229 static void butterflies_float_c(float *restrict v1, float *restrict v2,
4233 for (i = 0; i < len; i++) {
4234 float t = v1[i] - v2[i];
/* dot product of two float vectors (accumulation line elided) */
4240 static float scalarproduct_float_c(const float *v1, const float *v2, int len)
4245 for (i = 0; i < len; i++)
/* convert int32 samples to float with a scale factor */
4251 static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
4253 for(i=0; i<len; i++)
4254 dst[i] = src[i] * mul;
/* Branch-light float clipping on the raw IEEE-754 bit patterns, valid only
 * when min < 0 < max: for positive floats the unsigned bit pattern orders
 * like the float, and flipping the sign bit (a^(1<<31)) lets one unsigned
 * compare handle the negative side.  'mini' holds min's bits, 'maxi' max's,
 * 'maxisign' max's bits with the sign flipped. */
4257 static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
4258 uint32_t maxi, uint32_t maxisign)
4261 if(a > mini) return mini;
4262 else if((a^(1<<31)) > maxisign) return maxi;
/* clip a float vector using the bit-pattern comparison above; len is
 * processed 8 at a time (caller guarantees the multiple — confirm) */
4266 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
4268 uint32_t mini = *(uint32_t*)min;
4269 uint32_t maxi = *(uint32_t*)max;
4270 uint32_t maxisign = maxi ^ (1<<31);
4271 uint32_t *dsti = (uint32_t*)dst;
4272 const uint32_t *srci = (const uint32_t*)src;
4273 for(i=0; i<len; i+=8) {
4274 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
4275 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
4276 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
4277 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
4278 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
4279 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
4280 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
4281 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* public entry: the bit-trick path applies only when min/max straddle zero;
 * otherwise fall back to av_clipf, unrolled by 8 */
4284 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
4286 if(min < 0 && max > 0) {
4287 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
4289 for(i=0; i < len; i+=8) {
4290 dst[i ] = av_clipf(src[i ], min, max);
4291 dst[i + 1] = av_clipf(src[i + 1], min, max);
4292 dst[i + 2] = av_clipf(src[i + 2], min, max);
4293 dst[i + 3] = av_clipf(src[i + 3], min, max);
4294 dst[i + 4] = av_clipf(src[i + 4], min, max);
4295 dst[i + 5] = av_clipf(src[i + 5], min, max);
4296 dst[i + 6] = av_clipf(src[i + 6], min, max);
4297 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Fast float -> int16 conversion by bit manipulation: reads the float's raw
 * bits, derives a saturation mask from the magic constant 0x43c0ffff, and
 * biases by 0x8000 at the end.  Relies on the caller having pre-scaled/
 * biased the float into the expected range (scaling lines are elided;
 * NOTE(review): confirm the expected input range against the full source). */
4302 static av_always_inline int float_to_int16_one(const float *src){
4303 int_fast32_t tmp = *(const int32_t*)src;
/* arithmetic-shift trick: produces all-ones/all-zeros saturation mask */
4305 tmp = (0x43c0ffff - tmp)>>31;
4306 // is this faster on some gcc/cpu combinations?
4307 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4310 return tmp - 0x8000;
/* convert a float array to int16 */
4313 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4315 for(i=0; i<len; i++)
4316 dst[i] = float_to_int16_one(src+i);
/* convert + interleave multiple channels; the stereo fast path is visible,
 * the generic path interleaves channel c at stride 'channels' */
4319 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4322 for(i=0; i<len; i++){
4323 dst[2*i] = float_to_int16_one(src[0]+i);
4324 dst[2*i+1] = float_to_int16_one(src[1]+i);
4327 for(c=0; c<channels; c++)
4328 for(i=0, j=c; i<len; i++, j+=channels)
4329 dst[j] = float_to_int16_one(src[c]+i);
/* int16 dot product with per-term right shift (loop header elided). */
4333 static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
4338 res += (*v1++ * *v2++) >> shift;
/* fused dot product + multiply-accumulate: v1 += mul*v3 while the v1.v2
 * product is summed (accumulation line elided in this listing). */
4343 static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
4348 *v1++ += mul * *v3++;
/* Fixed-point DCT basis constants, 2048*sqrt(2)*cos(k*pi/16), used by the
 * WMV2 IDCT below.  NOTE(review): the W0 define the IDCT code references is
 * elided from this listing. */
4354 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4355 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4356 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4357 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4358 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4359 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4360 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* 1-D row pass of the WMV2 8-point IDCT: even/odd butterfly decomposition
 * using the W* cosine constants, with 181/256 ~ 1/sqrt(2) rotations (s1/s2)
 * and rounding by (1<<7) before the >>8 normalization. */
4362 static void wmv2_idct_row(short * b)
4365 int a0,a1,a2,a3,a4,a5,a6,a7;
/* odd-part products */
4367 a1 = W1*b[1]+W7*b[7];
4368 a7 = W7*b[1]-W1*b[7];
4369 a5 = W5*b[5]+W3*b[3];
4370 a3 = W3*b[5]-W5*b[3];
/* even-part products */
4371 a2 = W2*b[2]+W6*b[6];
4372 a6 = W6*b[2]-W2*b[6];
4373 a0 = W0*b[0]+W0*b[4];
4374 a4 = W0*b[0]-W0*b[4];
4376 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4377 s2 = (181*(a1-a5-a7+a3)+128)>>8;
/* final butterflies with rounding, output scaled down by 8 bits */
4379 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4380 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4381 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4382 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4383 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4384 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4385 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4386 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* 1-D column pass of the WMV2 IDCT (stride 8 between samples): same
 * butterfly structure as the row pass, but products keep 3 extra bits
 * (>>3 after each product) and the final normalization is >>14. */
4388 static void wmv2_idct_col(short * b)
4391 int a0,a1,a2,a3,a4,a5,a6,a7;
4392 /*step 1, with extended precision*/
4393 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4394 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4395 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4396 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4397 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4398 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4399 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4400 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
/* 181/256 ~ 1/sqrt(2) rotation of the odd terms */
4402 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4403 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4405 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4406 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4407 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4408 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4410 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4411 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4412 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4413 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 2-D WMV2 IDCT: row pass over all 8 rows, then column pass over all
 * 8 columns (loop headers elided in this listing). */
4415 void ff_wmv2_idct_c(short * block){
4419 wmv2_idct_row(block+i);
4422 wmv2_idct_col(block+i);
4425 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT + store / IDCT + accumulate wrappers pairing each IDCT with the
 * clamped put/add pixel writers */
4427 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4429 ff_wmv2_idct_c(block);
4430 put_pixels_clamped_c(block, dest, line_size);
4432 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4434 ff_wmv2_idct_c(block);
4435 add_pixels_clamped_c(block, dest, line_size);
/* reference (jref) IDCT wrappers; the j_rev_dct* calls are elided here */
4437 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4440 put_pixels_clamped_c(block, dest, line_size);
4442 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4445 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 / 2x2 / 1x1 lowres variants */
4448 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4451 put_pixels_clamped4_c(block, dest, line_size);
4453 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4456 add_pixels_clamped4_c(block, dest, line_size);
4459 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4462 put_pixels_clamped2_c(block, dest, line_size);
4464 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4467 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1: only the DC term survives, descaled by (x+4)>>3 through the crop
 * table for clamping to [0,255] */
4470 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4472 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4474 dest[0] = cm[(block[0] + 4)>>3];
4476 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4478 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4480 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4483 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4485 /* init static data */
4486 av_cold void dsputil_static_init(void)
4490 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4491 for(i=0;i<MAX_NEG_CROP;i++) {
4493 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4496 for(i=0;i<512;i++) {
4497 ff_squareTbl[i] = (i - 256) * (i - 256);
4500 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
4503 int ff_check_alignment(void){
4504 static int did_fail=0;
4505 DECLARE_ALIGNED(16, int, aligned);
4507 if((intptr_t)&aligned & 15){
4509 #if HAVE_MMX || HAVE_ALTIVEC
4510 av_log(NULL, AV_LOG_ERROR,
4511 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4512 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4513 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4514 "Do not report crashes to FFmpeg developers.\n");
4523 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4527 ff_check_alignment();
4530 if(avctx->dct_algo==FF_DCT_FASTINT) {
4531 c->fdct = fdct_ifast;
4532 c->fdct248 = fdct_ifast248;
4534 else if(avctx->dct_algo==FF_DCT_FAAN) {
4535 c->fdct = ff_faandct;
4536 c->fdct248 = ff_faandct248;
4539 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4540 c->fdct248 = ff_fdct248_islow;
4542 #endif //CONFIG_ENCODERS
4544 if(avctx->lowres==1){
4545 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4546 c->idct_put= ff_jref_idct4_put;
4547 c->idct_add= ff_jref_idct4_add;
4549 c->idct_put= ff_h264_lowres_idct_put_c;
4550 c->idct_add= ff_h264_lowres_idct_add_c;
4552 c->idct = j_rev_dct4;
4553 c->idct_permutation_type= FF_NO_IDCT_PERM;
4554 }else if(avctx->lowres==2){
4555 c->idct_put= ff_jref_idct2_put;
4556 c->idct_add= ff_jref_idct2_add;
4557 c->idct = j_rev_dct2;
4558 c->idct_permutation_type= FF_NO_IDCT_PERM;
4559 }else if(avctx->lowres==3){
4560 c->idct_put= ff_jref_idct1_put;
4561 c->idct_add= ff_jref_idct1_add;
4562 c->idct = j_rev_dct1;
4563 c->idct_permutation_type= FF_NO_IDCT_PERM;
4565 if(avctx->idct_algo==FF_IDCT_INT){
4566 c->idct_put= ff_jref_idct_put;
4567 c->idct_add= ff_jref_idct_add;
4568 c->idct = j_rev_dct;
4569 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4570 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4571 avctx->idct_algo==FF_IDCT_VP3){
4572 c->idct_put= ff_vp3_idct_put_c;
4573 c->idct_add= ff_vp3_idct_add_c;
4574 c->idct = ff_vp3_idct_c;
4575 c->idct_permutation_type= FF_NO_IDCT_PERM;
4576 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4577 c->idct_put= ff_wmv2_idct_put_c;
4578 c->idct_add= ff_wmv2_idct_add_c;
4579 c->idct = ff_wmv2_idct_c;
4580 c->idct_permutation_type= FF_NO_IDCT_PERM;
4581 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4582 c->idct_put= ff_faanidct_put;
4583 c->idct_add= ff_faanidct_add;
4584 c->idct = ff_faanidct;
4585 c->idct_permutation_type= FF_NO_IDCT_PERM;
4586 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4587 c->idct_put= ff_ea_idct_put_c;
4588 c->idct_permutation_type= FF_NO_IDCT_PERM;
4589 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4590 c->idct = ff_bink_idct_c;
4591 c->idct_add = ff_bink_idct_add_c;
4592 c->idct_put = ff_bink_idct_put_c;
4593 c->idct_permutation_type = FF_NO_IDCT_PERM;
4594 }else{ //accurate/default
4595 c->idct_put= ff_simple_idct_put;
4596 c->idct_add= ff_simple_idct_add;
4597 c->idct = ff_simple_idct;
4598 c->idct_permutation_type= FF_NO_IDCT_PERM;
4602 if (CONFIG_H264_DECODER) {
4603 c->h264_idct_add= ff_h264_idct_add_c;
4604 c->h264_idct8_add= ff_h264_idct8_add_c;
4605 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4606 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4607 c->h264_idct_add16 = ff_h264_idct_add16_c;
4608 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4609 c->h264_idct_add8 = ff_h264_idct_add8_c;
4610 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4613 c->get_pixels = get_pixels_c;
4614 c->diff_pixels = diff_pixels_c;
4615 c->put_pixels_clamped = put_pixels_clamped_c;
4616 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4617 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4618 c->add_pixels_clamped = add_pixels_clamped_c;
4619 c->add_pixels8 = add_pixels8_c;
4620 c->add_pixels4 = add_pixels4_c;
4621 c->sum_abs_dctelem = sum_abs_dctelem_c;
4624 c->clear_block = clear_block_c;
4625 c->clear_blocks = clear_blocks_c;
4626 c->pix_sum = pix_sum_c;
4627 c->pix_norm1 = pix_norm1_c;
4629 c->fill_block_tab[0] = fill_block16_c;
4630 c->fill_block_tab[1] = fill_block8_c;
4631 c->scale_block = scale_block_c;
4633 /* TODO [0] 16 [1] 8 */
4634 c->pix_abs[0][0] = pix_abs16_c;
4635 c->pix_abs[0][1] = pix_abs16_x2_c;
4636 c->pix_abs[0][2] = pix_abs16_y2_c;
4637 c->pix_abs[0][3] = pix_abs16_xy2_c;
4638 c->pix_abs[1][0] = pix_abs8_c;
4639 c->pix_abs[1][1] = pix_abs8_x2_c;
4640 c->pix_abs[1][2] = pix_abs8_y2_c;
4641 c->pix_abs[1][3] = pix_abs8_xy2_c;
4643 #define dspfunc(PFX, IDX, NUM) \
4644 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4645 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4646 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4647 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4649 dspfunc(put, 0, 16);
4650 dspfunc(put_no_rnd, 0, 16);
4652 dspfunc(put_no_rnd, 1, 8);
4656 dspfunc(avg, 0, 16);
4657 dspfunc(avg_no_rnd, 0, 16);
4659 dspfunc(avg_no_rnd, 1, 8);
4664 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4665 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4667 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4668 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4669 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4670 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4671 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4672 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4673 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4674 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4675 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4677 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4678 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4679 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4680 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4681 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4682 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4683 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4684 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4685 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4687 #define dspfunc(PFX, IDX, NUM) \
4688 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4689 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4690 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4691 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4692 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4693 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4694 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4695 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4696 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4697 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4698 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4699 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4700 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4701 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4702 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4703 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4705 dspfunc(put_qpel, 0, 16);
4706 dspfunc(put_no_rnd_qpel, 0, 16);
4708 dspfunc(avg_qpel, 0, 16);
4709 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4711 dspfunc(put_qpel, 1, 8);
4712 dspfunc(put_no_rnd_qpel, 1, 8);
4714 dspfunc(avg_qpel, 1, 8);
4715 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4717 dspfunc(put_h264_qpel, 0, 16);
4718 dspfunc(put_h264_qpel, 1, 8);
4719 dspfunc(put_h264_qpel, 2, 4);
4720 dspfunc(put_h264_qpel, 3, 2);
4721 dspfunc(avg_h264_qpel, 0, 16);
4722 dspfunc(avg_h264_qpel, 1, 8);
4723 dspfunc(avg_h264_qpel, 2, 4);
4726 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4727 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4728 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4729 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4730 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4731 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4732 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4733 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4735 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4736 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4737 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4738 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4739 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4740 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4741 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4742 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4743 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4744 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4745 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4746 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4747 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4748 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4749 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4750 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4751 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4752 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4753 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4754 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4756 c->draw_edges = draw_edges_c;
4758 #if CONFIG_CAVS_DECODER
4759 ff_cavsdsp_init(c,avctx);
4762 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4763 ff_mlp_init(c, avctx);
4765 #if CONFIG_VC1_DECODER
4766 ff_vc1dsp_init(c,avctx);
4768 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4769 ff_intrax8dsp_init(c,avctx);
4771 #if CONFIG_RV30_DECODER
4772 ff_rv30dsp_init(c,avctx);
4774 #if CONFIG_RV40_DECODER
4775 ff_rv40dsp_init(c,avctx);
4776 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4777 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4778 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4779 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4782 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4783 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4784 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4785 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4786 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4787 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4788 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4789 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4791 #define SET_CMP_FUNC(name) \
4792 c->name[0]= name ## 16_c;\
4793 c->name[1]= name ## 8x8_c;
4795 SET_CMP_FUNC(hadamard8_diff)
4796 c->hadamard8_diff[4]= hadamard8_intra16_c;
4797 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4798 SET_CMP_FUNC(dct_sad)
4799 SET_CMP_FUNC(dct_max)
4801 SET_CMP_FUNC(dct264_sad)
4803 c->sad[0]= pix_abs16_c;
4804 c->sad[1]= pix_abs8_c;
4808 SET_CMP_FUNC(quant_psnr)
4811 c->vsad[0]= vsad16_c;
4812 c->vsad[4]= vsad_intra16_c;
4813 c->vsad[5]= vsad_intra8_c;
4814 c->vsse[0]= vsse16_c;
4815 c->vsse[4]= vsse_intra16_c;
4816 c->vsse[5]= vsse_intra8_c;
4817 c->nsse[0]= nsse16_c;
4818 c->nsse[1]= nsse8_c;
4819 #if CONFIG_SNOW_ENCODER
4820 c->w53[0]= w53_16_c;
4822 c->w97[0]= w97_16_c;
4826 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4828 c->add_bytes= add_bytes_c;
4829 c->add_bytes_l2= add_bytes_l2_c;
4830 c->diff_bytes= diff_bytes_c;
4831 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4832 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4833 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4834 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4835 c->bswap_buf= bswap_buf;
4836 #if CONFIG_PNG_DECODER
4837 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4840 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4841 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4842 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4843 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4844 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4845 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4846 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4847 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4848 c->h264_loop_filter_strength= NULL;
4850 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4851 c->h263_h_loop_filter= h263_h_loop_filter_c;
4852 c->h263_v_loop_filter= h263_v_loop_filter_c;
4855 if (CONFIG_VP3_DECODER) {
4856 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4857 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4859 if (CONFIG_VP6_DECODER) {
4860 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4863 c->h261_loop_filter= h261_loop_filter_c;
4865 c->try_8x8basis= try_8x8basis_c;
4866 c->add_8x8basis= add_8x8basis_c;
4868 #if CONFIG_SNOW_DECODER
4869 c->vertical_compose97i = ff_snow_vertical_compose97i;
4870 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4871 c->inner_add_yblock = ff_snow_inner_add_yblock;
4874 #if CONFIG_VORBIS_DECODER
4875 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4877 #if CONFIG_AC3_DECODER
4878 c->ac3_downmix = ff_ac3_downmix_c;
4881 c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4883 c->vector_fmul = vector_fmul_c;
4884 c->vector_fmul_reverse = vector_fmul_reverse_c;
4885 c->vector_fmul_add = vector_fmul_add_c;
4886 c->vector_fmul_window = ff_vector_fmul_window_c;
4887 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4888 c->vector_clipf = vector_clipf_c;
4889 c->float_to_int16 = ff_float_to_int16_c;
4890 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4891 c->scalarproduct_int16 = scalarproduct_int16_c;
4892 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4893 c->scalarproduct_float = scalarproduct_float_c;
4894 c->butterflies_float = butterflies_float_c;
4895 c->vector_fmul_scalar = vector_fmul_scalar_c;
4897 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4898 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4900 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4901 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4903 c->shrink[0]= ff_img_copy_plane;
4904 c->shrink[1]= ff_shrink22;
4905 c->shrink[2]= ff_shrink44;
4906 c->shrink[3]= ff_shrink88;
4908 c->prefetch= just_return;
4910 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4911 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4913 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4914 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4915 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4916 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4917 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4918 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4919 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4920 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4921 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4923 for(i=0; i<64; i++){
4924 if(!c->put_2tap_qpel_pixels_tab[0][i])
4925 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4926 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4927 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4930 switch(c->idct_permutation_type){
4931 case FF_NO_IDCT_PERM:
4933 c->idct_permutation[i]= i;
4935 case FF_LIBMPEG2_IDCT_PERM:
4937 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4939 case FF_SIMPLE_IDCT_PERM:
4941 c->idct_permutation[i]= simple_mmx_permutation[i];
4943 case FF_TRANSPOSE_IDCT_PERM:
4945 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4947 case FF_PARTTRANS_IDCT_PERM:
4949 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4951 case FF_SSE2_IDCT_PERM:
4953 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4956 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");