3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
26 * @file libavcodec/dsputil.c
32 #include "simple_idct.h"
37 #include "mpegvideo.h"
/* Forward declarations for DSP routines implemented elsewhere in libavcodec
 * (names suggest the snow, vorbis, ac3, lpc, png, ea and bink modules —
 * NOTE(review): confirm the defining files before relying on this list). */

/* snow spatial discrete wavelet transform */
void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);

/* vorbis decoder magnitude/angle channel decoupling */
void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);

/* AC-3 channel downmix using a per-channel coefficient matrix */
void ff_ac3_downmix_c(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len);

/* autocorrelation used by LPC-based coders */
void ff_lpc_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);

/* PNG Paeth-filter prediction step */
void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);

/* Electronic Arts codec IDCT */
void ff_ea_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);

/* Bink video IDCT: in-place, add-to-destination and put-to-destination variants */
void ff_bink_idct_c (DCTELEM *block);
void ff_bink_idct_add_c(uint8_t *dest, int linesize, DCTELEM *block);
void ff_bink_idct_put_c(uint8_t *dest, int linesize, DCTELEM *block);
/* Clipping table, indexed through a +MAX_NEG_CROP-biased pointer
 * (cm = ff_cropTbl + MAX_NEG_CROP in the *_pixels_clamped functions) so that
 * out-of-range intermediate values clamp to [0,255].  Zero-initialized here;
 * filled at runtime — TODO(review): confirm where the init code lives. */
uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
/* Table accessed through a +256-biased pointer (sq = ff_squareTbl + 256) so
 * negative pixel differences index correctly; per its name and its use in the
 * sse*_c functions it holds squares.  Also filled at runtime. */
uint32_t ff_squareTbl[512] = {0, };
// The byte value 0x7f (resp. 0x80) replicated into every byte of the native
// 'unsigned long': 0x7f7f7f7f on 32-bit targets, 0x7f7f7f7f7f7f7f7f on 64-bit.
// (~0UL/255 evaluates to the 0x0101...01 byte-replication constant.)
#define pb_7f (~0UL/255 * 0x7f)
#define pb_80 (~0UL/255 * 0x80)
70 const uint8_t ff_zigzag_direct[64] = {
71 0, 1, 8, 16, 9, 2, 3, 10,
72 17, 24, 32, 25, 18, 11, 4, 5,
73 12, 19, 26, 33, 40, 48, 41, 34,
74 27, 20, 13, 6, 7, 14, 21, 28,
75 35, 42, 49, 56, 57, 50, 43, 36,
76 29, 22, 15, 23, 30, 37, 44, 51,
77 58, 59, 52, 45, 38, 31, 39, 46,
78 53, 60, 61, 54, 47, 55, 62, 63
81 /* Specific zigzag scan for 248 idct. NOTE that unlike the
82 specification, we interleave the fields */
83 const uint8_t ff_zigzag248_direct[64] = {
84 0, 8, 1, 9, 16, 24, 2, 10,
85 17, 25, 32, 40, 48, 56, 33, 41,
86 18, 26, 3, 11, 4, 12, 19, 27,
87 34, 42, 49, 57, 50, 58, 35, 43,
88 20, 28, 5, 13, 6, 14, 21, 29,
89 36, 44, 51, 59, 52, 60, 37, 45,
90 22, 30, 7, 15, 23, 31, 38, 46,
91 53, 61, 54, 62, 39, 47, 55, 63,
94 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
95 DECLARE_ALIGNED_16(uint16_t, inv_zigzag_direct16)[64];
97 const uint8_t ff_alternate_horizontal_scan[64] = {
98 0, 1, 2, 3, 8, 9, 16, 17,
99 10, 11, 4, 5, 6, 7, 15, 14,
100 13, 12, 19, 18, 24, 25, 32, 33,
101 26, 27, 20, 21, 22, 23, 28, 29,
102 30, 31, 34, 35, 40, 41, 48, 49,
103 42, 43, 36, 37, 38, 39, 44, 45,
104 46, 47, 50, 51, 56, 57, 58, 59,
105 52, 53, 54, 55, 60, 61, 62, 63,
108 const uint8_t ff_alternate_vertical_scan[64] = {
109 0, 8, 16, 24, 1, 9, 2, 10,
110 17, 25, 32, 40, 48, 56, 57, 49,
111 41, 33, 26, 18, 3, 11, 4, 12,
112 19, 27, 34, 42, 50, 58, 35, 43,
113 51, 59, 20, 28, 5, 13, 6, 14,
114 21, 29, 36, 44, 52, 60, 37, 45,
115 53, 61, 22, 30, 7, 15, 23, 31,
116 38, 46, 54, 62, 39, 47, 55, 63,
/* a*inverse[b]>>32 == a/b for all 0<=a<=16909558 && 2<=b<=256
 * for a>16909558, the result is an overestimate by less than 1 part in 1<<24 */
121 const uint32_t ff_inverse[257]={
122 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
123 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
124 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
125 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
126 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
127 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
128 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
129 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
130 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
131 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
132 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
133 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
134 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
135 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
136 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
137 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
138 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
139 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
140 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
141 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
142 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
143 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
144 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
145 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
146 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
147 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
148 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
149 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
150 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
151 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
152 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
153 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
157 /* Input permutation for the simple_idct_mmx */
158 static const uint8_t simple_mmx_permutation[64]={
159 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
160 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
161 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
162 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
163 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
164 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
165 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
166 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
169 static const uint8_t idct_sse2_row_perm[8] = {0, 4, 1, 5, 2, 6, 3, 7};
171 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
175 st->scantable= src_scantable;
179 j = src_scantable[i];
180 st->permutated[i] = permutation[j];
189 j = st->permutated[i];
191 st->raster_end[i]= end;
195 static int pix_sum_c(uint8_t * pix, int line_size)
200 for (i = 0; i < 16; i++) {
201 for (j = 0; j < 16; j += 8) {
212 pix += line_size - 16;
217 static int pix_norm1_c(uint8_t * pix, int line_size)
220 uint32_t *sq = ff_squareTbl + 256;
223 for (i = 0; i < 16; i++) {
224 for (j = 0; j < 16; j += 8) {
235 #if LONG_MAX > 2147483647
236 register uint64_t x=*(uint64_t*)pix;
238 s += sq[(x>>8)&0xff];
239 s += sq[(x>>16)&0xff];
240 s += sq[(x>>24)&0xff];
241 s += sq[(x>>32)&0xff];
242 s += sq[(x>>40)&0xff];
243 s += sq[(x>>48)&0xff];
244 s += sq[(x>>56)&0xff];
246 register uint32_t x=*(uint32_t*)pix;
248 s += sq[(x>>8)&0xff];
249 s += sq[(x>>16)&0xff];
250 s += sq[(x>>24)&0xff];
251 x=*(uint32_t*)(pix+4);
253 s += sq[(x>>8)&0xff];
254 s += sq[(x>>16)&0xff];
255 s += sq[(x>>24)&0xff];
260 pix += line_size - 16;
265 static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
268 for(i=0; i+8<=w; i+=8){
269 dst[i+0]= bswap_32(src[i+0]);
270 dst[i+1]= bswap_32(src[i+1]);
271 dst[i+2]= bswap_32(src[i+2]);
272 dst[i+3]= bswap_32(src[i+3]);
273 dst[i+4]= bswap_32(src[i+4]);
274 dst[i+5]= bswap_32(src[i+5]);
275 dst[i+6]= bswap_32(src[i+6]);
276 dst[i+7]= bswap_32(src[i+7]);
279 dst[i+0]= bswap_32(src[i+0]);
283 static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
286 uint32_t *sq = ff_squareTbl + 256;
289 for (i = 0; i < h; i++) {
290 s += sq[pix1[0] - pix2[0]];
291 s += sq[pix1[1] - pix2[1]];
292 s += sq[pix1[2] - pix2[2]];
293 s += sq[pix1[3] - pix2[3]];
300 static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
303 uint32_t *sq = ff_squareTbl + 256;
306 for (i = 0; i < h; i++) {
307 s += sq[pix1[0] - pix2[0]];
308 s += sq[pix1[1] - pix2[1]];
309 s += sq[pix1[2] - pix2[2]];
310 s += sq[pix1[3] - pix2[3]];
311 s += sq[pix1[4] - pix2[4]];
312 s += sq[pix1[5] - pix2[5]];
313 s += sq[pix1[6] - pix2[6]];
314 s += sq[pix1[7] - pix2[7]];
321 static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
324 uint32_t *sq = ff_squareTbl + 256;
327 for (i = 0; i < h; i++) {
328 s += sq[pix1[ 0] - pix2[ 0]];
329 s += sq[pix1[ 1] - pix2[ 1]];
330 s += sq[pix1[ 2] - pix2[ 2]];
331 s += sq[pix1[ 3] - pix2[ 3]];
332 s += sq[pix1[ 4] - pix2[ 4]];
333 s += sq[pix1[ 5] - pix2[ 5]];
334 s += sq[pix1[ 6] - pix2[ 6]];
335 s += sq[pix1[ 7] - pix2[ 7]];
336 s += sq[pix1[ 8] - pix2[ 8]];
337 s += sq[pix1[ 9] - pix2[ 9]];
338 s += sq[pix1[10] - pix2[10]];
339 s += sq[pix1[11] - pix2[11]];
340 s += sq[pix1[12] - pix2[12]];
341 s += sq[pix1[13] - pix2[13]];
342 s += sq[pix1[14] - pix2[14]];
343 s += sq[pix1[15] - pix2[15]];
352 #if CONFIG_SNOW_ENCODER //dwt is in snow.c
353 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
355 const int dec_count= w==8 ? 3 : 4;
358 static const int scale[2][2][4][4]={
362 {268, 239, 239, 213},
366 // 9/7 16x16 or 32x32 dec=4
367 {344, 310, 310, 280},
375 {275, 245, 245, 218},
379 // 5/3 16x16 or 32x32 dec=4
380 {352, 317, 317, 286},
388 for (i = 0; i < h; i++) {
389 for (j = 0; j < w; j+=4) {
390 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
391 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
392 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
393 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
399 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
403 for(level=0; level<dec_count; level++){
404 for(ori= level ? 1 : 0; ori<4; ori++){
405 int size= w>>(dec_count-level);
406 int sx= (ori&1) ? size : 0;
407 int stride= 32<<(dec_count-level);
408 int sy= (ori&2) ? stride>>1 : 0;
410 for(i=0; i<size; i++){
411 for(j=0; j<size; j++){
412 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
422 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
423 return w_c(v, pix1, pix2, line_size, 8, h, 1);
426 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
427 return w_c(v, pix1, pix2, line_size, 8, h, 0);
430 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
431 return w_c(v, pix1, pix2, line_size, 16, h, 1);
434 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
435 return w_c(v, pix1, pix2, line_size, 16, h, 0);
438 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
439 return w_c(v, pix1, pix2, line_size, 32, h, 1);
442 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
443 return w_c(v, pix1, pix2, line_size, 32, h, 0);
447 /* draw the edges of width 'w' of an image of size width, height */
448 //FIXME check that this is ok for mpeg4 interlaced
449 static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
451 uint8_t *ptr, *last_line;
454 last_line = buf + (height - 1) * wrap;
457 memcpy(buf - (i + 1) * wrap, buf, width);
458 memcpy(last_line + (i + 1) * wrap, last_line, width);
462 for(i=0;i<height;i++) {
463 memset(ptr - w, ptr[0], w);
464 memset(ptr + width, ptr[width-1], w);
469 memset(buf - (i + 1) * wrap - w, buf[0], w); /* top left */
470 memset(buf - (i + 1) * wrap + width, buf[width-1], w); /* top right */
memset(last_line + (i + 1) * wrap - w, last_line[0], w); /* bottom left */
memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
* Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
478 * @param buf destination buffer
479 * @param src source buffer
480 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
481 * @param block_w width of block
482 * @param block_h height of block
483 * @param src_x x coordinate of the top left sample of the block in the source buffer
484 * @param src_y y coordinate of the top left sample of the block in the source buffer
485 * @param w width of the source buffer
486 * @param h height of the source buffer
488 void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
489 int src_x, int src_y, int w, int h){
491 int start_y, start_x, end_y, end_x;
494 src+= (h-1-src_y)*linesize;
496 }else if(src_y<=-block_h){
497 src+= (1-block_h-src_y)*linesize;
503 }else if(src_x<=-block_w){
504 src+= (1-block_w-src_x);
508 start_y= FFMAX(0, -src_y);
509 start_x= FFMAX(0, -src_x);
510 end_y= FFMIN(block_h, h-src_y);
511 end_x= FFMIN(block_w, w-src_x);
513 // copy existing part
514 for(y=start_y; y<end_y; y++){
515 for(x=start_x; x<end_x; x++){
516 buf[x + y*linesize]= src[x + y*linesize];
521 for(y=0; y<start_y; y++){
522 for(x=start_x; x<end_x; x++){
523 buf[x + y*linesize]= buf[x + start_y*linesize];
528 for(y=end_y; y<block_h; y++){
529 for(x=start_x; x<end_x; x++){
530 buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
534 for(y=0; y<block_h; y++){
536 for(x=0; x<start_x; x++){
537 buf[x + y*linesize]= buf[start_x + y*linesize];
541 for(x=end_x; x<block_w; x++){
542 buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
547 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
551 /* read the pixels */
553 block[0] = pixels[0];
554 block[1] = pixels[1];
555 block[2] = pixels[2];
556 block[3] = pixels[3];
557 block[4] = pixels[4];
558 block[5] = pixels[5];
559 block[6] = pixels[6];
560 block[7] = pixels[7];
566 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
567 const uint8_t *s2, int stride){
570 /* read the pixels */
572 block[0] = s1[0] - s2[0];
573 block[1] = s1[1] - s2[1];
574 block[2] = s1[2] - s2[2];
575 block[3] = s1[3] - s2[3];
576 block[4] = s1[4] - s2[4];
577 block[5] = s1[5] - s2[5];
578 block[6] = s1[6] - s2[6];
579 block[7] = s1[7] - s2[7];
587 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
591 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
593 /* read the pixels */
595 pixels[0] = cm[block[0]];
596 pixels[1] = cm[block[1]];
597 pixels[2] = cm[block[2]];
598 pixels[3] = cm[block[3]];
599 pixels[4] = cm[block[4]];
600 pixels[5] = cm[block[5]];
601 pixels[6] = cm[block[6]];
602 pixels[7] = cm[block[7]];
609 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
613 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
615 /* read the pixels */
617 pixels[0] = cm[block[0]];
618 pixels[1] = cm[block[1]];
619 pixels[2] = cm[block[2]];
620 pixels[3] = cm[block[3]];
627 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
631 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
633 /* read the pixels */
635 pixels[0] = cm[block[0]];
636 pixels[1] = cm[block[1]];
643 static void put_signed_pixels_clamped_c(const DCTELEM *block,
644 uint8_t *restrict pixels,
649 for (i = 0; i < 8; i++) {
650 for (j = 0; j < 8; j++) {
653 else if (*block > 127)
656 *pixels = (uint8_t)(*block + 128);
660 pixels += (line_size - 8);
664 static void put_pixels_nonclamped_c(const DCTELEM *block, uint8_t *restrict pixels,
669 /* read the pixels */
671 pixels[0] = block[0];
672 pixels[1] = block[1];
673 pixels[2] = block[2];
674 pixels[3] = block[3];
675 pixels[4] = block[4];
676 pixels[5] = block[5];
677 pixels[6] = block[6];
678 pixels[7] = block[7];
685 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
689 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
691 /* read the pixels */
693 pixels[0] = cm[pixels[0] + block[0]];
694 pixels[1] = cm[pixels[1] + block[1]];
695 pixels[2] = cm[pixels[2] + block[2]];
696 pixels[3] = cm[pixels[3] + block[3]];
697 pixels[4] = cm[pixels[4] + block[4]];
698 pixels[5] = cm[pixels[5] + block[5]];
699 pixels[6] = cm[pixels[6] + block[6]];
700 pixels[7] = cm[pixels[7] + block[7]];
706 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
710 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
712 /* read the pixels */
714 pixels[0] = cm[pixels[0] + block[0]];
715 pixels[1] = cm[pixels[1] + block[1]];
716 pixels[2] = cm[pixels[2] + block[2]];
717 pixels[3] = cm[pixels[3] + block[3]];
723 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
727 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
729 /* read the pixels */
731 pixels[0] = cm[pixels[0] + block[0]];
732 pixels[1] = cm[pixels[1] + block[1]];
738 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
742 pixels[0] += block[0];
743 pixels[1] += block[1];
744 pixels[2] += block[2];
745 pixels[3] += block[3];
746 pixels[4] += block[4];
747 pixels[5] += block[5];
748 pixels[6] += block[6];
749 pixels[7] += block[7];
755 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
759 pixels[0] += block[0];
760 pixels[1] += block[1];
761 pixels[2] += block[2];
762 pixels[3] += block[3];
768 static int sum_abs_dctelem_c(DCTELEM *block)
772 sum+= FFABS(block[i]);
776 static void fill_block16_c(uint8_t *block, uint8_t value, int line_size, int h)
780 for (i = 0; i < h; i++) {
781 memset(block, value, 16);
786 static void fill_block8_c(uint8_t *block, uint8_t value, int line_size, int h)
790 for (i = 0; i < h; i++) {
791 memset(block, value, 8);
796 static void scale_block_c(const uint8_t src[64]/*align 8*/, uint8_t *dst/*align 8*/, int linesize)
799 uint16_t *dst1 = dst;
800 uint16_t *dst2 = dst + linesize;
802 for (j = 0; j < 8; j++) {
803 for (i = 0; i < 8; i++) {
804 dst1[i] = dst2[i] = src[i] * 0x0101;
814 #define PIXOP2(OPNAME, OP) \
815 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
819 OP(*((uint64_t*)block), AV_RN64(pixels));\
825 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
829 const uint64_t a= AV_RN64(pixels );\
830 const uint64_t b= AV_RN64(pixels+1);\
831 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
837 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
841 const uint64_t a= AV_RN64(pixels );\
842 const uint64_t b= AV_RN64(pixels+1);\
843 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
849 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
853 const uint64_t a= AV_RN64(pixels );\
854 const uint64_t b= AV_RN64(pixels+line_size);\
855 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
861 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
865 const uint64_t a= AV_RN64(pixels );\
866 const uint64_t b= AV_RN64(pixels+line_size);\
867 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
873 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
876 const uint64_t a= AV_RN64(pixels );\
877 const uint64_t b= AV_RN64(pixels+1);\
878 uint64_t l0= (a&0x0303030303030303ULL)\
879 + (b&0x0303030303030303ULL)\
880 + 0x0202020202020202ULL;\
881 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
882 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
886 for(i=0; i<h; i+=2){\
887 uint64_t a= AV_RN64(pixels );\
888 uint64_t b= AV_RN64(pixels+1);\
889 l1= (a&0x0303030303030303ULL)\
890 + (b&0x0303030303030303ULL);\
891 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
892 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
893 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
896 a= AV_RN64(pixels );\
897 b= AV_RN64(pixels+1);\
898 l0= (a&0x0303030303030303ULL)\
899 + (b&0x0303030303030303ULL)\
900 + 0x0202020202020202ULL;\
901 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
902 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
903 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
909 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
912 const uint64_t a= AV_RN64(pixels );\
913 const uint64_t b= AV_RN64(pixels+1);\
914 uint64_t l0= (a&0x0303030303030303ULL)\
915 + (b&0x0303030303030303ULL)\
916 + 0x0101010101010101ULL;\
917 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
918 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
922 for(i=0; i<h; i+=2){\
923 uint64_t a= AV_RN64(pixels );\
924 uint64_t b= AV_RN64(pixels+1);\
925 l1= (a&0x0303030303030303ULL)\
926 + (b&0x0303030303030303ULL);\
927 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
928 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
929 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
932 a= AV_RN64(pixels );\
933 b= AV_RN64(pixels+1);\
934 l0= (a&0x0303030303030303ULL)\
935 + (b&0x0303030303030303ULL)\
936 + 0x0101010101010101ULL;\
937 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
938 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
939 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
945 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
946 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
947 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
948 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
949 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
950 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
951 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
953 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
954 #else // 64 bit variant
956 #define PIXOP2(OPNAME, OP) \
957 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
960 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
965 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
968 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
973 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
976 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
977 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
982 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
983 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
986 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
987 int src_stride1, int src_stride2, int h){\
991 a= AV_RN32(&src1[i*src_stride1 ]);\
992 b= AV_RN32(&src2[i*src_stride2 ]);\
993 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
994 a= AV_RN32(&src1[i*src_stride1+4]);\
995 b= AV_RN32(&src2[i*src_stride2+4]);\
996 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
1000 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1001 int src_stride1, int src_stride2, int h){\
1003 for(i=0; i<h; i++){\
1005 a= AV_RN32(&src1[i*src_stride1 ]);\
1006 b= AV_RN32(&src2[i*src_stride2 ]);\
1007 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
1008 a= AV_RN32(&src1[i*src_stride1+4]);\
1009 b= AV_RN32(&src2[i*src_stride2+4]);\
1010 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
1014 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1015 int src_stride1, int src_stride2, int h){\
1017 for(i=0; i<h; i++){\
1019 a= AV_RN32(&src1[i*src_stride1 ]);\
1020 b= AV_RN32(&src2[i*src_stride2 ]);\
1021 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
1025 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1026 int src_stride1, int src_stride2, int h){\
1028 for(i=0; i<h; i++){\
1030 a= AV_RN16(&src1[i*src_stride1 ]);\
1031 b= AV_RN16(&src2[i*src_stride2 ]);\
1032 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
1036 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1037 int src_stride1, int src_stride2, int h){\
1038 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
1039 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
1042 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
1043 int src_stride1, int src_stride2, int h){\
1044 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
1045 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
1048 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1049 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1052 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1053 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1056 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1057 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1060 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1061 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1064 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1065 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1067 for(i=0; i<h; i++){\
1068 uint32_t a, b, c, d, l0, l1, h0, h1;\
1069 a= AV_RN32(&src1[i*src_stride1]);\
1070 b= AV_RN32(&src2[i*src_stride2]);\
1071 c= AV_RN32(&src3[i*src_stride3]);\
1072 d= AV_RN32(&src4[i*src_stride4]);\
1073 l0= (a&0x03030303UL)\
1076 h0= ((a&0xFCFCFCFCUL)>>2)\
1077 + ((b&0xFCFCFCFCUL)>>2);\
1078 l1= (c&0x03030303UL)\
1079 + (d&0x03030303UL);\
1080 h1= ((c&0xFCFCFCFCUL)>>2)\
1081 + ((d&0xFCFCFCFCUL)>>2);\
1082 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1083 a= AV_RN32(&src1[i*src_stride1+4]);\
1084 b= AV_RN32(&src2[i*src_stride2+4]);\
1085 c= AV_RN32(&src3[i*src_stride3+4]);\
1086 d= AV_RN32(&src4[i*src_stride4+4]);\
1087 l0= (a&0x03030303UL)\
1090 h0= ((a&0xFCFCFCFCUL)>>2)\
1091 + ((b&0xFCFCFCFCUL)>>2);\
1092 l1= (c&0x03030303UL)\
1093 + (d&0x03030303UL);\
1094 h1= ((c&0xFCFCFCFCUL)>>2)\
1095 + ((d&0xFCFCFCFCUL)>>2);\
1096 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1100 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1101 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1104 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1105 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1108 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1109 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1112 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1113 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1116 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1117 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1119 for(i=0; i<h; i++){\
1120 uint32_t a, b, c, d, l0, l1, h0, h1;\
1121 a= AV_RN32(&src1[i*src_stride1]);\
1122 b= AV_RN32(&src2[i*src_stride2]);\
1123 c= AV_RN32(&src3[i*src_stride3]);\
1124 d= AV_RN32(&src4[i*src_stride4]);\
1125 l0= (a&0x03030303UL)\
1128 h0= ((a&0xFCFCFCFCUL)>>2)\
1129 + ((b&0xFCFCFCFCUL)>>2);\
1130 l1= (c&0x03030303UL)\
1131 + (d&0x03030303UL);\
1132 h1= ((c&0xFCFCFCFCUL)>>2)\
1133 + ((d&0xFCFCFCFCUL)>>2);\
1134 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1135 a= AV_RN32(&src1[i*src_stride1+4]);\
1136 b= AV_RN32(&src2[i*src_stride2+4]);\
1137 c= AV_RN32(&src3[i*src_stride3+4]);\
1138 d= AV_RN32(&src4[i*src_stride4+4]);\
1139 l0= (a&0x03030303UL)\
1142 h0= ((a&0xFCFCFCFCUL)>>2)\
1143 + ((b&0xFCFCFCFCUL)>>2);\
1144 l1= (c&0x03030303UL)\
1145 + (d&0x03030303UL);\
1146 h1= ((c&0xFCFCFCFCUL)>>2)\
1147 + ((d&0xFCFCFCFCUL)>>2);\
1148 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1151 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1152 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1153 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1154 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1156 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1157 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1158 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1159 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1162 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1164 int i, a0, b0, a1, b1;\
1171 for(i=0; i<h; i+=2){\
1177 block[0]= (a1+a0)>>2; /* FIXME non put */\
1178 block[1]= (b1+b0)>>2;\
1188 block[0]= (a1+a0)>>2;\
1189 block[1]= (b1+b0)>>2;\
1195 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1198 const uint32_t a= AV_RN32(pixels );\
1199 const uint32_t b= AV_RN32(pixels+1);\
1200 uint32_t l0= (a&0x03030303UL)\
1203 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1204 + ((b&0xFCFCFCFCUL)>>2);\
1208 for(i=0; i<h; i+=2){\
1209 uint32_t a= AV_RN32(pixels );\
1210 uint32_t b= AV_RN32(pixels+1);\
1211 l1= (a&0x03030303UL)\
1212 + (b&0x03030303UL);\
1213 h1= ((a&0xFCFCFCFCUL)>>2)\
1214 + ((b&0xFCFCFCFCUL)>>2);\
1215 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1218 a= AV_RN32(pixels );\
1219 b= AV_RN32(pixels+1);\
1220 l0= (a&0x03030303UL)\
1223 h0= ((a&0xFCFCFCFCUL)>>2)\
1224 + ((b&0xFCFCFCFCUL)>>2);\
1225 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1231 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1234 for(j=0; j<2; j++){\
1236 const uint32_t a= AV_RN32(pixels );\
1237 const uint32_t b= AV_RN32(pixels+1);\
1238 uint32_t l0= (a&0x03030303UL)\
1241 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1242 + ((b&0xFCFCFCFCUL)>>2);\
1246 for(i=0; i<h; i+=2){\
1247 uint32_t a= AV_RN32(pixels );\
1248 uint32_t b= AV_RN32(pixels+1);\
1249 l1= (a&0x03030303UL)\
1250 + (b&0x03030303UL);\
1251 h1= ((a&0xFCFCFCFCUL)>>2)\
1252 + ((b&0xFCFCFCFCUL)>>2);\
1253 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1256 a= AV_RN32(pixels );\
1257 b= AV_RN32(pixels+1);\
1258 l0= (a&0x03030303UL)\
1261 h0= ((a&0xFCFCFCFCUL)>>2)\
1262 + ((b&0xFCFCFCFCUL)>>2);\
1263 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1267 pixels+=4-line_size*(h+1);\
1268 block +=4-line_size*h;\
1272 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1275 for(j=0; j<2; j++){\
1277 const uint32_t a= AV_RN32(pixels );\
1278 const uint32_t b= AV_RN32(pixels+1);\
1279 uint32_t l0= (a&0x03030303UL)\
1282 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1283 + ((b&0xFCFCFCFCUL)>>2);\
1287 for(i=0; i<h; i+=2){\
1288 uint32_t a= AV_RN32(pixels );\
1289 uint32_t b= AV_RN32(pixels+1);\
1290 l1= (a&0x03030303UL)\
1291 + (b&0x03030303UL);\
1292 h1= ((a&0xFCFCFCFCUL)>>2)\
1293 + ((b&0xFCFCFCFCUL)>>2);\
1294 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1297 a= AV_RN32(pixels );\
1298 b= AV_RN32(pixels+1);\
1299 l0= (a&0x03030303UL)\
1302 h0= ((a&0xFCFCFCFCUL)>>2)\
1303 + ((b&0xFCFCFCFCUL)>>2);\
1304 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1308 pixels+=4-line_size*(h+1);\
1309 block +=4-line_size*h;\
1313 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1314 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1315 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1316 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1317 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1318 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1319 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1320 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
/* Store operators plugged into the PIXOP2() template above: op_avg replaces
 * the destination word with the rounded average of the old and new packed
 * pixels (via rnd_avg32, defined elsewhere); op_put simply overwrites.
 * NOTE(review): the matching #undef / PIXOP2 instantiation lines appear to
 * be elided from this listing. */
1322 #define op_avg(a, b) a = rnd_avg32(a, b)
1324 #define op_put(a, b) a = b
/* Rounded averages of 2 and 4 byte values.  Arguments are parenthesized so
 * the macros stay correct when called with compound expressions such as
 * avg2(a|b, c) — the original unparenthesized form mis-evaluated operands
 * of lower precedence than '+'. */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
/** Thin wrapper: no-rounding average of two 16-wide sources that share one
 *  stride; forwards to the three-stride template variant (restores the
 *  closing brace elided in this listing). */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/** Thin wrapper: no-rounding average of two 8-wide sources that share one
 *  stride; forwards to the three-stride template variant (restores the
 *  closing brace elided in this listing). */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * 1/16-pel bilinear GMC (global motion compensation), 8 pixels wide.
 * Each output pixel is the bilinear blend of its four source neighbours,
 * with weights A..D derived from the fractional offsets.
 * (Restores the row loop / pointer advance elided in this listing.)
 * @param x16,y16  fractional position in 1/16-pel units (0..15)
 * @param h        number of rows
 * @param rounder  rounding constant added before the >>8 normalisation
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/*
 * Generic global motion compensation with per-pixel affine motion:
 * the source position advances by (dxx,dyx)/(dxy,dyy) increments, each
 * sample is bilinearly interpolated with 'shift'-bit fractional precision,
 * and positions outside [0,width]x[0,height] are clamped edge-wise with
 * av_clip().  NOTE(review): several structural lines (loop headers over
 * y/x, vx/vy updates, closing braces) appear elided in this listing.
 */
1365 void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1366 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1369 const int s= 1<<shift;
1379 for(x=0; x<8; x++){ //XXX FIXME optimize
1380 int src_x, src_y, frac_x, frac_y, index;
/* fractional part is taken before the integer >>shift (elided here) */
1384 frac_x= src_x&(s-1);
1385 frac_y= src_y&(s-1);
/* unsigned compare folds the (src_x >= 0 && src_x < width) range check */
1389 if((unsigned)src_x < width){
1390 if((unsigned)src_y < height){
/* fully inside: 2-D bilinear blend of the four neighbours */
1391 index= src_x + src_y*stride;
1392 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1393 + src[index +1]* frac_x )*(s-frac_y)
1394 + ( src[index+stride ]*(s-frac_x)
1395 + src[index+stride+1]* frac_x )* frac_y
/* vertically outside: clamp y, interpolate horizontally only */
1398 index= src_x + av_clip(src_y, 0, height)*stride;
1399 dst[y*stride + x]= ( ( src[index ]*(s-frac_x)
1400 + src[index +1]* frac_x )*s
/* horizontally outside: clamp x, interpolate vertically only */
1404 if((unsigned)src_y < height){
1405 index= av_clip(src_x, 0, width) + src_y*stride;
1406 dst[y*stride + x]= ( ( src[index ]*(s-frac_y)
1407 + src[index+stride ]* frac_y )*s
/* outside in both directions: nearest clamped sample, no interpolation */
1410 index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
1411 dst[y*stride + x]= src[index ];
/** Integer-position (no interpolation) tpel copy: dispatch on block width
 *  to the matching fixed-width copy routine (restores the switch braces
 *  elided in this listing). */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/** tpel 1/3-pel horizontal interpolation: dst = round(2/3*src[x] + 1/3*src[x+1]),
 *  approximated as (683*(2a+b+1))>>11.  (Restores the loop scaffolding
 *  elided in this listing.) */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** tpel 2/3-pel horizontal interpolation: dst = round(1/3*src[x] + 2/3*src[x+1]).
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** tpel 1/3-pel vertical interpolation: dst = round(2/3*src[y] + 1/3*src[y+1]).
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** tpel (1/3,1/3) diagonal interpolation: bilinear blend with weights
 *  4/3/3/2 over the 2x2 neighbourhood, scaled by 2731/2^15 ~= 1/12.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** tpel (1/3,2/3) interpolation: 2x2 bilinear blend with weights 3/2/4/3.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** tpel 2/3-pel vertical interpolation: dst = round(1/3*src[y] + 2/3*src[y+1]).
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
      }
      src += stride;
      dst += stride;
    }
}
/** tpel (2/3,1/3) interpolation: 2x2 bilinear blend with weights 3/4/2/3.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** tpel (2/3,2/3) interpolation: 2x2 bilinear blend with weights 2/3/3/4.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
      }
      src += stride;
      dst += stride;
    }
}
/** Integer-position tpel average: dispatch on block width to the matching
 *  fixed-width averaging routine (restores the switch braces elided in
 *  this listing). */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/** Averaging variant of the 1/3-pel horizontal tpel filter: the interpolated
 *  value is rounded-averaged into dst.  (Restores the loop scaffolding
 *  elided in this listing.) */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** Averaging variant of the 2/3-pel horizontal tpel filter.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** Averaging variant of the 1/3-pel vertical tpel filter.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** Averaging variant of the (1/3,1/3) diagonal tpel filter.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** Averaging variant of the (1/3,2/3) diagonal tpel filter.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** Averaging variant of the 2/3-pel vertical tpel filter.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** Averaging variant of the (2/3,1/3) diagonal tpel filter.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/** Averaging variant of the (2/3,2/3) diagonal tpel filter.
 *  (Restores the loop scaffolding elided in this listing.) */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
      for (j=0; j < width; j++) {
        dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
      }
      src += stride;
      dst += stride;
    }
}
/**
 * Generates fixed-width wrappers around the generic tpel put functions.
 * Fix: the original bodies read "void put_tpel_pixels_mcXX_c(dst, ...);",
 * where the stray "void" keyword makes each statement a local function
 * *declaration* (K&R identifier list) instead of a call, so every wrapper
 * silently did nothing.  The "void" is removed so the generic routine is
 * actually invoked with the bound width.
 */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/**
 * Generates H.264 chroma MC functions for block widths 2, 4 and 8.
 * A..D are the 2-D bilinear weights derived from the 1/8-pel fractional
 * position (x,y); their sum is 64, and OP (op_put/op_avg) performs the
 * (+32)>>6 normalisation on store.  When D==0 the motion is purely
 * horizontal or vertical, so a cheaper two-tap path with weight E=B+C and
 * step 1 or stride is used.  (Restores the if(D)/else scaffolding elided
 * in this listing.)
 */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
/* Store operators for the H264_CHROMA_MC() template: the bilinear sum 'b'
 * carries a 6-bit fixed-point scale (weights sum to 64), hence the
 * (b+32)>>6 rounding; op_avg additionally rounded-averages with the
 * existing destination pixel.  NOTE(review): the preceding #undef of the
 * earlier op_avg/op_put pair appears elided in this listing. */
1739 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1740 #define op_put(a, b) a = (((b) + 32)>>6)
1742 H264_CHROMA_MC(put_ , op_put)
1743 H264_CHROMA_MC(avg_ , op_avg)
/**
 * VC-1 no-rounding 8-wide chroma MC: bilinear blend with 1/8-pel weights
 * A..D (sum 64); "+ 32 - 4" biases the >>6 normalisation downward, giving
 * VC-1's no-rounding behaviour.  (Restores the row loop elided in this
 * listing.)
 */
static void put_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst += stride;
        src += stride;
    }
}
/**
 * VC-1 no-rounding 8-wide chroma MC, averaging variant: the interpolated
 * value (same no-rounding bias as the put variant) is rounded-averaged
 * into dst via the avg2() macro defined earlier in this file.
 * (Restores the row loop elided in this listing.)
 */
static void avg_no_rnd_vc1_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++){
        dst[0] = avg2(dst[0], ((A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6));
        dst[1] = avg2(dst[1], ((A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6));
        dst[2] = avg2(dst[2], ((A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6));
        dst[3] = avg2(dst[3], ((A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6));
        dst[4] = avg2(dst[4], ((A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6));
        dst[5] = avg2(dst[5], ((A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6));
        dst[6] = avg2(dst[6], ((A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6));
        dst[7] = avg2(dst[7], ((A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6));
        dst += stride;
        src += stride;
    }
}
1795 #define QPEL_MC(r, OPNAME, RND, OP) \
1796 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1797 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1801 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1802 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1803 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1804 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1805 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1806 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1807 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1808 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1814 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1816 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1820 const int src0= src[0*srcStride];\
1821 const int src1= src[1*srcStride];\
1822 const int src2= src[2*srcStride];\
1823 const int src3= src[3*srcStride];\
1824 const int src4= src[4*srcStride];\
1825 const int src5= src[5*srcStride];\
1826 const int src6= src[6*srcStride];\
1827 const int src7= src[7*srcStride];\
1828 const int src8= src[8*srcStride];\
1829 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1830 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1831 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1832 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1833 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1834 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1835 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1836 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1842 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1843 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1848 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1849 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1850 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1851 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1852 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1853 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1854 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1855 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1856 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1857 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1858 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1859 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1860 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1861 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1862 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1863 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1869 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1870 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1875 const int src0= src[0*srcStride];\
1876 const int src1= src[1*srcStride];\
1877 const int src2= src[2*srcStride];\
1878 const int src3= src[3*srcStride];\
1879 const int src4= src[4*srcStride];\
1880 const int src5= src[5*srcStride];\
1881 const int src6= src[6*srcStride];\
1882 const int src7= src[7*srcStride];\
1883 const int src8= src[8*srcStride];\
1884 const int src9= src[9*srcStride];\
1885 const int src10= src[10*srcStride];\
1886 const int src11= src[11*srcStride];\
1887 const int src12= src[12*srcStride];\
1888 const int src13= src[13*srcStride];\
1889 const int src14= src[14*srcStride];\
1890 const int src15= src[15*srcStride];\
1891 const int src16= src[16*srcStride];\
1892 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1893 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1894 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1895 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1896 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1897 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1898 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1899 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1900 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1901 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1902 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1903 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1904 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1905 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1906 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1907 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1913 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1914 OPNAME ## pixels8_c(dst, src, stride, 8);\
1917 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1919 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1920 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1923 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1924 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1927 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1929 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1930 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1933 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1934 uint8_t full[16*9];\
1936 copy_block9(full, src, 16, stride, 9);\
1937 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1938 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1941 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1942 uint8_t full[16*9];\
1943 copy_block9(full, src, 16, stride, 9);\
1944 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1947 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1948 uint8_t full[16*9];\
1950 copy_block9(full, src, 16, stride, 9);\
1951 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1952 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1954 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1955 uint8_t full[16*9];\
1958 uint8_t halfHV[64];\
1959 copy_block9(full, src, 16, stride, 9);\
1960 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1961 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1962 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1963 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1965 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1966 uint8_t full[16*9];\
1968 uint8_t halfHV[64];\
1969 copy_block9(full, src, 16, stride, 9);\
1970 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1971 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1972 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1973 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1975 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1976 uint8_t full[16*9];\
1979 uint8_t halfHV[64];\
1980 copy_block9(full, src, 16, stride, 9);\
1981 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1982 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1983 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1984 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1986 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1987 uint8_t full[16*9];\
1989 uint8_t halfHV[64];\
1990 copy_block9(full, src, 16, stride, 9);\
1991 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1992 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1993 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1994 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1996 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1997 uint8_t full[16*9];\
2000 uint8_t halfHV[64];\
2001 copy_block9(full, src, 16, stride, 9);\
2002 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2003 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2004 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2005 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2007 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2008 uint8_t full[16*9];\
2010 uint8_t halfHV[64];\
2011 copy_block9(full, src, 16, stride, 9);\
2012 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2013 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2014 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2015 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2017 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2018 uint8_t full[16*9];\
2021 uint8_t halfHV[64];\
2022 copy_block9(full, src, 16, stride, 9);\
2023 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
2024 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2025 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2026 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
2028 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2029 uint8_t full[16*9];\
2031 uint8_t halfHV[64];\
2032 copy_block9(full, src, 16, stride, 9);\
2033 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2034 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2035 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2036 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2038 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2040 uint8_t halfHV[64];\
2041 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2042 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2043 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
2045 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2047 uint8_t halfHV[64];\
2048 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2049 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2050 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
2052 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2053 uint8_t full[16*9];\
2056 uint8_t halfHV[64];\
2057 copy_block9(full, src, 16, stride, 9);\
2058 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2059 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
2060 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2061 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2063 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2064 uint8_t full[16*9];\
2066 copy_block9(full, src, 16, stride, 9);\
2067 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2068 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
2069 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2071 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2072 uint8_t full[16*9];\
2075 uint8_t halfHV[64];\
2076 copy_block9(full, src, 16, stride, 9);\
2077 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2078 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
2079 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
2080 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
2082 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2083 uint8_t full[16*9];\
2085 copy_block9(full, src, 16, stride, 9);\
2086 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
2087 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
2088 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2090 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2092 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
2093 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
2095 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2096 OPNAME ## pixels16_c(dst, src, stride, 16);\
2099 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2101 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2102 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2105 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2106 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2109 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2111 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2112 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2115 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2116 uint8_t full[24*17];\
2118 copy_block17(full, src, 24, stride, 17);\
2119 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2120 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2123 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2124 uint8_t full[24*17];\
2125 copy_block17(full, src, 24, stride, 17);\
2126 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2129 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2130 uint8_t full[24*17];\
2132 copy_block17(full, src, 24, stride, 17);\
2133 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2134 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2136 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2137 uint8_t full[24*17];\
2138 uint8_t halfH[272];\
2139 uint8_t halfV[256];\
2140 uint8_t halfHV[256];\
2141 copy_block17(full, src, 24, stride, 17);\
2142 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2143 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2144 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2145 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2147 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2148 uint8_t full[24*17];\
2149 uint8_t halfH[272];\
2150 uint8_t halfHV[256];\
2151 copy_block17(full, src, 24, stride, 17);\
2152 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2153 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2154 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2155 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2157 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2158 uint8_t full[24*17];\
2159 uint8_t halfH[272];\
2160 uint8_t halfV[256];\
2161 uint8_t halfHV[256];\
2162 copy_block17(full, src, 24, stride, 17);\
2163 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2164 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2165 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2166 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2168 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2169 uint8_t full[24*17];\
2170 uint8_t halfH[272];\
2171 uint8_t halfHV[256];\
2172 copy_block17(full, src, 24, stride, 17);\
2173 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2174 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2175 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2176 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2178 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2179 uint8_t full[24*17];\
2180 uint8_t halfH[272];\
2181 uint8_t halfV[256];\
2182 uint8_t halfHV[256];\
2183 copy_block17(full, src, 24, stride, 17);\
2184 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2185 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2186 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2187 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2189 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2190 uint8_t full[24*17];\
2191 uint8_t halfH[272];\
2192 uint8_t halfHV[256];\
2193 copy_block17(full, src, 24, stride, 17);\
2194 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2195 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2196 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2197 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2199 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2200 uint8_t full[24*17];\
2201 uint8_t halfH[272];\
2202 uint8_t halfV[256];\
2203 uint8_t halfHV[256];\
2204 copy_block17(full, src, 24, stride, 17);\
2205 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2206 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2207 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2208 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2210 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2211 uint8_t full[24*17];\
2212 uint8_t halfH[272];\
2213 uint8_t halfHV[256];\
2214 copy_block17(full, src, 24, stride, 17);\
2215 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2216 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2217 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2218 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2220 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2221 uint8_t halfH[272];\
2222 uint8_t halfHV[256];\
2223 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2224 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2225 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2227 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2228 uint8_t halfH[272];\
2229 uint8_t halfHV[256];\
2230 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2231 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2232 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2234 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2235 uint8_t full[24*17];\
2236 uint8_t halfH[272];\
2237 uint8_t halfV[256];\
2238 uint8_t halfHV[256];\
2239 copy_block17(full, src, 24, stride, 17);\
2240 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2241 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2242 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2243 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2245 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2246 uint8_t full[24*17];\
2247 uint8_t halfH[272];\
2248 copy_block17(full, src, 24, stride, 17);\
2249 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2250 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2251 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2253 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2254 uint8_t full[24*17];\
2255 uint8_t halfH[272];\
2256 uint8_t halfV[256];\
2257 uint8_t halfHV[256];\
2258 copy_block17(full, src, 24, stride, 17);\
2259 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2260 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2261 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2262 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2264 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2265 uint8_t full[24*17];\
2266 uint8_t halfH[272];\
2267 copy_block17(full, src, 24, stride, 17);\
2268 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2269 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2270 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2272 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2273 uint8_t halfH[272];\
2274 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2275 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2278 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2279 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2280 #define op_put(a, b) a = cm[((b) + 16)>>5]
2281 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
2283 QPEL_MC(0, put_ , _ , op_put)
2284 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2285 QPEL_MC(0, avg_ , _ , op_avg)
2286 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2288 #undef op_avg_no_rnd
2290 #undef op_put_no_rnd
2293 #define H264_LOWPASS(OPNAME, OP, OP2) \
2294 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2296 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2300 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2301 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2307 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2309 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2313 const int srcB= src[-2*srcStride];\
2314 const int srcA= src[-1*srcStride];\
2315 const int src0= src[0 *srcStride];\
2316 const int src1= src[1 *srcStride];\
2317 const int src2= src[2 *srcStride];\
2318 const int src3= src[3 *srcStride];\
2319 const int src4= src[4 *srcStride];\
2320 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2321 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2327 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2330 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2332 src -= 2*srcStride;\
2333 for(i=0; i<h+5; i++)\
2335 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2336 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2340 tmp -= tmpStride*(h+5-2);\
2343 const int tmpB= tmp[-2*tmpStride];\
2344 const int tmpA= tmp[-1*tmpStride];\
2345 const int tmp0= tmp[0 *tmpStride];\
2346 const int tmp1= tmp[1 *tmpStride];\
2347 const int tmp2= tmp[2 *tmpStride];\
2348 const int tmp3= tmp[3 *tmpStride];\
2349 const int tmp4= tmp[4 *tmpStride];\
2350 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2351 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2356 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2358 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2362 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2363 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2364 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2365 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2371 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2373 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2377 const int srcB= src[-2*srcStride];\
2378 const int srcA= src[-1*srcStride];\
2379 const int src0= src[0 *srcStride];\
2380 const int src1= src[1 *srcStride];\
2381 const int src2= src[2 *srcStride];\
2382 const int src3= src[3 *srcStride];\
2383 const int src4= src[4 *srcStride];\
2384 const int src5= src[5 *srcStride];\
2385 const int src6= src[6 *srcStride];\
2386 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2387 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2388 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2389 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2395 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2398 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2400 src -= 2*srcStride;\
2401 for(i=0; i<h+5; i++)\
2403 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2404 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2405 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2406 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2410 tmp -= tmpStride*(h+5-2);\
2413 const int tmpB= tmp[-2*tmpStride];\
2414 const int tmpA= tmp[-1*tmpStride];\
2415 const int tmp0= tmp[0 *tmpStride];\
2416 const int tmp1= tmp[1 *tmpStride];\
2417 const int tmp2= tmp[2 *tmpStride];\
2418 const int tmp3= tmp[3 *tmpStride];\
2419 const int tmp4= tmp[4 *tmpStride];\
2420 const int tmp5= tmp[5 *tmpStride];\
2421 const int tmp6= tmp[6 *tmpStride];\
2422 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2423 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2424 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2425 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2431 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2433 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2437 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2438 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2439 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2440 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2441 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2442 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2443 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2444 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2450 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2452 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2456 const int srcB= src[-2*srcStride];\
2457 const int srcA= src[-1*srcStride];\
2458 const int src0= src[0 *srcStride];\
2459 const int src1= src[1 *srcStride];\
2460 const int src2= src[2 *srcStride];\
2461 const int src3= src[3 *srcStride];\
2462 const int src4= src[4 *srcStride];\
2463 const int src5= src[5 *srcStride];\
2464 const int src6= src[6 *srcStride];\
2465 const int src7= src[7 *srcStride];\
2466 const int src8= src[8 *srcStride];\
2467 const int src9= src[9 *srcStride];\
2468 const int src10=src[10*srcStride];\
2469 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2470 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2471 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2472 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2473 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2474 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2475 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2476 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2482 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2485 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2487 src -= 2*srcStride;\
2488 for(i=0; i<h+5; i++)\
2490 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2491 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2492 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2493 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2494 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2495 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2496 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2497 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2501 tmp -= tmpStride*(h+5-2);\
2504 const int tmpB= tmp[-2*tmpStride];\
2505 const int tmpA= tmp[-1*tmpStride];\
2506 const int tmp0= tmp[0 *tmpStride];\
2507 const int tmp1= tmp[1 *tmpStride];\
2508 const int tmp2= tmp[2 *tmpStride];\
2509 const int tmp3= tmp[3 *tmpStride];\
2510 const int tmp4= tmp[4 *tmpStride];\
2511 const int tmp5= tmp[5 *tmpStride];\
2512 const int tmp6= tmp[6 *tmpStride];\
2513 const int tmp7= tmp[7 *tmpStride];\
2514 const int tmp8= tmp[8 *tmpStride];\
2515 const int tmp9= tmp[9 *tmpStride];\
2516 const int tmp10=tmp[10*tmpStride];\
2517 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2518 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2519 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2520 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2521 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2522 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2523 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2524 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
2530 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2531 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2532 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2533 src += 8*srcStride;\
2534 dst += 8*dstStride;\
2535 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2536 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2539 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2540 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2541 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2542 src += 8*srcStride;\
2543 dst += 8*dstStride;\
2544 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2545 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2548 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2549 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2550 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2551 src += 8*srcStride;\
2552 dst += 8*dstStride;\
2553 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2554 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2557 #define H264_MC(OPNAME, SIZE) \
2558 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2559 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2562 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2563 uint8_t half[SIZE*SIZE];\
2564 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2565 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2568 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2569 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2572 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2573 uint8_t half[SIZE*SIZE];\
2574 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2575 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2578 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2579 uint8_t full[SIZE*(SIZE+5)];\
2580 uint8_t * const full_mid= full + SIZE*2;\
2581 uint8_t half[SIZE*SIZE];\
2582 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2583 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2584 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2587 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2588 uint8_t full[SIZE*(SIZE+5)];\
2589 uint8_t * const full_mid= full + SIZE*2;\
2590 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2591 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2594 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2595 uint8_t full[SIZE*(SIZE+5)];\
2596 uint8_t * const full_mid= full + SIZE*2;\
2597 uint8_t half[SIZE*SIZE];\
2598 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2599 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2600 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2603 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2604 uint8_t full[SIZE*(SIZE+5)];\
2605 uint8_t * const full_mid= full + SIZE*2;\
2606 uint8_t halfH[SIZE*SIZE];\
2607 uint8_t halfV[SIZE*SIZE];\
2608 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2609 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2610 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2611 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2614 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2615 uint8_t full[SIZE*(SIZE+5)];\
2616 uint8_t * const full_mid= full + SIZE*2;\
2617 uint8_t halfH[SIZE*SIZE];\
2618 uint8_t halfV[SIZE*SIZE];\
2619 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2620 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2621 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2622 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2625 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2626 uint8_t full[SIZE*(SIZE+5)];\
2627 uint8_t * const full_mid= full + SIZE*2;\
2628 uint8_t halfH[SIZE*SIZE];\
2629 uint8_t halfV[SIZE*SIZE];\
2630 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2631 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2632 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2633 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2636 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2637 uint8_t full[SIZE*(SIZE+5)];\
2638 uint8_t * const full_mid= full + SIZE*2;\
2639 uint8_t halfH[SIZE*SIZE];\
2640 uint8_t halfV[SIZE*SIZE];\
2641 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2642 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2643 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2644 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2647 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2648 int16_t tmp[SIZE*(SIZE+5)];\
2649 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2652 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2653 int16_t tmp[SIZE*(SIZE+5)];\
2654 uint8_t halfH[SIZE*SIZE];\
2655 uint8_t halfHV[SIZE*SIZE];\
2656 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2657 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2658 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2661 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2662 int16_t tmp[SIZE*(SIZE+5)];\
2663 uint8_t halfH[SIZE*SIZE];\
2664 uint8_t halfHV[SIZE*SIZE];\
2665 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2666 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2667 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2670 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2671 uint8_t full[SIZE*(SIZE+5)];\
2672 uint8_t * const full_mid= full + SIZE*2;\
2673 int16_t tmp[SIZE*(SIZE+5)];\
2674 uint8_t halfV[SIZE*SIZE];\
2675 uint8_t halfHV[SIZE*SIZE];\
2676 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2677 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2678 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2679 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2682 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2683 uint8_t full[SIZE*(SIZE+5)];\
2684 uint8_t * const full_mid= full + SIZE*2;\
2685 int16_t tmp[SIZE*(SIZE+5)];\
2686 uint8_t halfV[SIZE*SIZE];\
2687 uint8_t halfHV[SIZE*SIZE];\
2688 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2689 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2690 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2691 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2694 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2695 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2696 #define op_put(a, b) a = cm[((b) + 16)>>5]
2697 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2698 #define op2_put(a, b) a = cm[((b) + 512)>>10]
2700 H264_LOWPASS(put_ , op_put, op2_put)
2701 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* Weighted-prediction store operators used by the H264_WEIGHT template:
 * op_scale1 = explicit unidirectional weighting of one block,
 * op_scale2 = bidirectional weighting of src into dst (one extra shift). */
#define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
#define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2718 #define H264_WEIGHT(W,H) \
2719 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2721 offset <<= log2_denom; \
2722 if(log2_denom) offset += 1<<(log2_denom-1); \
2723 for(y=0; y<H; y++, block += stride){ \
2726 if(W==2) continue; \
2729 if(W==4) continue; \
2734 if(W==8) continue; \
2745 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2747 offset = ((offset + 1) | 1) << log2_denom; \
2748 for(y=0; y<H; y++, dst += stride, src += stride){ \
2751 if(W==2) continue; \
2754 if(W==4) continue; \
2759 if(W==8) continue; \
2786 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2787 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2791 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2792 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2793 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2794 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2795 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2796 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2797 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2798 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
2804 #if CONFIG_CAVS_DECODER
2806 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
/* Full-pel (mc00) CAVS motion compensation: a plain copy/average of the
 * 8x8 or 16x16 block, delegated to the generic pixel helpers. */
void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_c(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_c(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_c(dst, src, stride, 16);
}
2820 #endif /* CONFIG_CAVS_DECODER */
2822 void ff_mlp_init(DSPContext* c, AVCodecContext *avctx);
2824 #if CONFIG_VC1_DECODER
2826 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
/* Full-pel (mc00) VC-1 motion compensation: a plain 8x8 copy/average.
 * rnd is part of the common mspel signature but unused for full-pel. */
void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    put_pixels8_c(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
    avg_pixels8_c(dst, src, stride, 8);
}
2834 #endif /* CONFIG_VC1_DECODER */
2836 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2839 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
2841 #if CONFIG_RV30_DECODER
2842 void ff_rv30dsp_init(DSPContext* c, AVCodecContext *avctx);
2843 #endif /* CONFIG_RV30_DECODER */
2845 #if CONFIG_RV40_DECODER
/* RV40 (3/4, 3/4) sub-pel position: implemented as the plain center
 * half-pel average (xy2), matching the RV40 bilinear chroma-style rule. */
static void put_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels16_xy2_c(dst, src, stride, 16);
}
static void avg_rv40_qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels16_xy2_c(dst, src, stride, 16);
}
static void put_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_xy2_c(dst, src, stride, 8);
}
static void avg_rv40_qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){
    avg_pixels8_xy2_c(dst, src, stride, 8);
}
2859 void ff_rv40dsp_init(DSPContext* c, AVCodecContext *avctx);
2860 #endif /* CONFIG_RV40_DECODER */
2862 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2863 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2867 const int src_1= src[ -srcStride];
2868 const int src0 = src[0 ];
2869 const int src1 = src[ srcStride];
2870 const int src2 = src[2*srcStride];
2871 const int src3 = src[3*srcStride];
2872 const int src4 = src[4*srcStride];
2873 const int src5 = src[5*srcStride];
2874 const int src6 = src[6*srcStride];
2875 const int src7 = src[7*srcStride];
2876 const int src8 = src[8*srcStride];
2877 const int src9 = src[9*srcStride];
2878 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2879 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2880 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2881 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2882 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2883 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2884 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2885 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion compensation for the sub-pel positions it uses.
 * mcXY naming: X = horizontal half-pel phase, Y = vertical half-pel phase.
 * The 2-D positions filter an 11-row window (one row above, two below)
 * and blend the vertical-only and H-then-V results with pixels8_l2. */
static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
    put_pixels8_c(dst, src, stride, 8);
}

static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    /* quarter-pel left: average of full-pel and horizontal half-pel */
    put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
}

static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t half[64];
    wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
    /* quarter-pel right: average of src+1 and horizontal half-pel */
    put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
}

static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
    wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
}

static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    uint8_t halfV[64];
    uint8_t halfHV[64];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
    wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
    put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
}

static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
    uint8_t halfH[88];
    wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
    wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
}
2939 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2940 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2942 const int strength= ff_h263_loop_filter_strength[qscale];
2946 int p0= src[x-2*stride];
2947 int p1= src[x-1*stride];
2948 int p2= src[x+0*stride];
2949 int p3= src[x+1*stride];
2950 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2952 if (d<-2*strength) d1= 0;
2953 else if(d<- strength) d1=-2*strength - d;
2954 else if(d< strength) d1= d;
2955 else if(d< 2*strength) d1= 2*strength - d;
2960 if(p1&256) p1= ~(p1>>31);
2961 if(p2&256) p2= ~(p2>>31);
2963 src[x-1*stride] = p1;
2964 src[x+0*stride] = p2;
2968 d2= av_clip((p0-p3)/4, -ad1, ad1);
2970 src[x-2*stride] = p0 - d2;
2971 src[x+ stride] = p3 + d2;
2976 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2977 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2979 const int strength= ff_h263_loop_filter_strength[qscale];
2983 int p0= src[y*stride-2];
2984 int p1= src[y*stride-1];
2985 int p2= src[y*stride+0];
2986 int p3= src[y*stride+1];
2987 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2989 if (d<-2*strength) d1= 0;
2990 else if(d<- strength) d1=-2*strength - d;
2991 else if(d< strength) d1= d;
2992 else if(d< 2*strength) d1= 2*strength - d;
2997 if(p1&256) p1= ~(p1>>31);
2998 if(p2&256) p2= ~(p2>>31);
3000 src[y*stride-1] = p1;
3001 src[y*stride+0] = p2;
3005 d2= av_clip((p0-p3)/4, -ad1, ad1);
3007 src[y*stride-2] = p0 - d2;
3008 src[y*stride+1] = p3 + d2;
/**
 * H.261 in-loop filter: separable 1-2-1 smoothing over an 8x8 block,
 * with the border rows/columns passed through unfiltered.
 * First pass filters vertically into temp[] (border rows scaled by 4 to
 * keep a common fixed-point scale), second pass filters horizontally and
 * renormalizes with round-to-nearest.
 */
static void h261_loop_filter_c(uint8_t *src, int stride){
    int x,y,xy,yz;
    int temp[64];

    /* vertical pass: rows 0 and 7 are copied (x4), rows 1..6 are 1-2-1 filtered */
    for(x=0; x<8; x++){
        temp[x      ] = 4*src[x           ];
        temp[x + 7*8] = 4*src[x + 7*stride];
    }
    for(y=1; y<7; y++){
        for(x=0; x<8; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
        }
    }

    /* horizontal pass: columns 0 and 7 renormalized directly, 1..6 filtered */
    for(y=0; y<8; y++){
        src[  y*stride] = (temp[  y*8] + 2)>>2;
        src[7+y*stride] = (temp[7+y*8] + 2)>>2;
        for(x=1; x<7; x++){
            xy = y * stride + x;
            yz = y * 8 + x;
            src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
        }
    }
}
/* H.264 normal (non-intra) luma deblocking core. xstride steps across the
 * edge, ystride along it; the v/h wrappers below swap them. tc0 holds the
 * per-4-pixel-group clipping thresholds from the standard's table. */
3040 static av_always_inline av_flatten void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3043 for( i = 0; i < 4; i++ ) {
3048 for( d = 0; d < 4; d++ ) {
3049 const int p0 = pix[-1*xstride];
3050 const int p1 = pix[-2*xstride];
3051 const int p2 = pix[-3*xstride];
3052 const int q0 = pix[0];
3053 const int q1 = pix[1*xstride];
3054 const int q2 = pix[2*xstride];
/* Filter-on condition (8.7.2.2 style edge activity test). */
3056 if( FFABS( p0 - q0 ) < alpha &&
3057 FFABS( p1 - p0 ) < beta &&
3058 FFABS( q1 - q0 ) < beta ) {
/* Optional p1/q1 adjustment when the respective side is smooth. */
3063 if( FFABS( p2 - p0 ) < beta ) {
3065 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
3068 if( FFABS( q2 - q0 ) < beta ) {
3070 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
/* Main delta, clipped to +-tc (tc widened when p1/q1 were filtered). */
3074 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3075 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
3076 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Vertical-edge wrapper: walk down with 'stride', filter across columns. */
3082 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3084 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
/* Horizontal-edge wrapper: filter across rows. */
3086 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3088 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra, bS=4) luma deblocking core over a 16-sample edge. */
3091 static av_always_inline av_flatten void h264_loop_filter_luma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3094 for( d = 0; d < 16; d++ ) {
3095 const int p2 = pix[-3*xstride];
3096 const int p1 = pix[-2*xstride];
3097 const int p0 = pix[-1*xstride];
3099 const int q0 = pix[ 0*xstride];
3100 const int q1 = pix[ 1*xstride];
3101 const int q2 = pix[ 2*xstride];
3103 if( FFABS( p0 - q0 ) < alpha &&
3104 FFABS( p1 - p0 ) < beta &&
3105 FFABS( q1 - q0 ) < beta ) {
/* Extra-flat region: use the strong 4/5-tap filters on three pixels
 * per side; otherwise fall back to the weak 3-tap filter below. */
3107 if(FFABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
3108 if( FFABS( p2 - p0 ) < beta)
3110 const int p3 = pix[-4*xstride];
3112 pix[-1*xstride] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
3113 pix[-2*xstride] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
3114 pix[-3*xstride] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
3117 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3119 if( FFABS( q2 - q0 ) < beta)
3121 const int q3 = pix[3*xstride];
3123 pix[0*xstride] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
3124 pix[1*xstride] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
3125 pix[2*xstride] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
3128 pix[0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
/* Weak filter: only p0/q0 are modified. */
3132 pix[-1*xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
3133 pix[ 0*xstride] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
3139 static void h264_v_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3141 h264_loop_filter_luma_intra_c(pix, stride, 1, alpha, beta);
3143 static void h264_h_loop_filter_luma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3145 h264_loop_filter_luma_intra_c(pix, 1, stride, alpha, beta);
/* H.264 normal chroma deblocking core: like the luma filter but only one
 * pixel on each side of the edge is modified, 2 samples per tc0 group. */
3148 static av_always_inline av_flatten void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
3151 for( i = 0; i < 4; i++ ) {
3152 const int tc = tc0[i];
3157 for( d = 0; d < 2; d++ ) {
3158 const int p0 = pix[-1*xstride];
3159 const int p1 = pix[-2*xstride];
3160 const int q0 = pix[0];
3161 const int q1 = pix[1*xstride];
3163 if( FFABS( p0 - q0 ) < alpha &&
3164 FFABS( p1 - p0 ) < beta &&
3165 FFABS( q1 - q0 ) < beta ) {
3167 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
3169 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
3170 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
3176 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3178 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
3180 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
3182 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 strong (intra) chroma deblocking: fixed 3-tap filter on p0/q0,
 * no tc clipping, over an 8-sample edge. */
3185 static av_always_inline av_flatten void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3188 for( d = 0; d < 8; d++ ) {
3189 const int p0 = pix[-1*xstride];
3190 const int p1 = pix[-2*xstride];
3191 const int q0 = pix[0];
3192 const int q1 = pix[1*xstride];
3194 if( FFABS( p0 - q0 ) < alpha &&
3195 FFABS( p1 - p0 ) < beta &&
3196 FFABS( q1 - q0 ) < beta ) {
3198 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3199 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
3204 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3206 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3208 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3210 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* Sum of absolute differences (SAD) of a 16-wide block, h rows; the
 * motion-estimation comparison functions. The per-row unrolling is
 * deliberate in this hot path. 'v' is an unused context pointer kept for
 * the me_cmp_func signature. */
3213 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3219 s += abs(pix1[0] - pix2[0]);
3220 s += abs(pix1[1] - pix2[1]);
3221 s += abs(pix1[2] - pix2[2]);
3222 s += abs(pix1[3] - pix2[3]);
3223 s += abs(pix1[4] - pix2[4]);
3224 s += abs(pix1[5] - pix2[5]);
3225 s += abs(pix1[6] - pix2[6]);
3226 s += abs(pix1[7] - pix2[7]);
3227 s += abs(pix1[8] - pix2[8]);
3228 s += abs(pix1[9] - pix2[9]);
3229 s += abs(pix1[10] - pix2[10]);
3230 s += abs(pix1[11] - pix2[11]);
3231 s += abs(pix1[12] - pix2[12]);
3232 s += abs(pix1[13] - pix2[13]);
3233 s += abs(pix1[14] - pix2[14]);
3234 s += abs(pix1[15] - pix2[15]);
/* SAD against the horizontal half-pel interpolation of pix2 (avg2 of
 * each pixel and its right neighbour; reads pix2[16]). */
3241 static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3247 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3248 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3249 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3250 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3251 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3252 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3253 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3254 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3255 s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
3256 s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
3257 s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
3258 s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
3259 s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
3260 s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
3261 s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
3262 s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
/* SAD against the vertical half-pel interpolation (avg2 of pix2 and the
 * row below, tracked via pix3). */
3269 static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3272 uint8_t *pix3 = pix2 + line_size;
3276 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3277 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3278 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3279 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3280 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3281 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3282 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3283 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3284 s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
3285 s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
3286 s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
3287 s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
3288 s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
3289 s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
3290 s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
3291 s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
/* SAD against the 2D half-pel interpolation (4-tap avg4 of the 2x2
 * neighbourhood; reads one extra column and row). */
3299 static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3302 uint8_t *pix3 = pix2 + line_size;
3306 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3307 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3308 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3309 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3310 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3311 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3312 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3313 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
3314 s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
3315 s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
3316 s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
3317 s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
3318 s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
3319 s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
3320 s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
3321 s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
/* 8-wide variants of the SAD functions above: integer, horizontal
 * half-pel, vertical half-pel, and 2D half-pel respectively. */
3329 static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3335 s += abs(pix1[0] - pix2[0]);
3336 s += abs(pix1[1] - pix2[1]);
3337 s += abs(pix1[2] - pix2[2]);
3338 s += abs(pix1[3] - pix2[3]);
3339 s += abs(pix1[4] - pix2[4]);
3340 s += abs(pix1[5] - pix2[5]);
3341 s += abs(pix1[6] - pix2[6]);
3342 s += abs(pix1[7] - pix2[7]);
3349 static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3355 s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
3356 s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
3357 s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
3358 s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
3359 s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
3360 s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
3361 s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
3362 s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
3369 static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3372 uint8_t *pix3 = pix2 + line_size;
3376 s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
3377 s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
3378 s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
3379 s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
3380 s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
3381 s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
3382 s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
3383 s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
3391 static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3394 uint8_t *pix3 = pix2 + line_size;
3398 s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
3399 s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
3400 s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
3401 s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
3402 s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
3403 s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
3404 s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
3405 s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
/* Noise-shaped SSE for 16-wide blocks: plain SSE (score1) plus a weighted
 * difference of the horizontal/vertical gradients of the two blocks
 * (score2), so blocks that merely differ in noise texture are penalized
 * less. Weight comes from avctx->nsse_weight, defaulting to 8 when no
 * context is supplied. */
3413 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3414 MpegEncContext *c = v;
3420 for(x=0; x<16; x++){
3421 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3424 for(x=0; x<15; x++){
3425 score2+= FFABS( s1[x ] - s1[x +stride]
3426 - s1[x+1] + s1[x+1+stride])
3427 -FFABS( s2[x ] - s2[x +stride]
3428 - s2[x+1] + s2[x+1+stride]);
3435 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3436 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c above. */
3439 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3440 MpegEncContext *c = v;
3447 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3451 score2+= FFABS( s1[x ] - s1[x +stride]
3452 - s1[x+1] + s1[x+1+stride])
3453 -FFABS( s2[x ] - s2[x +stride]
3454 - s2[x+1] + s2[x+1+stride]);
3461 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3462 else return score1 + FFABS(score2)*8;
/* Trellis helper: evaluate the weighted squared error that would remain
 * if 'basis' scaled by 'scale' were added to the residual 'rem'. */
3465 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3469 for(i=0; i<8*8; i++){
/* Rescale basis from BASIS_SHIFT to RECON_SHIFT precision, rounded. */
3470 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3473 assert(-512<b && b<512);
3475 sum += (w*b)*(w*b)>>4;
/* Commit the same scaled basis into the residual in place. */
3480 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3483 for(i=0; i<8*8; i++){
3484 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
3489 * permutes an 8x8 block.
3490 * @param block the block which will be permuted according to the given permutation vector
3491 * @param permutation the permutation vector
3492 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3493 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3494 * (inverse) permutated to scantable order!
3496 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3502 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* First pass: copy the (at most last+1) nonzero coefficients aside,
 * NOTE(review): presumably into a local 'temp' declared on a missing line. */
3504 for(i=0; i<=last; i++){
3505 const int j= scantable[i];
/* Second pass: scatter them back through the permutation vector. */
3510 for(i=0; i<=last; i++){
3511 const int j= scantable[i];
3512 const int perm_j= permutation[j];
3513 block[perm_j]= temp[j];
/* Dummy comparison function: always reports a zero score. */
3517 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/* Fill the cmp[0..5] function-pointer slots according to the requested
 * comparison 'type' (SAD, SATD, DCT-based metrics, ...), picking the
 * implementations already installed in the DSPContext. */
3521 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3524 memset(cmp, 0, sizeof(void*)*6);
3532 cmp[i]= c->hadamard8_diff[i];
3538 cmp[i]= c->dct_sad[i];
3541 cmp[i]= c->dct264_sad[i];
3544 cmp[i]= c->dct_max[i];
3547 cmp[i]= c->quant_psnr[i];
3567 #if CONFIG_SNOW_ENCODER
/* Unknown/unhandled comparison type: report and leave slots zeroed. */
3576 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
/* Zero one 8x8 DCT coefficient block. */
3581 static void clear_block_c(DCTELEM *block)
3583 memset(block, 0, sizeof(DCTELEM)*64);
3587 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
/* Zero all six 8x8 blocks of a macroblock in one call. */
3589 static void clear_blocks_c(DCTELEM *blocks)
3591 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/* dst[i] += src[i] for w bytes, processed a machine word at a time using
 * the SWAR trick: add the low 7 bits of each byte separately, then patch
 * the top bits back in via XOR so carries never cross byte boundaries. */
3594 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3596 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3597 long a = *(long*)(src+i);
3598 long b = *(long*)(dst+i);
3599 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
/* Scalar tail for the remaining w % sizeof(long) bytes. */
3602 dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i], same word-at-a-time byte addition. */
3605 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3607 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3608 long a = *(long*)(src1+i);
3609 long b = *(long*)(src2+i);
3610 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3613 dst[i] = src1[i]+src2[i];
/* dst[i] = src1[i] - src2[i] per byte; takes an aligned-only path when
 * unaligned word loads are slow/unsupported on this target. */
3616 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3618 #if !HAVE_FAST_UNALIGNED
3619 if((long)src2 & (sizeof(long)-1)){
3620 for(i=0; i+7<w; i+=8){
3621 dst[i+0] = src1[i+0]-src2[i+0];
3622 dst[i+1] = src1[i+1]-src2[i+1];
3623 dst[i+2] = src1[i+2]-src2[i+2];
3624 dst[i+3] = src1[i+3]-src2[i+3];
3625 dst[i+4] = src1[i+4]-src2[i+4];
3626 dst[i+5] = src1[i+5]-src2[i+5];
3627 dst[i+6] = src1[i+6]-src2[i+6];
3628 dst[i+7] = src1[i+7]-src2[i+7];
/* SWAR byte-wise subtraction without inter-byte borrows. */
3632 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3633 long a = *(long*)(src1+i);
3634 long b = *(long*)(src2+i);
3635 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3638 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction: reconstruct pixels from stored differences
 * using the median of (left, above, left+above-aboveleft), tracking the
 * running left/left-top values through *left / *left_top. */
3641 static void add_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *diff, int w, int *left, int *left_top){
3649 l= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF) + diff[i];
/* Inverse of the above (encoder side): emit prediction residuals. */
3658 static void sub_hfyu_median_prediction_c(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w, int *left, int *left_top){
3666 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/* Left (previous-pixel) prediction: running byte accumulator 'acc'. */
3676 static int add_hfyu_left_prediction_c(uint8_t *dst, const uint8_t *src, int w, int acc){
3679 for(i=0; i<w-1; i++){
/* BGR32 variant: independent running sums per colour channel. */
3706 static void add_hfyu_left_prediction_bgr32_c(uint8_t *dst, const uint8_t *src, int w, int *red, int *green, int *blue, int *alpha){
/* Hadamard butterfly helpers: BUTTERFLY2 writes sum/difference of two
 * inputs into two outputs, BUTTERFLY1 does it in place, BUTTERFLYA
 * returns |x+y| + |x-y| for the final accumulation stage. */
3736 #define BUTTERFLY2(o1,o2,i1,i2) \
3740 #define BUTTERFLY1(x,y) \
3749 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/* SATD of the 8x8 difference src-dst: an 8x8 Hadamard transform of the
 * residual via row butterflies then column butterflies, summing absolute
 * transform coefficients. */
3751 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3759 //FIXME try pointer walks
/* Row pass: stage 1 on the pixel differences... */
3760 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3761 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3762 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3763 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
/* ...stage 2... */
3765 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3766 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3767 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3768 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
/* ...stage 3 completes the 8-point row transform. */
3770 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3771 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3772 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3773 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column pass, stages 1-2... */
3777 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3778 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3779 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3780 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3782 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3783 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3784 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3785 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
/* ...final stage fused with the absolute-value accumulation. */
3788 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3789 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3790 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3791 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3797 printf("MAX:%d\n", maxi);
/* Intra SATD: same 8x8 Hadamard as hadamard8_diff8x8_c but on the source
 * pixels directly; the DC term is subtracted at the end so the score is
 * mean-independent. */
3803 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3811 //FIXME try pointer walks
/* Row transform stages 1-3. */
3812 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3813 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3814 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3815 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3817 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3818 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3819 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3820 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3822 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3823 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3824 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3825 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* Column transform stages 1-2, then final stage fused with |.| sums. */
3829 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3830 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3831 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3832 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3834 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3835 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3836 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3837 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3840 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3841 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3842 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3843 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3846 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/* DCT-domain SAD: forward-DCT the 8x8 residual and sum the absolute
 * coefficient values via the context's sum_abs_dctelem. */
3851 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3852 MpegEncContext * const s= (MpegEncContext *)c;
3853 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3857 s->dsp.diff_pixels(temp, src1, src2, stride);
3859 return s->dsp.sum_abs_dctelem(temp);
/* One-dimensional 8-point H.264-style integer DCT used by dct264_sad:
 * even part from the symmetric sums s07..s34, odd part from the
 * antisymmetric differences d07..d34 with the standard 1.5x taps. */
3864 const int s07 = SRC(0) + SRC(7);\
3865 const int s16 = SRC(1) + SRC(6);\
3866 const int s25 = SRC(2) + SRC(5);\
3867 const int s34 = SRC(3) + SRC(4);\
3868 const int a0 = s07 + s34;\
3869 const int a1 = s16 + s25;\
3870 const int a2 = s07 - s34;\
3871 const int a3 = s16 - s25;\
3872 const int d07 = SRC(0) - SRC(7);\
3873 const int d16 = SRC(1) - SRC(6);\
3874 const int d25 = SRC(2) - SRC(5);\
3875 const int d34 = SRC(3) - SRC(4);\
3876 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3877 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3878 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3879 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3881 DST(1, a4 + (a7>>2)) ;\
3882 DST(2, a2 + (a3>>1)) ;\
3883 DST(3, a5 + (a6>>2)) ;\
3885 DST(5, a6 - (a5>>2)) ;\
3886 DST(6, (a2>>1) - a3 ) ;\
3887 DST(7, (a4>>2) - a7 ) ;\
/* H.264-transform SAD: apply DCT8_1D to the residual rows, then to the
 * columns with DST redefined to accumulate absolute values directly. */
3890 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3891 MpegEncContext * const s= (MpegEncContext *)c;
3896 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* Row pass: in-place transform of each row of dct[][]. */
3898 #define SRC(x) dct[i][x]
3899 #define DST(x,v) dct[i][x]= v
3900 for( i = 0; i < 8; i++ )
/* Column pass: DST now folds |v| straight into 'sum'. */
3905 #define SRC(x) dct[x][i]
3906 #define DST(x,v) sum += FFABS(v)
3907 for( i = 0; i < 8; i++ )
/* DCT "max" metric: forward-DCT the residual and return the largest
 * absolute coefficient instead of the sum. */
3915 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3916 MpegEncContext * const s= (MpegEncContext *)c;
3917 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3922 s->dsp.diff_pixels(temp, src1, src2, stride);
3926 sum= FFMAX(sum, FFABS(temp[i]));
/* Quantization-noise metric: DCT the residual, keep a copy, run it
 * through quantize + dequantize + IDCT, and return the squared error
 * between the round-tripped coefficients and the saved originals. */
3931 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3932 MpegEncContext * const s= (MpegEncContext *)c;
3933 LOCAL_ALIGNED_16(DCTELEM, temp, [64*2]);
3934 DCTELEM * const bak = temp+64;
3940 s->dsp.diff_pixels(temp, src1, src2, stride);
3942 memcpy(bak, temp, 64*sizeof(DCTELEM));
3944 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3945 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3946 ff_simple_idct(temp); //FIXME
3949 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/* Rate-distortion metric: quantize the residual DCT, count the VLC bits
 * needed to code the coefficients (run/level pairs, escapes), then
 * reconstruct and measure SSE distortion; returns distortion plus a
 * lambda-weighted bit cost. */
3954 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3955 MpegEncContext * const s= (MpegEncContext *)c;
3956 const uint8_t *scantable= s->intra_scantable.permutated;
3957 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
3958 LOCAL_ALIGNED_16(uint8_t, lsrc1, [64]);
3959 LOCAL_ALIGNED_16(uint8_t, lsrc2, [64]);
3960 int i, last, run, bits, level, distortion, start_i;
3961 const int esc_length= s->ac_esc_length;
3963 uint8_t * last_length;
/* Work on local stride-8 copies so idct_add below can reconstruct. */
3967 copy_block8(lsrc1, src1, 8, stride, 8);
3968 copy_block8(lsrc2, src2, 8, stride, 8);
3970 s->dsp.diff_pixels(temp, lsrc1, lsrc2, 8);
3972 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* Intra blocks: separate DC table; inter blocks: AC tables only. */
3978 length = s->intra_ac_vlc_length;
3979 last_length= s->intra_ac_vlc_last_length;
3980 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3983 length = s->inter_ac_vlc_length;
3984 last_length= s->inter_ac_vlc_last_length;
/* Count bits for all coefficients before the last one... */
3989 for(i=start_i; i<last; i++){
3990 int j= scantable[i];
/* Small levels use the unified run/level table, large ones escape. */
3995 if((level&(~127)) == 0){
3996 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* ...and the last coefficient with the "last" table. */
4005 level= temp[i] + 64;
4009 if((level&(~127)) == 0){
4010 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Reconstruct and measure distortion against the original block. */
4018 s->dct_unquantize_intra(s, temp, 0, s->qscale);
4020 s->dct_unquantize_inter(s, temp, 0, s->qscale);
4023 s->dsp.idct_add(lsrc2, 8, temp);
4025 distortion= s->dsp.sse[1](NULL, lsrc2, lsrc1, 8, 8);
4027 return distortion + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/* Bit-count metric: like rd8x8_c but returns only the estimated number
 * of VLC bits, without reconstructing or measuring distortion. */
4030 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
4031 MpegEncContext * const s= (MpegEncContext *)c;
4032 const uint8_t *scantable= s->intra_scantable.permutated;
4033 LOCAL_ALIGNED_16(DCTELEM, temp, [64]);
4034 int i, last, run, bits, level, start_i;
4035 const int esc_length= s->ac_esc_length;
4037 uint8_t * last_length;
4041 s->dsp.diff_pixels(temp, src1, src2, stride);
4043 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
4049 length = s->intra_ac_vlc_length;
4050 last_length= s->intra_ac_vlc_last_length;
4051 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
4054 length = s->inter_ac_vlc_length;
4055 last_length= s->inter_ac_vlc_last_length;
4060 for(i=start_i; i<last; i++){
4061 int j= scantable[i];
4066 if((level&(~127)) == 0){
4067 bits+= length[UNI_AC_ENC_INDEX(run, level)];
4076 level= temp[i] + 64;
4080 if((level&(~127)) == 0){
4081 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* Vertical-gradient SAD of a single block (intra): sum of |row-to-row
 * differences|, instantiated for 8- and 16-wide blocks elsewhere. */
4089 #define VSAD_INTRA(size) \
4090 static int vsad_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4094 for(y=1; y<h; y++){ \
4095 for(x=0; x<size; x+=4){ \
4096 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride]) \
4097 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]); \
/* Inter variant: SAD of the vertical gradients of the residual s1-s2. */
4107 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4112 for(x=0; x<16; x++){
4113 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* Same pair of metrics but with squared differences instead of |.|. */
4122 #define SQ(a) ((a)*(a))
4123 #define VSSE_INTRA(size) \
4124 static int vsse_intra##size##_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){ \
4128 for(y=1; y<h; y++){ \
4129 for(x=0; x<size; x+=4){ \
4130 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride]) \
4131 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]); \
4141 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
4146 for(x=0; x<16; x++){
4147 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/**
 * Sum of squared differences between an int8 vector and an int16 vector.
 * Used to compare a quantized representation against its reference.
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int acc = 0;
    int k;
    for (k = 0; k < size; k++) {
        const int d = pix1[k] - pix2[k];
        acc += d * d;
    }
    return acc;
}
/* Build the 16x16 versions of each 8x8 comparison function by summing
 * the metric over the four 8x8 quadrants (WRAPPER8_16_SQ macro). */
4165 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
4166 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
4167 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
4169 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
4171 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
4172 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
4173 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
4174 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/**
 * Multiply dst by src element-wise, in place: dst[i] *= src[i].
 */
static void vector_fmul_c(float *dst, const float *src, int len){
    int k;
    for (k = 0; k < len; k++)
        dst[k] *= src[k];
}
/**
 * dst[i] = src0[i] * src1[len-1-i] — multiply src0 by src1 read backwards.
 */
static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
    const float *rev = src1 + len - 1;
    int k;
    for (k = 0; k < len; k++)
        dst[k] = src0[k] * rev[-k];
}
/**
 * Fused multiply-add over vectors: dst[i] = src0[i]*src1[i] + src2[i].
 */
static void vector_fmul_add_c(float *dst, const float *src0, const float *src1, const float *src2, int len){
    int k = 0;
    while (k < len) {
        dst[k] = src0[k] * src1[k] + src2[k];
        k++;
    }
}
/* Overlap-add windowing for MDCT-based codecs: combines the two half-
 * windows symmetrically around the dst centre (dst must point at the
 * middle of the output buffer; i runs backwards, j forwards). add_bias
 * is added to every output sample. */
4195 void ff_vector_fmul_window_c(float *dst, const float *src0, const float *src1, const float *win, float add_bias, int len){
4200 for(i=-len, j=len-1; i<0; i++, j--) {
/* s0/s1/wi/wj are presumably loads of src0/src1/win set on missing lines. */
4205 dst[i] = s0*wj - s1*wi + add_bias;
4206 dst[j] = s0*wi + s1*wj + add_bias;
/**
 * Scale a float vector by a scalar: dst[i] = src[i] * mul.
 */
static void vector_fmul_scalar_c(float *dst, const float *src, float mul,
                                 int len)
{
    int k = 0;
    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
/* Multiply src by per-pair scale vectors (sv advances one entry per 2
 * output samples) and a global scalar. Used for synthesis windows. */
4218 static void vector_fmul_sv_scalar_2_c(float *dst, const float *src,
4219 const float **sv, float mul, int len)
4222 for (i = 0; i < len; i += 2, sv++) {
4223 dst[i ] = src[i ] * sv[0][0] * mul;
4224 dst[i+1] = src[i+1] * sv[0][1] * mul;
/* Same, 4 samples per scale-vector entry. */
4228 static void vector_fmul_sv_scalar_4_c(float *dst, const float *src,
4229 const float **sv, float mul, int len)
4232 for (i = 0; i < len; i += 4, sv++) {
4233 dst[i ] = src[i ] * sv[0][0] * mul;
4234 dst[i+1] = src[i+1] * sv[0][1] * mul;
4235 dst[i+2] = src[i+2] * sv[0][2] * mul;
4236 dst[i+3] = src[i+3] * sv[0][3] * mul;
/* Variants without a src vector: dst is the scale vectors times mul. */
4240 static void sv_fmul_scalar_2_c(float *dst, const float **sv, float mul,
4244 for (i = 0; i < len; i += 2, sv++) {
4245 dst[i ] = sv[0][0] * mul;
4246 dst[i+1] = sv[0][1] * mul;
4250 static void sv_fmul_scalar_4_c(float *dst, const float **sv, float mul,
4254 for (i = 0; i < len; i += 4, sv++) {
4255 dst[i ] = sv[0][0] * mul;
4256 dst[i+1] = sv[0][1] * mul;
4257 dst[i+2] = sv[0][2] * mul;
4258 dst[i+3] = sv[0][3] * mul;
/**
 * In-place butterfly over two float vectors:
 * v1[i], v2[i] = v1[i]+v2[i], v1[i]-v2[i].
 */
static void butterflies_float_c(float *restrict v1, float *restrict v2,
                                int len)
{
    int k;
    for (k = 0; k < len; k++) {
        const float sum  = v1[k] + v2[k];
        const float diff = v1[k] - v2[k];
        v1[k] = sum;
        v2[k] = diff;
    }
}
/**
 * Dot product of two float vectors, accumulated in order in single
 * precision.
 */
static float scalarproduct_float_c(const float *v1, const float *v2, int len)
{
    float acc = 0.0;
    int k;
    for (k = 0; k < len; k++)
        acc += v1[k] * v2[k];
    return acc;
}
/**
 * Convert an int vector to float while scaling: dst[i] = src[i] * mul.
 */
static void int32_to_float_fmul_scalar_c(float *dst, const int *src, float mul, int len){
    int k = 0;
    while (k < len) {
        dst[k] = src[k] * mul;
        k++;
    }
}
/**
 * Clip one float, handled as its raw IEEE-754 bit pattern, against a
 * [min, max] range where min < 0 < max (see vector_clipf_c_opposite_sign).
 * @param a        input sample bits
 * @param mini     bits of the (negative) lower bound
 * @param maxi     bits of the (positive) upper bound
 * @param maxisign maxi with the sign bit flipped, precomputed by the caller
 * @return clipped sample bits
 *
 * Fix: use 1U<<31 instead of 1<<31 — left-shifting 1 into the sign bit of
 * a signed int is undefined behavior in C; the unsigned form is the
 * intended 0x80000000 mask.
 */
static inline uint32_t clipf_c_one(uint32_t a, uint32_t mini,
                                        uint32_t maxi, uint32_t maxisign)
{
    /* Negative values (sign bit set) compare above any positive bound here,
       so this catches "below min"; the XOR trick orders the positives. */
    if(a > mini) return mini;
    else if((a^(1U<<31)) > maxisign) return maxi;
    else return a;
}
/* Clip a float vector to [*min, *max] when min < 0 < max by operating on
 * the raw bit patterns (via clipf_c_one), 8 samples per iteration.
 * NOTE(review): the pointer casts type-pun float<->uint32_t; relies on
 * the compiler tolerating the aliasing as the rest of this file does. */
4299 static void vector_clipf_c_opposite_sign(float *dst, const float *src, float *min, float *max, int len){
4301 uint32_t mini = *(uint32_t*)min;
4302 uint32_t maxi = *(uint32_t*)max;
4303 uint32_t maxisign = maxi ^ (1<<31);
4304 uint32_t *dsti = (uint32_t*)dst;
4305 const uint32_t *srci = (const uint32_t*)src;
4306 for(i=0; i<len; i+=8) {
4307 dsti[i + 0] = clipf_c_one(srci[i + 0], mini, maxi, maxisign);
4308 dsti[i + 1] = clipf_c_one(srci[i + 1], mini, maxi, maxisign);
4309 dsti[i + 2] = clipf_c_one(srci[i + 2], mini, maxi, maxisign);
4310 dsti[i + 3] = clipf_c_one(srci[i + 3], mini, maxi, maxisign);
4311 dsti[i + 4] = clipf_c_one(srci[i + 4], mini, maxi, maxisign);
4312 dsti[i + 5] = clipf_c_one(srci[i + 5], mini, maxi, maxisign);
4313 dsti[i + 6] = clipf_c_one(srci[i + 6], mini, maxi, maxisign);
4314 dsti[i + 7] = clipf_c_one(srci[i + 7], mini, maxi, maxisign);
/* Public clip entry: dispatches to the bit-twiddled path when the range
 * straddles zero, otherwise uses av_clipf directly, unrolled by 8. */
4317 static void vector_clipf_c(float *dst, const float *src, float min, float max, int len){
4319 if(min < 0 && max > 0) {
4320 vector_clipf_c_opposite_sign(dst, src, &min, &max, len);
4322 for(i=0; i < len; i+=8) {
4323 dst[i ] = av_clipf(src[i ], min, max);
4324 dst[i + 1] = av_clipf(src[i + 1], min, max);
4325 dst[i + 2] = av_clipf(src[i + 2], min, max);
4326 dst[i + 3] = av_clipf(src[i + 3], min, max);
4327 dst[i + 4] = av_clipf(src[i + 4], min, max);
4328 dst[i + 5] = av_clipf(src[i + 5], min, max);
4329 dst[i + 6] = av_clipf(src[i + 6], min, max);
4330 dst[i + 7] = av_clipf(src[i + 7], min, max);
/* Convert one float (expected pre-scaled so +-1.0 maps near +-32768,
 * biased by the caller) to int16 by reading its bit pattern and clamping;
 * 0x43c0ffff corresponds to the largest representable magnitude. */
4335 static av_always_inline int float_to_int16_one(const float *src){
4336 int_fast32_t tmp = *(const int32_t*)src;
/* Saturate: if out of range, replace with 0x0000/0xFFFF from the sign. */
4338 tmp = (0x43c0ffff - tmp)>>31;
4339 // is this faster on some gcc/cpu combinations?
4340 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
4343 return tmp - 0x8000;
/* Planar float buffer -> int16 samples. */
4346 void ff_float_to_int16_c(int16_t *dst, const float *src, long len){
4348 for(i=0; i<len; i++)
4349 dst[i] = float_to_int16_one(src+i);
/* Planar float channels -> interleaved int16; the stereo case is
 * special-cased, other channel counts go through the generic loop. */
4352 void ff_float_to_int16_interleave_c(int16_t *dst, const float **src, long len, int channels){
4355 for(i=0; i<len; i++){
4356 dst[2*i] = float_to_int16_one(src[0]+i);
4357 dst[2*i+1] = float_to_int16_one(src[1]+i);
4360 for(c=0; c<channels; c++)
4361 for(i=0, j=c; i<len; i++, j+=channels)
4362 dst[j] = float_to_int16_one(src[c]+i);
/**
 * Dot product of two int16 vectors, each product arithmetic-shifted
 * right by 'shift' before accumulation.
 */
static int32_t scalarproduct_int16_c(int16_t * v1, int16_t * v2, int order, int shift)
{
    int32_t acc = 0;
    int k;
    for (k = 0; k < order; k++)
        acc += (v1[k] * v2[k]) >> shift;
    return acc;
}
/**
 * Fused dot product and multiply-accumulate: returns sum(v1[i]*v2[i])
 * computed on the ORIGINAL v1 values, while updating v1[i] += mul*v3[i]
 * in the same pass.
 */
static int32_t scalarproduct_and_madd_int16_c(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
{
    int32_t acc = 0;
    int k;
    for (k = 0; k < order; k++) {
        acc   += v1[k] * v2[k];   /* read v1 before it is updated */
        v1[k] += mul * v3[k];
    }
    return acc;
}
/* Cosine constants for the WMV2 8-point IDCT, 2048*sqrt(2)*cos(k*pi/16). */
4387 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
4388 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
4389 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
4390 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
4391 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
4392 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
4393 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* Row pass of the WMV2 IDCT: butterfly network over one 8-element row,
 * rounded back to 8-bit-shifted precision.
 * NOTE(review): W0 is not among the defines visible here — presumably
 * defined on a missing line; confirm. */
4395 static void wmv2_idct_row(short * b)
4398 int a0,a1,a2,a3,a4,a5,a6,a7;
4400 a1 = W1*b[1]+W7*b[7];
4401 a7 = W7*b[1]-W1*b[7];
4402 a5 = W5*b[5]+W3*b[3];
4403 a3 = W3*b[5]-W5*b[3];
4404 a2 = W2*b[2]+W6*b[6];
4405 a6 = W6*b[2]-W2*b[6];
4406 a0 = W0*b[0]+W0*b[4];
4407 a4 = W0*b[0]-W0*b[4];
/* 181/256 ~= 1/sqrt(2): rotation of the odd terms. */
4409 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
4410 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4412 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
4413 b[1] = (a4+a6 +s1 + (1<<7))>>8;
4414 b[2] = (a4-a6 +s2 + (1<<7))>>8;
4415 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
4416 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
4417 b[5] = (a4-a6 -s2 + (1<<7))>>8;
4418 b[6] = (a4+a6 -s1 + (1<<7))>>8;
4419 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* Column pass: same butterfly with extra headroom (>>3 after each
 * product, final >>14 for full normalization). Stride is 8 shorts. */
4421 static void wmv2_idct_col(short * b)
4424 int a0,a1,a2,a3,a4,a5,a6,a7;
4425 /*step 1, with extended precision*/
4426 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
4427 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
4428 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
4429 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
4430 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
4431 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
4432 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
4433 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
4435 s1 = (181*(a1-a5+a7-a3)+128)>>8;
4436 s2 = (181*(a1-a5-a7+a3)+128)>>8;
4438 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
4439 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
4440 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4441 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4443 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4444 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4445 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4446 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* Full 2D WMV2 IDCT: all rows, then all columns, in place. */
4448 void ff_wmv2_idct_c(short * block){
4452 wmv2_idct_row(block+i);
4455 wmv2_idct_col(block+i);
4458 /* XXX: those functions should be suppressed ASAP when all IDCTs are
 * converted to the new API. */
/* idct_put for WMV2: run the WMV2 IDCT on 'block' in place, then store
 * the clamped (0..255) result into dest, rows line_size bytes apart. */
4460 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4462 ff_wmv2_idct_c(block);
4463 put_pixels_clamped_c(block, dest, line_size);
/* idct_add for WMV2: run the WMV2 IDCT on 'block' in place, then add the
 * residual to the pixels at dest with clamping, rows line_size apart. */
4465 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4467 ff_wmv2_idct_c(block);
4468 add_pixels_clamped_c(block, dest, line_size);
/* idct_put wrapper for the JPEG-reference 8x8 IDCT: transform then store
 * clamped pixels. NOTE(review): the IDCT call itself (presumably
 * j_rev_dct(block), cf. the FF_IDCT_INT setup below) is on a line not
 * visible in this excerpt. */
4470 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4473 put_pixels_clamped_c(block, dest, line_size);
/* idct_add wrapper for the JPEG-reference 8x8 IDCT: transform then add
 * the residual with clamping. NOTE(review): the IDCT call (presumably
 * j_rev_dct(block)) is on a line not visible in this excerpt. */
4475 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4478 add_pixels_clamped_c(block, dest, line_size);
/* 4x4 (lowres==1) variant: transform then store clamped 4x4 pixels.
 * NOTE(review): the IDCT call (presumably j_rev_dct4, cf. the lowres==1
 * setup below) is on a line not visible in this excerpt. */
4481 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4484 put_pixels_clamped4_c(block, dest, line_size);
/* 4x4 (lowres==1) variant: transform then add residual with clamping.
 * NOTE(review): the IDCT call (presumably j_rev_dct4) is on a line not
 * visible in this excerpt. */
4486 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4489 add_pixels_clamped4_c(block, dest, line_size);
/* 2x2 (lowres==2) variant: transform then store clamped 2x2 pixels.
 * NOTE(review): the IDCT call (presumably j_rev_dct2, cf. the lowres==2
 * setup below) is on a line not visible in this excerpt. */
4492 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4495 put_pixels_clamped2_c(block, dest, line_size);
/* 2x2 (lowres==2) variant: transform then add residual with clamping.
 * NOTE(review): the IDCT call (presumably j_rev_dct2) is on a line not
 * visible in this excerpt. */
4497 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4500 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 (lowres==3) idct_put: only the DC coefficient matters; write one
 * pixel, descaled with rounding ((dc+4)>>3) and clamped via ff_cropTbl.
 * NOTE(review): braces elided in this excerpt. */
4503 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4505 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4507 dest[0] = cm[(block[0] + 4)>>3];
/* 1x1 (lowres==3) idct_add: add the rounded, descaled DC residual to the
 * existing pixel and clamp via ff_cropTbl.
 * NOTE(review): braces elided in this excerpt. */
4509 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4511 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4513 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4516 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4518 /* init static data */
/* One-time initialisation of the static lookup tables:
 *  - ff_cropTbl: byte-clamp table. The middle 256 entries are identity
 *    and the 256 entries above saturate to 255 (visible below); the
 *    zero-fill of the low MAX_NEG_CROP entries is on a line not visible
 *    in this excerpt — confirm against the full source.
 *  - ff_squareTbl: ff_squareTbl[i] == (i-256)^2, for squared-error sums.
 *  - inv_zigzag_direct16: inverse of ff_zigzag_direct, stored with a +1
 *    bias (so 0 can mean "unset"). */
4519 av_cold void dsputil_static_init(void)
4523 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
4524 for(i=0;i<MAX_NEG_CROP;i++) {
4526 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
4529 for(i=0;i<512;i++) {
4530 ff_squareTbl[i] = (i - 256) * (i - 256);
4533 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/* Sanity check that the compiler honours 16-byte stack alignment: declare
 * a DECLARE_ALIGNED_16 local and test its actual address. On MMX/AltiVec
 * builds a misaligned stack means libavcodec was miscompiled, so a loud
 * diagnostic is emitted. NOTE(review): the return statements and the use
 * of did_fail (presumably limiting the warning to one emission) are on
 * lines not visible in this excerpt. */
4536 int ff_check_alignment(void){
4537 static int did_fail=0;
4538 DECLARE_ALIGNED_16(int, aligned);
4540 if((intptr_t)&aligned & 15){
4542 #if HAVE_MMX || HAVE_ALTIVEC
4543 av_log(NULL, AV_LOG_ERROR,
4544 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4545 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4546 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4547 "Do not report crashes to FFmpeg developers.\n");
4556 av_cold void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4560 ff_check_alignment();
4563 if(avctx->dct_algo==FF_DCT_FASTINT) {
4564 c->fdct = fdct_ifast;
4565 c->fdct248 = fdct_ifast248;
4567 else if(avctx->dct_algo==FF_DCT_FAAN) {
4568 c->fdct = ff_faandct;
4569 c->fdct248 = ff_faandct248;
4572 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4573 c->fdct248 = ff_fdct248_islow;
4575 #endif //CONFIG_ENCODERS
4577 if(avctx->lowres==1){
4578 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !CONFIG_H264_DECODER){
4579 c->idct_put= ff_jref_idct4_put;
4580 c->idct_add= ff_jref_idct4_add;
4582 c->idct_put= ff_h264_lowres_idct_put_c;
4583 c->idct_add= ff_h264_lowres_idct_add_c;
4585 c->idct = j_rev_dct4;
4586 c->idct_permutation_type= FF_NO_IDCT_PERM;
4587 }else if(avctx->lowres==2){
4588 c->idct_put= ff_jref_idct2_put;
4589 c->idct_add= ff_jref_idct2_add;
4590 c->idct = j_rev_dct2;
4591 c->idct_permutation_type= FF_NO_IDCT_PERM;
4592 }else if(avctx->lowres==3){
4593 c->idct_put= ff_jref_idct1_put;
4594 c->idct_add= ff_jref_idct1_add;
4595 c->idct = j_rev_dct1;
4596 c->idct_permutation_type= FF_NO_IDCT_PERM;
4598 if(avctx->idct_algo==FF_IDCT_INT){
4599 c->idct_put= ff_jref_idct_put;
4600 c->idct_add= ff_jref_idct_add;
4601 c->idct = j_rev_dct;
4602 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4603 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER ) &&
4604 avctx->idct_algo==FF_IDCT_VP3){
4605 c->idct_put= ff_vp3_idct_put_c;
4606 c->idct_add= ff_vp3_idct_add_c;
4607 c->idct = ff_vp3_idct_c;
4608 c->idct_permutation_type= FF_NO_IDCT_PERM;
4609 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4610 c->idct_put= ff_wmv2_idct_put_c;
4611 c->idct_add= ff_wmv2_idct_add_c;
4612 c->idct = ff_wmv2_idct_c;
4613 c->idct_permutation_type= FF_NO_IDCT_PERM;
4614 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4615 c->idct_put= ff_faanidct_put;
4616 c->idct_add= ff_faanidct_add;
4617 c->idct = ff_faanidct;
4618 c->idct_permutation_type= FF_NO_IDCT_PERM;
4619 }else if(CONFIG_EATGQ_DECODER && avctx->idct_algo==FF_IDCT_EA) {
4620 c->idct_put= ff_ea_idct_put_c;
4621 c->idct_permutation_type= FF_NO_IDCT_PERM;
4622 }else if(CONFIG_BINK_DECODER && avctx->idct_algo==FF_IDCT_BINK) {
4623 c->idct = ff_bink_idct_c;
4624 c->idct_add = ff_bink_idct_add_c;
4625 c->idct_put = ff_bink_idct_put_c;
4626 c->idct_permutation_type = FF_NO_IDCT_PERM;
4627 }else{ //accurate/default
4628 c->idct_put= ff_simple_idct_put;
4629 c->idct_add= ff_simple_idct_add;
4630 c->idct = ff_simple_idct;
4631 c->idct_permutation_type= FF_NO_IDCT_PERM;
4635 if (CONFIG_H264_DECODER) {
4636 c->h264_idct_add= ff_h264_idct_add_c;
4637 c->h264_idct8_add= ff_h264_idct8_add_c;
4638 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4639 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4640 c->h264_idct_add16 = ff_h264_idct_add16_c;
4641 c->h264_idct8_add4 = ff_h264_idct8_add4_c;
4642 c->h264_idct_add8 = ff_h264_idct_add8_c;
4643 c->h264_idct_add16intra= ff_h264_idct_add16intra_c;
4646 c->get_pixels = get_pixels_c;
4647 c->diff_pixels = diff_pixels_c;
4648 c->put_pixels_clamped = put_pixels_clamped_c;
4649 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4650 c->put_pixels_nonclamped = put_pixels_nonclamped_c;
4651 c->add_pixels_clamped = add_pixels_clamped_c;
4652 c->add_pixels8 = add_pixels8_c;
4653 c->add_pixels4 = add_pixels4_c;
4654 c->sum_abs_dctelem = sum_abs_dctelem_c;
4657 c->clear_block = clear_block_c;
4658 c->clear_blocks = clear_blocks_c;
4659 c->pix_sum = pix_sum_c;
4660 c->pix_norm1 = pix_norm1_c;
4662 c->fill_block_tab[0] = fill_block16_c;
4663 c->fill_block_tab[1] = fill_block8_c;
4664 c->scale_block = scale_block_c;
4666 /* TODO [0] 16 [1] 8 */
4667 c->pix_abs[0][0] = pix_abs16_c;
4668 c->pix_abs[0][1] = pix_abs16_x2_c;
4669 c->pix_abs[0][2] = pix_abs16_y2_c;
4670 c->pix_abs[0][3] = pix_abs16_xy2_c;
4671 c->pix_abs[1][0] = pix_abs8_c;
4672 c->pix_abs[1][1] = pix_abs8_x2_c;
4673 c->pix_abs[1][2] = pix_abs8_y2_c;
4674 c->pix_abs[1][3] = pix_abs8_xy2_c;
4676 #define dspfunc(PFX, IDX, NUM) \
4677 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4678 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4679 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4680 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4682 dspfunc(put, 0, 16);
4683 dspfunc(put_no_rnd, 0, 16);
4685 dspfunc(put_no_rnd, 1, 8);
4689 dspfunc(avg, 0, 16);
4690 dspfunc(avg_no_rnd, 0, 16);
4692 dspfunc(avg_no_rnd, 1, 8);
4697 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4698 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4700 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4701 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4702 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4703 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4704 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4705 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4706 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4707 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4708 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4710 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4711 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4712 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4713 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4714 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4715 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4716 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4717 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4718 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4720 #define dspfunc(PFX, IDX, NUM) \
4721 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4722 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4723 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4724 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4725 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4726 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4727 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4728 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4729 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4730 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4731 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4732 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4733 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4734 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4735 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4736 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
4738 dspfunc(put_qpel, 0, 16);
4739 dspfunc(put_no_rnd_qpel, 0, 16);
4741 dspfunc(avg_qpel, 0, 16);
4742 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4744 dspfunc(put_qpel, 1, 8);
4745 dspfunc(put_no_rnd_qpel, 1, 8);
4747 dspfunc(avg_qpel, 1, 8);
4748 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
4750 dspfunc(put_h264_qpel, 0, 16);
4751 dspfunc(put_h264_qpel, 1, 8);
4752 dspfunc(put_h264_qpel, 2, 4);
4753 dspfunc(put_h264_qpel, 3, 2);
4754 dspfunc(avg_h264_qpel, 0, 16);
4755 dspfunc(avg_h264_qpel, 1, 8);
4756 dspfunc(avg_h264_qpel, 2, 4);
4759 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4760 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4761 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4762 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4763 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4764 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4765 c->put_no_rnd_vc1_chroma_pixels_tab[0]= put_no_rnd_vc1_chroma_mc8_c;
4766 c->avg_no_rnd_vc1_chroma_pixels_tab[0]= avg_no_rnd_vc1_chroma_mc8_c;
4768 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4769 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4770 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4771 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4772 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4773 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4774 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4775 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4776 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4777 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4778 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4779 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4780 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4781 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4782 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4783 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4784 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4785 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4786 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4787 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4789 c->draw_edges = draw_edges_c;
4791 #if CONFIG_CAVS_DECODER
4792 ff_cavsdsp_init(c,avctx);
4795 #if CONFIG_MLP_DECODER || CONFIG_TRUEHD_DECODER
4796 ff_mlp_init(c, avctx);
4798 #if CONFIG_VC1_DECODER
4799 ff_vc1dsp_init(c,avctx);
4801 #if CONFIG_WMV2_DECODER || CONFIG_VC1_DECODER
4802 ff_intrax8dsp_init(c,avctx);
4804 #if CONFIG_RV30_DECODER
4805 ff_rv30dsp_init(c,avctx);
4807 #if CONFIG_RV40_DECODER
4808 ff_rv40dsp_init(c,avctx);
4809 c->put_rv40_qpel_pixels_tab[0][15] = put_rv40_qpel16_mc33_c;
4810 c->avg_rv40_qpel_pixels_tab[0][15] = avg_rv40_qpel16_mc33_c;
4811 c->put_rv40_qpel_pixels_tab[1][15] = put_rv40_qpel8_mc33_c;
4812 c->avg_rv40_qpel_pixels_tab[1][15] = avg_rv40_qpel8_mc33_c;
4815 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4816 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4817 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4818 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4819 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4820 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4821 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4822 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
4824 #define SET_CMP_FUNC(name) \
4825 c->name[0]= name ## 16_c;\
4826 c->name[1]= name ## 8x8_c;
4828 SET_CMP_FUNC(hadamard8_diff)
4829 c->hadamard8_diff[4]= hadamard8_intra16_c;
4830 c->hadamard8_diff[5]= hadamard8_intra8x8_c;
4831 SET_CMP_FUNC(dct_sad)
4832 SET_CMP_FUNC(dct_max)
4834 SET_CMP_FUNC(dct264_sad)
4836 c->sad[0]= pix_abs16_c;
4837 c->sad[1]= pix_abs8_c;
4841 SET_CMP_FUNC(quant_psnr)
4844 c->vsad[0]= vsad16_c;
4845 c->vsad[4]= vsad_intra16_c;
4846 c->vsad[5]= vsad_intra8_c;
4847 c->vsse[0]= vsse16_c;
4848 c->vsse[4]= vsse_intra16_c;
4849 c->vsse[5]= vsse_intra8_c;
4850 c->nsse[0]= nsse16_c;
4851 c->nsse[1]= nsse8_c;
4852 #if CONFIG_SNOW_ENCODER
4853 c->w53[0]= w53_16_c;
4855 c->w97[0]= w97_16_c;
4859 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
4861 c->add_bytes= add_bytes_c;
4862 c->add_bytes_l2= add_bytes_l2_c;
4863 c->diff_bytes= diff_bytes_c;
4864 c->add_hfyu_median_prediction= add_hfyu_median_prediction_c;
4865 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4866 c->add_hfyu_left_prediction = add_hfyu_left_prediction_c;
4867 c->add_hfyu_left_prediction_bgr32 = add_hfyu_left_prediction_bgr32_c;
4868 c->bswap_buf= bswap_buf;
4869 #if CONFIG_PNG_DECODER
4870 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
4873 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4874 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4875 c->h264_v_loop_filter_luma_intra= h264_v_loop_filter_luma_intra_c;
4876 c->h264_h_loop_filter_luma_intra= h264_h_loop_filter_luma_intra_c;
4877 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4878 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4879 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4880 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4881 c->h264_loop_filter_strength= NULL;
4883 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
4884 c->h263_h_loop_filter= h263_h_loop_filter_c;
4885 c->h263_v_loop_filter= h263_v_loop_filter_c;
4888 if (CONFIG_VP3_DECODER) {
4889 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_c;
4890 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_c;
4892 if (CONFIG_VP6_DECODER) {
4893 c->vp6_filter_diag4= ff_vp6_filter_diag4_c;
4896 c->h261_loop_filter= h261_loop_filter_c;
4898 c->try_8x8basis= try_8x8basis_c;
4899 c->add_8x8basis= add_8x8basis_c;
4901 #if CONFIG_SNOW_DECODER
4902 c->vertical_compose97i = ff_snow_vertical_compose97i;
4903 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4904 c->inner_add_yblock = ff_snow_inner_add_yblock;
4907 #if CONFIG_VORBIS_DECODER
4908 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4910 #if CONFIG_AC3_DECODER
4911 c->ac3_downmix = ff_ac3_downmix_c;
4914 c->lpc_compute_autocorr = ff_lpc_compute_autocorr;
4916 c->vector_fmul = vector_fmul_c;
4917 c->vector_fmul_reverse = vector_fmul_reverse_c;
4918 c->vector_fmul_add = vector_fmul_add_c;
4919 c->vector_fmul_window = ff_vector_fmul_window_c;
4920 c->int32_to_float_fmul_scalar = int32_to_float_fmul_scalar_c;
4921 c->vector_clipf = vector_clipf_c;
4922 c->float_to_int16 = ff_float_to_int16_c;
4923 c->float_to_int16_interleave = ff_float_to_int16_interleave_c;
4924 c->scalarproduct_int16 = scalarproduct_int16_c;
4925 c->scalarproduct_and_madd_int16 = scalarproduct_and_madd_int16_c;
4926 c->scalarproduct_float = scalarproduct_float_c;
4927 c->butterflies_float = butterflies_float_c;
4928 c->vector_fmul_scalar = vector_fmul_scalar_c;
4930 c->vector_fmul_sv_scalar[0] = vector_fmul_sv_scalar_2_c;
4931 c->vector_fmul_sv_scalar[1] = vector_fmul_sv_scalar_4_c;
4933 c->sv_fmul_scalar[0] = sv_fmul_scalar_2_c;
4934 c->sv_fmul_scalar[1] = sv_fmul_scalar_4_c;
4936 c->shrink[0]= ff_img_copy_plane;
4937 c->shrink[1]= ff_shrink22;
4938 c->shrink[2]= ff_shrink44;
4939 c->shrink[3]= ff_shrink88;
4941 c->prefetch= just_return;
4943 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4944 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
4946 if (HAVE_MMX) dsputil_init_mmx (c, avctx);
4947 if (ARCH_ARM) dsputil_init_arm (c, avctx);
4948 if (CONFIG_MLIB) dsputil_init_mlib (c, avctx);
4949 if (HAVE_VIS) dsputil_init_vis (c, avctx);
4950 if (ARCH_ALPHA) dsputil_init_alpha (c, avctx);
4951 if (ARCH_PPC) dsputil_init_ppc (c, avctx);
4952 if (HAVE_MMI) dsputil_init_mmi (c, avctx);
4953 if (ARCH_SH4) dsputil_init_sh4 (c, avctx);
4954 if (ARCH_BFIN) dsputil_init_bfin (c, avctx);
4956 for(i=0; i<64; i++){
4957 if(!c->put_2tap_qpel_pixels_tab[0][i])
4958 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4959 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4960 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
4963 switch(c->idct_permutation_type){
4964 case FF_NO_IDCT_PERM:
4966 c->idct_permutation[i]= i;
4968 case FF_LIBMPEG2_IDCT_PERM:
4970 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4972 case FF_SIMPLE_IDCT_PERM:
4974 c->idct_permutation[i]= simple_mmx_permutation[i];
4976 case FF_TRANSPOSE_IDCT_PERM:
4978 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4980 case FF_PARTTRANS_IDCT_PERM:
4982 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
4984 case FF_SSE2_IDCT_PERM:
4986 c->idct_permutation[i]= (i&0x38) | idct_sse2_row_perm[i&7];
4989 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");