 * Copyright (c) 2000, 2001 Fabrice Bellard.
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
 * This file is part of FFmpeg.
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
32 #include "simple_idct.h"
39 void ff_spatial_dwt(int *buffer, int width, int height, int stride, int type, int decomposition_count);
42 void vorbis_inverse_coupling(float *mag, float *ang, int blocksize);
45 void ff_flac_compute_autocorr(const int32_t *data, int len, int lag, double *autoc);
48 void ff_add_png_paeth_prediction(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp);
/* Clipping (crop) and squaring lookup tables. Both are zero-filled here and
 * presumably populated by an init routine not visible in this chunk — TODO
 * confirm against dsputil_static_init.
 * NOTE(review): the leading integers fused onto these lines look like file
 * line numbers left over from a broken extraction. */
50 uint8_t ff_cropTbl[256 + 2 * MAX_NEG_CROP] = {0, };
51 uint32_t ff_squareTbl[512] = {0, };
53 // 0x7f7f7f7f or 0x7f7f7f7f7f7f7f7f or whatever, depending on the cpu's native arithmetic size
54 #define pb_7f (~0UL/255 * 0x7f)
55 #define pb_80 (~0UL/255 * 0x80)
/* Classic 8x8 zigzag scan order (low-frequency DCT coefficients first).
 * NOTE(review): the closing "};" was lost in extraction. */
57 const uint8_t ff_zigzag_direct[64] = {
58 0, 1, 8, 16, 9, 2, 3, 10,
59 17, 24, 32, 25, 18, 11, 4, 5,
60 12, 19, 26, 33, 40, 48, 41, 34,
61 27, 20, 13, 6, 7, 14, 21, 28,
62 35, 42, 49, 56, 57, 50, 43, 36,
63 29, 22, 15, 23, 30, 37, 44, 51,
64 58, 59, 52, 45, 38, 31, 39, 46,
65 53, 60, 61, 54, 47, 55, 62, 63
/* Field-interleaved zigzag used with the 2-4-8 IDCT; see the original
 * comment below. NOTE(review): closing "};" lost in extraction. */
68 /* Specific zigzag scan for 248 idct. NOTE that unlike the
69 specification, we interleave the fields */
70 const uint8_t ff_zigzag248_direct[64] = {
71 0, 8, 1, 9, 16, 24, 2, 10,
72 17, 25, 32, 40, 48, 56, 33, 41,
73 18, 26, 3, 11, 4, 12, 19, 27,
74 34, 42, 49, 57, 50, 58, 35, 43,
75 20, 28, 5, 13, 6, 14, 21, 29,
76 36, 44, 51, 59, 52, 60, 37, 45,
77 22, 30, 7, 15, 23, 31, 38, 46,
78 53, 61, 54, 62, 39, 47, 55, 63,
/* Zero-filled here; presumably built at runtime by an init routine outside
 * this chunk — TODO confirm. */
81 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
82 DECLARE_ALIGNED_8(uint16_t, inv_zigzag_direct16[64]) = {0, };
/* Alternate (horizontally biased) coefficient scan order.
 * NOTE(review): closing "};" lost in extraction. */
84 const uint8_t ff_alternate_horizontal_scan[64] = {
85 0, 1, 2, 3, 8, 9, 16, 17,
86 10, 11, 4, 5, 6, 7, 15, 14,
87 13, 12, 19, 18, 24, 25, 32, 33,
88 26, 27, 20, 21, 22, 23, 28, 29,
89 30, 31, 34, 35, 40, 41, 48, 49,
90 42, 43, 36, 37, 38, 39, 44, 45,
91 46, 47, 50, 51, 56, 57, 58, 59,
92 52, 53, 54, 55, 60, 61, 62, 63,
/* Alternate (vertically biased) coefficient scan order.
 * NOTE(review): closing "};" lost in extraction. */
95 const uint8_t ff_alternate_vertical_scan[64] = {
96 0, 8, 16, 24, 1, 9, 2, 10,
97 17, 25, 32, 40, 48, 56, 57, 49,
98 41, 33, 26, 18, 3, 11, 4, 12,
99 19, 27, 34, 42, 50, 58, 35, 43,
100 51, 59, 20, 28, 5, 13, 6, 14,
101 21, 29, 36, 44, 52, 60, 37, 45,
102 53, 61, 22, 30, 7, 15, 23, 31,
103 38, 46, 54, 62, 39, 47, 55, 63,
/* Fixed-point reciprocal table: entry b is ~2^32/b, enabling division by a
 * small constant via multiply+shift (contract stated in the comment below).
 * NOTE(review): closing "};" lost in extraction. */
106 /* a*inverse[b]>>32 == a/b for all 0<=a<=65536 && 2<=b<=255 */
107 const uint32_t ff_inverse[256]={
108 0, 4294967295U,2147483648U,1431655766, 1073741824, 858993460, 715827883, 613566757,
109 536870912, 477218589, 429496730, 390451573, 357913942, 330382100, 306783379, 286331154,
110 268435456, 252645136, 238609295, 226050911, 214748365, 204522253, 195225787, 186737709,
111 178956971, 171798692, 165191050, 159072863, 153391690, 148102321, 143165577, 138547333,
112 134217728, 130150525, 126322568, 122713352, 119304648, 116080198, 113025456, 110127367,
113 107374183, 104755300, 102261127, 99882961, 97612894, 95443718, 93368855, 91382283,
114 89478486, 87652394, 85899346, 84215046, 82595525, 81037119, 79536432, 78090315,
115 76695845, 75350304, 74051161, 72796056, 71582789, 70409300, 69273667, 68174085,
116 67108864, 66076420, 65075263, 64103990, 63161284, 62245903, 61356676, 60492498,
117 59652324, 58835169, 58040099, 57266231, 56512728, 55778797, 55063684, 54366675,
118 53687092, 53024288, 52377650, 51746594, 51130564, 50529028, 49941481, 49367441,
119 48806447, 48258060, 47721859, 47197443, 46684428, 46182445, 45691142, 45210183,
120 44739243, 44278014, 43826197, 43383509, 42949673, 42524429, 42107523, 41698712,
121 41297763, 40904451, 40518560, 40139882, 39768216, 39403370, 39045158, 38693400,
122 38347923, 38008561, 37675152, 37347542, 37025581, 36709123, 36398028, 36092163,
123 35791395, 35495598, 35204650, 34918434, 34636834, 34359739, 34087043, 33818641,
124 33554432, 33294321, 33038210, 32786010, 32537632, 32292988, 32051995, 31814573,
125 31580642, 31350127, 31122952, 30899046, 30678338, 30460761, 30246249, 30034737,
126 29826162, 29620465, 29417585, 29217465, 29020050, 28825284, 28633116, 28443493,
127 28256364, 28071682, 27889399, 27709467, 27531842, 27356480, 27183338, 27012373,
128 26843546, 26676816, 26512144, 26349493, 26188825, 26030105, 25873297, 25718368,
129 25565282, 25414008, 25264514, 25116768, 24970741, 24826401, 24683721, 24542671,
130 24403224, 24265352, 24129030, 23994231, 23860930, 23729102, 23598722, 23469767,
131 23342214, 23216040, 23091223, 22967740, 22845571, 22724695, 22605092, 22486740,
132 22369622, 22253717, 22139007, 22025474, 21913099, 21801865, 21691755, 21582751,
133 21474837, 21367997, 21262215, 21157475, 21053762, 20951060, 20849356, 20748635,
134 20648882, 20550083, 20452226, 20355296, 20259280, 20164166, 20069941, 19976593,
135 19884108, 19792477, 19701685, 19611723, 19522579, 19434242, 19346700, 19259944,
136 19173962, 19088744, 19004281, 18920561, 18837576, 18755316, 18673771, 18592933,
137 18512791, 18433337, 18354562, 18276457, 18199014, 18122225, 18046082, 17970575,
138 17895698, 17821442, 17747799, 17674763, 17602325, 17530479, 17459217, 17388532,
139 17318417, 17248865, 17179870, 17111424, 17043522, 16976156, 16909321, 16843010,
/* Coefficient reordering table consumed by the MMX simple IDCT; values are
 * byte offsets (hex) into the 64-entry block.
 * NOTE(review): closing "};" lost in extraction. */
142 /* Input permutation for the simple_idct_mmx */
143 static const uint8_t simple_mmx_permutation[64]={
144 0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
145 0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
146 0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
147 0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
148 0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
149 0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
150 0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
151 0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
/* Initializes a ScanTable from a raw scan order: stores the source order,
 * applies the CPU-specific coefficient permutation, and (in lines missing
 * from this extraction) presumably computes raster_end[] — TODO confirm
 * against the complete file; the loop headers and several statements were
 * dropped by the extraction, so the body below is incomplete. */
154 void ff_init_scantable(uint8_t *permutation, ScanTable *st, const uint8_t *src_scantable){
158 st->scantable= src_scantable;
162 j = src_scantable[i];
163 st->permutated[i] = permutation[j];
172 j = st->permutated[i];
174 st->raster_end[i]= end;
/**
 * Sum of all 256 samples of a 16x16 pixel block.
 * Reconstructed: the extraction dropped the accumulation statements and
 * closing braces (only the loop headers survived).
 * @param pix        top-left sample of the block
 * @param line_size  byte stride between rows
 * @return sum of the 16x16 unsigned samples (fits easily in int)
 */
static int pix_sum_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j += 8) {
            /* unrolled by 8 to match the original's inner-loop shape */
            s += pix[0] + pix[1] + pix[2] + pix[3] +
                 pix[4] + pix[5] + pix[6] + pix[7];
            pix += 8;
        }
        pix += line_size - 16;  /* advance to the next row */
    }
    return s;
}
/**
 * Sum of squares of all samples of a 16x16 pixel block.
 * @param pix        top-left sample of the block
 * @param line_size  byte stride between rows
 * @return sum over the block of pix[x]*pix[x]
 *
 * The original loaded 8 bytes at a time through unaligned uint64_t/uint32_t
 * casts (strict-aliasing/alignment UB) and indexed ff_squareTbl, whose entry
 * for value v is v*v (table filled at init elsewhere — the direct multiply
 * below is equivalent and portable; confirm against dsputil init if in doubt).
 */
static int pix_norm1_c(uint8_t * pix, int line_size)
{
    int s = 0, i, j;

    for (i = 0; i < 16; i++) {
        for (j = 0; j < 16; j++)
            s += pix[j] * pix[j];
        pix += line_size;
    }
    return s;
}
/**
 * Byte-swap an array of 32-bit words (endianness conversion).
 * Reconstructed: the extraction dropped the loop braces and the scalar tail
 * loop that handles w not divisible by 8.
 * @param dst destination array (may alias src)
 * @param src source words
 * @param w   number of 32-bit words
 */
static void bswap_buf(uint32_t *dst, const uint32_t *src, int w){
    int i;

    /* main loop unrolled by 8 */
    for(i=0; i+8<=w; i+=8){
        dst[i+0]= bswap_32(src[i+0]);
        dst[i+1]= bswap_32(src[i+1]);
        dst[i+2]= bswap_32(src[i+2]);
        dst[i+3]= bswap_32(src[i+3]);
        dst[i+4]= bswap_32(src[i+4]);
        dst[i+5]= bswap_32(src[i+5]);
        dst[i+6]= bswap_32(src[i+6]);
        dst[i+7]= bswap_32(src[i+7]);
    }
    /* tail: remaining 0..7 words */
    for(;i<w;i++){
        dst[i+0]= bswap_32(src[i+0]);
    }
}
/**
 * Sum of squared errors over a block of width 4.
 * @param v          unused context pointer (uniform cmp-function signature)
 * @param pix1,pix2  blocks to compare
 * @param line_size  byte stride between rows of both blocks
 * @param h          number of rows
 * @return sum of (pix1[i]-pix2[i])^2
 *
 * Computes d*d directly instead of indexing ff_squareTbl+256 (whose entry
 * for offset d is d*d, filled at init elsewhere): identical result without
 * depending on the table's runtime initialization.
 */
static int sse4_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        const int d0 = pix1[0] - pix2[0];
        const int d1 = pix1[1] - pix2[1];
        const int d2 = pix1[2] - pix2[2];
        const int d3 = pix1[3] - pix2[3];
        s += d0*d0 + d1*d1 + d2*d2 + d3*d3;
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors over a block of width 8.
 * @param v          unused context pointer (uniform cmp-function signature)
 * @param pix1,pix2  blocks to compare
 * @param line_size  byte stride between rows of both blocks
 * @param h          number of rows
 * @return sum of (pix1[i]-pix2[i])^2
 *
 * Computes d*d directly instead of the ff_squareTbl+256 lookup (table entry
 * for offset d is d*d) — identical result, no init-order dependency.
 */
static int sse8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * Sum of squared errors over a block of width 16.
 * @param v          unused context pointer (uniform cmp-function signature)
 * @param pix1,pix2  blocks to compare
 * @param line_size  byte stride between rows of both blocks
 * @param h          number of rows
 * @return sum of (pix1[i]-pix2[i])^2
 *
 * Computes d*d directly instead of the ff_squareTbl+256 lookup (table entry
 * for offset d is d*d) — identical result, no init-order dependency.
 */
static int sse16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 16; j++) {
            const int d = pix1[j] - pix2[j];
            s += d * d;
        }
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/* Wavelet-based distortion metric for the Snow encoder: forward DWT of the
 * pixel difference, then a per-subband weighted sum of magnitudes. The scale
 * table and inner accumulation were mostly dropped by the extraction, so the
 * body below is incomplete — do not restyle without the full file. The
 * w53/w97 wrappers bind the 5/3 (type=1) and 9/7 (type=0) filters at block
 * sizes 8/16/32; the 32-wide pair is non-static (exported to snow.c). */
335 #ifdef CONFIG_SNOW_ENCODER //dwt is in snow.c
336 static inline int w_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int w, int h, int type){
338 const int dec_count= w==8 ? 3 : 4;
341 static const int scale[2][2][4][4]={
345 {268, 239, 239, 213},
349 // 9/7 16x16 or 32x32 dec=4
350 {344, 310, 310, 280},
358 {275, 245, 245, 218},
362 // 5/3 16x16 or 32x32 dec=4
363 {352, 317, 317, 286},
371 for (i = 0; i < h; i++) {
372 for (j = 0; j < w; j+=4) {
373 tmp[32*i+j+0] = (pix1[j+0] - pix2[j+0])<<4;
374 tmp[32*i+j+1] = (pix1[j+1] - pix2[j+1])<<4;
375 tmp[32*i+j+2] = (pix1[j+2] - pix2[j+2])<<4;
376 tmp[32*i+j+3] = (pix1[j+3] - pix2[j+3])<<4;
382 ff_spatial_dwt(tmp, w, h, 32, type, dec_count);
386 for(level=0; level<dec_count; level++){
387 for(ori= level ? 1 : 0; ori<4; ori++){
388 int size= w>>(dec_count-level);
389 int sx= (ori&1) ? size : 0;
390 int stride= 32<<(dec_count-level);
391 int sy= (ori&2) ? stride>>1 : 0;
393 for(i=0; i<size; i++){
394 for(j=0; j<size; j++){
395 int v= tmp[sx + sy + i*stride + j] * scale[type][dec_count-3][level][ori];
405 static int w53_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
406 return w_c(v, pix1, pix2, line_size, 8, h, 1);
409 static int w97_8_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
410 return w_c(v, pix1, pix2, line_size, 8, h, 0);
413 static int w53_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
414 return w_c(v, pix1, pix2, line_size, 16, h, 1);
417 static int w97_16_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
418 return w_c(v, pix1, pix2, line_size, 16, h, 0);
421 int w53_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
422 return w_c(v, pix1, pix2, line_size, 32, h, 1);
425 int w97_32_c(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h){
426 return w_c(v, pix1, pix2, line_size, 32, h, 0);
/* draw the edges of width 'w' of an image of size width, height */
//FIXME check that this is ok for mpeg4 interlaced
/**
 * Replicate the border samples of an image outward by w pixels on all sides.
 * Reconstructed: the extraction dropped the loop headers, ptr advance and
 * closing braces. Also fixes the copy-pasted corner comments, which labeled
 * the bottom corners "top left"/"top right".
 * @param buf    top-left sample of the image (border rows/cols must exist
 *               before/after it in memory)
 * @param wrap   byte stride between rows
 * @param width  image width in pixels
 * @param height image height in pixels
 * @param w      border width to fill
 */
static void draw_edges_c(uint8_t *buf, int wrap, int width, int height, int w)
{
    uint8_t *ptr, *last_line;
    int i;

    /* top and bottom: copy the first/last row outward */
    last_line = buf + (height - 1) * wrap;
    for(i=0;i<w;i++) {
        memcpy(buf - (i + 1) * wrap, buf, width);
        memcpy(last_line + (i + 1) * wrap, last_line, width);
    }

    /* left and right: replicate each row's edge samples */
    ptr = buf;
    for(i=0;i<height;i++) {
        memset(ptr - w, ptr[0], w);
        memset(ptr + width, ptr[width-1], w);
        ptr += wrap;
    }

    /* corners */
    for(i=0;i<w;i++) {
        memset(buf - (i + 1) * wrap - w, buf[0], w);                       /* top left */
        memset(buf - (i + 1) * wrap + width, buf[width-1], w);             /* top right */
        memset(last_line + (i + 1) * wrap - w, last_line[0], w);           /* bottom left */
        memset(last_line + (i + 1) * wrap + width, last_line[width-1], w); /* bottom right */
    }
}
 * Copies a rectangular area of samples to a temporary buffer and replicates the border samples.
461 * @param buf destination buffer
462 * @param src source buffer
463 * @param linesize number of bytes between 2 vertically adjacent samples in both the source and destination buffers
464 * @param block_w width of block
465 * @param block_h height of block
466 * @param src_x x coordinate of the top left sample of the block in the source buffer
467 * @param src_y y coordinate of the top left sample of the block in the source buffer
468 * @param w width of the source buffer
469 * @param h height of the source buffer
/* Copies the block from src into buf, then replicates the edge samples into
 * any part of the block that falls outside the w x h source image (see the
 * doc comment above for parameters). Reconstructed: the extraction dropped
 * the local declarations, the src_y/src_x clamp assignments and the
 * loop/function braces. */
void ff_emulated_edge_mc(uint8_t *buf, uint8_t *src, int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h){
    int x, y;
    int start_y, start_x, end_y, end_x;

    /* clamp src so that at least one sample of the block overlaps the image,
       adjusting the pointer to keep buf[x + y*linesize] == src[...] valid */
    if(src_y>= h){
        src+= (h-1-src_y)*linesize;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src+= (1-block_h-src_y)*linesize;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);

    // copy existing part
    for(y=start_y; y<end_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= src[x + y*linesize];
        }
    }

    //top
    for(y=0; y<start_y; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + start_y*linesize];
        }
    }

    //bottom
    for(y=end_y; y<block_h; y++){
        for(x=start_x; x<end_x; x++){
            buf[x + y*linesize]= buf[x + (end_y-1)*linesize];
        }
    }

    for(y=0; y<block_h; y++){
        //left
        for(x=0; x<start_x; x++){
            buf[x + y*linesize]= buf[start_x + y*linesize];
        }

        //right
        for(x=end_x; x<block_w; x++){
            buf[x + y*linesize]= buf[end_x - 1 + y*linesize];
        }
    }
}
530 static void get_pixels_c(DCTELEM *restrict block, const uint8_t *pixels, int line_size)
534 /* read the pixels */
536 block[0] = pixels[0];
537 block[1] = pixels[1];
538 block[2] = pixels[2];
539 block[3] = pixels[3];
540 block[4] = pixels[4];
541 block[5] = pixels[5];
542 block[6] = pixels[6];
543 block[7] = pixels[7];
549 static void diff_pixels_c(DCTELEM *restrict block, const uint8_t *s1,
550 const uint8_t *s2, int stride){
553 /* read the pixels */
555 block[0] = s1[0] - s2[0];
556 block[1] = s1[1] - s2[1];
557 block[2] = s1[2] - s2[2];
558 block[3] = s1[3] - s2[3];
559 block[4] = s1[4] - s2[4];
560 block[5] = s1[5] - s2[5];
561 block[6] = s1[6] - s2[6];
562 block[7] = s1[7] - s2[7];
570 static void put_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
574 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
576 /* read the pixels */
578 pixels[0] = cm[block[0]];
579 pixels[1] = cm[block[1]];
580 pixels[2] = cm[block[2]];
581 pixels[3] = cm[block[3]];
582 pixels[4] = cm[block[4]];
583 pixels[5] = cm[block[5]];
584 pixels[6] = cm[block[6]];
585 pixels[7] = cm[block[7]];
592 static void put_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
596 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
598 /* read the pixels */
600 pixels[0] = cm[block[0]];
601 pixels[1] = cm[block[1]];
602 pixels[2] = cm[block[2]];
603 pixels[3] = cm[block[3]];
610 static void put_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
614 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
616 /* read the pixels */
618 pixels[0] = cm[block[0]];
619 pixels[1] = cm[block[1]];
626 static void put_signed_pixels_clamped_c(const DCTELEM *block,
627 uint8_t *restrict pixels,
632 for (i = 0; i < 8; i++) {
633 for (j = 0; j < 8; j++) {
636 else if (*block > 127)
639 *pixels = (uint8_t)(*block + 128);
643 pixels += (line_size - 8);
647 static void add_pixels_clamped_c(const DCTELEM *block, uint8_t *restrict pixels,
651 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
653 /* read the pixels */
655 pixels[0] = cm[pixels[0] + block[0]];
656 pixels[1] = cm[pixels[1] + block[1]];
657 pixels[2] = cm[pixels[2] + block[2]];
658 pixels[3] = cm[pixels[3] + block[3]];
659 pixels[4] = cm[pixels[4] + block[4]];
660 pixels[5] = cm[pixels[5] + block[5]];
661 pixels[6] = cm[pixels[6] + block[6]];
662 pixels[7] = cm[pixels[7] + block[7]];
668 static void add_pixels_clamped4_c(const DCTELEM *block, uint8_t *restrict pixels,
672 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
674 /* read the pixels */
676 pixels[0] = cm[pixels[0] + block[0]];
677 pixels[1] = cm[pixels[1] + block[1]];
678 pixels[2] = cm[pixels[2] + block[2]];
679 pixels[3] = cm[pixels[3] + block[3]];
685 static void add_pixels_clamped2_c(const DCTELEM *block, uint8_t *restrict pixels,
689 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
691 /* read the pixels */
693 pixels[0] = cm[pixels[0] + block[0]];
694 pixels[1] = cm[pixels[1] + block[1]];
700 static void add_pixels8_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
704 pixels[0] += block[0];
705 pixels[1] += block[1];
706 pixels[2] += block[2];
707 pixels[3] += block[3];
708 pixels[4] += block[4];
709 pixels[5] += block[5];
710 pixels[6] += block[6];
711 pixels[7] += block[7];
717 static void add_pixels4_c(uint8_t *restrict pixels, DCTELEM *block, int line_size)
721 pixels[0] += block[0];
722 pixels[1] += block[1];
723 pixels[2] += block[2];
724 pixels[3] += block[3];
730 static int sum_abs_dctelem_c(DCTELEM *block)
734 sum+= FFABS(block[i]);
/* 64-bit-register PIXOP2 variant: generates put/avg pixel-copy and
 * half-pel averaging primitives operating on whole uint64_t words
 * (x2 = horizontal, y2 = vertical, xy2 = diagonal interpolation), using
 * the classic SWAR and/xor and or/xor averaging tricks. "no_rnd" variants
 * round down instead of to nearest. NOTE(review): this macro text is
 * incomplete — loop headers, closing braces and several lines were dropped
 * by the extraction (the fused leading integers are leftover line numbers);
 * left byte-identical because restyling a gappy continuation macro is
 * unsafe. */
740 #define PIXOP2(OPNAME, OP) \
741 static void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
745 OP(*((uint64_t*)block), AV_RN64(pixels));\
751 static void OPNAME ## _no_rnd_pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
755 const uint64_t a= AV_RN64(pixels );\
756 const uint64_t b= AV_RN64(pixels+1);\
757 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
763 static void OPNAME ## _pixels_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
767 const uint64_t a= AV_RN64(pixels );\
768 const uint64_t b= AV_RN64(pixels+1);\
769 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
775 static void OPNAME ## _no_rnd_pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
779 const uint64_t a= AV_RN64(pixels );\
780 const uint64_t b= AV_RN64(pixels+line_size);\
781 OP(*((uint64_t*)block), (a&b) + (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
787 static void OPNAME ## _pixels_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
791 const uint64_t a= AV_RN64(pixels );\
792 const uint64_t b= AV_RN64(pixels+line_size);\
793 OP(*((uint64_t*)block), (a|b) - (((a^b)&0xFEFEFEFEFEFEFEFEULL)>>1));\
799 static void OPNAME ## _pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
802 const uint64_t a= AV_RN64(pixels );\
803 const uint64_t b= AV_RN64(pixels+1);\
804 uint64_t l0= (a&0x0303030303030303ULL)\
805 + (b&0x0303030303030303ULL)\
806 + 0x0202020202020202ULL;\
807 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
808 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
812 for(i=0; i<h; i+=2){\
813 uint64_t a= AV_RN64(pixels );\
814 uint64_t b= AV_RN64(pixels+1);\
815 l1= (a&0x0303030303030303ULL)\
816 + (b&0x0303030303030303ULL);\
817 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
818 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
819 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
822 a= AV_RN64(pixels );\
823 b= AV_RN64(pixels+1);\
824 l0= (a&0x0303030303030303ULL)\
825 + (b&0x0303030303030303ULL)\
826 + 0x0202020202020202ULL;\
827 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
828 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
829 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
835 static void OPNAME ## _no_rnd_pixels_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
838 const uint64_t a= AV_RN64(pixels );\
839 const uint64_t b= AV_RN64(pixels+1);\
840 uint64_t l0= (a&0x0303030303030303ULL)\
841 + (b&0x0303030303030303ULL)\
842 + 0x0101010101010101ULL;\
843 uint64_t h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
844 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
848 for(i=0; i<h; i+=2){\
849 uint64_t a= AV_RN64(pixels );\
850 uint64_t b= AV_RN64(pixels+1);\
851 l1= (a&0x0303030303030303ULL)\
852 + (b&0x0303030303030303ULL);\
853 h1= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
854 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
855 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
858 a= AV_RN64(pixels );\
859 b= AV_RN64(pixels+1);\
860 l0= (a&0x0303030303030303ULL)\
861 + (b&0x0303030303030303ULL)\
862 + 0x0101010101010101ULL;\
863 h0= ((a&0xFCFCFCFCFCFCFCFCULL)>>2)\
864 + ((b&0xFCFCFCFCFCFCFCFCULL)>>2);\
865 OP(*((uint64_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0F0F0F0F0FULL));\
871 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels_c , 8)\
872 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels_x2_c , 8)\
873 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels_y2_c , 8)\
874 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels_xy2_c, 8)\
875 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels_x2_c , 8)\
876 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels_y2_c , 8)\
877 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels_xy2_c, 8)
879 #define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
880 #else // 64 bit variant
882 #define PIXOP2(OPNAME, OP) \
883 static void OPNAME ## _pixels2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
886 OP(*((uint16_t*)(block )), AV_RN16(pixels ));\
891 static void OPNAME ## _pixels4_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
894 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
899 static void OPNAME ## _pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
902 OP(*((uint32_t*)(block )), AV_RN32(pixels ));\
903 OP(*((uint32_t*)(block+4)), AV_RN32(pixels+4));\
908 static inline void OPNAME ## _no_rnd_pixels8_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
909 OPNAME ## _pixels8_c(block, pixels, line_size, h);\
912 static inline void OPNAME ## _no_rnd_pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
913 int src_stride1, int src_stride2, int h){\
917 a= AV_RN32(&src1[i*src_stride1 ]);\
918 b= AV_RN32(&src2[i*src_stride2 ]);\
919 OP(*((uint32_t*)&dst[i*dst_stride ]), no_rnd_avg32(a, b));\
920 a= AV_RN32(&src1[i*src_stride1+4]);\
921 b= AV_RN32(&src2[i*src_stride2+4]);\
922 OP(*((uint32_t*)&dst[i*dst_stride+4]), no_rnd_avg32(a, b));\
926 static inline void OPNAME ## _pixels8_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
927 int src_stride1, int src_stride2, int h){\
931 a= AV_RN32(&src1[i*src_stride1 ]);\
932 b= AV_RN32(&src2[i*src_stride2 ]);\
933 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
934 a= AV_RN32(&src1[i*src_stride1+4]);\
935 b= AV_RN32(&src2[i*src_stride2+4]);\
936 OP(*((uint32_t*)&dst[i*dst_stride+4]), rnd_avg32(a, b));\
940 static inline void OPNAME ## _pixels4_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
941 int src_stride1, int src_stride2, int h){\
945 a= AV_RN32(&src1[i*src_stride1 ]);\
946 b= AV_RN32(&src2[i*src_stride2 ]);\
947 OP(*((uint32_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
951 static inline void OPNAME ## _pixels2_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
952 int src_stride1, int src_stride2, int h){\
956 a= AV_RN16(&src1[i*src_stride1 ]);\
957 b= AV_RN16(&src2[i*src_stride2 ]);\
958 OP(*((uint16_t*)&dst[i*dst_stride ]), rnd_avg32(a, b));\
962 static inline void OPNAME ## _pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
963 int src_stride1, int src_stride2, int h){\
964 OPNAME ## _pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
965 OPNAME ## _pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
968 static inline void OPNAME ## _no_rnd_pixels16_l2(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int dst_stride, \
969 int src_stride1, int src_stride2, int h){\
970 OPNAME ## _no_rnd_pixels8_l2(dst , src1 , src2 , dst_stride, src_stride1, src_stride2, h);\
971 OPNAME ## _no_rnd_pixels8_l2(dst+8, src1+8, src2+8, dst_stride, src_stride1, src_stride2, h);\
974 static inline void OPNAME ## _no_rnd_pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
975 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
978 static inline void OPNAME ## _pixels8_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
979 OPNAME ## _pixels8_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
982 static inline void OPNAME ## _no_rnd_pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
983 OPNAME ## _no_rnd_pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
986 static inline void OPNAME ## _pixels8_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
987 OPNAME ## _pixels8_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
990 static inline void OPNAME ## _pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
991 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
994 uint32_t a, b, c, d, l0, l1, h0, h1;\
995 a= AV_RN32(&src1[i*src_stride1]);\
996 b= AV_RN32(&src2[i*src_stride2]);\
997 c= AV_RN32(&src3[i*src_stride3]);\
998 d= AV_RN32(&src4[i*src_stride4]);\
999 l0= (a&0x03030303UL)\
1002 h0= ((a&0xFCFCFCFCUL)>>2)\
1003 + ((b&0xFCFCFCFCUL)>>2);\
1004 l1= (c&0x03030303UL)\
1005 + (d&0x03030303UL);\
1006 h1= ((c&0xFCFCFCFCUL)>>2)\
1007 + ((d&0xFCFCFCFCUL)>>2);\
1008 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1009 a= AV_RN32(&src1[i*src_stride1+4]);\
1010 b= AV_RN32(&src2[i*src_stride2+4]);\
1011 c= AV_RN32(&src3[i*src_stride3+4]);\
1012 d= AV_RN32(&src4[i*src_stride4+4]);\
1013 l0= (a&0x03030303UL)\
1016 h0= ((a&0xFCFCFCFCUL)>>2)\
1017 + ((b&0xFCFCFCFCUL)>>2);\
1018 l1= (c&0x03030303UL)\
1019 + (d&0x03030303UL);\
1020 h1= ((c&0xFCFCFCFCUL)>>2)\
1021 + ((d&0xFCFCFCFCUL)>>2);\
1022 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1026 static inline void OPNAME ## _pixels4_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1027 OPNAME ## _pixels4_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1030 static inline void OPNAME ## _pixels4_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1031 OPNAME ## _pixels4_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1034 static inline void OPNAME ## _pixels2_x2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1035 OPNAME ## _pixels2_l2(block, pixels, pixels+1, line_size, line_size, line_size, h);\
1038 static inline void OPNAME ## _pixels2_y2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h){\
1039 OPNAME ## _pixels2_l2(block, pixels, pixels+line_size, line_size, line_size, line_size, h);\
1042 static inline void OPNAME ## _no_rnd_pixels8_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1043 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1045 for(i=0; i<h; i++){\
1046 uint32_t a, b, c, d, l0, l1, h0, h1;\
1047 a= AV_RN32(&src1[i*src_stride1]);\
1048 b= AV_RN32(&src2[i*src_stride2]);\
1049 c= AV_RN32(&src3[i*src_stride3]);\
1050 d= AV_RN32(&src4[i*src_stride4]);\
1051 l0= (a&0x03030303UL)\
1054 h0= ((a&0xFCFCFCFCUL)>>2)\
1055 + ((b&0xFCFCFCFCUL)>>2);\
1056 l1= (c&0x03030303UL)\
1057 + (d&0x03030303UL);\
1058 h1= ((c&0xFCFCFCFCUL)>>2)\
1059 + ((d&0xFCFCFCFCUL)>>2);\
1060 OP(*((uint32_t*)&dst[i*dst_stride]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1061 a= AV_RN32(&src1[i*src_stride1+4]);\
1062 b= AV_RN32(&src2[i*src_stride2+4]);\
1063 c= AV_RN32(&src3[i*src_stride3+4]);\
1064 d= AV_RN32(&src4[i*src_stride4+4]);\
1065 l0= (a&0x03030303UL)\
1068 h0= ((a&0xFCFCFCFCUL)>>2)\
1069 + ((b&0xFCFCFCFCUL)>>2);\
1070 l1= (c&0x03030303UL)\
1071 + (d&0x03030303UL);\
1072 h1= ((c&0xFCFCFCFCUL)>>2)\
1073 + ((d&0xFCFCFCFCUL)>>2);\
1074 OP(*((uint32_t*)&dst[i*dst_stride+4]), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1077 static inline void OPNAME ## _pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1078 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1079 OPNAME ## _pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1080 OPNAME ## _pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1082 static inline void OPNAME ## _no_rnd_pixels16_l4(uint8_t *dst, const uint8_t *src1, uint8_t *src2, uint8_t *src3, uint8_t *src4,\
1083 int dst_stride, int src_stride1, int src_stride2,int src_stride3,int src_stride4, int h){\
1084 OPNAME ## _no_rnd_pixels8_l4(dst , src1 , src2 , src3 , src4 , dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1085 OPNAME ## _no_rnd_pixels8_l4(dst+8, src1+8, src2+8, src3+8, src4+8, dst_stride, src_stride1, src_stride2, src_stride3, src_stride4, h);\
1088 static inline void OPNAME ## _pixels2_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1090 int i, a0, b0, a1, b1;\
1097 for(i=0; i<h; i+=2){\
1103 block[0]= (a1+a0)>>2; /* FIXME non put */\
1104 block[1]= (b1+b0)>>2;\
1114 block[0]= (a1+a0)>>2;\
1115 block[1]= (b1+b0)>>2;\
1121 static inline void OPNAME ## _pixels4_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1124 const uint32_t a= AV_RN32(pixels );\
1125 const uint32_t b= AV_RN32(pixels+1);\
1126 uint32_t l0= (a&0x03030303UL)\
1129 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1130 + ((b&0xFCFCFCFCUL)>>2);\
1134 for(i=0; i<h; i+=2){\
1135 uint32_t a= AV_RN32(pixels );\
1136 uint32_t b= AV_RN32(pixels+1);\
1137 l1= (a&0x03030303UL)\
1138 + (b&0x03030303UL);\
1139 h1= ((a&0xFCFCFCFCUL)>>2)\
1140 + ((b&0xFCFCFCFCUL)>>2);\
1141 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1144 a= AV_RN32(pixels );\
1145 b= AV_RN32(pixels+1);\
1146 l0= (a&0x03030303UL)\
1149 h0= ((a&0xFCFCFCFCUL)>>2)\
1150 + ((b&0xFCFCFCFCUL)>>2);\
1151 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1157 static inline void OPNAME ## _pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1160 for(j=0; j<2; j++){\
1162 const uint32_t a= AV_RN32(pixels );\
1163 const uint32_t b= AV_RN32(pixels+1);\
1164 uint32_t l0= (a&0x03030303UL)\
1167 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1168 + ((b&0xFCFCFCFCUL)>>2);\
1172 for(i=0; i<h; i+=2){\
1173 uint32_t a= AV_RN32(pixels );\
1174 uint32_t b= AV_RN32(pixels+1);\
1175 l1= (a&0x03030303UL)\
1176 + (b&0x03030303UL);\
1177 h1= ((a&0xFCFCFCFCUL)>>2)\
1178 + ((b&0xFCFCFCFCUL)>>2);\
1179 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1182 a= AV_RN32(pixels );\
1183 b= AV_RN32(pixels+1);\
1184 l0= (a&0x03030303UL)\
1187 h0= ((a&0xFCFCFCFCUL)>>2)\
1188 + ((b&0xFCFCFCFCUL)>>2);\
1189 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1193 pixels+=4-line_size*(h+1);\
1194 block +=4-line_size*h;\
1198 static inline void OPNAME ## _no_rnd_pixels8_xy2_c(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
1201 for(j=0; j<2; j++){\
1203 const uint32_t a= AV_RN32(pixels );\
1204 const uint32_t b= AV_RN32(pixels+1);\
1205 uint32_t l0= (a&0x03030303UL)\
1208 uint32_t h0= ((a&0xFCFCFCFCUL)>>2)\
1209 + ((b&0xFCFCFCFCUL)>>2);\
1213 for(i=0; i<h; i+=2){\
1214 uint32_t a= AV_RN32(pixels );\
1215 uint32_t b= AV_RN32(pixels+1);\
1216 l1= (a&0x03030303UL)\
1217 + (b&0x03030303UL);\
1218 h1= ((a&0xFCFCFCFCUL)>>2)\
1219 + ((b&0xFCFCFCFCUL)>>2);\
1220 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1223 a= AV_RN32(pixels );\
1224 b= AV_RN32(pixels+1);\
1225 l0= (a&0x03030303UL)\
1228 h0= ((a&0xFCFCFCFCUL)>>2)\
1229 + ((b&0xFCFCFCFCUL)>>2);\
1230 OP(*((uint32_t*)block), h0+h1+(((l0+l1)>>2)&0x0F0F0F0FUL));\
1234 pixels+=4-line_size*(h+1);\
1235 block +=4-line_size*h;\
1239 CALL_2X_PIXELS(OPNAME ## _pixels16_c , OPNAME ## _pixels8_c , 8)\
1240 CALL_2X_PIXELS(OPNAME ## _pixels16_x2_c , OPNAME ## _pixels8_x2_c , 8)\
1241 CALL_2X_PIXELS(OPNAME ## _pixels16_y2_c , OPNAME ## _pixels8_y2_c , 8)\
1242 CALL_2X_PIXELS(OPNAME ## _pixels16_xy2_c, OPNAME ## _pixels8_xy2_c, 8)\
1243 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_c , OPNAME ## _pixels8_c , 8)\
1244 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_x2_c , OPNAME ## _no_rnd_pixels8_x2_c , 8)\
1245 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_y2_c , OPNAME ## _no_rnd_pixels8_y2_c , 8)\
1246 CALL_2X_PIXELS(OPNAME ## _no_rnd_pixels16_xy2_c, OPNAME ## _no_rnd_pixels8_xy2_c, 8)\
1248 #define op_avg(a, b) a = rnd_avg32(a, b)
1250 #define op_put(a, b) a = b
/* Rounded 2- and 4-sample averages used by the qpel/tpel code below.
 * Arguments are fully parenthesized so call sites may pass arbitrary
 * expressions without operator-precedence surprises (macro hygiene). */
#define avg2(a, b)       (((a) + (b) + 1) >> 1)
#define avg4(a, b, c, d) (((a) + (b) + (c) + (d) + 2) >> 2)
/* Width-16 no-rounding average of two source blocks; thin adapter that
 * expands the single-stride interface to the three-stride helper. */
static void put_no_rnd_pixels16_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels16_l2(dst, a, b, stride, stride, stride, h);
}
/* Width-8 no-rounding average of two source blocks; thin adapter that
 * expands the single-stride interface to the three-stride helper. */
static void put_no_rnd_pixels8_l2_c(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h){
    put_no_rnd_pixels8_l2(dst, a, b, stride, stride, stride, h);
}
/**
 * 1/16-pel global motion compensation for one 8-pixel-wide row strip.
 * Bilinear interpolation with 8-bit weights; A+B+C+D == 256, so the
 * result is (weighted sum + rounder) >> 8.
 * Reads src[0..8] and src[stride..stride+8] on each row, so the source
 * must be padded by at least one pixel to the right and below.
 */
static void gmc1_c(uint8_t *dst, uint8_t *src, int stride, int h, int x16, int y16, int rounder)
{
    const int A=(16-x16)*(16-y16);
    const int B=(   x16)*(16-y16);
    const int C=(16-x16)*(   y16);
    const int D=(   x16)*(   y16);
    int i;

    for(i=0; i<h; i++)
    {
        dst[0]= (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + rounder)>>8;
        dst[1]= (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + rounder)>>8;
        dst[2]= (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + rounder)>>8;
        dst[3]= (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + rounder)>>8;
        dst[4]= (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + rounder)>>8;
        dst[5]= (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + rounder)>>8;
        dst[6]= (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + rounder)>>8;
        dst[7]= (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + rounder)>>8;
        dst+= stride;
        src+= stride;
    }
}
/**
 * Global motion compensation for one 8-pixel-wide strip (MPEG-4 GMC).
 * (ox,oy) is the 16.16 fixed-point source position of the first pixel;
 * dxx/dyx advance the position per output pixel, dxy/dyy per output line.
 * shift selects the sub-pel precision s = 1<<shift; r is the rounder
 * added before the final >>(2*shift). Samples addressed outside the
 * width x height source area are clamped to the nearest edge pixel.
 */
void ff_gmc_c(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
              int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    int y, vx, vy;
    const int s= 1<<shift;

    width--;   /* from here on, width/height are the last valid coordinates */
    height--;

    for(y=0; y<h; y++){
        int x;

        vx= ox;
        vy= oy;
        for(x=0; x<8; x++){ //XXX FIXME optimize
            int src_x, src_y, frac_x, frac_y, index;

            src_x= vx>>16;
            src_y= vy>>16;
            frac_x= src_x&(s-1);
            frac_y= src_y&(s-1);
            src_x>>=shift;
            src_y>>=shift;

            if((unsigned)src_x < width){
                if((unsigned)src_y < height){
                    /* fully inside the source: bilinear interpolation */
                    index= src_x + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*(s-frac_y)
                                       + (  src[index+stride  ]*(s-frac_x)
                                          + src[index+stride+1]*   frac_x )*   frac_y
                                       + r)>>(shift*2);
                }else{
                    /* vertically outside: clamp y, interpolate in x only */
                    index= src_x + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_x)
                                          + src[index       +1]*   frac_x )*s
                                       + r)>>(shift*2);
                }
            }else{
                if((unsigned)src_y < height){
                    /* horizontally outside: clamp x, interpolate in y only */
                    index= av_clip(src_x, 0, width) + src_y*stride;
                    dst[y*stride + x]= ( (  src[index         ]*(s-frac_y)
                                          + src[index+stride  ]*   frac_y )*s
                                       + r)>>(shift*2);
                }else{
                    /* both outside: nearest corner pixel, no interpolation */
                    index= av_clip(src_x, 0, width) + av_clip(src_y, 0, height)*stride;
                    dst[y*stride + x]= src[index];
                }
            }

            vx+= dxx;
            vy+= dyx;
        }
        ox += dxy;
        oy += dyy;
    }
}
/* Thirdpel MC, integer position: plain copy, dispatched on block width. */
static inline void put_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: put_pixels2_c (dst, src, stride, height); break;
    case 4: put_pixels4_c (dst, src, stride, height); break;
    case 8: put_pixels8_c (dst, src, stride, height); break;
    case 16:put_pixels16_c(dst, src, stride, height); break;
    }
}
/* Thirdpel MC, horizontal 1/3: dst ~= (2*a + b)/3, 683/2048 ~= 1/3 rounding. */
static inline void put_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, horizontal 2/3: dst ~= (a + 2*b)/3 with rounding. */
static inline void put_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+1] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, vertical 1/3: dst ~= (2*top + bottom)/3 with rounding. */
static inline void put_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(2*src[j] + src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (1/3,1/3): bilinear 4/3/3/2 weights, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (1/3,2/3): bilinear 3/2/4/3 weights, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, vertical 2/3: dst ~= (top + 2*bottom)/3 with rounding. */
static inline void put_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (683*(src[j] + 2*src[j+stride] + 1)) >> 11;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (2/3,1/3): bilinear 3/4/2/3 weights, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (2/3,2/3): bilinear 2/3/3/4 weights, 2731/32768 ~= 1/12. */
static inline void put_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, integer position, averaging variant: avg with dst,
 * dispatched on block width. */
static inline void avg_tpel_pixels_mc00_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    switch(width){
    case 2: avg_pixels2_c (dst, src, stride, height); break;
    case 4: avg_pixels4_c (dst, src, stride, height); break;
    case 8: avg_pixels8_c (dst, src, stride, height); break;
    case 16:avg_pixels16_c(dst, src, stride, height); break;
    }
}
/* Thirdpel MC, horizontal 1/3, averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc10_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, horizontal 2/3, averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc20_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+1] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, vertical 1/3, averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc01_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(2*src[j] + src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (1/3,1/3), averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc11_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(4*src[j] + 3*src[j+1] + 3*src[j+stride] + 2*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (1/3,2/3), averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc12_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 2*src[j+1] + 4*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, vertical 2/3, averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc02_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((683*(src[j] + 2*src[j+stride] + 1)) >> 11) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (2/3,1/3), averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc21_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(3*src[j] + 4*src[j+1] + 2*src[j+stride] + 3*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* Thirdpel MC, (2/3,2/3), averaged with the existing dst (rounded). */
static inline void avg_tpel_pixels_mc22_c(uint8_t *dst, const uint8_t *src, int stride, int width, int height){
    int i,j;
    for (i=0; i < height; i++) {
        for (j=0; j < width; j++) {
            dst[j] = (dst[j] + ((2731*(2*src[j] + 3*src[j+1] + 3*src[j+stride] + 4*src[j+stride+1] + 6)) >> 15) + 1) >> 1;
        }
        src += stride;
        dst += stride;
    }
}
/* TPEL_WIDTH(width): generate thirdpel wrappers with the block width
 * baked in. NOTE(review): the historical version had a stray "void "
 * before each call, which turned the call into an invalid declaration
 * and made the wrappers no-ops; removed here so they actually call the
 * generic implementations. */
#define TPEL_WIDTH(width)\
static void put_tpel_pixels ## width ## _mc00_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc00_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc10_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc10_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc20_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc20_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc01_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc01_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc11_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc11_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc21_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc21_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc02_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc02_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc12_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc12_c(dst, src, stride, width, height);}\
static void put_tpel_pixels ## width ## _mc22_c(uint8_t *dst, const uint8_t *src, int stride, int height){\
    put_tpel_pixels_mc22_c(dst, src, stride, width, height);}
/* H264_CHROMA_MC(OPNAME, OP): generate 2/4/8-wide H.264 chroma MC with
 * (8-x)(8-y)/x(8-y)/(8-x)y/xy bilinear weights. OP applies the final
 * rounding/shift (and averaging for the "avg" flavor). When D==0 the
 * 2D filter degenerates to a 1D filter with weight E=B+C along either
 * the horizontal (step=1) or vertical (step=stride) axis. */
#define H264_CHROMA_MC(OPNAME, OP)\
static void OPNAME ## h264_chroma_mc2_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc4_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}\
\
static void OPNAME ## h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){\
    const int A=(8-x)*(8-y);\
    const int B=(  x)*(8-y);\
    const int C=(8-x)*(  y);\
    const int D=(  x)*(  y);\
    int i;\
    \
    assert(x<8 && y<8 && x>=0 && y>=0);\
\
    if(D){\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1]));\
            OP(dst[1], (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2]));\
            OP(dst[2], (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3]));\
            OP(dst[3], (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4]));\
            OP(dst[4], (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5]));\
            OP(dst[5], (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6]));\
            OP(dst[6], (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7]));\
            OP(dst[7], (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8]));\
            dst+= stride;\
            src+= stride;\
        }\
    }else{\
        const int E= B+C;\
        const int step= C ? stride : 1;\
        for(i=0; i<h; i++){\
            OP(dst[0], (A*src[0] + E*src[step+0]));\
            OP(dst[1], (A*src[1] + E*src[step+1]));\
            OP(dst[2], (A*src[2] + E*src[step+2]));\
            OP(dst[3], (A*src[3] + E*src[step+3]));\
            OP(dst[4], (A*src[4] + E*src[step+4]));\
            OP(dst[5], (A*src[5] + E*src[step+5]));\
            OP(dst[6], (A*src[6] + E*src[step+6]));\
            OP(dst[7], (A*src[7] + E*src[step+7]));\
            dst+= stride;\
            src+= stride;\
        }\
    }\
}
1665 #define op_avg(a, b) a = (((a)+(((b) + 32)>>6)+1)>>1)
1666 #define op_put(a, b) a = (((b) + 32)>>6)
1668 H264_CHROMA_MC(put_ , op_put)
1669 H264_CHROMA_MC(avg_ , op_avg)
/* 8-wide chroma bilinear MC with the VC-1 "no rounding" rule:
 * rounder is 32-4 = 28 instead of 32 before the >>6. */
static void put_no_rnd_h264_chroma_mc8_c(uint8_t *dst/*align 8*/, uint8_t *src/*align 1*/, int stride, int h, int x, int y){
    const int A=(8-x)*(8-y);
    const int B=(  x)*(8-y);
    const int C=(8-x)*(  y);
    const int D=(  x)*(  y);
    int i;

    assert(x<8 && y<8 && x>=0 && y>=0);

    for(i=0; i<h; i++)
    {
        dst[0] = (A*src[0] + B*src[1] + C*src[stride+0] + D*src[stride+1] + 32 - 4) >> 6;
        dst[1] = (A*src[1] + B*src[2] + C*src[stride+1] + D*src[stride+2] + 32 - 4) >> 6;
        dst[2] = (A*src[2] + B*src[3] + C*src[stride+2] + D*src[stride+3] + 32 - 4) >> 6;
        dst[3] = (A*src[3] + B*src[4] + C*src[stride+3] + D*src[stride+4] + 32 - 4) >> 6;
        dst[4] = (A*src[4] + B*src[5] + C*src[stride+4] + D*src[stride+5] + 32 - 4) >> 6;
        dst[5] = (A*src[5] + B*src[6] + C*src[stride+5] + D*src[stride+6] + 32 - 4) >> 6;
        dst[6] = (A*src[6] + B*src[7] + C*src[stride+6] + D*src[stride+7] + 32 - 4) >> 6;
        dst[7] = (A*src[7] + B*src[8] + C*src[stride+7] + D*src[stride+8] + 32 - 4) >> 6;
        dst+= stride;
        src+= stride;
    }
}
1697 #define QPEL_MC(r, OPNAME, RND, OP) \
1698 static void OPNAME ## mpeg4_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1699 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1703 OP(dst[0], (src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]));\
1704 OP(dst[1], (src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]));\
1705 OP(dst[2], (src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]));\
1706 OP(dst[3], (src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]));\
1707 OP(dst[4], (src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]));\
1708 OP(dst[5], (src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]));\
1709 OP(dst[6], (src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]));\
1710 OP(dst[7], (src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]));\
1716 static void OPNAME ## mpeg4_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1718 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1722 const int src0= src[0*srcStride];\
1723 const int src1= src[1*srcStride];\
1724 const int src2= src[2*srcStride];\
1725 const int src3= src[3*srcStride];\
1726 const int src4= src[4*srcStride];\
1727 const int src5= src[5*srcStride];\
1728 const int src6= src[6*srcStride];\
1729 const int src7= src[7*srcStride];\
1730 const int src8= src[8*srcStride];\
1731 OP(dst[0*dstStride], (src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4));\
1732 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5));\
1733 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6));\
1734 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7));\
1735 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8));\
1736 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8));\
1737 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7));\
1738 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6));\
1744 static void OPNAME ## mpeg4_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1745 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1750 OP(dst[ 0], (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]));\
1751 OP(dst[ 1], (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]));\
1752 OP(dst[ 2], (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]));\
1753 OP(dst[ 3], (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]));\
1754 OP(dst[ 4], (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]));\
1755 OP(dst[ 5], (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]));\
1756 OP(dst[ 6], (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]));\
1757 OP(dst[ 7], (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]));\
1758 OP(dst[ 8], (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]));\
1759 OP(dst[ 9], (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]));\
1760 OP(dst[10], (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]));\
1761 OP(dst[11], (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]));\
1762 OP(dst[12], (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]));\
1763 OP(dst[13], (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]));\
1764 OP(dst[14], (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]));\
1765 OP(dst[15], (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]));\
1771 static void OPNAME ## mpeg4_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1772 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
1777 const int src0= src[0*srcStride];\
1778 const int src1= src[1*srcStride];\
1779 const int src2= src[2*srcStride];\
1780 const int src3= src[3*srcStride];\
1781 const int src4= src[4*srcStride];\
1782 const int src5= src[5*srcStride];\
1783 const int src6= src[6*srcStride];\
1784 const int src7= src[7*srcStride];\
1785 const int src8= src[8*srcStride];\
1786 const int src9= src[9*srcStride];\
1787 const int src10= src[10*srcStride];\
1788 const int src11= src[11*srcStride];\
1789 const int src12= src[12*srcStride];\
1790 const int src13= src[13*srcStride];\
1791 const int src14= src[14*srcStride];\
1792 const int src15= src[15*srcStride];\
1793 const int src16= src[16*srcStride];\
1794 OP(dst[ 0*dstStride], (src0 +src1 )*20 - (src0 +src2 )*6 + (src1 +src3 )*3 - (src2 +src4 ));\
1795 OP(dst[ 1*dstStride], (src1 +src2 )*20 - (src0 +src3 )*6 + (src0 +src4 )*3 - (src1 +src5 ));\
1796 OP(dst[ 2*dstStride], (src2 +src3 )*20 - (src1 +src4 )*6 + (src0 +src5 )*3 - (src0 +src6 ));\
1797 OP(dst[ 3*dstStride], (src3 +src4 )*20 - (src2 +src5 )*6 + (src1 +src6 )*3 - (src0 +src7 ));\
1798 OP(dst[ 4*dstStride], (src4 +src5 )*20 - (src3 +src6 )*6 + (src2 +src7 )*3 - (src1 +src8 ));\
1799 OP(dst[ 5*dstStride], (src5 +src6 )*20 - (src4 +src7 )*6 + (src3 +src8 )*3 - (src2 +src9 ));\
1800 OP(dst[ 6*dstStride], (src6 +src7 )*20 - (src5 +src8 )*6 + (src4 +src9 )*3 - (src3 +src10));\
1801 OP(dst[ 7*dstStride], (src7 +src8 )*20 - (src6 +src9 )*6 + (src5 +src10)*3 - (src4 +src11));\
1802 OP(dst[ 8*dstStride], (src8 +src9 )*20 - (src7 +src10)*6 + (src6 +src11)*3 - (src5 +src12));\
1803 OP(dst[ 9*dstStride], (src9 +src10)*20 - (src8 +src11)*6 + (src7 +src12)*3 - (src6 +src13));\
1804 OP(dst[10*dstStride], (src10+src11)*20 - (src9 +src12)*6 + (src8 +src13)*3 - (src7 +src14));\
1805 OP(dst[11*dstStride], (src11+src12)*20 - (src10+src13)*6 + (src9 +src14)*3 - (src8 +src15));\
1806 OP(dst[12*dstStride], (src12+src13)*20 - (src11+src14)*6 + (src10+src15)*3 - (src9 +src16));\
1807 OP(dst[13*dstStride], (src13+src14)*20 - (src12+src15)*6 + (src11+src16)*3 - (src10+src16));\
1808 OP(dst[14*dstStride], (src14+src15)*20 - (src13+src16)*6 + (src12+src16)*3 - (src11+src15));\
1809 OP(dst[15*dstStride], (src15+src16)*20 - (src14+src16)*6 + (src13+src15)*3 - (src12+src14));\
1815 static void OPNAME ## qpel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1816 OPNAME ## pixels8_c(dst, src, stride, 8);\
1819 static void OPNAME ## qpel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
1821 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1822 OPNAME ## pixels8_l2(dst, src, half, stride, stride, 8, 8);\
1825 static void OPNAME ## qpel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
1826 OPNAME ## mpeg4_qpel8_h_lowpass(dst, src, stride, stride, 8);\
1829 static void OPNAME ## qpel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
1831 put ## RND ## mpeg4_qpel8_h_lowpass(half, src, 8, stride, 8);\
1832 OPNAME ## pixels8_l2(dst, src+1, half, stride, stride, 8, 8);\
1835 static void OPNAME ## qpel8_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
1836 uint8_t full[16*9];\
1838 copy_block9(full, src, 16, stride, 9);\
1839 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1840 OPNAME ## pixels8_l2(dst, full, half, stride, 16, 8, 8);\
1843 static void OPNAME ## qpel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
1844 uint8_t full[16*9];\
1845 copy_block9(full, src, 16, stride, 9);\
1846 OPNAME ## mpeg4_qpel8_v_lowpass(dst, full, stride, 16);\
1849 static void OPNAME ## qpel8_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
1850 uint8_t full[16*9];\
1852 copy_block9(full, src, 16, stride, 9);\
1853 put ## RND ## mpeg4_qpel8_v_lowpass(half, full, 8, 16);\
1854 OPNAME ## pixels8_l2(dst, full+16, half, stride, 16, 8, 8);\
1856 void ff_ ## OPNAME ## qpel8_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
1857 uint8_t full[16*9];\
1860 uint8_t halfHV[64];\
1861 copy_block9(full, src, 16, stride, 9);\
1862 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1863 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1864 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1865 OPNAME ## pixels8_l4(dst, full, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1867 static void OPNAME ## qpel8_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
1868 uint8_t full[16*9];\
1870 uint8_t halfHV[64];\
1871 copy_block9(full, src, 16, stride, 9);\
1872 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1873 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1874 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1875 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1877 void ff_ ## OPNAME ## qpel8_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
1878 uint8_t full[16*9];\
1881 uint8_t halfHV[64];\
1882 copy_block9(full, src, 16, stride, 9);\
1883 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1884 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1885 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1886 OPNAME ## pixels8_l4(dst, full+1, halfH, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1888 static void OPNAME ## qpel8_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
1889 uint8_t full[16*9];\
1891 uint8_t halfHV[64];\
1892 copy_block9(full, src, 16, stride, 9);\
1893 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1894 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1895 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1896 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1898 void ff_ ## OPNAME ## qpel8_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
1899 uint8_t full[16*9];\
1902 uint8_t halfHV[64];\
1903 copy_block9(full, src, 16, stride, 9);\
1904 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1905 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1906 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1907 OPNAME ## pixels8_l4(dst, full+16, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1909 static void OPNAME ## qpel8_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
1910 uint8_t full[16*9];\
1912 uint8_t halfHV[64];\
1913 copy_block9(full, src, 16, stride, 9);\
1914 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1915 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1916 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1917 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1919 void ff_ ## OPNAME ## qpel8_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
1920 uint8_t full[16*9];\
1923 uint8_t halfHV[64];\
1924 copy_block9(full, src, 16, stride, 9);\
1925 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full , 8, 16, 9);\
1926 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1927 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1928 OPNAME ## pixels8_l4(dst, full+17, halfH+8, halfV, halfHV, stride, 16, 8, 8, 8, 8);\
1930 static void OPNAME ## qpel8_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
1931 uint8_t full[16*9];\
1933 uint8_t halfHV[64];\
1934 copy_block9(full, src, 16, stride, 9);\
1935 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1936 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1937 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1938 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1940 static void OPNAME ## qpel8_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
1942 uint8_t halfHV[64];\
1943 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1944 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1945 OPNAME ## pixels8_l2(dst, halfH, halfHV, stride, 8, 8, 8);\
1947 static void OPNAME ## qpel8_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
1949 uint8_t halfHV[64];\
1950 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1951 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1952 OPNAME ## pixels8_l2(dst, halfH+8, halfHV, stride, 8, 8, 8);\
1954 void ff_ ## OPNAME ## qpel8_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
1955 uint8_t full[16*9];\
1958 uint8_t halfHV[64];\
1959 copy_block9(full, src, 16, stride, 9);\
1960 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1961 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full, 8, 16);\
1962 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1963 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1965 static void OPNAME ## qpel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
1966 uint8_t full[16*9];\
1968 copy_block9(full, src, 16, stride, 9);\
1969 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1970 put ## RND ## pixels8_l2(halfH, halfH, full, 8, 8, 16, 9);\
1971 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1973 void ff_ ## OPNAME ## qpel8_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
1974 uint8_t full[16*9];\
1977 uint8_t halfHV[64];\
1978 copy_block9(full, src, 16, stride, 9);\
1979 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1980 put ## RND ## mpeg4_qpel8_v_lowpass(halfV, full+1, 8, 16);\
1981 put ## RND ## mpeg4_qpel8_v_lowpass(halfHV, halfH, 8, 8);\
1982 OPNAME ## pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);\
1984 static void OPNAME ## qpel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
1985 uint8_t full[16*9];\
1987 copy_block9(full, src, 16, stride, 9);\
1988 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, full, 8, 16, 9);\
1989 put ## RND ## pixels8_l2(halfH, halfH, full+1, 8, 8, 16, 9);\
1990 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1992 static void OPNAME ## qpel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
1994 put ## RND ## mpeg4_qpel8_h_lowpass(halfH, src, 8, stride, 9);\
1995 OPNAME ## mpeg4_qpel8_v_lowpass(dst, halfH, stride, 8);\
1997 static void OPNAME ## qpel16_mc00_c (uint8_t *dst, uint8_t *src, int stride){\
1998 OPNAME ## pixels16_c(dst, src, stride, 16);\
2001 static void OPNAME ## qpel16_mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2003 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2004 OPNAME ## pixels16_l2(dst, src, half, stride, stride, 16, 16);\
2007 static void OPNAME ## qpel16_mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2008 OPNAME ## mpeg4_qpel16_h_lowpass(dst, src, stride, stride, 16);\
2011 static void OPNAME ## qpel16_mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2013 put ## RND ## mpeg4_qpel16_h_lowpass(half, src, 16, stride, 16);\
2014 OPNAME ## pixels16_l2(dst, src+1, half, stride, stride, 16, 16);\
2017 static void OPNAME ## qpel16_mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2018 uint8_t full[24*17];\
2020 copy_block17(full, src, 24, stride, 17);\
2021 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2022 OPNAME ## pixels16_l2(dst, full, half, stride, 24, 16, 16);\
2025 static void OPNAME ## qpel16_mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2026 uint8_t full[24*17];\
2027 copy_block17(full, src, 24, stride, 17);\
2028 OPNAME ## mpeg4_qpel16_v_lowpass(dst, full, stride, 24);\
2031 static void OPNAME ## qpel16_mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2032 uint8_t full[24*17];\
2034 copy_block17(full, src, 24, stride, 17);\
2035 put ## RND ## mpeg4_qpel16_v_lowpass(half, full, 16, 24);\
2036 OPNAME ## pixels16_l2(dst, full+24, half, stride, 24, 16, 16);\
2038 void ff_ ## OPNAME ## qpel16_mc11_old_c(uint8_t *dst, uint8_t *src, int stride){\
2039 uint8_t full[24*17];\
2040 uint8_t halfH[272];\
2041 uint8_t halfV[256];\
2042 uint8_t halfHV[256];\
2043 copy_block17(full, src, 24, stride, 17);\
2044 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2045 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2046 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2047 OPNAME ## pixels16_l4(dst, full, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2049 static void OPNAME ## qpel16_mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2050 uint8_t full[24*17];\
2051 uint8_t halfH[272];\
2052 uint8_t halfHV[256];\
2053 copy_block17(full, src, 24, stride, 17);\
2054 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2055 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2056 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2057 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2059 void ff_ ## OPNAME ## qpel16_mc31_old_c(uint8_t *dst, uint8_t *src, int stride){\
2060 uint8_t full[24*17];\
2061 uint8_t halfH[272];\
2062 uint8_t halfV[256];\
2063 uint8_t halfHV[256];\
2064 copy_block17(full, src, 24, stride, 17);\
2065 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2066 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2067 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2068 OPNAME ## pixels16_l4(dst, full+1, halfH, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2070 static void OPNAME ## qpel16_mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2071 uint8_t full[24*17];\
2072 uint8_t halfH[272];\
2073 uint8_t halfHV[256];\
2074 copy_block17(full, src, 24, stride, 17);\
2075 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2076 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2077 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2078 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2080 void ff_ ## OPNAME ## qpel16_mc13_old_c(uint8_t *dst, uint8_t *src, int stride){\
2081 uint8_t full[24*17];\
2082 uint8_t halfH[272];\
2083 uint8_t halfV[256];\
2084 uint8_t halfHV[256];\
2085 copy_block17(full, src, 24, stride, 17);\
2086 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2087 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2088 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2089 OPNAME ## pixels16_l4(dst, full+24, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2091 static void OPNAME ## qpel16_mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2092 uint8_t full[24*17];\
2093 uint8_t halfH[272];\
2094 uint8_t halfHV[256];\
2095 copy_block17(full, src, 24, stride, 17);\
2096 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2097 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2098 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2099 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2101 void ff_ ## OPNAME ## qpel16_mc33_old_c(uint8_t *dst, uint8_t *src, int stride){\
2102 uint8_t full[24*17];\
2103 uint8_t halfH[272];\
2104 uint8_t halfV[256];\
2105 uint8_t halfHV[256];\
2106 copy_block17(full, src, 24, stride, 17);\
2107 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full , 16, 24, 17);\
2108 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2109 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2110 OPNAME ## pixels16_l4(dst, full+25, halfH+16, halfV, halfHV, stride, 24, 16, 16, 16, 16);\
2112 static void OPNAME ## qpel16_mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2113 uint8_t full[24*17];\
2114 uint8_t halfH[272];\
2115 uint8_t halfHV[256];\
2116 copy_block17(full, src, 24, stride, 17);\
2117 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2118 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2119 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2120 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2122 static void OPNAME ## qpel16_mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2123 uint8_t halfH[272];\
2124 uint8_t halfHV[256];\
2125 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2126 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2127 OPNAME ## pixels16_l2(dst, halfH, halfHV, stride, 16, 16, 16);\
2129 static void OPNAME ## qpel16_mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2130 uint8_t halfH[272];\
2131 uint8_t halfHV[256];\
2132 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2133 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2134 OPNAME ## pixels16_l2(dst, halfH+16, halfHV, stride, 16, 16, 16);\
2136 void ff_ ## OPNAME ## qpel16_mc12_old_c(uint8_t *dst, uint8_t *src, int stride){\
2137 uint8_t full[24*17];\
2138 uint8_t halfH[272];\
2139 uint8_t halfV[256];\
2140 uint8_t halfHV[256];\
2141 copy_block17(full, src, 24, stride, 17);\
2142 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2143 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full, 16, 24);\
2144 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2145 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2147 static void OPNAME ## qpel16_mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2148 uint8_t full[24*17];\
2149 uint8_t halfH[272];\
2150 copy_block17(full, src, 24, stride, 17);\
2151 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2152 put ## RND ## pixels16_l2(halfH, halfH, full, 16, 16, 24, 17);\
2153 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2155 void ff_ ## OPNAME ## qpel16_mc32_old_c(uint8_t *dst, uint8_t *src, int stride){\
2156 uint8_t full[24*17];\
2157 uint8_t halfH[272];\
2158 uint8_t halfV[256];\
2159 uint8_t halfHV[256];\
2160 copy_block17(full, src, 24, stride, 17);\
2161 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2162 put ## RND ## mpeg4_qpel16_v_lowpass(halfV, full+1, 16, 24);\
2163 put ## RND ## mpeg4_qpel16_v_lowpass(halfHV, halfH, 16, 16);\
2164 OPNAME ## pixels16_l2(dst, halfV, halfHV, stride, 16, 16, 16);\
2166 static void OPNAME ## qpel16_mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2167 uint8_t full[24*17];\
2168 uint8_t halfH[272];\
2169 copy_block17(full, src, 24, stride, 17);\
2170 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, full, 16, 24, 17);\
2171 put ## RND ## pixels16_l2(halfH, halfH, full+1, 16, 16, 24, 17);\
2172 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
2174 static void OPNAME ## qpel16_mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2175 uint8_t halfH[272];\
2176 put ## RND ## mpeg4_qpel16_h_lowpass(halfH, src, 16, stride, 17);\
2177 OPNAME ## mpeg4_qpel16_v_lowpass(dst, halfH, stride, 16);\
/* Store operators plugged into the QPEL_MC macro above.  The filtered
 * value b is scaled down (>>5) through the crop table cm (clamping to
 * 0..255) and then either written directly (put) or averaged with the
 * existing destination pixel (avg).  The "+16" vs "+15" bias selects
 * rounding vs no-rounding variants. */
2180 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2181 #define op_avg_no_rnd(a, b) a = (((a)+cm[((b) + 15)>>5])>>1)
2182 #define op_put(a, b) a = cm[((b) + 16)>>5]
2183 #define op_put_no_rnd(a, b) a = cm[((b) + 15)>>5]
/* Instantiate the put/avg quarter-pel MC families, rounded and
 * non-rounded; the avg_no_rnd variant is intentionally disabled. */
2185 QPEL_MC(0, put_ , _ , op_put)
2186 QPEL_MC(1, put_no_rnd_, _no_rnd_, op_put_no_rnd)
2187 QPEL_MC(0, avg_ , _ , op_avg)
2188 //QPEL_MC(1, avg_no_rnd , _ , op_avg)
2190 #undef op_avg_no_rnd
2192 #undef op_put_no_rnd
/* H264_LOWPASS: expands into the H.264 half-pel 6-tap interpolation
 * primitives for 2-, 4-, 8- and 16-wide blocks.
 *  - *_h_lowpass / *_v_lowpass apply the (1,-5,20,20,-5,1) filter
 *    horizontally resp. vertically and store through OP (>>5 scaling
 *    happens inside OP, see op_put/op_avg below).
 *  - *_hv_lowpass first filters horizontally into the int16_t tmp
 *    buffer (keeping extra precision), then vertically, storing through
 *    OP2 (which scales by >>10 for the two-pass result).
 * OPNAME selects the put_/avg_ name prefix; OP/OP2 are the store
 * operators.  NOTE(review): several loop/brace lines of this macro are
 * not visible in this chunk. */
2195 #define H264_LOWPASS(OPNAME, OP, OP2) \
2196 static av_unused void OPNAME ## h264_qpel2_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2198 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2202 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2203 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2209 static av_unused void OPNAME ## h264_qpel2_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2211 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2215 const int srcB= src[-2*srcStride];\
2216 const int srcA= src[-1*srcStride];\
2217 const int src0= src[0 *srcStride];\
2218 const int src1= src[1 *srcStride];\
2219 const int src2= src[2 *srcStride];\
2220 const int src3= src[3 *srcStride];\
2221 const int src4= src[4 *srcStride];\
2222 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2223 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2229 static av_unused void OPNAME ## h264_qpel2_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2232 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2234 src -= 2*srcStride;\
2235 for(i=0; i<h+5; i++)\
2237 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2238 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2242 tmp -= tmpStride*(h+5-2);\
2245 const int tmpB= tmp[-2*tmpStride];\
2246 const int tmpA= tmp[-1*tmpStride];\
2247 const int tmp0= tmp[0 *tmpStride];\
2248 const int tmp1= tmp[1 *tmpStride];\
2249 const int tmp2= tmp[2 *tmpStride];\
2250 const int tmp3= tmp[3 *tmpStride];\
2251 const int tmp4= tmp[4 *tmpStride];\
2252 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2253 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2258 static void OPNAME ## h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2260 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2264 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]));\
2265 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]));\
2266 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]));\
2267 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]));\
2273 static void OPNAME ## h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2275 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2279 const int srcB= src[-2*srcStride];\
2280 const int srcA= src[-1*srcStride];\
2281 const int src0= src[0 *srcStride];\
2282 const int src1= src[1 *srcStride];\
2283 const int src2= src[2 *srcStride];\
2284 const int src3= src[3 *srcStride];\
2285 const int src4= src[4 *srcStride];\
2286 const int src5= src[5 *srcStride];\
2287 const int src6= src[6 *srcStride];\
2288 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2289 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2290 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2291 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2297 static void OPNAME ## h264_qpel4_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2300 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2302 src -= 2*srcStride;\
2303 for(i=0; i<h+5; i++)\
2305 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3]);\
2306 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4]);\
2307 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5]);\
2308 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6]);\
2312 tmp -= tmpStride*(h+5-2);\
2315 const int tmpB= tmp[-2*tmpStride];\
2316 const int tmpA= tmp[-1*tmpStride];\
2317 const int tmp0= tmp[0 *tmpStride];\
2318 const int tmp1= tmp[1 *tmpStride];\
2319 const int tmp2= tmp[2 *tmpStride];\
2320 const int tmp3= tmp[3 *tmpStride];\
2321 const int tmp4= tmp[4 *tmpStride];\
2322 const int tmp5= tmp[5 *tmpStride];\
2323 const int tmp6= tmp[6 *tmpStride];\
2324 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2325 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2326 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2327 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
/* 8-wide variants of the same 6-tap (1,-5,20,20,-5,1) filter: plain
   horizontal, plain vertical, and the two-pass hv form that keeps
   16-bit intermediates in tmp and stores through OP2 (>>10). */\
2333 static void OPNAME ## h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2335 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2339 OP(dst[0], (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]));\
2340 OP(dst[1], (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]));\
2341 OP(dst[2], (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]));\
2342 OP(dst[3], (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]));\
2343 OP(dst[4], (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]));\
2344 OP(dst[5], (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]));\
2345 OP(dst[6], (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]));\
2346 OP(dst[7], (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]));\
2352 static void OPNAME ## h264_qpel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2354 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2358 const int srcB= src[-2*srcStride];\
2359 const int srcA= src[-1*srcStride];\
2360 const int src0= src[0 *srcStride];\
2361 const int src1= src[1 *srcStride];\
2362 const int src2= src[2 *srcStride];\
2363 const int src3= src[3 *srcStride];\
2364 const int src4= src[4 *srcStride];\
2365 const int src5= src[5 *srcStride];\
2366 const int src6= src[6 *srcStride];\
2367 const int src7= src[7 *srcStride];\
2368 const int src8= src[8 *srcStride];\
2369 const int src9= src[9 *srcStride];\
2370 const int src10=src[10*srcStride];\
2371 OP(dst[0*dstStride], (src0+src1)*20 - (srcA+src2)*5 + (srcB+src3));\
2372 OP(dst[1*dstStride], (src1+src2)*20 - (src0+src3)*5 + (srcA+src4));\
2373 OP(dst[2*dstStride], (src2+src3)*20 - (src1+src4)*5 + (src0+src5));\
2374 OP(dst[3*dstStride], (src3+src4)*20 - (src2+src5)*5 + (src1+src6));\
2375 OP(dst[4*dstStride], (src4+src5)*20 - (src3+src6)*5 + (src2+src7));\
2376 OP(dst[5*dstStride], (src5+src6)*20 - (src4+src7)*5 + (src3+src8));\
2377 OP(dst[6*dstStride], (src6+src7)*20 - (src5+src8)*5 + (src4+src9));\
2378 OP(dst[7*dstStride], (src7+src8)*20 - (src6+src9)*5 + (src5+src10));\
2384 static void OPNAME ## h264_qpel8_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2387 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;\
2389 src -= 2*srcStride;\
2390 for(i=0; i<h+5; i++)\
2392 tmp[0]= (src[0]+src[1])*20 - (src[-1]+src[2])*5 + (src[-2]+src[3 ]);\
2393 tmp[1]= (src[1]+src[2])*20 - (src[0 ]+src[3])*5 + (src[-1]+src[4 ]);\
2394 tmp[2]= (src[2]+src[3])*20 - (src[1 ]+src[4])*5 + (src[0 ]+src[5 ]);\
2395 tmp[3]= (src[3]+src[4])*20 - (src[2 ]+src[5])*5 + (src[1 ]+src[6 ]);\
2396 tmp[4]= (src[4]+src[5])*20 - (src[3 ]+src[6])*5 + (src[2 ]+src[7 ]);\
2397 tmp[5]= (src[5]+src[6])*20 - (src[4 ]+src[7])*5 + (src[3 ]+src[8 ]);\
2398 tmp[6]= (src[6]+src[7])*20 - (src[5 ]+src[8])*5 + (src[4 ]+src[9 ]);\
2399 tmp[7]= (src[7]+src[8])*20 - (src[6 ]+src[9])*5 + (src[5 ]+src[10]);\
2403 tmp -= tmpStride*(h+5-2);\
2406 const int tmpB= tmp[-2*tmpStride];\
2407 const int tmpA= tmp[-1*tmpStride];\
2408 const int tmp0= tmp[0 *tmpStride];\
2409 const int tmp1= tmp[1 *tmpStride];\
2410 const int tmp2= tmp[2 *tmpStride];\
2411 const int tmp3= tmp[3 *tmpStride];\
2412 const int tmp4= tmp[4 *tmpStride];\
2413 const int tmp5= tmp[5 *tmpStride];\
2414 const int tmp6= tmp[6 *tmpStride];\
2415 const int tmp7= tmp[7 *tmpStride];\
2416 const int tmp8= tmp[8 *tmpStride];\
2417 const int tmp9= tmp[9 *tmpStride];\
2418 const int tmp10=tmp[10*tmpStride];\
2419 OP2(dst[0*dstStride], (tmp0+tmp1)*20 - (tmpA+tmp2)*5 + (tmpB+tmp3));\
2420 OP2(dst[1*dstStride], (tmp1+tmp2)*20 - (tmp0+tmp3)*5 + (tmpA+tmp4));\
2421 OP2(dst[2*dstStride], (tmp2+tmp3)*20 - (tmp1+tmp4)*5 + (tmp0+tmp5));\
2422 OP2(dst[3*dstStride], (tmp3+tmp4)*20 - (tmp2+tmp5)*5 + (tmp1+tmp6));\
2423 OP2(dst[4*dstStride], (tmp4+tmp5)*20 - (tmp3+tmp6)*5 + (tmp2+tmp7));\
2424 OP2(dst[5*dstStride], (tmp5+tmp6)*20 - (tmp4+tmp7)*5 + (tmp3+tmp8));\
2425 OP2(dst[6*dstStride], (tmp6+tmp7)*20 - (tmp5+tmp8)*5 + (tmp4+tmp9));\
2426 OP2(dst[7*dstStride], (tmp7+tmp8)*20 - (tmp6+tmp9)*5 + (tmp5+tmp10));\
/* 16-wide variants: each processes the 16x16 block as four 8x8
   quadrants by calling the 8-wide primitive twice per half, then
   advancing src/dst by 8 rows. */\
2432 static void OPNAME ## h264_qpel16_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2433 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2434 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2435 src += 8*srcStride;\
2436 dst += 8*dstStride;\
2437 OPNAME ## h264_qpel8_v_lowpass(dst , src , dstStride, srcStride);\
2438 OPNAME ## h264_qpel8_v_lowpass(dst+8, src+8, dstStride, srcStride);\
2441 static void OPNAME ## h264_qpel16_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
2442 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2443 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2444 src += 8*srcStride;\
2445 dst += 8*dstStride;\
2446 OPNAME ## h264_qpel8_h_lowpass(dst , src , dstStride, srcStride);\
2447 OPNAME ## h264_qpel8_h_lowpass(dst+8, src+8, dstStride, srcStride);\
2450 static void OPNAME ## h264_qpel16_hv_lowpass(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
2451 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2452 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
2453 src += 8*srcStride;\
2454 dst += 8*dstStride;\
2455 OPNAME ## h264_qpel8_hv_lowpass(dst , tmp , src , dstStride, tmpStride, srcStride);\
2456 OPNAME ## h264_qpel8_hv_lowpass(dst+8, tmp+8, src+8, dstStride, tmpStride, srcStride);\
/* H264_MC: expands into the 16 quarter-pel motion-compensation entry
 * points _mcXY_c (X = horizontal, Y = vertical fractional position,
 * each in 0..3) for one block SIZE.  The half-pel planes are produced
 * by the _h/_v/_hv lowpass primitives above; quarter-pel positions are
 * obtained by averaging two planes with pixels ## SIZE ## _l2.
 * copy_block ## SIZE pads SIZE+5 rows into 'full' so the 6-tap
 * vertical filter can read 2 rows above and 3 below the block.
 * NOTE(review): closing-brace lines of each function are not visible
 * in this chunk. */
2459 #define H264_MC(OPNAME, SIZE) \
2460 static void OPNAME ## h264_qpel ## SIZE ## _mc00_c (uint8_t *dst, uint8_t *src, int stride){\
2461 OPNAME ## pixels ## SIZE ## _c(dst, src, stride, SIZE);\
2464 static void OPNAME ## h264_qpel ## SIZE ## _mc10_c(uint8_t *dst, uint8_t *src, int stride){\
2465 uint8_t half[SIZE*SIZE];\
2466 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2467 OPNAME ## pixels ## SIZE ## _l2(dst, src, half, stride, stride, SIZE, SIZE);\
2470 static void OPNAME ## h264_qpel ## SIZE ## _mc20_c(uint8_t *dst, uint8_t *src, int stride){\
2471 OPNAME ## h264_qpel ## SIZE ## _h_lowpass(dst, src, stride, stride);\
2474 static void OPNAME ## h264_qpel ## SIZE ## _mc30_c(uint8_t *dst, uint8_t *src, int stride){\
2475 uint8_t half[SIZE*SIZE];\
2476 put_h264_qpel ## SIZE ## _h_lowpass(half, src, SIZE, stride);\
2477 OPNAME ## pixels ## SIZE ## _l2(dst, src+1, half, stride, stride, SIZE, SIZE);\
2480 static void OPNAME ## h264_qpel ## SIZE ## _mc01_c(uint8_t *dst, uint8_t *src, int stride){\
2481 uint8_t full[SIZE*(SIZE+5)];\
2482 uint8_t * const full_mid= full + SIZE*2;\
2483 uint8_t half[SIZE*SIZE];\
2484 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2485 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2486 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid, half, stride, SIZE, SIZE, SIZE);\
2489 static void OPNAME ## h264_qpel ## SIZE ## _mc02_c(uint8_t *dst, uint8_t *src, int stride){\
2490 uint8_t full[SIZE*(SIZE+5)];\
2491 uint8_t * const full_mid= full + SIZE*2;\
2492 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2493 OPNAME ## h264_qpel ## SIZE ## _v_lowpass(dst, full_mid, stride, SIZE);\
2496 static void OPNAME ## h264_qpel ## SIZE ## _mc03_c(uint8_t *dst, uint8_t *src, int stride){\
2497 uint8_t full[SIZE*(SIZE+5)];\
2498 uint8_t * const full_mid= full + SIZE*2;\
2499 uint8_t half[SIZE*SIZE];\
2500 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2501 put_h264_qpel ## SIZE ## _v_lowpass(half, full_mid, SIZE, SIZE);\
2502 OPNAME ## pixels ## SIZE ## _l2(dst, full_mid+SIZE, half, stride, SIZE, SIZE, SIZE);\
2505 static void OPNAME ## h264_qpel ## SIZE ## _mc11_c(uint8_t *dst, uint8_t *src, int stride){\
2506 uint8_t full[SIZE*(SIZE+5)];\
2507 uint8_t * const full_mid= full + SIZE*2;\
2508 uint8_t halfH[SIZE*SIZE];\
2509 uint8_t halfV[SIZE*SIZE];\
2510 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2511 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2512 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2513 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2516 static void OPNAME ## h264_qpel ## SIZE ## _mc31_c(uint8_t *dst, uint8_t *src, int stride){\
2517 uint8_t full[SIZE*(SIZE+5)];\
2518 uint8_t * const full_mid= full + SIZE*2;\
2519 uint8_t halfH[SIZE*SIZE];\
2520 uint8_t halfV[SIZE*SIZE];\
2521 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2522 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2523 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2524 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2527 static void OPNAME ## h264_qpel ## SIZE ## _mc13_c(uint8_t *dst, uint8_t *src, int stride){\
2528 uint8_t full[SIZE*(SIZE+5)];\
2529 uint8_t * const full_mid= full + SIZE*2;\
2530 uint8_t halfH[SIZE*SIZE];\
2531 uint8_t halfV[SIZE*SIZE];\
2532 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2533 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2534 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2535 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2538 static void OPNAME ## h264_qpel ## SIZE ## _mc33_c(uint8_t *dst, uint8_t *src, int stride){\
2539 uint8_t full[SIZE*(SIZE+5)];\
2540 uint8_t * const full_mid= full + SIZE*2;\
2541 uint8_t halfH[SIZE*SIZE];\
2542 uint8_t halfV[SIZE*SIZE];\
2543 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2544 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2545 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2546 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfV, stride, SIZE, SIZE, SIZE);\
2549 static void OPNAME ## h264_qpel ## SIZE ## _mc22_c(uint8_t *dst, uint8_t *src, int stride){\
2550 int16_t tmp[SIZE*(SIZE+5)];\
2551 OPNAME ## h264_qpel ## SIZE ## _hv_lowpass(dst, tmp, src, stride, SIZE, stride);\
2554 static void OPNAME ## h264_qpel ## SIZE ## _mc21_c(uint8_t *dst, uint8_t *src, int stride){\
2555 int16_t tmp[SIZE*(SIZE+5)];\
2556 uint8_t halfH[SIZE*SIZE];\
2557 uint8_t halfHV[SIZE*SIZE];\
2558 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src, SIZE, stride);\
2559 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2560 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2563 static void OPNAME ## h264_qpel ## SIZE ## _mc23_c(uint8_t *dst, uint8_t *src, int stride){\
2564 int16_t tmp[SIZE*(SIZE+5)];\
2565 uint8_t halfH[SIZE*SIZE];\
2566 uint8_t halfHV[SIZE*SIZE];\
2567 put_h264_qpel ## SIZE ## _h_lowpass(halfH, src + stride, SIZE, stride);\
2568 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2569 OPNAME ## pixels ## SIZE ## _l2(dst, halfH, halfHV, stride, SIZE, SIZE, SIZE);\
2572 static void OPNAME ## h264_qpel ## SIZE ## _mc12_c(uint8_t *dst, uint8_t *src, int stride){\
2573 uint8_t full[SIZE*(SIZE+5)];\
2574 uint8_t * const full_mid= full + SIZE*2;\
2575 int16_t tmp[SIZE*(SIZE+5)];\
2576 uint8_t halfV[SIZE*SIZE];\
2577 uint8_t halfHV[SIZE*SIZE];\
2578 copy_block ## SIZE (full, src - stride*2, SIZE, stride, SIZE + 5);\
2579 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2580 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2581 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
2584 static void OPNAME ## h264_qpel ## SIZE ## _mc32_c(uint8_t *dst, uint8_t *src, int stride){\
2585 uint8_t full[SIZE*(SIZE+5)];\
2586 uint8_t * const full_mid= full + SIZE*2;\
2587 int16_t tmp[SIZE*(SIZE+5)];\
2588 uint8_t halfV[SIZE*SIZE];\
2589 uint8_t halfHV[SIZE*SIZE];\
2590 copy_block ## SIZE (full, src - stride*2 + 1, SIZE, stride, SIZE + 5);\
2591 put_h264_qpel ## SIZE ## _v_lowpass(halfV, full_mid, SIZE, SIZE);\
2592 put_h264_qpel ## SIZE ## _hv_lowpass(halfHV, tmp, src, SIZE, SIZE, stride);\
2593 OPNAME ## pixels ## SIZE ## _l2(dst, halfV, halfHV, stride, SIZE, SIZE, SIZE);\
/* Store operators for the H.264 lowpass filters: OP (op_put/op_avg)
 * scales a single-pass 6-tap sum by >>5; OP2 (op2_put/op2_avg) scales
 * the two-pass hv result by >>10.  Both clamp via the crop table cm. */
2596 #define op_avg(a, b) a = (((a)+cm[((b) + 16)>>5]+1)>>1)
2597 //#define op_avg2(a, b) a = (((a)*w1+cm[((b) + 16)>>5]*w2 + o + 64)>>7)
2598 #define op_put(a, b) a = cm[((b) + 16)>>5]
2599 #define op2_avg(a, b) a = (((a)+cm[((b) + 512)>>10]+1)>>1)
2600 #define op2_put(a, b) a = cm[((b) + 512)>>10]
/* Instantiate put_ and avg_ families of the H.264 interpolators. */
2602 H264_LOWPASS(put_ , op_put, op2_put)
2603 H264_LOWPASS(avg_ , op_avg, op2_avg)
/* H.264 weighted prediction.  op_scale1 applies a unidirectional
 * weight+offset to one block in place; op_scale2 blends src into dst
 * with two weights (bidirectional).  H264_WEIGHT(W,H) expands both a
 * weight_ and a biweight_ function for a WxH block; the early
 * "if(W==N) continue;" lines skip the wider-column statements for
 * narrow blocks.  NOTE(review): per-pixel op_scale lines and closing
 * braces are not visible in this chunk. */
2618 #define op_scale1(x) block[x] = av_clip_uint8( (block[x]*weight + offset) >> log2_denom )
2619 #define op_scale2(x) dst[x] = av_clip_uint8( (src[x]*weights + dst[x]*weightd + offset) >> (log2_denom+1))
2620 #define H264_WEIGHT(W,H) \
2621 static void weight_h264_pixels ## W ## x ## H ## _c(uint8_t *block, int stride, int log2_denom, int weight, int offset){ \
2623 offset <<= log2_denom; \
2624 if(log2_denom) offset += 1<<(log2_denom-1); \
2625 for(y=0; y<H; y++, block += stride){ \
2628 if(W==2) continue; \
2631 if(W==4) continue; \
2636 if(W==8) continue; \
2647 static void biweight_h264_pixels ## W ## x ## H ## _c(uint8_t *dst, uint8_t *src, int stride, int log2_denom, int weightd, int weights, int offset){ \
2649 offset = ((offset + 1) | 1) << log2_denom; \
2650 for(y=0; y<H; y++, dst += stride, src += stride){ \
2653 if(W==2) continue; \
2656 if(W==4) continue; \
2661 if(W==8) continue; \
/* WMV2 mspel horizontal half-pel filter: applies the 4-tap
 * (-1, 9, 9, -1)/16 kernel (rounding bias +8) across each of the 8
 * columns of an h-row block, clamping the result through the crop
 * table cm. */
2688 static void wmv2_mspel8_h_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){
2689 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2693 dst[0]= cm[(9*(src[0] + src[1]) - (src[-1] + src[2]) + 8)>>4];
2694 dst[1]= cm[(9*(src[1] + src[2]) - (src[ 0] + src[3]) + 8)>>4];
2695 dst[2]= cm[(9*(src[2] + src[3]) - (src[ 1] + src[4]) + 8)>>4];
2696 dst[3]= cm[(9*(src[3] + src[4]) - (src[ 2] + src[5]) + 8)>>4];
2697 dst[4]= cm[(9*(src[4] + src[5]) - (src[ 3] + src[6]) + 8)>>4];
2698 dst[5]= cm[(9*(src[5] + src[6]) - (src[ 4] + src[7]) + 8)>>4];
2699 dst[6]= cm[(9*(src[6] + src[7]) - (src[ 5] + src[8]) + 8)>>4];
2700 dst[7]= cm[(9*(src[7] + src[8]) - (src[ 6] + src[9]) + 8)>>4];
/* CAVS decoder support: full-pel (mc00) motion compensation wrappers
 * that simply copy or average an 8x8 / 16x16 block.  Exposed with
 * external linkage so cavsdsp can reuse them. */
2706 #ifdef CONFIG_CAVS_DECODER
2708 void ff_cavsdsp_init(DSPContext* c, AVCodecContext *avctx);
2710 void ff_put_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2711 put_pixels8_c(dst, src, stride, 8);
2713 void ff_avg_cavs_qpel8_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2714 avg_pixels8_c(dst, src, stride, 8);
2716 void ff_put_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2717 put_pixels16_c(dst, src, stride, 16);
2719 void ff_avg_cavs_qpel16_mc00_c(uint8_t *dst, uint8_t *src, int stride) {
2720 avg_pixels16_c(dst, src, stride, 16);
2722 #endif /* CONFIG_CAVS_DECODER */
/* VC-1/WMV3 support: full-pel mspel copy wrapper (the rnd parameter is
 * unused for the integer-pel case), plus forward declarations of the
 * external DSP init functions defined in other translation units. */
2724 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
2726 void ff_vc1dsp_init(DSPContext* c, AVCodecContext *avctx);
2728 void ff_put_vc1_mspel_mc00_c(uint8_t *dst, uint8_t *src, int stride, int rnd) {
2729 put_pixels8_c(dst, src, stride, 8);
2731 #endif /* CONFIG_VC1_DECODER||CONFIG_WMV3_DECODER */
2733 void ff_intrax8dsp_init(DSPContext* c, AVCodecContext *avctx);
2736 void ff_h264dspenc_init(DSPContext* c, AVCodecContext *avctx);
/* WMV2 mspel vertical half-pel filter: same 4-tap (-1, 9, 9, -1)/16
 * kernel as the horizontal variant, applied down a column of 8 output
 * rows per iteration over w columns; results clamped through cm. */
2738 static void wmv2_mspel8_v_lowpass(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int w){
2739 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
2743 const int src_1= src[ -srcStride];
2744 const int src0 = src[0 ];
2745 const int src1 = src[ srcStride];
2746 const int src2 = src[2*srcStride];
2747 const int src3 = src[3*srcStride];
2748 const int src4 = src[4*srcStride];
2749 const int src5 = src[5*srcStride];
2750 const int src6 = src[6*srcStride];
2751 const int src7 = src[7*srcStride];
2752 const int src8 = src[8*srcStride];
2753 const int src9 = src[9*srcStride];
2754 dst[0*dstStride]= cm[(9*(src0 + src1) - (src_1 + src2) + 8)>>4];
2755 dst[1*dstStride]= cm[(9*(src1 + src2) - (src0 + src3) + 8)>>4];
2756 dst[2*dstStride]= cm[(9*(src2 + src3) - (src1 + src4) + 8)>>4];
2757 dst[3*dstStride]= cm[(9*(src3 + src4) - (src2 + src5) + 8)>>4];
2758 dst[4*dstStride]= cm[(9*(src4 + src5) - (src3 + src6) + 8)>>4];
2759 dst[5*dstStride]= cm[(9*(src5 + src6) - (src4 + src7) + 8)>>4];
2760 dst[6*dstStride]= cm[(9*(src6 + src7) - (src5 + src8) + 8)>>4];
2761 dst[7*dstStride]= cm[(9*(src7 + src8) - (src6 + src9) + 8)>>4];
/* WMV2 mspel motion-compensation entry points, put_mspel8_mcXY_c:
 * X/Y encode the horizontal/vertical fractional position; they combine
 * wmv2_mspel8_h_lowpass / _v_lowpass and 2-plane averaging
 * (put_pixels8_l2), mirroring the qpel naming scheme above. */
2767 static void put_mspel8_mc00_c (uint8_t *dst, uint8_t *src, int stride){
2768 put_pixels8_c(dst, src, stride, 8);
2771 static void put_mspel8_mc10_c(uint8_t *dst, uint8_t *src, int stride){
2773 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2774 put_pixels8_l2(dst, src, half, stride, stride, 8, 8);
2777 static void put_mspel8_mc20_c(uint8_t *dst, uint8_t *src, int stride){
2778 wmv2_mspel8_h_lowpass(dst, src, stride, stride, 8);
2781 static void put_mspel8_mc30_c(uint8_t *dst, uint8_t *src, int stride){
2783 wmv2_mspel8_h_lowpass(half, src, 8, stride, 8);
2784 put_pixels8_l2(dst, src+1, half, stride, stride, 8, 8);
2787 static void put_mspel8_mc02_c(uint8_t *dst, uint8_t *src, int stride){
2788 wmv2_mspel8_v_lowpass(dst, src, stride, stride, 8);
2791 static void put_mspel8_mc12_c(uint8_t *dst, uint8_t *src, int stride){
/* horizontal pass is run on 11 rows (8 + filter margin) starting one
 * row above the block so the vertical pass has context */
2795 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2796 wmv2_mspel8_v_lowpass(halfV, src, 8, stride, 8);
2797 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2798 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2800 static void put_mspel8_mc32_c(uint8_t *dst, uint8_t *src, int stride){
2804 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2805 wmv2_mspel8_v_lowpass(halfV, src+1, 8, stride, 8);
2806 wmv2_mspel8_v_lowpass(halfHV, halfH+8, 8, 8, 8);
2807 put_pixels8_l2(dst, halfV, halfHV, stride, 8, 8, 8);
2809 static void put_mspel8_mc22_c(uint8_t *dst, uint8_t *src, int stride){
2811 wmv2_mspel8_h_lowpass(halfH, src-stride, 8, stride, 11);
2812 wmv2_mspel8_v_lowpass(dst, halfH+8, stride, 8, 8);
/* H.263 deblocking filter across a horizontal block edge (filters
 * vertically: p0,p1 above the edge, p2,p3 below).  The correction d1
 * is a piecewise-linear ramp of d bounded by the qscale-dependent
 * 'strength'; p1/p2 get the main correction, p0/p3 a smaller clipped
 * one (d2).  Compiled to a no-op unless some H.263 codec is enabled
 * (ENABLE_ANY_H263). */
2815 static void h263_v_loop_filter_c(uint8_t *src, int stride, int qscale){
2816 if(ENABLE_ANY_H263) {
2818 const int strength= ff_h263_loop_filter_strength[qscale];
2822 int p0= src[x-2*stride];
2823 int p1= src[x-1*stride];
2824 int p2= src[x+0*stride];
2825 int p3= src[x+1*stride];
/* edge gradient; large |d| (real edge) gets little or no filtering */
2826 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2828 if (d<-2*strength) d1= 0;
2829 else if(d<- strength) d1=-2*strength - d;
2830 else if(d< strength) d1= d;
2831 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp to 0..255: if the value overflowed 8 bits, replace
 * it with 0 (negative) or 255 (positive) via the sign bit */
2836 if(p1&256) p1= ~(p1>>31);
2837 if(p2&256) p2= ~(p2>>31);
2839 src[x-1*stride] = p1;
2840 src[x+0*stride] = p2;
2844 d2= av_clip((p0-p3)/4, -ad1, ad1);
2846 src[x-2*stride] = p0 - d2;
2847 src[x+ stride] = p3 + d2;
/* H.263 deblocking filter across a vertical block edge (filters
 * horizontally: p0,p1 left of the edge, p2,p3 right).  Identical
 * arithmetic to h263_v_loop_filter_c with the roles of x/y swapped.
 * Compiled to a no-op unless some H.263 codec is enabled. */
2852 static void h263_h_loop_filter_c(uint8_t *src, int stride, int qscale){
2853 if(ENABLE_ANY_H263) {
2855 const int strength= ff_h263_loop_filter_strength[qscale];
2859 int p0= src[y*stride-2];
2860 int p1= src[y*stride-1];
2861 int p2= src[y*stride+0];
2862 int p3= src[y*stride+1];
2863 int d = (p0 - p3 + 4*(p2 - p1)) / 8;
2865 if (d<-2*strength) d1= 0;
2866 else if(d<- strength) d1=-2*strength - d;
2867 else if(d< strength) d1= d;
2868 else if(d< 2*strength) d1= 2*strength - d;
/* branchless clamp to 0..255 via the sign bit (see vertical variant) */
2873 if(p1&256) p1= ~(p1>>31);
2874 if(p2&256) p2= ~(p2>>31);
2876 src[y*stride-1] = p1;
2877 src[y*stride+0] = p2;
2881 d2= av_clip((p0-p3)/4, -ad1, ad1);
2883 src[y*stride-2] = p0 - d2;
2884 src[y*stride+1] = p3 + d2;
/* H.261 in-loop filter on an 8x8 block: separable (1,2,1) smoothing,
 * done vertically into the fixed-point temp[] buffer (edge rows are
 * copied scaled by 4 instead of filtered), then horizontally with a
 * final +8 >>4 normalisation back into src. */
2889 static void h261_loop_filter_c(uint8_t *src, int stride){
2894 temp[x ] = 4*src[x ];
2895 temp[x + 7*8] = 4*src[x + 7*stride];
2899 xy = y * stride + x;
2901 temp[yz] = src[xy - stride] + 2*src[xy] + src[xy + stride];
/* first/last columns: only renormalise, no horizontal filtering */
2906 src[ y*stride] = (temp[ y*8] + 2)>>2;
2907 src[7+y*stride] = (temp[7+y*8] + 2)>>2;
2909 xy = y * stride + x;
2911 src[xy] = (temp[yz-1] + 2*temp[yz] + temp[yz+1] + 8)>>4;
/* H.264 luma deblocking (normal, tc0-driven mode) over a 16-sample
 * edge processed as 4 groups of 4 lines; xstride steps across the
 * edge, ystride along it.  For each line, filtering is applied only
 * when |p0-q0| < alpha and the inner gradients are < beta; p1/q1 get
 * an extra correction when their second neighbour is smooth
 * (|p2-p0| resp. |q2-q0| < beta), and p0/q0 are adjusted by the
 * tc-clipped delta. */
2916 static inline void h264_loop_filter_luma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2919 for( i = 0; i < 4; i++ ) {
2924 for( d = 0; d < 4; d++ ) {
2925 const int p0 = pix[-1*xstride];
2926 const int p1 = pix[-2*xstride];
2927 const int p2 = pix[-3*xstride];
2928 const int q0 = pix[0];
2929 const int q1 = pix[1*xstride];
2930 const int q2 = pix[2*xstride];
2932 if( FFABS( p0 - q0 ) < alpha &&
2933 FFABS( p1 - p0 ) < beta &&
2934 FFABS( q1 - q0 ) < beta ) {
2939 if( FFABS( p2 - p0 ) < beta ) {
2940 pix[-2*xstride] = p1 + av_clip( (( p2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - p1, -tc0[i], tc0[i] );
2943 if( FFABS( q2 - q0 ) < beta ) {
2944 pix[ xstride] = q1 + av_clip( (( q2 + ( ( p0 + q0 + 1 ) >> 1 ) ) >> 1) - q1, -tc0[i], tc0[i] );
2948 i_delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2949 pix[-xstride] = av_clip_uint8( p0 + i_delta ); /* p0' */
2950 pix[0] = av_clip_uint8( q0 - i_delta ); /* q0' */
/* Direction wrappers: vertical filtering (horizontal edge) uses
 * xstride=stride, ystride=1; horizontal filtering swaps them. */
2956 static void h264_v_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2958 h264_loop_filter_luma_c(pix, stride, 1, alpha, beta, tc0);
2960 static void h264_h_loop_filter_luma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2962 h264_loop_filter_luma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 chroma deblocking (normal mode): 4 groups of 2 lines; only
 * p0/q0 are modified, by the tc-clipped delta, under the same
 * alpha/beta edge conditions as the luma filter. */
2965 static inline void h264_loop_filter_chroma_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta, int8_t *tc0)
2968 for( i = 0; i < 4; i++ ) {
2969 const int tc = tc0[i];
2974 for( d = 0; d < 2; d++ ) {
2975 const int p0 = pix[-1*xstride];
2976 const int p1 = pix[-2*xstride];
2977 const int q0 = pix[0];
2978 const int q1 = pix[1*xstride];
2980 if( FFABS( p0 - q0 ) < alpha &&
2981 FFABS( p1 - p0 ) < beta &&
2982 FFABS( q1 - q0 ) < beta ) {
2984 int delta = av_clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
2986 pix[-xstride] = av_clip_uint8( p0 + delta ); /* p0' */
2987 pix[0] = av_clip_uint8( q0 - delta ); /* q0' */
/* Direction wrappers for the chroma filter (see luma wrappers). */
2993 static void h264_v_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2995 h264_loop_filter_chroma_c(pix, stride, 1, alpha, beta, tc0);
2997 static void h264_h_loop_filter_chroma_c(uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0)
2999 h264_loop_filter_chroma_c(pix, 1, stride, alpha, beta, tc0);
/* H.264 chroma deblocking, strong (intra / no-tc0) mode: 8 lines,
 * p0/q0 replaced by fixed (2,1,1)/4 averages when the alpha/beta edge
 * conditions hold. */
3002 static inline void h264_loop_filter_chroma_intra_c(uint8_t *pix, int xstride, int ystride, int alpha, int beta)
3005 for( d = 0; d < 8; d++ ) {
3006 const int p0 = pix[-1*xstride];
3007 const int p1 = pix[-2*xstride];
3008 const int q0 = pix[0];
3009 const int q1 = pix[1*xstride];
3011 if( FFABS( p0 - q0 ) < alpha &&
3012 FFABS( p1 - p0 ) < beta &&
3013 FFABS( q1 - q0 ) < beta ) {
3015 pix[-xstride] = ( 2*p1 + p0 + q1 + 2 ) >> 2; /* p0' */
3016 pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2; /* q0' */
/* Direction wrappers for the intra-mode chroma filter. */
3021 static void h264_v_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3023 h264_loop_filter_chroma_intra_c(pix, stride, 1, alpha, beta);
3025 static void h264_h_loop_filter_chroma_intra_c(uint8_t *pix, int stride, int alpha, int beta)
3027 h264_loop_filter_chroma_intra_c(pix, 1, stride, alpha, beta);
/* Sum of absolute differences (SAD) between two 16-wide blocks over h
 * rows, fully unrolled across the 16 columns.  The unused void* first
 * parameter matches the me_cmp function-pointer signature. */
3030 static inline int pix_abs16_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
3036 s += abs(pix1[0] - pix2[0]);
3037 s += abs(pix1[1] - pix2[1]);
3038 s += abs(pix1[2] - pix2[2]);
3039 s += abs(pix1[3] - pix2[3]);
3040 s += abs(pix1[4] - pix2[4]);
3041 s += abs(pix1[5] - pix2[5]);
3042 s += abs(pix1[6] - pix2[6]);
3043 s += abs(pix1[7] - pix2[7]);
3044 s += abs(pix1[8] - pix2[8]);
3045 s += abs(pix1[9] - pix2[9]);
3046 s += abs(pix1[10] - pix2[10]);
3047 s += abs(pix1[11] - pix2[11]);
3048 s += abs(pix1[12] - pix2[12]);
3049 s += abs(pix1[13] - pix2[13]);
3050 s += abs(pix1[14] - pix2[14]);
3051 s += abs(pix1[15] - pix2[15]);
/**
 * SAD between pix1 and the horizontal half-pel interpolation of pix2
 * (each reference sample averaged with its right neighbour via avg2(),
 * defined earlier in this file).
 *
 * NOTE(review): the row loop, accumulator init and pointer advances
 * were missing from the extract; restored here.
 */
static int pix_abs16_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
        s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
        s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
        s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
        s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
        s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
        s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
        s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD between pix1 and the vertical half-pel interpolation of pix2
 * (each reference sample averaged with the sample one row below,
 * via avg2(), defined earlier in this file).
 *
 * NOTE(review): loop scaffolding was missing from the extract;
 * restored here.
 */
static int pix_abs16_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
        s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
        s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
        s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
        s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
        s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
        s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
        s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD between pix1 and the diagonal half-pel interpolation of pix2
 * (4-sample rounded average via avg4(), defined earlier in this file).
 *
 * NOTE(review): loop scaffolding was missing from the extract;
 * restored here.
 */
static int pix_abs16_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
        s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
        s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
        s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
        s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
        s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
        s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
        s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD between two 8-pixel-wide blocks (8-wide variant of pix_abs16_c).
 *
 * NOTE(review): row loop, accumulator init and pointer advances were
 * missing from the extract; restored here.
 */
static inline int pix_abs8_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - pix2[0]);
        s += abs(pix1[1] - pix2[1]);
        s += abs(pix1[2] - pix2[2]);
        s += abs(pix1[3] - pix2[3]);
        s += abs(pix1[4] - pix2[4]);
        s += abs(pix1[5] - pix2[5]);
        s += abs(pix1[6] - pix2[6]);
        s += abs(pix1[7] - pix2[7]);
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD between pix1 and the horizontal half-pel interpolation of pix2,
 * 8-wide variant (uses avg2(), defined earlier in this file).
 *
 * NOTE(review): loop scaffolding restored; it was missing from the
 * extract.
 */
static int pix_abs8_x2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
        s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
        s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
        s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
        s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
        s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
        s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
        s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
        pix1 += line_size;
        pix2 += line_size;
    }
    return s;
}
/**
 * SAD between pix1 and the vertical half-pel interpolation of pix2,
 * 8-wide variant (uses avg2(), defined earlier in this file).
 *
 * NOTE(review): loop scaffolding restored; it was missing from the
 * extract.
 */
static int pix_abs8_y2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
        s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
        s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
        s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
        s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
        s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
        s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
        s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/**
 * SAD between pix1 and the diagonal half-pel interpolation of pix2,
 * 8-wide variant (uses avg4(), defined earlier in this file).
 *
 * NOTE(review): loop scaffolding restored; it was missing from the
 * extract.
 */
static int pix_abs8_xy2_c(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
{
    int s = 0, i;
    uint8_t *pix3 = pix2 + line_size;

    for (i = 0; i < h; i++) {
        s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
        s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
        s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
        s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
        s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
        s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
        s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
        s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
        pix1 += line_size;
        pix2 += line_size;
        pix3 += line_size;
    }
    return s;
}
/*
 * Noise-preserving SSE comparison functions (16- and 8-wide) plus the
 * rate-distortion basis helper. NOTE(review): this extract is decimated
 * (outer row loops, score1/score2 declarations and pointer advances are
 * missing lines).
 */
/* score1 = plain SSE; score2 = difference of horizontal gradients, so
   matching noise/texture is penalized less than losing it. */
3230 static int nsse16_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3231 MpegEncContext *c = v;
3237 for(x=0; x<16; x++){
3238 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3241 for(x=0; x<15; x++){
3242 score2+= FFABS( s1[x ] - s1[x +stride]
3243 - s1[x+1] + s1[x+1+stride])
3244 -FFABS( s2[x ] - s2[x +stride]
3245 - s2[x+1] + s2[x+1+stride]);
/* weight falls back to 8 when called without a context */
3252 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3253 else return score1 + FFABS(score2)*8;
/* 8-wide variant of nsse16_c. */
3256 static int nsse8_c(void *v, uint8_t *s1, uint8_t *s2, int stride, int h){
3257 MpegEncContext *c = v;
3264 score1+= (s1[x ] - s2[x ])*(s1[x ] - s2[x ]);
3268 score2+= FFABS( s1[x ] - s1[x +stride]
3269 - s1[x+1] + s1[x+1+stride])
3270 -FFABS( s2[x ] - s2[x +stride]
3271 - s2[x+1] + s2[x+1+stride]);
3278 if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
3279 else return score1 + FFABS(score2)*8;
/* Weighted squared error of adding `basis*scale` (with rounding at
   BASIS_SHIFT-RECON_SHIFT precision) to the residual — used by the
   trellis/RD quantizer to evaluate a candidate coefficient change. */
3282 static int try_8x8basis_c(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale){
3286 for(i=0; i<8*8; i++){
3287 int b= rem[i] + ((basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT));
3290 assert(-512<b && b<512);
3292 sum += (w*b)*(w*b)>>4;
3297 static void add_8x8basis_c(int16_t rem[64], int16_t basis[64], int scale){
3300 for(i=0; i<8*8; i++){
3301 rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
/**
3306 * permutes an 8x8 block.
3307 * @param block the block which will be permuted according to the given permutation vector
3308 * @param permutation the permutation vector
3309 * @param last the last non zero coefficient in scantable order, used to speed the permutation up
3310 * @param scantable the used scantable, this is only used to speed the permutation up, the block is not
3311 * (inverse) permutated to scantable order!
 */
/* NOTE(review): decimated extract — the early-out for identity
   permutations, the temp[64] buffer declaration and the copy of the
   non-zero coefficients into temp are missing lines. */
3313 void ff_block_permute(DCTELEM *block, uint8_t *permutation, const uint8_t *scantable, int last)
3319 //if(permutation[1]==1) return; //FIXME it is ok but not clean and might fail for some permutations
/* first pass: gather coefficients (into a temp buffer) in scan order */
3321 for(i=0; i<=last; i++){
3322 const int j= scantable[i];
/* second pass: scatter them back through the permutation vector */
3327 for(i=0; i<=last; i++){
3328 const int j= scantable[i];
3329 const int perm_j= permutation[j];
3330 block[perm_j]= temp[j];
/* Dummy compare function used for the "no comparison" cmp type;
   body (returning a constant) is outside this extract. */
3334 static int zero_cmp(void *s, uint8_t *a, uint8_t *b, int stride, int h){
/*
 * Fills cmp[0..4] with the comparison functions selected by `type`
 * (FF_CMP_* value). NOTE(review): heavily decimated — the switch
 * statement, most case labels and the loop over block sizes are
 * missing lines; only one assignment per case survives in this
 * extract.
 */
3338 void ff_set_cmp(DSPContext* c, me_cmp_func *cmp, int type){
3341 memset(cmp, 0, sizeof(void*)*5);
3349 cmp[i]= c->hadamard8_diff[i];
3355 cmp[i]= c->dct_sad[i];
3358 cmp[i]= c->dct264_sad[i];
3361 cmp[i]= c->dct_max[i];
3364 cmp[i]= c->quant_psnr[i];
/* wavelet cmp types are only available with the Snow encoder built in */
3384 #ifdef CONFIG_SNOW_ENCODER
/* unknown/unhandled type: loud error rather than silent misbehavior */
3393 av_log(NULL, AV_LOG_ERROR,"internal error in cmp function selection\n");
3399 * memset(blocks, 0, sizeof(DCTELEM)*6*64)
3401 static void clear_blocks_c(DCTELEM *blocks)
3403 memset(blocks, 0, sizeof(DCTELEM)*6*64);
/*
 * Byte-wise additions done one machine word at a time: pb_7f/pb_80
 * masking keeps each byte's carry from spilling into its neighbour
 * (SWAR addition without saturation). NOTE(review): the scalar tail
 * loops handling the last w % sizeof(long) bytes are partially missing
 * from this extract; the word loads are also unaligned-access reliant.
 */
/* dst[i] += src[i] for i in [0, w) */
3406 static void add_bytes_c(uint8_t *dst, uint8_t *src, int w){
3408 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3409 long a = *(long*)(src+i);
3410 long b = *(long*)(dst+i);
/* per-byte add: sum low 7 bits, then xor the carry-free top bit back */
3411 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3414 dst[i+0] += src[i+0];
/* dst[i] = src1[i] + src2[i] for i in [0, w) — same SWAR trick */
3417 static void add_bytes_l2_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
3419 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3420 long a = *(long*)(src1+i);
3421 long b = *(long*)(src2+i);
3422 *(long*)(dst+i) = ((a&pb_7f) + (b&pb_7f)) ^ ((a^b)&pb_80);
3425 dst[i] = src1[i]+src2[i];
/*
 * dst[i] = src1[i] - src2[i] for i in [0, w), using word-at-a-time SWAR
 * subtraction where unaligned access is allowed. NOTE(review):
 * decimated extract — the closing of the #ifndef branch, the `else`,
 * and the scalar tail loop bodies are missing lines.
 */
3428 static void diff_bytes_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
/* without fast unaligned loads, fall back to an unrolled byte loop
   when src2 is misaligned */
3430 #ifndef HAVE_FAST_UNALIGNED
3431 if((long)src2 & (sizeof(long)-1)){
3432 for(i=0; i+7<w; i+=8){
3433 dst[i+0] = src1[i+0]-src2[i+0];
3434 dst[i+1] = src1[i+1]-src2[i+1];
3435 dst[i+2] = src1[i+2]-src2[i+2];
3436 dst[i+3] = src1[i+3]-src2[i+3];
3437 dst[i+4] = src1[i+4]-src2[i+4];
3438 dst[i+5] = src1[i+5]-src2[i+5];
3439 dst[i+6] = src1[i+6]-src2[i+6];
3440 dst[i+7] = src1[i+7]-src2[i+7];
3444 for(i=0; i<=w-sizeof(long); i+=sizeof(long)){
3445 long a = *(long*)(src1+i);
3446 long b = *(long*)(src2+i);
/* per-byte subtract: borrow is contained by forcing each byte's top
   bit high before subtracting, then repairing it with the xor term */
3447 *(long*)(dst+i) = ((a|pb_80) - (b&pb_7f)) ^ ((a^b^pb_80)&pb_80);
3450 dst[i+0] = src1[i+0]-src2[i+0];
/* HuffYUV median prediction residual: dst[i] = src2[i] - median-pred,
   where pred = mid_pred(left, top, left+top-topleft). NOTE(review):
   only the predictor line survives in this extract; the loop, the
   l/lt updates and the *left/*left_top write-back are missing. */
3453 static void sub_hfyu_median_prediction_c(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w, int *left, int *left_top){
3461 const int pred= mid_pred(l, src1[i], (l + src1[i] - lt)&0xFF);
/*
 * Butterfly helper macros for the 8x8 Hadamard transforms below.
 * NOTE(review): the continuation lines of BUTTERFLY2/BUTTERFLY1
 * (the o1 = i1 + i2; o2 = i1 - i2; style bodies) are missing from
 * this extract — only the macro headers survive.
 */
3471 #define BUTTERFLY2(o1,o2,i1,i2) \
3475 #define BUTTERFLY1(x,y) \
/* |x+y| + |x-y|: butterfly folded directly into an absolute sum */
3484 #define BUTTERFLYA(x,y) (FFABS((x)+(y)) + FFABS((x)-(y)))
/*
 * SATD: 8x8 Hadamard transform of (src - dst), then sum of absolute
 * transform coefficients. NOTE(review): decimated extract — the
 * temp[64] declaration, loop headers (`for(i=0;i<8;i++)`), `sum`
 * accumulation statements and return are missing lines.
 */
3486 static int hadamard8_diff8x8_c(/*MpegEncContext*/ void *s, uint8_t *dst, uint8_t *src, int stride, int h){
3494 //FIXME try pointer walks
/* horizontal pass: 3 butterfly stages per row over the differences */
3495 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0]-dst[stride*i+0],src[stride*i+1]-dst[stride*i+1]);
3496 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2]-dst[stride*i+2],src[stride*i+3]-dst[stride*i+3]);
3497 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4]-dst[stride*i+4],src[stride*i+5]-dst[stride*i+5]);
3498 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6]-dst[stride*i+6],src[stride*i+7]-dst[stride*i+7]);
3500 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3501 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3502 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3503 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3505 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3506 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3507 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3508 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
/* vertical pass: 2 butterfly stages, last stage folded into the
   absolute sum via BUTTERFLYA */
3512 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3513 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3514 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3515 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3517 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3518 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3519 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3520 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3523 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3524 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3525 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3526 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
/* debug leftover — only compiled in the (missing) #if 0 region */
3532 printf("MAX:%d\n", maxi);
/* Intra variant: transforms src directly (no reference block) and
   subtracts the DC-dominated term at the end so the mean does not
   dominate the score. Same decimation caveats as above. */
3538 static int hadamard8_intra8x8_c(/*MpegEncContext*/ void *s, uint8_t *src, uint8_t *dummy, int stride, int h){
3546 //FIXME try pointer walks
3547 BUTTERFLY2(temp[8*i+0], temp[8*i+1], src[stride*i+0],src[stride*i+1]);
3548 BUTTERFLY2(temp[8*i+2], temp[8*i+3], src[stride*i+2],src[stride*i+3]);
3549 BUTTERFLY2(temp[8*i+4], temp[8*i+5], src[stride*i+4],src[stride*i+5]);
3550 BUTTERFLY2(temp[8*i+6], temp[8*i+7], src[stride*i+6],src[stride*i+7]);
3552 BUTTERFLY1(temp[8*i+0], temp[8*i+2]);
3553 BUTTERFLY1(temp[8*i+1], temp[8*i+3]);
3554 BUTTERFLY1(temp[8*i+4], temp[8*i+6]);
3555 BUTTERFLY1(temp[8*i+5], temp[8*i+7]);
3557 BUTTERFLY1(temp[8*i+0], temp[8*i+4]);
3558 BUTTERFLY1(temp[8*i+1], temp[8*i+5]);
3559 BUTTERFLY1(temp[8*i+2], temp[8*i+6]);
3560 BUTTERFLY1(temp[8*i+3], temp[8*i+7]);
3564 BUTTERFLY1(temp[8*0+i], temp[8*1+i]);
3565 BUTTERFLY1(temp[8*2+i], temp[8*3+i]);
3566 BUTTERFLY1(temp[8*4+i], temp[8*5+i]);
3567 BUTTERFLY1(temp[8*6+i], temp[8*7+i]);
3569 BUTTERFLY1(temp[8*0+i], temp[8*2+i]);
3570 BUTTERFLY1(temp[8*1+i], temp[8*3+i]);
3571 BUTTERFLY1(temp[8*4+i], temp[8*6+i]);
3572 BUTTERFLY1(temp[8*5+i], temp[8*7+i]);
3575 BUTTERFLYA(temp[8*0+i], temp[8*4+i])
3576 +BUTTERFLYA(temp[8*1+i], temp[8*5+i])
3577 +BUTTERFLYA(temp[8*2+i], temp[8*6+i])
3578 +BUTTERFLYA(temp[8*3+i], temp[8*7+i]);
3581 sum -= FFABS(temp[8*0] + temp[8*4]); // -mean
/*
 * DCT-domain SAD: forward-DCT the 8x8 difference block, then sum the
 * absolute coefficients. NOTE(review): the s->dsp.fdct(temp) call and
 * assert(h==8) are missing lines in this extract.
 */
3586 static int dct_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3587 MpegEncContext * const s= (MpegEncContext *)c;
/* 16-byte aligned scratch block for the DCT */
3588 DECLARE_ALIGNED_16(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3589 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3593 s->dsp.diff_pixels(temp, src1, src2, stride);
3595 return s->dsp.sum_abs_dctelem(temp);
/*
 * H.264-style 8x8 1-D transform macro (integer, no multiplies beyond
 * shifts/adds), used by dct264_sad8x8_c below. NOTE(review): the
 * #define DCT8_1D header line and DST(0,...)/DST(4,...) rows are
 * missing from this extract.
 */
3600 const int s07 = SRC(0) + SRC(7);\
3601 const int s16 = SRC(1) + SRC(6);\
3602 const int s25 = SRC(2) + SRC(5);\
3603 const int s34 = SRC(3) + SRC(4);\
3604 const int a0 = s07 + s34;\
3605 const int a1 = s16 + s25;\
3606 const int a2 = s07 - s34;\
3607 const int a3 = s16 - s25;\
3608 const int d07 = SRC(0) - SRC(7);\
3609 const int d16 = SRC(1) - SRC(6);\
3610 const int d25 = SRC(2) - SRC(5);\
3611 const int d34 = SRC(3) - SRC(4);\
3612 const int a4 = d16 + d25 + (d07 + (d07>>1));\
3613 const int a5 = d07 - d34 - (d25 + (d25>>1));\
3614 const int a6 = d07 + d34 - (d16 + (d16>>1));\
3615 const int a7 = d16 - d25 + (d34 + (d34>>1));\
3617 DST(1, a4 + (a7>>2)) ;\
3618 DST(2, a2 + (a3>>1)) ;\
3619 DST(3, a5 + (a6>>2)) ;\
3621 DST(5, a6 - (a5>>2)) ;\
3622 DST(6, (a2>>1) - a3 ) ;\
3623 DST(7, (a4>>2) - a7 ) ;\
/*
 * H.264-transform SAD: applies DCT8_1D to rows then columns of the
 * difference block, accumulating |coeff| during the column pass.
 * NOTE(review): the dct[8][8] declaration, `int i; int sum=0;` and the
 * final `return sum;` are missing lines in this extract.
 */
3626 static int dct264_sad8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3627 MpegEncContext * const s= (MpegEncContext *)c;
3632 s->dsp.diff_pixels(dct[0], src1, src2, stride);
/* row pass: transform each row of dct[][] in place */
3634 #define SRC(x) dct[i][x]
3635 #define DST(x,v) dct[i][x]= v
3636 for( i = 0; i < 8; i++ )
/* column pass: DST redefined to fold |v| straight into sum */
3641 #define SRC(x) dct[x][i]
3642 #define DST(x,v) sum += FFABS(v)
3643 for( i = 0; i < 8; i++ )
/*
 * DCT-domain maximum: forward-DCT the difference, return the largest
 * absolute coefficient. NOTE(review): fdct call, the i loop header and
 * `return sum;` are missing lines.
 */
3651 static int dct_max8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3652 MpegEncContext * const s= (MpegEncContext *)c;
3653 DECLARE_ALIGNED_8(uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3654 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3659 s->dsp.diff_pixels(temp, src1, src2, stride);
3663 sum= FFMAX(sum, FFABS(temp[i]));
/*
 * Quantization-PSNR metric: DCT the difference, save a copy, run it
 * through quantize + dequantize + IDCT, and return the squared error
 * introduced by quantization. NOTE(review): fdct call, loop headers
 * and return are missing lines.
 */
3668 static int quant_psnr8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3669 MpegEncContext * const s= (MpegEncContext *)c;
3670 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64*2/8]);
3671 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3672 DCTELEM * const bak = ((DCTELEM*)aligned_temp)+64;
3678 s->dsp.diff_pixels(temp, src1, src2, stride);
3680 memcpy(bak, temp, 64*sizeof(DCTELEM));
3682 s->block_last_index[0/*FIXME*/]= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3683 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3684 ff_simple_idct(temp); //FIXME
3687 sum+= (temp[i]-bak[i])*(temp[i]-bak[i]);
/*
 * Rate-distortion metric for one 8x8 block: quantizes the DCT of the
 * difference, counts the VLC bits needed to code it, reconstructs it,
 * and returns distortion + lambda-weighted rate.
 * NOTE(review): decimated extract — loop headers, run/level update
 * statements, the intra/inter `if` and several braces are missing
 * lines. `distoration` is a long-standing typo for "distortion"
 * (local variable only; harmless).
 */
3692 static int rd8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3693 MpegEncContext * const s= (MpegEncContext *)c;
3694 const uint8_t *scantable= s->intra_scantable.permutated;
3695 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3696 DECLARE_ALIGNED_8 (uint64_t, aligned_bak[stride]);
3697 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3698 uint8_t * const bak= (uint8_t*)aligned_bak;
3699 int i, last, run, bits, level, distoration, start_i;
3700 const int esc_length= s->ac_esc_length;
3702 uint8_t * last_length;
/* save the reference 8x8 region so reconstruction can be compared */
3707 ((uint32_t*)(bak + i*stride))[0]= ((uint32_t*)(src2 + i*stride))[0];
3708 ((uint32_t*)(bak + i*stride))[1]= ((uint32_t*)(src2 + i*stride))[1];
3711 s->dsp.diff_pixels(temp, src1, src2, stride);
3713 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
/* intra blocks use the intra AC tables plus a luma DC code */
3719 length = s->intra_ac_vlc_length;
3720 last_length= s->intra_ac_vlc_last_length;
3721 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3724 length = s->inter_ac_vlc_length;
3725 last_length= s->inter_ac_vlc_last_length;
/* count bits for all coefficients up to (but excluding) the last one */
3730 for(i=start_i; i<last; i++){
3731 int j= scantable[i];
/* small levels use the unified run/level table, others cost esc_length */
3736 if((level&(~127)) == 0){
3737 bits+= length[UNI_AC_ENC_INDEX(run, level)];
/* the last coefficient uses the "last" table variant */
3746 level= temp[i] + 64;
3750 if((level&(~127)) == 0){
3751 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/* dequantize + IDCT-add onto bak to get the reconstruction */
3759 s->dct_unquantize_intra(s, temp, 0, s->qscale);
3761 s->dct_unquantize_inter(s, temp, 0, s->qscale);
3764 s->dsp.idct_add(bak, stride, temp);
3766 distoration= s->dsp.sse[1](NULL, bak, src1, stride, 8);
/* rate term scaled by qscale^2 (lambda approximation) */
3768 return distoration + ((bits*s->qscale*s->qscale*109 + 64)>>7);
/*
 * Rate-only metric: same quantize + VLC bit-count as rd8x8_c, but
 * returns only the bit cost (no reconstruction / distortion term).
 * NOTE(review): decimated extract — loop headers, run/level updates,
 * the intra/inter branch and the final `return bits;` are missing
 * lines.
 */
3771 static int bit8x8_c(/*MpegEncContext*/ void *c, uint8_t *src1, uint8_t *src2, int stride, int h){
3772 MpegEncContext * const s= (MpegEncContext *)c;
3773 const uint8_t *scantable= s->intra_scantable.permutated;
3774 DECLARE_ALIGNED_8 (uint64_t, aligned_temp[sizeof(DCTELEM)*64/8]);
3775 DCTELEM * const temp= (DCTELEM*)aligned_temp;
3776 int i, last, run, bits, level, start_i;
3777 const int esc_length= s->ac_esc_length;
3779 uint8_t * last_length;
3783 s->dsp.diff_pixels(temp, src1, src2, stride);
3785 s->block_last_index[0/*FIXME*/]= last= s->fast_dct_quantize(s, temp, 0/*FIXME*/, s->qscale, &i);
3791 length = s->intra_ac_vlc_length;
3792 last_length= s->intra_ac_vlc_last_length;
3793 bits+= s->luma_dc_vlc_length[temp[0] + 256]; //FIXME chroma
3796 length = s->inter_ac_vlc_length;
3797 last_length= s->inter_ac_vlc_last_length;
3802 for(i=start_i; i<last; i++){
3803 int j= scantable[i];
3808 if((level&(~127)) == 0){
3809 bits+= length[UNI_AC_ENC_INDEX(run, level)];
3818 level= temp[i] + 64;
3822 if((level&(~127)) == 0){
3823 bits+= last_length[UNI_AC_ENC_INDEX(run, level)];
/*
 * Vertical SAD / SSE metrics: measure vertical gradients between
 * adjacent rows. NOTE(review): decimated extract — the `score=0`
 * declarations, outer y loops, pointer advances and returns are
 * missing lines.
 */
/* sum of |s[y][x] - s[y+1][x]| over the 16-wide block (intra: one
   plane only) */
3831 static int vsad_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3836 for(x=0; x<16; x+=4){
3837 score+= FFABS(s[x ] - s[x +stride]) + FFABS(s[x+1] - s[x+1+stride])
3838 +FFABS(s[x+2] - s[x+2+stride]) + FFABS(s[x+3] - s[x+3+stride]);
/* inter variant: vertical gradient of the difference s1 - s2 */
3846 static int vsad16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3851 for(x=0; x<16; x++){
3852 score+= FFABS(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/* squared versions of the two metrics above */
3861 #define SQ(a) ((a)*(a))
3862 static int vsse_intra16_c(/*MpegEncContext*/ void *c, uint8_t *s, uint8_t *dummy, int stride, int h){
3867 for(x=0; x<16; x+=4){
3868 score+= SQ(s[x ] - s[x +stride]) + SQ(s[x+1] - s[x+1+stride])
3869 +SQ(s[x+2] - s[x+2+stride]) + SQ(s[x+3] - s[x+3+stride]);
3877 static int vsse16_c(/*MpegEncContext*/ void *c, uint8_t *s1, uint8_t *s2, int stride, int h){
3882 for(x=0; x<16; x++){
3883 score+= SQ(s1[x ] - s2[x ] - s1[x +stride] + s2[x +stride]);
/**
 * Sum of squared differences between an int8 vector and an int16
 * vector of the same length.
 *
 * NOTE(review): the `size` parameter line, local declarations and
 * braces were missing from the extract; restored here.
 *
 * @param pix1 first vector (int8)
 * @param pix2 second vector (int16)
 * @param size number of elements
 * @return sum over i of (pix1[i] - pix2[i])^2
 */
static int ssd_int8_vs_int16_c(const int8_t *pix1, const int16_t *pix2,
                               int size)
{
    int score = 0, i;

    for(i=0; i<size; i++)
        score += (pix1[i]-pix2[i])*(pix1[i]-pix2[i]);
    return score;
}
/*
 * Instantiate 16x16 versions of the 8x8 comparison functions: the
 * WRAPPER8_16_SQ macro (defined outside this extract) calls the 8x8
 * function on the four quadrants and sums the results.
 * NOTE(review): the #ifdef guard around dct264_sad appears to be a
 * missing line in this extract.
 */
3901 WRAPPER8_16_SQ(hadamard8_diff8x8_c, hadamard8_diff16_c)
3902 WRAPPER8_16_SQ(hadamard8_intra8x8_c, hadamard8_intra16_c)
3903 WRAPPER8_16_SQ(dct_sad8x8_c, dct_sad16_c)
3905 WRAPPER8_16_SQ(dct264_sad8x8_c, dct264_sad16_c)
3907 WRAPPER8_16_SQ(dct_max8x8_c, dct_max16_c)
3908 WRAPPER8_16_SQ(quant_psnr8x8_c, quant_psnr16_c)
3909 WRAPPER8_16_SQ(rd8x8_c, rd16_c)
3910 WRAPPER8_16_SQ(bit8x8_c, bit16_c)
/*
 * Float vector helpers. NOTE(review): decimated extract — the
 * `int i;` declarations, the body of vector_fmul_c's loop
 * (presumably dst[i] *= src[i]) and the src1 pointer setup in
 * vector_fmul_reverse_c (which makes src1[-i] walk backwards from the
 * vector's end) are missing lines — confirm against the original.
 */
3912 static void vector_fmul_c(float *dst, const float *src, int len){
3914 for(i=0; i<len; i++)
/* element-wise product with src1 traversed in reverse */
3918 static void vector_fmul_reverse_c(float *dst, const float *src0, const float *src1, int len){
3921 for(i=0; i<len; i++)
3922 dst[i] = src0[i] * src1[-i];
/**
 * dst[i*step] = src0[i]*src1[i] + src2[i] + src3 for i in [0, len).
 * src3 is an integer bias added to every output; step allows writing
 * into an interleaved destination.
 *
 * NOTE(review): the `int i;` declaration and braces were missing from
 * the extract; restored here.
 */
void ff_vector_fmul_add_add_c(float *dst, const float *src0, const float *src1, const float *src2, int src3, int len, int step){
    int i;

    for(i=0; i<len; i++)
        dst[i*step] = src0[i] * src1[i] + src2[i] + src3;
}
/*
 * Fast float -> int16 conversion with saturation, done by bit-twiddling
 * the IEEE-754 representation instead of calling lrintf per sample.
 * The float samples are assumed pre-biased into a range where their
 * bit pattern, minus 0x8000, yields the sample — a trick relying on
 * the exponent being fixed (presumably the caller added 384.0f or
 * similar; confirm against the original file).
 * NOTE(review): the clamping lines between 3936 and 3941 are missing
 * from this extract.
 */
3931 void ff_float_to_int16_c(int16_t *dst, const float *src, int len){
3933 for(i=0; i<len; i++) {
/* reinterpret the float bits as an integer (strict-aliasing caveat) */
3934 int_fast32_t tmp = ((const int32_t*)src)[i];
/* sign mask used for saturation when tmp exceeds the valid range */
3936 tmp = (0x43c0ffff - tmp)>>31;
3937 // is this faster on some gcc/cpu combinations?
3938 // if(tmp > 0x43c0ffff) tmp = 0xFFFF;
3941 dst[i] = tmp - 0x8000;
/*
 * WMV2 8x8 inverse DCT (row/column decomposition with fixed-point
 * cosine constants scaled by 2048*sqrt(2)).
 * NOTE(review): decimated extract — the #define W0 line, the s1/s2
 * declarations, closing braces and the `for(i=0;...)` loop headers in
 * ff_wmv2_idct_c are missing; W0 is presumably 2048 like W4 — confirm
 * against the original file.
 */
3946 #define W1 2841 /* 2048*sqrt (2)*cos (1*pi/16) */
3947 #define W2 2676 /* 2048*sqrt (2)*cos (2*pi/16) */
3948 #define W3 2408 /* 2048*sqrt (2)*cos (3*pi/16) */
3949 #define W4 2048 /* 2048*sqrt (2)*cos (4*pi/16) */
3950 #define W5 1609 /* 2048*sqrt (2)*cos (5*pi/16) */
3951 #define W6 1108 /* 2048*sqrt (2)*cos (6*pi/16) */
3952 #define W7 565 /* 2048*sqrt (2)*cos (7*pi/16) */
/* 1-D IDCT over one row of 8 coefficients, >>8 output scaling */
3954 static void wmv2_idct_row(short * b)
3957 int a0,a1,a2,a3,a4,a5,a6,a7;
3959 a1 = W1*b[1]+W7*b[7];
3960 a7 = W7*b[1]-W1*b[7];
3961 a5 = W5*b[5]+W3*b[3];
3962 a3 = W3*b[5]-W5*b[3];
3963 a2 = W2*b[2]+W6*b[6];
3964 a6 = W6*b[2]-W2*b[6];
3965 a0 = W0*b[0]+W0*b[4];
3966 a4 = W0*b[0]-W0*b[4];
/* 181/256 ~= 1/sqrt(2) rotation for the odd terms */
3968 s1 = (181*(a1-a5+a7-a3)+128)>>8;//1,3,5,7,
3969 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3971 b[0] = (a0+a2+a1+a5 + (1<<7))>>8;
3972 b[1] = (a4+a6 +s1 + (1<<7))>>8;
3973 b[2] = (a4-a6 +s2 + (1<<7))>>8;
3974 b[3] = (a0-a2+a7+a3 + (1<<7))>>8;
3975 b[4] = (a0-a2-a7-a3 + (1<<7))>>8;
3976 b[5] = (a4-a6 -s2 + (1<<7))>>8;
3977 b[6] = (a4+a6 -s1 + (1<<7))>>8;
3978 b[7] = (a0+a2-a1-a5 + (1<<7))>>8;
/* 1-D IDCT over one column (stride 8), with >>3 pre-scaling to keep
   extra precision and >>14 final scaling */
3980 static void wmv2_idct_col(short * b)
3983 int a0,a1,a2,a3,a4,a5,a6,a7;
3984 /*step 1, with extended precision*/
3985 a1 = (W1*b[8*1]+W7*b[8*7] + 4)>>3;
3986 a7 = (W7*b[8*1]-W1*b[8*7] + 4)>>3;
3987 a5 = (W5*b[8*5]+W3*b[8*3] + 4)>>3;
3988 a3 = (W3*b[8*5]-W5*b[8*3] + 4)>>3;
3989 a2 = (W2*b[8*2]+W6*b[8*6] + 4)>>3;
3990 a6 = (W6*b[8*2]-W2*b[8*6] + 4)>>3;
3991 a0 = (W0*b[8*0]+W0*b[8*4] )>>3;
3992 a4 = (W0*b[8*0]-W0*b[8*4] )>>3;
3994 s1 = (181*(a1-a5+a7-a3)+128)>>8;
3995 s2 = (181*(a1-a5-a7+a3)+128)>>8;
3997 b[8*0] = (a0+a2+a1+a5 + (1<<13))>>14;
3998 b[8*1] = (a4+a6 +s1 + (1<<13))>>14;
3999 b[8*2] = (a4-a6 +s2 + (1<<13))>>14;
4000 b[8*3] = (a0-a2+a7+a3 + (1<<13))>>14;
4002 b[8*4] = (a0-a2-a7-a3 + (1<<13))>>14;
4003 b[8*5] = (a4-a6 -s2 + (1<<13))>>14;
4004 b[8*6] = (a4+a6 -s1 + (1<<13))>>14;
4005 b[8*7] = (a0+a2-a1-a5 + (1<<13))>>14;
/* full 2-D IDCT: all 8 rows, then all 8 columns */
4007 void ff_wmv2_idct_c(short * block){
4011 wmv2_idct_row(block+i);
4014 wmv2_idct_col(block+i);
/*
 * IDCT put/add glue: run an IDCT on the coefficient block, then either
 * store (put) or accumulate (add) the clamped pixels into dest.
 * NOTE(review): decimated extract — function braces and the j_rev_dct*
 * calls inside the jref wrappers are missing lines. The 4/2/1 variants
 * are the reduced-size IDCTs used for lowres decoding.
 */
4017 /* XXX: those functions should be suppressed ASAP when all IDCTs are
4019 static void ff_wmv2_idct_put_c(uint8_t *dest, int line_size, DCTELEM *block)
4021 ff_wmv2_idct_c(block);
4022 put_pixels_clamped_c(block, dest, line_size);
4024 static void ff_wmv2_idct_add_c(uint8_t *dest, int line_size, DCTELEM *block)
4026 ff_wmv2_idct_c(block);
4027 add_pixels_clamped_c(block, dest, line_size);
4029 static void ff_jref_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
4032 put_pixels_clamped_c(block, dest, line_size);
4034 static void ff_jref_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
4037 add_pixels_clamped_c(block, dest, line_size);
4040 static void ff_jref_idct4_put(uint8_t *dest, int line_size, DCTELEM *block)
4043 put_pixels_clamped4_c(block, dest, line_size);
4045 static void ff_jref_idct4_add(uint8_t *dest, int line_size, DCTELEM *block)
4048 add_pixels_clamped4_c(block, dest, line_size);
4051 static void ff_jref_idct2_put(uint8_t *dest, int line_size, DCTELEM *block)
4054 put_pixels_clamped2_c(block, dest, line_size);
4056 static void ff_jref_idct2_add(uint8_t *dest, int line_size, DCTELEM *block)
4059 add_pixels_clamped2_c(block, dest, line_size);
/* 1x1 "IDCT": only the DC coefficient, clamped via the crop table */
4062 static void ff_jref_idct1_put(uint8_t *dest, int line_size, DCTELEM *block)
4064 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4066 dest[0] = cm[(block[0] + 4)>>3];
4068 static void ff_jref_idct1_add(uint8_t *dest, int line_size, DCTELEM *block)
4070 uint8_t *cm = ff_cropTbl + MAX_NEG_CROP;
4072 dest[0] = cm[dest[0] + ((block[0] + 4)>>3)];
4075 static void just_return(void *mem av_unused, int stride av_unused, int h av_unused) { return; }
4077 /* init static data */
/* Fills the global lookup tables used by the clamped pixel routines
   and the MMX quantizer. NOTE(review): decimated extract — the
   `int i;` declaration, the ff_cropTbl[i]=0 line for the negative
   range and several closing braces are missing lines. */
4078 void dsputil_static_init(void)
/* identity mapping for in-range values [0,255] */
4082 for(i=0;i<256;i++) ff_cropTbl[i + MAX_NEG_CROP] = i;
/* saturate out-of-range indices: below to 0 (line missing here),
   above to 255 */
4083 for(i=0;i<MAX_NEG_CROP;i++) {
4085 ff_cropTbl[i + MAX_NEG_CROP + 256] = 255;
/* squared-difference table indexed by (a - b) + 256 */
4088 for(i=0;i<512;i++) {
4089 ff_squareTbl[i] = (i - 256) * (i - 256);
/* inverse zigzag scan (1-based) used by the MMX quantizer */
4092 for(i=0; i<64; i++) inv_zigzag_direct16[ff_zigzag_direct[i]]= i+1;
/*
 * Verifies that the compiler honours 16-byte stack alignment (needed
 * by the SSE/AltiVec code paths); logs a one-time warning otherwise.
 * NOTE(review): decimated extract — the did_fail check/set, #endif and
 * the `return 0;` path are missing lines.
 */
4095 int ff_check_alignment(void){
4096 static int did_fail=0;
/* a 16-byte aligned stack variable: its address exposes the actual
   stack alignment */
4097 DECLARE_ALIGNED_16(int, aligned);
4099 if((long)&aligned & 15){
/* only complain on targets whose SIMD code requires the alignment */
4101 #if defined(HAVE_MMX) || defined(HAVE_ALTIVEC)
4102 av_log(NULL, AV_LOG_ERROR,
4103 "Compiler did not align stack variables. Libavcodec has been miscompiled\n"
4104 "and may be very slow or crash. This is not a bug in libavcodec,\n"
4105 "but in the compiler. You may try recompiling using gcc >= 4.2.\n"
4106 "Do not report crashes to FFmpeg developers.\n");
4115 void dsputil_init(DSPContext* c, AVCodecContext *avctx)
4119 ff_check_alignment();
4121 #ifdef CONFIG_ENCODERS
4122 if(avctx->dct_algo==FF_DCT_FASTINT) {
4123 c->fdct = fdct_ifast;
4124 c->fdct248 = fdct_ifast248;
4126 else if(avctx->dct_algo==FF_DCT_FAAN) {
4127 c->fdct = ff_faandct;
4128 c->fdct248 = ff_faandct248;
4131 c->fdct = ff_jpeg_fdct_islow; //slow/accurate/default
4132 c->fdct248 = ff_fdct248_islow;
4134 #endif //CONFIG_ENCODERS
4136 if(avctx->lowres==1){
4137 if(avctx->idct_algo==FF_IDCT_INT || avctx->idct_algo==FF_IDCT_AUTO || !ENABLE_H264_DECODER){
4138 c->idct_put= ff_jref_idct4_put;
4139 c->idct_add= ff_jref_idct4_add;
4141 c->idct_put= ff_h264_lowres_idct_put_c;
4142 c->idct_add= ff_h264_lowres_idct_add_c;
4144 c->idct = j_rev_dct4;
4145 c->idct_permutation_type= FF_NO_IDCT_PERM;
4146 }else if(avctx->lowres==2){
4147 c->idct_put= ff_jref_idct2_put;
4148 c->idct_add= ff_jref_idct2_add;
4149 c->idct = j_rev_dct2;
4150 c->idct_permutation_type= FF_NO_IDCT_PERM;
4151 }else if(avctx->lowres==3){
4152 c->idct_put= ff_jref_idct1_put;
4153 c->idct_add= ff_jref_idct1_add;
4154 c->idct = j_rev_dct1;
4155 c->idct_permutation_type= FF_NO_IDCT_PERM;
4157 if(avctx->idct_algo==FF_IDCT_INT){
4158 c->idct_put= ff_jref_idct_put;
4159 c->idct_add= ff_jref_idct_add;
4160 c->idct = j_rev_dct;
4161 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
4162 }else if((ENABLE_VP3_DECODER || ENABLE_VP5_DECODER || ENABLE_VP6_DECODER || ENABLE_THEORA_DECODER ) &&
4163 avctx->idct_algo==FF_IDCT_VP3){
4164 c->idct_put= ff_vp3_idct_put_c;
4165 c->idct_add= ff_vp3_idct_add_c;
4166 c->idct = ff_vp3_idct_c;
4167 c->idct_permutation_type= FF_NO_IDCT_PERM;
4168 }else if(avctx->idct_algo==FF_IDCT_WMV2){
4169 c->idct_put= ff_wmv2_idct_put_c;
4170 c->idct_add= ff_wmv2_idct_add_c;
4171 c->idct = ff_wmv2_idct_c;
4172 c->idct_permutation_type= FF_NO_IDCT_PERM;
4173 }else if(avctx->idct_algo==FF_IDCT_FAAN){
4174 c->idct_put= ff_faanidct_put;
4175 c->idct_add= ff_faanidct_add;
4176 c->idct = ff_faanidct;
4177 c->idct_permutation_type= FF_NO_IDCT_PERM;
4178 }else{ //accurate/default
4179 c->idct_put= ff_simple_idct_put;
4180 c->idct_add= ff_simple_idct_add;
4181 c->idct = ff_simple_idct;
4182 c->idct_permutation_type= FF_NO_IDCT_PERM;
4186 if (ENABLE_H264_DECODER) {
4187 c->h264_idct_add= ff_h264_idct_add_c;
4188 c->h264_idct8_add= ff_h264_idct8_add_c;
4189 c->h264_idct_dc_add= ff_h264_idct_dc_add_c;
4190 c->h264_idct8_dc_add= ff_h264_idct8_dc_add_c;
4193 c->get_pixels = get_pixels_c;
4194 c->diff_pixels = diff_pixels_c;
4195 c->put_pixels_clamped = put_pixels_clamped_c;
4196 c->put_signed_pixels_clamped = put_signed_pixels_clamped_c;
4197 c->add_pixels_clamped = add_pixels_clamped_c;
4198 c->add_pixels8 = add_pixels8_c;
4199 c->add_pixels4 = add_pixels4_c;
4200 c->sum_abs_dctelem = sum_abs_dctelem_c;
4203 c->clear_blocks = clear_blocks_c;
4204 c->pix_sum = pix_sum_c;
4205 c->pix_norm1 = pix_norm1_c;
4207 /* TODO [0] 16 [1] 8 */
4208 c->pix_abs[0][0] = pix_abs16_c;
4209 c->pix_abs[0][1] = pix_abs16_x2_c;
4210 c->pix_abs[0][2] = pix_abs16_y2_c;
4211 c->pix_abs[0][3] = pix_abs16_xy2_c;
4212 c->pix_abs[1][0] = pix_abs8_c;
4213 c->pix_abs[1][1] = pix_abs8_x2_c;
4214 c->pix_abs[1][2] = pix_abs8_y2_c;
4215 c->pix_abs[1][3] = pix_abs8_xy2_c;
4217 #define dspfunc(PFX, IDX, NUM) \
4218 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## NUM ## _c; \
4219 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## NUM ## _x2_c; \
4220 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## NUM ## _y2_c; \
4221 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## NUM ## _xy2_c
4223 dspfunc(put, 0, 16);
4224 dspfunc(put_no_rnd, 0, 16);
4226 dspfunc(put_no_rnd, 1, 8);
4230 dspfunc(avg, 0, 16);
4231 dspfunc(avg_no_rnd, 0, 16);
4233 dspfunc(avg_no_rnd, 1, 8);
4238 c->put_no_rnd_pixels_l2[0]= put_no_rnd_pixels16_l2_c;
4239 c->put_no_rnd_pixels_l2[1]= put_no_rnd_pixels8_l2_c;
4241 c->put_tpel_pixels_tab[ 0] = put_tpel_pixels_mc00_c;
4242 c->put_tpel_pixels_tab[ 1] = put_tpel_pixels_mc10_c;
4243 c->put_tpel_pixels_tab[ 2] = put_tpel_pixels_mc20_c;
4244 c->put_tpel_pixels_tab[ 4] = put_tpel_pixels_mc01_c;
4245 c->put_tpel_pixels_tab[ 5] = put_tpel_pixels_mc11_c;
4246 c->put_tpel_pixels_tab[ 6] = put_tpel_pixels_mc21_c;
4247 c->put_tpel_pixels_tab[ 8] = put_tpel_pixels_mc02_c;
4248 c->put_tpel_pixels_tab[ 9] = put_tpel_pixels_mc12_c;
4249 c->put_tpel_pixels_tab[10] = put_tpel_pixels_mc22_c;
4251 c->avg_tpel_pixels_tab[ 0] = avg_tpel_pixels_mc00_c;
4252 c->avg_tpel_pixels_tab[ 1] = avg_tpel_pixels_mc10_c;
4253 c->avg_tpel_pixels_tab[ 2] = avg_tpel_pixels_mc20_c;
4254 c->avg_tpel_pixels_tab[ 4] = avg_tpel_pixels_mc01_c;
4255 c->avg_tpel_pixels_tab[ 5] = avg_tpel_pixels_mc11_c;
4256 c->avg_tpel_pixels_tab[ 6] = avg_tpel_pixels_mc21_c;
4257 c->avg_tpel_pixels_tab[ 8] = avg_tpel_pixels_mc02_c;
4258 c->avg_tpel_pixels_tab[ 9] = avg_tpel_pixels_mc12_c;
4259 c->avg_tpel_pixels_tab[10] = avg_tpel_pixels_mc22_c;
4261 #define dspfunc(PFX, IDX, NUM) \
4262 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## NUM ## _mc00_c; \
4263 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## NUM ## _mc10_c; \
4264 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## NUM ## _mc20_c; \
4265 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## NUM ## _mc30_c; \
4266 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## NUM ## _mc01_c; \
4267 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## NUM ## _mc11_c; \
4268 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## NUM ## _mc21_c; \
4269 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## NUM ## _mc31_c; \
4270 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## NUM ## _mc02_c; \
4271 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## NUM ## _mc12_c; \
4272 c->PFX ## _pixels_tab[IDX][10] = PFX ## NUM ## _mc22_c; \
4273 c->PFX ## _pixels_tab[IDX][11] = PFX ## NUM ## _mc32_c; \
4274 c->PFX ## _pixels_tab[IDX][12] = PFX ## NUM ## _mc03_c; \
4275 c->PFX ## _pixels_tab[IDX][13] = PFX ## NUM ## _mc13_c; \
4276 c->PFX ## _pixels_tab[IDX][14] = PFX ## NUM ## _mc23_c; \
4277 c->PFX ## _pixels_tab[IDX][15] = PFX ## NUM ## _mc33_c
/* Quarter-pel motion compensation tables: each dspfunc() invocation fills
 * all 16 sub-pixel positions (mc00..mc33) of one table row via the macro
 * above. IDX 0 = 16x16 blocks, IDX 1 = 8x8 blocks. */
4279 dspfunc(put_qpel, 0, 16);
4280 dspfunc(put_no_rnd_qpel, 0, 16);
4282 dspfunc(avg_qpel, 0, 16);
/* avg_no_rnd_qpel is intentionally left unset — no codec in this tree uses it. */
4283 /* dspfunc(avg_no_rnd_qpel, 0, 16); */
4285 dspfunc(put_qpel, 1, 8);
4286 dspfunc(put_no_rnd_qpel, 1, 8);
4288 dspfunc(avg_qpel, 1, 8);
4289 /* dspfunc(avg_no_rnd_qpel, 1, 8); */
/* H.264 luma quarter-pel: four block sizes, 16x16 down to 2x2. */
4291 dspfunc(put_h264_qpel, 0, 16);
4292 dspfunc(put_h264_qpel, 1, 8);
4293 dspfunc(put_h264_qpel, 2, 4);
4294 dspfunc(put_h264_qpel, 3, 2);
4295 dspfunc(avg_h264_qpel, 0, 16);
4296 dspfunc(avg_h264_qpel, 1, 8);
4297 dspfunc(avg_h264_qpel, 2, 4);
/* NOTE(review): dspfunc(avg_h264_qpel, 3, 2) appears elided in this excerpt —
 * confirm against the full file. */
/* H.264 chroma MC: [0]=8x8, [1]=4x4, [2]=2x2 variants. */
4300 c->put_h264_chroma_pixels_tab[0]= put_h264_chroma_mc8_c;
4301 c->put_h264_chroma_pixels_tab[1]= put_h264_chroma_mc4_c;
4302 c->put_h264_chroma_pixels_tab[2]= put_h264_chroma_mc2_c;
4303 c->avg_h264_chroma_pixels_tab[0]= avg_h264_chroma_mc8_c;
4304 c->avg_h264_chroma_pixels_tab[1]= avg_h264_chroma_mc4_c;
4305 c->avg_h264_chroma_pixels_tab[2]= avg_h264_chroma_mc2_c;
4306 c->put_no_rnd_h264_chroma_pixels_tab[0]= put_no_rnd_h264_chroma_mc8_c;
/* H.264 weighted-prediction tables, indexed from the largest partition
 * (16x16) down to the smallest (2x2); biweight = bidirectional variant. */
4308 c->weight_h264_pixels_tab[0]= weight_h264_pixels16x16_c;
4309 c->weight_h264_pixels_tab[1]= weight_h264_pixels16x8_c;
4310 c->weight_h264_pixels_tab[2]= weight_h264_pixels8x16_c;
4311 c->weight_h264_pixels_tab[3]= weight_h264_pixels8x8_c;
4312 c->weight_h264_pixels_tab[4]= weight_h264_pixels8x4_c;
4313 c->weight_h264_pixels_tab[5]= weight_h264_pixels4x8_c;
4314 c->weight_h264_pixels_tab[6]= weight_h264_pixels4x4_c;
4315 c->weight_h264_pixels_tab[7]= weight_h264_pixels4x2_c;
4316 c->weight_h264_pixels_tab[8]= weight_h264_pixels2x4_c;
4317 c->weight_h264_pixels_tab[9]= weight_h264_pixels2x2_c;
4318 c->biweight_h264_pixels_tab[0]= biweight_h264_pixels16x16_c;
4319 c->biweight_h264_pixels_tab[1]= biweight_h264_pixels16x8_c;
4320 c->biweight_h264_pixels_tab[2]= biweight_h264_pixels8x16_c;
4321 c->biweight_h264_pixels_tab[3]= biweight_h264_pixels8x8_c;
4322 c->biweight_h264_pixels_tab[4]= biweight_h264_pixels8x4_c;
4323 c->biweight_h264_pixels_tab[5]= biweight_h264_pixels4x8_c;
4324 c->biweight_h264_pixels_tab[6]= biweight_h264_pixels4x4_c;
4325 c->biweight_h264_pixels_tab[7]= biweight_h264_pixels4x2_c;
4326 c->biweight_h264_pixels_tab[8]= biweight_h264_pixels2x4_c;
4327 c->biweight_h264_pixels_tab[9]= biweight_h264_pixels2x2_c;
4329 c->draw_edges = draw_edges_c;
/* Codec-specific DSP hookups, compiled in only when the decoder/encoder
 * is enabled. NOTE(review): the matching #endif lines are not visible in
 * this excerpt — confirm against the full file. */
4331 #ifdef CONFIG_CAVS_DECODER
4332 ff_cavsdsp_init(c,avctx);
4334 #if defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4335 ff_vc1dsp_init(c,avctx);
4337 #if defined(CONFIG_WMV2_DECODER) || defined(CONFIG_VC1_DECODER) || defined(CONFIG_WMV3_DECODER)
4338 ff_intrax8dsp_init(c,avctx);
4340 #if defined(CONFIG_H264_ENCODER)
4341 ff_h264dspenc_init(c,avctx);
/* WMV2 "mspel" half-pel MC table: horizontal positions 0-3 then vertical
 * row 2 positions (mc02..mc32). */
4344 c->put_mspel_pixels_tab[0]= put_mspel8_mc00_c;
4345 c->put_mspel_pixels_tab[1]= put_mspel8_mc10_c;
4346 c->put_mspel_pixels_tab[2]= put_mspel8_mc20_c;
4347 c->put_mspel_pixels_tab[3]= put_mspel8_mc30_c;
4348 c->put_mspel_pixels_tab[4]= put_mspel8_mc02_c;
4349 c->put_mspel_pixels_tab[5]= put_mspel8_mc12_c;
4350 c->put_mspel_pixels_tab[6]= put_mspel8_mc22_c;
4351 c->put_mspel_pixels_tab[7]= put_mspel8_mc32_c;
/* Helper: wires comparison-function slot [0] (16-wide) and [1] (8x8). */
4353 #define SET_CMP_FUNC(name) \
4354 c->name[0]= name ## 16_c;\
4355 c->name[1]= name ## 8x8_c;
4357 SET_CMP_FUNC(hadamard8_diff)
4358 c->hadamard8_diff[4]= hadamard8_intra16_c;
4359 SET_CMP_FUNC(dct_sad)
4360 SET_CMP_FUNC(dct_max)
4362 SET_CMP_FUNC(dct264_sad)
4364 c->sad[0]= pix_abs16_c;
4365 c->sad[1]= pix_abs8_c;
4369 SET_CMP_FUNC(quant_psnr)
/* Vertical SAD/SSE and noise-shaped SSE comparators; slot [4] holds the
 * intra (no-reference) variants. */
4372 c->vsad[0]= vsad16_c;
4373 c->vsad[4]= vsad_intra16_c;
4374 c->vsse[0]= vsse16_c;
4375 c->vsse[4]= vsse_intra16_c;
4376 c->nsse[0]= nsse16_c;
4377 c->nsse[1]= nsse8_c;
/* Snow wavelet rate-distortion metrics (5/3 and 9/7 filters). */
4378 #ifdef CONFIG_SNOW_ENCODER
4379 c->w53[0]= w53_16_c;
4381 c->w97[0]= w97_16_c;
4385 c->ssd_int8_vs_int16 = ssd_int8_vs_int16_c;
/* Byte-wise helpers used by lossless codecs (e.g. HuffYUV prediction). */
4387 c->add_bytes= add_bytes_c;
4388 c->add_bytes_l2= add_bytes_l2_c;
4389 c->diff_bytes= diff_bytes_c;
4390 c->sub_hfyu_median_prediction= sub_hfyu_median_prediction_c;
4391 c->bswap_buf= bswap_buf;
4392 #ifdef CONFIG_PNG_DECODER
4393 c->add_png_paeth_prediction= ff_add_png_paeth_prediction;
/* H.264 in-loop deblocking filters; loop_filter_strength has no C
 * implementation, only SIMD versions (hence NULL here). */
4396 c->h264_v_loop_filter_luma= h264_v_loop_filter_luma_c;
4397 c->h264_h_loop_filter_luma= h264_h_loop_filter_luma_c;
4398 c->h264_v_loop_filter_chroma= h264_v_loop_filter_chroma_c;
4399 c->h264_h_loop_filter_chroma= h264_h_loop_filter_chroma_c;
4400 c->h264_v_loop_filter_chroma_intra= h264_v_loop_filter_chroma_intra_c;
4401 c->h264_h_loop_filter_chroma_intra= h264_h_loop_filter_chroma_intra_c;
4402 c->h264_loop_filter_strength= NULL;
4404 if (ENABLE_ANY_H263) {
4405 c->h263_h_loop_filter= h263_h_loop_filter_c;
4406 c->h263_v_loop_filter= h263_v_loop_filter_c;
/* NOTE(review): the closing brace of the ENABLE_ANY_H263 block appears
 * elided in this excerpt — confirm against the full file. */
4409 c->h261_loop_filter= h261_loop_filter_c;
/* Rate-distortion basis-function helpers used by the trellis quantizer. */
4411 c->try_8x8basis= try_8x8basis_c;
4412 c->add_8x8basis= add_8x8basis_c;
4414 #ifdef CONFIG_SNOW_DECODER
4415 c->vertical_compose97i = ff_snow_vertical_compose97i;
4416 c->horizontal_compose97i = ff_snow_horizontal_compose97i;
4417 c->inner_add_yblock = ff_snow_inner_add_yblock;
4420 #ifdef CONFIG_VORBIS_DECODER
4421 c->vorbis_inverse_coupling = vorbis_inverse_coupling;
4423 #ifdef CONFIG_FLAC_ENCODER
4424 c->flac_compute_autocorr = ff_flac_compute_autocorr;
/* Generic float vector primitives shared by audio codecs. */
4426 c->vector_fmul = vector_fmul_c;
4427 c->vector_fmul_reverse = vector_fmul_reverse_c;
4428 c->vector_fmul_add_add = ff_vector_fmul_add_add_c;
4429 c->float_to_int16 = ff_float_to_int16_c;
/* Image downscalers: shrink[n] halves each dimension n times
 * (shrink22 = 2:1, shrink44 = 4:1, shrink88 = 8:1); [0] is a plain copy. */
4431 c->shrink[0]= ff_img_copy_plane;
4432 c->shrink[1]= ff_shrink22;
4433 c->shrink[2]= ff_shrink44;
4434 c->shrink[3]= ff_shrink88;
/* Default no-op prefetch; arch-specific init below may override it. */
4436 c->prefetch= just_return;
/* Clear the 2-tap tables first so we can detect, after the arch-specific
 * init calls, which entries were NOT overridden and need the h264 fallback. */
4438 memset(c->put_2tap_qpel_pixels_tab, 0, sizeof(c->put_2tap_qpel_pixels_tab));
4439 memset(c->avg_2tap_qpel_pixels_tab, 0, sizeof(c->avg_2tap_qpel_pixels_tab));
/* Architecture-specific optimized implementations override the C defaults
 * set above; ENABLE_* are compile-time constants, so dead calls are
 * removed by the compiler. */
4441 if (ENABLE_MMX) dsputil_init_mmx (c, avctx);
4442 if (ENABLE_ARMV4L) dsputil_init_armv4l(c, avctx);
4443 if (ENABLE_MLIB) dsputil_init_mlib (c, avctx);
4444 if (ENABLE_VIS) dsputil_init_vis (c, avctx);
4445 if (ENABLE_ALPHA) dsputil_init_alpha (c, avctx);
4446 if (ENABLE_POWERPC) dsputil_init_ppc (c, avctx);
4447 if (ENABLE_MMI) dsputil_init_mmi (c, avctx);
4448 if (ENABLE_SH4) dsputil_init_sh4 (c, avctx);
4449 if (ENABLE_BFIN) dsputil_init_bfin (c, avctx);
/* Fill any 2-tap slot the arch init left NULL with the h264 qpel function. */
4451 for(i=0; i<64; i++){
4452 if(!c->put_2tap_qpel_pixels_tab[0][i])
4453 c->put_2tap_qpel_pixels_tab[0][i]= c->put_h264_qpel_pixels_tab[0][i];
4454 if(!c->avg_2tap_qpel_pixels_tab[0][i])
4455 c->avg_2tap_qpel_pixels_tab[0][i]= c->avg_h264_qpel_pixels_tab[0][i];
/* Build the 64-entry coefficient permutation matching whichever IDCT the
 * arch init selected, so scantables can be pre-permuted once.
 * NOTE(review): the for(i=0;i<64;i++) headers and break; statements of
 * each case appear elided in this excerpt — confirm against the full file. */
4458 switch(c->idct_permutation_type){
4459 case FF_NO_IDCT_PERM:
/* Identity: coefficients stay in natural order. */
4461 c->idct_permutation[i]= i;
4463 case FF_LIBMPEG2_IDCT_PERM:
/* libmpeg2-style: swap the low column bits within each row. */
4465 c->idct_permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
4467 case FF_SIMPLE_IDCT_PERM:
/* Table-driven layout used by the MMX simple IDCT. */
4469 c->idct_permutation[i]= simple_mmx_permutation[i];
4471 case FF_TRANSPOSE_IDCT_PERM:
/* Full row/column transpose of the 8x8 block. */
4473 c->idct_permutation[i]= ((i&7)<<3) | (i>>3);
4475 case FF_PARTTRANS_IDCT_PERM:
/* Partial transpose: swap only the low two bits of row and column. */
4477 c->idct_permutation[i]= (i&0x24) | ((i&3)<<3) | ((i>>3)&3);
/* Unknown permutation type: programming error, not a user error. */
4480 av_log(avctx, AV_LOG_ERROR, "Internal error, IDCT permutation not set\n");