]> git.sesse.net Git - ffmpeg/blob - libavcodec/dsputil.c
first cut at altivec support on darwin patch by (Brian Foley <bfoley at compsoc dot...
[ffmpeg] / libavcodec / dsputil.c
1 /*
2  * DSP utils
3  * Copyright (c) 2000, 2001 Fabrice Bellard.
4  *
5  * This library is free software; you can redistribute it and/or
6  * modify it under the terms of the GNU Lesser General Public
7  * License as published by the Free Software Foundation; either
8  * version 2 of the License, or (at your option) any later version.
9  *
10  * This library is distributed in the hope that it will be useful,
11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
13  * Lesser General Public License for more details.
14  *
15  * You should have received a copy of the GNU Lesser General Public
16  * License along with this library; if not, write to the Free Software
17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
18  *
19  * gmc & q-pel & 32/64 bit based MC by Michael Niedermayer <michaelni@gmx.at>
20  */
21 #include "avcodec.h"
22 #include "dsputil.h"
23 #include "simple_idct.h"
24
25 void (*ff_idct)(DCTELEM *block);
26 void (*ff_idct_put)(UINT8 *dest, int line_size, DCTELEM *block);
27 void (*ff_idct_add)(UINT8 *dest, int line_size, DCTELEM *block);
28 void (*av_fdct)(DCTELEM *block);
29 void (*get_pixels)(DCTELEM *block, const UINT8 *pixels, int line_size);
30 void (*diff_pixels)(DCTELEM *block, const UINT8 *s1, const UINT8 *s2, int stride);
31 void (*put_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
32 void (*add_pixels_clamped)(const DCTELEM *block, UINT8 *pixels, int line_size);
33 void (*gmc1)(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder);
34 void (*clear_blocks)(DCTELEM *blocks);
35 int (*pix_sum)(UINT8 * pix, int line_size);
36 int (*pix_norm1)(UINT8 * pix, int line_size);
37
38 op_pixels_abs_func pix_abs16x16;
39 op_pixels_abs_func pix_abs16x16_x2;
40 op_pixels_abs_func pix_abs16x16_y2;
41 op_pixels_abs_func pix_abs16x16_xy2;
42
43 op_pixels_abs_func pix_abs8x8;
44 op_pixels_abs_func pix_abs8x8_x2;
45 op_pixels_abs_func pix_abs8x8_y2;
46 op_pixels_abs_func pix_abs8x8_xy2;
47
48 UINT8 cropTbl[256 + 2 * MAX_NEG_CROP];
49 UINT32 squareTbl[512];
50
51 extern INT16 ff_mpeg1_default_intra_matrix[64];
52 extern INT16 ff_mpeg1_default_non_intra_matrix[64];
53 extern INT16 ff_mpeg4_default_intra_matrix[64];
54 extern INT16 ff_mpeg4_default_non_intra_matrix[64];
55
/*
 * Zigzag scan order of an 8x8 block: entry i is the raster position
 * (row * 8 + column) of the i-th coefficient in scan order.
 */
UINT8 zigzag_direct[64] = {
     0,  1,  8, 16,  9,  2,  3, 10,
    17, 24, 32, 25, 18, 11,  4,  5,
    12, 19, 26, 33, 40, 48, 41, 34,
    27, 20, 13,  6,  7, 14, 21, 28,
    35, 42, 49, 56, 57, 50, 43, 36,
    29, 22, 15, 23, 30, 37, 44, 51,
    58, 59, 52, 45, 38, 31, 39, 46,
    53, 60, 61, 54, 47, 55, 62, 63
};
66
67 /* not permutated inverse zigzag_direct + 1 for MMX quantizer */
68 UINT16 __align8 inv_zigzag_direct16[64];
69
70 /* not permutated zigzag_direct for MMX quantizer */
71 UINT8 zigzag_direct_noperm[64];
72
/*
 * Alternate scan order favouring horizontal runs: entry i is the raster
 * position (row * 8 + column) visited at scan step i.
 */
UINT8 ff_alternate_horizontal_scan[64] = {
     0,  1,  2,  3,  8,  9, 16, 17,
    10, 11,  4,  5,  6,  7, 15, 14,
    13, 12, 19, 18, 24, 25, 32, 33,
    26, 27, 20, 21, 22, 23, 28, 29,
    30, 31, 34, 35, 40, 41, 48, 49,
    42, 43, 36, 37, 38, 39, 44, 45,
    46, 47, 50, 51, 56, 57, 58, 59,
    52, 53, 54, 55, 60, 61, 62, 63,
};
83
/*
 * Alternate scan order favouring vertical runs: entry i is the raster
 * position (row * 8 + column) visited at scan step i.
 */
UINT8 ff_alternate_vertical_scan[64] = {
     0,  8, 16, 24,  1,  9,  2, 10,
    17, 25, 32, 40, 48, 56, 57, 49,
    41, 33, 26, 18,  3, 11,  4, 12,
    19, 27, 34, 42, 50, 58, 35, 43,
    51, 59, 20, 28,  5, 13,  6, 14,
    21, 29, 36, 44, 52, 60, 37, 45,
    53, 61, 22, 30,  7, 15, 23, 31,
    38, 46, 54, 62, 39, 47, 55, 63,
};
94
#ifdef SIMPLE_IDCT

/*
 * Coefficient order expected on input by simple_idct_mmx.  The table is a
 * permutation of 0..63 and is only compiled when SIMPLE_IDCT is defined.
 */
static UINT8 simple_mmx_permutation[64] = {
    0x00, 0x08, 0x04, 0x09, 0x01, 0x0C, 0x05, 0x0D,
    0x10, 0x18, 0x14, 0x19, 0x11, 0x1C, 0x15, 0x1D,
    0x20, 0x28, 0x24, 0x29, 0x21, 0x2C, 0x25, 0x2D,
    0x12, 0x1A, 0x16, 0x1B, 0x13, 0x1E, 0x17, 0x1F,
    0x02, 0x0A, 0x06, 0x0B, 0x03, 0x0E, 0x07, 0x0F,
    0x30, 0x38, 0x34, 0x39, 0x31, 0x3C, 0x35, 0x3D,
    0x22, 0x2A, 0x26, 0x2B, 0x23, 0x2E, 0x27, 0x2F,
    0x32, 0x3A, 0x36, 0x3B, 0x33, 0x3E, 0x37, 0x3F,
};
#endif
109
/*
 * Reciprocal table for division by multiplication:
 *   (a * inverse[b]) >> 32 == a / b   for all 0 <= a <= 65536, 2 <= b <= 255
 * (the product has to be formed in 64 bits).  Each row holds eight
 * entries; the comment on the right gives the index of the row's first one.
 */
UINT32 inverse[256] = {
            0, 4294967295U, 2147483648U,  1431655766,  1073741824,   858993460,   715827883,   613566757, /*   0 */
    536870912,   477218589,   429496730,   390451573,   357913942,   330382100,   306783379,   286331154, /*   8 */
    268435456,   252645136,   238609295,   226050911,   214748365,   204522253,   195225787,   186737709, /*  16 */
    178956971,   171798692,   165191050,   159072863,   153391690,   148102321,   143165577,   138547333, /*  24 */
    134217728,   130150525,   126322568,   122713352,   119304648,   116080198,   113025456,   110127367, /*  32 */
    107374183,   104755300,   102261127,    99882961,    97612894,    95443718,    93368855,    91382283, /*  40 */
     89478486,    87652394,    85899346,    84215046,    82595525,    81037119,    79536432,    78090315, /*  48 */
     76695845,    75350304,    74051161,    72796056,    71582789,    70409300,    69273667,    68174085, /*  56 */
     67108864,    66076420,    65075263,    64103990,    63161284,    62245903,    61356676,    60492498, /*  64 */
     59652324,    58835169,    58040099,    57266231,    56512728,    55778797,    55063684,    54366675, /*  72 */
     53687092,    53024288,    52377650,    51746594,    51130564,    50529028,    49941481,    49367441, /*  80 */
     48806447,    48258060,    47721859,    47197443,    46684428,    46182445,    45691142,    45210183, /*  88 */
     44739243,    44278014,    43826197,    43383509,    42949673,    42524429,    42107523,    41698712, /*  96 */
     41297763,    40904451,    40518560,    40139882,    39768216,    39403370,    39045158,    38693400, /* 104 */
     38347923,    38008561,    37675152,    37347542,    37025581,    36709123,    36398028,    36092163, /* 112 */
     35791395,    35495598,    35204650,    34918434,    34636834,    34359739,    34087043,    33818641, /* 120 */
     33554432,    33294321,    33038210,    32786010,    32537632,    32292988,    32051995,    31814573, /* 128 */
     31580642,    31350127,    31122952,    30899046,    30678338,    30460761,    30246249,    30034737, /* 136 */
     29826162,    29620465,    29417585,    29217465,    29020050,    28825284,    28633116,    28443493, /* 144 */
     28256364,    28071682,    27889399,    27709467,    27531842,    27356480,    27183338,    27012373, /* 152 */
     26843546,    26676816,    26512144,    26349493,    26188825,    26030105,    25873297,    25718368, /* 160 */
     25565282,    25414008,    25264514,    25116768,    24970741,    24826401,    24683721,    24542671, /* 168 */
     24403224,    24265352,    24129030,    23994231,    23860930,    23729102,    23598722,    23469767, /* 176 */
     23342214,    23216040,    23091223,    22967740,    22845571,    22724695,    22605092,    22486740, /* 184 */
     22369622,    22253717,    22139007,    22025474,    21913099,    21801865,    21691755,    21582751, /* 192 */
     21474837,    21367997,    21262215,    21157475,    21053762,    20951060,    20849356,    20748635, /* 200 */
     20648882,    20550083,    20452226,    20355296,    20259280,    20164166,    20069941,    19976593, /* 208 */
     19884108,    19792477,    19701685,    19611723,    19522579,    19434242,    19346700,    19259944, /* 216 */
     19173962,    19088744,    19004281,    18920561,    18837576,    18755316,    18673771,    18592933, /* 224 */
     18512791,    18433337,    18354562,    18276457,    18199014,    18122225,    18046082,    17970575, /* 232 */
     17895698,    17821442,    17747799,    17674763,    17602325,    17530479,    17459217,    17388532, /* 240 */
     17318417,    17248865,    17179870,    17111424,    17043522,    16976156,    16909321,    16843010, /* 248 */
};
145
/* filled by build_zigzag_end(); used to skip zeros at the end */
UINT8 zigzag_end[64];

/* only defined here; filled in elsewhere (not visible in this part of the file) */
UINT8 permutation[64];
/* UINT8 invPermutation[64]; -- currently disabled */
151
152 static void build_zigzag_end(void)
153 {
154     int lastIndex;
155     int lastIndexAfterPerm=0;
156     for(lastIndex=0; lastIndex<64; lastIndex++)
157     {
158         if(zigzag_direct[lastIndex] > lastIndexAfterPerm) 
159             lastIndexAfterPerm= zigzag_direct[lastIndex];
160         zigzag_end[lastIndex]= lastIndexAfterPerm + 1;
161     }
162 }
163
/**
 * Sum all pixels of a 16x16 block.
 *
 * @param pix        top-left pixel of the block
 * @param line_size  distance in bytes between the starts of two rows
 * @return sum of the 256 pixel values
 */
int pix_sum_c(UINT8 * pix, int line_size)
{
    const UINT8 *row = pix;
    int total = 0;
    int y, x;

    for (y = 0; y < 16; y++) {
        for (x = 0; x < 16; x++)
            total += row[x];
        row += line_size;
    }
    return total;
}
185
186 int pix_norm1_c(UINT8 * pix, int line_size)
187 {
188     int s, i, j;
189     UINT32 *sq = squareTbl + 256;
190
191     s = 0;
192     for (i = 0; i < 16; i++) {
193         for (j = 0; j < 16; j += 8) {
194             s += sq[pix[0]];
195             s += sq[pix[1]];
196             s += sq[pix[2]];
197             s += sq[pix[3]];
198             s += sq[pix[4]];
199             s += sq[pix[5]];
200             s += sq[pix[6]];
201             s += sq[pix[7]];
202             pix += 8;
203         }
204         pix += line_size - 16;
205     }
206     return s;
207 }
208
209
/**
 * Copy an 8x8 block of pixels into a block of 64 coefficients.
 *
 * @param block      destination, 64 entries stored row after row
 * @param pixels     top-left source pixel
 * @param line_size  distance in bytes between the starts of two source rows
 */
void get_pixels_c(DCTELEM *restrict block, const UINT8 *pixels, int line_size)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = pixels[col];
        block  += 8;
        pixels += line_size;
    }
}
228
/**
 * Store the per-pixel difference s1 - s2 of two 8x8 blocks.
 *
 * @param block   destination, 64 entries stored row after row
 * @param s1      top-left pixel of the minuend block
 * @param s2      top-left pixel of the subtrahend block
 * @param stride  distance in bytes between the starts of two rows,
 *                used for both s1 and s2
 */
void diff_pixels_c(DCTELEM *restrict block, const UINT8 *s1, const UINT8 *s2,
                   int stride)
{
    int row, col;

    for (row = 0; row < 8; row++) {
        for (col = 0; col < 8; col++)
            block[col] = s1[col] - s2[col];
        block += 8;
        s1    += stride;
        s2    += stride;
    }
}
248
249
250 void put_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
251                           int line_size)
252 {
253     int i;
254     UINT8 *cm = cropTbl + MAX_NEG_CROP;
255     
256     /* read the pixels */
257     for(i=0;i<8;i++) {
258         pixels[0] = cm[block[0]];
259         pixels[1] = cm[block[1]];
260         pixels[2] = cm[block[2]];
261         pixels[3] = cm[block[3]];
262         pixels[4] = cm[block[4]];
263         pixels[5] = cm[block[5]];
264         pixels[6] = cm[block[6]];
265         pixels[7] = cm[block[7]];
266
267         pixels += line_size;
268         block += 8;
269     }
270 }
271
272 void add_pixels_clamped_c(const DCTELEM *block, UINT8 *restrict pixels,
273                           int line_size)
274 {
275     int i;
276     UINT8 *cm = cropTbl + MAX_NEG_CROP;
277     
278     /* read the pixels */
279     for(i=0;i<8;i++) {
280         pixels[0] = cm[pixels[0] + block[0]];
281         pixels[1] = cm[pixels[1] + block[1]];
282         pixels[2] = cm[pixels[2] + block[2]];
283         pixels[3] = cm[pixels[3] + block[3]];
284         pixels[4] = cm[pixels[4] + block[4]];
285         pixels[5] = cm[pixels[5] + block[5]];
286         pixels[6] = cm[pixels[6] + block[6]];
287         pixels[7] = cm[pixels[7] + block[7]];
288         pixels += line_size;
289         block += 8;
290     }
291 }
292 #if 0
293
/*
 * 64-bit variant of the 8-pixel-wide put/avg primitives (this branch is
 * switched off by the enclosing "#if 0").  One 64-bit word covers a whole
 * 8-pixel row.
 *
 * PIXOP2(OPNAME, OP) defines
 *   OPNAME_pixels              plain copy of h rows
 *   OPNAME_[no_rnd_]pixels_x2  average of each pixel and its right neighbour
 *   OPNAME_[no_rnd_]pixels_y2  average of each pixel and the one below it
 *   OPNAME_[no_rnd_]pixels_xy2 average of the 2x2 neighbourhood
 * and the lookup tables OPNAME_pixels_tab[] / OPNAME_no_rnd_pixels_tab[].
 * OP(dst, value) decides how the result is combined with the destination.
 *
 * All eight bytes of a word are averaged at once:
 *   (a|b) - (((a^b) & 0xFE..) >> 1)   per byte (a + b + 1) >> 1
 *   (a&b) + (((a^b) & 0xFE..) >> 1)   per byte (a + b) >> 1
 * The xy2 versions add the low two bits and the high six bits of the four
 * inputs separately so that no byte can overflow into its neighbour; the
 * rounding constant is 2 per byte (1 for no_rnd).  They produce two output
 * rows per loop iteration, so an odd h processes h + 1 rows.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint64_t*)block), LD64(pixels));\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t left  = LD64(pixels    );\
        const uint64_t right = LD64(pixels + 1);\
        OP(*((uint64_t*)block), (left & right) + (((left ^ right) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t left  = LD64(pixels    );\
        const uint64_t right = LD64(pixels + 1);\
        OP(*((uint64_t*)block), (left | right) - (((left ^ right) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t upper = LD64(pixels            );\
        const uint64_t lower = LD64(pixels + line_size);\
        OP(*((uint64_t*)block), (upper & lower) + (((upper ^ lower) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        const uint64_t upper = LD64(pixels            );\
        const uint64_t lower = LD64(pixels + line_size);\
        OP(*((uint64_t*)block), (upper | lower) - (((upper ^ lower) & 0xFEFEFEFEFEFEFEFEULL) >> 1));\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    uint64_t cur = LD64(pixels    );\
    uint64_t nxt = LD64(pixels + 1);\
    uint64_t lo_even = (cur & 0x0303030303030303ULL)\
                     + (nxt & 0x0303030303030303ULL)\
                     + 0x0202020202020202ULL;\
    uint64_t hi_even = ((cur & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                     + ((nxt & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
    uint64_t lo_odd, hi_odd;\
    int row;\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        cur = LD64(pixels    );\
        nxt = LD64(pixels + 1);\
        lo_odd = (cur & 0x0303030303030303ULL)\
               + (nxt & 0x0303030303030303ULL);\
        hi_odd = ((cur & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
               + ((nxt & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
\
        cur = LD64(pixels    );\
        nxt = LD64(pixels + 1);\
        lo_even = (cur & 0x0303030303030303ULL)\
                + (nxt & 0x0303030303030303ULL)\
                + 0x0202020202020202ULL;\
        hi_even = ((cur & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                + ((nxt & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    uint64_t cur = LD64(pixels    );\
    uint64_t nxt = LD64(pixels + 1);\
    uint64_t lo_even = (cur & 0x0303030303030303ULL)\
                     + (nxt & 0x0303030303030303ULL)\
                     + 0x0101010101010101ULL;\
    uint64_t hi_even = ((cur & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                     + ((nxt & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
    uint64_t lo_odd, hi_odd;\
    int row;\
\
    pixels += line_size;\
    for (row = 0; row < h; row += 2) {\
        cur = LD64(pixels    );\
        nxt = LD64(pixels + 1);\
        lo_odd = (cur & 0x0303030303030303ULL)\
               + (nxt & 0x0303030303030303ULL);\
        hi_odd = ((cur & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
               + ((nxt & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
\
        cur = LD64(pixels    );\
        nxt = LD64(pixels + 1);\
        lo_even = (cur & 0x0303030303030303ULL)\
                + (nxt & 0x0303030303030303ULL)\
                + 0x0101010101010101ULL;\
        hi_even = ((cur & 0xFCFCFCFCFCFCFCFCULL) >> 2)\
                + ((nxt & 0xFCFCFCFCFCFCFCFCULL) >> 2);\
        OP(*((uint64_t*)block), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0F0F0F0F0FULL));\
        pixels += line_size;\
        block  += line_size;\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};

/* avg operator: per-byte rounded-up average of destination and value */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEFEFEFEFEULL)>>1) )
440 #else // 64 bit variant
441
/*
 * 32-bit variant of the 8-pixel-wide put/avg primitives (the branch that
 * is compiled while the 64-bit one above stays under "#if 0").  Every
 * 8-pixel row is handled as two 32-bit words at column offsets 0 and 4.
 *
 * PIXOP2(OPNAME, OP) defines
 *   OPNAME_pixels              plain copy of h rows
 *   OPNAME_[no_rnd_]pixels_x2  average of each pixel and its right neighbour
 *   OPNAME_[no_rnd_]pixels_y2  average of each pixel and the one below it
 *   OPNAME_[no_rnd_]pixels_xy2 average of the 2x2 neighbourhood
 * and the lookup tables OPNAME_pixels_tab[] / OPNAME_no_rnd_pixels_tab[].
 * OP(dst, value) decides how the result is combined with the destination.
 *
 * All four bytes of a word are averaged at once:
 *   (a|b) - (((a^b) & 0xFEFEFEFE) >> 1)   per byte (a + b + 1) >> 1
 *   (a&b) + (((a^b) & 0xFEFEFEFE) >> 1)   per byte (a + b) >> 1
 * The xy2 versions add the low two bits and the high six bits of the four
 * inputs separately so that no byte can overflow into its neighbour; the
 * rounding constant is 2 per byte (1 for no_rnd).  They produce two output
 * rows per loop iteration, so an odd h processes h + 1 rows.
 */
#define PIXOP2(OPNAME, OP) \
void OPNAME ## _pixels(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row;\
    for (row = 0; row < h; row++) {\
        OP(*((uint32_t*)(block    )), LD32(pixels    ));\
        OP(*((uint32_t*)(block + 4)), LD32(pixels + 4));\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, col;\
    for (row = 0; row < h; row++) {\
        for (col = 0; col < 8; col += 4) {\
            const uint32_t left  = LD32(pixels + col    );\
            const uint32_t right = LD32(pixels + col + 1);\
            OP(*((uint32_t*)(block + col)), (left & right) + (((left ^ right) & 0xFEFEFEFEUL) >> 1));\
        }\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _pixels_x2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, col;\
    for (row = 0; row < h; row++) {\
        for (col = 0; col < 8; col += 4) {\
            const uint32_t left  = LD32(pixels + col    );\
            const uint32_t right = LD32(pixels + col + 1);\
            OP(*((uint32_t*)(block + col)), (left | right) - (((left ^ right) & 0xFEFEFEFEUL) >> 1));\
        }\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, col;\
    for (row = 0; row < h; row++) {\
        for (col = 0; col < 8; col += 4) {\
            const uint32_t upper = LD32(pixels + col            );\
            const uint32_t lower = LD32(pixels + col + line_size);\
            OP(*((uint32_t*)(block + col)), (upper & lower) + (((upper ^ lower) & 0xFEFEFEFEUL) >> 1));\
        }\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _pixels_y2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int row, col;\
    for (row = 0; row < h; row++) {\
        for (col = 0; col < 8; col += 4) {\
            const uint32_t upper = LD32(pixels + col            );\
            const uint32_t lower = LD32(pixels + col + line_size);\
            OP(*((uint32_t*)(block + col)), (upper | lower) - (((upper ^ lower) & 0xFEFEFEFEUL) >> 1));\
        }\
        block  += line_size;\
        pixels += line_size;\
    }\
}\
\
void OPNAME ## _pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int col;\
    for (col = 0; col < 8; col += 4) {\
        const uint8_t *src = pixels + col;\
        uint8_t *dst = block + col;\
        uint32_t cur = LD32(src    );\
        uint32_t nxt = LD32(src + 1);\
        uint32_t lo_even = (cur & 0x03030303UL)\
                         + (nxt & 0x03030303UL)\
                         + 0x02020202UL;\
        uint32_t hi_even = ((cur & 0xFCFCFCFCUL) >> 2)\
                         + ((nxt & 0xFCFCFCFCUL) >> 2);\
        uint32_t lo_odd, hi_odd;\
        int row;\
\
        src += line_size;\
        for (row = 0; row < h; row += 2) {\
            cur = LD32(src    );\
            nxt = LD32(src + 1);\
            lo_odd = (cur & 0x03030303UL)\
                   + (nxt & 0x03030303UL);\
            hi_odd = ((cur & 0xFCFCFCFCUL) >> 2)\
                   + ((nxt & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)dst), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0FUL));\
            src += line_size;\
            dst += line_size;\
\
            cur = LD32(src    );\
            nxt = LD32(src + 1);\
            lo_even = (cur & 0x03030303UL)\
                    + (nxt & 0x03030303UL)\
                    + 0x02020202UL;\
            hi_even = ((cur & 0xFCFCFCFCUL) >> 2)\
                    + ((nxt & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)dst), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0FUL));\
            src += line_size;\
            dst += line_size;\
        }\
    }\
}\
\
void OPNAME ## _no_rnd_pixels_xy2(uint8_t *block, const uint8_t *pixels, int line_size, int h)\
{\
    int col;\
    for (col = 0; col < 8; col += 4) {\
        const uint8_t *src = pixels + col;\
        uint8_t *dst = block + col;\
        uint32_t cur = LD32(src    );\
        uint32_t nxt = LD32(src + 1);\
        uint32_t lo_even = (cur & 0x03030303UL)\
                         + (nxt & 0x03030303UL)\
                         + 0x01010101UL;\
        uint32_t hi_even = ((cur & 0xFCFCFCFCUL) >> 2)\
                         + ((nxt & 0xFCFCFCFCUL) >> 2);\
        uint32_t lo_odd, hi_odd;\
        int row;\
\
        src += line_size;\
        for (row = 0; row < h; row += 2) {\
            cur = LD32(src    );\
            nxt = LD32(src + 1);\
            lo_odd = (cur & 0x03030303UL)\
                   + (nxt & 0x03030303UL);\
            hi_odd = ((cur & 0xFCFCFCFCUL) >> 2)\
                   + ((nxt & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)dst), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0FUL));\
            src += line_size;\
            dst += line_size;\
\
            cur = LD32(src    );\
            nxt = LD32(src + 1);\
            lo_even = (cur & 0x03030303UL)\
                    + (nxt & 0x03030303UL)\
                    + 0x01010101UL;\
            hi_even = ((cur & 0xFCFCFCFCUL) >> 2)\
                    + ((nxt & 0xFCFCFCFCUL) >> 2);\
            OP(*((uint32_t*)dst), hi_even + hi_odd + (((lo_even + lo_odd) >> 2) & 0x0F0F0F0FUL));\
            src += line_size;\
            dst += line_size;\
        }\
    }\
}\
\
void (*OPNAME ## _pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _pixels_x2,\
    OPNAME ## _pixels_y2,\
    OPNAME ## _pixels_xy2,\
};\
\
void (*OPNAME ## _no_rnd_pixels_tab[4])(uint8_t *block, const uint8_t *pixels, int line_size, int h) = {\
    OPNAME ## _pixels,\
    OPNAME ## _no_rnd_pixels_x2,\
    OPNAME ## _no_rnd_pixels_y2,\
    OPNAME ## _no_rnd_pixels_xy2,\
};
/* avg operator: per-byte rounded-up average of destination and value */
#define op_avg(a, b) a = ( ((a)|(b)) - ((((a)^(b))&0xFEFEFEFEUL)>>1) )
618 #endif
619 #define op_put(a, b) a = b
620
621 PIXOP2(avg, op_avg)
622 PIXOP2(put, op_put)
623 #undef op_avg
624 #undef op_put
625
#if 0
/* FIXME: this block could be removed, it is not really used anymore */
/*
 * Byte-wise reference versions of the 8-pixel-wide primitives.
 * PIXOP(BTYPE, OPNAME, OP, INCR) defines OPNAME_pixels, _pixels_x2,
 * _pixels_y2 and _pixels_xy2 plus the table OPNAME_pixels_tab[].  BTYPE is
 * the destination element type, INCR the destination row step and
 * OP(dst, value) the store operator.  Rounding is decided by whichever
 * avg2()/avg4() definitions are active when the macro is expanded.
 * Every function runs its row loop at least once (do/while on --h).
 */
#define PIXOP(BTYPE, OPNAME, OP, INCR) \
\
static void OPNAME ## _pixels(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *dst = block; \
    const UINT8 *src = pixels; \
    int x; \
\
    do { \
        for (x = 0; x < 8; x++) \
            OP(dst[x], src[x]); \
        src += line_size; \
        dst += INCR; \
    } while (--h); \
} \
\
static void OPNAME ## _pixels_x2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *dst = block; \
    const UINT8 *src = pixels; \
    int x; \
\
    do { \
        for (x = 0; x < 8; x++) \
            OP(dst[x], avg2(src[x], src[x + 1])); \
        src += line_size; \
        dst += INCR; \
    } while (--h); \
} \
\
static void OPNAME ## _pixels_y2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *dst = block; \
    const UINT8 *src = pixels; \
    const UINT8 *below = pixels + line_size; \
    int x; \
\
    do { \
        for (x = 0; x < 8; x++) \
            OP(dst[x], avg2(src[x], below[x])); \
        src += line_size; \
        below += line_size; \
        dst += INCR; \
    } while (--h); \
} \
\
static void OPNAME ## _pixels_xy2(BTYPE *block, const UINT8 *pixels, int line_size, int h) \
{ \
    BTYPE *dst = block; \
    const UINT8 *src = pixels; \
    const UINT8 *below = pixels + line_size; \
    int x; \
\
    do { \
        for (x = 0; x < 8; x++) \
            OP(dst[x], avg4(src[x], src[x + 1], below[x], below[x + 1])); \
        src += line_size; \
        below += line_size; \
        dst += INCR; \
    } while (--h); \
} \
\
void (*OPNAME ## _pixels_tab[4])(BTYPE *block, const UINT8 *pixels, int line_size, int h) = { \
    OPNAME ## _pixels, \
    OPNAME ## _pixels_x2, \
    OPNAME ## _pixels_y2, \
    OPNAME ## _pixels_xy2, \
};

/* rounding primitives */
#define avg2(a,b) ((a+b+1)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+2)>>2)

#define op_avg(a, b) a = avg2(a, b)
#define op_sub(a, b) a -= b
#define op_put(a, b) a = b

PIXOP(DCTELEM, sub, op_sub, 8)
PIXOP(uint8_t, avg, op_avg, line_size)
PIXOP(uint8_t, put, op_put, line_size)

/* not rounding primitives */
#undef avg2
#undef avg4
#define avg2(a,b) ((a+b)>>1)
#define avg4(a,b,c,d) ((a+b+c+d+1)>>2)

PIXOP(uint8_t, avg_no_rnd, op_avg, line_size)
PIXOP(uint8_t, put_no_rnd, op_put, line_size)
/* motion estimation */

#undef avg2
#undef avg4
#endif
752
/*
 * Rounding averages of two and four values (round half up).
 * Every argument is parenthesized so that callers may pass arbitrary
 * expressions (e.g. "a | b" or "c ? x : y") without operator-precedence
 * surprises.  Each argument is evaluated exactly once.
 */
#define avg2(a,b) (((a)+(b)+1)>>1)
#define avg4(a,b,c,d) (((a)+(b)+(c)+(d)+2)>>2)
755
756 static void gmc1_c(UINT8 *dst, UINT8 *src, int srcStride, int h, int x16, int y16, int rounder)
757 {
758     const int A=(16-x16)*(16-y16);
759     const int B=(   x16)*(16-y16);
760     const int C=(16-x16)*(   y16);
761     const int D=(   x16)*(   y16);
762     int i;
763     rounder= 128 - rounder;
764
765     for(i=0; i<h; i++)
766     {
767         dst[0]= (A*src[0] + B*src[1] + C*src[srcStride+0] + D*src[srcStride+1] + rounder)>>8;
768         dst[1]= (A*src[1] + B*src[2] + C*src[srcStride+1] + D*src[srcStride+2] + rounder)>>8;
769         dst[2]= (A*src[2] + B*src[3] + C*src[srcStride+2] + D*src[srcStride+3] + rounder)>>8;
770         dst[3]= (A*src[3] + B*src[4] + C*src[srcStride+3] + D*src[srcStride+4] + rounder)>>8;
771         dst[4]= (A*src[4] + B*src[5] + C*src[srcStride+4] + D*src[srcStride+5] + rounder)>>8;
772         dst[5]= (A*src[5] + B*src[6] + C*src[srcStride+5] + D*src[srcStride+6] + rounder)>>8;
773         dst[6]= (A*src[6] + B*src[7] + C*src[srcStride+6] + D*src[srcStride+7] + rounder)>>8;
774         dst[7]= (A*src[7] + B*src[8] + C*src[srcStride+7] + D*src[srcStride+8] + rounder)>>8;
775         dst+= srcStride;
776         src+= srcStride;
777     }
778 }
779
780 static void qpel_h_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int h, int r)
781 {
782     UINT8 *cm = cropTbl + MAX_NEG_CROP;
783     int i;
784     for(i=0; i<h; i++)
785     {
786         dst[0]= cm[(((src[0]+src[1])*20 - (src[0]+src[2])*6 + (src[1]+src[3])*3 - (src[2]+src[4]) + r)>>5)];
787         dst[1]= cm[(((src[1]+src[2])*20 - (src[0]+src[3])*6 + (src[0]+src[4])*3 - (src[1]+src[5]) + r)>>5)];
788         dst[2]= cm[(((src[2]+src[3])*20 - (src[1]+src[4])*6 + (src[0]+src[5])*3 - (src[0]+src[6]) + r)>>5)];
789         dst[3]= cm[(((src[3]+src[4])*20 - (src[2]+src[5])*6 + (src[1]+src[6])*3 - (src[0]+src[7]) + r)>>5)];
790         dst[4]= cm[(((src[4]+src[5])*20 - (src[3]+src[6])*6 + (src[2]+src[7])*3 - (src[1]+src[8]) + r)>>5)];
791         dst[5]= cm[(((src[5]+src[6])*20 - (src[4]+src[7])*6 + (src[3]+src[8])*3 - (src[2]+src[8]) + r)>>5)];
792         dst[6]= cm[(((src[6]+src[7])*20 - (src[5]+src[8])*6 + (src[4]+src[8])*3 - (src[3]+src[7]) + r)>>5)];
793         dst[7]= cm[(((src[7]+src[8])*20 - (src[6]+src[8])*6 + (src[5]+src[7])*3 - (src[4]+src[6]) + r)>>5)];
794         dst+=dstStride;
795         src+=srcStride;
796     }
797 }
798
799 static void qpel_v_lowpass(UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int w, int r)
800 {
801     UINT8 *cm = cropTbl + MAX_NEG_CROP;
802     int i;
803     for(i=0; i<w; i++)
804     {
805         const int src0= src[0*srcStride];
806         const int src1= src[1*srcStride];
807         const int src2= src[2*srcStride];
808         const int src3= src[3*srcStride];
809         const int src4= src[4*srcStride];
810         const int src5= src[5*srcStride];
811         const int src6= src[6*srcStride];
812         const int src7= src[7*srcStride];
813         const int src8= src[8*srcStride];
814         dst[0*dstStride]= cm[(((src0+src1)*20 - (src0+src2)*6 + (src1+src3)*3 - (src2+src4) + r)>>5)];
815         dst[1*dstStride]= cm[(((src1+src2)*20 - (src0+src3)*6 + (src0+src4)*3 - (src1+src5) + r)>>5)];
816         dst[2*dstStride]= cm[(((src2+src3)*20 - (src1+src4)*6 + (src0+src5)*3 - (src0+src6) + r)>>5)];
817         dst[3*dstStride]= cm[(((src3+src4)*20 - (src2+src5)*6 + (src1+src6)*3 - (src0+src7) + r)>>5)];
818         dst[4*dstStride]= cm[(((src4+src5)*20 - (src3+src6)*6 + (src2+src7)*3 - (src1+src8) + r)>>5)];
819         dst[5*dstStride]= cm[(((src5+src6)*20 - (src4+src7)*6 + (src3+src8)*3 - (src2+src8) + r)>>5)];
820         dst[6*dstStride]= cm[(((src6+src7)*20 - (src5+src8)*6 + (src4+src8)*3 - (src3+src7) + r)>>5)];
821         dst[7*dstStride]= cm[(((src7+src8)*20 - (src6+src8)*6 + (src5+src7)*3 - (src4+src6) + r)>>5)];
822         dst++;
823         src++;
824     }
825 }
826
827 static inline void put_block(UINT8 *dst, UINT8 *src, int dstStride, int srcStride)
828 {
829     int i;
830     for(i=0; i<8; i++)
831     {
832         dst[0]= src[0];
833         dst[1]= src[1];
834         dst[2]= src[2];
835         dst[3]= src[3];
836         dst[4]= src[4];
837         dst[5]= src[5];
838         dst[6]= src[6];
839         dst[7]= src[7];
840         dst+=dstStride;
841         src+=srcStride;
842     }
843 }
844
845 static inline void avg2_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, int dstStride, int srcStride, int r)
846 {
847     int i;
848     for(i=0; i<8; i++)
849     {
850         dst[0]= (src1[0] + src2[0] + r)>>1;
851         dst[1]= (src1[1] + src2[1] + r)>>1;
852         dst[2]= (src1[2] + src2[2] + r)>>1;
853         dst[3]= (src1[3] + src2[3] + r)>>1;
854         dst[4]= (src1[4] + src2[4] + r)>>1;
855         dst[5]= (src1[5] + src2[5] + r)>>1;
856         dst[6]= (src1[6] + src2[6] + r)>>1;
857         dst[7]= (src1[7] + src2[7] + r)>>1;
858         dst+=dstStride;
859         src1+=srcStride;
860         src2+=8;
861     }
862 }
863
864 static inline void avg4_block(UINT8 *dst, UINT8 *src1, UINT8 *src2, UINT8 *src3, UINT8 *src4, int dstStride, int srcStride, int r)
865 {
866     int i;
867     for(i=0; i<8; i++)
868     {
869         dst[0]= (src1[0] + src2[0] + src3[0] + src4[0] + r)>>2;
870         dst[1]= (src1[1] + src2[1] + src3[1] + src4[1] + r)>>2;
871         dst[2]= (src1[2] + src2[2] + src3[2] + src4[2] + r)>>2;
872         dst[3]= (src1[3] + src2[3] + src3[3] + src4[3] + r)>>2;
873         dst[4]= (src1[4] + src2[4] + src3[4] + src4[4] + r)>>2;
874         dst[5]= (src1[5] + src2[5] + src3[5] + src4[5] + r)>>2;
875         dst[6]= (src1[6] + src2[6] + src3[6] + src4[6] + r)>>2;
876         dst[7]= (src1[7] + src2[7] + src3[7] + src4[7] + r)>>2;
877         dst+=dstStride;
878         src1+=srcStride;
879         src2+=8;
880         src3+=8;
881         src4+=8;
882     }
883 }
884
/*
 * QPEL_MC(r, name) instantiates the sixteen 8x8 routines
 * qpel_mcXY_c<name> (X = horizontal, Y = vertical sub-position, 0..3)
 * and the table qpel_mc<name>_tab[], which is indexed by X + 4*Y.
 *
 * r selects the rounding: the low-pass filters are called with 16-r,
 * two-way averages with 1-r and four-way averages with 2-r.
 * Scratch buffers use a pitch of 8 bytes; the 72-byte ones hold the 9
 * horizontally filtered lines needed as input to the vertical filter.
 * The mx and my arguments are not used by these C versions.
 */
#define QPEL_MC(r, name) \
/* X=0, Y=0: plain copy */\
static void qpel_mc00_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    put_block(dst, src, dstStride, srcStride);\
}\
\
/* Y=0: horizontal filter, optionally averaged with the left/right pixel */\
static void qpel_mc10_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 h_filt[64];\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, h_filt, dstStride, srcStride, 1-r);\
}\
\
static void qpel_mc20_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    qpel_h_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
}\
\
static void qpel_mc30_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 h_filt[64];\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+1, h_filt, dstStride, srcStride, 1-r);\
}\
\
/* X=0: vertical filter, optionally averaged with the upper/lower pixel */\
static void qpel_mc01_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    qpel_v_lowpass(v_filt, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src, v_filt, dstStride, srcStride, 1-r);\
}\
\
static void qpel_mc02_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    qpel_v_lowpass(dst, src, dstStride, srcStride, 8, 16-r);\
}\
\
static void qpel_mc03_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    qpel_v_lowpass(v_filt, src, 8, srcStride, 8, 16-r);\
    avg2_block(dst, src+srcStride, v_filt, dstStride, srcStride, 1-r);\
}\
\
/* corner positions: average of pixel, H-filtered, V-filtered and HV-filtered */\
static void qpel_mc11_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_v_lowpass(v_filt, src, 8, srcStride, 8, 16-r);\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg4_block(dst, src, h_filt, v_filt, hv_filt, dstStride, srcStride, 2-r);\
}\
\
static void qpel_mc31_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_v_lowpass(v_filt, src+1, 8, srcStride, 8, 16-r);\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg4_block(dst, src+1, h_filt, v_filt, hv_filt, dstStride, srcStride, 2-r);\
}\
\
static void qpel_mc13_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_v_lowpass(v_filt, src, 8, srcStride, 8, 16-r);\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride, h_filt+8, v_filt, hv_filt, dstStride, srcStride, 2-r);\
}\
\
static void qpel_mc33_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_v_lowpass(v_filt, src+1, 8, srcStride, 8, 16-r);\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg4_block(dst, src+srcStride+1, h_filt+8, v_filt, hv_filt, dstStride, srcStride, 2-r);\
}\
\
/* X=2: average of the H-filtered lines and the HV-filtered block */\
static void qpel_mc21_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg2_block(dst, h_filt, hv_filt, dstStride, 8, 1-r);\
}\
\
static void qpel_mc23_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg2_block(dst, h_filt+8, hv_filt, dstStride, 8, 1-r);\
}\
\
/* Y=2: average of the V-filtered block and the HV-filtered block */\
static void qpel_mc12_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_v_lowpass(v_filt, src, 8, srcStride, 8, 16-r);\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg2_block(dst, v_filt, hv_filt, dstStride, 8, 1-r);\
}\
\
static void qpel_mc32_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 v_filt[64];\
    UINT8 h_filt[72];\
    UINT8 hv_filt[64];\
    qpel_v_lowpass(v_filt, src+1, 8, srcStride, 8, 16-r);\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(hv_filt, h_filt, 8, 8, 8, 16-r);\
    avg2_block(dst, v_filt, hv_filt, dstStride, 8, 1-r);\
}\
\
/* X=2, Y=2: horizontal filter followed by vertical filter */\
static void qpel_mc22_c ## name (UINT8 *dst, UINT8 *src, int dstStride, int srcStride, int mx, int my)\
{\
    UINT8 h_filt[72];\
    qpel_h_lowpass(h_filt, src, 8, srcStride, 9, 16-r);\
    qpel_v_lowpass(dst, h_filt, dstStride, 8, 8, 16-r);\
}\
\
/* one row of entries per Y, X running fastest */\
qpel_mc_func qpel_mc ## name ## _tab[16]={ \
    qpel_mc00_c ## name, qpel_mc10_c ## name, qpel_mc20_c ## name, qpel_mc30_c ## name, \
    qpel_mc01_c ## name, qpel_mc11_c ## name, qpel_mc21_c ## name, qpel_mc31_c ## name, \
    qpel_mc02_c ## name, qpel_mc12_c ## name, qpel_mc22_c ## name, qpel_mc32_c ## name, \
    qpel_mc03_c ## name, qpel_mc13_c ## name, qpel_mc23_c ## name, qpel_mc33_c ## name, \
};
1028
1029 QPEL_MC(0, _rnd)
1030 QPEL_MC(1, _no_rnd)
1031
1032 int pix_abs16x16_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1033 {
1034     int s, i;
1035
1036     s = 0;
1037     for(i=0;i<16;i++) {
1038         s += abs(pix1[0] - pix2[0]);
1039         s += abs(pix1[1] - pix2[1]);
1040         s += abs(pix1[2] - pix2[2]);
1041         s += abs(pix1[3] - pix2[3]);
1042         s += abs(pix1[4] - pix2[4]);
1043         s += abs(pix1[5] - pix2[5]);
1044         s += abs(pix1[6] - pix2[6]);
1045         s += abs(pix1[7] - pix2[7]);
1046         s += abs(pix1[8] - pix2[8]);
1047         s += abs(pix1[9] - pix2[9]);
1048         s += abs(pix1[10] - pix2[10]);
1049         s += abs(pix1[11] - pix2[11]);
1050         s += abs(pix1[12] - pix2[12]);
1051         s += abs(pix1[13] - pix2[13]);
1052         s += abs(pix1[14] - pix2[14]);
1053         s += abs(pix1[15] - pix2[15]);
1054         pix1 += line_size;
1055         pix2 += line_size;
1056     }
1057     return s;
1058 }
1059
1060 int pix_abs16x16_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1061 {
1062     int s, i;
1063
1064     s = 0;
1065     for(i=0;i<16;i++) {
1066         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1067         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1068         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1069         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1070         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1071         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1072         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1073         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1074         s += abs(pix1[8] - avg2(pix2[8], pix2[9]));
1075         s += abs(pix1[9] - avg2(pix2[9], pix2[10]));
1076         s += abs(pix1[10] - avg2(pix2[10], pix2[11]));
1077         s += abs(pix1[11] - avg2(pix2[11], pix2[12]));
1078         s += abs(pix1[12] - avg2(pix2[12], pix2[13]));
1079         s += abs(pix1[13] - avg2(pix2[13], pix2[14]));
1080         s += abs(pix1[14] - avg2(pix2[14], pix2[15]));
1081         s += abs(pix1[15] - avg2(pix2[15], pix2[16]));
1082         pix1 += line_size;
1083         pix2 += line_size;
1084     }
1085     return s;
1086 }
1087
1088 int pix_abs16x16_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1089 {
1090     int s, i;
1091     UINT8 *pix3 = pix2 + line_size;
1092
1093     s = 0;
1094     for(i=0;i<16;i++) {
1095         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1096         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1097         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1098         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1099         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1100         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1101         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1102         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1103         s += abs(pix1[8] - avg2(pix2[8], pix3[8]));
1104         s += abs(pix1[9] - avg2(pix2[9], pix3[9]));
1105         s += abs(pix1[10] - avg2(pix2[10], pix3[10]));
1106         s += abs(pix1[11] - avg2(pix2[11], pix3[11]));
1107         s += abs(pix1[12] - avg2(pix2[12], pix3[12]));
1108         s += abs(pix1[13] - avg2(pix2[13], pix3[13]));
1109         s += abs(pix1[14] - avg2(pix2[14], pix3[14]));
1110         s += abs(pix1[15] - avg2(pix2[15], pix3[15]));
1111         pix1 += line_size;
1112         pix2 += line_size;
1113         pix3 += line_size;
1114     }
1115     return s;
1116 }
1117
1118 int pix_abs16x16_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1119 {
1120     int s, i;
1121     UINT8 *pix3 = pix2 + line_size;
1122
1123     s = 0;
1124     for(i=0;i<16;i++) {
1125         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1126         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1127         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1128         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1129         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1130         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1131         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1132         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1133         s += abs(pix1[8] - avg4(pix2[8], pix2[9], pix3[8], pix3[9]));
1134         s += abs(pix1[9] - avg4(pix2[9], pix2[10], pix3[9], pix3[10]));
1135         s += abs(pix1[10] - avg4(pix2[10], pix2[11], pix3[10], pix3[11]));
1136         s += abs(pix1[11] - avg4(pix2[11], pix2[12], pix3[11], pix3[12]));
1137         s += abs(pix1[12] - avg4(pix2[12], pix2[13], pix3[12], pix3[13]));
1138         s += abs(pix1[13] - avg4(pix2[13], pix2[14], pix3[13], pix3[14]));
1139         s += abs(pix1[14] - avg4(pix2[14], pix2[15], pix3[14], pix3[15]));
1140         s += abs(pix1[15] - avg4(pix2[15], pix2[16], pix3[15], pix3[16]));
1141         pix1 += line_size;
1142         pix2 += line_size;
1143         pix3 += line_size;
1144     }
1145     return s;
1146 }
1147
1148 int pix_abs8x8_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1149 {
1150     int s, i;
1151
1152     s = 0;
1153     for(i=0;i<8;i++) {
1154         s += abs(pix1[0] - pix2[0]);
1155         s += abs(pix1[1] - pix2[1]);
1156         s += abs(pix1[2] - pix2[2]);
1157         s += abs(pix1[3] - pix2[3]);
1158         s += abs(pix1[4] - pix2[4]);
1159         s += abs(pix1[5] - pix2[5]);
1160         s += abs(pix1[6] - pix2[6]);
1161         s += abs(pix1[7] - pix2[7]);
1162         pix1 += line_size;
1163         pix2 += line_size;
1164     }
1165     return s;
1166 }
1167
1168 int pix_abs8x8_x2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1169 {
1170     int s, i;
1171
1172     s = 0;
1173     for(i=0;i<8;i++) {
1174         s += abs(pix1[0] - avg2(pix2[0], pix2[1]));
1175         s += abs(pix1[1] - avg2(pix2[1], pix2[2]));
1176         s += abs(pix1[2] - avg2(pix2[2], pix2[3]));
1177         s += abs(pix1[3] - avg2(pix2[3], pix2[4]));
1178         s += abs(pix1[4] - avg2(pix2[4], pix2[5]));
1179         s += abs(pix1[5] - avg2(pix2[5], pix2[6]));
1180         s += abs(pix1[6] - avg2(pix2[6], pix2[7]));
1181         s += abs(pix1[7] - avg2(pix2[7], pix2[8]));
1182         pix1 += line_size;
1183         pix2 += line_size;
1184     }
1185     return s;
1186 }
1187
1188 int pix_abs8x8_y2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1189 {
1190     int s, i;
1191     UINT8 *pix3 = pix2 + line_size;
1192
1193     s = 0;
1194     for(i=0;i<8;i++) {
1195         s += abs(pix1[0] - avg2(pix2[0], pix3[0]));
1196         s += abs(pix1[1] - avg2(pix2[1], pix3[1]));
1197         s += abs(pix1[2] - avg2(pix2[2], pix3[2]));
1198         s += abs(pix1[3] - avg2(pix2[3], pix3[3]));
1199         s += abs(pix1[4] - avg2(pix2[4], pix3[4]));
1200         s += abs(pix1[5] - avg2(pix2[5], pix3[5]));
1201         s += abs(pix1[6] - avg2(pix2[6], pix3[6]));
1202         s += abs(pix1[7] - avg2(pix2[7], pix3[7]));
1203         pix1 += line_size;
1204         pix2 += line_size;
1205         pix3 += line_size;
1206     }
1207     return s;
1208 }
1209
1210 int pix_abs8x8_xy2_c(UINT8 *pix1, UINT8 *pix2, int line_size)
1211 {
1212     int s, i;
1213     UINT8 *pix3 = pix2 + line_size;
1214
1215     s = 0;
1216     for(i=0;i<8;i++) {
1217         s += abs(pix1[0] - avg4(pix2[0], pix2[1], pix3[0], pix3[1]));
1218         s += abs(pix1[1] - avg4(pix2[1], pix2[2], pix3[1], pix3[2]));
1219         s += abs(pix1[2] - avg4(pix2[2], pix2[3], pix3[2], pix3[3]));
1220         s += abs(pix1[3] - avg4(pix2[3], pix2[4], pix3[3], pix3[4]));
1221         s += abs(pix1[4] - avg4(pix2[4], pix2[5], pix3[4], pix3[5]));
1222         s += abs(pix1[5] - avg4(pix2[5], pix2[6], pix3[5], pix3[6]));
1223         s += abs(pix1[6] - avg4(pix2[6], pix2[7], pix3[6], pix3[7]));
1224         s += abs(pix1[7] - avg4(pix2[7], pix2[8], pix3[7], pix3[8]));
1225         pix1 += line_size;
1226         pix2 += line_size;
1227         pix3 += line_size;
1228     }
1229     return s;
1230 }
1231
/* Permute the coefficients of a block so that they are in the order
   expected by the MMX IDCT. */
1234 #ifdef SIMPLE_IDCT
1235  /* general permutation, but perhaps slightly slower */
1236 void block_permute(INT16 *block)
1237 {
1238         int i;
1239         INT16 temp[64];
1240
1241         for(i=0; i<64; i++) temp[ block_permute_op(i) ] = block[i];
1242
1243         for(i=0; i<64; i++) block[i] = temp[i];
1244 }
1245 #else
1246
1247 void block_permute(INT16 *block)
1248 {
1249     int tmp1, tmp2, tmp3, tmp4, tmp5, tmp6;
1250     int i;
1251
1252     for(i=0;i<8;i++) {
1253         tmp1 = block[1];
1254         tmp2 = block[2];
1255         tmp3 = block[3];
1256         tmp4 = block[4];
1257         tmp5 = block[5];
1258         tmp6 = block[6];
1259         block[1] = tmp2;
1260         block[2] = tmp4;
1261         block[3] = tmp6;
1262         block[4] = tmp1;
1263         block[5] = tmp3;
1264         block[6] = tmp5;
1265         block += 8;
1266     }
1267 }
1268 #endif
1269
1270 void clear_blocks_c(DCTELEM *blocks)
1271 {
1272     memset(blocks, 0, sizeof(DCTELEM)*6*64);
1273 }
1274
/* XXX: these functions should be removed as soon as all IDCTs are
   converted */
1277 void gen_idct_put(UINT8 *dest, int line_size, DCTELEM *block)
1278 {
1279     ff_idct (block);
1280     put_pixels_clamped(block, dest, line_size);
1281 }
1282
1283 void gen_idct_add(UINT8 *dest, int line_size, DCTELEM *block)
1284 {
1285     ff_idct (block);
1286     add_pixels_clamped(block, dest, line_size);
1287 }
1288
1289 void dsputil_init(void)
1290 {
1291     int i, j;
1292     int use_permuted_idct;
1293
1294     for(i=0;i<256;i++) cropTbl[i + MAX_NEG_CROP] = i;
1295     for(i=0;i<MAX_NEG_CROP;i++) {
1296         cropTbl[i] = 0;
1297         cropTbl[i + MAX_NEG_CROP + 256] = 255;
1298     }
1299
1300     for(i=0;i<512;i++) {
1301         squareTbl[i] = (i - 256) * (i - 256);
1302     }
1303
1304 #ifdef SIMPLE_IDCT
1305     ff_idct = NULL;
1306 #else
1307     ff_idct = j_rev_dct;
1308 #endif
1309     get_pixels = get_pixels_c;
1310     diff_pixels = diff_pixels_c;
1311     put_pixels_clamped = put_pixels_clamped_c;
1312     add_pixels_clamped = add_pixels_clamped_c;
1313     gmc1= gmc1_c;
1314     clear_blocks= clear_blocks_c;
1315     pix_sum= pix_sum_c;
1316     pix_norm1= pix_norm1_c;
1317
1318     pix_abs16x16     = pix_abs16x16_c;
1319     pix_abs16x16_x2  = pix_abs16x16_x2_c;
1320     pix_abs16x16_y2  = pix_abs16x16_y2_c;
1321     pix_abs16x16_xy2 = pix_abs16x16_xy2_c;
1322     pix_abs8x8     = pix_abs8x8_c;
1323     pix_abs8x8_x2  = pix_abs8x8_x2_c;
1324     pix_abs8x8_y2  = pix_abs8x8_y2_c;
1325     pix_abs8x8_xy2 = pix_abs8x8_xy2_c;
1326     av_fdct = fdct_ifast;
1327
1328     use_permuted_idct = 1;
1329
1330 #ifdef HAVE_MMX
1331     dsputil_init_mmx();
1332 #endif
1333 #ifdef ARCH_ARMV4L
1334     dsputil_init_armv4l();
1335 #endif
1336 #ifdef HAVE_MLIB
1337     dsputil_init_mlib();
1338     use_permuted_idct = 0;
1339 #endif
1340 #ifdef ARCH_ALPHA
1341     dsputil_init_alpha();
1342     use_permuted_idct = 0;
1343 #endif
1344 #ifdef ARCH_POWERPC
1345     dsputil_init_altivec();
1346 #endif
1347
1348 #ifdef SIMPLE_IDCT
1349     if (ff_idct == NULL) {
1350         ff_idct_put = simple_idct_put;
1351         ff_idct_add = simple_idct_add;
1352         use_permuted_idct=0;
1353     }
1354 #endif
1355     if(ff_idct != NULL) {
1356         ff_idct_put = gen_idct_put;
1357         ff_idct_add = gen_idct_add;
1358     }
1359
1360     if(use_permuted_idct)
1361 #ifdef SIMPLE_IDCT
1362         for(i=0; i<64; i++) permutation[i]= simple_mmx_permutation[i];
1363 #else
1364         for(i=0; i<64; i++) permutation[i]= (i & 0x38) | ((i & 6) >> 1) | ((i & 1) << 2);
1365 #endif
1366     else
1367         for(i=0; i<64; i++) permutation[i]=i;
1368
1369     for(i=0; i<64; i++) inv_zigzag_direct16[zigzag_direct[i]]= i+1;
1370     for(i=0; i<64; i++) zigzag_direct_noperm[i]= zigzag_direct[i];
1371     
1372     if (use_permuted_idct) {
1373         /* permute for IDCT */
1374         for(i=0;i<64;i++) {
1375             j = zigzag_direct[i];
1376             zigzag_direct[i] = block_permute_op(j);
1377             j = ff_alternate_horizontal_scan[i];
1378             ff_alternate_horizontal_scan[i] = block_permute_op(j);
1379             j = ff_alternate_vertical_scan[i];
1380             ff_alternate_vertical_scan[i] = block_permute_op(j);
1381         }
1382         block_permute(ff_mpeg1_default_intra_matrix);
1383         block_permute(ff_mpeg1_default_non_intra_matrix);
1384         block_permute(ff_mpeg4_default_intra_matrix);
1385         block_permute(ff_mpeg4_default_non_intra_matrix);
1386     }
1387     
1388     build_zigzag_end();
1389 }
1390
/* Disables every operation that is not bit exact (for testing purposes).
   Only the MMX code is affected; without HAVE_MMX this does nothing. */
void avcodec_set_bit_exact(void)
{
#if defined(HAVE_MMX)
    dsputil_set_bit_exact_mmx();
#endif
}
1398
1399 void get_psnr(UINT8 *orig_image[3], UINT8 *coded_image[3],
1400               int orig_linesize[3], int coded_linesize,
1401               AVCodecContext *avctx)
1402 {
1403     int quad, diff, x, y;
1404     UINT8 *orig, *coded;
1405     UINT32 *sq = squareTbl + 256;
1406     
1407     quad = 0;
1408     diff = 0;
1409     
1410     /* Luminance */
1411     orig = orig_image[0];
1412     coded = coded_image[0];
1413     
1414     for (y=0;y<avctx->height;y++) {
1415         for (x=0;x<avctx->width;x++) {
1416             diff = *(orig + x) - *(coded + x);
1417             quad += sq[diff];
1418         }
1419         orig += orig_linesize[0];
1420         coded += coded_linesize;
1421     }
1422    
1423     avctx->psnr_y = (float) quad / (float) (avctx->width * avctx->height);
1424     
1425     if (avctx->psnr_y) {
1426         avctx->psnr_y = (float) (255 * 255) / avctx->psnr_y;
1427         avctx->psnr_y = 10 * (float) log10 (avctx->psnr_y); 
1428     } else
1429         avctx->psnr_y = 99.99;
1430 }
1431