/*
 * Alpha optimized DSP utils
 * Copyright (c) 2002 Falk Hueffner <falk@debian.org>
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
 */

#include "asm.h"
#include "../dsputil.h"

void simple_idct_axp(DCTELEM *block);

void put_pixels_axp_asm(uint8_t *block, const uint8_t *pixels,
                        int line_size, int h);
void put_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);
void add_pixels_clamped_mvi_asm(const DCTELEM *block, uint8_t *pixels,
                                int line_size);

#if 0
/* These functions were the base for the optimized assembler routines,
   and remain here for documentation purposes.  */
static void put_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                                   int line_size)
{
    int i = 8;
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, shorts1;

        shorts0 = ldq(block);
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);
        stl(pkwb(shorts0), pixels);

        shorts1 = ldq(block + 4);
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--i);
}
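
/* For reference: in the routine above, maxsw4/minsw4 clamp the four
   signed 16-bit coefficients of each quadword to [0, 255], and pkwb
   packs the four words down to four bytes, so the two quadwords yield
   one 8-pixel row per iteration.  */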

void add_pixels_clamped_mvi(const DCTELEM *block, uint8_t *pixels,
                            int line_size)
{
    int h = 8;
    /* Keep this function a leaf function by generating the constants
       manually (mainly for the hack value ;-).  */
    uint64_t clampmask = zap(-1, 0xaa); /* 0x00ff00ff00ff00ff */
    uint64_t signmask  = zap(-1, 0x33);
    signmask ^= signmask >> 1;  /* 0x8000800080008000 */

    ASM_ACCEPT_MVI;

    do {
        uint64_t shorts0, pix0, signs0;
        uint64_t shorts1, pix1, signs1;

        shorts0 = ldq(block);
        shorts1 = ldq(block + 4);

        pix0    = unpkbw(ldl(pixels));
        /* Signed subword add (MMX paddw).  */
        signs0  = shorts0 & signmask;
        shorts0 &= ~signmask;
        shorts0 += pix0;
        shorts0 ^= signs0;
        /* Clamp. */
        shorts0 = maxsw4(shorts0, 0);
        shorts0 = minsw4(shorts0, clampmask);

        /* Next 4.  */
        pix1    = unpkbw(ldl(pixels + 4));
        signs1  = shorts1 & signmask;
        shorts1 &= ~signmask;
        shorts1 += pix1;
        shorts1 ^= signs1;
        shorts1 = maxsw4(shorts1, 0);
        shorts1 = minsw4(shorts1, clampmask);

        stl(pkwb(shorts0), pixels);
        stl(pkwb(shorts1), pixels + 4);

        pixels += line_size;
        block += 8;
    } while (--h);
}
#endif

static inline uint64_t avg2_no_rnd(uint64_t a, uint64_t b)
{
    return (a & b) + (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}

static inline uint64_t avg2(uint64_t a, uint64_t b)
{
    return (a | b) - (((a ^ b) & BYTE_VEC(0xfe)) >> 1);
}
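
/* For reference: within each byte lane, avg2 computes the rounded
   average (a + b + 1) >> 1 via the identity (a | b) - ((a ^ b) >> 1),
   and avg2_no_rnd the truncated average (a + b) >> 1 via
   (a & b) + ((a ^ b) >> 1).  Masking with BYTE_VEC(0xfe) before the
   shift keeps bits from crossing into the next lane, so no carries
   leak between pixels.  E.g. for the byte values a = 3, b = 6:
   avg2 gives 7 - 2 = 5, avg2_no_rnd gives 2 + 2 = 4.  */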

static inline uint64_t avg4(uint64_t l1, uint64_t l2, uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x02)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}

static inline uint64_t avg4_no_rnd(uint64_t l1, uint64_t l2,
                                   uint64_t l3, uint64_t l4)
{
    uint64_t r1 = ((l1 & ~BYTE_VEC(0x03)) >> 2)
                + ((l2 & ~BYTE_VEC(0x03)) >> 2)
                + ((l3 & ~BYTE_VEC(0x03)) >> 2)
                + ((l4 & ~BYTE_VEC(0x03)) >> 2);
    uint64_t r2 = ((  (l1 & BYTE_VEC(0x03))
                    + (l2 & BYTE_VEC(0x03))
                    + (l3 & BYTE_VEC(0x03))
                    + (l4 & BYTE_VEC(0x03))
                    + BYTE_VEC(0x01)) >> 2) & BYTE_VEC(0x03);
    return r1 + r2;
}
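
/* For reference: avg4 computes the rounded four-way average
   (l1 + l2 + l3 + l4 + 2) >> 2 per byte lane.  The high six bits of
   each byte are handled in r1 (the two cleared low bits absorb the
   shift, so nothing crosses lanes), while the low two bits plus the
   rounding constant are handled in r2; four 6-bit values sum to at
   most 252, so the partial sums never carry into the neighbouring
   byte.  A scalar sketch of the same computation, illustrative only
   and not used (avg4_ref is not part of this file's API):  */
#if 0
static inline uint64_t avg4_ref(uint64_t l1, uint64_t l2,
                                uint64_t l3, uint64_t l4)
{
    uint64_t r = 0;
    int i;

    /* Average each of the eight byte lanes independently.  */
    for (i = 0; i < 64; i += 8) {
        uint64_t b = (((l1 >> i) & 0xff) + ((l2 >> i) & 0xff)
                    + ((l3 >> i) & 0xff) + ((l4 >> i) & 0xff) + 2) >> 2;
        r |= b << i;
    }
    return r;
}
#endif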

#define OP(LOAD, STORE, INCR)                   \
    do {                                        \
        STORE(LOAD(pixels), block);             \
        pixels += line_size;                    \
        block += INCR;                          \
    } while (--h)

#define OP_X2(LOAD, STORE, INCR)                                \
    do {                                                        \
        uint64_t pix1, pix2;                                    \
                                                                \
        pix1 = LOAD(pixels);                                    \
        pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);        \
        STORE(AVG2(pix1, pix2), block);                         \
        pixels += line_size;                                    \
        block += INCR;                                          \
    } while (--h)

#define OP_Y2(LOAD, STORE, INCR)                \
    do {                                        \
        uint64_t pix = LOAD(pixels);            \
        do {                                    \
            uint64_t next_pix;                  \
                                                \
            pixels += line_size;                \
            next_pix = LOAD(pixels);            \
            STORE(AVG2(pix, next_pix), block);  \
            block += INCR;                      \
            pix = next_pix;                     \
        } while (--h);                          \
    } while (0)

#define OP_XY2(LOAD, STORE, INCR)                                       \
    do {                                                                \
        uint64_t pix1 = LOAD(pixels);                                   \
        uint64_t pix2 = pix1 >> 8 | ((uint64_t) pixels[8] << 56);       \
                                                                        \
        do {                                                            \
            uint64_t next_pix1, next_pix2;                              \
                                                                        \
            pixels += line_size;                                        \
            next_pix1 = LOAD(pixels);                                   \
            next_pix2 = next_pix1 >> 8 | ((uint64_t) pixels[8] << 56);  \
                                                                        \
            STORE(AVG4(pix1, pix2, next_pix1, next_pix2), block);       \
                                                                        \
            block += INCR;                                              \
            pix1 = next_pix1;                                           \
            pix2 = next_pix2;                                           \
        } while (--h);                                                  \
    } while (0)
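
/* For reference: the _x2/_y2/_xy2 variants cover the half-pel
   interpolation cases.  pix2 is the same row of eight pixels shifted
   over by one position: on little-endian Alpha, pix1 >> 8 drops the
   leftmost pixel and pixels[8] supplies the ninth, so AVG2(pix1, pix2)
   blends horizontal neighbours.  OP_Y2 averages vertically adjacent
   rows, and OP_XY2 averages all four surrounding pixels via AVG4.  */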

#define MAKE_OP(BTYPE, OPNAME, SUFF, OPKIND, STORE, INCR)               \
static void OPNAME ## _pixels ## SUFF ## _axp(BTYPE *block,             \
                                              const uint8_t *pixels,    \
                                              int line_size, int h)     \
{                                                                       \
    if ((size_t) pixels & 0x7) {                                        \
        OPKIND(uldq, STORE, INCR);                                      \
    } else {                                                            \
        OPKIND(ldq, STORE, INCR);                                       \
    }                                                                   \
}
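
/* For reference: MAKE_OP instantiates one function per operation and
   half-pel case, e.g. put_pixels_x2_axp, and falls back to the
   unaligned load uldq only when the source pointer is misaligned,
   since the aligned ldq is cheaper.  */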

#define PIXOP(BTYPE, OPNAME, STORE, INCR)               \
    MAKE_OP(BTYPE, OPNAME, ,     OP,     STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _x2,  OP_X2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _y2,  OP_Y2,  STORE, INCR);  \
    MAKE_OP(BTYPE, OPNAME, _xy2, OP_XY2, STORE, INCR);

/* Rounding primitives.  */
#define AVG2 avg2
#define AVG4 avg4
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put, STORE, line_size);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg, STORE, line_size);
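
/* For the avg variants, STORE re-reads the destination and averages it
   with the computed value, i.e. a read-modify-write blend of the new
   prediction with the pixels already in the frame.  */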

/* Non-rounding primitives.  */
#undef AVG2
#undef AVG4
#undef STORE
#define AVG2 avg2_no_rnd
#define AVG4 avg4_no_rnd
#define STORE(l, b) stq(l, b)
PIXOP(uint8_t, put_no_rnd, STORE, line_size);

#undef STORE
#define STORE(l, b) stq(AVG2(l, ldq(b)), b);
PIXOP(uint8_t, avg_no_rnd, STORE, line_size);

void dsputil_init_alpha(void)
{
    put_pixels_tab[0] = put_pixels_axp_asm;
    put_pixels_tab[1] = put_pixels_x2_axp;
    put_pixels_tab[2] = put_pixels_y2_axp;
    put_pixels_tab[3] = put_pixels_xy2_axp;

    put_no_rnd_pixels_tab[0] = put_pixels_axp_asm;
    put_no_rnd_pixels_tab[1] = put_no_rnd_pixels_x2_axp;
    put_no_rnd_pixels_tab[2] = put_no_rnd_pixels_y2_axp;
    put_no_rnd_pixels_tab[3] = put_no_rnd_pixels_xy2_axp;

    avg_pixels_tab[0] = avg_pixels_axp;
    avg_pixels_tab[1] = avg_pixels_x2_axp;
    avg_pixels_tab[2] = avg_pixels_y2_axp;
    avg_pixels_tab[3] = avg_pixels_xy2_axp;

    avg_no_rnd_pixels_tab[0] = avg_no_rnd_pixels_axp;
    avg_no_rnd_pixels_tab[1] = avg_no_rnd_pixels_x2_axp;
    avg_no_rnd_pixels_tab[2] = avg_no_rnd_pixels_y2_axp;
    avg_no_rnd_pixels_tab[3] = avg_no_rnd_pixels_xy2_axp;

    /* amask clears all bits that correspond to present features.  */
    if (amask(AMASK_MVI) == 0) {
        put_pixels_clamped = put_pixels_clamped_mvi_asm;
        add_pixels_clamped = add_pixels_clamped_mvi_asm;
    }
}