2 * DSP utils : average functions are compiled twice for 3dnow/mmx2
3 * Copyright (c) 2000, 2001 Gerard Lantau.
4 * Copyright (c) 2002 Michael Niedermayer
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
20 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
21 * mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
// put_pixels_x2: stores into block the PAVGB combination of each source byte
// with the byte at the next address (offset +1), 8 bytes per movq.
// PAVGB is a macro defined outside this view; presumably the packed unsigned
// byte average of whichever build (3dnow/mmx2) includes this file -- confirm.
// Operand map as far as the visible input list shows:
//   %1 = pixels, %2 = pixels + line_size, %3 = block, %4 = block + line_size
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
24 static void DEF(put_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax starts at 0 and is the index added to all four pointers below.
27 "xorl %%eax, %%eax \n\t"
// Load 8 bytes from each of the two source rows, plus the 8 bytes starting
// one byte later in the same row.
30 "movq (%1, %%eax), %%mm0 \n\t"
31 "movq 1(%1, %%eax), %%mm1 \n\t"
32 "movq (%2, %%eax), %%mm2 \n\t"
33 "movq 1(%2, %%eax), %%mm3 \n\t"
// mm0 = PAVGB(src[x], src[x+1]) for row %1, mm2 likewise for row %2.
34 PAVGB" %%mm1, %%mm0 \n\t"
35 PAVGB" %%mm3, %%mm2 \n\t"
// Write both results to the matching destination rows.
36 "movq %%mm0, (%3, %%eax) \n\t"
37 "movq %%mm2, (%4, %%eax) \n\t"
// Second, identical group. NOTE(review): no instruction that advances %%eax
// between the two groups appears in these lines -- verify against the full
// asm statement.
39 "movq (%1, %%eax), %%mm0 \n\t"
40 "movq 1(%1, %%eax), %%mm1 \n\t"
41 "movq (%2, %%eax), %%mm2 \n\t"
42 "movq 1(%2, %%eax), %%mm3 \n\t"
43 PAVGB" %%mm1, %%mm0 \n\t"
44 PAVGB" %%mm3, %%mm2 \n\t"
45 "movq %%mm0, (%3, %%eax) \n\t"
46 "movq %%mm2, (%4, %%eax) \n\t"
51 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
// put_no_rnd_pixels_x2: same data flow as the x2 "put" variant (source byte
// combined with the byte at offset +1, result stored into block), except that
// mm7 is subtracted from one PAVGB input first.
// NOTE(review): mm7 is not initialised in the lines shown here; presumably it
// holds 1 in every byte so the subtraction offsets a round-up in PAVGB
// (hence "no_rnd") -- confirm. psubusb saturates at 0, so a source byte of 0
// is left unchanged by the subtraction.
// Operand map as far as the visible input list shows:
//   %1 = pixels, %2 = pixels + line_size, %3 = block, %4 = block + line_size
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
56 static void DEF(put_no_rnd_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax starts at 0 and is the index added to all four pointers below.
59 "xorl %%eax, %%eax \n\t"
// Load 8 bytes from each source row and the 8 bytes starting one byte later.
63 "movq (%1, %%eax), %%mm0 \n\t"
64 "movq 1(%1, %%eax), %%mm1 \n\t"
65 "movq (%2, %%eax), %%mm2 \n\t"
66 "movq 1(%2, %%eax), %%mm3 \n\t"
// Unsigned saturating subtract of mm7 from the unshifted bytes only.
67 "psubusb %%mm7, %%mm0 \n\t"
68 "psubusb %%mm7, %%mm2 \n\t"
// Combine the adjusted bytes with their +1 neighbours.
69 PAVGB" %%mm1, %%mm0 \n\t"
70 PAVGB" %%mm3, %%mm2 \n\t"
// Write both results to the matching destination rows.
71 "movq %%mm0, (%3, %%eax) \n\t"
72 "movq %%mm2, (%4, %%eax) \n\t"
// Second, identical group. NOTE(review): no instruction that advances %%eax
// between the two groups appears in these lines -- verify against the full
// asm statement.
74 "movq (%1, %%eax), %%mm0 \n\t"
75 "movq 1(%1, %%eax), %%mm1 \n\t"
76 "movq (%2, %%eax), %%mm2 \n\t"
77 "movq 1(%2, %%eax), %%mm3 \n\t"
78 "psubusb %%mm7, %%mm0 \n\t"
79 "psubusb %%mm7, %%mm2 \n\t"
80 PAVGB" %%mm1, %%mm0 \n\t"
81 PAVGB" %%mm3, %%mm2 \n\t"
82 "movq %%mm0, (%3, %%eax) \n\t"
83 "movq %%mm2, (%4, %%eax) \n\t"
88 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
// put_pixels_y2: stores into block the PAVGB combination of each source row
// with the row line_size bytes further on, 8 bytes per movq.
// Each source row is loaded once and kept in a register so it can serve as
// the "upper" input of the next output row (mm0 and mm2 alternate in that
// role).
// PAVGB is a macro defined outside this view; presumably a packed unsigned
// byte average -- confirm.
// Operand map from the visible input list:
//   %1 = pixels, %2 = pixels + line_size, %3 = pixels + 2*line_size,
//   %4 = block,  %5 = block + line_size,  %6 = line_size << 1
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
93 static void DEF(put_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax is the byte offset; it grows by two rows (%6) after each pair of
// output rows.
96 "xorl %%eax, %%eax \n\t"
// Prime mm0 with the first source row (no %%eax index on this load).
97 "movq (%1), %%mm0 \n\t"
// mm1 = row at %2 + eax, mm2 = row at %3 + eax.
100 "movq (%2, %%eax), %%mm1 \n\t"
101 "movq (%3, %%eax), %%mm2 \n\t"
// mm0 = PAVGB(carried row, mm1); mm1 = PAVGB(mm1, mm2). mm2 is left intact
// and becomes the carried row for the next pair.
102 PAVGB" %%mm1, %%mm0 \n\t"
103 PAVGB" %%mm2, %%mm1 \n\t"
104 "movq %%mm0, (%4, %%eax) \n\t"
105 "movq %%mm1, (%5, %%eax) \n\t"
// Advance two rows.
106 "addl %6, %%eax \n\t"
// Same pattern with roles swapped: mm2 is the carried row, mm0 receives the
// newest row and is carried into the next pair.
107 "movq (%2, %%eax), %%mm1 \n\t"
108 "movq (%3, %%eax), %%mm0 \n\t"
109 PAVGB" %%mm1, %%mm2 \n\t"
110 PAVGB" %%mm0, %%mm1 \n\t"
111 "movq %%mm2, (%4, %%eax) \n\t"
112 "movq %%mm1, (%5, %%eax) \n\t"
113 "addl %6, %%eax \n\t"
117 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
118 "r" (block+line_size), "g"(line_size<<1)
// put_no_rnd_pixels_y2: same row-to-row data flow as the y2 "put" variant
// (each source row combined via PAVGB with the row line_size bytes further
// on), except that mm7 is subtracted from the row loaded through %2 before it
// is used in either PAVGB.
// NOTE(review): mm7 is not initialised in the lines shown here; presumably it
// holds 1 in every byte so the subtraction offsets a round-up in PAVGB
// (hence "no_rnd") -- confirm. psubusb saturates at 0.
// Operand map from the visible input list:
//   %1 = pixels, %2 = pixels + line_size, %3 = pixels + 2*line_size,
//   %4 = block,  %5 = block + line_size,  %6 = line_size << 1
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
122 static void DEF(put_no_rnd_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax is the byte offset; it grows by two rows (%6) after each pair of
// output rows.
126 "xorl %%eax, %%eax \n\t"
// Prime mm0 with the first source row (no %%eax index on this load).
127 "movq (%1), %%mm0 \n\t"
130 "movq (%2, %%eax), %%mm1 \n\t"
131 "movq (%3, %%eax), %%mm2 \n\t"
// Only mm1 is adjusted; it is an input of both PAVGB operations below, so one
// subtraction covers both output rows.
132 "psubusb %%mm7, %%mm1 \n\t"
// mm0 = PAVGB(carried row, mm1); mm1 = PAVGB(mm1, mm2). mm2 stays unmodified
// and is carried into the second half.
133 PAVGB" %%mm1, %%mm0 \n\t"
134 PAVGB" %%mm2, %%mm1 \n\t"
135 "movq %%mm0, (%4, %%eax) \n\t"
136 "movq %%mm1, (%5, %%eax) \n\t"
// Advance two rows.
137 "addl %6, %%eax \n\t"
// Second half: mm2 is the carried row, mm0 receives the newest row.
138 "movq (%2, %%eax), %%mm1 \n\t"
139 "movq (%3, %%eax), %%mm0 \n\t"
140 "psubusb %%mm7, %%mm1 \n\t"
141 PAVGB" %%mm1, %%mm2 \n\t"
142 PAVGB" %%mm0, %%mm1 \n\t"
143 "movq %%mm2, (%4, %%eax) \n\t"
144 "movq %%mm1, (%5, %%eax) \n\t"
145 "addl %6, %%eax \n\t"
149 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
150 "r" (block+line_size), "g"(line_size<<1)
// avg_pixels: combines the source bytes with the bytes already stored in
// block via PAVGB and writes the result back to block, 8 bytes per movq,
// two rows per group.
// PAVGB is a macro defined outside this view; presumably a packed unsigned
// byte average -- confirm.
// Operand map as far as the visible input list shows:
//   %1 = pixels, %2 = pixels + line_size, %3 = block, %4 = block + line_size
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
// NOTE(review): %5 is used as the %%eax increment but is not part of the
// visible operand list (the list ends with a trailing comma); verify.
154 static void DEF(avg_pixels)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax is the byte offset applied to source and destination alike.
157 "xorl %%eax, %%eax \n\t"
// mm0/mm2 = two source rows, mm3/mm4 = the current destination rows.
160 "movq (%1, %%eax), %%mm0 \n\t"
161 "movq (%2, %%eax), %%mm2 \n\t"
162 "movq (%3, %%eax), %%mm3 \n\t"
163 "movq (%4, %%eax), %%mm4 \n\t"
// Blend source with destination, then overwrite the destination.
164 PAVGB" %%mm3, %%mm0 \n\t"
165 PAVGB" %%mm4, %%mm2 \n\t"
166 "movq %%mm0, (%3, %%eax) \n\t"
167 "movq %%mm2, (%4, %%eax) \n\t"
168 "addl %5, %%eax \n\t"
// Second, identical group at the advanced offset.
169 "movq (%1, %%eax), %%mm0 \n\t"
170 "movq (%2, %%eax), %%mm2 \n\t"
171 "movq (%3, %%eax), %%mm3 \n\t"
172 "movq (%4, %%eax), %%mm4 \n\t"
173 PAVGB" %%mm3, %%mm0 \n\t"
174 PAVGB" %%mm4, %%mm2 \n\t"
175 "movq %%mm0, (%3, %%eax) \n\t"
176 "movq %%mm2, (%4, %%eax) \n\t"
177 "addl %5, %%eax \n\t"
181 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
// avg_pixels_x2: two PAVGB stages per output row. First each source byte is
// combined with the byte at the next address (offset +1); that result is then
// combined with the bytes already stored in block and written back.
// PAVGB is a macro defined outside this view; presumably a packed unsigned
// byte average -- confirm.
// Operand map as far as the visible input list shows:
//   %1 = pixels, %2 = pixels + line_size, %3 = block, %4 = block + line_size
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
// NOTE(review): %5 is used as the %%eax increment but is not part of the
// visible operand list (the list ends with a trailing comma); verify.
186 static void DEF(avg_pixels_x2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax is the byte offset applied to source and destination alike.
189 "xorl %%eax, %%eax \n\t"
// Stage 1 inputs: each source row and the same row shifted by one byte.
192 "movq (%1, %%eax), %%mm0 \n\t"
193 "movq 1(%1, %%eax), %%mm1 \n\t"
194 "movq (%2, %%eax), %%mm2 \n\t"
195 "movq 1(%2, %%eax), %%mm3 \n\t"
196 PAVGB" %%mm1, %%mm0 \n\t"
197 PAVGB" %%mm3, %%mm2 \n\t"
// Stage 2: mm3 is reused here, now holding the current destination row.
198 "movq (%3, %%eax), %%mm3 \n\t"
199 "movq (%4, %%eax), %%mm4 \n\t"
200 PAVGB" %%mm3, %%mm0 \n\t"
201 PAVGB" %%mm4, %%mm2 \n\t"
202 "movq %%mm0, (%3, %%eax) \n\t"
203 "movq %%mm2, (%4, %%eax) \n\t"
204 "addl %5, %%eax \n\t"
// Second, identical group at the advanced offset.
205 "movq (%1, %%eax), %%mm0 \n\t"
206 "movq 1(%1, %%eax), %%mm1 \n\t"
207 "movq (%2, %%eax), %%mm2 \n\t"
208 "movq 1(%2, %%eax), %%mm3 \n\t"
209 PAVGB" %%mm1, %%mm0 \n\t"
210 PAVGB" %%mm3, %%mm2 \n\t"
211 "movq (%3, %%eax), %%mm3 \n\t"
212 "movq (%4, %%eax), %%mm4 \n\t"
213 PAVGB" %%mm3, %%mm0 \n\t"
214 PAVGB" %%mm4, %%mm2 \n\t"
215 "movq %%mm0, (%3, %%eax) \n\t"
216 "movq %%mm2, (%4, %%eax) \n\t"
217 "addl %5, %%eax \n\t"
221 :"r"(pixels), "r"(pixels+line_size), "r" (block), "r" (block+line_size),
// avg_pixels_y2: two PAVGB stages per output row. First each source row is
// combined with the row line_size bytes further on; that result is then
// combined with the bytes already stored in block and written back.
// Each source row is loaded once and carried in a register into the next
// output row (mm0 and mm2 alternate in that role).
// PAVGB is a macro defined outside this view; presumably a packed unsigned
// byte average -- confirm.
// Operand map from the visible input list:
//   %1 = pixels, %2 = pixels + line_size, %3 = pixels + 2*line_size,
//   %4 = block,  %5 = block + line_size,  %6 = line_size << 1
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
226 static void DEF(avg_pixels_y2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax is the byte offset; it grows by two rows (%6) after each pair of
// output rows.
229 "xorl %%eax, %%eax \n\t"
// Prime mm0 with the first source row (no %%eax index on this load).
230 "movq (%1), %%mm0 \n\t"
233 "movq (%2, %%eax), %%mm1 \n\t"
234 "movq (%3, %%eax), %%mm2 \n\t"
// Stage 1: mm0 = PAVGB(carried row, mm1); mm1 = PAVGB(mm1, mm2).
235 PAVGB" %%mm1, %%mm0 \n\t"
236 PAVGB" %%mm2, %%mm1 \n\t"
// Stage 2: blend with the current destination rows, then store.
237 "movq (%4, %%eax), %%mm3 \n\t"
238 "movq (%5, %%eax), %%mm4 \n\t"
239 PAVGB" %%mm3, %%mm0 \n\t"
240 PAVGB" %%mm4, %%mm1 \n\t"
241 "movq %%mm0, (%4, %%eax) \n\t"
242 "movq %%mm1, (%5, %%eax) \n\t"
// Advance two rows.
243 "addl %6, %%eax \n\t"
// Second half: mm2 is the carried row, mm0 receives the newest row.
244 "movq (%2, %%eax), %%mm1 \n\t"
245 "movq (%3, %%eax), %%mm0 \n\t"
246 PAVGB" %%mm1, %%mm2 \n\t"
247 PAVGB" %%mm0, %%mm1 \n\t"
248 "movq (%4, %%eax), %%mm3 \n\t"
249 "movq (%5, %%eax), %%mm4 \n\t"
250 PAVGB" %%mm3, %%mm2 \n\t"
251 PAVGB" %%mm4, %%mm1 \n\t"
252 "movq %%mm2, (%4, %%eax) \n\t"
253 "movq %%mm1, (%5, %%eax) \n\t"
254 "addl %6, %%eax \n\t"
258 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
259 "r" (block+line_size), "g"(line_size<<1)
263 // Note: this is not correctly rounded, but this function is only used for B-frames, so it doesn't matter
// avg_pixels_xy2: three PAVGB stages per output row.
//   1. horizontal: each source row is combined with itself shifted by one byte;
//   2. vertical:   that result is combined with the same result of the row
//                  line_size bytes further on;
//   3. blend:      the outcome is combined with the bytes already in block and
//                  written back.
// The horizontally combined row is carried in a register into the next output
// row (mm0 and mm2 alternate in that role).
// PAVGB is a macro defined outside this view; presumably a packed unsigned
// byte average -- confirm.
// NOTE(review): mm7 is not initialised in the lines shown here; presumably it
// holds 1 in every byte as a rounding adjustment -- confirm.
// Operand map from the visible input list:
//   %1 = pixels, %2 = pixels + line_size, %3 = pixels + 2*line_size,
//   %4 = block,  %5 = block + line_size,  %6 = line_size << 1
// (assumes a single output operand %0 precedes the inputs -- TODO confirm).
264 static void DEF(avg_pixels_xy2)(UINT8 *block, const UINT8 *pixels, int line_size, int h)
// %%eax is the byte offset; it grows by two rows (%6) after each pair of
// output rows.
268 "xorl %%eax, %%eax \n\t"
// Prime mm0 with the horizontally combined first source row.
269 "movq (%1), %%mm0 \n\t"
270 "movq 1(%1), %%mm1 \n\t"
271 PAVGB" %%mm1, %%mm0 \n\t"
// Load the next two rows (mm1, mm2) and their one-byte-shifted copies
// (mm3, mm4).
274 "movq (%2, %%eax), %%mm1 \n\t"
275 "movq (%3, %%eax), %%mm2 \n\t"
276 "movq 1(%2, %%eax), %%mm3 \n\t"
277 "movq 1(%3, %%eax), %%mm4 \n\t"
// Only mm2 is adjusted by mm7 in this half.
// NOTE(review): the second half below has no corresponding psubusb -- confirm
// the asymmetry is intended.
278 "psubusb %%mm7, %%mm2 \n\t"
// Stage 1 (horizontal) for both rows.
279 PAVGB" %%mm3, %%mm1 \n\t"
280 PAVGB" %%mm4, %%mm2 \n\t"
// Stage 2 (vertical): mm0 = carried row with mm1, mm1 = mm1 with mm2.
281 PAVGB" %%mm1, %%mm0 \n\t"
282 PAVGB" %%mm2, %%mm1 \n\t"
// Stage 3: blend with the current destination rows, then store.
283 "movq (%4, %%eax), %%mm3 \n\t"
284 "movq (%5, %%eax), %%mm4 \n\t"
285 PAVGB" %%mm3, %%mm0 \n\t"
286 PAVGB" %%mm4, %%mm1 \n\t"
287 "movq %%mm0, (%4, %%eax) \n\t"
288 "movq %%mm1, (%5, %%eax) \n\t"
// Advance two rows.
289 "addl %6, %%eax \n\t"
// Second half: mm2 is the carried row, mm0 receives the newest row.
290 "movq (%2, %%eax), %%mm1 \n\t"
291 "movq (%3, %%eax), %%mm0 \n\t"
292 "movq 1(%2, %%eax), %%mm3 \n\t"
293 "movq 1(%3, %%eax), %%mm4 \n\t"
294 PAVGB" %%mm3, %%mm1 \n\t"
295 PAVGB" %%mm4, %%mm0 \n\t"
296 PAVGB" %%mm1, %%mm2 \n\t"
297 PAVGB" %%mm0, %%mm1 \n\t"
298 "movq (%4, %%eax), %%mm3 \n\t"
299 "movq (%5, %%eax), %%mm4 \n\t"
300 PAVGB" %%mm3, %%mm2 \n\t"
301 PAVGB" %%mm4, %%mm1 \n\t"
302 "movq %%mm2, (%4, %%eax) \n\t"
303 "movq %%mm1, (%5, %%eax) \n\t"
304 "addl %6, %%eax \n\t"
308 :"r"(pixels), "r"(pixels+line_size), "r"(pixels+line_size*2), "r" (block),
309 "r" (block+line_size), "g"(line_size<<1)
313 //Note: the sub* functions are not used
// sub_pixels_x2: combines the pixel bytes at %1 with the bytes one address
// later via PAVGB, widens the 8 results to 16-bit words, and subtracts them
// (signed, saturating) from the words in mm0 / mm1.
// PAVGB is a macro defined outside this view; presumably a packed unsigned
// byte average -- confirm.
// NOTE(review): the operand list for %0 / %1 and the load/store of mm0 are not
// among the lines shown here; presumably %0 addresses the DCTELEM block and
// %1 the pixel row -- confirm against the full function.
315 static void DEF(sub_pixels_x2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
// mm7 = 0, used below as the zero source for byte-to-word unpacking.
322 "pxor %%mm7, %%mm7":);
// mm2 = 8 bytes starting one byte past %1.
325 "movq 1%1, %%mm2\n\t"
// mm2 = PAVGB(mm2, bytes at %1).
327 PAVGB" %1, %%mm2\n\t"
// mm1 = the 8 bytes located 8 bytes past %0.
328 "movq 8%0, %%mm1\n\t"
// Split the 8 combined bytes into two registers of four zero-extended words:
// low four bytes in mm2, high four bytes in mm3.
329 "movq %%mm2, %%mm3\n\t"
330 "punpcklbw %%mm7, %%mm2\n\t"
331 "punpckhbw %%mm7, %%mm3\n\t"
// Signed saturating word subtraction.
332 "psubsw %%mm2, %%mm0\n\t"
333 "psubsw %%mm3, %%mm1\n\t"
// Store the upper four result words back at 8 bytes past %0.
335 "movq %%mm1, 8%0\n\t"
344 static void DEF(sub_pixels_y2)( DCTELEM *block, const UINT8 *pixels, int line_size, int h)
351 "pxor %%mm7, %%mm7":);
356 PAVGB" %1, %%mm2\n\t"
357 "movq 8%0, %%mm1\n\t"
358 "movq %%mm2, %%mm3\n\t"
359 "punpcklbw %%mm7, %%mm2\n\t"
360 "punpckhbw %%mm7, %%mm3\n\t"
361 "psubsw %%mm2, %%mm0\n\t"
362 "psubsw %%mm3, %%mm1\n\t"
364 "movq %%mm1, 8%0\n\t"
366 :"m"(*pix), "m"(*(pix+line_size))