/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "constants.h"
29 #include "dsputil_x86.h"
33 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
44 "movq (%3), %%mm0 \n\t"
45 "movq 8(%3), %%mm1 \n\t"
46 "movq 16(%3), %%mm2 \n\t"
47 "movq 24(%3), %%mm3 \n\t"
48 "movq 32(%3), %%mm4 \n\t"
49 "movq 40(%3), %%mm5 \n\t"
50 "movq 48(%3), %%mm6 \n\t"
51 "movq 56(%3), %%mm7 \n\t"
52 "packuswb %%mm1, %%mm0 \n\t"
53 "packuswb %%mm3, %%mm2 \n\t"
54 "packuswb %%mm5, %%mm4 \n\t"
55 "packuswb %%mm7, %%mm6 \n\t"
56 "movq %%mm0, (%0) \n\t"
57 "movq %%mm2, (%0, %1) \n\t"
58 "movq %%mm4, (%0, %1, 2) \n\t"
59 "movq %%mm6, (%0, %2) \n\t"
60 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
66 // if here would be an exact copy of the code above
67 // compiler would generate some very strange code
70 "movq (%3), %%mm0 \n\t"
71 "movq 8(%3), %%mm1 \n\t"
72 "movq 16(%3), %%mm2 \n\t"
73 "movq 24(%3), %%mm3 \n\t"
74 "movq 32(%3), %%mm4 \n\t"
75 "movq 40(%3), %%mm5 \n\t"
76 "movq 48(%3), %%mm6 \n\t"
77 "movq 56(%3), %%mm7 \n\t"
78 "packuswb %%mm1, %%mm0 \n\t"
79 "packuswb %%mm3, %%mm2 \n\t"
80 "packuswb %%mm5, %%mm4 \n\t"
81 "packuswb %%mm7, %%mm6 \n\t"
82 "movq %%mm0, (%0) \n\t"
83 "movq %%mm2, (%0, %1) \n\t"
84 "movq %%mm4, (%0, %1, 2) \n\t"
85 "movq %%mm6, (%0, %2) \n\t"
86 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
90 #define put_signed_pixels_clamped_mmx_half(off) \
91 "movq "#off"(%2), %%mm1 \n\t" \
92 "movq 16 + "#off"(%2), %%mm2 \n\t" \
93 "movq 32 + "#off"(%2), %%mm3 \n\t" \
94 "movq 48 + "#off"(%2), %%mm4 \n\t" \
95 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
96 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
97 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
98 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
99 "paddb %%mm0, %%mm1 \n\t" \
100 "paddb %%mm0, %%mm2 \n\t" \
101 "paddb %%mm0, %%mm3 \n\t" \
102 "paddb %%mm0, %%mm4 \n\t" \
103 "movq %%mm1, (%0) \n\t" \
104 "movq %%mm2, (%0, %3) \n\t" \
105 "movq %%mm3, (%0, %3, 2) \n\t" \
106 "movq %%mm4, (%0, %1) \n\t"
108 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
111 x86_reg line_skip = line_size;
115 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
116 "lea (%3, %3, 2), %1 \n\t"
117 put_signed_pixels_clamped_mmx_half(0)
118 "lea (%0, %3, 4), %0 \n\t"
119 put_signed_pixels_clamped_mmx_half(64)
120 : "+&r"(pixels), "=&r"(line_skip3)
121 : "r"(block), "r"(line_skip)
125 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
132 /* read the pixels */
139 "movq (%2), %%mm0 \n\t"
140 "movq 8(%2), %%mm1 \n\t"
141 "movq 16(%2), %%mm2 \n\t"
142 "movq 24(%2), %%mm3 \n\t"
143 "movq %0, %%mm4 \n\t"
144 "movq %1, %%mm6 \n\t"
145 "movq %%mm4, %%mm5 \n\t"
146 "punpcklbw %%mm7, %%mm4 \n\t"
147 "punpckhbw %%mm7, %%mm5 \n\t"
148 "paddsw %%mm4, %%mm0 \n\t"
149 "paddsw %%mm5, %%mm1 \n\t"
150 "movq %%mm6, %%mm5 \n\t"
151 "punpcklbw %%mm7, %%mm6 \n\t"
152 "punpckhbw %%mm7, %%mm5 \n\t"
153 "paddsw %%mm6, %%mm2 \n\t"
154 "paddsw %%mm5, %%mm3 \n\t"
155 "packuswb %%mm1, %%mm0 \n\t"
156 "packuswb %%mm3, %%mm2 \n\t"
157 "movq %%mm0, %0 \n\t"
158 "movq %%mm2, %1 \n\t"
159 : "+m"(*pix), "+m"(*(pix + line_size))
162 pix += line_size * 2;
167 #define CLEAR_BLOCKS(name, n) \
168 void name(int16_t *blocks) \
171 "pxor %%mm7, %%mm7 \n\t" \
172 "mov %1, %%"REG_a" \n\t" \
174 "movq %%mm7, (%0, %%"REG_a") \n\t" \
175 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
176 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
177 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
178 "add $32, %%"REG_a" \n\t" \
180 :: "r"(((uint8_t *)blocks) + 128 * n), \
185 CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
186 CLEAR_BLOCKS(ff_clear_block_mmx, 1)
188 void ff_clear_block_sse(int16_t *block)
191 "xorps %%xmm0, %%xmm0 \n"
192 "movaps %%xmm0, (%0) \n"
193 "movaps %%xmm0, 16(%0) \n"
194 "movaps %%xmm0, 32(%0) \n"
195 "movaps %%xmm0, 48(%0) \n"
196 "movaps %%xmm0, 64(%0) \n"
197 "movaps %%xmm0, 80(%0) \n"
198 "movaps %%xmm0, 96(%0) \n"
199 "movaps %%xmm0, 112(%0) \n"
205 void ff_clear_blocks_sse(int16_t *blocks)
208 "xorps %%xmm0, %%xmm0 \n"
209 "mov %1, %%"REG_a" \n"
211 "movaps %%xmm0, (%0, %%"REG_a") \n"
212 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
213 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
214 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
215 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
216 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
217 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
218 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
219 "add $128, %%"REG_a" \n"
221 :: "r"(((uint8_t *)blocks) + 128 * 6),
227 void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
233 "movq (%1, %0), %%mm0 \n\t"
234 "movq (%2, %0), %%mm1 \n\t"
235 "paddb %%mm0, %%mm1 \n\t"
236 "movq %%mm1, (%2, %0) \n\t"
237 "movq 8(%1, %0), %%mm0 \n\t"
238 "movq 8(%2, %0), %%mm1 \n\t"
239 "paddb %%mm0, %%mm1 \n\t"
240 "movq %%mm1, 8(%2, %0) \n\t"
246 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
249 dst[i + 0] += src[i + 0];
252 /* Draw the edges of width 'w' of an image of size width, height
253 * this MMX version can only handle w == 8 || w == 16. */
254 void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
255 int w, int h, int sides)
257 uint8_t *ptr, *last_line;
260 last_line = buf + (height - 1) * wrap;
266 "movd (%0), %%mm0 \n\t"
267 "punpcklbw %%mm0, %%mm0 \n\t"
268 "punpcklwd %%mm0, %%mm0 \n\t"
269 "punpckldq %%mm0, %%mm0 \n\t"
270 "movq %%mm0, -8(%0) \n\t"
271 "movq -8(%0, %2), %%mm1 \n\t"
272 "punpckhbw %%mm1, %%mm1 \n\t"
273 "punpckhwd %%mm1, %%mm1 \n\t"
274 "punpckhdq %%mm1, %%mm1 \n\t"
275 "movq %%mm1, (%0, %2) \n\t"
280 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
285 "movd (%0), %%mm0 \n\t"
286 "punpcklbw %%mm0, %%mm0 \n\t"
287 "punpcklwd %%mm0, %%mm0 \n\t"
288 "punpckldq %%mm0, %%mm0 \n\t"
289 "movq %%mm0, -8(%0) \n\t"
290 "movq %%mm0, -16(%0) \n\t"
291 "movq -8(%0, %2), %%mm1 \n\t"
292 "punpckhbw %%mm1, %%mm1 \n\t"
293 "punpckhwd %%mm1, %%mm1 \n\t"
294 "punpckhdq %%mm1, %%mm1 \n\t"
295 "movq %%mm1, (%0, %2) \n\t"
296 "movq %%mm1, 8(%0, %2) \n\t"
301 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
305 /* top and bottom (and hopefully also the corners) */
306 if (sides & EDGE_TOP) {
307 for (i = 0; i < h; i += 4) {
308 ptr = buf - (i + 1) * wrap - w;
311 "movq (%1, %0), %%mm0 \n\t"
312 "movq %%mm0, (%0) \n\t"
313 "movq %%mm0, (%0, %2) \n\t"
314 "movq %%mm0, (%0, %2, 2) \n\t"
315 "movq %%mm0, (%0, %3) \n\t"
320 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
321 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
326 if (sides & EDGE_BOTTOM) {
327 for (i = 0; i < h; i += 4) {
328 ptr = last_line + (i + 1) * wrap - w;
331 "movq (%1, %0), %%mm0 \n\t"
332 "movq %%mm0, (%0) \n\t"
333 "movq %%mm0, (%0, %2) \n\t"
334 "movq %%mm0, (%0, %2, 2) \n\t"
335 "movq %%mm0, (%0, %3) \n\t"
340 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
341 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
342 "r"(ptr + width + 2 * w)
348 void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
349 int stride, int h, int ox, int oy,
350 int dxx, int dxy, int dyx, int dyy,
351 int shift, int r, int width, int height)
354 const int ix = ox >> (16 + shift);
355 const int iy = oy >> (16 + shift);
356 const int oxs = ox >> 4;
357 const int oys = oy >> 4;
358 const int dxxs = dxx >> 4;
359 const int dxys = dxy >> 4;
360 const int dyxs = dyx >> 4;
361 const int dyys = dyy >> 4;
362 const uint16_t r4[4] = { r, r, r, r };
363 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
364 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
365 const uint64_t shift2 = 2 * shift;
368 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
369 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
370 const int dxh = dxy * (h - 1);
371 const int dyw = dyx * (w - 1);
372 if ( // non-constant fullpel offset (3% of blocks)
373 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
374 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
375 // uses more than 16 bits of subpel mv (only at huge resolution)
376 || (dxx | dxy | dyx | dyy) & 15 ||
377 (unsigned)ix >= width - w ||
378 (unsigned)iy >= height - h) {
379 // FIXME could still use mmx for some of the rows
380 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
381 shift, r, width, height);
385 src += ix + iy * stride;
388 "movd %0, %%mm6 \n\t"
389 "pxor %%mm7, %%mm7 \n\t"
390 "punpcklwd %%mm6, %%mm6 \n\t"
391 "punpcklwd %%mm6, %%mm6 \n\t"
395 for (x = 0; x < w; x += 4) {
396 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
397 oxs - dxys + dxxs * (x + 1),
398 oxs - dxys + dxxs * (x + 2),
399 oxs - dxys + dxxs * (x + 3) };
400 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
401 oys - dyys + dyxs * (x + 1),
402 oys - dyys + dyxs * (x + 2),
403 oys - dyys + dyxs * (x + 3) };
405 for (y = 0; y < h; y++) {
407 "movq %0, %%mm4 \n\t"
408 "movq %1, %%mm5 \n\t"
409 "paddw %2, %%mm4 \n\t"
410 "paddw %3, %%mm5 \n\t"
411 "movq %%mm4, %0 \n\t"
412 "movq %%mm5, %1 \n\t"
413 "psrlw $12, %%mm4 \n\t"
414 "psrlw $12, %%mm5 \n\t"
415 : "+m"(*dx4), "+m"(*dy4)
416 : "m"(*dxy4), "m"(*dyy4)
420 "movq %%mm6, %%mm2 \n\t"
421 "movq %%mm6, %%mm1 \n\t"
422 "psubw %%mm4, %%mm2 \n\t"
423 "psubw %%mm5, %%mm1 \n\t"
424 "movq %%mm2, %%mm0 \n\t"
425 "movq %%mm4, %%mm3 \n\t"
426 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
427 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
428 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
429 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
431 "movd %4, %%mm5 \n\t"
432 "movd %3, %%mm4 \n\t"
433 "punpcklbw %%mm7, %%mm5 \n\t"
434 "punpcklbw %%mm7, %%mm4 \n\t"
435 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
436 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
438 "movd %2, %%mm5 \n\t"
439 "movd %1, %%mm4 \n\t"
440 "punpcklbw %%mm7, %%mm5 \n\t"
441 "punpcklbw %%mm7, %%mm4 \n\t"
442 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
443 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
444 "paddw %5, %%mm1 \n\t"
445 "paddw %%mm3, %%mm2 \n\t"
446 "paddw %%mm1, %%mm0 \n\t"
447 "paddw %%mm2, %%mm0 \n\t"
449 "psrlw %6, %%mm0 \n\t"
450 "packuswb %%mm0, %%mm0 \n\t"
451 "movd %%mm0, %0 \n\t"
453 : "=m"(dst[x + y * stride])
454 : "m"(src[0]), "m"(src[1]),
455 "m"(src[stride]), "m"(src[stride + 1]),
456 "m"(*r4), "m"(shift2)
460 src += 4 - h * stride;
464 void ff_vector_clipf_sse(float *dst, const float *src,
465 float min, float max, int len)
467 x86_reg i = (len - 16) * 4;
469 "movss %3, %%xmm4 \n\t"
470 "movss %4, %%xmm5 \n\t"
471 "shufps $0, %%xmm4, %%xmm4 \n\t"
472 "shufps $0, %%xmm5, %%xmm5 \n\t"
474 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
475 "movaps 16(%2, %0), %%xmm1 \n\t"
476 "movaps 32(%2, %0), %%xmm2 \n\t"
477 "movaps 48(%2, %0), %%xmm3 \n\t"
478 "maxps %%xmm4, %%xmm0 \n\t"
479 "maxps %%xmm4, %%xmm1 \n\t"
480 "maxps %%xmm4, %%xmm2 \n\t"
481 "maxps %%xmm4, %%xmm3 \n\t"
482 "minps %%xmm5, %%xmm0 \n\t"
483 "minps %%xmm5, %%xmm1 \n\t"
484 "minps %%xmm5, %%xmm2 \n\t"
485 "minps %%xmm5, %%xmm3 \n\t"
486 "movaps %%xmm0, (%1, %0) \n\t"
487 "movaps %%xmm1, 16(%1, %0) \n\t"
488 "movaps %%xmm2, 32(%1, %0) \n\t"
489 "movaps %%xmm3, 48(%1, %0) \n\t"
493 : "r"(dst), "r"(src), "m"(min), "m"(max)
498 #endif /* HAVE_INLINE_ASM */