2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
38 /* pixel operations */
/* Packed vector constants used by the asm below: ff_pw_* are packed 16-bit
 * words, ff_pb_* packed bytes, ff_pd_* packed doubles.  8-byte alignment for
 * MMX operands, 16-byte (xmm_reg) for SSE operands.
 * NOTE(review): this chunk is a lossy extract; the stray leading number on
 * each line comes from the source and is kept byte-for-byte. */
39 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
40 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
41 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
42 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
43 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
46 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
50 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
51 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
53 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
54 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Prototypes for externally assembled (yasm) two-source helpers: each
 * combines src1 and src2 into dst over h rows.  Presumably guarded by
 * HAVE_YASM in the full file — the #if lines fall outside this extract. */
58 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
59 int dstStride, int src1Stride, int h);
60 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
61 uint8_t *src2, int dstStride,
62 int src1Stride, int h);
63 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
64 int dstStride, int src1Stride, int h);
65 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
66 int dstStride, int src1Stride, int h);
67 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
68 int dstStride, int src1Stride, int h);
69 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
70 int dstStride, int src1Stride, int h);
/* 16-pixel-wide copy built from two 8-wide yasm calls (left and right
 * halves).  NOTE(review): the function braces fall outside this extract. */
72 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
73 ptrdiff_t line_size, int h)
75 ff_put_pixels8_mmxext(block, pixels, line_size, h);
76 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
/* Prototypes for the yasm MPEG-4 qpel 6-tap lowpass filters (horizontal
 * and vertical, 8- and 16-wide, put/avg/no-rnd variants), used by the
 * QPEL_OP macro below.  The no-rnd pixel copies alias the plain puts
 * because full-pel copies need no rounding. */
80 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
81 int dstStride, int srcStride, int h);
82 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
83 int dstStride, int srcStride, int h);
84 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
85 int dstStride, int srcStride,
87 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
88 int dstStride, int srcStride, int h);
89 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
90 int dstStride, int srcStride, int h);
91 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
92 int dstStride, int srcStride,
94 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
95 int dstStride, int srcStride);
96 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
97 int dstStride, int srcStride);
98 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
99 int dstStride, int srcStride);
100 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
101 int dstStride, int srcStride);
102 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
103 int dstStride, int srcStride);
104 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
105 int dstStride, int srcStride);
106 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
107 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
108 #endif /* HAVE_YASM */
/* Inline-asm helper macros.  JUMPALIGN aligns the next asm label to 8
 * bytes; MOVQ_ZERO clears an MMX register. */
113 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
114 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/* MOVQ_BFE: build 0xfefefefefefefefe in regd (all-ones doubled per byte),
 * needed by the PAVGB emulation below. */
116 #define MOVQ_BFE(regd) \
118 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
119 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* Memory-operand variants: load 0x0101.. (ff_bone) / 0x0002-per-word
 * (ff_wtwo).  NOTE(review): in the full file these and the register-built
 * variants below are presumably selected by a PIC-related #if — the
 * conditional lines fall outside this extract. */
122 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
123 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
125 // for shared library it's better to use this way for accessing constants
/* Register-built equivalents: synthesize the constants without touching
 * memory (pcmpeqd -> all ones, psrlw 15 -> 1 per word, then pack/shift). */
127 #define MOVQ_BONE(regd) \
129 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
130 "psrlw $15, %%"#regd" \n\t" \
131 "packuswb %%"#regd", %%"#regd" \n\t" ::)
133 #define MOVQ_WTWO(regd) \
135 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
136 "psrlw $15, %%"#regd" \n\t" \
137 "psllw $1, %%"#regd" \n\t"::)
141 // using regr as temporary and for the output result
142 // first argument is unmodifed and second is trashed
143 // regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average with rounding up, emulated on plain MMX:
 * avg(a,b) = (a|b) - (((a^b) & 0xfe) >> 1). */
144 #define PAVGB_MMX(rega, regb, regr, regfe) \
145 "movq "#rega", "#regr" \n\t" \
146 "por "#regb", "#regr" \n\t" \
147 "pxor "#rega", "#regb" \n\t" \
148 "pand "#regfe", "#regb" \n\t" \
149 "psrlq $1, "#regb" \n\t" \
150 "psubb "#regb", "#regr" \n\t"
152 // mm6 is supposed to contain 0xfefefefefefefefe
/* Paired PAVGB: two independent averages interleaved to hide latency. */
153 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
154 "movq "#rega", "#regr" \n\t" \
155 "movq "#regc", "#regp" \n\t" \
156 "por "#regb", "#regr" \n\t" \
157 "por "#regd", "#regp" \n\t" \
158 "pxor "#rega", "#regb" \n\t" \
159 "pxor "#regc", "#regd" \n\t" \
160 "pand %%mm6, "#regb" \n\t" \
161 "pand %%mm6, "#regd" \n\t" \
162 "psrlq $1, "#regd" \n\t" \
163 "psrlq $1, "#regb" \n\t" \
164 "psubb "#regb", "#regr" \n\t" \
165 "psubb "#regd", "#regp" \n\t"
166 /***********************************/
/* Instantiate the rounding pixel-op template with the macros above.
 * NOTE(review): the matching #undef lines fall outside this extract. */
169 #define DEF(x, y) x ## _ ## y ## _mmx
170 #define SET_RND MOVQ_WTWO
171 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
172 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
173 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
175 #include "dsputil_rnd_template.c"
183 #endif /* HAVE_INLINE_ASM */
188 /***********************************/
189 /* MMXEXT specific */
191 //FIXME the following could be optimized too ...
/* 16-wide average as two 8-wide yasm calls, mirroring
 * ff_put_pixels16_mmxext above.  Braces fall outside this extract. */
192 static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
193 int line_size, int h)
195 ff_avg_pixels8_mmxext(block, pixels, line_size, h);
196 ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
199 #endif /* HAVE_YASM */
203 /***********************************/
/* Store eight rows of 16-bit coefficients as clamped unsigned bytes:
 * packuswb saturates each 16-bit value to [0,255].  Each asm statement
 * handles 4 rows, addressing them as base, +line_size, +2*line_size and
 * +3*line_size.  NOTE(review): the enclosing loop, asm openers/closers and
 * some operand lists fall outside this extract — kept byte-identical. */
206 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
212 /* read the pixels */
217 "movq (%3), %%mm0 \n\t"
218 "movq 8(%3), %%mm1 \n\t"
219 "movq 16(%3), %%mm2 \n\t"
220 "movq 24(%3), %%mm3 \n\t"
221 "movq 32(%3), %%mm4 \n\t"
222 "movq 40(%3), %%mm5 \n\t"
223 "movq 48(%3), %%mm6 \n\t"
224 "movq 56(%3), %%mm7 \n\t"
225 "packuswb %%mm1, %%mm0 \n\t"
226 "packuswb %%mm3, %%mm2 \n\t"
227 "packuswb %%mm5, %%mm4 \n\t"
228 "packuswb %%mm7, %%mm6 \n\t"
229 "movq %%mm0, (%0) \n\t"
230 "movq %%mm2, (%0, %1) \n\t"
231 "movq %%mm4, (%0, %1, 2) \n\t"
232 "movq %%mm6, (%0, %2) \n\t"
233 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
236 pix += line_size * 4;
239 // if here would be an exact copy of the code above
240 // compiler would generate some very strange code
243 "movq (%3), %%mm0 \n\t"
244 "movq 8(%3), %%mm1 \n\t"
245 "movq 16(%3), %%mm2 \n\t"
246 "movq 24(%3), %%mm3 \n\t"
247 "movq 32(%3), %%mm4 \n\t"
248 "movq 40(%3), %%mm5 \n\t"
249 "movq 48(%3), %%mm6 \n\t"
250 "movq 56(%3), %%mm7 \n\t"
251 "packuswb %%mm1, %%mm0 \n\t"
252 "packuswb %%mm3, %%mm2 \n\t"
253 "packuswb %%mm5, %%mm4 \n\t"
254 "packuswb %%mm7, %%mm6 \n\t"
255 "movq %%mm0, (%0) \n\t"
256 "movq %%mm2, (%0, %1) \n\t"
257 "movq %%mm4, (%0, %1, 2) \n\t"
258 "movq %%mm6, (%0, %2) \n\t"
259 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* One 4-row half of the signed clamp-and-store: packsswb saturates the
 * 16-bit coefficients to signed bytes, then paddb with mm0 (0x80 per byte,
 * loaded below) biases them into the unsigned 0..255 range. */
263 #define put_signed_pixels_clamped_mmx_half(off) \
264 "movq "#off"(%2), %%mm1 \n\t" \
265 "movq 16 + "#off"(%2), %%mm2 \n\t" \
266 "movq 32 + "#off"(%2), %%mm3 \n\t" \
267 "movq 48 + "#off"(%2), %%mm4 \n\t" \
268 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
269 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
270 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
271 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
272 "paddb %%mm0, %%mm1 \n\t" \
273 "paddb %%mm0, %%mm2 \n\t" \
274 "paddb %%mm0, %%mm3 \n\t" \
275 "paddb %%mm0, %%mm4 \n\t" \
276 "movq %%mm1, (%0) \n\t" \
277 "movq %%mm2, (%0, %3) \n\t" \
278 "movq %%mm3, (%0, %3, 2) \n\t" \
279 "movq %%mm4, (%0, %1) \n\t"
/* Store a full 8x8 block of signed coefficients as clamped bytes: two
 * 4-row halves, stepping pixels by 4*line_skip between them; %1 holds
 * 3*line_skip (computed by the lea).  NOTE(review): braces, the asm opener
 * and the line_skip3 declaration fall outside this extract. */
281 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
284 x86_reg line_skip = line_size;
288 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
289 "lea (%3, %3, 2), %1 \n\t"
290 put_signed_pixels_clamped_mmx_half(0)
291 "lea (%0, %3, 4), %0 \n\t"
292 put_signed_pixels_clamped_mmx_half(64)
293 : "+&r"(pixels), "=&r"(line_skip3)
294 : "r"(block), "r"(line_skip)
/* Add 16-bit coefficients to existing pixels with saturation: two rows per
 * asm statement — unpack the bytes against mm7 (presumably zeroed earlier;
 * the pxor falls outside this extract), paddsw the coefficients, repack
 * with packuswb and store in place. */
298 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
305 /* read the pixels */
312 "movq (%2), %%mm0 \n\t"
313 "movq 8(%2), %%mm1 \n\t"
314 "movq 16(%2), %%mm2 \n\t"
315 "movq 24(%2), %%mm3 \n\t"
316 "movq %0, %%mm4 \n\t"
317 "movq %1, %%mm6 \n\t"
318 "movq %%mm4, %%mm5 \n\t"
319 "punpcklbw %%mm7, %%mm4 \n\t"
320 "punpckhbw %%mm7, %%mm5 \n\t"
321 "paddsw %%mm4, %%mm0 \n\t"
322 "paddsw %%mm5, %%mm1 \n\t"
323 "movq %%mm6, %%mm5 \n\t"
324 "punpcklbw %%mm7, %%mm6 \n\t"
325 "punpckhbw %%mm7, %%mm5 \n\t"
326 "paddsw %%mm6, %%mm2 \n\t"
327 "paddsw %%mm5, %%mm3 \n\t"
328 "packuswb %%mm1, %%mm0 \n\t"
329 "packuswb %%mm3, %%mm2 \n\t"
330 "movq %%mm0, %0 \n\t"
331 "movq %%mm2, %1 \n\t"
332 : "+m"(*pix), "+m"(*(pix + line_size))
335 pix += line_size * 2;
/* Plain 8-wide pixel copy: 4 rows per asm iteration, advancing src and dst
 * by 2*line_size (kept in REG_a) twice.  NOTE(review): the asm opener,
 * loop label/branch and closing lines fall outside this extract. */
340 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
341 ptrdiff_t line_size, int h)
344 "lea (%3, %3), %%"REG_a" \n\t"
347 "movq (%1 ), %%mm0 \n\t"
348 "movq (%1, %3), %%mm1 \n\t"
349 "movq %%mm0, (%2) \n\t"
350 "movq %%mm1, (%2, %3) \n\t"
351 "add %%"REG_a", %1 \n\t"
352 "add %%"REG_a", %2 \n\t"
353 "movq (%1 ), %%mm0 \n\t"
354 "movq (%1, %3), %%mm1 \n\t"
355 "movq %%mm0, (%2) \n\t"
356 "movq %%mm1, (%2, %3) \n\t"
357 "add %%"REG_a", %1 \n\t"
358 "add %%"REG_a", %2 \n\t"
361 : "+g"(h), "+r"(pixels), "+r"(block)
362 : "r"((x86_reg)line_size)
/* 16-wide variant of put_pixels8_mmx: same 4-rows-per-iteration scheme,
 * with a second movq at offset 8 for the right half of each row. */
367 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
368 ptrdiff_t line_size, int h)
371 "lea (%3, %3), %%"REG_a" \n\t"
374 "movq (%1 ), %%mm0 \n\t"
375 "movq 8(%1 ), %%mm4 \n\t"
376 "movq (%1, %3), %%mm1 \n\t"
377 "movq 8(%1, %3), %%mm5 \n\t"
378 "movq %%mm0, (%2) \n\t"
379 "movq %%mm4, 8(%2) \n\t"
380 "movq %%mm1, (%2, %3) \n\t"
381 "movq %%mm5, 8(%2, %3) \n\t"
382 "add %%"REG_a", %1 \n\t"
383 "add %%"REG_a", %2 \n\t"
384 "movq (%1 ), %%mm0 \n\t"
385 "movq 8(%1 ), %%mm4 \n\t"
386 "movq (%1, %3), %%mm1 \n\t"
387 "movq 8(%1, %3), %%mm5 \n\t"
388 "movq %%mm0, (%2) \n\t"
389 "movq %%mm4, 8(%2) \n\t"
390 "movq %%mm1, (%2, %3) \n\t"
391 "movq %%mm5, 8(%2, %3) \n\t"
392 "add %%"REG_a", %1 \n\t"
393 "add %%"REG_a", %2 \n\t"
396 : "+g"(h), "+r"(pixels), "+r"(block)
397 : "r"((x86_reg)line_size)
/* Generate block-clearing routines with MMX stores: zero n consecutive
 * 128-byte (64 x int16_t) blocks, 32 bytes per iteration, counting REG_a
 * up from -(128*n) to 0.  Instantiated for 6 blocks (one macroblock) and
 * for a single block.  NOTE(review): asm opener, loop branch and closing
 * lines fall outside this extract. */
402 #define CLEAR_BLOCKS(name, n) \
403 static void name(int16_t *blocks) \
406 "pxor %%mm7, %%mm7 \n\t" \
407 "mov %1, %%"REG_a" \n\t" \
409 "movq %%mm7, (%0, %%"REG_a") \n\t" \
410 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
411 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
412 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
413 "add $32, %%"REG_a" \n\t" \
415 :: "r"(((uint8_t *)blocks) + 128 * n), \
420 CLEAR_BLOCKS(clear_blocks_mmx, 6)
421 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 128-byte coefficient block with eight aligned 16-byte SSE
 * stores — requires the block to be 16-byte aligned (movaps). */
423 static void clear_block_sse(int16_t *block)
426 "xorps %%xmm0, %%xmm0 \n"
427 "movaps %%xmm0, (%0) \n"
428 "movaps %%xmm0, 16(%0) \n"
429 "movaps %%xmm0, 32(%0) \n"
430 "movaps %%xmm0, 48(%0) \n"
431 "movaps %%xmm0, 64(%0) \n"
432 "movaps %%xmm0, 80(%0) \n"
433 "movaps %%xmm0, 96(%0) \n"
434 "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive 128-byte blocks (one macroblock) with SSE: 128
 * bytes per iteration, REG_a counting up from -(128*6) to 0, same scheme
 * as CLEAR_BLOCKS above. */
440 static void clear_blocks_sse(int16_t *blocks)
443 "xorps %%xmm0, %%xmm0 \n"
444 "mov %1, %%"REG_a" \n"
446 "movaps %%xmm0, (%0, %%"REG_a") \n"
447 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
448 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
449 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
450 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
451 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
452 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
453 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
454 "add $128, %%"REG_a" \n"
456 :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] byte-wise (wrapping paddb): 16 bytes per MMX iteration,
 * with a scalar tail loop for the remaining (w % 16) bytes.  NOTE(review):
 * the loop control and the scalar tail's loop header fall outside this
 * extract — the "w - 15" bound and the visible tail statement are kept
 * byte-identical. */
462 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
468 "movq (%1, %0), %%mm0 \n\t"
469 "movq (%2, %0), %%mm1 \n\t"
470 "paddb %%mm0, %%mm1 \n\t"
471 "movq %%mm1, (%2, %0) \n\t"
472 "movq 8(%1, %0), %%mm0 \n\t"
473 "movq 8(%2, %0), %%mm1 \n\t"
474 "paddb %%mm0, %%mm1 \n\t"
475 "movq %%mm1, 8(%2, %0) \n\t"
481 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
484 dst[i + 0] += src[i + 0];
/* HuffYUV median predictor using cmov (for CPUs where cmov beats the MMX
 * path): for each byte, dst = diff + median(left, top, left + top - topleft).
 * l/tl carry the running left and top-left values between iterations.
 * NOTE(review): most of the cmov sequence falls outside this extract;
 * only the prologue, the final add/store and the constraint lists remain. */
488 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
489 const uint8_t *diff, int w,
490 int *left, int *left_top)
494 int l = *left & 0xff;
495 int tl = *left_top & 0xff;
500 "movzbl (%3, %4), %2 \n"
513 "add (%6, %4), %b0 \n"
514 "mov %b0, (%5, %4) \n"
517 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
518 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
525 /* Draw the edges of width 'w' of an image of size width, height
526 * this MMX version can only handle w == 8 || w == 16. */
/* Left/right edges: splat the first byte of each row leftwards (punpcklbw/
 * wd/dq replicate it across 8 bytes) and the last byte rightwards, one or
 * two movq per side depending on w.  Top/bottom: copy the first/last row
 * outwards 4 rows at a time.  NOTE(review): the w==8 / w==16 branch, loop
 * headers and asm openers fall outside this extract. */
527 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
528 int w, int h, int sides)
530 uint8_t *ptr, *last_line;
533 last_line = buf + (height - 1) * wrap;
539 "movd (%0), %%mm0 \n\t"
540 "punpcklbw %%mm0, %%mm0 \n\t"
541 "punpcklwd %%mm0, %%mm0 \n\t"
542 "punpckldq %%mm0, %%mm0 \n\t"
543 "movq %%mm0, -8(%0) \n\t"
544 "movq -8(%0, %2), %%mm1 \n\t"
545 "punpckhbw %%mm1, %%mm1 \n\t"
546 "punpckhwd %%mm1, %%mm1 \n\t"
547 "punpckhdq %%mm1, %%mm1 \n\t"
548 "movq %%mm1, (%0, %2) \n\t"
553 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
558 "movd (%0), %%mm0 \n\t"
559 "punpcklbw %%mm0, %%mm0 \n\t"
560 "punpcklwd %%mm0, %%mm0 \n\t"
561 "punpckldq %%mm0, %%mm0 \n\t"
562 "movq %%mm0, -8(%0) \n\t"
563 "movq %%mm0, -16(%0) \n\t"
564 "movq -8(%0, %2), %%mm1 \n\t"
565 "punpckhbw %%mm1, %%mm1 \n\t"
566 "punpckhwd %%mm1, %%mm1 \n\t"
567 "punpckhdq %%mm1, %%mm1 \n\t"
568 "movq %%mm1, (%0, %2) \n\t"
569 "movq %%mm1, 8(%0, %2) \n\t"
574 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
578 /* top and bottom (and hopefully also the corners) */
579 if (sides & EDGE_TOP) {
580 for (i = 0; i < h; i += 4) {
581 ptr = buf - (i + 1) * wrap - w;
584 "movq (%1, %0), %%mm0 \n\t"
585 "movq %%mm0, (%0) \n\t"
586 "movq %%mm0, (%0, %2) \n\t"
587 "movq %%mm0, (%0, %2, 2) \n\t"
588 "movq %%mm0, (%0, %3) \n\t"
593 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
594 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
599 if (sides & EDGE_BOTTOM) {
600 for (i = 0; i < h; i += 4) {
601 ptr = last_line + (i + 1) * wrap - w;
604 "movq (%1, %0), %%mm0 \n\t"
605 "movq %%mm0, (%0) \n\t"
606 "movq %%mm0, (%0, %2) \n\t"
607 "movq %%mm0, (%0, %2, 2) \n\t"
608 "movq %%mm0, (%0, %3) \n\t"
613 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
614 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
615 "r"(ptr + width + 2 * w)
620 #endif /* HAVE_INLINE_ASM */
/* QPEL_OP(OPNAME, ROUNDER, RND, MMX): generate the full set of quarter-pel
 * motion-compensation functions (mcXY for X,Y in 0..3, 8- and 16-wide) for
 * one op (put / avg / put_no_rnd).  Each mcXY composes the yasm lowpass
 * filters and the *_l2 averaging helpers declared above; temporaries live
 * in a stack uint64_t array with halfH/halfHV carved out of it.
 * Instantiated below for put_, avg_ and put_no_rnd_ with ff_pw_16/ff_pw_15
 * rounders.  NOTE(review): this extract omits many continuation lines of
 * the macro (braces, some argument lines); everything visible is kept
 * byte-identical — do not reflow. */
624 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
625 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
628 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
631 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
635 uint8_t * const half = (uint8_t*)temp; \
636 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
638 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
639 stride, stride, 8); \
642 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
645 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
649 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
653 uint8_t * const half = (uint8_t*)temp; \
654 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
656 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
660 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
664 uint8_t * const half = (uint8_t*)temp; \
665 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
667 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
668 stride, stride, 8); \
671 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
674 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
678 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
682 uint8_t * const half = (uint8_t*)temp; \
683 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
685 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
689 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
692 uint64_t half[8 + 9]; \
693 uint8_t * const halfH = ((uint8_t*)half) + 64; \
694 uint8_t * const halfHV = ((uint8_t*)half); \
695 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
697 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
699 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
700 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
704 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
707 uint64_t half[8 + 9]; \
708 uint8_t * const halfH = ((uint8_t*)half) + 64; \
709 uint8_t * const halfHV = ((uint8_t*)half); \
710 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
712 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
714 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
715 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
719 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
722 uint64_t half[8 + 9]; \
723 uint8_t * const halfH = ((uint8_t*)half) + 64; \
724 uint8_t * const halfHV = ((uint8_t*)half); \
725 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
727 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
729 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
730 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
734 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
737 uint64_t half[8 + 9]; \
738 uint8_t * const halfH = ((uint8_t*)half) + 64; \
739 uint8_t * const halfHV = ((uint8_t*)half); \
740 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
742 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
744 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
745 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
749 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
752 uint64_t half[8 + 9]; \
753 uint8_t * const halfH = ((uint8_t*)half) + 64; \
754 uint8_t * const halfHV = ((uint8_t*)half); \
755 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
757 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
758 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
762 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
765 uint64_t half[8 + 9]; \
766 uint8_t * const halfH = ((uint8_t*)half) + 64; \
767 uint8_t * const halfHV = ((uint8_t*)half); \
768 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
770 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
771 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
775 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
778 uint64_t half[8 + 9]; \
779 uint8_t * const halfH = ((uint8_t*)half); \
780 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
782 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
784 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
788 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
791 uint64_t half[8 + 9]; \
792 uint8_t * const halfH = ((uint8_t*)half); \
793 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
795 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
797 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
801 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
805 uint8_t * const halfH = ((uint8_t*)half); \
806 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
808 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
812 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
815 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
818 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
822 uint8_t * const half = (uint8_t*)temp; \
823 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
825 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
829 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
832 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
833 stride, stride, 16);\
836 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
840 uint8_t * const half = (uint8_t*)temp; \
841 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
843 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
844 stride, stride, 16); \
847 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
851 uint8_t * const half = (uint8_t*)temp; \
852 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
854 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
858 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
861 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
865 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
869 uint8_t * const half = (uint8_t*)temp; \
870 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
872 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
873 stride, stride, 16); \
876 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
879 uint64_t half[16 * 2 + 17 * 2]; \
880 uint8_t * const halfH = ((uint8_t*)half) + 256; \
881 uint8_t * const halfHV = ((uint8_t*)half); \
882 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
884 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
886 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
888 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
892 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
895 uint64_t half[16 * 2 + 17 * 2]; \
896 uint8_t * const halfH = ((uint8_t*)half) + 256; \
897 uint8_t * const halfHV = ((uint8_t*)half); \
898 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
900 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
902 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
904 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
908 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
911 uint64_t half[16 * 2 + 17 * 2]; \
912 uint8_t * const halfH = ((uint8_t*)half) + 256; \
913 uint8_t * const halfHV = ((uint8_t*)half); \
914 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
916 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
918 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
920 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
924 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
927 uint64_t half[16 * 2 + 17 * 2]; \
928 uint8_t * const halfH = ((uint8_t*)half) + 256; \
929 uint8_t * const halfHV = ((uint8_t*)half); \
930 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
932 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
934 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
936 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
940 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
943 uint64_t half[16 * 2 + 17 * 2]; \
944 uint8_t * const halfH = ((uint8_t*)half) + 256; \
945 uint8_t * const halfHV = ((uint8_t*)half); \
946 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
948 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
950 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
954 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
957 uint64_t half[16 * 2 + 17 * 2]; \
958 uint8_t * const halfH = ((uint8_t*)half) + 256; \
959 uint8_t * const halfHV = ((uint8_t*)half); \
960 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
962 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
964 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
968 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
971 uint64_t half[17 * 2]; \
972 uint8_t * const halfH = ((uint8_t*)half); \
973 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
975 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
977 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
981 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
984 uint64_t half[17 * 2]; \
985 uint8_t * const halfH = ((uint8_t*)half); \
986 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
988 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
990 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
994 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
997 uint64_t half[17 * 2]; \
998 uint8_t * const halfH = ((uint8_t*)half); \
999 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1001 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1005 QPEL_OP(put_, ff_pw_16, _, mmxext)
1006 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1007 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
1008 #endif /* HAVE_YASM */
/* RV40 (3,3) quarter-pel cases map directly onto the xy2 half-pel
 * averagers; braces fall outside this extract. */
1012 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1014 put_pixels8_xy2_mmx(dst, src, stride, 8);
1016 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1018 put_pixels16_xy2_mmx(dst, src, stride, 16);
1020 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1022 avg_pixels8_xy2_mmx(dst, src, stride, 8);
1024 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1026 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* MPEG-4 global motion compensation: affine warp with bilinear
 * interpolation, 4 output pixels per inner asm block.  Falls back to the C
 * version (ff_gmc_c) when the fullpel offset varies within the block, more
 * than 16 subpel bits are used, or the source window leaves the picture.
 * NOTE(review): braces, the w declaration and several asm delimiters fall
 * outside this extract — kept byte-identical. */
1029 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1030 int stride, int h, int ox, int oy,
1031 int dxx, int dxy, int dyx, int dyy,
1032 int shift, int r, int width, int height)
/* Fullpel part of the start offset and 12-bit subpel deltas (>> 4 drops
 * the low bits so the later psrlw $12 recovers the fraction). */
1035 const int ix = ox >> (16 + shift);
1036 const int iy = oy >> (16 + shift);
1037 const int oxs = ox >> 4;
1038 const int oys = oy >> 4;
1039 const int dxxs = dxx >> 4;
1040 const int dxys = dxy >> 4;
1041 const int dyxs = dyx >> 4;
1042 const int dyys = dyy >> 4;
1043 const uint16_t r4[4] = { r, r, r, r };
1044 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1045 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1046 const uint64_t shift2 = 2 * shift;
1049 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1050 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1051 const int dxh = dxy * (h - 1);
1052 const int dyw = dyx * (w - 1);
1053 if ( // non-constant fullpel offset (3% of blocks)
1054 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1055 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1056 // uses more than 16 bits of subpel mv (only at huge resolution)
1057 || (dxx | dxy | dyx | dyy) & 15 ||
1058 (unsigned)ix >= width - w ||
1059 (unsigned)iy >= height - h) {
1060 // FIXME could still use mmx for some of the rows
1061 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1062 shift, r, width, height);
1066 src += ix + iy * stride;
/* Broadcast the rounding/scale constant into all four words of mm6 and
 * zero mm7 for the byte unpacks below. */
1069 "movd %0, %%mm6 \n\t"
1070 "pxor %%mm7, %%mm7 \n\t"
1071 "punpcklwd %%mm6, %%mm6 \n\t"
1072 "punpcklwd %%mm6, %%mm6 \n\t"
/* Per 4-pixel column strip: dx4/dy4 hold the four running subpel
 * coordinates, advanced by dxy4/dyy4 each row inside the asm. */
1076 for (x = 0; x < w; x += 4) {
1077 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1078 oxs - dxys + dxxs * (x + 1),
1079 oxs - dxys + dxxs * (x + 2),
1080 oxs - dxys + dxxs * (x + 3) };
1081 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1082 oys - dyys + dyxs * (x + 1),
1083 oys - dyys + dyxs * (x + 2),
1084 oys - dyys + dyxs * (x + 3) };
1086 for (y = 0; y < h; y++) {
1088 "movq %0, %%mm4 \n\t"
1089 "movq %1, %%mm5 \n\t"
1090 "paddw %2, %%mm4 \n\t"
1091 "paddw %3, %%mm5 \n\t"
1092 "movq %%mm4, %0 \n\t"
1093 "movq %%mm5, %1 \n\t"
1094 "psrlw $12, %%mm4 \n\t"
1095 "psrlw $12, %%mm5 \n\t"
1096 : "+m"(*dx4), "+m"(*dy4)
1097 : "m"(*dxy4), "m"(*dyy4)
/* Bilinear weights from the subpel fractions (s = scale in mm6). */
1101 "movq %%mm6, %%mm2 \n\t"
1102 "movq %%mm6, %%mm1 \n\t"
1103 "psubw %%mm4, %%mm2 \n\t"
1104 "psubw %%mm5, %%mm1 \n\t"
1105 "movq %%mm2, %%mm0 \n\t"
1106 "movq %%mm4, %%mm3 \n\t"
1107 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1108 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1109 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1110 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
1112 "movd %4, %%mm5 \n\t"
1113 "movd %3, %%mm4 \n\t"
1114 "punpcklbw %%mm7, %%mm5 \n\t"
1115 "punpcklbw %%mm7, %%mm4 \n\t"
1116 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1117 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1119 "movd %2, %%mm5 \n\t"
1120 "movd %1, %%mm4 \n\t"
1121 "punpcklbw %%mm7, %%mm5 \n\t"
1122 "punpcklbw %%mm7, %%mm4 \n\t"
1123 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1124 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1125 "paddw %5, %%mm1 \n\t"
1126 "paddw %%mm3, %%mm2 \n\t"
1127 "paddw %%mm1, %%mm0 \n\t"
1128 "paddw %%mm2, %%mm0 \n\t"
1130 "psrlw %6, %%mm0 \n\t"
1131 "packuswb %%mm0, %%mm0 \n\t"
1132 "movd %%mm0, %0 \n\t"
1134 : "=m"(dst[x + y * stride])
1135 : "m"(src[0]), "m"(src[1]),
1136 "m"(src[stride]), "m"(src[stride + 1]),
1137 "m"(*r4), "m"(shift2)
1141 src += 4 - h * stride;
/* CAVS and VC-1 full-pel (mc00) cases are plain pixel copies/averages;
 * braces fall outside this extract. */
1146 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1148 put_pixels8_mmx(dst, src, stride, 8);
1151 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1153 avg_pixels8_mmx(dst, src, stride, 8);
1156 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1158 put_pixels16_mmx(dst, src, stride, 16);
1161 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1163 avg_pixels16_mmx(dst, src, stride, 16);
1167 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1168 ptrdiff_t stride, int rnd)
1170 put_pixels8_mmx(dst, src, stride, 8);
/* Clamp floats into [min, max]: min/max are splatted into xmm4/xmm5 and
 * 16 floats are clipped per iteration (maxps then minps).  i starts at
 * (len - 16) * 4 bytes; the decrement/branch lines fall outside this
 * extract.  Presumably requires len to be a multiple of 16 and 16-byte
 * aligned buffers (movaps) — TODO confirm against callers. */
1173 static void vector_clipf_sse(float *dst, const float *src,
1174 float min, float max, int len)
1176 x86_reg i = (len - 16) * 4;
1178 "movss %3, %%xmm4 \n\t"
1179 "movss %4, %%xmm5 \n\t"
1180 "shufps $0, %%xmm4, %%xmm4 \n\t"
1181 "shufps $0, %%xmm5, %%xmm5 \n\t"
1183 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1184 "movaps 16(%2, %0), %%xmm1 \n\t"
1185 "movaps 32(%2, %0), %%xmm2 \n\t"
1186 "movaps 48(%2, %0), %%xmm3 \n\t"
1187 "maxps %%xmm4, %%xmm0 \n\t"
1188 "maxps %%xmm4, %%xmm1 \n\t"
1189 "maxps %%xmm4, %%xmm2 \n\t"
1190 "maxps %%xmm4, %%xmm3 \n\t"
1191 "minps %%xmm5, %%xmm0 \n\t"
1192 "minps %%xmm5, %%xmm1 \n\t"
1193 "minps %%xmm5, %%xmm2 \n\t"
1194 "minps %%xmm5, %%xmm3 \n\t"
1195 "movaps %%xmm0, (%1, %0) \n\t"
1196 "movaps %%xmm1, 16(%1, %0) \n\t"
1197 "movaps %%xmm2, 32(%1, %0) \n\t"
1198 "movaps %%xmm3, 48(%1, %0) \n\t"
1202 : "r"(dst), "r"(src), "m"(min), "m"(max)
1207 #endif /* HAVE_INLINE_ASM */
1209 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
1210 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
1212 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1214 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1216 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1218 int order, int mul);
1219 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1221 int order, int mul);
1222 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1224 int order, int mul);
1226 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1227 const int16_t *window, unsigned int len);
1228 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1229 const int16_t *window, unsigned int len);
1230 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1231 const int16_t *window, unsigned int len);
1232 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1233 const int16_t *window, unsigned int len);
1234 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1235 const int16_t *window, unsigned int len);
1236 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1237 const int16_t *window, unsigned int len);
1239 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1240 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1242 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1243 const uint8_t *diff, int w,
1244 int *left, int *left_top);
1245 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1247 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1250 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1251 int32_t min, int32_t max, unsigned int len);
1252 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1253 int32_t min, int32_t max, unsigned int len);
1254 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1255 int32_t min, int32_t max, unsigned int len);
1256 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1257 int32_t min, int32_t max, unsigned int len);
/* Fill one row of a DSPContext quarter-pel function table: entry
 * [IDX][i] is assigned the mcXY variant where X = i & 3 is the
 * horizontal and Y = i >> 2 the vertical sub-pel position, for the
 * given operation PFX, block SIZE and CPU suffix.  (Comments cannot be
 * placed inside the macro body because of the line continuations.) */
1259 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1261 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1262 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1263 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1264 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1265 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1266 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1267 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1268 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1269 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1270 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1271 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1272 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1273 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1274 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1275 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1276 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Install the plain-MMX implementations into the DSPContext: clamped
 * pixel stores, block clearing, edge drawing, IDCT selection, byte
 * addition, H.263 loop filters and 32-bit vector clipping.  Only
 * overrides entries for which an MMX version exists. */
1279 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1282 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1285 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1286 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1287 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
/* the following kernels handle 8-bit pixels only */
1289 if (!high_bit_depth) {
1290 c->clear_block = clear_block_mmx;
1291 c->clear_blocks = clear_blocks_mmx;
1292 c->draw_edges = draw_edges_mmx;
/* select the IDCT implementation requested by the caller */
1294 switch (avctx->idct_algo) {
1296 case FF_IDCT_SIMPLEMMX:
1297 c->idct_put = ff_simple_idct_put_mmx;
1298 c->idct_add = ff_simple_idct_add_mmx;
1299 c->idct = ff_simple_idct_mmx;
/* the simple MMX IDCT needs its own coefficient permutation */
1300 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1302 case FF_IDCT_XVIDMMX:
1303 c->idct_put = ff_idct_xvid_mmx_put;
1304 c->idct_add = ff_idct_xvid_mmx_add;
1305 c->idct = ff_idct_xvid_mmx;
1312 c->add_bytes = add_bytes_mmx;
1313 #endif /* HAVE_INLINE_ASM */
1316 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1317 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1318 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1321 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install the MMXEXT implementations: quarter-pel MC tables, the
 * MMXEXT Xvid IDCT, HuffYUV median prediction, int16 scalar products
 * and windowing. */
1326 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1329 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
/* qpel MC tables: index 0 = 16x16 blocks, index 1 = 8x8 blocks */
1332 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1333 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1335 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1336 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1337 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1338 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1339 #endif /* HAVE_YASM */
/* upgrade the Xvid IDCT chosen by dsputil_init_mmx (8-bit only) */
1342 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1343 c->idct_put = ff_idct_xvid_mmxext_put;
1344 c->idct_add = ff_idct_xvid_mmxext_add;
1345 c->idct = ff_idct_xvid_mmxext;
1347 #endif /* HAVE_INLINE_ASM */
1349 #if HAVE_MMXEXT_EXTERNAL
1350 /* slower than cmov version on AMD */
1351 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1352 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1354 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1355 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* bit-exact mode takes the non-rounding window; the rounding variant
 * is assigned on the (not visible here) else path */
1357 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1358 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1360 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1362 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install the SSE implementations: 16-byte block clearing (when
 * alignment can be guaranteed) and float vector clipping. */
1365 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1369 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1371 if (!high_bit_depth) {
1372 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1373 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1374 c->clear_block = clear_block_sse;
1375 c->clear_blocks = clear_blocks_sse;
1379 c->vector_clipf = vector_clipf_sse;
1380 #endif /* HAVE_INLINE_ASM */
/* Install the SSE2 implementations: Xvid IDCT, int16 scalar products,
 * 32-bit clipping, windowing and 32-bit byte swapping. */
1383 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1386 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1388 #if HAVE_SSE2_INLINE
1389 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1390 c->idct_put = ff_idct_xvid_sse2_put;
1391 c->idct_add = ff_idct_xvid_sse2_add;
1392 c->idct = ff_idct_xvid_sse2;
/* SSE2 Xvid IDCT uses its own coefficient permutation */
1393 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1395 #endif /* HAVE_SSE2_INLINE */
1397 #if HAVE_SSE2_EXTERNAL
1398 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1399 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* Atom gets the integer-unit clip variant */
1400 if (mm_flags & AV_CPU_FLAG_ATOM) {
1401 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1403 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
/* non-rounding window in bit-exact mode; rounding variant only on
 * CPUs where SSE2 is not flagged slow */
1405 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1406 c->apply_window_int16 = ff_apply_window_int16_sse2;
1407 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1408 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1410 c->bswap_buf = ff_bswap32_buf_sse2;
1411 #endif /* HAVE_SSE2_EXTERNAL */
/* Install the SSSE3 implementations: HuffYUV left prediction, windowing
 * (with an Atom-tuned variant), scalar product and byte swapping. */
1414 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1417 #if HAVE_SSSE3_EXTERNAL
1418 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1419 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1420 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
/* Atom-tuned window function when running on Atom */
1422 if (mm_flags & AV_CPU_FLAG_ATOM)
1423 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1425 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1426 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1427 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1428 c->bswap_buf = ff_bswap32_buf_ssse3;
1429 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install the SSE4 implementations (currently only 32-bit clipping). */
1432 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1435 #if HAVE_SSE4_EXTERNAL
1436 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1437 #endif /* HAVE_SSE4_EXTERNAL */
1440 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1442 int mm_flags = av_get_cpu_flags();
1444 #if HAVE_7REGS && HAVE_INLINE_ASM
1445 if (mm_flags & AV_CPU_FLAG_CMOV)
1446 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1449 if (mm_flags & AV_CPU_FLAG_MMX)
1450 dsputil_init_mmx(c, avctx, mm_flags);
1452 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1453 dsputil_init_mmxext(c, avctx, mm_flags);
1455 if (mm_flags & AV_CPU_FLAG_SSE)
1456 dsputil_init_sse(c, avctx, mm_flags);
1458 if (mm_flags & AV_CPU_FLAG_SSE2)
1459 dsputil_init_sse2(c, avctx, mm_flags);
1461 if (mm_flags & AV_CPU_FLAG_SSSE3)
1462 dsputil_init_ssse3(c, avctx, mm_flags);
1464 if (mm_flags & AV_CPU_FLAG_SSE4)
1465 dsputil_init_sse4(c, avctx, mm_flags);
1467 if (CONFIG_ENCODERS)
1468 ff_dsputilenc_init_mmx(c, avctx);