2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
/* Pixel operation constants.
 * "pw" tables replicate a 16-bit word in every lane, "pb" tables
 * replicate a byte in every lane, and "pd" tables hold packed doubles.
 * 8-byte-aligned uint64_t entries feed MMX code; 16-byte-aligned
 * xmm_reg entries feed SSE2 code. */
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
61 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
62 ptrdiff_t line_size, int h);
63 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
64 ptrdiff_t line_size, int h);
65 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
66 int dstStride, int src1Stride, int h);
67 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
68 uint8_t *src2, int dstStride,
69 int src1Stride, int h);
70 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
71 int dstStride, int src1Stride, int h);
72 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
73 ptrdiff_t line_size, int h);
74 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
75 ptrdiff_t line_size, int h);
76 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
77 int dstStride, int src1Stride, int h);
78 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
79 int dstStride, int src1Stride, int h);
80 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
81 int dstStride, int src1Stride, int h);
82 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
83 ptrdiff_t line_size, int h);
84 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
85 ptrdiff_t line_size, int h);
86 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
87 const uint8_t *pixels,
88 ptrdiff_t line_size, int h);
89 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
90 const uint8_t *pixels,
91 ptrdiff_t line_size, int h);
92 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
93 ptrdiff_t line_size, int h);
94 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
95 ptrdiff_t line_size, int h);
96 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
97 ptrdiff_t line_size, int h);
98 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
99 ptrdiff_t line_size, int h);
100 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
101 const uint8_t *pixels,
102 ptrdiff_t line_size, int h);
103 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
104 const uint8_t *pixels,
105 ptrdiff_t line_size, int h);
106 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
107 ptrdiff_t line_size, int h);
108 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
109 ptrdiff_t line_size, int h);
110 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
111 ptrdiff_t line_size, int h);
112 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
113 ptrdiff_t line_size, int h);
114 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
115 ptrdiff_t line_size, int h);
116 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
117 ptrdiff_t line_size, int h);
118 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
119 ptrdiff_t line_size, int h);
121 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
122 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
123 ptrdiff_t line_size, int h)
125 ff_put_pixels8_mmxext(block, pixels, line_size, h);
126 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
129 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
130 int dstStride, int srcStride, int h);
131 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
132 int dstStride, int srcStride, int h);
133 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
134 int dstStride, int srcStride,
136 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
137 int dstStride, int srcStride, int h);
138 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
139 int dstStride, int srcStride, int h);
140 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
141 int dstStride, int srcStride,
143 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
144 int dstStride, int srcStride);
145 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
146 int dstStride, int srcStride);
147 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
148 int dstStride, int srcStride);
149 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
150 int dstStride, int srcStride);
151 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
152 int dstStride, int srcStride);
153 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
154 int dstStride, int srcStride);
155 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
156 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
157 #endif /* HAVE_YASM */
/* Align the following code location to an 8-byte boundary (".p2align 3"). */
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear the given MMX register (xor it with itself). */
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
165 #define MOVQ_BFE(regd) \
167 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
168 "paddb %%"#regd", %%"#regd" \n\t" ::)
171 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
172 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
// For shared libraries it is better to synthesize these constants in
// registers rather than load them from memory.
176 #define MOVQ_BONE(regd) \
178 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
179 "psrlw $15, %%"#regd" \n\t" \
180 "packuswb %%"#regd", %%"#regd" \n\t" ::)
182 #define MOVQ_WTWO(regd) \
184 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
185 "psrlw $15, %%"#regd" \n\t" \
186 "psllw $1, %%"#regd" \n\t"::)
// regr is used as a temporary and holds the output result;
// the first argument is unmodified, the second is trashed.
// regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average rounding down: regr = (a & b) + (((a ^ b) & 0xfe) >> 1).
 * rega is unmodified, regb is trashed, regr receives the result;
 * regfe must contain 0xfefefefefefefefe. */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
/* Byte-wise average rounding up: regr = (a | b) - (((a ^ b) & 0xfe) >> 1).
 * rega is unmodified, regb is trashed, regr receives the result;
 * regfe must contain 0xfefefefefefefefe. */
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"
209 // mm6 is supposed to contain 0xfefefefefefefefe
/* Two byte-wise averages rounding down, computed in parallel:
 * regr = avg_down(rega, regb), regp = avg_down(regc, regd).
 * rega/regc are unmodified, regb/regd are trashed;
 * %%mm6 must contain 0xfefefefefefefefe. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
/* Two byte-wise averages rounding up, computed in parallel:
 * regr = avg_up(rega, regb), regp = avg_up(regc, regd).
 * rega/regc are unmodified, regb/regd are trashed;
 * %%mm6 must contain 0xfefefefefefefefe. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
238 /***********************************/
239 /* MMX no rounding */
241 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
242 #define SET_RND MOVQ_WONE
243 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
244 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
245 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
247 #include "dsputil_rnd_template.c"
254 /***********************************/
257 #define DEF(x, y) x ## _ ## y ## _mmx
258 #define SET_RND MOVQ_WTWO
259 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
260 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
262 #include "dsputil_rnd_template.c"
270 #endif /* HAVE_INLINE_ASM */
275 /***********************************/
278 #define DEF(x) x ## _3dnow
280 #include "dsputil_avg_template.c"
284 /***********************************/
285 /* MMXEXT specific */
287 #define DEF(x) x ## _mmxext
289 #include "dsputil_avg_template.c"
293 #endif /* HAVE_YASM */
297 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
298 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
300 /***********************************/
303 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
309 /* read the pixels */
314 "movq (%3), %%mm0 \n\t"
315 "movq 8(%3), %%mm1 \n\t"
316 "movq 16(%3), %%mm2 \n\t"
317 "movq 24(%3), %%mm3 \n\t"
318 "movq 32(%3), %%mm4 \n\t"
319 "movq 40(%3), %%mm5 \n\t"
320 "movq 48(%3), %%mm6 \n\t"
321 "movq 56(%3), %%mm7 \n\t"
322 "packuswb %%mm1, %%mm0 \n\t"
323 "packuswb %%mm3, %%mm2 \n\t"
324 "packuswb %%mm5, %%mm4 \n\t"
325 "packuswb %%mm7, %%mm6 \n\t"
326 "movq %%mm0, (%0) \n\t"
327 "movq %%mm2, (%0, %1) \n\t"
328 "movq %%mm4, (%0, %1, 2) \n\t"
329 "movq %%mm6, (%0, %2) \n\t"
330 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
333 pix += line_size * 4;
// If this were an exact copy of the code above, the compiler
// would generate some very strange code.
340 "movq (%3), %%mm0 \n\t"
341 "movq 8(%3), %%mm1 \n\t"
342 "movq 16(%3), %%mm2 \n\t"
343 "movq 24(%3), %%mm3 \n\t"
344 "movq 32(%3), %%mm4 \n\t"
345 "movq 40(%3), %%mm5 \n\t"
346 "movq 48(%3), %%mm6 \n\t"
347 "movq 56(%3), %%mm7 \n\t"
348 "packuswb %%mm1, %%mm0 \n\t"
349 "packuswb %%mm3, %%mm2 \n\t"
350 "packuswb %%mm5, %%mm4 \n\t"
351 "packuswb %%mm7, %%mm6 \n\t"
352 "movq %%mm0, (%0) \n\t"
353 "movq %%mm2, (%0, %1) \n\t"
354 "movq %%mm4, (%0, %1, 2) \n\t"
355 "movq %%mm6, (%0, %2) \n\t"
356 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Store half (four 8-pixel rows) of a signed-clamped block:
 * load eight 4-coefficient words starting at byte offset 'off' of the
 * coefficient block (%2), pack them to signed bytes with saturation,
 * add the bias in %%mm0 (the caller loads ff_pb_80 there, converting
 * signed to unsigned pixels), and write four 8-byte rows to %0 with
 * %3 as the line stride and %1 as 3 * stride. */
#define put_signed_pixels_clamped_mmx_half(off) \
"movq "#off"(%2), %%mm1 \n\t" \
"movq 16 + "#off"(%2), %%mm2 \n\t" \
"movq 32 + "#off"(%2), %%mm3 \n\t" \
"movq 48 + "#off"(%2), %%mm4 \n\t" \
"packsswb 8 + "#off"(%2), %%mm1 \n\t" \
"packsswb 24 + "#off"(%2), %%mm2 \n\t" \
"packsswb 40 + "#off"(%2), %%mm3 \n\t" \
"packsswb 56 + "#off"(%2), %%mm4 \n\t" \
"paddb %%mm0, %%mm1 \n\t" \
"paddb %%mm0, %%mm2 \n\t" \
"paddb %%mm0, %%mm3 \n\t" \
"paddb %%mm0, %%mm4 \n\t" \
"movq %%mm1, (%0) \n\t" \
"movq %%mm2, (%0, %3) \n\t" \
"movq %%mm3, (%0, %3, 2) \n\t" \
"movq %%mm4, (%0, %1) \n\t"
378 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
381 x86_reg line_skip = line_size;
385 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
386 "lea (%3, %3, 2), %1 \n\t"
387 put_signed_pixels_clamped_mmx_half(0)
388 "lea (%0, %3, 4), %0 \n\t"
389 put_signed_pixels_clamped_mmx_half(64)
390 : "+&r"(pixels), "=&r"(line_skip3)
391 : "r"(block), "r"(line_skip)
395 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
402 /* read the pixels */
409 "movq (%2), %%mm0 \n\t"
410 "movq 8(%2), %%mm1 \n\t"
411 "movq 16(%2), %%mm2 \n\t"
412 "movq 24(%2), %%mm3 \n\t"
413 "movq %0, %%mm4 \n\t"
414 "movq %1, %%mm6 \n\t"
415 "movq %%mm4, %%mm5 \n\t"
416 "punpcklbw %%mm7, %%mm4 \n\t"
417 "punpckhbw %%mm7, %%mm5 \n\t"
418 "paddsw %%mm4, %%mm0 \n\t"
419 "paddsw %%mm5, %%mm1 \n\t"
420 "movq %%mm6, %%mm5 \n\t"
421 "punpcklbw %%mm7, %%mm6 \n\t"
422 "punpckhbw %%mm7, %%mm5 \n\t"
423 "paddsw %%mm6, %%mm2 \n\t"
424 "paddsw %%mm5, %%mm3 \n\t"
425 "packuswb %%mm1, %%mm0 \n\t"
426 "packuswb %%mm3, %%mm2 \n\t"
427 "movq %%mm0, %0 \n\t"
428 "movq %%mm2, %1 \n\t"
429 : "+m"(*pix), "+m"(*(pix + line_size))
432 pix += line_size * 2;
437 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
438 ptrdiff_t line_size, int h)
441 "lea (%3, %3), %%"REG_a" \n\t"
444 "movq (%1 ), %%mm0 \n\t"
445 "movq (%1, %3), %%mm1 \n\t"
446 "movq %%mm0, (%2) \n\t"
447 "movq %%mm1, (%2, %3) \n\t"
448 "add %%"REG_a", %1 \n\t"
449 "add %%"REG_a", %2 \n\t"
450 "movq (%1 ), %%mm0 \n\t"
451 "movq (%1, %3), %%mm1 \n\t"
452 "movq %%mm0, (%2) \n\t"
453 "movq %%mm1, (%2, %3) \n\t"
454 "add %%"REG_a", %1 \n\t"
455 "add %%"REG_a", %2 \n\t"
458 : "+g"(h), "+r"(pixels), "+r"(block)
459 : "r"((x86_reg)line_size)
464 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
465 ptrdiff_t line_size, int h)
468 "lea (%3, %3), %%"REG_a" \n\t"
471 "movq (%1 ), %%mm0 \n\t"
472 "movq 8(%1 ), %%mm4 \n\t"
473 "movq (%1, %3), %%mm1 \n\t"
474 "movq 8(%1, %3), %%mm5 \n\t"
475 "movq %%mm0, (%2) \n\t"
476 "movq %%mm4, 8(%2) \n\t"
477 "movq %%mm1, (%2, %3) \n\t"
478 "movq %%mm5, 8(%2, %3) \n\t"
479 "add %%"REG_a", %1 \n\t"
480 "add %%"REG_a", %2 \n\t"
481 "movq (%1 ), %%mm0 \n\t"
482 "movq 8(%1 ), %%mm4 \n\t"
483 "movq (%1, %3), %%mm1 \n\t"
484 "movq 8(%1, %3), %%mm5 \n\t"
485 "movq %%mm0, (%2) \n\t"
486 "movq %%mm4, 8(%2) \n\t"
487 "movq %%mm1, (%2, %3) \n\t"
488 "movq %%mm5, 8(%2, %3) \n\t"
489 "add %%"REG_a", %1 \n\t"
490 "add %%"REG_a", %2 \n\t"
493 : "+g"(h), "+r"(pixels), "+r"(block)
494 : "r"((x86_reg)line_size)
499 #define CLEAR_BLOCKS(name, n) \
500 static void name(int16_t *blocks) \
503 "pxor %%mm7, %%mm7 \n\t" \
504 "mov %1, %%"REG_a" \n\t" \
506 "movq %%mm7, (%0, %%"REG_a") \n\t" \
507 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
508 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
509 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
510 "add $32, %%"REG_a" \n\t" \
512 :: "r"(((uint8_t *)blocks) + 128 * n), \
517 CLEAR_BLOCKS(clear_blocks_mmx, 6)
518 CLEAR_BLOCKS(clear_block_mmx, 1)
520 static void clear_block_sse(int16_t *block)
523 "xorps %%xmm0, %%xmm0 \n"
524 "movaps %%xmm0, (%0) \n"
525 "movaps %%xmm0, 16(%0) \n"
526 "movaps %%xmm0, 32(%0) \n"
527 "movaps %%xmm0, 48(%0) \n"
528 "movaps %%xmm0, 64(%0) \n"
529 "movaps %%xmm0, 80(%0) \n"
530 "movaps %%xmm0, 96(%0) \n"
531 "movaps %%xmm0, 112(%0) \n"
537 static void clear_blocks_sse(int16_t *blocks)
540 "xorps %%xmm0, %%xmm0 \n"
541 "mov %1, %%"REG_a" \n"
543 "movaps %%xmm0, (%0, %%"REG_a") \n"
544 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
545 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
546 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
550 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
551 "add $128, %%"REG_a" \n"
553 :: "r"(((uint8_t *)blocks) + 128 * 6),
559 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
565 "movq (%1, %0), %%mm0 \n\t"
566 "movq (%2, %0), %%mm1 \n\t"
567 "paddb %%mm0, %%mm1 \n\t"
568 "movq %%mm1, (%2, %0) \n\t"
569 "movq 8(%1, %0), %%mm0 \n\t"
570 "movq 8(%2, %0), %%mm1 \n\t"
571 "paddb %%mm0, %%mm1 \n\t"
572 "movq %%mm1, 8(%2, %0) \n\t"
578 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
581 dst[i + 0] += src[i + 0];
585 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
586 const uint8_t *diff, int w,
587 int *left, int *left_top)
591 int l = *left & 0xff;
592 int tl = *left_top & 0xff;
597 "movzbl (%3, %4), %2 \n"
610 "add (%6, %4), %b0 \n"
611 "mov %b0, (%5, %4) \n"
614 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
615 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
621 #endif /* HAVE_INLINE_ASM */
623 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
624 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
627 /* Draw the edges of width 'w' of an image of size width, height
628 * this MMX version can only handle w == 8 || w == 16. */
629 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
630 int w, int h, int sides)
632 uint8_t *ptr, *last_line;
635 last_line = buf + (height - 1) * wrap;
641 "movd (%0), %%mm0 \n\t"
642 "punpcklbw %%mm0, %%mm0 \n\t"
643 "punpcklwd %%mm0, %%mm0 \n\t"
644 "punpckldq %%mm0, %%mm0 \n\t"
645 "movq %%mm0, -8(%0) \n\t"
646 "movq -8(%0, %2), %%mm1 \n\t"
647 "punpckhbw %%mm1, %%mm1 \n\t"
648 "punpckhwd %%mm1, %%mm1 \n\t"
649 "punpckhdq %%mm1, %%mm1 \n\t"
650 "movq %%mm1, (%0, %2) \n\t"
655 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
660 "movd (%0), %%mm0 \n\t"
661 "punpcklbw %%mm0, %%mm0 \n\t"
662 "punpcklwd %%mm0, %%mm0 \n\t"
663 "punpckldq %%mm0, %%mm0 \n\t"
664 "movq %%mm0, -8(%0) \n\t"
665 "movq %%mm0, -16(%0) \n\t"
666 "movq -8(%0, %2), %%mm1 \n\t"
667 "punpckhbw %%mm1, %%mm1 \n\t"
668 "punpckhwd %%mm1, %%mm1 \n\t"
669 "punpckhdq %%mm1, %%mm1 \n\t"
670 "movq %%mm1, (%0, %2) \n\t"
671 "movq %%mm1, 8(%0, %2) \n\t"
676 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
680 /* top and bottom (and hopefully also the corners) */
681 if (sides & EDGE_TOP) {
682 for (i = 0; i < h; i += 4) {
683 ptr = buf - (i + 1) * wrap - w;
686 "movq (%1, %0), %%mm0 \n\t"
687 "movq %%mm0, (%0) \n\t"
688 "movq %%mm0, (%0, %2) \n\t"
689 "movq %%mm0, (%0, %2, 2) \n\t"
690 "movq %%mm0, (%0, %3) \n\t"
695 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
696 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
701 if (sides & EDGE_BOTTOM) {
702 for (i = 0; i < h; i += 4) {
703 ptr = last_line + (i + 1) * wrap - w;
706 "movq (%1, %0), %%mm0 \n\t"
707 "movq %%mm0, (%0) \n\t"
708 "movq %%mm0, (%0, %2) \n\t"
709 "movq %%mm0, (%0, %2, 2) \n\t"
710 "movq %%mm0, (%0, %3) \n\t"
715 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
716 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
717 "r"(ptr + width + 2 * w)
722 #endif /* HAVE_INLINE_ASM */
726 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
727 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
730 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
733 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
737 uint8_t * const half = (uint8_t*)temp; \
738 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
740 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
741 stride, stride, 8); \
744 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
747 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
751 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
755 uint8_t * const half = (uint8_t*)temp; \
756 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
758 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
762 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
766 uint8_t * const half = (uint8_t*)temp; \
767 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
769 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
770 stride, stride, 8); \
773 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
776 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
780 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
784 uint8_t * const half = (uint8_t*)temp; \
785 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
787 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
791 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
794 uint64_t half[8 + 9]; \
795 uint8_t * const halfH = ((uint8_t*)half) + 64; \
796 uint8_t * const halfHV = ((uint8_t*)half); \
797 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
799 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
801 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
802 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
806 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
809 uint64_t half[8 + 9]; \
810 uint8_t * const halfH = ((uint8_t*)half) + 64; \
811 uint8_t * const halfHV = ((uint8_t*)half); \
812 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
814 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
816 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
817 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
821 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
824 uint64_t half[8 + 9]; \
825 uint8_t * const halfH = ((uint8_t*)half) + 64; \
826 uint8_t * const halfHV = ((uint8_t*)half); \
827 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
829 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
831 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
832 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
836 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
839 uint64_t half[8 + 9]; \
840 uint8_t * const halfH = ((uint8_t*)half) + 64; \
841 uint8_t * const halfHV = ((uint8_t*)half); \
842 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
844 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
846 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
847 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
851 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
854 uint64_t half[8 + 9]; \
855 uint8_t * const halfH = ((uint8_t*)half) + 64; \
856 uint8_t * const halfHV = ((uint8_t*)half); \
857 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
859 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
860 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
864 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
867 uint64_t half[8 + 9]; \
868 uint8_t * const halfH = ((uint8_t*)half) + 64; \
869 uint8_t * const halfHV = ((uint8_t*)half); \
870 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
872 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
873 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
877 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
880 uint64_t half[8 + 9]; \
881 uint8_t * const halfH = ((uint8_t*)half); \
882 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
884 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
886 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
890 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
893 uint64_t half[8 + 9]; \
894 uint8_t * const halfH = ((uint8_t*)half); \
895 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
897 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
899 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
903 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
907 uint8_t * const halfH = ((uint8_t*)half); \
908 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
910 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
914 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
917 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
920 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
924 uint8_t * const half = (uint8_t*)temp; \
925 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
927 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
931 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
934 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
935 stride, stride, 16);\
938 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
942 uint8_t * const half = (uint8_t*)temp; \
943 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
945 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
946 stride, stride, 16); \
949 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
953 uint8_t * const half = (uint8_t*)temp; \
954 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
956 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
960 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
963 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
967 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
971 uint8_t * const half = (uint8_t*)temp; \
972 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
974 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
975 stride, stride, 16); \
978 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
981 uint64_t half[16 * 2 + 17 * 2]; \
982 uint8_t * const halfH = ((uint8_t*)half) + 256; \
983 uint8_t * const halfHV = ((uint8_t*)half); \
984 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
986 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
988 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
990 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
994 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
997 uint64_t half[16 * 2 + 17 * 2]; \
998 uint8_t * const halfH = ((uint8_t*)half) + 256; \
999 uint8_t * const halfHV = ((uint8_t*)half); \
1000 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1002 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1004 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1006 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1010 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1013 uint64_t half[16 * 2 + 17 * 2]; \
1014 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1015 uint8_t * const halfHV = ((uint8_t*)half); \
1016 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1018 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1020 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1022 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1026 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1029 uint64_t half[16 * 2 + 17 * 2]; \
1030 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1031 uint8_t * const halfHV = ((uint8_t*)half); \
1032 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1034 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1036 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1038 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1042 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1045 uint64_t half[16 * 2 + 17 * 2]; \
1046 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1047 uint8_t * const halfHV = ((uint8_t*)half); \
1048 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1050 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1052 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1056 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1059 uint64_t half[16 * 2 + 17 * 2]; \
1060 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1061 uint8_t * const halfHV = ((uint8_t*)half); \
1062 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1064 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1066 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1070 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1073 uint64_t half[17 * 2]; \
1074 uint8_t * const halfH = ((uint8_t*)half); \
1075 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1077 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1079 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1083 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1086 uint64_t half[17 * 2]; \
1087 uint8_t * const halfH = ((uint8_t*)half); \
1088 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1090 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1092 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1096 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1099 uint64_t half[17 * 2]; \
1100 uint8_t * const halfH = ((uint8_t*)half); \
1101 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1103 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1107 QPEL_OP(put_, ff_pw_16, _, mmxext)
1108 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1109 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
1110 #endif /* HAVE_YASM */
1114 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1116 put_pixels8_xy2_mmx(dst, src, stride, 8);
1118 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1120 put_pixels16_xy2_mmx(dst, src, stride, 16);
1122 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1124 avg_pixels8_xy2_mmx(dst, src, stride, 8);
1126 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
1128 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/*
 * MMX global motion compensation: each output pixel is a bilinear blend
 * of four neighbouring source samples, processed 4 pixels per inner asm
 * step.  Subpel coordinates keep 12 fractional bits (the ">> 4"
 * reductions below combined with the "psrlw $12" in the asm).  Blocks
 * that need edge handling, have a non-constant fullpel offset, or use
 * more than 16 bits of subpel motion fall back to the C ff_gmc_c().
 * NOTE(review): braces, the declarations of w/x/y, the __asm__ statement
 * openers/closers and some loop glue are missing from this excerpt; all
 * visible statements are kept byte-identical.
 */
static void gmc_mmx(uint8_t *dst, uint8_t *src,
                    int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy,
                    int shift, int r, int width, int height)
    /* fullpel part of the start offset */
    const int ix = ox >> (16 + shift);
    const int iy = oy >> (16 + shift);
    /* start position and per-step deltas with the low 4 fractional bits
     * dropped (16 -> 12 fractional bits) */
    const int oxs = ox >> 4;
    const int oys = oy >> 4;
    const int dxxs = dxx >> 4;
    const int dxys = dxy >> 4;
    const int dyxs = dyx >> 4;
    const int dyys = dyy >> 4;
    /* rounder and per-row increments broadcast to four 16-bit lanes */
    const uint16_t r4[4] = { r, r, r, r };
    const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
    const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
    /* final down-scale amount, passed to "psrlw" as a memory operand */
    const uint64_t shift2 = 2 * shift;

    /* spans of the offset across the block, used to detect whether the
     * fullpel part of the offset changes inside the block */
    const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
    const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
    const int dxh = dxy * (h - 1);
    const int dyw = dyx * (w - 1);
    if ( // non-constant fullpel offset (3% of blocks)
        ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
         (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx | dxy | dyx | dyy) & 15 ||
        (unsigned)ix >= width - w ||
        (unsigned)iy >= height - h) {
        // FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
                 shift, r, width, height);

    src += ix + iy * stride;

    /* mm6 = scale constant broadcast to all 16-bit lanes, mm7 = 0 (used
     * below to zero-extend bytes);
     * NOTE(review): this __asm__ statement's opener and input operand
     * are not visible in this excerpt */
    "movd %0, %%mm6 \n\t"
    "pxor %%mm7, %%mm7 \n\t"
    "punpcklwd %%mm6, %%mm6 \n\t"
    "punpcklwd %%mm6, %%mm6 \n\t"

    for (x = 0; x < w; x += 4) {
        /* subpel x/y coordinates of 4 adjacent output pixels, pre-stepped
         * back one row: the per-row "paddw" below advances them before
         * their first use */
        uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
                            oxs - dxys + dxxs * (x + 1),
                            oxs - dxys + dxxs * (x + 2),
                            oxs - dxys + dxxs * (x + 3) };
        uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
                            oys - dyys + dyxs * (x + 1),
                            oys - dyys + dyxs * (x + 2),
                            oys - dyys + dyxs * (x + 3) };

        for (y = 0; y < h; y++) {
            /* advance the 4 coords one row, store them back, and keep
             * only the fractional top bits (>> 12) in mm4/mm5 */
            "movq %0, %%mm4 \n\t"
            "movq %1, %%mm5 \n\t"
            "paddw %2, %%mm4 \n\t"
            "paddw %3, %%mm5 \n\t"
            "movq %%mm4, %0 \n\t"
            "movq %%mm5, %1 \n\t"
            "psrlw $12, %%mm4 \n\t"
            "psrlw $12, %%mm5 \n\t"
            : "+m"(*dx4), "+m"(*dy4)
            : "m"(*dxy4), "m"(*dyy4)

            /* derive the four bilinear weights from dx (mm4), dy (mm5)
             * and the scale s (mm6) */
            "movq %%mm6, %%mm2 \n\t"
            "movq %%mm6, %%mm1 \n\t"
            "psubw %%mm4, %%mm2 \n\t"
            "psubw %%mm5, %%mm1 \n\t"
            "movq %%mm2, %%mm0 \n\t"
            "movq %%mm4, %%mm3 \n\t"
            "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
            "pmullw %%mm5, %%mm3 \n\t" // dx * dy
            "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
            "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)

            /* weight the bottom source row (zero-extended to words) */
            "movd %4, %%mm5 \n\t"
            "movd %3, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
            "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy

            /* weight the top source row */
            "movd %2, %%mm5 \n\t"
            "movd %1, %%mm4 \n\t"
            "punpcklbw %%mm7, %%mm5 \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
            "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
            /* sum the four weighted terms plus the rounder r4 */
            "paddw %5, %%mm1 \n\t"
            "paddw %%mm3, %%mm2 \n\t"
            "paddw %%mm1, %%mm0 \n\t"
            "paddw %%mm2, %%mm0 \n\t"

            /* scale back down, saturate to bytes, store 4 output pixels */
            "psrlw %6, %%mm0 \n\t"
            "packuswb %%mm0, %%mm0 \n\t"
            "movd %%mm0, %0 \n\t"

            : "=m"(dst[x + y * stride])
            : "m"(src[0]), "m"(src[1]),
              "m"(src[stride]), "m"(src[stride + 1]),
              "m"(*r4), "m"(shift2)

        /* step to the next 4-pixel column: undo the h rows walked and
         * move 4 pixels right */
        src += 4 - h * stride;
#endif /* HAVE_INLINE_ASM */
1248 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1249 ptrdiff_t line_size, int h);
1250 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1251 ptrdiff_t line_size, int h);
/**
 * CAVS mc00 (no subpel offset) put for an 8x8 block — a plain block
 * copy, forwarded to the MMX 8-pixel copy kernel.
 */
void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels8_mmx(dst, src, stride, 8);
}
/**
 * CAVS mc00 avg for an 8x8 block — averages the source block into the
 * destination via the MMX 8-pixel averaging kernel.
 */
void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels8_mmx(dst, src, stride, 8);
}
/**
 * CAVS mc00 (no subpel offset) put for a 16x16 block — a plain block
 * copy, forwarded to the MMX 16-pixel copy kernel.
 */
void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    put_pixels16_mmx(dst, src, stride, 16);
}
/**
 * CAVS mc00 avg for a 16x16 block — averages the source block into the
 * destination via the MMX 16-pixel averaging kernel.
 */
void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
{
    avg_pixels16_mmx(dst, src, stride, 16);
}
/**
 * VC-1 mspel mc00 put for an 8x8 block: an exact copy, so the rounding
 * parameter is not consulted.
 *
 * @param rnd  rounding mode — unused for the mc00 (copy) case
 */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
                               ptrdiff_t stride, int rnd)
{
    put_pixels8_mmx(dst, src, stride, 8);
}
/*
 * Clamp an array of floats into [min, max] with SSE, 16 floats (four
 * xmm registers) per iteration.  Requires 16-byte aligned, 16-float
 * padded buffers ("movaps", presumably len a multiple of 16 — TODO
 * confirm against callers).
 * NOTE(review): the __asm__ opener, the loop label/branch and the
 * output/clobber lines are missing from this excerpt; the visible
 * statements are kept byte-identical.
 */
static void vector_clipf_sse(float *dst, const float *src,
                             float min, float max, int len)
    /* byte offset of the last 16-float group; presumably decremented by
     * 64 per iteration in the missing loop glue — verify */
    x86_reg i = (len - 16) * 4;
    /* broadcast min into xmm4 and max into xmm5 */
    "movss %3, %%xmm4 \n\t"
    "movss %4, %%xmm5 \n\t"
    "shufps $0, %%xmm4, %%xmm4 \n\t"
    "shufps $0, %%xmm5, %%xmm5 \n\t"
    /* load 64 bytes, clamp low then high, store */
    "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
    "movaps 16(%2, %0), %%xmm1 \n\t"
    "movaps 32(%2, %0), %%xmm2 \n\t"
    "movaps 48(%2, %0), %%xmm3 \n\t"
    "maxps %%xmm4, %%xmm0 \n\t"
    "maxps %%xmm4, %%xmm1 \n\t"
    "maxps %%xmm4, %%xmm2 \n\t"
    "maxps %%xmm4, %%xmm3 \n\t"
    "minps %%xmm5, %%xmm0 \n\t"
    "minps %%xmm5, %%xmm1 \n\t"
    "minps %%xmm5, %%xmm2 \n\t"
    "minps %%xmm5, %%xmm3 \n\t"
    "movaps %%xmm0, (%1, %0) \n\t"
    "movaps %%xmm1, 16(%1, %0) \n\t"
    "movaps %%xmm2, 32(%1, %0) \n\t"
    "movaps %%xmm3, 48(%1, %0) \n\t"
    : "r"(dst), "r"(src), "m"(min), "m"(max)
#endif /* HAVE_INLINE_ASM */
1319 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1321 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1323 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1325 int order, int mul);
1326 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1328 int order, int mul);
1329 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1331 int order, int mul);
1333 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1334 const int16_t *window, unsigned int len);
1335 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1336 const int16_t *window, unsigned int len);
1337 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1338 const int16_t *window, unsigned int len);
1339 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1340 const int16_t *window, unsigned int len);
1341 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1342 const int16_t *window, unsigned int len);
1343 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1344 const int16_t *window, unsigned int len);
1346 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1347 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1349 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1350 const uint8_t *diff, int w,
1351 int *left, int *left_top);
1352 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1354 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1357 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1358 int32_t min, int32_t max, unsigned int len);
1359 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1360 int32_t min, int32_t max, unsigned int len);
1361 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1362 int32_t min, int32_t max, unsigned int len);
1363 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1364 int32_t min, int32_t max, unsigned int len);
/*
 * Fill all 16 subpel entries (mc00 .. mc33) of a quarter-pel motion
 * compensation function table for the given prefix, table index, block
 * size and cpu suffix; PREFIX selects between ff_-prefixed external and
 * unprefixed static names.
 * NOTE(review): the do { ... } while (0) wrapper lines of this macro
 * are not visible in this excerpt.
 */
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
    c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
    c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/*
 * Fill the 4 half-pel entries (copy, x2, y2, xy2) of a pixels function
 * table; IDX is spliced in verbatim so it can be "[0]", "[1]" or empty
 * for the single-table avg_no_rnd case.
 * NOTE(review): the do { ... } while (0) wrapper lines of this macro
 * are not visible in this excerpt.
 */
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
    c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
    c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
    c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
    c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/*
 * Install the plain-MMX optimized DSP functions into the context.
 * NOTE(review): this excerpt is missing structural lines (the
 * "int mm_flags)" parameter line, braces, "break;"s and the
 * HAVE_INLINE_ASM / HAVE_YASM guards); visible statements are kept
 * byte-identical.
 */
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
    /* bit depths > 8 cannot use these 8-bit MMX kernels */
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
    c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
    c->add_pixels_clamped = ff_add_pixels_clamped_mmx;

    if (!high_bit_depth) {
        c->clear_block = clear_block_mmx;
        c->clear_blocks = clear_blocks_mmx;
        c->draw_edges = draw_edges_mmx;

        /* half-pel tables: [0] = 16-pixel wide, [1] = 8-pixel wide;
         * avg_no_rnd has a single (un-indexed) table */
        SET_HPEL_FUNCS(put, [0], 16, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
        SET_HPEL_FUNCS(avg, [0], 16, mmx);
        SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
        SET_HPEL_FUNCS(put, [1], 8, mmx);
        SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
        SET_HPEL_FUNCS(avg, [1], 8, mmx);

    /* NOTE(review): the "break;" lines of this switch are not visible
     * in this excerpt */
    switch (avctx->idct_algo) {
    case FF_IDCT_SIMPLEMMX:
        c->idct_put = ff_simple_idct_put_mmx;
        c->idct_add = ff_simple_idct_add_mmx;
        c->idct = ff_simple_idct_mmx;
        c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
    case FF_IDCT_XVIDMMX:
        c->idct_put = ff_idct_xvid_mmx_put;
        c->idct_add = ff_idct_xvid_mmx_add;
        c->idct = ff_idct_xvid_mmx;

    c->add_bytes = add_bytes_mmx;
#endif /* HAVE_INLINE_ASM */

    if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
        c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
        c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;

    c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/*
 * Install the MMXEXT optimized functions; these override the plain-MMX
 * entries installed earlier by dsputil_init_mmx().
 * NOTE(review): the "int mm_flags)" parameter line, braces and several
 * preprocessor guard lines are missing from this excerpt; visible
 * statements are kept byte-identical.
 */
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
    const int bit_depth = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

    /* full 16-entry quarter-pel tables: [0] = 16x16, [1] = 8x8 */
    SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );

    SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
    SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;

    /* the no-rounding variants are skipped in bit-exact mode */
    if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
        if (!high_bit_depth) {
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
#endif /* HAVE_YASM */

    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put = ff_idct_xvid_mmxext_put;
        c->idct_add = ff_idct_xvid_mmxext_add;
        c->idct = ff_idct_xvid_mmxext;
#endif /* HAVE_INLINE_ASM */

#if HAVE_MMXEXT_EXTERNAL
    /* VP3/Theora need the bit-exact no-rounding variants */
    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;

    /* slower than cmov version on AMD */
    if (!(mm_flags & AV_CPU_FLAG_3DNOW))
        c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;

    c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;

    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_mmxext;
        /* NOTE(review): a "} else {" line between these two assignments
         * is missing from this excerpt — as written they would both run */
        c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
#endif /* HAVE_MMXEXT_EXTERNAL */
/*
 * Install the 3DNow! optimized half-pel functions.
 * NOTE(review): the "int mm_flags)" parameter line, braces and the
 * HAVE_YASM guard opener are missing from this excerpt; visible
 * statements are kept byte-identical.
 */
static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
        c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;

        c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
        c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
        c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;

        c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
        c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;

        c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
        c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
        c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;

        /* no-rounding variants are not bit-exact; skip in bitexact mode */
        if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
            c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
            c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
            c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
            c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;

            c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
            c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;

    /* VP3/Theora need the bit-exact no-rounding variants */
    if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
                               avctx->codec_id == AV_CODEC_ID_THEORA)) {
        c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
        c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
#endif /* HAVE_YASM */
/*
 * Install the SSE optimized functions (float clip, 16-byte block clear).
 * NOTE(review): the "int mm_flags)" parameter line, braces and the
 * HAVE_INLINE_ASM guard opener are missing from this excerpt; visible
 * statements are kept byte-identical.
 */
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;

    if (!high_bit_depth) {
        if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
            /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
            c->clear_block = clear_block_sse;
            c->clear_blocks = clear_blocks_sse;

    c->vector_clipf = vector_clipf_sse;
#endif /* HAVE_INLINE_ASM */
/*
 * Install the SSE2 optimized functions (xvid IDCT, 16-pixel copies,
 * scalar products, int32 clip, windowing, bswap).
 * NOTE(review): the "int mm_flags)" parameter line and several brace /
 * "} else {" lines are missing from this excerpt; visible statements
 * are kept byte-identical.
 */
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
    const int bit_depth = avctx->bits_per_raw_sample;
    const int high_bit_depth = bit_depth > 8;

#if HAVE_SSE2_INLINE
    if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
        c->idct_put = ff_idct_xvid_sse2_put;
        c->idct_add = ff_idct_xvid_sse2_add;
        c->idct = ff_idct_xvid_sse2;
        c->idct_permutation_type = FF_SSE2_IDCT_PERM;
#endif /* HAVE_SSE2_INLINE */

#if HAVE_SSE2_EXTERNAL
    if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
            c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;

    c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
    c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    if (mm_flags & AV_CPU_FLAG_ATOM) {
        c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
        /* NOTE(review): a "} else {" line between these two assignments
         * is missing from this excerpt */
        c->vector_clip_int32 = ff_vector_clip_int32_sse2;
    if (avctx->flags & CODEC_FLAG_BITEXACT) {
        c->apply_window_int16 = ff_apply_window_int16_sse2;
    } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
        c->apply_window_int16 = ff_apply_window_int16_round_sse2;
    c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
/*
 * Install the SSSE3 optimized functions.
 * NOTE(review): the "int mm_flags)" parameter line, braces and an
 * "else" between the two apply_window_int16 assignments are missing
 * from this excerpt; visible statements are kept byte-identical.
 */
static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
#if HAVE_SSSE3_EXTERNAL
    c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
    if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;

    if (mm_flags & AV_CPU_FLAG_ATOM)
        c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
        /* NOTE(review): an "else" line before this assignment is missing
         * from this excerpt */
        c->apply_window_int16 = ff_apply_window_int16_ssse3;
    if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
    c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
/*
 * Install the SSE4 optimized int32 clip.
 * NOTE(review): the "int mm_flags)" parameter line and braces are
 * missing from this excerpt; visible statements are kept byte-identical.
 */
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
#if HAVE_SSE4_EXTERNAL
    c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
/*
 * Public entry point: query the CPU capability flags once and run each
 * per-ISA init in increasing capability order, so more capable inits
 * override the function pointers set by the earlier ones.
 * NOTE(review): braces and the closing #endif of the HAVE_7REGS guard
 * are missing from this excerpt; visible statements are kept
 * byte-identical.
 */
av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
    int mm_flags = av_get_cpu_flags();

#if HAVE_7REGS && HAVE_INLINE_ASM
    if (mm_flags & AV_CPU_FLAG_CMOV)
        c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;

    if (mm_flags & AV_CPU_FLAG_MMX)
        dsputil_init_mmx(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_MMXEXT)
        dsputil_init_mmxext(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_3DNOW)
        dsputil_init_3dnow(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE)
        dsputil_init_sse(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE2)
        dsputil_init_sse2(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSSE3)
        dsputil_init_ssse3(c, avctx, mm_flags);

    if (mm_flags & AV_CPU_FLAG_SSE4)
        dsputil_init_sse4(c, avctx, mm_flags);

    if (CONFIG_ENCODERS)
        ff_dsputilenc_init_mmx(c, avctx);