2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "dsputil_mmx.h"
32 #include "idct_xvid.h"
37 /* pixel operations */
/* Packed-constant tables used by the MMX/SSE2 code below.
 * Naming convention: ff_pw_N = packed 16-bit words of value N,
 * ff_pb_XX = packed bytes of value 0xXX, ff_pd_N = packed doubles.
 * 8-byte-aligned uint64_t constants are MMX-sized; 16-byte-aligned
 * xmm_reg constants are full XMM width (the low and high quadwords
 * are identical so the same symbol works for MMX and SSE2 code). */
38 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
39 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
41 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
42 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
43 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
48 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
52 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
57 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
62 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
64 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
66 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
70 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
73 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
75 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
77 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
/* Double-precision constants for the FFT/float code paths. */
80 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
81 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Prototypes for the pixel put/avg primitives implemented in external
 * (yasm) assembly.  The _mmxext and _3dnow suffixes select the
 * instruction-set variant; _x2/_y2/_xy2 denote half-pel interpolation
 * in x, y, or both; _l2 variants average two source planes into dst. */
84 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
85 int line_size, int h);
86 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
87 int line_size, int h);
88 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
89 int dstStride, int src1Stride, int h);
90 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
91 uint8_t *src2, int dstStride,
92 int src1Stride, int h);
93 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
94 int dstStride, int src1Stride, int h);
95 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
96 int line_size, int h);
97 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
98 int line_size, int h);
99 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
100 int dstStride, int src1Stride, int h);
101 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
102 int dstStride, int src1Stride, int h);
103 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104 int dstStride, int src1Stride, int h);
105 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
106 int line_size, int h);
107 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
108 int line_size, int h);
109 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
110 const uint8_t *pixels,
111 int line_size, int h);
112 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
113 const uint8_t *pixels,
114 int line_size, int h);
115 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
116 int line_size, int h);
117 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
118 int line_size, int h);
119 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
120 int line_size, int h);
121 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
122 int line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
124 const uint8_t *pixels,
125 int line_size, int h);
126 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
127 const uint8_t *pixels,
128 int line_size, int h);
129 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
130 int line_size, int h);
131 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
132 int line_size, int h);
133 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
134 int line_size, int h);
135 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
136 int line_size, int h);
137 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
138 int line_size, int h);
139 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
140 int line_size, int h);
141 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
142 int line_size, int h);
143 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
144 int line_size, int h);
146 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, int line_size, int h);
/* 16-pixel-wide copy built from two 8-wide asm calls: left half at
 * 'block', right half at 'block + 8', same height and stride. */
147 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
148 int line_size, int h)
150 ff_put_pixels8_mmxext(block, pixels, line_size, h);
151 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
/* MPEG-4 quarter-pel lowpass filters (external asm); _h_ filters
 * horizontally with explicit height h, _v_ filters vertically with a
 * height fixed by the function's block size. */
154 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
155 int dstStride, int srcStride, int h);
156 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
157 int dstStride, int srcStride, int h);
158 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159 int dstStride, int srcStride,
161 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
162 int dstStride, int srcStride, int h);
163 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
164 int dstStride, int srcStride, int h);
165 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166 int dstStride, int srcStride,
168 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
169 int dstStride, int srcStride);
170 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
171 int dstStride, int srcStride);
172 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173 int dstStride, int srcStride);
174 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175 int dstStride, int srcStride);
176 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177 int dstStride, int srcStride);
178 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179 int dstStride, int srcStride);
/* For full-pel positions rounding is irrelevant, so the no_rnd names
 * alias the plain copies. */
180 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
181 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
/* Small inline-asm helper macros used by the rounding templates. */
186 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
187 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/* MOVQ_BFE: materialize 0xFEFEFEFEFEFEFEFE in regd without a memory
 * load: pcmpeqd reg,reg sets all bits (0xFF per byte), paddb reg,reg
 * then gives 0xFF + 0xFF = 0xFE in every byte (carry discarded). */
189 #define MOVQ_BFE(regd) \
191 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
192 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* Memory-load variants of the byte-1 / word-2 constants. */
195 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
196 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
198 // for shared library it's better to use this way for accessing constants
// PIC-friendly variants: compute the constants in-register instead of
// referencing a global symbol.
200 #define MOVQ_BONE(regd) \
202 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
203 "psrlw $15, %%"#regd" \n\t" \
204 "packuswb %%"#regd", %%"#regd" \n\t" ::)
206 #define MOVQ_WTWO(regd) \
208 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
209 "psrlw $15, %%"#regd" \n\t" \
210 "psllw $1, %%"#regd" \n\t"::)
214 // using regr as temporary and for the output result
215 // first argument is unmodifed and second is trashed
216 // regfe is supposed to contain 0xfefefefefefefefe
/* Truncating byte average: (a & b) + (((a ^ b) & 0xFE) >> 1). */
217 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
218 "movq "#rega", "#regr" \n\t" \
219 "pand "#regb", "#regr" \n\t" \
220 "pxor "#rega", "#regb" \n\t" \
221 "pand "#regfe", "#regb" \n\t" \
222 "psrlq $1, "#regb" \n\t" \
223 "paddb "#regb", "#regr" \n\t"
/* Rounding byte average: (a | b) - (((a ^ b) & 0xFE) >> 1). */
225 #define PAVGB_MMX(rega, regb, regr, regfe) \
226 "movq "#rega", "#regr" \n\t" \
227 "por "#regb", "#regr" \n\t" \
228 "pxor "#rega", "#regb" \n\t" \
229 "pand "#regfe", "#regb" \n\t" \
230 "psrlq $1, "#regb" \n\t" \
231 "psubb "#regb", "#regr" \n\t"
233 // mm6 is supposed to contain 0xfefefefefefefefe
/* Paired (two-result) versions of the averages above; results land in
 * regr and regp, regb/regd are trashed. */
234 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
235 "movq "#rega", "#regr" \n\t" \
236 "movq "#regc", "#regp" \n\t" \
237 "pand "#regb", "#regr" \n\t" \
238 "pand "#regd", "#regp" \n\t" \
239 "pxor "#rega", "#regb" \n\t" \
240 "pxor "#regc", "#regd" \n\t" \
241 "pand %%mm6, "#regb" \n\t" \
242 "pand %%mm6, "#regd" \n\t" \
243 "psrlq $1, "#regb" \n\t" \
244 "psrlq $1, "#regd" \n\t" \
245 "paddb "#regb", "#regr" \n\t" \
246 "paddb "#regd", "#regp" \n\t"
248 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
249 "movq "#rega", "#regr" \n\t" \
250 "movq "#regc", "#regp" \n\t" \
251 "por "#regb", "#regr" \n\t" \
252 "por "#regd", "#regp" \n\t" \
253 "pxor "#rega", "#regb" \n\t" \
254 "pxor "#regc", "#regd" \n\t" \
255 "pand %%mm6, "#regb" \n\t" \
256 "pand %%mm6, "#regd" \n\t" \
257 "psrlq $1, "#regd" \n\t" \
258 "psrlq $1, "#regb" \n\t" \
259 "psubb "#regb", "#regr" \n\t" \
260 "psubb "#regd", "#regp" \n\t"
262 /***********************************/
263 /* MMX no rounding */
/* First template instantiation: *_no_rnd_*_mmx functions built from
 * the truncating averages.  SET_RND refers to MOVQ_WONE, which is
 * presumably defined in an included header — not visible here. */
265 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
266 #define SET_RND MOVQ_WONE
267 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
268 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
269 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
271 #include "dsputil_rnd_template.c"
278 /***********************************/
/* Second instantiation: rounding *_mmx functions. */
281 #define DEF(x, y) x ## _ ## y ## _mmx
282 #define SET_RND MOVQ_WTWO
283 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
284 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
286 #include "dsputil_rnd_template.c"
294 #endif /* HAVE_INLINE_ASM */
298 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
300 /***********************************/
/* Instantiate the avg template once per instruction set: the DEF
 * macro picks the function-name suffix for each inclusion. */
303 #define DEF(x) x ## _3dnow
305 #include "dsputil_avg_template.c"
309 /***********************************/
310 /* MMXEXT specific */
312 #define DEF(x) x ## _mmxext
314 #include "dsputil_avg_template.c"
318 #endif /* HAVE_YASM */
/* Plain-MMX aliases: at full-pel positions the rounding mode does not
 * matter, and the mmxext names fall back to the mmx implementations. */
322 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
323 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
324 #define put_pixels16_mmxext put_pixels16_mmx
325 #define put_pixels8_mmxext put_pixels8_mmx
326 #define put_pixels4_mmxext put_pixels4_mmx
327 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
328 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
330 /***********************************/
330 /***********************************/
/* Store an 8x8 block of 16-bit IDCT coefficients as pixels, clamped to
 * [0,255]: packuswb performs the unsigned saturation while packing two
 * rows of words into one row of bytes.  Four rows are written per asm
 * statement using line_size and 3*line_size as offsets.
 * NOTE(review): 'p' in the input operands is set up on lines not
 * visible here — presumably a cursor into 'block'; confirm upstream. */
333 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
339 /* read the pixels */
344 "movq (%3), %%mm0 \n\t"
345 "movq 8(%3), %%mm1 \n\t"
346 "movq 16(%3), %%mm2 \n\t"
347 "movq 24(%3), %%mm3 \n\t"
348 "movq 32(%3), %%mm4 \n\t"
349 "movq 40(%3), %%mm5 \n\t"
350 "movq 48(%3), %%mm6 \n\t"
351 "movq 56(%3), %%mm7 \n\t"
/* pack with unsigned saturation: words -> clamped bytes */
352 "packuswb %%mm1, %%mm0 \n\t"
353 "packuswb %%mm3, %%mm2 \n\t"
354 "packuswb %%mm5, %%mm4 \n\t"
355 "packuswb %%mm7, %%mm6 \n\t"
356 "movq %%mm0, (%0) \n\t"
357 "movq %%mm2, (%0, %1) \n\t"
358 "movq %%mm4, (%0, %1, 2) \n\t"
359 "movq %%mm6, (%0, %2) \n\t"
360 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
363 pix += line_size * 4;
366 // if here would be an exact copy of the code above
367 // compiler would generate some very strange code
/* second group of four rows — deliberately duplicated, see above */
370 "movq (%3), %%mm0 \n\t"
371 "movq 8(%3), %%mm1 \n\t"
372 "movq 16(%3), %%mm2 \n\t"
373 "movq 24(%3), %%mm3 \n\t"
374 "movq 32(%3), %%mm4 \n\t"
375 "movq 40(%3), %%mm5 \n\t"
376 "movq 48(%3), %%mm6 \n\t"
377 "movq 56(%3), %%mm7 \n\t"
378 "packuswb %%mm1, %%mm0 \n\t"
379 "packuswb %%mm3, %%mm2 \n\t"
380 "packuswb %%mm5, %%mm4 \n\t"
381 "packuswb %%mm7, %%mm6 \n\t"
382 "movq %%mm0, (%0) \n\t"
383 "movq %%mm2, (%0, %1) \n\t"
384 "movq %%mm4, (%0, %1, 2) \n\t"
385 "movq %%mm6, (%0, %2) \n\t"
386 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* One half (four rows) of the signed-clamped store: packsswb clamps
 * each word to [-128,127], then paddb with mm0 (preloaded 0x80 bytes)
 * rebiases to [0,255].  %1 holds 3*line_skip, %3 holds line_skip. */
390 #define put_signed_pixels_clamped_mmx_half(off) \
391 "movq "#off"(%2), %%mm1 \n\t" \
392 "movq 16 + "#off"(%2), %%mm2 \n\t" \
393 "movq 32 + "#off"(%2), %%mm3 \n\t" \
394 "movq 48 + "#off"(%2), %%mm4 \n\t" \
395 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
396 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
397 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
398 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
399 "paddb %%mm0, %%mm1 \n\t" \
400 "paddb %%mm0, %%mm2 \n\t" \
401 "paddb %%mm0, %%mm3 \n\t" \
402 "paddb %%mm0, %%mm4 \n\t" \
403 "movq %%mm1, (%0) \n\t" \
404 "movq %%mm2, (%0, %3) \n\t" \
405 "movq %%mm3, (%0, %3, 2) \n\t" \
406 "movq %%mm4, (%0, %1) \n\t"
/* Store an 8x8 block of signed coefficients as pixels: clamp to
 * [-128,127] and add the 0x80 bias (ff_pb_80).  The lea computes
 * 3*line_skip into the scratch output %1. */
408 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
411 x86_reg line_skip = line_size;
415 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
416 "lea (%3, %3, 2), %1 \n\t"
417 put_signed_pixels_clamped_mmx_half(0)
418 "lea (%0, %3, 4), %0 \n\t"
419 put_signed_pixels_clamped_mmx_half(64)
420 : "+&r"(pixels), "=&r"(line_skip3)
421 : "r"(block), "r"(line_skip)
/* Add an 8x8 block of 16-bit coefficients to existing pixels, clamped:
 * pixels are widened to words (unpack against mm7, presumably zeroed
 * on a line not visible here — confirm), added with signed saturation
 * (paddsw), then repacked with unsigned saturation (packuswb).  Two
 * rows are processed per asm statement via "+m" read-write operands. */
425 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
432 /* read the pixels */
439 "movq (%2), %%mm0 \n\t"
440 "movq 8(%2), %%mm1 \n\t"
441 "movq 16(%2), %%mm2 \n\t"
442 "movq 24(%2), %%mm3 \n\t"
443 "movq %0, %%mm4 \n\t"
444 "movq %1, %%mm6 \n\t"
445 "movq %%mm4, %%mm5 \n\t"
446 "punpcklbw %%mm7, %%mm4 \n\t"
447 "punpckhbw %%mm7, %%mm5 \n\t"
448 "paddsw %%mm4, %%mm0 \n\t"
449 "paddsw %%mm5, %%mm1 \n\t"
450 "movq %%mm6, %%mm5 \n\t"
451 "punpcklbw %%mm7, %%mm6 \n\t"
452 "punpckhbw %%mm7, %%mm5 \n\t"
453 "paddsw %%mm6, %%mm2 \n\t"
454 "paddsw %%mm5, %%mm3 \n\t"
455 "packuswb %%mm1, %%mm0 \n\t"
456 "packuswb %%mm3, %%mm2 \n\t"
457 "movq %%mm0, %0 \n\t"
458 "movq %%mm2, %1 \n\t"
459 : "+m"(*pix), "+m"(*(pix + line_size))
462 pix += line_size * 2;
/* Plain 8-wide block copy: REG_a holds 2*line_size; each unrolled step
 * copies two rows (movq load + store) and advances both pointers by
 * two lines, so one loop iteration handles four rows.  h is a
 * read-write operand, presumably decremented by loop lines not
 * visible here. */
467 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
468 int line_size, int h)
471 "lea (%3, %3), %%"REG_a" \n\t"
474 "movq (%1 ), %%mm0 \n\t"
475 "movq (%1, %3), %%mm1 \n\t"
476 "movq %%mm0, (%2) \n\t"
477 "movq %%mm1, (%2, %3) \n\t"
478 "add %%"REG_a", %1 \n\t"
479 "add %%"REG_a", %2 \n\t"
480 "movq (%1 ), %%mm0 \n\t"
481 "movq (%1, %3), %%mm1 \n\t"
482 "movq %%mm0, (%2) \n\t"
483 "movq %%mm1, (%2, %3) \n\t"
484 "add %%"REG_a", %1 \n\t"
485 "add %%"REG_a", %2 \n\t"
488 : "+g"(h), "+r"(pixels), "+r"(block)
489 : "r"((x86_reg)line_size)
/* Plain 16-wide block copy: same structure as put_pixels8_mmx but each
 * row moves two quadwords (offsets 0 and 8); four rows per loop
 * iteration, REG_a = 2*line_size. */
494 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
495 int line_size, int h)
498 "lea (%3, %3), %%"REG_a" \n\t"
501 "movq (%1 ), %%mm0 \n\t"
502 "movq 8(%1 ), %%mm4 \n\t"
503 "movq (%1, %3), %%mm1 \n\t"
504 "movq 8(%1, %3), %%mm5 \n\t"
505 "movq %%mm0, (%2) \n\t"
506 "movq %%mm4, 8(%2) \n\t"
507 "movq %%mm1, (%2, %3) \n\t"
508 "movq %%mm5, 8(%2, %3) \n\t"
509 "add %%"REG_a", %1 \n\t"
510 "add %%"REG_a", %2 \n\t"
511 "movq (%1 ), %%mm0 \n\t"
512 "movq 8(%1 ), %%mm4 \n\t"
513 "movq (%1, %3), %%mm1 \n\t"
514 "movq 8(%1, %3), %%mm5 \n\t"
515 "movq %%mm0, (%2) \n\t"
516 "movq %%mm4, 8(%2) \n\t"
517 "movq %%mm1, (%2, %3) \n\t"
518 "movq %%mm5, 8(%2, %3) \n\t"
519 "add %%"REG_a", %1 \n\t"
520 "add %%"REG_a", %2 \n\t"
523 : "+g"(h), "+r"(pixels), "+r"(block)
524 : "r"((x86_reg)line_size)
/* Zero n consecutive 64-entry int16_t blocks (128 bytes each) with MMX
 * stores.  The pointer operand is biased to the end of the region and
 * REG_a counts up from -128*n toward zero, 32 bytes per step. */
529 #define CLEAR_BLOCKS(name, n) \
530 static void name(int16_t *blocks) \
533 "pxor %%mm7, %%mm7 \n\t" \
534 "mov %1, %%"REG_a" \n\t" \
536 "movq %%mm7, (%0, %%"REG_a") \n\t" \
537 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
538 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
539 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
540 "add $32, %%"REG_a" \n\t" \
542 :: "r"(((uint8_t *)blocks) + 128 * n), \
547 CLEAR_BLOCKS(clear_blocks_mmx, 6)
548 CLEAR_BLOCKS(clear_block_mmx, 1)
/* SSE version: zero one 128-byte block with eight 16-byte movaps
 * stores (block must be 16-byte aligned for movaps). */
550 static void clear_block_sse(int16_t *block)
553 "xorps %%xmm0, %%xmm0 \n"
554 "movaps %%xmm0, (%0) \n"
555 "movaps %%xmm0, 16(%0) \n"
556 "movaps %%xmm0, 32(%0) \n"
557 "movaps %%xmm0, 48(%0) \n"
558 "movaps %%xmm0, 64(%0) \n"
559 "movaps %%xmm0, 80(%0) \n"
560 "movaps %%xmm0, 96(%0) \n"
561 "movaps %%xmm0, 112(%0) \n"
/* SSE version for six consecutive blocks; same end-biased negative
 * index trick as CLEAR_BLOCKS, 128 bytes per iteration. */
567 static void clear_blocks_sse(int16_t *blocks)
570 "xorps %%xmm0, %%xmm0 \n"
571 "mov %1, %%"REG_a" \n"
573 "movaps %%xmm0, (%0, %%"REG_a") \n"
574 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
575 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
576 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
577 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
578 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
579 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
580 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
581 "add $128, %%"REG_a" \n"
583 :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] for w bytes: MMX loop handles 16 bytes per
 * iteration (two movq pairs) with wraparound byte addition (paddb);
 * the scalar statement at the end handles the remaining tail bytes.
 * NOTE(review): the loop counter setup/branch lines are not visible
 * here; the "w - 15" bound implies the MMX loop stops 15 bytes early. */
589 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
595 "movq (%1, %0), %%mm0 \n\t"
596 "movq (%2, %0), %%mm1 \n\t"
597 "paddb %%mm0, %%mm1 \n\t"
598 "movq %%mm1, (%2, %0) \n\t"
599 "movq 8(%1, %0), %%mm0 \n\t"
600 "movq 8(%2, %0), %%mm1 \n\t"
601 "paddb %%mm0, %%mm1 \n\t"
602 "movq %%mm1, 8(%2, %0) \n\t"
608 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
611 dst[i + 0] += src[i + 0];
/* HuffYUV median prediction using cmov: reconstructs dst[i] from
 * diff[i] plus the median of (left, top, left+top-topleft).  left /
 * left_top carry prediction state across calls; only their low byte is
 * used (masked with 0xff below).  Most of the asm body is not visible
 * in this chunk — only the byte load, the add of the diff term, and
 * the store remain; do not infer the median selection details here. */
615 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
616 const uint8_t *diff, int w,
617 int *left, int *left_top)
621 int l = *left & 0xff;
622 int tl = *left_top & 0xff;
627 "movzbl (%3, %4), %2 \n"
640 "add (%6, %4), %b0 \n"
641 "mov %b0, (%5, %4) \n"
644 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
645 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
/* Transpose a 4x4 byte tile from src (src_stride) into dst
 * (dst_stride) using punpck byte/word interleaves; used by the
 * horizontal H.263 loop filter to reuse the vertical filter kernel.
 * NOTE(review): pointer-advance lines between some loads/stores are
 * missing from this view (e.g. after the first movd). */
652 static inline void transpose4x4(uint8_t *dst, uint8_t *src, x86_reg dst_stride, x86_reg src_stride){
653 __asm__ volatile( //FIXME could save 1 instruction if done as 8x4 ...
654 "movd (%1), %%mm0 \n\t"
656 "movd (%1), %%mm1 \n\t"
657 "movd (%1,%3,1), %%mm2 \n\t"
658 "movd (%1,%3,2), %%mm3 \n\t"
659 "punpcklbw %%mm1, %%mm0 \n\t"
660 "punpcklbw %%mm3, %%mm2 \n\t"
661 "movq %%mm0, %%mm1 \n\t"
662 "punpcklwd %%mm2, %%mm0 \n\t"
663 "punpckhwd %%mm2, %%mm1 \n\t"
664 "movd %%mm0, (%0) \n\t"
666 "punpckhdq %%mm0, %%mm0 \n\t"
667 "movd %%mm0, (%0) \n\t"
668 "movd %%mm1, (%0,%2,1) \n\t"
669 "punpckhdq %%mm1, %%mm1 \n\t"
670 "movd %%mm1, (%0,%2,2) \n\t"
/* H.263 in-loop deblocking filter kernel for one 8-pixel edge.
 * Operand map (memory operands): %0 = row p0 (2 above edge),
 * %1 = row p1, %2 = row p2, %3 = row p3 (2 below edge),
 * %4 = 2*strength, %5 = ff_pb_FC mask.  Computes the filter delta d
 * from (p0 - p3) + 4*(p2 - p1), takes |d|>>3 with sign handling,
 * clips it against the strength, and leaves the filtered rows in
 * mm3 (p1'), mm4 (p2'), mm5 (p0'), mm6 (p3') for the caller to store.
 * The register choreography is exact and order-sensitive — do not
 * reorder instructions. */
680 #define H263_LOOP_FILTER \
681 "pxor %%mm7, %%mm7 \n\t" \
682 "movq %0, %%mm0 \n\t" \
683 "movq %0, %%mm1 \n\t" \
684 "movq %3, %%mm2 \n\t" \
685 "movq %3, %%mm3 \n\t" \
686 "punpcklbw %%mm7, %%mm0 \n\t" \
687 "punpckhbw %%mm7, %%mm1 \n\t" \
688 "punpcklbw %%mm7, %%mm2 \n\t" \
689 "punpckhbw %%mm7, %%mm3 \n\t" \
690 "psubw %%mm2, %%mm0 \n\t" \
691 "psubw %%mm3, %%mm1 \n\t" \
692 "movq %1, %%mm2 \n\t" \
693 "movq %1, %%mm3 \n\t" \
694 "movq %2, %%mm4 \n\t" \
695 "movq %2, %%mm5 \n\t" \
696 "punpcklbw %%mm7, %%mm2 \n\t" \
697 "punpckhbw %%mm7, %%mm3 \n\t" \
698 "punpcklbw %%mm7, %%mm4 \n\t" \
699 "punpckhbw %%mm7, %%mm5 \n\t" \
700 "psubw %%mm2, %%mm4 \n\t" \
701 "psubw %%mm3, %%mm5 \n\t" \
702 "psllw $2, %%mm4 \n\t" \
703 "psllw $2, %%mm5 \n\t" \
704 "paddw %%mm0, %%mm4 \n\t" \
705 "paddw %%mm1, %%mm5 \n\t" \
706 "pxor %%mm6, %%mm6 \n\t" \
707 "pcmpgtw %%mm4, %%mm6 \n\t" \
708 "pcmpgtw %%mm5, %%mm7 \n\t" \
709 "pxor %%mm6, %%mm4 \n\t" \
710 "pxor %%mm7, %%mm5 \n\t" \
711 "psubw %%mm6, %%mm4 \n\t" \
712 "psubw %%mm7, %%mm5 \n\t" \
713 "psrlw $3, %%mm4 \n\t" \
714 "psrlw $3, %%mm5 \n\t" \
715 "packuswb %%mm5, %%mm4 \n\t" \
716 "packsswb %%mm7, %%mm6 \n\t" \
717 "pxor %%mm7, %%mm7 \n\t" \
718 "movd %4, %%mm2 \n\t" \
719 "punpcklbw %%mm2, %%mm2 \n\t" \
720 "punpcklbw %%mm2, %%mm2 \n\t" \
721 "punpcklbw %%mm2, %%mm2 \n\t" \
722 "psubusb %%mm4, %%mm2 \n\t" \
723 "movq %%mm2, %%mm3 \n\t" \
724 "psubusb %%mm4, %%mm3 \n\t" \
725 "psubb %%mm3, %%mm2 \n\t" \
726 "movq %1, %%mm3 \n\t" \
727 "movq %2, %%mm4 \n\t" \
728 "pxor %%mm6, %%mm3 \n\t" \
729 "pxor %%mm6, %%mm4 \n\t" \
730 "paddusb %%mm2, %%mm3 \n\t" \
731 "psubusb %%mm2, %%mm4 \n\t" \
732 "pxor %%mm6, %%mm3 \n\t" \
733 "pxor %%mm6, %%mm4 \n\t" \
734 "paddusb %%mm2, %%mm2 \n\t" \
735 "packsswb %%mm1, %%mm0 \n\t" \
736 "pcmpgtb %%mm0, %%mm7 \n\t" \
737 "pxor %%mm7, %%mm0 \n\t" \
738 "psubb %%mm7, %%mm0 \n\t" \
739 "movq %%mm0, %%mm1 \n\t" \
740 "psubusb %%mm2, %%mm0 \n\t" \
741 "psubb %%mm0, %%mm1 \n\t" \
742 "pand %5, %%mm1 \n\t" \
743 "psrlw $2, %%mm1 \n\t" \
744 "pxor %%mm7, %%mm1 \n\t" \
745 "psubb %%mm7, %%mm1 \n\t" \
746 "movq %0, %%mm5 \n\t" \
747 "movq %3, %%mm6 \n\t" \
748 "psubb %%mm1, %%mm5 \n\t" \
749 "paddb %%mm1, %%mm6 \n\t"
/* Vertical-edge H.263 deblocking: runs H263_LOOP_FILTER on the four
 * rows around the edge (src-2*stride .. src+stride) in place and
 * stores the filtered rows back from mm3/mm4/mm5/mm6.  Strength is
 * looked up from the per-qscale table; compiled away unless an H.263
 * codec is enabled. */
751 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale)
753 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
754 const int strength = ff_h263_loop_filter_strength[qscale];
759 "movq %%mm3, %1 \n\t"
760 "movq %%mm4, %2 \n\t"
761 "movq %%mm5, %0 \n\t"
762 "movq %%mm6, %3 \n\t"
763 : "+m"(*(uint64_t*)(src - 2 * stride)),
764 "+m"(*(uint64_t*)(src - 1 * stride)),
765 "+m"(*(uint64_t*)(src + 0 * stride)),
766 "+m"(*(uint64_t*)(src + 1 * stride))
767 : "g"(2 * strength), "m"(ff_pb_FC)
/* Horizontal-edge H.263 deblocking: transposes two 4x4 tiles around
 * the edge into an 8-byte-aligned temp buffer, runs the same
 * H263_LOOP_FILTER kernel on the transposed rows, then transposes the
 * filtered result (mm3/mm4/mm5/mm6) back to src with the interleave/
 * store sequence below. */
772 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale)
774 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
775 const int strength = ff_h263_loop_filter_strength[qscale];
776 DECLARE_ALIGNED(8, uint64_t, temp)[4];
777 uint8_t *btemp = (uint8_t*)temp;
781 transpose4x4(btemp, src, 8, stride);
782 transpose4x4(btemp + 4, src + 4 * stride, 8, stride);
784 H263_LOOP_FILTER // 5 3 4 6
790 : "g"(2 * strength), "m"(ff_pb_FC)
/* transpose the filtered rows back into column order and store */
794 "movq %%mm5, %%mm1 \n\t"
795 "movq %%mm4, %%mm0 \n\t"
796 "punpcklbw %%mm3, %%mm5 \n\t"
797 "punpcklbw %%mm6, %%mm4 \n\t"
798 "punpckhbw %%mm3, %%mm1 \n\t"
799 "punpckhbw %%mm6, %%mm0 \n\t"
800 "movq %%mm5, %%mm3 \n\t"
801 "movq %%mm1, %%mm6 \n\t"
802 "punpcklwd %%mm4, %%mm5 \n\t"
803 "punpcklwd %%mm0, %%mm1 \n\t"
804 "punpckhwd %%mm4, %%mm3 \n\t"
805 "punpckhwd %%mm0, %%mm6 \n\t"
806 "movd %%mm5, (%0) \n\t"
807 "punpckhdq %%mm5, %%mm5 \n\t"
808 "movd %%mm5, (%0, %2) \n\t"
809 "movd %%mm3, (%0, %2, 2) \n\t"
810 "punpckhdq %%mm3, %%mm3 \n\t"
811 "movd %%mm3, (%0, %3) \n\t"
812 "movd %%mm1, (%1) \n\t"
813 "punpckhdq %%mm1, %%mm1 \n\t"
814 "movd %%mm1, (%1, %2) \n\t"
815 "movd %%mm6, (%1, %2, 2) \n\t"
816 "punpckhdq %%mm6, %%mm6 \n\t"
817 "movd %%mm6, (%1, %3) \n\t"
819 "r"(src + 4 * stride),
820 "r"((x86_reg)stride),
821 "r"((x86_reg)(3 * stride))
826 /* Draw the edges of width 'w' of an image of size width, height
827 * this MMX version can only handle w == 8 || w == 16. */
/* Left/right edges: replicate the first/last pixel of each row across
 * the border (punpck broadcast of a single byte).  Top/bottom edges:
 * replicate the first/last row, four border rows per asm statement.
 * 'sides' selects which borders to draw via EDGE_TOP / EDGE_BOTTOM. */
828 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
829 int w, int h, int sides)
831 uint8_t *ptr, *last_line;
834 last_line = buf + (height - 1) * wrap;
/* w == 8 case: broadcast first byte left, last byte right, 8 wide */
840 "movd (%0), %%mm0 \n\t"
841 "punpcklbw %%mm0, %%mm0 \n\t"
842 "punpcklwd %%mm0, %%mm0 \n\t"
843 "punpckldq %%mm0, %%mm0 \n\t"
844 "movq %%mm0, -8(%0) \n\t"
845 "movq -8(%0, %2), %%mm1 \n\t"
846 "punpckhbw %%mm1, %%mm1 \n\t"
847 "punpckhwd %%mm1, %%mm1 \n\t"
848 "punpckhdq %%mm1, %%mm1 \n\t"
849 "movq %%mm1, (%0, %2) \n\t"
854 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
/* w == 16 case: same broadcast, two quadwords per side */
859 "movd (%0), %%mm0 \n\t"
860 "punpcklbw %%mm0, %%mm0 \n\t"
861 "punpcklwd %%mm0, %%mm0 \n\t"
862 "punpckldq %%mm0, %%mm0 \n\t"
863 "movq %%mm0, -8(%0) \n\t"
864 "movq %%mm0, -16(%0) \n\t"
865 "movq -8(%0, %2), %%mm1 \n\t"
866 "punpckhbw %%mm1, %%mm1 \n\t"
867 "punpckhwd %%mm1, %%mm1 \n\t"
868 "punpckhdq %%mm1, %%mm1 \n\t"
869 "movq %%mm1, (%0, %2) \n\t"
870 "movq %%mm1, 8(%0, %2) \n\t"
875 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
879 /* top and bottom (and hopefully also the corners) */
880 if (sides & EDGE_TOP) {
881 for (i = 0; i < h; i += 4) {
882 ptr = buf - (i + 1) * wrap - w;
885 "movq (%1, %0), %%mm0 \n\t"
886 "movq %%mm0, (%0) \n\t"
887 "movq %%mm0, (%0, %2) \n\t"
888 "movq %%mm0, (%0, %2, 2) \n\t"
889 "movq %%mm0, (%0, %3) \n\t"
894 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
895 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
900 if (sides & EDGE_BOTTOM) {
901 for (i = 0; i < h; i += 4) {
902 ptr = last_line + (i + 1) * wrap - w;
905 "movq (%1, %0), %%mm0 \n\t"
906 "movq %%mm0, (%0) \n\t"
907 "movq %%mm0, (%0, %2) \n\t"
908 "movq %%mm0, (%0, %2, 2) \n\t"
909 "movq %%mm0, (%0, %3) \n\t"
914 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
915 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
916 "r"(ptr + width + 2 * w)
921 #endif /* HAVE_INLINE_ASM */
925 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
926 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
929 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
932 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
936 uint8_t * const half = (uint8_t*)temp; \
937 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
939 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
940 stride, stride, 8); \
943 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
946 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
950 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
954 uint8_t * const half = (uint8_t*)temp; \
955 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
957 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
961 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
965 uint8_t * const half = (uint8_t*)temp; \
966 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
968 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
969 stride, stride, 8); \
972 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
975 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
979 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
983 uint8_t * const half = (uint8_t*)temp; \
984 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
986 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
990 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
993 uint64_t half[8 + 9]; \
994 uint8_t * const halfH = ((uint8_t*)half) + 64; \
995 uint8_t * const halfHV = ((uint8_t*)half); \
996 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
998 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
1000 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1001 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
1005 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1008 uint64_t half[8 + 9]; \
1009 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1010 uint8_t * const halfHV = ((uint8_t*)half); \
1011 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1013 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1015 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1016 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
1020 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1023 uint64_t half[8 + 9]; \
1024 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1025 uint8_t * const halfHV = ((uint8_t*)half); \
1026 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1028 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
1030 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1031 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
1035 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1038 uint64_t half[8 + 9]; \
1039 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1040 uint8_t * const halfHV = ((uint8_t*)half); \
1041 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1043 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1045 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1046 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
1050 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1053 uint64_t half[8 + 9]; \
1054 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1055 uint8_t * const halfHV = ((uint8_t*)half); \
1056 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1058 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1059 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
1063 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1066 uint64_t half[8 + 9]; \
1067 uint8_t * const halfH = ((uint8_t*)half) + 64; \
1068 uint8_t * const halfHV = ((uint8_t*)half); \
1069 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1071 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1072 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
1076 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1079 uint64_t half[8 + 9]; \
1080 uint8_t * const halfH = ((uint8_t*)half); \
1081 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1083 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
1085 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
1089 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1092 uint64_t half[8 + 9]; \
1093 uint8_t * const halfH = ((uint8_t*)half); \
1094 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1096 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
1098 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
1102 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1106 uint8_t * const halfH = ((uint8_t*)half); \
1107 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
1109 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
1113 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
1116 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
1119 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
1122 uint64_t temp[32]; \
1123 uint8_t * const half = (uint8_t*)temp; \
1124 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1126 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1130 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
1133 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
1134 stride, stride, 16);\
1137 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
1140 uint64_t temp[32]; \
1141 uint8_t * const half = (uint8_t*)temp; \
1142 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
1144 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1145 stride, stride, 16); \
1148 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1151 uint64_t temp[32]; \
1152 uint8_t * const half = (uint8_t*)temp; \
1153 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1155 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1159 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1162 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
1166 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1169 uint64_t temp[32]; \
1170 uint8_t * const half = (uint8_t*)temp; \
1171 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1173 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1174 stride, stride, 16); \
1177 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1180 uint64_t half[16 * 2 + 17 * 2]; \
1181 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1182 uint8_t * const halfHV = ((uint8_t*)half); \
1183 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1185 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1187 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1189 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1193 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1196 uint64_t half[16 * 2 + 17 * 2]; \
1197 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1198 uint8_t * const halfHV = ((uint8_t*)half); \
1199 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1201 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1203 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1205 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1209 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1212 uint64_t half[16 * 2 + 17 * 2]; \
1213 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1214 uint8_t * const halfHV = ((uint8_t*)half); \
1215 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1217 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1219 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1221 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1225 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1228 uint64_t half[16 * 2 + 17 * 2]; \
1229 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1230 uint8_t * const halfHV = ((uint8_t*)half); \
1231 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1233 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1235 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1237 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1241 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1244 uint64_t half[16 * 2 + 17 * 2]; \
1245 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1246 uint8_t * const halfHV = ((uint8_t*)half); \
1247 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1249 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1251 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1255 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1258 uint64_t half[16 * 2 + 17 * 2]; \
1259 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1260 uint8_t * const halfHV = ((uint8_t*)half); \
1261 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1263 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1265 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1269 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1272 uint64_t half[17 * 2]; \
1273 uint8_t * const halfH = ((uint8_t*)half); \
1274 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1276 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1278 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1282 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1285 uint64_t half[17 * 2]; \
1286 uint8_t * const halfH = ((uint8_t*)half); \
1287 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1289 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1291 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1295 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1298 uint64_t half[17 * 2]; \
1299 uint8_t * const halfH = ((uint8_t*)half); \
1300 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1302 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Store operator plugged into QPEL_OP: a plain "mov" of the filtered
 * result "a" into destination "b"; "temp" is unused for put. */
1306 #define PUT_OP(a, b, temp, size) \
1307 "mov"#size" "#a", "#b" \n\t"
/* Average operator plugged into QPEL_OP: load the current destination
 * into "temp", per-byte average it with the result via pavgb (MMXEXT),
 * then store back to the destination. */
1309 #define AVG_MMXEXT_OP(a, b, temp, size) \
1310 "mov"#size" "#b", "#temp" \n\t" \
1311 "pavgb "#temp", "#a" \n\t" \
1312 "mov"#size" "#a", "#b" \n\t"
/* Instantiate the full quarter-pel MC function families for MMXEXT:
 * put and avg use the ff_pw_16 rounder, the no_rnd variant uses
 * ff_pw_15 (the "_no_rnd_" infix selects the non-rounding helpers). */
1314 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1315 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1316 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
1317 #endif /* HAVE_YASM */
/* RV40 (3,3) quarter-pel position wrappers: each simply forwards to the
 * corresponding (x+1/2, y+1/2) halfpel helper.
 * NOTE(review): presumably RV40 specifies its mc33 case as a plain
 * 2x2 pixel average -- confirm against the RV40 bitstream spec. */
1321 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1323 put_pixels8_xy2_mmx(dst, src, stride, 8);
1325 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1327 put_pixels16_xy2_mmx(dst, src, stride, 16);
1329 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1331 avg_pixels8_xy2_mmx(dst, src, stride, 8);
1333 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1335 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* MMX bilinear global motion compensation (MPEG-4 GMC-style warp).
 * Walks the block in 4-pixel-wide columns, maintaining 16-bit subpel
 * positions per column and blending the four neighbouring source pixels
 * with bilinear weights. Falls back to the C implementation ff_gmc_c()
 * for the cases flagged in the big condition below.
 * NOTE(review): w, x, y and the C declarations come from lines elided in
 * this excerpt -- do not assume their values from here. */
1338 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1339 int stride, int h, int ox, int oy,
1340 int dxx, int dxy, int dyx, int dyy,
1341 int shift, int r, int width, int height)
/* Integer (fullpel) part of the start offset. */
1344 const int ix = ox >> (16 + shift);
1345 const int iy = oy >> (16 + shift);
/* Subpel offsets/increments rescaled from 16 to 12 fractional bits
 * (>> 4) so they fit the 16-bit word arithmetic in the asm below. */
1346 const int oxs = ox >> 4;
1347 const int oys = oy >> 4;
1348 const int dxxs = dxx >> 4;
1349 const int dxys = dxy >> 4;
1350 const int dyxs = dyx >> 4;
1351 const int dyys = dyy >> 4;
/* Broadcast constants for the SIMD loop: rounding term r and the
 * per-row position increments, replicated into four words each. */
1352 const uint16_t r4[4] = { r, r, r, r };
1353 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1354 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
/* Final normalisation shift: the weight products carry 2*shift
 * fractional bits. */
1355 const uint64_t shift2 = 2 * shift;
/* Position deltas across the block, used to detect whether the fullpel
 * offset stays constant over the whole block. */
1358 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1359 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1360 const int dxh = dxy * (h - 1);
1361 const int dyw = dyx * (w - 1);
1362 if ( // non-constant fullpel offset (3% of blocks)
1363 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1364 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1365 // uses more than 16 bits of subpel mv (only at huge resolution)
1366 || (dxx | dxy | dyx | dyy) & 15 ||
1367 (unsigned)ix >= width - w ||
1368 (unsigned)iy >= height - h) {
1369 // FIXME could still use mmx for some of the rows
1370 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1371 shift, r, width, height);
/* Fast path: advance src to the constant fullpel start position. */
1375 src += ix + iy * stride;
/* mm6 = span constant broadcast to all four words (for the s - dx
 * weight complements), mm7 = 0 (byte -> word unpacking). */
1378 "movd %0, %%mm6 \n\t"
1379 "pxor %%mm7, %%mm7 \n\t"
1380 "punpcklwd %%mm6, %%mm6 \n\t"
1381 "punpcklwd %%mm6, %%mm6 \n\t"
/* Process the block four pixels (one column group) at a time. */
1385 for (x = 0; x < w; x += 4) {
/* Per-lane subpel positions, pre-biased by -dxys/-dyys so the first
 * paddw in the row loop yields the row-0 position. */
1386 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1387 oxs - dxys + dxxs * (x + 1),
1388 oxs - dxys + dxxs * (x + 2),
1389 oxs - dxys + dxxs * (x + 3) };
1390 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1391 oys - dyys + dyxs * (x + 1),
1392 oys - dyys + dyxs * (x + 2),
1393 oys - dyys + dyxs * (x + 3) };
1395 for (y = 0; y < h; y++) {
/* Advance the per-lane positions by one row and keep only the
 * top 4 bits as the fractional weights dx (mm4) and dy (mm5). */
1397 "movq %0, %%mm4 \n\t"
1398 "movq %1, %%mm5 \n\t"
1399 "paddw %2, %%mm4 \n\t"
1400 "paddw %3, %%mm5 \n\t"
1401 "movq %%mm4, %0 \n\t"
1402 "movq %%mm5, %1 \n\t"
1403 "psrlw $12, %%mm4 \n\t"
1404 "psrlw $12, %%mm5 \n\t"
1405 : "+m"(*dx4), "+m"(*dy4)
1406 : "m"(*dxy4), "m"(*dyy4)
/* Build the four bilinear weights from dx, dy and the span s. */
1410 "movq %%mm6, %%mm2 \n\t"
1411 "movq %%mm6, %%mm1 \n\t"
1412 "psubw %%mm4, %%mm2 \n\t"
1413 "psubw %%mm5, %%mm1 \n\t"
1414 "movq %%mm2, %%mm0 \n\t"
1415 "movq %%mm4, %%mm3 \n\t"
1416 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1417 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1418 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1419 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
/* Weight the four neighbouring source pixels. */
1421 "movd %4, %%mm5 \n\t"
1422 "movd %3, %%mm4 \n\t"
1423 "punpcklbw %%mm7, %%mm5 \n\t"
1424 "punpcklbw %%mm7, %%mm4 \n\t"
1425 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1426 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1428 "movd %2, %%mm5 \n\t"
1429 "movd %1, %%mm4 \n\t"
1430 "punpcklbw %%mm7, %%mm5 \n\t"
1431 "punpcklbw %%mm7, %%mm4 \n\t"
1432 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1433 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
/* Sum the weighted taps plus the rounding constant r4. */
1434 "paddw %5, %%mm1 \n\t"
1435 "paddw %%mm3, %%mm2 \n\t"
1436 "paddw %%mm1, %%mm0 \n\t"
1437 "paddw %%mm2, %%mm0 \n\t"
/* Normalise by 2*shift, saturate to bytes and store 4 pixels. */
1439 "psrlw %6, %%mm0 \n\t"
1440 "packuswb %%mm0, %%mm0 \n\t"
1441 "movd %%mm0, %0 \n\t"
1443 : "=m"(dst[x + y * stride])
1444 : "m"(src[0]), "m"(src[1]),
1445 "m"(src[stride]), "m"(src[stride + 1]),
1446 "m"(*r4), "m"(shift2)
/* Step right to the next 4-pixel column, undoing the h-row descent. */
1450 src += 4 - h * stride;
1453 #endif /* HAVE_INLINE_ASM */
1455 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1456 int line_size, int h);
1457 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1458 int line_size, int h);
1460 void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
1461 int stride, int h, int x, int y);
1462 void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
1463 int stride, int h, int x, int y);
1464 void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
1465 int stride, int h, int x, int y);
1467 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1468 int stride, int h, int x, int y);
1469 void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
1470 int stride, int h, int x, int y);
1471 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1472 int stride, int h, int x, int y);
1474 void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1475 int stride, int h, int x, int y);
1476 void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
1477 int stride, int h, int x, int y);
1479 void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1480 int stride, int h, int x, int y);
1481 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1482 int stride, int h, int x, int y);
1484 void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
1485 int stride, int h, int x, int y);
1486 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1487 int stride, int h, int x, int y);
/* Prototype generator for the high-bit-depth H.264 chroma MC functions
 * implemented in external asm: ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>. */
1489 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1490 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1491 (uint8_t *dst, uint8_t *src, \
1492 int stride, int h, int x, int y);
/* 10-bit chroma MC prototypes for the SIMD levels provided by the asm. */
1494 CHROMA_MC(put, 2, 10, mmxext)
1495 CHROMA_MC(avg, 2, 10, mmxext)
1496 CHROMA_MC(put, 4, 10, mmxext)
1497 CHROMA_MC(avg, 4, 10, mmxext)
1498 CHROMA_MC(put, 8, 10, sse2)
1499 CHROMA_MC(avg, 8, 10, sse2)
1500 CHROMA_MC(put, 8, 10, avx)
1501 CHROMA_MC(avg, 8, 10, avx)
/* CAVS fullpel (mc00) wrappers: the (0,0) qpel position is a plain
 * pixel copy/average, so forward to the generic MMX pixel helpers. */
1506 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1508 put_pixels8_mmx(dst, src, stride, 8);
1511 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1513 avg_pixels8_mmx(dst, src, stride, 8);
1516 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1518 put_pixels16_mmx(dst, src, stride, 16);
1521 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1523 avg_pixels16_mmx(dst, src, stride, 16);
1525 #endif /* HAVE_INLINE_ASM */
/* VC-1 mspel fullpel (mc00) wrappers: forward to the 8x8 pixel
 * copy/average helpers. The "rnd" parameter is not used here --
 * presumably rounding is irrelevant for the fullpel case. */
1529 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1530 int stride, int rnd)
1532 ff_put_pixels8_mmx(dst, src, stride, 8);
1535 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1536 int stride, int rnd)
1538 ff_avg_pixels8_mmxext(dst, src, stride, 8);
1540 #endif /* HAVE_YASM */
/* Clamp an array of floats to [min, max] with SSE: min/max are broadcast
 * into xmm4/xmm5 and 16 floats (64 bytes) are processed per iteration
 * with maxps/minps.
 * NOTE(review): the loop label and the decrement/branch lines are elided
 * in this excerpt; "i" starts at the byte offset of the last 16-float
 * group, so len is presumably required to be a multiple of 16 -- confirm. */
1543 static void vector_clipf_sse(float *dst, const float *src,
1544 float min, float max, int len)
1546 x86_reg i = (len - 16) * 4;
/* Broadcast the scalar min (xmm4) and max (xmm5) to all four lanes. */
1548 "movss %3, %%xmm4 \n\t"
1549 "movss %4, %%xmm5 \n\t"
1550 "shufps $0, %%xmm4, %%xmm4 \n\t"
1551 "shufps $0, %%xmm5, %%xmm5 \n\t"
/* Load 64 bytes, clamp low side then high side, store back. */
1553 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1554 "movaps 16(%2, %0), %%xmm1 \n\t"
1555 "movaps 32(%2, %0), %%xmm2 \n\t"
1556 "movaps 48(%2, %0), %%xmm3 \n\t"
1557 "maxps %%xmm4, %%xmm0 \n\t"
1558 "maxps %%xmm4, %%xmm1 \n\t"
1559 "maxps %%xmm4, %%xmm2 \n\t"
1560 "maxps %%xmm4, %%xmm3 \n\t"
1561 "minps %%xmm5, %%xmm0 \n\t"
1562 "minps %%xmm5, %%xmm1 \n\t"
1563 "minps %%xmm5, %%xmm2 \n\t"
1564 "minps %%xmm5, %%xmm3 \n\t"
1565 "movaps %%xmm0, (%1, %0) \n\t"
1566 "movaps %%xmm1, 16(%1, %0) \n\t"
1567 "movaps %%xmm2, 32(%1, %0) \n\t"
1568 "movaps %%xmm3, 48(%1, %0) \n\t"
1572 : "r"(dst), "r"(src), "m"(min), "m"(max)
1577 #endif /* HAVE_INLINE_ASM */
1579 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1581 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1583 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1585 int order, int mul);
1586 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1588 int order, int mul);
1589 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1591 int order, int mul);
1593 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1594 const int16_t *window, unsigned int len);
1595 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1596 const int16_t *window, unsigned int len);
1597 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1598 const int16_t *window, unsigned int len);
1599 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1600 const int16_t *window, unsigned int len);
1601 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1602 const int16_t *window, unsigned int len);
1603 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1604 const int16_t *window, unsigned int len);
1606 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1607 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1609 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1610 const uint8_t *diff, int w,
1611 int *left, int *left_top);
1612 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1614 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1617 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1618 int32_t min, int32_t max, unsigned int len);
1619 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1620 int32_t min, int32_t max, unsigned int len);
1621 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1622 int32_t min, int32_t max, unsigned int len);
1623 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1624 int32_t min, int32_t max, unsigned int len);
/* Fill one 16-entry row of c->PFX##_pixels_tab with the complete set of
 * quarter-pel MC functions (mc00 .. mc33, indexed as x + 4*y) for the
 * given SIZE and CPU suffix. PREFIX distinguishes the ff_-prefixed
 * external-asm names from the static inline-asm ones. */
1626 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1628 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1629 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1630 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1631 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1632 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1633 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1634 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1635 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1636 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1637 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1638 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1639 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1640 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1641 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1642 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1643 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fill the 4 halfpel entries of a pixels_tab row: [0] full copy,
 * [1] horizontal (x2), [2] vertical (y2), [3] diagonal (xy2) average.
 * IDX is pasted verbatim so it can be "[0]", "[1]" or empty. */
1646 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1648 c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1649 c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1650 c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1651 c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* Install the baseline MMX implementations: clamped pixel ops, block
 * clear/edge helpers, halfpel MC tables, IDCT selection, H.263 loop
 * filters and 8-bit H.264 chroma MC. high_bit_depth gates everything
 * that only exists for 8-bit content. */
1654 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
1656 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1659 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1660 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1661 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1663 if (!high_bit_depth) {
1664 c->clear_block = clear_block_mmx;
1665 c->clear_blocks = clear_blocks_mmx;
1666 c->draw_edges = draw_edges_mmx;
/* Halfpel tables: row 0 = 16x16, row 1 = 8x8 block sizes. */
1668 SET_HPEL_FUNCS(put, [0], 16, mmx);
1669 SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1670 SET_HPEL_FUNCS(avg, [0], 16, mmx);
1671 SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1672 SET_HPEL_FUNCS(put, [1], 8, mmx);
1673 SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1674 SET_HPEL_FUNCS(avg, [1], 8, mmx);
/* Select the MMX IDCT matching the caller-requested algorithm. */
1676 switch (avctx->idct_algo) {
1678 case FF_IDCT_SIMPLEMMX:
1679 c->idct_put = ff_simple_idct_put_mmx;
1680 c->idct_add = ff_simple_idct_add_mmx;
1681 c->idct = ff_simple_idct_mmx;
1682 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1684 case FF_IDCT_XVIDMMX:
1685 c->idct_put = ff_idct_xvid_mmx_put;
1686 c->idct_add = ff_idct_xvid_mmx_add;
1687 c->idct = ff_idct_xvid_mmx;
1694 c->add_bytes = add_bytes_mmx;
1696 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1697 c->h263_v_loop_filter = h263_v_loop_filter_mmx;
1698 c->h263_h_loop_filter = h263_h_loop_filter_mmx;
1700 #endif /* HAVE_INLINE_ASM */
1703 if (!high_bit_depth && CONFIG_H264CHROMA) {
1704 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
1705 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
1708 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install MMXEXT implementations: full qpel MC tables, faster halfpel
 * put/avg, Xvid IDCT, VP3/Theora bit-exact halfpel, 8- and 10-bit H.264
 * chroma MC, HuffYUV median prediction and int16 vector helpers. */
1713 static void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1716 const int bit_depth = avctx->bits_per_raw_sample;
1717 const int high_bit_depth = bit_depth > 8;
1720 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1721 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1723 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1724 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1725 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1726 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1728 if (!high_bit_depth) {
1729 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1730 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1732 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1733 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1734 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1736 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1737 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1739 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1740 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1741 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* no_rnd variants are gated on !BITEXACT -- presumably they do not
 * match the bit-exact reference implementation. */
1744 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1745 if (!high_bit_depth) {
1746 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1747 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1748 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1749 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1751 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1752 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1755 #endif /* HAVE_YASM */
1758 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1759 c->idct_put = ff_idct_xvid_mmxext_put;
1760 c->idct_add = ff_idct_xvid_mmxext_add;
1761 c->idct = ff_idct_xvid_mmxext;
1763 #endif /* HAVE_INLINE_ASM */
1765 #if HAVE_MMXEXT_EXTERNAL
/* VP3/Theora require the exact (bit-accurate) no_rnd halfpel variants. */
1766 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1767 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1768 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1769 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1772 if (!high_bit_depth && CONFIG_H264CHROMA) {
1773 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
1774 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
1775 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
1776 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
1778 if (bit_depth == 10 && CONFIG_H264CHROMA) {
1779 c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
1780 c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
1781 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
1782 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
1785 /* slower than cmov version on AMD */
1786 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1787 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1789 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1790 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
1792 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1793 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1795 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1797 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install AMD 3DNow! implementations: halfpel put/avg (8-bit only),
 * VP3/Theora exact no_rnd variants and 8-bit H.264 chroma averaging. */
1800 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1803 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1806 if (!high_bit_depth) {
1807 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1808 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1810 c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1811 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1812 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1814 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1815 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1817 c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1818 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1819 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* no_rnd variants only when bit-exact output is not required. */
1821 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1822 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1823 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1824 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1825 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1827 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1828 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
1832 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1833 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1834 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1835 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1838 if (!high_bit_depth && CONFIG_H264CHROMA) {
1839 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
1840 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
1842 #endif /* HAVE_YASM */
/* Install SSE implementations: 16-byte-aligned block clears (skipped
 * under XvMC, which may hand us unaligned blocks) and float clipping. */
1845 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
1847 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1850 if (!high_bit_depth) {
1851 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1852 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1853 c->clear_block = clear_block_sse;
1854 c->clear_blocks = clear_blocks_sse;
1858 c->vector_clipf = vector_clipf_sse;
1859 #endif /* HAVE_INLINE_ASM */
/* Install SSE2 implementations: Xvid IDCT (inline asm), 16x16 pixel
 * copy/average, 10-bit H.264 chroma MC, int16 scalar products, int32
 * clipping (Atom gets a dedicated variant) and bswap. */
1862 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1865 const int bit_depth = avctx->bits_per_raw_sample;
1866 const int high_bit_depth = bit_depth > 8;
1868 #if HAVE_SSE2_INLINE
1869 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1870 c->idct_put = ff_idct_xvid_sse2_put;
1871 c->idct_add = ff_idct_xvid_sse2_add;
1872 c->idct = ff_idct_xvid_sse2;
1873 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1875 #endif /* HAVE_SSE2_INLINE */
1877 #if HAVE_SSE2_EXTERNAL
1878 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1879 // these functions are slower than mmx on AMD, but faster on Intel
1880 if (!high_bit_depth) {
1881 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1882 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
1883 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1887 if (bit_depth == 10) {
1888 if (CONFIG_H264CHROMA) {
1889 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
1890 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
1894 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1895 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
1896 if (mm_flags & AV_CPU_FLAG_ATOM) {
1897 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1899 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
1901 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1902 c->apply_window_int16 = ff_apply_window_int16_sse2;
1903 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1904 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1906 c->bswap_buf = ff_bswap32_buf_sse2;
1907 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3 implementations: 8-bit H.264 chroma MC, HuffYUV left
 * prediction (SSE4 variant when available), windowing (Atom-tuned
 * variant on Atom) and bswap. */
1910 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1913 #if HAVE_SSSE3_EXTERNAL
1914 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1916 if (!high_bit_depth && CONFIG_H264CHROMA) {
1917 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
1918 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
1919 c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
1920 c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
1922 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1923 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1924 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1926 if (mm_flags & AV_CPU_FLAG_ATOM)
1927 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1929 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1930 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1931 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1932 c->bswap_buf = ff_bswap32_buf_ssse3;
1933 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install the single SSE4 implementation: int32 vector clipping. */
1936 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1939 #if HAVE_SSE4_EXTERNAL
1940 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1941 #endif /* HAVE_SSE4_EXTERNAL */
/* Install AVX implementations: currently only the 10-bit H.264
 * 8-pixel chroma MC functions. */
1944 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
1946 #if HAVE_AVX_EXTERNAL
1947 const int bit_depth = avctx->bits_per_raw_sample;
1949 if (bit_depth == 10) {
1950 if (CONFIG_H264CHROMA) {
1951 c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
1952 c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
1955 #endif /* HAVE_AVX_EXTERNAL */
1958 void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1960 int mm_flags = av_get_cpu_flags();
1962 #if HAVE_7REGS && HAVE_INLINE_ASM
1963 if (mm_flags & AV_CPU_FLAG_CMOV)
1964 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1967 if (mm_flags & AV_CPU_FLAG_MMX)
1968 dsputil_init_mmx(c, avctx, mm_flags);
1970 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1971 dsputil_init_mmxext(c, avctx, mm_flags);
1973 if (mm_flags & AV_CPU_FLAG_3DNOW)
1974 dsputil_init_3dnow(c, avctx, mm_flags);
1976 if (mm_flags & AV_CPU_FLAG_SSE)
1977 dsputil_init_sse(c, avctx, mm_flags);
1979 if (mm_flags & AV_CPU_FLAG_SSE2)
1980 dsputil_init_sse2(c, avctx, mm_flags);
1982 if (mm_flags & AV_CPU_FLAG_SSSE3)
1983 dsputil_init_ssse3(c, avctx, mm_flags);
1985 if (mm_flags & AV_CPU_FLAG_SSE4)
1986 dsputil_init_sse4(c, avctx, mm_flags);
1988 if (mm_flags & AV_CPU_FLAG_AVX)
1989 dsputil_init_avx(c, avctx, mm_flags);
1991 if (CONFIG_ENCODERS)
1992 ff_dsputilenc_init_mmx(c, avctx);