2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "libavcodec/videodsp.h"
33 #include "dsputil_mmx.h"
34 #include "idct_xvid.h"
35 #include "diracdsp_mmx.h"
40 /* pixel operations */
/*
 * Constant tables used by the MMX/SSE DSP routines below and by the yasm
 * code that references them via MANGLE(): ff_pw_* are packed 16-bit words,
 * ff_pb_* packed bytes, ff_pd_* packed doubles.  8-byte-aligned uint64_t
 * entries are MMX-only; 16-byte-aligned xmm_reg entries are shared with SSE2.
 * NOTE(review): this listing carries stray leading line numbers and appears
 * to be missing intermediate lines (extraction artifact); all visible tokens
 * are kept byte-identical.
 */
41 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
42 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
44 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
51 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27) = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28) = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
61 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63) = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
66 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4) = { 0x0404040404040404ULL, 0x0404040404040404ULL };
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7) = 0x0707070707070707ULL;
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F) = 0x1F1F1F1F1F1F1F1FULL;
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
77 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81) = 0x8181818181818181ULL;
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1) = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
80 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
81 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE) = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };
83 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
84 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/*
 * Prototypes for pixel put/avg primitives implemented in external (yasm)
 * assembly: _x2/_y2 are half-pel interpolating copies, _l2 averages two
 * sources, _xy2 is the 2D half-pel case, "no_rnd" variants round down.
 * NOTE(review): stray leading line numbers from the extraction are kept
 * byte-identical; presumably this span sits under a HAVE_YASM guard whose
 * #if line is missing from this view -- confirm against the full file.
 */
88 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
89 ptrdiff_t line_size, int h);
90 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
91 ptrdiff_t line_size, int h);
92 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
93 int dstStride, int src1Stride, int h);
94 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
95 uint8_t *src2, int dstStride,
96 int src1Stride, int h);
97 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
98 int dstStride, int src1Stride, int h);
99 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
100 ptrdiff_t line_size, int h);
101 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
102 ptrdiff_t line_size, int h);
103 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104 int dstStride, int src1Stride, int h);
105 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
106 int dstStride, int src1Stride, int h);
107 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
108 int dstStride, int src1Stride, int h);
109 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
110 ptrdiff_t line_size, int h);
111 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
112 ptrdiff_t line_size, int h);
113 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
114 const uint8_t *pixels,
115 ptrdiff_t line_size, int h);
116 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
117 const uint8_t *pixels,
118 ptrdiff_t line_size, int h);
119 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
120 ptrdiff_t line_size, int h);
121 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
122 ptrdiff_t line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
124 ptrdiff_t line_size, int h);
125 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
126 ptrdiff_t line_size, int h);
127 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
128 const uint8_t *pixels,
129 ptrdiff_t line_size, int h);
130 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
131 const uint8_t *pixels,
132 ptrdiff_t line_size, int h);
133 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
134 ptrdiff_t line_size, int h);
135 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
136 ptrdiff_t line_size, int h);
137 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
138 ptrdiff_t line_size, int h);
139 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
140 ptrdiff_t line_size, int h);
141 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
142 ptrdiff_t line_size, int h);
143 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
144 ptrdiff_t line_size, int h);
145 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
146 ptrdiff_t line_size, int h);
147 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
148 ptrdiff_t line_size, int h);
150 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
/**
 * Copy a 16-pixel-wide block of height h by delegating to two 8-wide
 * MMXEXT copies: the left half at offset 0 and the right half at +8.
 * (Reconstructed with conventional formatting; the extraction had lost
 * the braces and stray line numbers had been prepended.)
 */
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_put_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
/*
 * Prototypes for the MPEG-4 quarter-pel 6-tap lowpass filters (yasm),
 * horizontal and vertical, 8- and 16-wide, plus no_rnd aliases for the
 * plain put copies.
 * NOTE(review): several prototype tails (e.g. the "int h);" lines) are
 * missing from this extraction; visible tokens kept byte-identical.
 */
158 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159 int dstStride, int srcStride, int h);
160 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
161 int dstStride, int srcStride, int h);
162 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
163 int dstStride, int srcStride,
165 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166 int dstStride, int srcStride, int h);
167 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
168 int dstStride, int srcStride, int h);
169 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
170 int dstStride, int srcStride,
172 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173 int dstStride, int srcStride);
174 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175 int dstStride, int srcStride);
176 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177 int dstStride, int srcStride);
178 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179 int dstStride, int srcStride);
180 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
181 int dstStride, int srcStride);
182 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
183 int dstStride, int srcStride);
184 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
185 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
186 #endif /* HAVE_YASM */
/*
 * Inline-asm helper macros for the MMX rounding/no-rounding averaging
 * kernels, followed by the template instantiations.
 *
 * PAVGB_MMX_NO_RND computes (a & b) + (((a ^ b) & 0xfe..fe) >> 1), i.e.
 * a round-down byte average; PAVGB_MMX computes (a | b) - the same shifted
 * XOR term, i.e. a round-up average.  The PAVGBP_* variants do two such
 * averages at once and expect mm6 to hold 0xfefefefefefefefe.
 * The DEF/SET_RND/PAVGB(P) defines parameterize dsputil_rnd_template.c,
 * which is included twice (no-rnd and rnd flavours).
 *
 * NOTE(review): the __asm__ volatile( opener lines of MOVQ_BFE/BONE/WTWO,
 * the #if/#else around the two MOVQ_BONE/WTWO definitions, and the #undef
 * lines between template includes are missing from this extraction; stray
 * leading line numbers are kept byte-identical.  Do not hand-edit the asm
 * statement order -- the register dataflow is exact.
 */
191 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
192 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
194 #define MOVQ_BFE(regd) \
196 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
197 "paddb %%"#regd", %%"#regd" \n\t" ::)
200 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
201 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
203 // for shared library it's better to use this way for accessing constants
205 #define MOVQ_BONE(regd) \
207 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
208 "psrlw $15, %%"#regd" \n\t" \
209 "packuswb %%"#regd", %%"#regd" \n\t" ::)
211 #define MOVQ_WTWO(regd) \
213 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
214 "psrlw $15, %%"#regd" \n\t" \
215 "psllw $1, %%"#regd" \n\t"::)
219 // using regr as temporary and for the output result
220 // first argument is unmodifed and second is trashed
221 // regfe is supposed to contain 0xfefefefefefefefe
222 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
223 "movq "#rega", "#regr" \n\t" \
224 "pand "#regb", "#regr" \n\t" \
225 "pxor "#rega", "#regb" \n\t" \
226 "pand "#regfe", "#regb" \n\t" \
227 "psrlq $1, "#regb" \n\t" \
228 "paddb "#regb", "#regr" \n\t"
230 #define PAVGB_MMX(rega, regb, regr, regfe) \
231 "movq "#rega", "#regr" \n\t" \
232 "por "#regb", "#regr" \n\t" \
233 "pxor "#rega", "#regb" \n\t" \
234 "pand "#regfe", "#regb" \n\t" \
235 "psrlq $1, "#regb" \n\t" \
236 "psubb "#regb", "#regr" \n\t"
238 // mm6 is supposed to contain 0xfefefefefefefefe
239 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
240 "movq "#rega", "#regr" \n\t" \
241 "movq "#regc", "#regp" \n\t" \
242 "pand "#regb", "#regr" \n\t" \
243 "pand "#regd", "#regp" \n\t" \
244 "pxor "#rega", "#regb" \n\t" \
245 "pxor "#regc", "#regd" \n\t" \
246 "pand %%mm6, "#regb" \n\t" \
247 "pand %%mm6, "#regd" \n\t" \
248 "psrlq $1, "#regb" \n\t" \
249 "psrlq $1, "#regd" \n\t" \
250 "paddb "#regb", "#regr" \n\t" \
251 "paddb "#regd", "#regp" \n\t"
253 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
254 "movq "#rega", "#regr" \n\t" \
255 "movq "#regc", "#regp" \n\t" \
256 "por "#regb", "#regr" \n\t" \
257 "por "#regd", "#regp" \n\t" \
258 "pxor "#rega", "#regb" \n\t" \
259 "pxor "#regc", "#regd" \n\t" \
260 "pand %%mm6, "#regb" \n\t" \
261 "pand %%mm6, "#regd" \n\t" \
262 "psrlq $1, "#regd" \n\t" \
263 "psrlq $1, "#regb" \n\t" \
264 "psubb "#regb", "#regr" \n\t" \
265 "psubb "#regd", "#regp" \n\t"
267 /***********************************/
268 /* MMX no rounding */
270 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
271 #define SET_RND MOVQ_WONE
272 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
273 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
274 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
276 #include "dsputil_rnd_template.c"
283 /***********************************/
286 #define DEF(x, y) x ## _ ## y ## _mmx
287 #define SET_RND MOVQ_WTWO
288 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
289 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
291 #include "dsputil_rnd_template.c"
299 #endif /* HAVE_INLINE_ASM */
303 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
305 /***********************************/
308 #define DEF(x) x ## _3dnow
310 #include "dsputil_avg_template.c"
314 /***********************************/
315 /* MMXEXT specific */
317 #define DEF(x) x ## _mmxext
319 #include "dsputil_avg_template.c"
323 #endif /* HAVE_YASM */
327 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
328 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
329 #define put_pixels16_mmxext put_pixels16_mmx
330 #define put_pixels8_mmxext put_pixels8_mmx
331 #define put_pixels4_mmxext put_pixels4_mmx
332 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
333 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
335 /***********************************/
/*
 * Store a 64-coefficient IDCT output block (int16_t) into an 8x8 pixel
 * region: each asm statement packs eight rows' worth of words to unsigned
 * bytes (packuswb clamps to 0..255) and writes four 8-byte rows using
 * line_size strides.  The block is processed in two 4-row halves; the
 * comment below explains why the second half is not folded into a loop.
 * NOTE(review): the end of the signature, the function brace, the
 * __asm__ volatile( openers and the statement closers are missing from
 * this extraction -- visible tokens kept byte-identical; confirm against
 * the full file before editing.
 */
338 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
344 /* read the pixels */
349 "movq (%3), %%mm0 \n\t"
350 "movq 8(%3), %%mm1 \n\t"
351 "movq 16(%3), %%mm2 \n\t"
352 "movq 24(%3), %%mm3 \n\t"
353 "movq 32(%3), %%mm4 \n\t"
354 "movq 40(%3), %%mm5 \n\t"
355 "movq 48(%3), %%mm6 \n\t"
356 "movq 56(%3), %%mm7 \n\t"
357 "packuswb %%mm1, %%mm0 \n\t"
358 "packuswb %%mm3, %%mm2 \n\t"
359 "packuswb %%mm5, %%mm4 \n\t"
360 "packuswb %%mm7, %%mm6 \n\t"
361 "movq %%mm0, (%0) \n\t"
362 "movq %%mm2, (%0, %1) \n\t"
363 "movq %%mm4, (%0, %1, 2) \n\t"
364 "movq %%mm6, (%0, %2) \n\t"
365 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
368 pix += line_size * 4;
371 // if here would be an exact copy of the code above
372 // compiler would generate some very strange code
375 "movq (%3), %%mm0 \n\t"
376 "movq 8(%3), %%mm1 \n\t"
377 "movq 16(%3), %%mm2 \n\t"
378 "movq 24(%3), %%mm3 \n\t"
379 "movq 32(%3), %%mm4 \n\t"
380 "movq 40(%3), %%mm5 \n\t"
381 "movq 48(%3), %%mm6 \n\t"
382 "movq 56(%3), %%mm7 \n\t"
383 "packuswb %%mm1, %%mm0 \n\t"
384 "packuswb %%mm3, %%mm2 \n\t"
385 "packuswb %%mm5, %%mm4 \n\t"
386 "packuswb %%mm7, %%mm6 \n\t"
387 "movq %%mm0, (%0) \n\t"
388 "movq %%mm2, (%0, %1) \n\t"
389 "movq %%mm4, (%0, %1, 2) \n\t"
390 "movq %%mm6, (%0, %2) \n\t"
391 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/*
 * Asm-fragment macro: converts 64 signed coefficients at byte offset
 * "off" of operand %2 into bytes (packsswb clamps to -128..127), biases
 * them by +128 via mm0 (expected to hold ff_pb_80), and stores four
 * 8-byte rows using strides %3 (line_skip) and %1 (3 * line_skip).
 * NOTE(review): stray leading line numbers kept byte-identical.
 */
395 #define put_signed_pixels_clamped_mmx_half(off) \
396 "movq "#off"(%2), %%mm1 \n\t" \
397 "movq 16 + "#off"(%2), %%mm2 \n\t" \
398 "movq 32 + "#off"(%2), %%mm3 \n\t" \
399 "movq 48 + "#off"(%2), %%mm4 \n\t" \
400 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
401 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
402 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
403 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
404 "paddb %%mm0, %%mm1 \n\t" \
405 "paddb %%mm0, %%mm2 \n\t" \
406 "paddb %%mm0, %%mm3 \n\t" \
407 "paddb %%mm0, %%mm4 \n\t" \
408 "movq %%mm1, (%0) \n\t" \
409 "movq %%mm2, (%0, %3) \n\t" \
410 "movq %%mm3, (%0, %3, 2) \n\t" \
411 "movq %%mm4, (%0, %1) \n\t"
/*
 * Store a signed IDCT block into 8x8 pixels: loads the 0x80 bias into mm0,
 * computes line_skip * 3 into %1, and expands the macro above twice (rows
 * 0-3 at offset 0, rows 4-7 at offset 64) advancing the pixel pointer by
 * four lines in between.
 * NOTE(review): the signature tail, braces and the __asm__ volatile(
 * opener/closer are missing from this extraction; tokens kept as-is.
 */
413 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
416 x86_reg line_skip = line_size;
420 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
421 "lea (%3, %3, 2), %1 \n\t"
422 put_signed_pixels_clamped_mmx_half(0)
423 "lea (%0, %3, 4), %0 \n\t"
424 put_signed_pixels_clamped_mmx_half(64)
425 : "+&r"(pixels), "=&r"(line_skip3)
426 : "r"(block), "r"(line_skip)
/*
 * Add a 64-coefficient IDCT residual block to existing pixels: two pixel
 * rows are unpacked from bytes to words against mm7 (presumably zeroed by
 * a missing MOVQ_ZERO-style line -- confirm), added with signed saturation
 * (paddsw) to four coefficient quadwords, re-packed with unsigned clamping
 * and stored back in place.  Processes two lines per iteration.
 * NOTE(review): the signature tail, braces, the loop construct and the
 * asm opener/closer are missing from this extraction; tokens kept as-is.
 */
430 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
437 /* read the pixels */
444 "movq (%2), %%mm0 \n\t"
445 "movq 8(%2), %%mm1 \n\t"
446 "movq 16(%2), %%mm2 \n\t"
447 "movq 24(%2), %%mm3 \n\t"
448 "movq %0, %%mm4 \n\t"
449 "movq %1, %%mm6 \n\t"
450 "movq %%mm4, %%mm5 \n\t"
451 "punpcklbw %%mm7, %%mm4 \n\t"
452 "punpckhbw %%mm7, %%mm5 \n\t"
453 "paddsw %%mm4, %%mm0 \n\t"
454 "paddsw %%mm5, %%mm1 \n\t"
455 "movq %%mm6, %%mm5 \n\t"
456 "punpcklbw %%mm7, %%mm6 \n\t"
457 "punpckhbw %%mm7, %%mm5 \n\t"
458 "paddsw %%mm6, %%mm2 \n\t"
459 "paddsw %%mm5, %%mm3 \n\t"
460 "packuswb %%mm1, %%mm0 \n\t"
461 "packuswb %%mm3, %%mm2 \n\t"
462 "movq %%mm0, %0 \n\t"
463 "movq %%mm2, %1 \n\t"
464 : "+m"(*pix), "+m"(*(pix + line_size))
467 pix += line_size * 2;
/*
 * Straight 8-byte-wide block copy, four rows per asm-loop iteration:
 * REG_a holds 2 * line_size, each half copies two rows (base and +stride)
 * then advances both pointers by two lines.  h is decremented by the
 * (missing) loop-control instructions.
 * NOTE(review): braces, the asm opener, the "1:" label and the
 * subl/jnz tail are missing from this extraction; tokens kept as-is.
 */
472 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
473 ptrdiff_t line_size, int h)
476 "lea (%3, %3), %%"REG_a" \n\t"
479 "movq (%1 ), %%mm0 \n\t"
480 "movq (%1, %3), %%mm1 \n\t"
481 "movq %%mm0, (%2) \n\t"
482 "movq %%mm1, (%2, %3) \n\t"
483 "add %%"REG_a", %1 \n\t"
484 "add %%"REG_a", %2 \n\t"
485 "movq (%1 ), %%mm0 \n\t"
486 "movq (%1, %3), %%mm1 \n\t"
487 "movq %%mm0, (%2) \n\t"
488 "movq %%mm1, (%2, %3) \n\t"
489 "add %%"REG_a", %1 \n\t"
490 "add %%"REG_a", %2 \n\t"
493 : "+g"(h), "+r"(pixels), "+r"(block)
494 : "r"((x86_reg)line_size)
/*
 * 16-byte-wide block copy: same four-rows-per-iteration structure as
 * put_pixels8_mmx but moves two quadwords (offsets 0 and 8) per row,
 * using mm0/mm4 and mm1/mm5 pairs.
 * NOTE(review): braces, the asm opener, the loop label and the loop tail
 * are missing from this extraction; tokens kept as-is.
 */
499 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
500 ptrdiff_t line_size, int h)
503 "lea (%3, %3), %%"REG_a" \n\t"
506 "movq (%1 ), %%mm0 \n\t"
507 "movq 8(%1 ), %%mm4 \n\t"
508 "movq (%1, %3), %%mm1 \n\t"
509 "movq 8(%1, %3), %%mm5 \n\t"
510 "movq %%mm0, (%2) \n\t"
511 "movq %%mm4, 8(%2) \n\t"
512 "movq %%mm1, (%2, %3) \n\t"
513 "movq %%mm5, 8(%2, %3) \n\t"
514 "add %%"REG_a", %1 \n\t"
515 "add %%"REG_a", %2 \n\t"
516 "movq (%1 ), %%mm0 \n\t"
517 "movq 8(%1 ), %%mm4 \n\t"
518 "movq (%1, %3), %%mm1 \n\t"
519 "movq 8(%1, %3), %%mm5 \n\t"
520 "movq %%mm0, (%2) \n\t"
521 "movq %%mm4, 8(%2) \n\t"
522 "movq %%mm1, (%2, %3) \n\t"
523 "movq %%mm5, 8(%2, %3) \n\t"
524 "add %%"REG_a", %1 \n\t"
525 "add %%"REG_a", %2 \n\t"
528 : "+g"(h), "+r"(pixels), "+r"(block)
529 : "r"((x86_reg)line_size)
/*
 * Generator macro for block-zeroing functions: zeroes mm7, then walks a
 * negative index register upward writing 32 zero bytes per iteration,
 * covering n 128-byte (64 x int16_t) blocks.  Instantiated below for
 * 6 blocks (clear_blocks_mmx) and 1 block (clear_block_mmx).
 * NOTE(review): the asm opener, the loop label/branch and the closing
 * lines of the macro are missing from this extraction; tokens kept as-is.
 */
534 #define CLEAR_BLOCKS(name, n) \
535 static void name(int16_t *blocks) \
538 "pxor %%mm7, %%mm7 \n\t" \
539 "mov %1, %%"REG_a" \n\t" \
541 "movq %%mm7, (%0, %%"REG_a") \n\t" \
542 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
543 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
544 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
545 "add $32, %%"REG_a" \n\t" \
547 :: "r"(((uint8_t *)blocks) + 128 * n), \
552 CLEAR_BLOCKS(clear_blocks_mmx, 6)
553 CLEAR_BLOCKS(clear_block_mmx, 1)
/*
 * Zero one 64-coefficient int16_t block (128 bytes) with eight aligned
 * 16-byte SSE stores from a zeroed xmm0.  movaps requires the block to be
 * 16-byte aligned.
 * NOTE(review): braces and the asm opener/closer are missing from this
 * extraction; tokens kept as-is.
 */
555 static void clear_block_sse(int16_t *block)
558 "xorps %%xmm0, %%xmm0 \n"
559 "movaps %%xmm0, (%0) \n"
560 "movaps %%xmm0, 16(%0) \n"
561 "movaps %%xmm0, 32(%0) \n"
562 "movaps %%xmm0, 48(%0) \n"
563 "movaps %%xmm0, 64(%0) \n"
564 "movaps %%xmm0, 80(%0) \n"
565 "movaps %%xmm0, 96(%0) \n"
566 "movaps %%xmm0, 112(%0) \n"
/*
 * Zero six consecutive 128-byte blocks (6 * 64 int16_t): negative-index
 * loop writing 128 bytes of aligned SSE stores per iteration, mirroring
 * the CLEAR_BLOCKS technique above.
 * NOTE(review): braces, the asm opener, the loop label/branch and the
 * closing constraint lines are missing from this extraction; tokens kept
 * as-is.
 */
572 static void clear_blocks_sse(int16_t *blocks)
575 "xorps %%xmm0, %%xmm0 \n"
576 "mov %1, %%"REG_a" \n"
578 "movaps %%xmm0, (%0, %%"REG_a") \n"
579 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
580 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
581 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
582 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
583 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
584 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
585 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
586 "add $128, %%"REG_a" \n"
588 :: "r"(((uint8_t *)blocks) + 128 * 6),
/*
 * dst[i] += src[i] for w bytes: the asm loop handles 16 bytes per
 * iteration with wrapping byte adds (paddb), and the trailing scalar
 * statement (loop header missing from this view) finishes the remaining
 * bytes -- the "(x86_reg)w - 15" bound stops the SIMD loop short of the
 * tail.
 * NOTE(review): braces, the loop constructs and the asm opener/closer are
 * missing from this extraction; tokens kept as-is.
 */
594 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
600 "movq (%1, %0), %%mm0 \n\t"
601 "movq (%2, %0), %%mm1 \n\t"
602 "paddb %%mm0, %%mm1 \n\t"
603 "movq %%mm1, (%2, %0) \n\t"
604 "movq 8(%1, %0), %%mm0 \n\t"
605 "movq 8(%2, %0), %%mm1 \n\t"
606 "paddb %%mm0, %%mm1 \n\t"
607 "movq %%mm1, 8(%2, %0) \n\t"
613 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
616 dst[i + 0] += src[i + 0];
/*
 * HuffYUV median prediction using cmov: for each byte, predicts
 * median(left, top, left + top - topleft), adds the diff byte and stores
 * to dst; l/tl carry the running left and top-left values across calls
 * via *left / *left_top (masked to bytes on entry).
 * NOTE(review): most of the cmov sequence, the asm opener/closer and the
 * epilogue writing back *left / *left_top are missing from this
 * extraction; tokens kept as-is.  Do not reorder -- the flag-dependent
 * cmov chain is exact.
 */
620 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
621 const uint8_t *diff, int w,
622 int *left, int *left_top)
626 int l = *left & 0xff;
627 int tl = *left_top & 0xff;
632 "movzbl (%3, %4), %2 \n"
645 "add (%6, %4), %b0 \n"
646 "mov %b0, (%5, %4) \n"
649 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
650 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
656 #endif /* HAVE_INLINE_ASM */
/* Prototypes for the H.263 loop filters implemented in external asm. */
658 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
659 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
/*
 * Edge extension for motion estimation/compensation: replicates the
 * left/right border pixel of every row w pixels outward (three asm
 * variants -- presumably selected by missing w==8/w==16/else branches;
 * confirm against the full file), then copies the first/last rows
 * upward/downward for EDGE_TOP / EDGE_BOTTOM, four rows per iteration.
 * NOTE(review): the loop headers, asm openers/closers and the branch
 * structure between the three horizontal variants are missing from this
 * extraction; tokens kept as-is.
 */
662 /* Draw the edges of width 'w' of an image of size width, height
663 * this MMX version can only handle w == 8 || w == 16. */
664 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
665 int w, int h, int sides)
667 uint8_t *ptr, *last_line;
670 last_line = buf + (height - 1) * wrap;
676 "movd (%0), %%mm0 \n\t"
677 "punpcklbw %%mm0, %%mm0 \n\t"
678 "punpcklwd %%mm0, %%mm0 \n\t"
679 "punpckldq %%mm0, %%mm0 \n\t"
680 "movq %%mm0, -8(%0) \n\t"
681 "movq -8(%0, %2), %%mm1 \n\t"
682 "punpckhbw %%mm1, %%mm1 \n\t"
683 "punpckhwd %%mm1, %%mm1 \n\t"
684 "punpckhdq %%mm1, %%mm1 \n\t"
685 "movq %%mm1, (%0, %2) \n\t"
690 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
695 "movd (%0), %%mm0 \n\t"
696 "punpcklbw %%mm0, %%mm0 \n\t"
697 "punpcklwd %%mm0, %%mm0 \n\t"
698 "punpckldq %%mm0, %%mm0 \n\t"
699 "movq %%mm0, -8(%0) \n\t"
700 "movq %%mm0, -16(%0) \n\t"
701 "movq -8(%0, %2), %%mm1 \n\t"
702 "punpckhbw %%mm1, %%mm1 \n\t"
703 "punpckhwd %%mm1, %%mm1 \n\t"
704 "punpckhdq %%mm1, %%mm1 \n\t"
705 "movq %%mm1, (%0, %2) \n\t"
706 "movq %%mm1, 8(%0, %2) \n\t"
711 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
717 "movd (%0), %%mm0 \n\t"
718 "punpcklbw %%mm0, %%mm0 \n\t"
719 "punpcklwd %%mm0, %%mm0 \n\t"
720 "movd %%mm0, -4(%0) \n\t"
721 "movd -4(%0, %2), %%mm1 \n\t"
722 "punpcklbw %%mm1, %%mm1 \n\t"
723 "punpckhwd %%mm1, %%mm1 \n\t"
724 "punpckhdq %%mm1, %%mm1 \n\t"
725 "movd %%mm1, (%0, %2) \n\t"
730 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
734 /* top and bottom (and hopefully also the corners) */
735 if (sides & EDGE_TOP) {
736 for (i = 0; i < h; i += 4) {
737 ptr = buf - (i + 1) * wrap - w;
740 "movq (%1, %0), %%mm0 \n\t"
741 "movq %%mm0, (%0) \n\t"
742 "movq %%mm0, (%0, %2) \n\t"
743 "movq %%mm0, (%0, %2, 2) \n\t"
744 "movq %%mm0, (%0, %3) \n\t"
749 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
750 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
755 if (sides & EDGE_BOTTOM) {
756 for (i = 0; i < h; i += 4) {
757 ptr = last_line + (i + 1) * wrap - w;
760 "movq (%1, %0), %%mm0 \n\t"
761 "movq %%mm0, (%0) \n\t"
762 "movq %%mm0, (%0, %2) \n\t"
763 "movq %%mm0, (%0, %2, 2) \n\t"
764 "movq %%mm0, (%0, %3) \n\t"
769 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
770 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
771 "r"(ptr + width + 2 * w)
776 #endif /* HAVE_INLINE_ASM */
780 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
781 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
784 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
787 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
791 uint8_t * const half = (uint8_t*)temp; \
792 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
794 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
795 stride, stride, 8); \
798 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
801 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
805 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
809 uint8_t * const half = (uint8_t*)temp; \
810 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
812 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
816 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
820 uint8_t * const half = (uint8_t*)temp; \
821 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
823 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
824 stride, stride, 8); \
827 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
830 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
834 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
838 uint8_t * const half = (uint8_t*)temp; \
839 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
841 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
845 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
848 uint64_t half[8 + 9]; \
849 uint8_t * const halfH = ((uint8_t*)half) + 64; \
850 uint8_t * const halfHV = ((uint8_t*)half); \
851 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
853 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
855 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
856 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
860 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
863 uint64_t half[8 + 9]; \
864 uint8_t * const halfH = ((uint8_t*)half) + 64; \
865 uint8_t * const halfHV = ((uint8_t*)half); \
866 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
868 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
870 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
871 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
875 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
878 uint64_t half[8 + 9]; \
879 uint8_t * const halfH = ((uint8_t*)half) + 64; \
880 uint8_t * const halfHV = ((uint8_t*)half); \
881 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
883 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
885 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
886 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
890 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
893 uint64_t half[8 + 9]; \
894 uint8_t * const halfH = ((uint8_t*)half) + 64; \
895 uint8_t * const halfHV = ((uint8_t*)half); \
896 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
898 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
900 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
901 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
905 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
908 uint64_t half[8 + 9]; \
909 uint8_t * const halfH = ((uint8_t*)half) + 64; \
910 uint8_t * const halfHV = ((uint8_t*)half); \
911 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
913 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
914 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
918 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
921 uint64_t half[8 + 9]; \
922 uint8_t * const halfH = ((uint8_t*)half) + 64; \
923 uint8_t * const halfHV = ((uint8_t*)half); \
924 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
926 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
927 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
931 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
934 uint64_t half[8 + 9]; \
935 uint8_t * const halfH = ((uint8_t*)half); \
936 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
938 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
940 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
944 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
947 uint64_t half[8 + 9]; \
948 uint8_t * const halfH = ((uint8_t*)half); \
949 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
951 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
953 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
957 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
961 uint8_t * const halfH = ((uint8_t*)half); \
962 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
964 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
968 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
971 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
974 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
978 uint8_t * const half = (uint8_t*)temp; \
979 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
981 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
985 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
988 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
989 stride, stride, 16);\
992 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
996 uint8_t * const half = (uint8_t*)temp; \
997 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
999 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1000 stride, stride, 16); \
1003 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1006 uint64_t temp[32]; \
1007 uint8_t * const half = (uint8_t*)temp; \
1008 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1010 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1014 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1017 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
1021 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1024 uint64_t temp[32]; \
1025 uint8_t * const half = (uint8_t*)temp; \
1026 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1028 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1029 stride, stride, 16); \
1032 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
/* NOTE(review): interior of a qpel16 function-generator macro (OPNAME, RND,  \
 * MMX are macro parameters); several continuation lines are elided in this   \
 * excerpt, so some argument lists appear truncated.                          \
 * Scratch layout: half[] holds two 16-byte-wide planes - halfHV at offset 0  \
 * and the horizontally low-pass filtered plane halfH at byte offset 256.     \
 */                                                                           \
1035 uint64_t half[16 * 2 + 17 * 2]; \
1036 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1037 uint8_t * const halfHV = ((uint8_t*)half); \
1038 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1040 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1042 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1044 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
/* mc31: like the case above but averages with src + 1 (right neighbour). */ \
1048 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1051 uint64_t half[16 * 2 + 17 * 2]; \
1052 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1053 uint8_t * const halfHV = ((uint8_t*)half); \
1054 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1056 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1058 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1060 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
/* mc13: final combine reads halfH + 16 (one 16-byte row down). */ \
1064 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1067 uint64_t half[16 * 2 + 17 * 2]; \
1068 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1069 uint8_t * const halfHV = ((uint8_t*)half); \
1070 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1072 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1074 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1076 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
/* mc33: src + 1 for the halfpel average AND halfH + 16 for the combine. */ \
1080 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1083 uint64_t half[16 * 2 + 17 * 2]; \
1084 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1085 uint8_t * const halfHV = ((uint8_t*)half); \
1086 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1088 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1090 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1092 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
/* mc21: no extra halfpel average - H filter, V filter, then combine. */ \
1096 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1099 uint64_t half[16 * 2 + 17 * 2]; \
1100 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1101 uint8_t * const halfHV = ((uint8_t*)half); \
1102 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1104 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1106 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
/* mc23: same as mc21 but combines from halfH + 16. */ \
1110 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1113 uint64_t half[16 * 2 + 17 * 2]; \
1114 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1115 uint8_t * const halfHV = ((uint8_t*)half); \
1116 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1118 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1120 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
/* mc12/mc32/mc22 only need the 17-row halfH scratch plane (no halfHV); */ \
/* the V filter writes straight into dst via the OPNAME variant.        */ \
1124 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1127 uint64_t half[17 * 2]; \
1128 uint8_t * const halfH = ((uint8_t*)half); \
1129 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1131 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1133 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1137 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1140 uint64_t half[17 * 2]; \
1141 uint8_t * const halfH = ((uint8_t*)half); \
1142 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1144 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1146 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1150 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1153 uint64_t half[17 * 2]; \
1154 uint8_t * const halfH = ((uint8_t*)half); \
1155 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1157 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Instantiate the qpel16/qpel8 helpers for mmxext: rounding "put" and
 * "avg" variants, plus the no-rounding "put" variant (ff_pw_15 bias). */
1161 QPEL_OP(put_, ff_pw_16, _, mmxext)
1162 QPEL_OP(avg_, ff_pw_16, _, mmxext)
1163 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
1164 #endif /* HAVE_YASM */
1168 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1170 put_pixels8_xy2_mmx(dst, src, stride, 8);
1172 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1174 put_pixels16_xy2_mmx(dst, src, stride, 16);
1176 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1178 avg_pixels8_xy2_mmx(dst, src, stride, 8);
1180 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1182 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* Signature of the edge-emulation helper passed to gmc() (bound to
 * ff_emulated_edge_mc_8 below). Presumably copies a block_w x block_h
 * area, padding samples that lie outside the w x h picture - confirm
 * against the videodsp implementation. */
1185 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1186 ptrdiff_t linesize, int block_w, int block_h,
1187 int src_x, int src_y, int w, int h);
/* Global motion compensation core (one-warp-point affine MC).
 * Walks the destination in 4-pixel column groups, stepping per-pixel
 * source coordinates by the affine increments (dxx..dyy) and bilinearly
 * interpolating with MMX. Falls back to ff_gmc_c() whenever the fullpel
 * offset varies across the block, the increments use more than 16 bits
 * of subpel precision, or edge emulation would overflow the stack buffer.
 * NOTE(review): several lines of this function (asm statement heads, loop
 * braces, early return) are elided in this excerpt; control flow shown
 * here is partial. */
1189 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1190 int stride, int h, int ox, int oy,
1191 int dxx, int dxy, int dyx, int dyy,
1192 int shift, int r, int width, int height,
1193 emulated_edge_mc_func *emu_edge_fn)
/* Fullpel (integer) part of the start offset. */
1196 const int ix = ox >> (16 + shift);
1197 const int iy = oy >> (16 + shift);
/* Offsets/increments with 4 guard bits dropped, for 16-bit lane math. */
1198 const int oxs = ox >> 4;
1199 const int oys = oy >> 4;
1200 const int dxxs = dxx >> 4;
1201 const int dxys = dxy >> 4;
1202 const int dyxs = dyx >> 4;
1203 const int dyys = dyy >> 4;
/* Broadcast vectors consumed by the MMX inner loop. */
1204 const uint16_t r4[4] = { r, r, r, r };
1205 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1206 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1207 const uint64_t shift2 = 2 * shift;
1208 #define MAX_STRIDE 4096U
1210 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
/* Total motion span across the block; used below to detect whether the
 * fullpel offset stays constant for every pixel. */
1213 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1214 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1215 const int dxh = dxy * (h - 1);
1216 const int dyw = dyx * (w - 1);
/* unsigned compare folds the "< 0" and ">= limit" checks into one. */
1217 int need_emu = (unsigned)ix >= width - w ||
1218 (unsigned)iy >= height - h;
1220 if ( // non-constant fullpel offset (3% of blocks)
1221 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1222 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1223 // uses more than 16 bits of subpel mv (only at huge resolution)
1224 || (dxx | dxy | dyx | dyy) & 15
1225 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1226 // FIXME could still use mmx for some of the rows
1227 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1228 shift, r, width, height);
1232 src += ix + iy * stride;
/* (w+1) x (h+1) source area: the bilinear taps read one extra
 * row/column to the right and below. */
1234 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
/* mm6 = broadcast 16-bit scale s (used as (s - dx), (s - dy) below);
 * mm7 = 0, for zero-extending bytes with punpcklbw. */
1239 "movd %0, %%mm6 \n\t"
1240 "pxor %%mm7, %%mm7 \n\t"
1241 "punpcklwd %%mm6, %%mm6 \n\t"
1242 "punpcklwd %%mm6, %%mm6 \n\t"
1246 for (x = 0; x < w; x += 4) {
/* Per-lane start coordinates for this 4-pixel column group (the
 * "- dxys"/"- dyys" pre-subtraction is undone by the first paddw). */
1247 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1248 oxs - dxys + dxxs * (x + 1),
1249 oxs - dxys + dxxs * (x + 2),
1250 oxs - dxys + dxxs * (x + 3) };
1251 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1252 oys - dyys + dyxs * (x + 1),
1253 oys - dyys + dyxs * (x + 2),
1254 oys - dyys + dyxs * (x + 3) };
1256 for (y = 0; y < h; y++) {
/* Advance the coordinate vectors by one row, store them back, and
 * keep the top 4 bits as the bilinear fractions dx (mm4), dy (mm5). */
1258 "movq %0, %%mm4 \n\t"
1259 "movq %1, %%mm5 \n\t"
1260 "paddw %2, %%mm4 \n\t"
1261 "paddw %3, %%mm5 \n\t"
1262 "movq %%mm4, %0 \n\t"
1263 "movq %%mm5, %1 \n\t"
1264 "psrlw $12, %%mm4 \n\t"
1265 "psrlw $12, %%mm5 \n\t"
1266 : "+m"(*dx4), "+m"(*dy4)
1267 : "m"(*dxy4), "m"(*dyy4)
/* Build the four bilinear weights from dx, dy and the scale s. */
1271 "movq %%mm6, %%mm2 \n\t"
1272 "movq %%mm6, %%mm1 \n\t"
1273 "psubw %%mm4, %%mm2 \n\t"
1274 "psubw %%mm5, %%mm1 \n\t"
1275 "movq %%mm2, %%mm0 \n\t"
1276 "movq %%mm4, %%mm3 \n\t"
1277 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1278 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1279 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1280 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
/* Weight the four neighbouring source pixels; %5 (r4) adds rounding. */
1282 "movd %4, %%mm5 \n\t"
1283 "movd %3, %%mm4 \n\t"
1284 "punpcklbw %%mm7, %%mm5 \n\t"
1285 "punpcklbw %%mm7, %%mm4 \n\t"
1286 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1287 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1289 "movd %2, %%mm5 \n\t"
1290 "movd %1, %%mm4 \n\t"
1291 "punpcklbw %%mm7, %%mm5 \n\t"
1292 "punpcklbw %%mm7, %%mm4 \n\t"
1293 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1294 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
1295 "paddw %5, %%mm1 \n\t"
1296 "paddw %%mm3, %%mm2 \n\t"
1297 "paddw %%mm1, %%mm0 \n\t"
1298 "paddw %%mm2, %%mm0 \n\t"
/* Scale back down by 2*shift and pack the 4 results to bytes. */
1300 "psrlw %6, %%mm0 \n\t"
1301 "packuswb %%mm0, %%mm0 \n\t"
1302 "movd %%mm0, %0 \n\t"
1304 : "=m"(dst[x + y * stride])
1305 : "m"(src[0]), "m"(src[1]),
1306 "m"(src[stride]), "m"(src[stride + 1]),
1307 "m"(*r4), "m"(shift2)
/* Rewind to the top row, next 4-pixel column group. */
1311 src += 4 - h * stride;
/* Thin wrappers binding gmc() to an edge-emulation implementation.
 * NOTE(review): the #if/#else preprocessor lines are elided from this
 * excerpt, which is why gmc_mmx appears twice - one definition per
 * configuration branch; only one is compiled. */
1318 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1319 int stride, int h, int ox, int oy,
1320 int dxx, int dxy, int dyx, int dyy,
1321 int shift, int r, int width, int height)
1323 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1324 width, height, &ff_emulated_edge_mc_8);
1327 static void gmc_sse(uint8_t *dst, uint8_t *src,
1328 int stride, int h, int ox, int oy,
1329 int dxx, int dxy, int dyx, int dyy,
1330 int shift, int r, int width, int height)
1332 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1333 width, height, &ff_emulated_edge_mc_8);
1336 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1337 int stride, int h, int ox, int oy,
1338 int dxx, int dxy, int dyx, int dyy,
1339 int shift, int r, int width, int height)
1341 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1342 width, height, &ff_emulated_edge_mc_8);
1347 #endif /* HAVE_INLINE_ASM */
/* Prototypes for the SSE2 16-pixel copy/average primitives defined
 * outside this file (used by the Dirac wrappers and SSE2 init below). */
1349 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1350 ptrdiff_t line_size, int h);
1351 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1352 ptrdiff_t line_size, int h);
/* CAVS fullpel (mc00) cases are plain copies/averages, so forward to
 * the MMX pixel primitives. (Brace lines are elided in this excerpt.) */
1357 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1359 put_pixels8_mmx(dst, src, stride, 8);
1362 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1364 avg_pixels8_mmx(dst, src, stride, 8);
1367 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1369 put_pixels16_mmx(dst, src, stride, 16);
1372 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1374 avg_pixels16_mmx(dst, src, stride, 16);
1376 #endif /* HAVE_INLINE_ASM */
/* VC-1 mspel mc00 (no subpel shift): plain 8x8 copy/average; the rnd
 * parameter is unused for the fullpel case. */
1380 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1381 int stride, int rnd)
1383 ff_put_pixels8_mmx(dst, src, stride, 8);
1386 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1387 int stride, int rnd)
1389 ff_avg_pixels8_mmxext(dst, src, stride, 8);
1391 #endif /* HAVE_YASM */
/* Generator for Dirac hpel copy/average entry points. Each generated
 * function either calls the C fallback or forwards src[0] to the SIMD
 * pixels8/16 primitive (the guard condition line is elided from this
 * excerpt - presumably it tests h or alignment; confirm upstream).
 * The 32-wide case is done as two adjacent 16-wide calls. */
1393 #if CONFIG_DIRAC_DECODER
1394 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
1395 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1398 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
1400 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1402 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1405 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
1407 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1409 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1412 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
1414 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1415 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
1420 DIRAC_PIXOP(put, put, mmx)
1421 DIRAC_PIXOP(avg, avg, mmx)
1425 DIRAC_PIXOP(avg, ff_avg, mmxext)
/* Dirac SSE2 copies, written out by hand (not via DIRAC_PIXOP): each
 * falls back to the C version when the elided guard condition holds,
 * otherwise uses the SSE2 16-pixel primitives; 32-wide is split into
 * two 16-wide calls. */
1427 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1430 ff_put_dirac_pixels16_c(dst, src, stride, h);
1432 ff_put_pixels16_sse2(dst, src[0], stride, h);
1434 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1437 ff_avg_dirac_pixels16_c(dst, src, stride, h);
1439 ff_avg_pixels16_sse2(dst, src[0], stride, h);
1441 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1444 ff_put_dirac_pixels32_c(dst, src, stride, h);
1446 ff_put_pixels16_sse2(dst , src[0] , stride, h);
1447 ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
1450 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1453 ff_avg_dirac_pixels32_c(dst, src, stride, h);
1455 ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1456 ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1462 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
/* IDCT + clamped put/add wrappers for the libmpeg2 MMX/MMXEXT IDCTs.
 * NOTE(review): the ff_mmx_idct(block) calls of the plain-MMX variants
 * and the function braces fall on lines elided from this excerpt. */
1465 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1469 ff_put_pixels_clamped_mmx(block, dest, line_size);
1472 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1476 ff_add_pixels_clamped_mmx(block, dest, line_size);
1479 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
1482 ff_mmxext_idct(block);
1483 ff_put_pixels_clamped_mmx(block, dest, line_size);
1486 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1489 ff_mmxext_idct(block);
1490 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Clamp len floats from src into [min, max] and store to dst, processing
 * 16 floats (4 xmm registers) per iteration with maxps/minps.
 * movaps requires src and dst 16-byte aligned; from the index math, len
 * appears to need to be a multiple of 16 - TODO confirm against callers.
 * NOTE(review): the asm loop label/branch lines are elided here. */
1495 static void vector_clipf_sse(float *dst, const float *src,
1496 float min, float max, int len)
1498 x86_reg i = (len - 16) * 4;
/* Broadcast min into xmm4 and max into xmm5. */
1500 "movss %3, %%xmm4 \n\t"
1501 "movss %4, %%xmm5 \n\t"
1502 "shufps $0, %%xmm4, %%xmm4 \n\t"
1503 "shufps $0, %%xmm5, %%xmm5 \n\t"
1505 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1506 "movaps 16(%2, %0), %%xmm1 \n\t"
1507 "movaps 32(%2, %0), %%xmm2 \n\t"
1508 "movaps 48(%2, %0), %%xmm3 \n\t"
1509 "maxps %%xmm4, %%xmm0 \n\t"
1510 "maxps %%xmm4, %%xmm1 \n\t"
1511 "maxps %%xmm4, %%xmm2 \n\t"
1512 "maxps %%xmm4, %%xmm3 \n\t"
1513 "minps %%xmm5, %%xmm0 \n\t"
1514 "minps %%xmm5, %%xmm1 \n\t"
1515 "minps %%xmm5, %%xmm2 \n\t"
1516 "minps %%xmm5, %%xmm3 \n\t"
1517 "movaps %%xmm0, (%1, %0) \n\t"
1518 "movaps %%xmm1, 16(%1, %0) \n\t"
1519 "movaps %%xmm2, 32(%1, %0) \n\t"
1520 "movaps %%xmm3, 48(%1, %0) \n\t"
1524 : "r"(dst), "r"(src), "m"(min), "m"(max)
/* Prototypes for implementations defined outside this file; the init
 * functions below select among them according to runtime CPU flags. */
1531 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1533 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1535 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1537 int order, int mul);
1538 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1540 int order, int mul);
1541 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1543 int order, int mul);
1545 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1546 const int16_t *window, unsigned int len);
1547 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1548 const int16_t *window, unsigned int len);
1549 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1550 const int16_t *window, unsigned int len);
1551 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1552 const int16_t *window, unsigned int len);
1553 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1554 const int16_t *window, unsigned int len);
1555 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1556 const int16_t *window, unsigned int len);
1558 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1559 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1561 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1562 const uint8_t *diff, int w,
1563 int *left, int *left_top);
1564 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1566 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1569 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1570 int32_t min, int32_t max, unsigned int len);
1571 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1572 int32_t min, int32_t max, unsigned int len);
1573 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1574 int32_t min, int32_t max, unsigned int len);
1575 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1576 int32_t min, int32_t max, unsigned int len);
/* Fill all 16 quarter-pel (mcXY, X = horizontal, Y = vertical subpel
 * position) slots of a pixels_tab row with the CPU-specific functions
 * generated above. PREFIX allows both ff_-prefixed external and
 * unprefixed static function names. */
1578 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1580 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1581 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1582 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1583 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1584 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1585 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1586 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1587 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1588 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1589 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1590 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1591 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1592 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1593 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1594 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1595 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fill the 4 halfpel slots (fullpel, x2, y2, xy2) of a pixels_tab row.
 * IDX is pasted verbatim, so it may be "[0]"/"[1]" or empty for tables
 * with a single row (see the avg_no_rnd use below). */
1598 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1600 c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1601 c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1602 c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1603 c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* Install the baseline MMX function pointers. 8-bit-only functions are
 * gated on !high_bit_depth; higher depths keep the C versions.
 * (Several brace/#if lines fall on lines elided from this excerpt.) */
1606 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1609 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1612 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1613 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1614 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1616 if (!high_bit_depth) {
1617 c->clear_block = clear_block_mmx;
1618 c->clear_blocks = clear_blocks_mmx;
1619 c->draw_edges = draw_edges_mmx;
/* Halfpel tables: row [0] = 16-wide, row [1] = 8-wide. avg_no_rnd has a
 * single row, hence the empty IDX argument. */
1621 SET_HPEL_FUNCS(put, [0], 16, mmx);
1622 SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1623 SET_HPEL_FUNCS(avg, [0], 16, mmx);
1624 SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1625 SET_HPEL_FUNCS(put, [1], 8, mmx);
1626 SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1627 SET_HPEL_FUNCS(avg, [1], 8, mmx);
1630 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
1634 c->add_bytes = add_bytes_mmx;
1635 #endif /* HAVE_INLINE_ASM */
1638 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1639 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1640 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1643 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install MMXEXT overrides: full qpel tables, hpel x2/y2/xy2 variants
 * and scalar helpers. (Guarding #if lines and some braces are elided
 * from this excerpt.) */
1648 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1651 const int bit_depth = avctx->bits_per_raw_sample;
1652 const int high_bit_depth = bit_depth > 8;
1655 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1656 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1658 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1659 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1660 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1661 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1663 if (!high_bit_depth) {
1664 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1665 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1667 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1668 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1669 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1671 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1672 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1674 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1675 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1676 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* The no-rounding variants are skipped under CODEC_FLAG_BITEXACT. */
1679 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1680 if (!high_bit_depth) {
1681 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1682 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1683 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1684 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1686 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1687 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1690 #endif /* HAVE_YASM */
1692 #if HAVE_MMXEXT_EXTERNAL
/* VP3/Theora always use the bit-exact no-rounding averages. */
1693 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1694 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1695 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1696 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1699 /* slower than cmov version on AMD */
1700 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1701 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1703 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1704 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* Bitexact mode gets the non-rounding window, otherwise the rounded one. */
1706 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1707 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1709 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1711 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install 3DNow! halfpel put/avg variants; mirrors the MMXEXT table
 * setup above. (Guarding #if lines and braces are elided here.) */
1714 static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1717 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1720 if (!high_bit_depth) {
1721 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1722 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1724 c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1725 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1726 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1728 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1729 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1731 c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1732 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1733 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* No-rounding variants are skipped under CODEC_FLAG_BITEXACT. */
1735 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1736 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1737 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1738 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1739 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1741 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1742 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
/* VP3/Theora always use the bit-exact no-rounding averages. */
1746 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1747 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1748 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1749 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1751 #endif /* HAVE_YASM */
/* Install SSE overrides: 16-byte-aligned clear_block(s) and the float
 * clipping kernel. Under XvMC acceleration the blocks may not be
 * 16-byte aligned, so the aligned clear functions are skipped. */
1754 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1757 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1760 if (!high_bit_depth) {
1761 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1762 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1763 c->clear_block = clear_block_sse;
1764 c->clear_blocks = clear_blocks_sse;
1768 c->vector_clipf = vector_clipf_sse;
1769 #endif /* HAVE_INLINE_ASM */
1772 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
1775 #endif /* HAVE_YASM */
/* Install SSE2 overrides: the Xvid IDCT (inline asm) plus external
 * 16-pixel copy/average and scalar helpers. CPUs flagged SSE2SLOW keep
 * the MMX pixel loops (see the comment below). */
1778 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1781 const int bit_depth = avctx->bits_per_raw_sample;
1782 const int high_bit_depth = bit_depth > 8;
1784 #if HAVE_SSE2_INLINE
1785 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1786 c->idct_put = ff_idct_xvid_sse2_put;
1787 c->idct_add = ff_idct_xvid_sse2_add;
1788 c->idct = ff_idct_xvid_sse2;
1789 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1791 #endif /* HAVE_SSE2_INLINE */
1793 #if HAVE_SSE2_EXTERNAL
1794 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1795 // these functions are slower than mmx on AMD, but faster on Intel
1796 if (!high_bit_depth) {
1797 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1798 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
1799 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1803 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1804 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* Atom gets a dedicated vector_clip_int32 variant. */
1805 if (mm_flags & AV_CPU_FLAG_ATOM) {
1806 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1808 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
1810 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1811 c->apply_window_int16 = ff_apply_window_int16_sse2;
1812 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1813 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1815 c->bswap_buf = ff_bswap32_buf_sse2;
1816 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3 overrides (external asm only): HuffYUV left prediction,
 * int16 windowing and 32-bit byte swap. The extra CPU-flag checks gate
 * variants that are slower on specific microarchitectures (see the
 * inline comments). */
1819 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1822 #if HAVE_SSSE3_EXTERNAL
1823 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1824 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1825 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1827 if (mm_flags & AV_CPU_FLAG_ATOM)
1828 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1830 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1831 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1832 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1833 c->bswap_buf = ff_bswap32_buf_ssse3;
1834 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install the SSE4 override for vector_clip_int32. */
1837 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1840 #if HAVE_SSE4_EXTERNAL
1841 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1842 #endif /* HAVE_SSE4_EXTERNAL */
/* Top-level x86 DSP initializer: queries CPU flags once, selects an
 * IDCT implementation, then cascades into the per-instruction-set init
 * helpers, each overriding pointers set by the previous level.
 * (Brace lines and some #if lines are elided from this excerpt.) */
1845 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1847 int mm_flags = av_get_cpu_flags();
1849 #if HAVE_7REGS && HAVE_INLINE_ASM
1850 if (mm_flags & AV_CPU_FLAG_CMOV)
1851 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1854 if (mm_flags & AV_CPU_FLAG_MMX) {
1856 const int idct_algo = avctx->idct_algo;
/* The SIMD IDCTs here only cover full-resolution, 8-bit decoding. */
1858 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
1859 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
1860 c->idct_put = ff_simple_idct_put_mmx;
1861 c->idct_add = ff_simple_idct_add_mmx;
1862 c->idct = ff_simple_idct_mmx;
1863 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1865 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
1866 if (mm_flags & AV_CPU_FLAG_MMX2) {
1867 c->idct_put = ff_libmpeg2mmx2_idct_put;
1868 c->idct_add = ff_libmpeg2mmx2_idct_add;
1869 c->idct = ff_mmxext_idct;
1871 c->idct_put = ff_libmpeg2mmx_idct_put;
1872 c->idct_add = ff_libmpeg2mmx_idct_add;
1873 c->idct = ff_mmx_idct;
1875 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
/* Xvid IDCT: pick the best available of SSE2 > MMXEXT > MMX. */
1877 } else if (idct_algo == FF_IDCT_XVIDMMX) {
1878 if (mm_flags & AV_CPU_FLAG_SSE2) {
1879 c->idct_put = ff_idct_xvid_sse2_put;
1880 c->idct_add = ff_idct_xvid_sse2_add;
1881 c->idct = ff_idct_xvid_sse2;
1882 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1883 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
1884 c->idct_put = ff_idct_xvid_mmxext_put;
1885 c->idct_add = ff_idct_xvid_mmxext_add;
1886 c->idct = ff_idct_xvid_mmxext;
1888 c->idct_put = ff_idct_xvid_mmx_put;
1889 c->idct_add = ff_idct_xvid_mmx_add;
1890 c->idct = ff_idct_xvid_mmx;
1894 #endif /* HAVE_INLINE_ASM */
1896 dsputil_init_mmx(c, avctx, mm_flags);
/* Extension levels are cumulative: each init may override earlier ones. */
1899 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1900 dsputil_init_mmxext(c, avctx, mm_flags);
1902 if (mm_flags & AV_CPU_FLAG_3DNOW)
1903 dsputil_init_3dnow(c, avctx, mm_flags);
1905 if (mm_flags & AV_CPU_FLAG_SSE)
1906 dsputil_init_sse(c, avctx, mm_flags);
1908 if (mm_flags & AV_CPU_FLAG_SSE2)
1909 dsputil_init_sse2(c, avctx, mm_flags);
1911 if (mm_flags & AV_CPU_FLAG_SSSE3)
1912 dsputil_init_ssse3(c, avctx, mm_flags);
1914 if (mm_flags & AV_CPU_FLAG_SSE4)
1915 dsputil_init_sse4(c, avctx, mm_flags);
1917 if (CONFIG_ENCODERS)
1918 ff_dsputilenc_init_mmx(c, avctx);