2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "libavcodec/videodsp.h"
33 #include "dsputil_mmx.h"
34 #include "idct_xvid.h"
35 #include "diracdsp_mmx.h"
/* pixel operations */

/* 8-byte (MMX-sized) constants: ff_bone has 0x01 in every byte,
 * ff_wtwo has the value 2 in every 16-bit word. */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

/* ff_pw_N: the value N replicated in every 16-bit word.
 * 16-byte xmm_reg versions serve SSE2 code; the plain 8-byte uint64_t
 * versions are only ever used by MMX code. */
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1)    = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2)    = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3)    = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4)    = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5)    = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8)    = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9)    = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15)   =   0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16)   = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17)   = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18)   = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20)   =   0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27)   = { 0x001B001B001B001BULL, 0x001B001B001B001BULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28)   = { 0x001C001C001C001CULL, 0x001C001C001C001CULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32)   = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42)   =   0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53)   =   0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63)   = { 0x003F003F003F003FULL, 0x003F003F003F003FULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64)   = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96)   =   0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128)  =   0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255)  =   0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512)  = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };

/* ff_pb_XX: the byte 0xXX replicated in every byte of the register. */
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0)    = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1)    = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3)    = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4)    = { 0x0404040404040404ULL, 0x0404040404040404ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7)    =   0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F)   =   0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F)   =   0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80)   = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81)   =   0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1)   = { 0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL };
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8)   = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC)   =   0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE)   = { 0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL };

/* ff_pd_N: the double-precision value N in both 64-bit lanes
 * (for SSE2 floating-point code). */
DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
88 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
89 ptrdiff_t line_size, int h);
90 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
91 ptrdiff_t line_size, int h);
92 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
93 int dstStride, int src1Stride, int h);
94 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
95 uint8_t *src2, int dstStride,
96 int src1Stride, int h);
97 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
98 int dstStride, int src1Stride, int h);
99 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
100 ptrdiff_t line_size, int h);
101 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
102 ptrdiff_t line_size, int h);
103 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
104 int dstStride, int src1Stride, int h);
105 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
106 int dstStride, int src1Stride, int h);
107 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
108 int dstStride, int src1Stride, int h);
109 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
110 ptrdiff_t line_size, int h);
111 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
112 ptrdiff_t line_size, int h);
113 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
114 const uint8_t *pixels,
115 ptrdiff_t line_size, int h);
116 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
117 const uint8_t *pixels,
118 ptrdiff_t line_size, int h);
119 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
120 ptrdiff_t line_size, int h);
121 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
122 ptrdiff_t line_size, int h);
123 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
124 ptrdiff_t line_size, int h);
125 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
126 ptrdiff_t line_size, int h);
127 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
128 const uint8_t *pixels,
129 ptrdiff_t line_size, int h);
130 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
131 const uint8_t *pixels,
132 ptrdiff_t line_size, int h);
133 void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
134 ptrdiff_t line_size, int h);
135 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
136 ptrdiff_t line_size, int h);
137 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
138 ptrdiff_t line_size, int h);
139 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
140 ptrdiff_t line_size, int h);
141 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
142 ptrdiff_t line_size, int h);
143 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
144 ptrdiff_t line_size, int h);
145 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
146 ptrdiff_t line_size, int h);
147 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
148 ptrdiff_t line_size, int h);
150 void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h);
151 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
152 ptrdiff_t line_size, int h)
154 ff_put_pixels8_mmxext(block, pixels, line_size, h);
155 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
158 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
159 int dstStride, int srcStride, int h);
160 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
161 int dstStride, int srcStride, int h);
162 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
163 int dstStride, int srcStride,
165 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
166 int dstStride, int srcStride, int h);
167 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
168 int dstStride, int srcStride, int h);
169 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
170 int dstStride, int srcStride,
172 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
173 int dstStride, int srcStride);
174 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
175 int dstStride, int srcStride);
176 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
177 int dstStride, int srcStride);
178 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
179 int dstStride, int srcStride);
180 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
181 int dstStride, int srcStride);
182 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
183 int dstStride, int srcStride);
184 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
185 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
186 #endif /* HAVE_YASM */
/* Align the following code location to an 8-byte boundary
 * (used before loop labels that are jump targets). */
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear MMX register regd (regd ^= regd). */
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
194 #define MOVQ_BFE(regd) \
196 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
197 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* Load the byte-ones constant (0x01 in every byte) from memory into regd. */
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
/* Load the word-twos constant (2 in every 16-bit word) from memory into regd. */
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
// For shared libraries it is better to generate these constants in
// registers instead of loading them from memory (avoids PIC-relative loads).
205 #define MOVQ_BONE(regd) \
207 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
208 "psrlw $15, %%"#regd" \n\t" \
209 "packuswb %%"#regd", %%"#regd" \n\t" ::)
211 #define MOVQ_WTWO(regd) \
213 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
214 "psrlw $15, %%"#regd" \n\t" \
215 "psllw $1, %%"#regd" \n\t"::)
219 // using regr as temporary and for the output result
// the first argument is left unmodified and the second is trashed
221 // regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average of rega and regb WITHOUT rounding (rounds down):
 *   regr = (a & b) + (((a ^ b) & 0xfe..fe) >> 1)
 * regfe must hold 0xfefefefefefefefe; the mask discards the bit that
 * psrlq would otherwise shift across byte-lane boundaries.
 * rega is left unmodified, regb is trashed, regr receives the result. */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq  "#rega", "#regr"  \n\t" \
    "pand  "#regb", "#regr"  \n\t" \
    "pxor  "#rega", "#regb"  \n\t" \
    "pand  "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb"       \n\t" \
    "paddb "#regb", "#regr"  \n\t"
/* Byte-wise average of rega and regb WITH rounding (rounds up):
 *   regr = (a | b) - (((a ^ b) & 0xfe..fe) >> 1)
 * regfe must hold 0xfefefefefefefefe (per-byte shift mask, see above).
 * rega is left unmodified, regb is trashed, regr receives the result. */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq  "#rega", "#regr"  \n\t" \
    "por   "#regb", "#regr"  \n\t" \
    "pxor  "#rega", "#regb"  \n\t" \
    "pand  "#regfe", "#regb" \n\t" \
    "psrlq $1, "#regb"       \n\t" \
    "psubb "#regb", "#regr"  \n\t"
/* Paired variant of PAVGB_MMX_NO_RND: computes two byte-wise averages
 * without rounding at once, regr = avg_down(rega, regb) and
 * regp = avg_down(regc, regd).  mm6 is supposed to contain
 * 0xfefefefefefefefe (the per-byte shift mask).
 * rega/regc are unmodified, regb/regd are trashed. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq  "#rega", "#regr" \n\t" \
    "movq  "#regc", "#regp" \n\t" \
    "pand  "#regb", "#regr" \n\t" \
    "pand  "#regd", "#regp" \n\t" \
    "pxor  "#rega", "#regb" \n\t" \
    "pxor  "#regc", "#regd" \n\t" \
    "pand  %%mm6,   "#regb" \n\t" \
    "pand  %%mm6,   "#regd" \n\t" \
    "psrlq $1,      "#regb" \n\t" \
    "psrlq $1,      "#regd" \n\t" \
    "paddb "#regb", "#regr" \n\t" \
    "paddb "#regd", "#regp" \n\t"
/* Paired variant of PAVGB_MMX: computes two byte-wise averages with
 * rounding at once, regr = avg_up(rega, regb) and
 * regp = avg_up(regc, regd).  mm6 must contain 0xfefefefefefefefe.
 * rega/regc are unmodified, regb/regd are trashed. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq  "#rega", "#regr" \n\t" \
    "movq  "#regc", "#regp" \n\t" \
    "por   "#regb", "#regr" \n\t" \
    "por   "#regd", "#regp" \n\t" \
    "pxor  "#rega", "#regb" \n\t" \
    "pxor  "#regc", "#regd" \n\t" \
    "pand  %%mm6,   "#regb" \n\t" \
    "pand  %%mm6,   "#regd" \n\t" \
    "psrlq $1,      "#regd" \n\t" \
    "psrlq $1,      "#regb" \n\t" \
    "psubb "#regb", "#regr" \n\t" \
    "psubb "#regd", "#regp" \n\t"
/***********************************/
/* MMX no rounding */

/* Configure dsputil_rnd_template.c to emit the "no rounding" MMX
 * variants: _no_rnd_ name infix, word-one rounder, truncating
 * byte-average macros.  Note OP_AVG still maps to the rounding
 * PAVGB_MMX: avg ops always round. */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"
/***********************************/
/* MMX rounding */

/* Reconfigure dsputil_rnd_template.c for the normal (rounding)
 * variants: plain _mmx name suffix, word-two rounder and the
 * rounding byte-average macros. */
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)

#include "dsputil_rnd_template.c"
299 #endif /* HAVE_INLINE_ASM */
303 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
305 /***********************************/
308 #define DEF(x) x ## _3dnow
310 #include "dsputil_avg_template.c"
314 /***********************************/
315 /* MMXEXT specific */
317 #define DEF(x) x ## _mmxext
319 #include "dsputil_avg_template.c"
323 #endif /* HAVE_YASM */
/* Full-pel copies are unaffected by rounding, so the no-rnd and the
 * mmxext put() variants simply alias the plain MMX implementations. */
#define put_no_rnd_pixels16_mmx    put_pixels16_mmx
#define put_no_rnd_pixels8_mmx     put_pixels8_mmx
#define put_pixels16_mmxext        put_pixels16_mmx
#define put_pixels8_mmxext         put_pixels8_mmx
#define put_pixels4_mmxext         put_pixels4_mmx
#define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmxext  put_no_rnd_pixels8_mmx
335 /***********************************/
338 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
344 /* read the pixels */
349 "movq (%3), %%mm0 \n\t"
350 "movq 8(%3), %%mm1 \n\t"
351 "movq 16(%3), %%mm2 \n\t"
352 "movq 24(%3), %%mm3 \n\t"
353 "movq 32(%3), %%mm4 \n\t"
354 "movq 40(%3), %%mm5 \n\t"
355 "movq 48(%3), %%mm6 \n\t"
356 "movq 56(%3), %%mm7 \n\t"
357 "packuswb %%mm1, %%mm0 \n\t"
358 "packuswb %%mm3, %%mm2 \n\t"
359 "packuswb %%mm5, %%mm4 \n\t"
360 "packuswb %%mm7, %%mm6 \n\t"
361 "movq %%mm0, (%0) \n\t"
362 "movq %%mm2, (%0, %1) \n\t"
363 "movq %%mm4, (%0, %1, 2) \n\t"
364 "movq %%mm6, (%0, %2) \n\t"
365 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
368 pix += line_size * 4;
    // If this were an exact copy of the code above, the compiler
    // would generate some very strange code.
375 "movq (%3), %%mm0 \n\t"
376 "movq 8(%3), %%mm1 \n\t"
377 "movq 16(%3), %%mm2 \n\t"
378 "movq 24(%3), %%mm3 \n\t"
379 "movq 32(%3), %%mm4 \n\t"
380 "movq 40(%3), %%mm5 \n\t"
381 "movq 48(%3), %%mm6 \n\t"
382 "movq 56(%3), %%mm7 \n\t"
383 "packuswb %%mm1, %%mm0 \n\t"
384 "packuswb %%mm3, %%mm2 \n\t"
385 "packuswb %%mm5, %%mm4 \n\t"
386 "packuswb %%mm7, %%mm6 \n\t"
387 "movq %%mm0, (%0) \n\t"
388 "movq %%mm2, (%0, %1) \n\t"
389 "movq %%mm4, (%0, %1, 2) \n\t"
390 "movq %%mm6, (%0, %2) \n\t"
391 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Emit asm for one half (4 rows of 8 coefficients, 64 bytes) of an
 * 8x8 signed 16-bit block: packsswb saturates each row to signed
 * bytes, then paddb with mm0 (which must hold ff_pb_80, 0x80 in every
 * byte) biases the result into the unsigned [0,255] range.
 * Operands: %0 = dst pixels, %1 = 3 * line_skip, %2 = src block,
 * %3 = line_skip. */
#define put_signed_pixels_clamped_mmx_half(off)            \
    "movq          "#off"(%2), %%mm1 \n\t"                 \
    "movq     16 + "#off"(%2), %%mm2 \n\t"                 \
    "movq     32 + "#off"(%2), %%mm3 \n\t"                 \
    "movq     48 + "#off"(%2), %%mm4 \n\t"                 \
    "packsswb  8 + "#off"(%2), %%mm1 \n\t"                 \
    "packsswb 24 + "#off"(%2), %%mm2 \n\t"                 \
    "packsswb 40 + "#off"(%2), %%mm3 \n\t"                 \
    "packsswb 56 + "#off"(%2), %%mm4 \n\t"                 \
    "paddb              %%mm0, %%mm1 \n\t"                 \
    "paddb              %%mm0, %%mm2 \n\t"                 \
    "paddb              %%mm0, %%mm3 \n\t"                 \
    "paddb              %%mm0, %%mm4 \n\t"                 \
    "movq               %%mm1, (%0)  \n\t"                 \
    "movq               %%mm2, (%0, %3) \n\t"              \
    "movq               %%mm3, (%0, %3, 2) \n\t"           \
    "movq               %%mm4, (%0, %1) \n\t"
413 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
416 x86_reg line_skip = line_size;
420 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
421 "lea (%3, %3, 2), %1 \n\t"
422 put_signed_pixels_clamped_mmx_half(0)
423 "lea (%0, %3, 4), %0 \n\t"
424 put_signed_pixels_clamped_mmx_half(64)
425 : "+&r"(pixels), "=&r"(line_skip3)
426 : "r"(block), "r"(line_skip)
430 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
437 /* read the pixels */
444 "movq (%2), %%mm0 \n\t"
445 "movq 8(%2), %%mm1 \n\t"
446 "movq 16(%2), %%mm2 \n\t"
447 "movq 24(%2), %%mm3 \n\t"
448 "movq %0, %%mm4 \n\t"
449 "movq %1, %%mm6 \n\t"
450 "movq %%mm4, %%mm5 \n\t"
451 "punpcklbw %%mm7, %%mm4 \n\t"
452 "punpckhbw %%mm7, %%mm5 \n\t"
453 "paddsw %%mm4, %%mm0 \n\t"
454 "paddsw %%mm5, %%mm1 \n\t"
455 "movq %%mm6, %%mm5 \n\t"
456 "punpcklbw %%mm7, %%mm6 \n\t"
457 "punpckhbw %%mm7, %%mm5 \n\t"
458 "paddsw %%mm6, %%mm2 \n\t"
459 "paddsw %%mm5, %%mm3 \n\t"
460 "packuswb %%mm1, %%mm0 \n\t"
461 "packuswb %%mm3, %%mm2 \n\t"
462 "movq %%mm0, %0 \n\t"
463 "movq %%mm2, %1 \n\t"
464 : "+m"(*pix), "+m"(*(pix + line_size))
467 pix += line_size * 2;
472 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
473 ptrdiff_t line_size, int h)
476 "lea (%3, %3), %%"REG_a" \n\t"
479 "movq (%1 ), %%mm0 \n\t"
480 "movq (%1, %3), %%mm1 \n\t"
481 "movq %%mm0, (%2) \n\t"
482 "movq %%mm1, (%2, %3) \n\t"
483 "add %%"REG_a", %1 \n\t"
484 "add %%"REG_a", %2 \n\t"
485 "movq (%1 ), %%mm0 \n\t"
486 "movq (%1, %3), %%mm1 \n\t"
487 "movq %%mm0, (%2) \n\t"
488 "movq %%mm1, (%2, %3) \n\t"
489 "add %%"REG_a", %1 \n\t"
490 "add %%"REG_a", %2 \n\t"
493 : "+g"(h), "+r"(pixels), "+r"(block)
494 : "r"((x86_reg)line_size)
499 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
500 ptrdiff_t line_size, int h)
503 "lea (%3, %3), %%"REG_a" \n\t"
506 "movq (%1 ), %%mm0 \n\t"
507 "movq 8(%1 ), %%mm4 \n\t"
508 "movq (%1, %3), %%mm1 \n\t"
509 "movq 8(%1, %3), %%mm5 \n\t"
510 "movq %%mm0, (%2) \n\t"
511 "movq %%mm4, 8(%2) \n\t"
512 "movq %%mm1, (%2, %3) \n\t"
513 "movq %%mm5, 8(%2, %3) \n\t"
514 "add %%"REG_a", %1 \n\t"
515 "add %%"REG_a", %2 \n\t"
516 "movq (%1 ), %%mm0 \n\t"
517 "movq 8(%1 ), %%mm4 \n\t"
518 "movq (%1, %3), %%mm1 \n\t"
519 "movq 8(%1, %3), %%mm5 \n\t"
520 "movq %%mm0, (%2) \n\t"
521 "movq %%mm4, 8(%2) \n\t"
522 "movq %%mm1, (%2, %3) \n\t"
523 "movq %%mm5, 8(%2, %3) \n\t"
524 "add %%"REG_a", %1 \n\t"
525 "add %%"REG_a", %2 \n\t"
528 : "+g"(h), "+r"(pixels), "+r"(block)
529 : "r"((x86_reg)line_size)
534 #define CLEAR_BLOCKS(name, n) \
535 static void name(int16_t *blocks) \
538 "pxor %%mm7, %%mm7 \n\t" \
539 "mov %1, %%"REG_a" \n\t" \
541 "movq %%mm7, (%0, %%"REG_a") \n\t" \
542 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
543 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
544 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
545 "add $32, %%"REG_a" \n\t" \
547 :: "r"(((uint8_t *)blocks) + 128 * n), \
552 CLEAR_BLOCKS(clear_blocks_mmx, 6)
553 CLEAR_BLOCKS(clear_block_mmx, 1)
555 static void clear_block_sse(int16_t *block)
558 "xorps %%xmm0, %%xmm0 \n"
559 "movaps %%xmm0, (%0) \n"
560 "movaps %%xmm0, 16(%0) \n"
561 "movaps %%xmm0, 32(%0) \n"
562 "movaps %%xmm0, 48(%0) \n"
563 "movaps %%xmm0, 64(%0) \n"
564 "movaps %%xmm0, 80(%0) \n"
565 "movaps %%xmm0, 96(%0) \n"
566 "movaps %%xmm0, 112(%0) \n"
572 static void clear_blocks_sse(int16_t *blocks)
575 "xorps %%xmm0, %%xmm0 \n"
576 "mov %1, %%"REG_a" \n"
578 "movaps %%xmm0, (%0, %%"REG_a") \n"
579 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
580 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
581 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
582 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
583 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
584 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
585 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
586 "add $128, %%"REG_a" \n"
588 :: "r"(((uint8_t *)blocks) + 128 * 6),
594 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
600 "movq (%1, %0), %%mm0 \n\t"
601 "movq (%2, %0), %%mm1 \n\t"
602 "paddb %%mm0, %%mm1 \n\t"
603 "movq %%mm1, (%2, %0) \n\t"
604 "movq 8(%1, %0), %%mm0 \n\t"
605 "movq 8(%2, %0), %%mm1 \n\t"
606 "paddb %%mm0, %%mm1 \n\t"
607 "movq %%mm1, 8(%2, %0) \n\t"
613 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
616 dst[i + 0] += src[i + 0];
620 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
621 const uint8_t *diff, int w,
622 int *left, int *left_top)
626 int l = *left & 0xff;
627 int tl = *left_top & 0xff;
632 "movzbl (%3, %4), %2 \n"
645 "add (%6, %4), %b0 \n"
646 "mov %b0, (%5, %4) \n"
649 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
650 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
656 #endif /* HAVE_INLINE_ASM */
658 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
659 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
/* Draw the edges of width 'w' of an image of size width x height;
 * this MMX version can only handle w == 8 or w == 16. */
664 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
665 int w, int h, int sides)
667 uint8_t *ptr, *last_line;
670 last_line = buf + (height - 1) * wrap;
676 "movd (%0), %%mm0 \n\t"
677 "punpcklbw %%mm0, %%mm0 \n\t"
678 "punpcklwd %%mm0, %%mm0 \n\t"
679 "punpckldq %%mm0, %%mm0 \n\t"
680 "movq %%mm0, -8(%0) \n\t"
681 "movq -8(%0, %2), %%mm1 \n\t"
682 "punpckhbw %%mm1, %%mm1 \n\t"
683 "punpckhwd %%mm1, %%mm1 \n\t"
684 "punpckhdq %%mm1, %%mm1 \n\t"
685 "movq %%mm1, (%0, %2) \n\t"
690 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
695 "movd (%0), %%mm0 \n\t"
696 "punpcklbw %%mm0, %%mm0 \n\t"
697 "punpcklwd %%mm0, %%mm0 \n\t"
698 "punpckldq %%mm0, %%mm0 \n\t"
699 "movq %%mm0, -8(%0) \n\t"
700 "movq %%mm0, -16(%0) \n\t"
701 "movq -8(%0, %2), %%mm1 \n\t"
702 "punpckhbw %%mm1, %%mm1 \n\t"
703 "punpckhwd %%mm1, %%mm1 \n\t"
704 "punpckhdq %%mm1, %%mm1 \n\t"
705 "movq %%mm1, (%0, %2) \n\t"
706 "movq %%mm1, 8(%0, %2) \n\t"
711 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
717 "movd (%0), %%mm0 \n\t"
718 "punpcklbw %%mm0, %%mm0 \n\t"
719 "punpcklwd %%mm0, %%mm0 \n\t"
720 "movd %%mm0, -4(%0) \n\t"
721 "movd -4(%0, %2), %%mm1 \n\t"
722 "punpcklbw %%mm1, %%mm1 \n\t"
723 "punpckhwd %%mm1, %%mm1 \n\t"
724 "punpckhdq %%mm1, %%mm1 \n\t"
725 "movd %%mm1, (%0, %2) \n\t"
730 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
734 /* top and bottom (and hopefully also the corners) */
735 if (sides & EDGE_TOP) {
736 for (i = 0; i < h; i += 4) {
737 ptr = buf - (i + 1) * wrap - w;
740 "movq (%1, %0), %%mm0 \n\t"
741 "movq %%mm0, (%0) \n\t"
742 "movq %%mm0, (%0, %2) \n\t"
743 "movq %%mm0, (%0, %2, 2) \n\t"
744 "movq %%mm0, (%0, %3) \n\t"
749 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
750 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
755 if (sides & EDGE_BOTTOM) {
756 for (i = 0; i < h; i += 4) {
757 ptr = last_line + (i + 1) * wrap - w;
760 "movq (%1, %0), %%mm0 \n\t"
761 "movq %%mm0, (%0) \n\t"
762 "movq %%mm0, (%0, %2) \n\t"
763 "movq %%mm0, (%0, %2, 2) \n\t"
764 "movq %%mm0, (%0, %3) \n\t"
769 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
770 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
771 "r"(ptr + width + 2 * w)
776 #endif /* HAVE_INLINE_ASM */
780 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX) \
781 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
784 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
787 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
791 uint8_t * const half = (uint8_t*)temp; \
792 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
794 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
795 stride, stride, 8); \
798 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
801 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
805 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
809 uint8_t * const half = (uint8_t*)temp; \
810 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
812 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
816 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
820 uint8_t * const half = (uint8_t*)temp; \
821 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
823 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
824 stride, stride, 8); \
827 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
830 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
834 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
838 uint8_t * const half = (uint8_t*)temp; \
839 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
841 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
845 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
848 uint64_t half[8 + 9]; \
849 uint8_t * const halfH = ((uint8_t*)half) + 64; \
850 uint8_t * const halfHV = ((uint8_t*)half); \
851 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
853 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
855 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
856 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
860 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
863 uint64_t half[8 + 9]; \
864 uint8_t * const halfH = ((uint8_t*)half) + 64; \
865 uint8_t * const halfHV = ((uint8_t*)half); \
866 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
868 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
870 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
871 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
875 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
878 uint64_t half[8 + 9]; \
879 uint8_t * const halfH = ((uint8_t*)half) + 64; \
880 uint8_t * const halfHV = ((uint8_t*)half); \
881 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
883 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
885 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
886 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
890 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
893 uint64_t half[8 + 9]; \
894 uint8_t * const halfH = ((uint8_t*)half) + 64; \
895 uint8_t * const halfHV = ((uint8_t*)half); \
896 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
898 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
900 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
901 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
905 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
908 uint64_t half[8 + 9]; \
909 uint8_t * const halfH = ((uint8_t*)half) + 64; \
910 uint8_t * const halfHV = ((uint8_t*)half); \
911 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
913 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
914 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
918 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
921 uint64_t half[8 + 9]; \
922 uint8_t * const halfH = ((uint8_t*)half) + 64; \
923 uint8_t * const halfHV = ((uint8_t*)half); \
924 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
926 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
927 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
931 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
934 uint64_t half[8 + 9]; \
935 uint8_t * const halfH = ((uint8_t*)half); \
936 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
938 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
940 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
944 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
947 uint64_t half[8 + 9]; \
948 uint8_t * const halfH = ((uint8_t*)half); \
949 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
951 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
953 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
957 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
961 uint8_t * const halfH = ((uint8_t*)half); \
962 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
964 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
968 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
971 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
974 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
978 uint8_t * const half = (uint8_t*)temp; \
979 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
981 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
985 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
988 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
989 stride, stride, 16);\
992 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
996 uint8_t * const half = (uint8_t*)temp; \
997 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
999 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
1000 stride, stride, 16); \
1003 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
1006 uint64_t temp[32]; \
1007 uint8_t * const half = (uint8_t*)temp; \
1008 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1010 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
1014 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
1017 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
1021 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
1024 uint64_t temp[32]; \
1025 uint8_t * const half = (uint8_t*)temp; \
1026 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
1028 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
1029 stride, stride, 16); \
1032 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
1035 uint64_t half[16 * 2 + 17 * 2]; \
1036 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1037 uint8_t * const halfHV = ((uint8_t*)half); \
1038 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1040 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1042 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1044 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1048 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
1051 uint64_t half[16 * 2 + 17 * 2]; \
1052 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1053 uint8_t * const halfHV = ((uint8_t*)half); \
1054 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1056 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1058 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1060 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1064 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
1067 uint64_t half[16 * 2 + 17 * 2]; \
1068 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1069 uint8_t * const halfHV = ((uint8_t*)half); \
1070 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1072 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1074 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1076 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1080 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
1083 uint64_t half[16 * 2 + 17 * 2]; \
1084 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1085 uint8_t * const halfHV = ((uint8_t*)half); \
1086 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1088 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1090 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1092 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1096 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
1099 uint64_t half[16 * 2 + 17 * 2]; \
1100 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1101 uint8_t * const halfHV = ((uint8_t*)half); \
1102 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1104 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1106 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
1110 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
1113 uint64_t half[16 * 2 + 17 * 2]; \
1114 uint8_t * const halfH = ((uint8_t*)half) + 256; \
1115 uint8_t * const halfHV = ((uint8_t*)half); \
1116 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1118 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
1120 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
1124 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
1127 uint64_t half[17 * 2]; \
1128 uint8_t * const halfH = ((uint8_t*)half); \
1129 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1131 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
1133 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1137 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
1140 uint64_t half[17 * 2]; \
1141 uint8_t * const halfH = ((uint8_t*)half); \
1142 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1144 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
1146 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
1150 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
1153 uint64_t half[17 * 2]; \
1154 uint8_t * const halfH = ((uint8_t*)half); \
1155 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
1157 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Store-only output macro for QPEL_OP: write register `a` to memory
 * operand `b` with a plain mov of the given operand size.  `temp` is
 * accepted only for signature compatibility with AVG_MMXEXT_OP. */
1161 #define PUT_OP(a, b, temp, size) \
1162 "mov"#size" "#a", "#b" \n\t"
/* Averaging output macro (MMXEXT): load the destination `b` into
 * `temp`, average it into `a` with pavgb (per-byte, rounds up), then
 * store the result back to `b`. */
1164 #define AVG_MMXEXT_OP(a, b, temp, size) \
1165 "mov"#size" "#b", "#temp" \n\t" \
1166 "pavgb "#temp", "#a" \n\t" \
1167 "mov"#size" "#a", "#b" \n\t"
/* Instantiate the MMXEXT quarter-pel MC function families: rounded
 * put, rounded avg, and no-round put (pw_15 rounder instead of pw_16). */
1169 QPEL_OP(put_, ff_pw_16, _, PUT_OP, mmxext)
1170 QPEL_OP(avg_, ff_pw_16, _, AVG_MMXEXT_OP, mmxext)
1171 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmxext)
1172 #endif /* HAVE_YASM */
/* RV40 8x8 quarter-pel mc33 put: delegates to the MMX (1/2,1/2)
 * half-pel kernel. */
1176 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1178 put_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 16x16 quarter-pel mc33 put: delegates to the MMX (1/2,1/2)
 * half-pel kernel. */
1180 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1182 put_pixels16_xy2_mmx(dst, src, stride, 16);
/* RV40 8x8 quarter-pel mc33 avg: delegates to the MMX (1/2,1/2)
 * half-pel averaging kernel. */
1184 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1186 avg_pixels8_xy2_mmx(dst, src, stride, 8);
/* RV40 16x16 quarter-pel mc33 avg: delegates to the MMX (1/2,1/2)
 * half-pel averaging kernel. */
1188 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, int stride)
1190 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* Signature of an edge-emulation routine (e.g. ff_emulated_edge_mc_8):
 * copies a block_w x block_h area starting at (src_x, src_y), replicating
 * picture borders when the area extends outside the w x h picture. */
1193 typedef void emulated_edge_mc_func(uint8_t *dst, const uint8_t *src,
1194 ptrdiff_t linesize, int block_w, int block_h,
1195 int src_x, int src_y, int w, int h);
/* Global-motion compensation with bilinear sub-pel interpolation (MMX
 * inline asm).  (ox, oy) is the start position and dxx/dxy/dyx/dyy the
 * per-pixel coordinate increments, in 1/(1 << (16 + shift)) units; r is
 * the rounder added before the final shift; emu_edge_fn replicates
 * picture borders when the referenced area leaves the picture. */
1197 static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
1198 int stride, int h, int ox, int oy,
1199 int dxx, int dxy, int dyx, int dyy,
1200 int shift, int r, int width, int height,
1201 emulated_edge_mc_func *emu_edge_fn)
/* Full-pel part of the start offset. */
1204 const int ix = ox >> (16 + shift);
1205 const int iy = oy >> (16 + shift);
/* Drop 4 fractional bits so the sub-pel state fits 16-bit MMX words. */
1206 const int oxs = ox >> 4;
1207 const int oys = oy >> 4;
1208 const int dxxs = dxx >> 4;
1209 const int dxys = dxy >> 4;
1210 const int dyxs = dyx >> 4;
1211 const int dyys = dyy >> 4;
/* Broadcast vectors consumed by the MMX loop below. */
1212 const uint16_t r4[4] = { r, r, r, r };
1213 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
1214 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
1215 const uint64_t shift2 = 2 * shift;
1216 #define MAX_STRIDE 4096U
1218 uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
/* Total sub-pel drift across the block, used below to detect a
 * non-constant full-pel offset. */
1221 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
1222 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
1223 const int dxh = dxy * (h - 1);
1224 const int dyw = dyx * (w - 1);
/* Unsigned compare also catches negative ix/iy. */
1225 int need_emu = (unsigned)ix >= width - w ||
1226 (unsigned)iy >= height - h;
1228 if ( // non-constant fullpel offset (3% of blocks)
1229 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
1230 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
1231 // uses more than 16 bits of subpel mv (only at huge resolution)
1232 || (dxx | dxy | dyx | dyy) & 15
1233 || (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
1234 // FIXME could still use mmx for some of the rows
/* Cases the MMX path cannot handle: defer to the C implementation. */
1235 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
1236 shift, r, width, height);
/* Advance to the full-pel start position. */
1240 src += ix + iy * stride;
/* Border-replicated copy of a (w+1) x (h+1) area; the bilinear filter
 * reads one extra row and column. */
1242 emu_edge_fn(edge_buf, src, stride, w + 1, h + 1, ix, iy, width, height);
/* Broadcast the 16-bit scale word from %0 across mm6; mm7 stays zero
 * for byte->word unpacking. */
1247 "movd %0, %%mm6 \n\t"
1248 "pxor %%mm7, %%mm7 \n\t"
1249 "punpcklwd %%mm6, %%mm6 \n\t"
1250 "punpcklwd %%mm6, %%mm6 \n\t"
/* Four output pixels per iteration. */
1254 for (x = 0; x < w; x += 4) {
/* Per-lane sub-pel coordinates, pre-decremented by one row step so the
 * first paddw in the row loop lands exactly on row 0. */
1255 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
1256 oxs - dxys + dxxs * (x + 1),
1257 oxs - dxys + dxxs * (x + 2),
1258 oxs - dxys + dxxs * (x + 3) };
1259 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
1260 oys - dyys + dyxs * (x + 1),
1261 oys - dyys + dyxs * (x + 2),
1262 oys - dyys + dyxs * (x + 3) };
1264 for (y = 0; y < h; y++) {
/* Step the four coordinates one row, store them back, and keep the
 * top 4 bits as the bilinear weights dx (mm4) and dy (mm5). */
1266 "movq %0, %%mm4 \n\t"
1267 "movq %1, %%mm5 \n\t"
1268 "paddw %2, %%mm4 \n\t"
1269 "paddw %3, %%mm5 \n\t"
1270 "movq %%mm4, %0 \n\t"
1271 "movq %%mm5, %1 \n\t"
1272 "psrlw $12, %%mm4 \n\t"
1273 "psrlw $12, %%mm5 \n\t"
1274 : "+m"(*dx4), "+m"(*dy4)
1275 : "m"(*dxy4), "m"(*dyy4)
/* Build the four bilinear weights from dx, dy and the scale s in mm6
 * (see the per-instruction comments). */
1279 "movq %%mm6, %%mm2 \n\t"
1280 "movq %%mm6, %%mm1 \n\t"
1281 "psubw %%mm4, %%mm2 \n\t"
1282 "psubw %%mm5, %%mm1 \n\t"
1283 "movq %%mm2, %%mm0 \n\t"
1284 "movq %%mm4, %%mm3 \n\t"
1285 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
1286 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
1287 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
1288 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
/* Weight the four neighbouring source pixels (bytes unpacked to words
 * against mm7 == 0). */
1290 "movd %4, %%mm5 \n\t"
1291 "movd %3, %%mm4 \n\t"
1292 "punpcklbw %%mm7, %%mm5 \n\t"
1293 "punpcklbw %%mm7, %%mm4 \n\t"
1294 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
1295 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
1297 "movd %2, %%mm5 \n\t"
1298 "movd %1, %%mm4 \n\t"
1299 "punpcklbw %%mm7, %%mm5 \n\t"
1300 "punpcklbw %%mm7, %%mm4 \n\t"
1301 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
1302 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
/* Add the rounder (%5 = r4), sum the four terms, scale down by
 * shift2 = 2*shift and pack to unsigned bytes. */
1303 "paddw %5, %%mm1 \n\t"
1304 "paddw %%mm3, %%mm2 \n\t"
1305 "paddw %%mm1, %%mm0 \n\t"
1306 "paddw %%mm2, %%mm0 \n\t"
1308 "psrlw %6, %%mm0 \n\t"
1309 "packuswb %%mm0, %%mm0 \n\t"
1310 "movd %%mm0, %0 \n\t"
1312 : "=m"(dst[x + y * stride])
1313 : "m"(src[0]), "m"(src[1]),
1314 "m"(src[stride]), "m"(src[stride + 1]),
1315 "m"(*r4), "m"(shift2)
/* Step to the start of the next 4-pixel column of the source. */
1319 src += 4 - h * stride;
/* MPEG-4 GMC entry point (MMX): forwards to the shared inline gmc()
 * with the C edge-emulation helper. */
1326 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1327 int stride, int h, int ox, int oy,
1328 int dxx, int dxy, int dyx, int dyy,
1329 int shift, int r, int width, int height)
1331 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1332 width, height, &ff_emulated_edge_mc_8);
/* SSE-flagged GMC entry point; body is identical to gmc_mmx — same
 * inline gmc() and the same edge-emulation helper. */
1335 static void gmc_sse(uint8_t *dst, uint8_t *src,
1336 int stride, int h, int ox, int oy,
1337 int dxx, int dxy, int dyx, int dyy,
1338 int shift, int r, int width, int height)
1340 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1341 width, height, &ff_emulated_edge_mc_8);
/* Second gmc_mmx definition.  NOTE(review): presumably sits in a
 * preprocessor branch mutually exclusive with the definition above
 * (the #if/#else guards are not visible here) — confirm against the
 * full file. */
1344 static void gmc_mmx(uint8_t *dst, uint8_t *src,
1345 int stride, int h, int ox, int oy,
1346 int dxx, int dxy, int dyx, int dyy,
1347 int shift, int r, int width, int height)
1349 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1350 width, height, &ff_emulated_edge_mc_8);
1355 #endif /* HAVE_INLINE_ASM */
1357 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1358 ptrdiff_t line_size, int h);
1359 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
1360 ptrdiff_t line_size, int h);
/* CAVS 8x8 mc00 (full-pel) put: plain MMX pixel copy. */
1365 void ff_put_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1367 put_pixels8_mmx(dst, src, stride, 8);
/* CAVS 8x8 mc00 (full-pel) avg: MMX copy averaged into dst. */
1370 void ff_avg_cavs_qpel8_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1372 avg_pixels8_mmx(dst, src, stride, 8);
/* CAVS 16x16 mc00 (full-pel) put: plain MMX pixel copy. */
1375 void ff_put_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1377 put_pixels16_mmx(dst, src, stride, 16);
/* CAVS 16x16 mc00 (full-pel) avg: MMX copy averaged into dst. */
1380 void ff_avg_cavs_qpel16_mc00_mmxext(uint8_t *dst, uint8_t *src, int stride)
1382 avg_pixels16_mmx(dst, src, stride, 16);
1384 #endif /* HAVE_INLINE_ASM */
/* VC-1 mspel mc00 (no sub-pel shift): straight 8x8 copy.  `rnd` appears
 * unused on this path — confirm against the full body. */
1388 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
1389 int stride, int rnd)
1391 ff_put_pixels8_mmx(dst, src, stride, 8);
/* VC-1 mspel mc00 avg (MMXEXT): 8x8 copy averaged into dst.  `rnd`
 * appears unused on this path — confirm against the full body. */
1394 void ff_avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
1395 int stride, int rnd)
1397 ff_avg_pixels8_mmxext(dst, src, stride, 8);
1399 #endif /* HAVE_YASM */
1401 #if CONFIG_DIRAC_DECODER
/* Generate the 8-, 16- and 32-pixel-wide Dirac pixel-op wrappers for
 * one instruction set: each wrapper chooses between the C fallback and
 * the corresponding SIMD pixels routine; the 32-wide version issues
 * two 16-wide calls. */
1402 #define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
1403 void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1406 ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
1408 OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
1410 void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1413 ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
1415 OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
1417 void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
1420 ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
1422 OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
1423 OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
/* Instantiate the MMX put/avg Dirac wrappers and the MMXEXT avg one. */
1428 DIRAC_PIXOP(put, put, mmx)
1429 DIRAC_PIXOP(avg, avg, mmx)
1433 DIRAC_PIXOP(avg, ff_avg, mmxext)
/* Dirac 16-wide put, SSE2: C fallback path plus the SSE2 pixels16 copy. */
1435 void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1438 ff_put_dirac_pixels16_c(dst, src, stride, h);
1440 ff_put_pixels16_sse2(dst, src[0], stride, h);
/* Dirac 16-wide avg, SSE2: C fallback path plus the SSE2 pixels16 avg. */
1442 void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1445 ff_avg_dirac_pixels16_c(dst, src, stride, h);
1447 ff_avg_pixels16_sse2(dst, src[0], stride, h);
/* Dirac 32-wide put, SSE2: two 16-wide SSE2 copies side by side. */
1449 void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1452 ff_put_dirac_pixels32_c(dst, src, stride, h);
1454 ff_put_pixels16_sse2(dst , src[0] , stride, h);
1455 ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
/* Dirac 32-wide avg, SSE2: two 16-wide SSE2 averages side by side. */
1458 void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
1461 ff_avg_dirac_pixels32_c(dst, src, stride, h);
1463 ff_avg_pixels16_sse2(dst , src[0] , stride, h);
1464 ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
1470 /* XXX: Those functions should be suppressed ASAP when all IDCTs are
/* libmpeg2-style IDCT put wrapper (MMX): transforms `block` and stores
 * the clamped result to dest. */
1473 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size,
1477 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2-style IDCT add wrapper (MMX): transforms `block` and adds
 * the clamped result onto dest. */
1480 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size,
1484 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2-style IDCT put wrapper (MMXEXT IDCT): run ff_mmxext_idct on
 * `block`, then store the clamped pixels to dest. */
1487 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size,
1490 ff_mmxext_idct(block);
1491 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2-style IDCT add wrapper (MMXEXT IDCT): run ff_mmxext_idct on
 * `block`, then add the clamped pixels onto dest. */
1494 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size,
1497 ff_mmxext_idct(block);
1498 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Clamp `len` floats from src into [min, max] and write them to dst
 * (SSE inline asm).  Processes 16 floats (64 bytes) per iteration with
 * aligned movaps loads/stores, so src and dst must be 16-byte aligned;
 * the byte offset starts at the final 16-float group.  NOTE(review):
 * len is presumably a multiple of 16 — confirm with callers. */
1503 static void vector_clipf_sse(float *dst, const float *src,
1504 float min, float max, int len)
1506 x86_reg i = (len - 16) * 4;
/* Broadcast min into xmm4 and max into xmm5. */
1508 "movss %3, %%xmm4 \n\t"
1509 "movss %4, %%xmm5 \n\t"
1510 "shufps $0, %%xmm4, %%xmm4 \n\t"
1511 "shufps $0, %%xmm5, %%xmm5 \n\t"
/* maxps-then-minps clamps each lane into [min, max]. */
1513 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1514 "movaps 16(%2, %0), %%xmm1 \n\t"
1515 "movaps 32(%2, %0), %%xmm2 \n\t"
1516 "movaps 48(%2, %0), %%xmm3 \n\t"
1517 "maxps %%xmm4, %%xmm0 \n\t"
1518 "maxps %%xmm4, %%xmm1 \n\t"
1519 "maxps %%xmm4, %%xmm2 \n\t"
1520 "maxps %%xmm4, %%xmm3 \n\t"
1521 "minps %%xmm5, %%xmm0 \n\t"
1522 "minps %%xmm5, %%xmm1 \n\t"
1523 "minps %%xmm5, %%xmm2 \n\t"
1524 "minps %%xmm5, %%xmm3 \n\t"
1525 "movaps %%xmm0, (%1, %0) \n\t"
1526 "movaps %%xmm1, 16(%1, %0) \n\t"
1527 "movaps %%xmm2, 32(%1, %0) \n\t"
1528 "movaps %%xmm3, 48(%1, %0) \n\t"
1532 : "r"(dst), "r"(src), "m"(min), "m"(max)
1539 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1541 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1543 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1545 int order, int mul);
1546 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1548 int order, int mul);
1549 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1551 int order, int mul);
1553 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1554 const int16_t *window, unsigned int len);
1555 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1556 const int16_t *window, unsigned int len);
1557 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1558 const int16_t *window, unsigned int len);
1559 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1560 const int16_t *window, unsigned int len);
1561 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1562 const int16_t *window, unsigned int len);
1563 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1564 const int16_t *window, unsigned int len);
1566 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1567 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1569 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1570 const uint8_t *diff, int w,
1571 int *left, int *left_top);
1572 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1574 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1577 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1578 int32_t min, int32_t max, unsigned int len);
1579 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1580 int32_t min, int32_t max, unsigned int len);
1581 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1582 int32_t min, int32_t max, unsigned int len);
1583 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1584 int32_t min, int32_t max, unsigned int len);
/* Fill all 16 quarter-pel entries (mc00..mc33) of
 * c->PFX_pixels_tab[IDX] with the CPU-specific implementations. */
1586 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1588 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1589 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1590 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1591 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1592 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1593 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1594 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1595 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1596 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1597 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1598 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1599 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1600 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1601 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1602 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1603 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fill the 4 half-pel entries (plain, x2, y2, xy2) of
 * c->PFX_pixels_tab IDX with the CPU-specific implementations.
 * IDX includes its own brackets so it can also be empty. */
1606 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
1608 c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
1609 c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
1610 c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
1611 c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/* Install the baseline MMX DSP routines.  The 8-bit-only pixel ops are
 * skipped when decoding high-bit-depth content. */
1614 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1617 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1620 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1621 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1622 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1624 if (!high_bit_depth) {
1625 c->clear_block = clear_block_mmx;
1626 c->clear_blocks = clear_blocks_mmx;
1627 c->draw_edges = draw_edges_mmx;
/* Half-pel put/avg tables: index [0] = 16-wide, [1] = 8-wide. */
1629 SET_HPEL_FUNCS(put, [0], 16, mmx);
1630 SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
1631 SET_HPEL_FUNCS(avg, [0], 16, mmx);
1632 SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
1633 SET_HPEL_FUNCS(put, [1], 8, mmx);
1634 SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
1635 SET_HPEL_FUNCS(avg, [1], 8, mmx);
1638 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
1642 c->add_bytes = add_bytes_mmx;
1643 #endif /* HAVE_INLINE_ASM */
/* H.263 loop filters are shared by the encoder and decoder. */
1646 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1647 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1648 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1651 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Install MMXEXT routines: quarter-pel MC tables, half-pel pixel ops
 * and the external-assembly scalar helpers. */
1656 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1659 const int bit_depth = avctx->bits_per_raw_sample;
1660 const int high_bit_depth = bit_depth > 8;
/* Quarter-pel MC tables: index 0 = 16x16, index 1 = 8x8. */
1663 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1664 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1666 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1667 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1668 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1669 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1671 if (!high_bit_depth) {
1672 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
1673 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
1675 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
1676 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
1677 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
1679 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
1680 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
1682 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
1683 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
1684 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* These variants are only installed when bit-exact output is not
 * requested. */
1687 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
1688 if (!high_bit_depth) {
1689 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
1690 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
1691 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
1692 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
1694 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
1695 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
1698 #endif /* HAVE_YASM */
1700 #if HAVE_MMXEXT_EXTERNAL
/* VP3/Theora get the bit-exact "_exact" no-rounding variants. */
1701 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1702 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1703 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
1704 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
1707 /* slower than cmov version on AMD */
1708 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1709 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1711 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1712 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* Rounded window only when bit-exactness is not required. */
1714 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1715 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1717 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1719 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Install AMD 3DNow! pixel ops (8-bit content only). */
1722 static av_cold void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
1725 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1728 if (!high_bit_depth) {
1729 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
1730 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
1732 c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
1733 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
1734 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
1736 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
1737 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
1739 c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
1740 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
1741 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* These variants are only installed when bit-exact output is not
 * requested. */
1743 if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
1744 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
1745 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
1746 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
1747 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
1749 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
1750 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
/* VP3/Theora get the bit-exact "_exact" no-rounding variants. */
1754 if (CONFIG_VP3_DECODER && (avctx->codec_id == AV_CODEC_ID_VP3 ||
1755 avctx->codec_id == AV_CODEC_ID_THEORA)) {
1756 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
1757 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
1759 #endif /* HAVE_YASM */
/* Install SSE routines: 16-byte-aligned block clearing and the float
 * vector clip. */
1762 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1765 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1768 if (!high_bit_depth) {
1769 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1770 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1771 c->clear_block = clear_block_sse;
1772 c->clear_blocks = clear_blocks_sse;
1776 c->vector_clipf = vector_clipf_sse;
1777 #endif /* HAVE_INLINE_ASM */
1780 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
1783 #endif /* HAVE_YASM */
/* Install SSE2 routines.  Some are skipped on CPUs flagged SSE2SLOW. */
1786 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1789 const int bit_depth = avctx->bits_per_raw_sample;
1790 const int high_bit_depth = bit_depth > 8;
1792 #if HAVE_SSE2_INLINE
/* Xvid-compatible IDCT, SSE2 version. */
1793 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1794 c->idct_put = ff_idct_xvid_sse2_put;
1795 c->idct_add = ff_idct_xvid_sse2_add;
1796 c->idct = ff_idct_xvid_sse2;
1797 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1799 #endif /* HAVE_SSE2_INLINE */
1801 #if HAVE_SSE2_EXTERNAL
1802 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1803 // these functions are slower than mmx on AMD, but faster on Intel
1804 if (!high_bit_depth) {
1805 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
1806 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
1807 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
1811 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1812 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
/* On Atom, use the "_int" variant of vector_clip_int32. */
1813 if (mm_flags & AV_CPU_FLAG_ATOM) {
1814 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1816 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
/* Rounded window only without bit-exactness and on full-speed SSE2. */
1818 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1819 c->apply_window_int16 = ff_apply_window_int16_sse2;
1820 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1821 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1823 c->bswap_buf = ff_bswap32_buf_sse2;
1824 #endif /* HAVE_SSE2_EXTERNAL */
/* Install SSSE3 routines; several are further gated on CPU quirks. */
1827 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1830 #if HAVE_SSSE3_EXTERNAL
1831 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1832 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1833 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
/* Atom gets a dedicated window variant. */
1835 if (mm_flags & AV_CPU_FLAG_ATOM)
1836 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1838 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1839 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1840 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1841 c->bswap_buf = ff_bswap32_buf_ssse3;
1842 #endif /* HAVE_SSSE3_EXTERNAL */
/* Install SSE4 routines (currently only the int32 vector clip). */
1845 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1848 #if HAVE_SSE4_EXTERNAL
1849 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1850 #endif /* HAVE_SSE4_EXTERNAL */
/* Public x86 DSP init: query the CPU flags once, select the IDCT
 * implementation from avctx->idct_algo, then layer in routines for
 * each instruction set the CPU supports, from MMX up to SSE4. */
1853 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1855 int mm_flags = av_get_cpu_flags();
1857 #if HAVE_7REGS && HAVE_INLINE_ASM
1858 if (mm_flags & AV_CPU_FLAG_CMOV)
1859 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1862 if (mm_flags & AV_CPU_FLAG_MMX) {
1864 const int idct_algo = avctx->idct_algo;
/* IDCT selection applies only to full-resolution, 8-bit decoding. */
1866 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
1867 if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
1868 c->idct_put = ff_simple_idct_put_mmx;
1869 c->idct_add = ff_simple_idct_add_mmx;
1870 c->idct = ff_simple_idct_mmx;
1871 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1873 } else if (idct_algo == FF_IDCT_LIBMPEG2MMX) {
1874 if (mm_flags & AV_CPU_FLAG_MMX2) {
1875 c->idct_put = ff_libmpeg2mmx2_idct_put;
1876 c->idct_add = ff_libmpeg2mmx2_idct_add;
1877 c->idct = ff_mmxext_idct;
1879 c->idct_put = ff_libmpeg2mmx_idct_put;
1880 c->idct_add = ff_libmpeg2mmx_idct_add;
1881 c->idct = ff_mmx_idct;
1883 c->idct_permutation_type = FF_LIBMPEG2_IDCT_PERM;
/* Xvid IDCT: prefer SSE2, then MMXEXT, then plain MMX. */
1885 } else if (idct_algo == FF_IDCT_XVIDMMX) {
1886 if (mm_flags & AV_CPU_FLAG_SSE2) {
1887 c->idct_put = ff_idct_xvid_sse2_put;
1888 c->idct_add = ff_idct_xvid_sse2_add;
1889 c->idct = ff_idct_xvid_sse2;
1890 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1891 } else if (mm_flags & AV_CPU_FLAG_MMXEXT) {
1892 c->idct_put = ff_idct_xvid_mmxext_put;
1893 c->idct_add = ff_idct_xvid_mmxext_add;
1894 c->idct = ff_idct_xvid_mmxext;
1896 c->idct_put = ff_idct_xvid_mmx_put;
1897 c->idct_add = ff_idct_xvid_mmx_add;
1898 c->idct = ff_idct_xvid_mmx;
1902 #endif /* HAVE_INLINE_ASM */
1904 dsputil_init_mmx(c, avctx, mm_flags);
/* Layer in each higher instruction set the CPU reports. */
1907 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1908 dsputil_init_mmxext(c, avctx, mm_flags);
1910 if (mm_flags & AV_CPU_FLAG_3DNOW)
1911 dsputil_init_3dnow(c, avctx, mm_flags);
1913 if (mm_flags & AV_CPU_FLAG_SSE)
1914 dsputil_init_sse(c, avctx, mm_flags);
1916 if (mm_flags & AV_CPU_FLAG_SSE2)
1917 dsputil_init_sse2(c, avctx, mm_flags);
1919 if (mm_flags & AV_CPU_FLAG_SSSE3)
1920 dsputil_init_ssse3(c, avctx, mm_flags);
1922 if (mm_flags & AV_CPU_FLAG_SSE4)
1923 dsputil_init_sse4(c, avctx, mm_flags);
/* Encoder-only DSP entries. */
1925 if (CONFIG_ENCODERS)
1926 ff_dsputilenc_init_mmx(c, avctx);