2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavcodec/hpeldsp.h"
28 #include "dsputil_mmx.h"
/*
 * Prototypes for half-pel pixel-copy/average primitives implemented in
 * external (yasm) assembly.  Common signature: write into 'block', read
 * from 'pixels', 'line_size' is the row stride in bytes, 'h' the number
 * of rows.  Suffixes: _x2 = horizontal half-pel average, _y2 = vertical,
 * _xy2 = both; _no_rnd_ = round-down variant; _exact = bit-exact variant
 * (selected for VP3 below).
 * NOTE(review): the #if that the #endif below closes is outside this
 * excerpt - presumably #if HAVE_YASM; confirm against the full file.
 */
34 void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
35 ptrdiff_t line_size, int h);
36 void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
37 ptrdiff_t line_size, int h);
38 void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
39 ptrdiff_t line_size, int h);
40 void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
41 ptrdiff_t line_size, int h);
42 void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
43 ptrdiff_t line_size, int h);
44 void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
45 ptrdiff_t line_size, int h);
46 void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
47 const uint8_t *pixels,
48 ptrdiff_t line_size, int h);
49 void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
50 const uint8_t *pixels,
51 ptrdiff_t line_size, int h);
52 void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
53 ptrdiff_t line_size, int h);
54 void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
55 ptrdiff_t line_size, int h);
56 void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
57 ptrdiff_t line_size, int h);
58 void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
59 ptrdiff_t line_size, int h);
60 void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
61 const uint8_t *pixels,
62 ptrdiff_t line_size, int h);
63 void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
64 const uint8_t *pixels,
65 ptrdiff_t line_size, int h);
66 void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
67 ptrdiff_t line_size, int h);
68 void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
69 ptrdiff_t line_size, int h);
70 void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
71 ptrdiff_t line_size, int h);
72 void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
73 ptrdiff_t line_size, int h);
74 void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
75 ptrdiff_t line_size, int h);
76 void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
77 ptrdiff_t line_size, int h);
78 void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
79 ptrdiff_t line_size, int h);
80 #endif /* HAVE_YASM */
/* Emit a ".p2align 3" directive (align to 8 bytes) at the current point. */
85 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear an MMX register to all-zero bytes. */
86 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
/*
 * Load 0xfefefefefefefefe into regd without a memory constant:
 * pcmpeqd reg,reg sets all bits (each byte 0xff), then paddb doubles
 * each byte (0xff + 0xff -> 0xfe with the carry dropped).
 * NOTE(review): the "__asm__ volatile (" opener line of this macro is
 * elided from this excerpt - confirm against the full file.
 */
88 #define MOVQ_BFE(regd) \
90 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
91 "paddb %%"#regd", %%"#regd" \n\t" ::)
/* Memory-constant versions: load the 8x 0x01 / 4x 0x0002 patterns from
 * the ff_bone / ff_wtwo tables. */
94 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_bone))
95 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
97 // for shared library it's better to use this way for accessing constants
/*
 * Alternative MOVQ_BONE: synthesize 0x0101...01 in-register
 * (all-ones -> psrlw 15 gives words of 0x0001 -> packuswb gives bytes 0x01).
 * NOTE(review): the #if/#else guard selecting between this and the
 * memory-constant definition above, and the "__asm__ volatile (" openers,
 * are elided from this excerpt.
 */
99 #define MOVQ_BONE(regd) \
101 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
102 "psrlw $15, %%"#regd" \n\t" \
103 "packuswb %%"#regd", %%"#regd" \n\t" ::)
/* Alternative MOVQ_WTWO: words of 0x0002 (all-ones -> 0x0001 -> shifted left). */
105 #define MOVQ_WTWO(regd) \
107 "pcmpeqd %%"#regd", %%"#regd" \n\t" \
108 "psrlw $15, %%"#regd" \n\t" \
109 "psllw $1, %%"#regd" \n\t"::)
/*
 * Byte-wise averaging without the MMXEXT pavgb instruction, via the
 * identities:
 *   floor((a+b)/2) = (a & b) + (((a ^ b) & 0xfe) >> 1)   (round down)
 *   ceil ((a+b)/2) = (a | b) - (((a ^ b) & 0xfe) >> 1)   (round up)
 * The 0xfe mask keeps the per-byte shift from pulling in bits of the
 * neighbouring byte when psrlq shifts the whole 64-bit register.
 */
113 // using regr as temporary and for the output result
114 // first argument is unmodifed and second is trashed
115 // regfe is supposed to contain 0xfefefefefefefefe
/* Round-down average of rega and regb into regr (regb is clobbered). */
116 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
117 "movq "#rega", "#regr" \n\t" \
118 "pand "#regb", "#regr" \n\t" \
119 "pxor "#rega", "#regb" \n\t" \
120 "pand "#regfe", "#regb" \n\t" \
121 "psrlq $1, "#regb" \n\t" \
122 "paddb "#regb", "#regr" \n\t"
/* Round-up average of rega and regb into regr (regb is clobbered). */
124 #define PAVGB_MMX(rega, regb, regr, regfe) \
125 "movq "#rega", "#regr" \n\t" \
126 "por "#regb", "#regr" \n\t" \
127 "pxor "#rega", "#regb" \n\t" \
128 "pand "#regfe", "#regb" \n\t" \
129 "psrlq $1, "#regb" \n\t" \
130 "psubb "#regb", "#regr" \n\t"
132 // mm6 is supposed to contain 0xfefefefefefefefe
/* Paired round-down averages: (rega,regb)->regr and (regc,regd)->regp,
 * interleaved so the two dependency chains overlap. regb/regd clobbered. */
133 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
134 "movq "#rega", "#regr" \n\t" \
135 "movq "#regc", "#regp" \n\t" \
136 "pand "#regb", "#regr" \n\t" \
137 "pand "#regd", "#regp" \n\t" \
138 "pxor "#rega", "#regb" \n\t" \
139 "pxor "#regc", "#regd" \n\t" \
140 "pand %%mm6, "#regb" \n\t" \
141 "pand %%mm6, "#regd" \n\t" \
142 "psrlq $1, "#regb" \n\t" \
143 "psrlq $1, "#regd" \n\t" \
144 "paddb "#regb", "#regr" \n\t" \
145 "paddb "#regd", "#regp" \n\t"
/* Paired round-up averages, same register conventions as above. */
147 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
148 "movq "#rega", "#regr" \n\t" \
149 "movq "#regc", "#regp" \n\t" \
150 "por "#regb", "#regr" \n\t" \
151 "por "#regd", "#regp" \n\t" \
152 "pxor "#rega", "#regb" \n\t" \
153 "pxor "#regc", "#regd" \n\t" \
154 "pand %%mm6, "#regb" \n\t" \
155 "pand %%mm6, "#regd" \n\t" \
156 "psrlq $1, "#regd" \n\t" \
157 "psrlq $1, "#regb" \n\t" \
158 "psubb "#regb", "#regr" \n\t" \
159 "psubb "#regd", "#regp" \n\t"
161 /***********************************/
162 /* MMX no rounding */
/*
 * First instantiation of the shared template: DEF() names the generated
 * functions *_no_rnd_*_mmx and the no-rounding PAVGB variants are
 * plugged in.  SET_RND/PAVGBP/PAVGB/OP_AVG are the template's
 * customization points.
 */
164 #define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
165 #define SET_RND MOVQ_WONE
166 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
167 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
168 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
170 #include "hpeldsp_rnd_template.c"
177 /***********************************/
/*
 * Second instantiation: the rounding variants (plain *_mmx names).
 * NOTE(review): the #undef run for DEF/SET_RND/PAVGBP/PAVGB between the
 * two instantiations (and after this one) is elided from this excerpt -
 * the template must see fresh definitions each time; confirm against
 * the full file.
 */
180 #define DEF(x, y) x ## _ ## y ## _mmx
181 #define SET_RND MOVQ_WTWO
182 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
183 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
185 #include "hpeldsp_rnd_template.c"
193 #endif /* HAVE_INLINE_ASM */
/* Plain 8-pixel copy is identical for MMX and MMXEXT; reuse the
 * MMXEXT symbol instead of assembling a second copy. */
197 #define ff_put_pixels8_mmx ff_put_pixels8_mmxext
199 /***********************************/
/*
 * Instantiate the shared averaging template twice, once per ISA:
 * DEF() appends the CPU suffix to each generated function name.
 * NOTE(review): the "3DNow! specific" banner comment, the #undef DEF
 * between instantiations, and the guarding #if are elided from this
 * excerpt; the #endif below presumably closes #if HAVE_YASM.
 */
202 #define DEF(x) x ## _3dnow
204 #include "hpeldsp_avg_template.c"
208 /***********************************/
209 /* MMXEXT specific */
211 #define DEF(x) x ## _mmxext
213 #include "hpeldsp_avg_template.c"
217 #endif /* HAVE_YASM */
/*
 * A plain copy involves no averaging, so the rounding and no-rounding
 * variants are the same function, and the MMXEXT versions can fall back
 * to the MMX implementations.  Express that with aliases rather than
 * duplicate code.
 */
221 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
222 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
223 #define put_pixels16_mmxext put_pixels16_mmx
224 #define put_pixels8_mmxext put_pixels8_mmx
225 #define put_pixels4_mmxext put_pixels4_mmx
226 #define put_no_rnd_pixels16_mmxext put_no_rnd_pixels16_mmx
227 #define put_no_rnd_pixels8_mmxext put_no_rnd_pixels8_mmx
/*
 * Copy an 8-pixel-wide block, unrolled 4 rows per loop iteration:
 * REG_a holds 2*line_size so two rows are stored per address update.
 * %1 = pixels (source), %2 = block (destination), %3 = line_size.
 * NOTE(review): the function braces, the "__asm__ volatile (" opener,
 * the loop label / "subl $4, %0" / "jnz" control lines and the clobber
 * list are elided from this excerpt - h is presumably assumed to be a
 * multiple of 4; confirm against the full file.
 */
229 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
230 ptrdiff_t line_size, int h)
233 "lea (%3, %3), %%"REG_a" \n\t"
/* rows 0 and 1 */
236 "movq (%1 ), %%mm0 \n\t"
237 "movq (%1, %3), %%mm1 \n\t"
238 "movq %%mm0, (%2) \n\t"
239 "movq %%mm1, (%2, %3) \n\t"
240 "add %%"REG_a", %1 \n\t"
241 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 */
242 "movq (%1 ), %%mm0 \n\t"
243 "movq (%1, %3), %%mm1 \n\t"
244 "movq %%mm0, (%2) \n\t"
245 "movq %%mm1, (%2, %3) \n\t"
246 "add %%"REG_a", %1 \n\t"
247 "add %%"REG_a", %2 \n\t"
250 : "+g"(h), "+r"(pixels), "+r"(block)
251 : "r"((x86_reg)line_size)
/*
 * Copy a 16-pixel-wide block: same 4-rows-per-iteration structure as
 * put_pixels8_mmx, but each row is two 8-byte movq transfers (offsets
 * 0 and 8).  %1 = pixels, %2 = block, %3 = line_size, REG_a = 2*stride.
 * NOTE(review): as above, the asm opener, loop-control lines and
 * clobber list are elided from this excerpt.
 */
256 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
257 ptrdiff_t line_size, int h)
260 "lea (%3, %3), %%"REG_a" \n\t"
/* rows 0 and 1, 16 bytes each */
263 "movq (%1 ), %%mm0 \n\t"
264 "movq 8(%1 ), %%mm4 \n\t"
265 "movq (%1, %3), %%mm1 \n\t"
266 "movq 8(%1, %3), %%mm5 \n\t"
267 "movq %%mm0, (%2) \n\t"
268 "movq %%mm4, 8(%2) \n\t"
269 "movq %%mm1, (%2, %3) \n\t"
270 "movq %%mm5, 8(%2, %3) \n\t"
271 "add %%"REG_a", %1 \n\t"
272 "add %%"REG_a", %2 \n\t"
/* rows 2 and 3 */
273 "movq (%1 ), %%mm0 \n\t"
274 "movq 8(%1 ), %%mm4 \n\t"
275 "movq (%1, %3), %%mm1 \n\t"
276 "movq 8(%1, %3), %%mm5 \n\t"
277 "movq %%mm0, (%2) \n\t"
278 "movq %%mm4, 8(%2) \n\t"
279 "movq %%mm1, (%2, %3) \n\t"
280 "movq %%mm5, 8(%2, %3) \n\t"
281 "add %%"REG_a", %1 \n\t"
282 "add %%"REG_a", %2 \n\t"
285 : "+g"(h), "+r"(pixels), "+r"(block)
286 : "r"((x86_reg)line_size)
290 #endif /* HAVE_INLINE_ASM */
/* External (yasm) SSE2 16-pixel copy/average; same signature convention
 * as the prototypes above. */
292 void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
293 ptrdiff_t line_size, int h);
294 void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
295 ptrdiff_t line_size, int h);
/*
 * Fill one row of a hpel function table: slots [0..3] are the full-pel,
 * x2 (horizontal half-pel), y2 (vertical) and xy2 (diagonal) variants
 * for the given prefix (put/avg/...), block SIZE and CPU suffix.
 * NOTE(review): the do{ ... }while(0) wrapper lines usually surrounding
 * such a macro are elided from this excerpt.
 */
297 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
299 c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
300 c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
301 c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
302 c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
/*
 * Install the inline-asm MMX implementations.  Table index [0] is the
 * 16-pixel-wide set, [1] the 8-pixel set; avg_no_rnd has no index
 * dimension.  Guarded by the HAVE_INLINE_ASM #if whose opener is
 * elided from this excerpt.
 */
305 static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int mm_flags)
308 SET_HPEL_FUNCS(put, [0], 16, mmx);
309 SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
310 SET_HPEL_FUNCS(avg, [0], 16, mmx);
311 SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
312 SET_HPEL_FUNCS(put, [1], 8, mmx);
313 SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
314 SET_HPEL_FUNCS(avg, [1], 8, mmx);
315 #endif /* HAVE_INLINE_ASM */
/*
 * Override the MMX pointers with the external MMXEXT assembly where
 * available.  [0] = 16-pixel set, [1] = 8-pixel set; slot [1] = x2,
 * [2] = y2, [3] = xy2.  The no-rounding approximations are only
 * installed when the caller did not request bit-exact output, except
 * for the VP3 _exact variants which ARE bit-exact.
 * NOTE(review): the opening brace, the guarding #if (presumably
 * HAVE_YASM) and some closing braces are elided from this excerpt.
 */
318 static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int mm_flags)
321 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
322 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_mmxext;
324 c->avg_pixels_tab[0][0] = ff_avg_pixels16_mmxext;
325 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_mmxext;
326 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_mmxext;
328 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
329 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
331 c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
332 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
333 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
/* no-rounding variants are approximations: skip them in bit-exact mode */
335 if (!(flags & CODEC_FLAG_BITEXACT)) {
336 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_mmxext;
337 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_mmxext;
338 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
339 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
341 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_mmxext;
342 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
344 #endif /* HAVE_YASM */
346 #if HAVE_MMXEXT_EXTERNAL
/* VP3 needs bit-exact half-pel: use the _exact variants instead */
347 if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
348 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
349 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
351 #endif /* HAVE_MMXEXT_EXTERNAL */
/*
 * Install the 3DNow! implementations; mirrors hpeldsp_init_mmxext
 * slot-for-slot with the _3dnow symbols.  Same bit-exact gating for the
 * no-rounding variants and the same VP3 _exact override.
 * NOTE(review): the opening brace, the guarding #if (presumably
 * HAVE_YASM) and some closing braces are elided from this excerpt.
 */
354 static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int mm_flags)
357 c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
358 c->put_pixels_tab[0][2] = ff_put_pixels16_y2_3dnow;
360 c->avg_pixels_tab[0][0] = ff_avg_pixels16_3dnow;
361 c->avg_pixels_tab[0][1] = ff_avg_pixels16_x2_3dnow;
362 c->avg_pixels_tab[0][2] = ff_avg_pixels16_y2_3dnow;
364 c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
365 c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
367 c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
368 c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
369 c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
/* no-rounding variants are approximations: skip them in bit-exact mode */
371 if (!(flags & CODEC_FLAG_BITEXACT)){
372 c->put_no_rnd_pixels_tab[0][1] = ff_put_no_rnd_pixels16_x2_3dnow;
373 c->put_no_rnd_pixels_tab[0][2] = ff_put_no_rnd_pixels16_y2_3dnow;
374 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
375 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
377 c->avg_pixels_tab[0][3] = ff_avg_pixels16_xy2_3dnow;
378 c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
/* VP3 needs bit-exact half-pel: use the _exact variants instead */
381 if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
382 c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
383 c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
385 #endif /* HAVE_YASM */
/*
 * Install the SSE2 16-pixel copy/average, but only on CPUs where SSE2
 * is not flagged as slow (per the comment below, faster on Intel,
 * slower than MMX on the AMD parts that set SSE2SLOW).
 * NOTE(review): the opening brace and closing braces are elided from
 * this excerpt.
 */
388 static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int mm_flags)
390 #if HAVE_SSE2_EXTERNAL
391 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
392 // these functions are slower than mmx on AMD, but faster on Intel
393 c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
394 c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
395 c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
397 #endif /* HAVE_SSE2_EXTERNAL */
/*
 * Public entry point: probe the CPU feature flags at runtime and apply
 * each applicable init in ascending capability order, so a later, more
 * capable init overrides the pointers a weaker one installed.
 * NOTE(review): the opening/closing braces of this function are elided
 * from this excerpt.
 */
400 void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
402 int mm_flags = av_get_cpu_flags();
404 if (HAVE_MMX && mm_flags & AV_CPU_FLAG_MMX)
405 hpeldsp_init_mmx(c, flags, mm_flags);
407 if (mm_flags & AV_CPU_FLAG_MMXEXT)
408 hpeldsp_init_mmxext(c, flags, mm_flags);
410 if (mm_flags & AV_CPU_FLAG_3DNOW)
411 hpeldsp_init_3dnow(c, flags, mm_flags);
413 if (mm_flags & AV_CPU_FLAG_SSE2)
414 hpeldsp_init_sse2(c, flags, mm_flags);