2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/dsputil.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "dsputil_x86.h"
/* Prototypes for the assembler (yasm) implementations; the bodies live in
 * the corresponding .asm files and are linked in when HAVE_YASM is set. */
33 int ff_sum_abs_dctelem_mmx(int16_t *block);
34 int ff_sum_abs_dctelem_mmxext(int16_t *block);
35 int ff_sum_abs_dctelem_sse2(int16_t *block);
36 int ff_sum_abs_dctelem_ssse3(int16_t *block);
37 int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
38 int line_size, int h);
39 int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
40 int line_size, int h);
41 int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
42 int line_size, int h);
/* High-frequency "noise" measures used by the nsse comparators below. */
43 int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
44 int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);
/* Declares the 8-wide and 16-wide hadamard8_diff prototypes for one cpu
 * suffix (mmx, mmxext, sse2, ssse3); the instantiating hadamard_func(...)
 * invocations are on lines not visible in this chunk. */
46 #define hadamard_func(cpu) \
47 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
48 uint8_t *src2, int stride, int h); \
49 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
50 uint8_t *src2, int stride, int h);
/* Noise-preserving SSE for a 16-pixel-wide block: plain SSE plus a penalty
 * proportional to the difference in high-frequency noise between the two
 * pictures (hf_noise of pix1 minus hf_noise of pix2, 16-wide + 8-wide halves).
 * NOTE(review): several interior lines are missing from this chunk (the
 * condition selecting c->dsp.sse[0] vs. the direct ff_sse16_mmx call, and the
 * branch choosing nsse_weight vs. the fallback weight 8) — confirm against
 * the full source. */
58 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
64     score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
66     score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
67     score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
68            - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);
/* Noise mismatch weighted by the user-set nsse_weight ... */
71         return score1 + FFABS(score2) * c->avctx->nsse_weight;
/* ... or by the default weight 8 on the other branch. */
73         return score1 + FFABS(score2) * 8;
/* Noise-preserving SSE for an 8-pixel-wide block: SSE plus the weighted
 * absolute difference of the two blocks' high-frequency noise measures.
 * NOTE(review): the branch between the nsse_weight and the fallback-8
 * return is on lines not visible in this chunk. */
76 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
79     int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
80     int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
81                  ff_hf_noise8_mmx(pix2, line_size, h);
84         return score1 + FFABS(score2) * c->avctx->nsse_weight;
86         return score1 + FFABS(score2) * 8;
89 #endif /* HAVE_YASM */
/* Vertical SAD, intra, 16 pixels wide (plain MMX): sums |pix[y][x] -
 * pix[y+1][x]| over the block, i.e. the absolute difference between each
 * row and the row below it.  Requires 8-byte-aligned pix and line_size
 * (asserted below).  NOTE(review): the __asm__ opener, the loop label /
 * counter handling and the final result extraction are on lines missing
 * from this chunk. */
93 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
98     av_assert2((((int) pix) & 7) == 0);
99     av_assert2((line_size & 7) == 0);
/* SUM(in0,in1,out0,out1): in0/in1 hold the previous row (two 8-byte
 * halves); loads the current row into out0/out1, computes the per-byte
 * absolute difference via the two psubusb + por trick (MMX has no pabsb),
 * widens bytes to words against zeroed %%mm7, and accumulates into the
 * running word sums in %%mm6. */
101 #define SUM(in0, in1, out0, out1) \
102     "movq (%0), %%mm2\n" \
103     "movq 8(%0), %%mm3\n" \
105     "movq %%mm2, " #out0 "\n" \
106     "movq %%mm3, " #out1 "\n" \
107     "psubusb " #in0 ", %%mm2\n" \
108     "psubusb " #in1 ", %%mm3\n" \
109     "psubusb " #out0 ", " #in0 "\n" \
110     "psubusb " #out1 ", " #in1 "\n" \
111     "por %%mm2, " #in0 "\n" \
112     "por %%mm3, " #in1 "\n" \
113     "movq " #in0 ", %%mm2\n" \
114     "movq " #in1 ", %%mm3\n" \
115     "punpcklbw %%mm7, " #in0 "\n" \
116     "punpcklbw %%mm7, " #in1 "\n" \
117     "punpckhbw %%mm7, %%mm2\n" \
118     "punpckhbw %%mm7, %%mm3\n" \
119     "paddw " #in1 ", " #in0 "\n" \
120     "paddw %%mm3, %%mm2\n" \
121     "paddw %%mm2, " #in0 "\n" \
122     "paddw " #in0 ", %%mm6\n"
/* %%mm6 = accumulator, %%mm7 = zero for the byte->word unpacks. */
127     "pxor %%mm6, %%mm6\n"
128     "pxor %%mm7, %%mm7\n"
130     "movq 8(%0), %%mm1\n"
/* Two SUM invocations per loop iteration, ping-ponging which register
 * pair holds the "previous row". */
135     SUM(%%mm4, %%mm5, %%mm0, %%mm1)
137     SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* Horizontal reduction of the four word sums in %%mm6 (the interleaved
 * psrlq shift lines are not visible here). */
142     "movq %%mm6, %%mm0\n"
144     "paddw %%mm6, %%mm0\n"
145     "movq %%mm0, %%mm6\n"
147     "paddw %%mm6, %%mm0\n"
149     : "+r" (pix), "=r" (tmp)
150     : "r" ((x86_reg) line_size), "m" (h)
/* Vertical SAD, intra, 16 pixels wide (MMXEXT): same result as the plain
 * MMX version above but uses psadbw to get each 8-byte |row - next row|
 * sum in a single instruction.  NOTE(review): the __asm__ opener, loop
 * label and result extraction are on lines missing from this chunk. */
157 static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
158                                int line_size, int h)
162     av_assert2((((int) pix) & 7) == 0);
163     av_assert2((line_size & 7) == 0);
/* SUM: load the current row into out0/out1, psadbw against the previous
 * row held in in0/in1, and add both halves into accumulator %%mm6. */
165 #define SUM(in0, in1, out0, out1) \
166     "movq (%0), " #out0 "\n" \
167     "movq 8(%0), " #out1 "\n" \
169     "psadbw " #out0 ", " #in0 "\n" \
170     "psadbw " #out1 ", " #in1 "\n" \
171     "paddw " #in1 ", " #in0 "\n" \
172     "paddw " #in0 ", %%mm6\n"
/* %%mm6 = accumulator; %%mm7 zeroed (kept for symmetry with the MMX
 * variant — psadbw itself needs no zero register). */
176     "pxor %%mm6, %%mm6\n"
177     "pxor %%mm7, %%mm7\n"
179     "movq 8(%0), %%mm1\n"
184     SUM(%%mm4, %%mm5, %%mm0, %%mm1)
186     SUM(%%mm0, %%mm1, %%mm4, %%mm5)
192     : "+r" (pix), "=r" (tmp)
193     : "r" ((x86_reg) line_size), "m" (h)
/* Vertical SAD of the residual, 16 pixels wide (plain MMX): like
 * vsad_intra16_mmx but operates on the per-row difference pix1 - pix2,
 * i.e. sums |d[y][x] - d[y+1][x]| where d = pix1 - pix2.  Both inputs and
 * line_size must be 8-byte aligned (asserted).  NOTE(review): the __asm__
 * opener, loop label and result extraction are on lines missing from this
 * chunk. */
200 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
201                       int line_size, int h)
205     av_assert2((((int) pix1) & 7) == 0);
206     av_assert2((((int) pix2) & 7) == 0);
207     av_assert2((line_size & 7) == 0);
/* SUM: build the signed row difference pix1 - pix2, bias it by XOR with
 * the constant in %%mm7 to make it comparable as unsigned bytes, then do
 * the same psubusb/por absolute-difference and byte->word accumulation as
 * the intra version.  NOTE(review): %%mm7 here is a bias constant, not
 * zero — the punpck*bw lines therefore presumably rely on lines missing
 * from this chunk; confirm against the full source. */
209 #define SUM(in0, in1, out0, out1) \
210     "movq (%0), %%mm2\n" \
211     "movq (%1), " #out0 "\n" \
212     "movq 8(%0), %%mm3\n" \
213     "movq 8(%1), " #out1 "\n" \
216     "psubb " #out0 ", %%mm2\n" \
217     "psubb " #out1 ", %%mm3\n" \
218     "pxor %%mm7, %%mm2\n" \
219     "pxor %%mm7, %%mm3\n" \
220     "movq %%mm2, " #out0 "\n" \
221     "movq %%mm3, " #out1 "\n" \
222     "psubusb " #in0 ", %%mm2\n" \
223     "psubusb " #in1 ", %%mm3\n" \
224     "psubusb " #out0 ", " #in0 "\n" \
225     "psubusb " #out1 ", " #in1 "\n" \
226     "por %%mm2, " #in0 "\n" \
227     "por %%mm3, " #in1 "\n" \
228     "movq " #in0 ", %%mm2\n" \
229     "movq " #in1 ", %%mm3\n" \
230     "punpcklbw %%mm7, " #in0 "\n" \
231     "punpcklbw %%mm7, " #in1 "\n" \
232     "punpckhbw %%mm7, %%mm2\n" \
233     "punpckhbw %%mm7, %%mm3\n" \
234     "paddw " #in1 ", " #in0 "\n" \
235     "paddw %%mm3, %%mm2\n" \
236     "paddw %%mm2, " #in0 "\n" \
237     "paddw " #in0 ", %%mm6\n"
/* %%mm6 = accumulator; %%mm7 is built into the sign-bias constant via
 * pcmpeqw/packsswb (a shift on a line missing from this chunk presumably
 * turns the all-ones into 0x80 bytes — confirm against the full source). */
242     "pxor %%mm6, %%mm6\n"
243     "pcmpeqw %%mm7, %%mm7\n"
245     "packsswb %%mm7, %%mm7\n"
/* First row pair: form the biased difference by hand before entering the
 * SUM ping-pong. */
248     "movq 8(%0), %%mm1\n"
249     "movq 8(%1), %%mm3\n"
252     "psubb %%mm2, %%mm0\n"
253     "psubb %%mm3, %%mm1\n"
254     "pxor %%mm7, %%mm0\n"
255     "pxor %%mm7, %%mm1\n"
259     SUM(%%mm4, %%mm5, %%mm0, %%mm1)
261     SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* Horizontal reduction of %%mm6 (interleaved shift lines not visible). */
266     "movq %%mm6, %%mm0\n"
268     "paddw %%mm6, %%mm0\n"
269     "movq %%mm0, %%mm6\n"
271     "paddw %%mm6, %%mm0\n"
273     : "+r" (pix1), "+r" (pix2), "=r" (tmp)
274     : "r" ((x86_reg) line_size), "m" (h)
/* Vertical SAD of the residual, 16 pixels wide (MMXEXT): same result as
 * vsad16_mmx but uses psadbw on the bias-corrected row differences.
 * NOTE(review): the __asm__ opener, loop label and result extraction are
 * on lines missing from this chunk. */
281 static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
282                          int line_size, int h)
286     av_assert2((((int) pix1) & 7) == 0);
287     av_assert2((((int) pix2) & 7) == 0);
288     av_assert2((line_size & 7) == 0);
/* SUM: build the biased difference pix1 - pix2 for the current row in
 * out0/out1, psadbw against the previous biased row in in0/in1, and add
 * both 8-byte SADs into accumulator %%mm6. */
290 #define SUM(in0, in1, out0, out1) \
291     "movq (%0), " #out0 "\n" \
292     "movq (%1), %%mm2\n" \
293     "movq 8(%0), " #out1 "\n" \
294     "movq 8(%1), %%mm3\n" \
297     "psubb %%mm2, " #out0 "\n" \
298     "psubb %%mm3, " #out1 "\n" \
299     "pxor %%mm7, " #out0 "\n" \
300     "pxor %%mm7, " #out1 "\n" \
301     "psadbw " #out0 ", " #in0 "\n" \
302     "psadbw " #out1 ", " #in1 "\n" \
303     "paddw " #in1 ", " #in0 "\n" \
304     "paddw " #in0 ", %%mm6\n "
/* %%mm6 = accumulator; %%mm7 built into the sign-bias constant (shift
 * line between pcmpeqw and packsswb is missing from this chunk). */
308     "pxor %%mm6, %%mm6\n"
309     "pcmpeqw %%mm7, %%mm7\n"
311     "packsswb %%mm7, %%mm7\n"
/* First row pair: biased difference formed by hand before the loop. */
314     "movq 8(%0), %%mm1\n"
315     "movq 8(%1), %%mm3\n"
318     "psubb %%mm2, %%mm0\n"
319     "psubb %%mm3, %%mm1\n"
320     "pxor %%mm7, %%mm0\n"
321     "pxor %%mm7, %%mm1\n"
325     SUM(%%mm4, %%mm5, %%mm0, %%mm1)
327     SUM(%%mm0, %%mm1, %%mm4, %%mm5)
333     : "+r" (pix1), "+r" (pix2), "=r" (tmp)
334     : "r" ((x86_reg) line_size), "m" (h)
342 #endif /* HAVE_INLINE_ASM */
/* Install the x86-optimized DSPContext function pointers, from weakest to
 * strongest instruction set so later checks override earlier ones.
 * Inline-asm routines are gated on INLINE_* checks, yasm routines on
 * EXTERNAL_* checks.  NOTE(review): several brace/#if lines of this
 * function are missing from this chunk. */
344 av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
346     int cpu_flags = av_get_cpu_flags();
349     if (INLINE_MMX(cpu_flags)) {
350         c->vsad[4] = vsad_intra16_mmx;
/* The inter vsad variants are not bit-exact, so only use them when the
 * encoder does not require bit-exact output. */
352         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
353             c->vsad[0] = vsad16_mmx;
357     if (INLINE_MMXEXT(cpu_flags)) {
358         c->vsad[4] = vsad_intra16_mmxext;
360         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
361             c->vsad[0] = vsad16_mmxext;
364 #endif /* HAVE_INLINE_ASM */
366     if (EXTERNAL_MMX(cpu_flags)) {
367         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
368         c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
369         c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmx;
370         c->sse[0]            = ff_sse16_mmx;
371         c->sse[1]            = ff_sse8_mmx;
373         c->nsse[0]           = nsse16_mmx;
374         c->nsse[1]           = nsse8_mmx;
378     if (EXTERNAL_MMXEXT(cpu_flags)) {
379         c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
380         c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
381         c->sum_abs_dctelem   = ff_sum_abs_dctelem_mmxext;
384     if (EXTERNAL_SSE2(cpu_flags)) {
385         c->sse[0] = ff_sse16_sse2;
386         c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
/* The SSE2/SSSE3 hadamard asm needs a 16-byte-aligned stack. */
388 #if HAVE_ALIGNED_STACK
389         c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
390         c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
394     if (EXTERNAL_SSSE3(cpu_flags)) {
395         c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
396 #if HAVE_ALIGNED_STACK
397         c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
398         c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
/* Chain to the pixel-ops initializer for the remaining pointers. */
402     ff_dsputil_init_pix_mmx(c, avctx);