2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of FFmpeg.
10 * FFmpeg is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * FFmpeg is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with FFmpeg; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavutil/x86/cpu.h"
29 #include "libavcodec/dsputil.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "dsputil_x86.h"
/* Prototypes for encoder DSP primitives implemented in external (yasm)
 * assembly; definitions live in the companion .asm files. */

/* Copy 8-bit pixels into a 16-bit coefficient block. */
33 void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
34 void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
/* Store the element-wise difference s1 - s2 into a 16-bit block. */
35 void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2,
37 void ff_diff_pixels_sse2(int16_t *block, const uint8_t *s1, const uint8_t *s2,
/* Sum of absolute values of the 64 DCT coefficients, one variant per ISA. */
39 int ff_sum_abs_dctelem_mmx(int16_t *block);
40 int ff_sum_abs_dctelem_mmxext(int16_t *block);
41 int ff_sum_abs_dctelem_sse2(int16_t *block);
42 int ff_sum_abs_dctelem_ssse3(int16_t *block);
/* Sum of squared errors between two pixel blocks (8- and 16-wide). */
43 int ff_sse8_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
44 int line_size, int h);
45 int ff_sse16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
46 int line_size, int h);
47 int ff_sse16_sse2(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
48 int line_size, int h);
/* High-frequency noise measures consumed by the nsse comparators below. */
49 int ff_hf_noise8_mmx(uint8_t *pix1, int lsize, int h);
50 int ff_hf_noise16_mmx(uint8_t *pix1, int lsize, int h);

/* Declares the 8x8 and 16x16 Hadamard-difference prototypes for one
 * cpu-suffix (expanded once per instruction-set variant). */
52 #define hadamard_func(cpu) \
53 int ff_hadamard8_diff_ ## cpu(MpegEncContext *s, uint8_t *src1, \
54 uint8_t *src2, int stride, int h); \
55 int ff_hadamard8_diff16_ ## cpu(MpegEncContext *s, uint8_t *src1, \
56 uint8_t *src2, int stride, int h);
/* Noise-preserving SSE ("nsse") comparator for 16-pixel-wide blocks:
 * the plain sum of squared errors plus a penalty proportional to the
 * difference in high-frequency noise between the two blocks.  The
 * penalty weight is avctx->nsse_weight, with 8 as the default path. */
65 static int nsse16_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
/* Base distortion: the context's installed sse[0] on one path, or the
 * MMX 16-wide SSE directly on the other. */
71 score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
73 score1 = ff_sse16_mmx(c, pix1, pix2, line_size, h);
/* High-frequency noise of src minus that of ref; the 16-wide block is
 * covered by the 16-wide helper plus the 8-wide helper at offset 8. */
74 score2 = ff_hf_noise16_mmx(pix1, line_size, h) + ff_hf_noise8_mmx(pix1+8, line_size, h)
75 - ff_hf_noise16_mmx(pix2, line_size, h) - ff_hf_noise8_mmx(pix2+8, line_size, h);
78 return score1 + FFABS(score2) * c->avctx->nsse_weight;
80 return score1 + FFABS(score2) * 8;
/* 8-pixel-wide variant of the noise-preserving SSE comparator: SSE of
 * the block plus a weighted difference of the two blocks' HF noise. */
83 static int nsse8_mmx(MpegEncContext *c, uint8_t *pix1, uint8_t *pix2,
86 int score1 = ff_sse8_mmx(c, pix1, pix2, line_size, h);
87 int score2 = ff_hf_noise8_mmx(pix1, line_size, h) -
88 ff_hf_noise8_mmx(pix2, line_size, h);
/* Weight the noise term by nsse_weight on one path, default 8 on the other. */
91 return score1 + FFABS(score2) * c->avctx->nsse_weight;
93 return score1 + FFABS(score2) * 8;
96 #endif /* HAVE_YASM */
/* Vertical SAD of a 16-byte-wide block (intra variant: a single pixel
 * plane is read, advanced row-by-row via line_size).  MMX version.
 * Per-row absolute differences are accumulated as 16-bit words in
 * %%mm6 and reduced to a scalar at the end. */
100 static int vsad_intra16_mmx(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
101 int line_size, int h)
/* The asm below loads 8 bytes at a time, so pointer and stride must be
 * 8-byte aligned. */
105 av_assert2((((int) pix) & 7) == 0);
106 av_assert2((line_size & 7) == 0);

/* SUM(in0, in1, out0, out1): loads the next 16 pixels of the row into
 * %%mm2/%%mm3 and saves them to out0/out1; forms the byte-wise absolute
 * difference against the previous row (in0/in1) with the classic
 * unsigned-saturating-subtract-both-ways + por idiom; widens the result
 * bytes to words against the zeroed %%mm7 (punpcklbw/punpckhbw) and
 * accumulates the word sums into %%mm6. */
108 #define SUM(in0, in1, out0, out1) \
109 "movq (%0), %%mm2\n" \
110 "movq 8(%0), %%mm3\n" \
112 "movq %%mm2, " #out0 "\n" \
113 "movq %%mm3, " #out1 "\n" \
114 "psubusb " #in0 ", %%mm2\n" \
115 "psubusb " #in1 ", %%mm3\n" \
116 "psubusb " #out0 ", " #in0 "\n" \
117 "psubusb " #out1 ", " #in1 "\n" \
118 "por %%mm2, " #in0 "\n" \
119 "por %%mm3, " #in1 "\n" \
120 "movq " #in0 ", %%mm2\n" \
121 "movq " #in1 ", %%mm3\n" \
122 "punpcklbw %%mm7, " #in0 "\n" \
123 "punpcklbw %%mm7, " #in1 "\n" \
124 "punpckhbw %%mm7, %%mm2\n" \
125 "punpckhbw %%mm7, %%mm3\n" \
126 "paddw " #in1 ", " #in0 "\n" \
127 "paddw %%mm3, %%mm2\n" \
128 "paddw %%mm2, " #in0 "\n" \
129 "paddw " #in0 ", %%mm6\n"

/* %%mm6 = running word accumulator, %%mm7 = zero (for byte->word widening). */
134 "pxor %%mm6, %%mm6\n"
135 "pxor %%mm7, %%mm7\n"
/* Prime the "previous row" registers with the first 16 pixels. */
137 "movq 8(%0), %%mm1\n"
/* Loop body processes two rows per iteration, ping-ponging the register
 * pairs used as previous/current row. */
142 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
144 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* Horizontal reduction: fold the four words of %%mm6 down via shifted
 * copies and paddw so the total lands in the low word of %%mm0. */
149 "movq %%mm6, %%mm0\n"
151 "paddw %%mm6, %%mm0\n"
152 "movq %%mm0, %%mm6\n"
154 "paddw %%mm6, %%mm0\n"
/* pix advances through the asm; tmp receives the reduced sum. */
156 : "+r" (pix), "=r" (tmp)
157 : "r" ((x86_reg) line_size), "m" (h)
/* MMXEXT version of the intra 16-wide vertical SAD: identical contract
 * to vsad_intra16_mmx, but uses psadbw to compute the sum of absolute
 * byte differences in a single instruction per 8 bytes. */
164 static int vsad_intra16_mmxext(MpegEncContext *v, uint8_t *pix, uint8_t *dummy,
165 int line_size, int h)
/* 8-byte alignment required by the movq loads below. */
169 av_assert2((((int) pix) & 7) == 0);
170 av_assert2((line_size & 7) == 0);

/* SUM(in0, in1, out0, out1): loads the next 16 pixels into out0/out1,
 * computes |current - previous| summed per 8-byte group with psadbw,
 * and accumulates the totals into %%mm6. */
172 #define SUM(in0, in1, out0, out1) \
173 "movq (%0), " #out0 "\n" \
174 "movq 8(%0), " #out1 "\n" \
176 "psadbw " #out0 ", " #in0 "\n" \
177 "psadbw " #out1 ", " #in1 "\n" \
178 "paddw " #in1 ", " #in0 "\n" \
179 "paddw " #in0 ", %%mm6\n"

/* %%mm6 = accumulator; %%mm7 cleared (kept for symmetry with the MMX path). */
183 "pxor %%mm6, %%mm6\n"
184 "pxor %%mm7, %%mm7\n"
/* Prime the previous-row registers with the first 16 pixels. */
186 "movq 8(%0), %%mm1\n"
/* Two rows per loop iteration, ping-ponging the register pairs. */
191 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
193 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
199 : "+r" (pix), "=r" (tmp)
200 : "r" ((x86_reg) line_size), "m" (h)
/* Vertical SAD of the 16-wide difference signal pix1 - pix2 (inter
 * variant): measures how much the per-row residual changes from one
 * row to the next.  MMX version; word sums accumulate in %%mm6. */
207 static int vsad16_mmx(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
208 int line_size, int h)
/* movq loads require 8-byte alignment of both planes and the stride. */
212 av_assert2((((int) pix1) & 7) == 0);
213 av_assert2((((int) pix2) & 7) == 0);
214 av_assert2((line_size & 7) == 0);

/* SUM(in0, in1, out0, out1): loads 16 bytes from each plane, forms the
 * signed difference pix1 - pix2 (psubb) and flips it with the bias
 * constant in %%mm7 (pxor) so the subsequent unsigned-saturating
 * absolute-difference idiom (psubusb both ways + por) against the
 * previous row's value is valid; the result is widened to words and
 * accumulated into %%mm6. */
216 #define SUM(in0, in1, out0, out1) \
217 "movq (%0), %%mm2\n" \
218 "movq (%1), " #out0 "\n" \
219 "movq 8(%0), %%mm3\n" \
220 "movq 8(%1), " #out1 "\n" \
223 "psubb " #out0 ", %%mm2\n" \
224 "psubb " #out1 ", %%mm3\n" \
225 "pxor %%mm7, %%mm2\n" \
226 "pxor %%mm7, %%mm3\n" \
227 "movq %%mm2, " #out0 "\n" \
228 "movq %%mm3, " #out1 "\n" \
229 "psubusb " #in0 ", %%mm2\n" \
230 "psubusb " #in1 ", %%mm3\n" \
231 "psubusb " #out0 ", " #in0 "\n" \
232 "psubusb " #out1 ", " #in1 "\n" \
233 "por %%mm2, " #in0 "\n" \
234 "por %%mm3, " #in1 "\n" \
235 "movq " #in0 ", %%mm2\n" \
236 "movq " #in1 ", %%mm3\n" \
237 "punpcklbw %%mm7, " #in0 "\n" \
238 "punpcklbw %%mm7, " #in1 "\n" \
239 "punpckhbw %%mm7, %%mm2\n" \
240 "punpckhbw %%mm7, %%mm3\n" \
241 "paddw " #in1 ", " #in0 "\n" \
242 "paddw %%mm3, %%mm2\n" \
243 "paddw %%mm2, " #in0 "\n" \
244 "paddw " #in0 ", %%mm6\n"

/* %%mm6 = accumulator; %%mm7 is built into the sign-flip byte mask
 * (pcmpeqw + packsswb; presumably 0x80 per byte — the full constant
 * construction appears abbreviated in this view, TODO confirm). */
249 "pxor %%mm6, %%mm6\n"
250 "pcmpeqw %%mm7, %%mm7\n"
252 "packsswb %%mm7, %%mm7\n"
/* Prime the previous-row difference: first row of pix1 - pix2, bias-flipped. */
255 "movq 8(%0), %%mm1\n"
256 "movq 8(%1), %%mm3\n"
259 "psubb %%mm2, %%mm0\n"
260 "psubb %%mm3, %%mm1\n"
261 "pxor %%mm7, %%mm0\n"
262 "pxor %%mm7, %%mm1\n"
/* Two rows per loop iteration, ping-ponging the register pairs. */
266 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
268 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
/* Horizontal reduction of the four word lanes of %%mm6 into %%mm0. */
273 "movq %%mm6, %%mm0\n"
275 "paddw %%mm6, %%mm0\n"
276 "movq %%mm0, %%mm6\n"
278 "paddw %%mm6, %%mm0\n"
/* Both plane pointers advance through the asm; tmp receives the sum. */
280 : "+r" (pix1), "+r" (pix2), "=r" (tmp)
281 : "r" ((x86_reg) line_size), "m" (h)
/* MMXEXT version of the inter 16-wide vertical SAD: same contract as
 * vsad16_mmx, but the absolute row-to-row difference of the biased
 * residual is computed directly with psadbw. */
288 static int vsad16_mmxext(MpegEncContext *v, uint8_t *pix1, uint8_t *pix2,
289 int line_size, int h)
/* 8-byte alignment required for the movq loads of both planes. */
293 av_assert2((((int) pix1) & 7) == 0);
294 av_assert2((((int) pix2) & 7) == 0);
295 av_assert2((line_size & 7) == 0);

/* SUM(in0, in1, out0, out1): loads 16 bytes from each plane, forms the
 * bias-flipped residual pix1 - pix2 (psubb + pxor %%mm7), then sums the
 * absolute byte differences against the previous row's residual via
 * psadbw and accumulates into %%mm6. */
297 #define SUM(in0, in1, out0, out1) \
298 "movq (%0), " #out0 "\n" \
299 "movq (%1), %%mm2\n" \
300 "movq 8(%0), " #out1 "\n" \
301 "movq 8(%1), %%mm3\n" \
304 "psubb %%mm2, " #out0 "\n" \
305 "psubb %%mm3, " #out1 "\n" \
306 "pxor %%mm7, " #out0 "\n" \
307 "pxor %%mm7, " #out1 "\n" \
308 "psadbw " #out0 ", " #in0 "\n" \
309 "psadbw " #out1 ", " #in1 "\n" \
310 "paddw " #in1 ", " #in0 "\n" \
311 "paddw " #in0 ", %%mm6\n "

/* %%mm6 = accumulator; %%mm7 built into the per-byte sign-flip mask
 * (pcmpeqw + packsswb; constant construction appears abbreviated here,
 * TODO confirm against the full source). */
315 "pxor %%mm6, %%mm6\n"
316 "pcmpeqw %%mm7, %%mm7\n"
318 "packsswb %%mm7, %%mm7\n"
/* Prime the previous-row residual from the first row of both planes. */
321 "movq 8(%0), %%mm1\n"
322 "movq 8(%1), %%mm3\n"
325 "psubb %%mm2, %%mm0\n"
326 "psubb %%mm3, %%mm1\n"
327 "pxor %%mm7, %%mm0\n"
328 "pxor %%mm7, %%mm1\n"
/* Two rows per loop iteration, ping-ponging the register pairs. */
332 SUM(%%mm4, %%mm5, %%mm0, %%mm1)
334 SUM(%%mm0, %%mm1, %%mm4, %%mm5)
340 : "+r" (pix1), "+r" (pix2), "=r" (tmp)
341 : "r" ((x86_reg) line_size), "m" (h)
349 #endif /* HAVE_INLINE_ASM */
/* Install the x86-optimized encoder DSP function pointers into *c
 * according to the CPU features detected at runtime.  External (yasm)
 * symbols and inline-asm implementations are gated separately; later,
 * more capable ISA levels overwrite the pointers set by earlier ones. */
351 av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx,
352 unsigned high_bit_depth)
354 int cpu_flags = av_get_cpu_flags();

/* External-assembly pixel copy/diff primitives.
 * NOTE(review): the high_bit_depth guard for these 8-bit routines is
 * not visible in this view — confirm against the full source. */
356 if (EXTERNAL_MMX(cpu_flags)) {
358 c->get_pixels = ff_get_pixels_mmx;
359 c->diff_pixels = ff_diff_pixels_mmx;

362 if (EXTERNAL_SSE2(cpu_flags))
364 c->get_pixels = ff_get_pixels_sse2;

/* Inline-asm comparators.  The inter vsad implementations are not
 * bit-exact, so they are only installed when CODEC_FLAG_BITEXACT is
 * not requested. */
367 if (INLINE_MMX(cpu_flags)) {
368 c->vsad[4] = vsad_intra16_mmx;

370 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
371 c->vsad[0] = vsad16_mmx;

375 if (INLINE_MMXEXT(cpu_flags)) {
376 c->vsad[4] = vsad_intra16_mmxext;

378 if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
379 c->vsad[0] = vsad16_mmxext;

383 if (INLINE_SSE2(cpu_flags)) {

386 #if HAVE_SSSE3_INLINE
387 if (INLINE_SSSE3(cpu_flags)) {
390 #endif /* HAVE_INLINE_ASM */

/* External-assembly comparators, from least to most capable ISA. */
392 if (EXTERNAL_MMX(cpu_flags)) {
393 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
394 c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
395 c->sum_abs_dctelem = ff_sum_abs_dctelem_mmx;
396 c->sse[0] = ff_sse16_mmx;
397 c->sse[1] = ff_sse8_mmx;
399 c->nsse[0] = nsse16_mmx;
400 c->nsse[1] = nsse8_mmx;

404 if (EXTERNAL_MMXEXT(cpu_flags)) {
405 c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
406 c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
407 c->sum_abs_dctelem = ff_sum_abs_dctelem_mmxext;

410 if (EXTERNAL_SSE2(cpu_flags)) {
411 c->sse[0] = ff_sse16_sse2;
412 c->sum_abs_dctelem = ff_sum_abs_dctelem_sse2;
413 c->diff_pixels = ff_diff_pixels_sse2;

/* The SSE2/SSSE3 hadamard routines need a 16-byte-aligned stack. */
415 #if HAVE_ALIGNED_STACK
416 c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
417 c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;

421 if (EXTERNAL_SSSE3(cpu_flags)) {
422 c->sum_abs_dctelem = ff_sum_abs_dctelem_ssse3;
423 #if HAVE_ALIGNED_STACK
424 c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
425 c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;

/* Delegate the remaining pixel comparators to the shared init helper. */
429 ff_dsputil_init_pix_mmx(c, avctx);