1 /*
2  * MMX optimized motion estimation
3  * Copyright (c) 2001 Fabrice Bellard
4  * Copyright (c) 2002-2004 Michael Niedermayer
5  *
6  * mostly by Michael Niedermayer <michaelni@gmx.at>
7  *
8  * This file is part of FFmpeg.
9  *
10  * FFmpeg is free software; you can redistribute it and/or
11  * modify it under the terms of the GNU Lesser General Public
12  * License as published by the Free Software Foundation; either
13  * version 2.1 of the License, or (at your option) any later version.
14  *
15  * FFmpeg is distributed in the hope that it will be useful,
16  * but WITHOUT ANY WARRANTY; without even the implied warranty of
17  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
18  * Lesser General Public License for more details.
19  *
20  * You should have received a copy of the GNU Lesser General Public
21  * License along with FFmpeg; if not, write to the Free Software
22  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23  */
24
25 #include "libavutil/attributes.h"
26 #include "libavutil/avassert.h"
27 #include "libavutil/mem.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavutil/x86/cpu.h"
30 #include "dsputil_x86.h"
31
32 #if HAVE_INLINE_ASM
33
/* Word-lane rounding constants for the averaging SAD variants:
 * round_tab[1] (+1 per lane) rounds an average of 2, round_tab[2]
 * (+2 per lane) rounds an average of 4, before the right shift. */
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};
39
/* One in every byte lane; subtracted in sad8_4_mmxext to reduce the
 * upward rounding bias of two chained pavgb averages. */
DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
41
/*
 * SAD of an 8-pixel-wide block over h rows, plain MMX.
 * Contract with the PIX_SAD wrappers: mm7 must be zero on entry (used as
 * the zero half for byte->word unpacking) and the 16-bit partial sums are
 * accumulated into mm6, later collected by sum_mmx().
 * Processes two rows per loop iteration, so h is assumed even.
 */
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    /* Count the index register up from -stride*h to 0 so the sign flag
     * ("js") serves as the loop condition; the base pointers passed to
     * the asm are biased by -len to compensate. */
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        /* |a - b| per byte: (b -us a) | (a -us b) with unsigned saturation */
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        /* second row of the 2x-unrolled loop */
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        /* widen the byte differences to words and add into mm6 */
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}
76
/*
 * SAD of an 8-pixel-wide block over h rows using MMXEXT's psadbw.
 * Accumulates the per-row-pair SADs into mm6 (read back by sum_mmxext()).
 * Two rows per iteration, so h is assumed even.
 */
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        /* advance both pointers by two rows */
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
96
/*
 * SAD of a 16-pixel-wide block over h rows using SSE2 psadbw.
 * blk1 is loaded with movdqu (may be unaligned); blk2 is used as a
 * psadbw memory operand and therefore must be 16-byte aligned.
 * The unused void *v matches the DSPContext sad/pix_abs pointer signature.
 * h is assumed even (two rows per iteration).
 */
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2            \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        /* fold the high 64-bit psadbw lane into the low one and extract */
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm2         \n\t"
        "movd    %%xmm2, %3             \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg) stride));
    return ret;
}
121
/*
 * SAD against the horizontal (x) half-pel interpolation of blk1:
 * each row is averaged with its one-byte-right neighbour via pavgb
 * before psadbw. Accumulates into mm6; h assumed even.
 * Note pavgb rounds towards +infinity, so this is not bit-exact with
 * the C reference (see the CODEC_FLAG_BITEXACT check in init).
 */
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
143
/*
 * SAD against the vertical (y) half-pel interpolation of blk1:
 * each row is averaged with the row below via pavgb before psadbw.
 * mm0 carries the previous row across iterations so each source row
 * is loaded only once. Accumulates into mm6; h assumed even.
 */
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        /* keep the last loaded row for the next iteration's average */
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
168
/*
 * SAD against the 4-point (xy) half-pel interpolation of blk1,
 * approximated by chaining two pavgb averages (horizontal, then
 * vertical). Subtracting bone (1 per byte) between the two averages
 * reduces the cumulative round-to-+infinity bias of pavgb; the result
 * is still not bit-exact with the C reference. mm0 carries the
 * previous horizontally-averaged row; accumulates into mm6; h even.
 */
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
198
/*
 * SAD between blk2 and the rounded average of blk1a and blk1b
 * ((a + b + 1) >> 1 per pixel), used for the x2/y2 half-pel modes
 * on plain MMX. Contract with the PIX_SAD wrappers: mm7 == 0 and
 * mm5 == round_tab[1] on entry; partial sums accumulate into mm6.
 */
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              int stride, int h)
{
    /* negative index counting up to 0, bases biased by -len (as in sad8_1_mmx) */
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        /* widen both rows to words: low halves in mm0/mm1, high in mm2/mm3 */
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        /* (a + b + 1) >> 1, repacked to bytes */
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        /* absolute difference against blk2 via saturating subtractions */
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}
237
/*
 * SAD against the exact 4-point (xy) half-pel interpolation of blk1:
 * (cur + cur_right + below + below_right + 2) >> 2 per pixel, plain MMX.
 * mm0/mm1 carry the word sums of the previous row pair across iterations
 * so each row's horizontal sum is computed once. Contract with PIX_SAD:
 * mm7 == 0 on entry; partial sums accumulate into mm6. The %2 operand
 * points one row below %1 (blk1 + stride).
 */
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(x86_reg)stride * h;
    __asm__ volatile (
        /* prologue: horizontal word sums of the first row into mm0/mm1 */
        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        /* horizontal word sums of the next row into mm2/mm3 */
        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        /* round_tab[2]: +2 per word lane before the >> 2 */
        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        /* absolute difference against blk2 and accumulate into mm6 */
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        /* current row's sums become the previous row's for the next pass */
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}
290
/*
 * Collect the SAD accumulated as four 16-bit words in mm6 by the
 * plain-MMX kernels: fold the words together with two shift+add steps
 * and mask to 16 bits, since the upper words of the fold hold garbage
 * carries.
 */
static inline int sum_mmx(void)
{
    int ret;
    __asm__ volatile (
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $32, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movq %%mm6, %%mm0              \n\t"
        "psrlq $16, %%mm6               \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret & 0xFFFF;
}
305
/*
 * Collect the SAD accumulated in mm6 by the MMXEXT kernels: psadbw
 * already produced a scalar sum in the low word, so just read it out.
 */
static inline int sum_mmxext(void)
{
    int ret;
    __asm__ volatile (
        "movd %%mm6, %0                 \n\t"
        : "=r" (ret));
    return ret;
}
314
/* Horizontal (x) half-pel SAD on plain MMX: average blk1 with its
 * one-pixel-right neighbour. */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
319
/* Vertical (y) half-pel SAD on plain MMX: average blk1 with the row
 * below it. */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
324
/*
 * Instantiate the public SAD entry points for one instruction-set suffix
 * (mmx or mmxext). Each generated function first zeroes the accumulator
 * registers the kernels rely on (mm6 = running sum, mm7 = unpack zero;
 * the x2/y2 variants additionally load round_tab[1] into mm5), then runs
 * the matching sad8_* kernel on one 8-pixel column (sad8_*, fixed h == 8)
 * or on two side-by-side columns (sad16_*), and finally collects the
 * result with sum_<suf>(). The unused void *v argument keeps the
 * DSPContext sad/pix_abs function-pointer signature.
 */
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(void *v, uint8_t *blk2,                         \
                        uint8_t *blk1, int stride, int h)               \
{                                                                       \
    av_assert2(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(void *v, uint8_t *blk2,                      \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(void *v, uint8_t *blk2,                      \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    av_assert2(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2,                     \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    av_assert2(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(void *v, uint8_t *blk2,                        \
                         uint8_t *blk1, int stride, int h)              \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(void *v, uint8_t *blk2,                     \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(void *v, uint8_t *blk2,                     \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2,                    \
                             uint8_t *blk1, int stride, int h)          \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)
PIX_SAD(mmxext)
444
445 #endif /* HAVE_INLINE_ASM */
446
447 av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
448 {
449 #if HAVE_INLINE_ASM
450     int cpu_flags = av_get_cpu_flags();
451
452     if (INLINE_MMX(cpu_flags)) {
453         c->pix_abs[0][0] = sad16_mmx;
454         c->pix_abs[0][1] = sad16_x2_mmx;
455         c->pix_abs[0][2] = sad16_y2_mmx;
456         c->pix_abs[0][3] = sad16_xy2_mmx;
457         c->pix_abs[1][0] = sad8_mmx;
458         c->pix_abs[1][1] = sad8_x2_mmx;
459         c->pix_abs[1][2] = sad8_y2_mmx;
460         c->pix_abs[1][3] = sad8_xy2_mmx;
461
462         c->sad[0] = sad16_mmx;
463         c->sad[1] = sad8_mmx;
464     }
465     if (INLINE_MMXEXT(cpu_flags)) {
466         c->pix_abs[0][0] = sad16_mmxext;
467         c->pix_abs[1][0] = sad8_mmxext;
468
469         c->sad[0] = sad16_mmxext;
470         c->sad[1] = sad8_mmxext;
471
472         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
473             c->pix_abs[0][1] = sad16_x2_mmxext;
474             c->pix_abs[0][2] = sad16_y2_mmxext;
475             c->pix_abs[0][3] = sad16_xy2_mmxext;
476             c->pix_abs[1][1] = sad8_x2_mmxext;
477             c->pix_abs[1][2] = sad8_y2_mmxext;
478             c->pix_abs[1][3] = sad8_xy2_mmxext;
479         }
480     }
481     if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
482         c->sad[0] = sad16_sse2;
483     }
484 #endif /* HAVE_INLINE_ASM */
485 }