/* Source file: libavcodec/x86/motion_est.c (Libav/FFmpeg tree, git.sesse.net
 * mirror, at commit "dsputil: Move APE-specific bits into apedsp"). */
/*
 * MMX optimized motion estimation
 * Copyright (c) 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer
 *
 * mostly by Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
24
25 #include "libavutil/attributes.h"
26 #include "libavutil/internal.h"
27 #include "libavutil/mem.h"
28 #include "libavutil/x86/asm.h"
29 #include "libavutil/x86/cpu.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "dsputil_x86.h"
32
33 #if HAVE_INLINE_ASM
34
/* Packed-word rounding constants for the averaging SAD variants:
 * round_tab[1] (words of 1) is loaded into mm5 by the x2/y2 MMX wrappers
 * and added before the >>1 in sad8_2_mmx; round_tab[2] (words of 2) is
 * loaded as 16+round_tab in sad8_4_mmx before the >>2.
 * 8-byte alignment allows a single movq load. */
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3] = {
    0x0000000000000000ULL,
    0x0001000100010001ULL,
    0x0002000200020002ULL,
};

/* All-bytes-one pattern; subtracted (saturating) before a pavgb in
 * sad8_4_mmxext to compensate for pavgb's round-up behaviour. */
DECLARE_ASM_CONST(8, uint64_t, bone) = 0x0101010101010101LL;
42
/*
 * SAD of an 8-pixel-wide block, no interpolation (plain MMX).
 * REG_a starts at -(stride * h) and counts up to zero, so the pointer
 * operands are pre-biased by -len; two rows are processed per iteration.
 * Per-byte |blk1 - blk2| is built from the pair of saturating subtracts
 * psubusb(a,b) | psubusb(b,a), widened to words against mm7 (which the
 * caller must have zeroed), and accumulated into mm6 (also zeroed by the
 * caller; read back afterwards via sum_mmx()).
 */
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(stride * h);
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm4    \n\t"
        "add %3, %%"REG_a"              \n\t"
        "psubusb %%mm0, %%mm2           \n\t"
        "psubusb %%mm4, %%mm0           \n\t"
        "movq (%1, %%"REG_a"), %%mm1    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "movq (%2, %%"REG_a"), %%mm5    \n\t"
        "psubusb %%mm1, %%mm3           \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "por %%mm2, %%mm0               \n\t"
        "por %%mm1, %%mm3               \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm3, %%mm2              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm3         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm3, %%mm2             \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %3, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg) stride));
}
77
/*
 * SAD of an 8-pixel-wide block, no interpolation (MMXEXT).
 * Uses psadbw to do the whole abs-diff-and-sum per row in one
 * instruction; two rows per iteration, partial sums accumulated in mm6
 * (zeroed by the caller, read back via sum_mmxext()).
 */
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
97
/*
 * 16-pixel-wide SAD, no interpolation (SSE2).
 * Unlike the MMX helpers this is self-contained: it zeroes its own
 * accumulator (xmm2), uses unaligned loads for blk1, psadbw against
 * blk2, then folds the high and low 64-bit partial sums together and
 * returns the scalar. Installed directly as c->sad[0]; the
 * MpegEncContext argument is unused but required by that function
 * pointer's signature.
 */
static int sad16_sse2(MpegEncContext *v, uint8_t *blk2, uint8_t *blk1,
                      int stride, int h)
{
    int ret;
    __asm__ volatile (
        "pxor %%xmm2, %%xmm2            \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movdqu (%1), %%xmm0            \n\t"
        "movdqu (%1, %4), %%xmm1        \n\t"
        "psadbw (%2), %%xmm0            \n\t"
        "psadbw (%2, %4), %%xmm1        \n\t"
        "paddw %%xmm0, %%xmm2           \n\t"
        "paddw %%xmm1, %%xmm2           \n\t"
        "lea (%1,%4,2), %1              \n\t"
        "lea (%2,%4,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        "movhlps %%xmm2, %%xmm0         \n\t"
        "paddw   %%xmm0, %%xmm2         \n\t"
        "movd    %%xmm2, %3             \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2), "=r" (ret)
        : "r" ((x86_reg) stride));
    return ret;
}
123
/*
 * SAD with horizontal half-pel interpolation (MMXEXT): each source row
 * is averaged with its one-byte-right neighbour via pavgb (which rounds
 * up) before the psadbw. Accumulates into mm6 (zeroed by the caller).
 * Not bitexact with the round_tab-based MMX version, hence only used
 * when CODEC_FLAG_BITEXACT is not set.
 */
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm0               \n\t"
        "movq (%1, %3), %%mm1           \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "pavgb 1(%1, %3), %%mm1         \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
145
/*
 * SAD with vertical half-pel interpolation (MMXEXT): each row is
 * averaged with the row below via pavgb. The previous row is carried
 * across iterations in mm0 (preloaded before the loop, refreshed from
 * mm2 at the bottom), so each source row is loaded only once.
 * Accumulates into mm6 (zeroed by the caller); gated behind
 * !CODEC_FLAG_BITEXACT like the other pavgb variants.
 */
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
                                   int stride, int h)
{
    __asm__ volatile (
        "movq (%1), %%mm0               \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1, %3), %%mm2           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2, %3), %%mm1         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
170
/*
 * SAD with both horizontal and vertical half-pel interpolation
 * (MMXEXT). Approximates the 4-point average by cascading two pavgb
 * steps (horizontal pair, then vertical pair); since pavgb rounds up at
 * each step, one row has "bone" (all bytes 1) subtracted (saturating)
 * beforehand to undo the double round-up. This is an approximation,
 * not bitexact, so the init code only installs it when
 * CODEC_FLAG_BITEXACT is not set. The previous row's horizontal
 * average is carried in mm0 across iterations; accumulates into mm6
 * (zeroed by the caller).
 */
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
                                 int stride, int h)
{
    __asm__ volatile (
        "movq "MANGLE(bone)", %%mm5     \n\t"
        "movq (%1), %%mm0               \n\t"
        "pavgb 1(%1), %%mm0             \n\t"
        "add %3, %1                     \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1), %%mm1               \n\t"
        "movq (%1,%3), %%mm2            \n\t"
        "pavgb 1(%1), %%mm1             \n\t"
        "pavgb 1(%1,%3), %%mm2          \n\t"
        "psubusb %%mm5, %%mm1           \n\t"
        "pavgb %%mm1, %%mm0             \n\t"
        "pavgb %%mm2, %%mm1             \n\t"
        "psadbw (%2), %%mm0             \n\t"
        "psadbw (%2,%3), %%mm1          \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm1, %%mm6             \n\t"
        "movq %%mm2, %%mm0              \n\t"
        "lea (%1,%3,2), %1              \n\t"
        "lea (%2,%3,2), %2              \n\t"
        "sub $2, %0                     \n\t"
        " jg 1b                         \n\t"
        : "+r" (h), "+r" (blk1), "+r" (blk2)
        : "r" ((x86_reg) stride));
}
200
/*
 * SAD against the rounded average of two source blocks (plain MMX).
 * Generic kernel behind the x2 (blk1b = blk1a + 1) and y2
 * (blk1b = blk1a + stride) half-pel wrappers below. Bytes of both
 * sources are widened to words, summed, rounded with the constant the
 * caller preloaded into mm5 (round_tab[1]) and shifted right by one,
 * then the abs diff against blk2 is formed via the psubusb/por trick
 * and accumulated into mm6. mm7 must be zero on entry; REG_a counts a
 * negative offset up to zero as in sad8_1_mmx.
 */
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2,
                              int stride, int h)
{
    x86_reg len = -(stride * h);
    __asm__ volatile (
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%1, %%"REG_a"), %%mm0    \n\t"
        "movq (%2, %%"REG_a"), %%mm1    \n\t"
        "movq (%1, %%"REG_a"), %%mm2    \n\t"
        "movq (%2, %%"REG_a"), %%mm3    \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpcklbw %%mm7, %%mm1         \n\t"
        "punpckhbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm0, %%mm1             \n\t"
        "paddw %%mm2, %%mm3             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm2    \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "psrlw $1, %%mm1                \n\t"
        "psrlw $1, %%mm3                \n\t"
        "packuswb %%mm3, %%mm1          \n\t"
        "psubusb %%mm1, %%mm4           \n\t"
        "psubusb %%mm2, %%mm1           \n\t"
        "por %%mm4, %%mm1               \n\t"
        "movq %%mm1, %%mm0              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "paddw %%mm1, %%mm0             \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1a - len), "r" (blk1b - len), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}
239
/*
 * SAD with exact 2x2 (xy half-pel) interpolation (plain MMX).
 * The reference pixel is (a + b + c + d + 2) >> 2 of the four
 * neighbours. The word sums of horizontally adjacent pairs for the
 * current row are kept in mm0/mm1 across iterations (preloaded before
 * the loop), so each row's pair sums are computed once; every iteration
 * adds the next row's pair sums (operand %2 is blk1 + stride), adds the
 * round_tab[2] constant (loaded as 16+round_tab), shifts right by two,
 * and accumulates the abs diff against blk2 into mm6. mm7 must be zero
 * on entry; REG_a counts a negative offset up to zero.
 */
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    x86_reg len = -(stride * h);
    __asm__ volatile (
        "movq  (%1, %%"REG_a"), %%mm0   \n\t"
        "movq 1(%1, %%"REG_a"), %%mm2   \n\t"
        "movq %%mm0, %%mm1              \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm1         \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq  (%2, %%"REG_a"), %%mm2   \n\t"
        "movq 1(%2, %%"REG_a"), %%mm4   \n\t"
        "movq %%mm2, %%mm3              \n\t"
        "movq %%mm4, %%mm5              \n\t"
        "punpcklbw %%mm7, %%mm2         \n\t"
        "punpckhbw %%mm7, %%mm3         \n\t"
        "punpcklbw %%mm7, %%mm4         \n\t"
        "punpckhbw %%mm7, %%mm5         \n\t"
        "paddw %%mm4, %%mm2             \n\t"
        "paddw %%mm5, %%mm3             \n\t"
        "movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
        "paddw %%mm2, %%mm0             \n\t"
        "paddw %%mm3, %%mm1             \n\t"
        "paddw %%mm5, %%mm0             \n\t"
        "paddw %%mm5, %%mm1             \n\t"
        "movq (%3, %%"REG_a"), %%mm4    \n\t"
        "movq (%3, %%"REG_a"), %%mm5    \n\t"
        "psrlw $2, %%mm0                \n\t"
        "psrlw $2, %%mm1                \n\t"
        "packuswb %%mm1, %%mm0          \n\t"
        "psubusb %%mm0, %%mm4           \n\t"
        "psubusb %%mm5, %%mm0           \n\t"
        "por %%mm4, %%mm0               \n\t"
        "movq %%mm0, %%mm4              \n\t"
        "punpcklbw %%mm7, %%mm0         \n\t"
        "punpckhbw %%mm7, %%mm4         \n\t"
        "paddw %%mm0, %%mm6             \n\t"
        "paddw %%mm4, %%mm6             \n\t"
        "movq  %%mm2, %%mm0             \n\t"
        "movq  %%mm3, %%mm1             \n\t"
        "add %4, %%"REG_a"              \n\t"
        " js 1b                         \n\t"
        : "+a" (len)
        : "r" (blk1 - len), "r" (blk1 - len + stride), "r" (blk2 - len),
          "r" ((x86_reg) stride));
}
292
293 static inline int sum_mmx(void)
294 {
295     int ret;
296     __asm__ volatile (
297         "movq %%mm6, %%mm0              \n\t"
298         "psrlq $32, %%mm6               \n\t"
299         "paddw %%mm0, %%mm6             \n\t"
300         "movq %%mm6, %%mm0              \n\t"
301         "psrlq $16, %%mm6               \n\t"
302         "paddw %%mm0, %%mm6             \n\t"
303         "movd %%mm6, %0                 \n\t"
304         : "=r" (ret));
305     return ret & 0xFFFF;
306 }
307
308 static inline int sum_mmxext(void)
309 {
310     int ret;
311     __asm__ volatile (
312         "movd %%mm6, %0                 \n\t"
313         : "=r" (ret));
314     return ret;
315 }
316
/* Horizontal half-pel SAD (MMX): average blk1 with its right neighbour
 * by feeding the generic two-source kernel blk1 and blk1 + 1. */
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + 1, blk2, stride, h);
}
321
/* Vertical half-pel SAD (MMX): average blk1 with the row below it by
 * feeding the generic two-source kernel blk1 and blk1 + stride. */
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
    sad8_2_mmx(blk1, blk1 + stride, blk2, stride, h);
}
326
/*
 * Generate the public SAD entry points for one instruction-set suffix
 * (instantiated below for "mmx" and "mmxext"). Each wrapper first
 * zeroes the mm6 accumulator and mm7 (used as a zero register for
 * unpacking) in a separate asm statement, loads round_tab[1] into mm5
 * where the x2/y2 kernels expect it, then calls the matching sad8_*
 * kernel — twice, 8 columns apart, for the 16-wide versions — and
 * finally collects the result with sum_<suffix>(). The MpegEncContext
 * argument is unused here but required by the pix_abs/sad function
 * pointer signatures filled in by ff_dsputil_init_pix_mmx(). The
 * 8-wide versions assert h == 8 because they hardcode that height.
 */
#define PIX_SAD(suf)                                                    \
static int sad8_ ## suf(MpegEncContext *v, uint8_t *blk2,               \
                        uint8_t *blk1, int stride, int h)               \
{                                                                       \
    assert(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    assert(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,            \
                           uint8_t *blk1, int stride, int h)            \
{                                                                       \
    assert(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1, blk2, stride, 8);                            \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad8_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    assert(h == 8);                                                     \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1, blk2, stride, 8);                              \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_ ## suf(MpegEncContext *v, uint8_t *blk2,              \
                         uint8_t *blk1, int stride, int h)              \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        :);                                                             \
                                                                        \
    sad8_1_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_1_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_x2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_x2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_x2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_y2_ ## suf(MpegEncContext *v, uint8_t *blk2,           \
                            uint8_t *blk1, int stride, int h)           \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        "movq %0, %%mm5        \n\t"                                    \
        :: "m" (round_tab[1]));                                         \
                                                                        \
    sad8_y2a_ ## suf(blk1,     blk2,     stride, h);                    \
    sad8_y2a_ ## suf(blk1 + 8, blk2 + 8, stride, h);                    \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \
                                                                        \
static int sad16_xy2_ ## suf(MpegEncContext *v, uint8_t *blk2,          \
                             uint8_t *blk1, int stride, int h)          \
{                                                                       \
    __asm__ volatile (                                                  \
        "pxor %%mm7, %%mm7     \n\t"                                    \
        "pxor %%mm6, %%mm6     \n\t"                                    \
        ::);                                                            \
                                                                        \
    sad8_4_ ## suf(blk1,     blk2,     stride, h);                      \
    sad8_4_ ## suf(blk1 + 8, blk2 + 8, stride, h);                      \
                                                                        \
    return sum_ ## suf();                                               \
}                                                                       \

PIX_SAD(mmx)
PIX_SAD(mmxext)
446
447 #endif /* HAVE_INLINE_ASM */
448
449 av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
450 {
451 #if HAVE_INLINE_ASM
452     int cpu_flags = av_get_cpu_flags();
453
454     if (INLINE_MMX(cpu_flags)) {
455         c->pix_abs[0][0] = sad16_mmx;
456         c->pix_abs[0][1] = sad16_x2_mmx;
457         c->pix_abs[0][2] = sad16_y2_mmx;
458         c->pix_abs[0][3] = sad16_xy2_mmx;
459         c->pix_abs[1][0] = sad8_mmx;
460         c->pix_abs[1][1] = sad8_x2_mmx;
461         c->pix_abs[1][2] = sad8_y2_mmx;
462         c->pix_abs[1][3] = sad8_xy2_mmx;
463
464         c->sad[0] = sad16_mmx;
465         c->sad[1] = sad8_mmx;
466     }
467     if (INLINE_MMXEXT(cpu_flags)) {
468         c->pix_abs[0][0] = sad16_mmxext;
469         c->pix_abs[1][0] = sad8_mmxext;
470
471         c->sad[0] = sad16_mmxext;
472         c->sad[1] = sad8_mmxext;
473
474         if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
475             c->pix_abs[0][1] = sad16_x2_mmxext;
476             c->pix_abs[0][2] = sad16_y2_mmxext;
477             c->pix_abs[0][3] = sad16_xy2_mmxext;
478             c->pix_abs[1][1] = sad8_x2_mmxext;
479             c->pix_abs[1][2] = sad8_y2_mmxext;
480             c->pix_abs[1][3] = sad8_xy2_mmxext;
481         }
482     }
483     if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW)) {
484         c->sad[0] = sad16_sse2;
485     }
486 #endif /* HAVE_INLINE_ASM */
487 }