2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of Libav.
8 * Libav is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * Libav is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with Libav; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
/* Constant tables used by the MMX/SSE2 kernels below.
 * Naming: ff_pw_* = packed 16-bit words, ff_pb_* = packed bytes,
 *         ff_pd_* = packed doubles, ff_pdw_* = packed 32-bit dwords.
 * 8-byte-aligned uint64_t constants are MMX-only; 16-byte-aligned xmm_reg
 * constants can be loaded into either MMX (low half) or XMM registers. */
38 /* pixel operations */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 {0x8000000080000000ULL, 0x8000000080000000ULL};
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1 ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
52 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
/* ff_pw_20 and ff_pw_3 are the (20, 3) taps of the qpel lowpass filter used below */
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
72 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
73 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
/* ff_pb_80 is used as a sign-flip bias in ff_put_signed_pixels_clamped_mmx */
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
81 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
/* 0xFE mask: clears the lowest bit of every byte (used by the PAVGB macros) */
82 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
84 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
85 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* JUMPALIGN: align the next asm label to an 8-byte boundary. */
87 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* MOVQ_ZERO(reg): zero an MMX register via xor-with-self. */
88 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
/* MOVQ_BFE(reg): materialize 0xFEFE... in reg without a memory load:
 * pcmpeqd gives all-ones (0xFF bytes); paddb doubles each byte -> 0xFE.
 * NOTE(review): the "__asm__ volatile (" opener line for this macro is not
 * visible in this excerpt -- confirm against the full file. */
90 #define MOVQ_BFE(regd) \
92 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
93 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Memory-load variants: fetch the constants from the tables above. */
96 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
97 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
99 // for shared library it's better to use this way for accessing constants
/* Register-only variants (no data reference, PIC-friendly):
 * all-ones -> psrlw 15 gives 0x0001 words -> packuswb gives 0x01 bytes (BONE),
 * or psllw 1 gives 0x0002 words (WTWO).
 * NOTE(review): the preprocessor conditional that selects between these and
 * the memory-load definitions above is not visible in this excerpt. */
101 #define MOVQ_BONE(regd) \
103 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
104 "psrlw $15, %%" #regd " \n\t" \
105 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
107 #define MOVQ_WTWO(regd) \
109 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
110 "psrlw $15, %%" #regd " \n\t" \
111 "psllw $1, %%" #regd " \n\t"::)
115 // using regr as temporary and for the output result
116 // first argument is unmodifed and second is trashed
117 // regfe is supposed to contain 0xfefefefefefefefe
/* Byte average WITHOUT rounding (truncating):
 * regr = (a & b) + (((a ^ b) & 0xfe) >> 1)  ==  (a + b) >> 1 per byte. */
118 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
119 "movq " #rega ", " #regr " \n\t"\
120 "pand " #regb ", " #regr " \n\t"\
121 "pxor " #rega ", " #regb " \n\t"\
122 "pand " #regfe "," #regb " \n\t"\
123 "psrlq $1, " #regb " \n\t"\
124 "paddb " #regb ", " #regr " \n\t"
/* Byte average WITH rounding:
 * regr = (a | b) - (((a ^ b) & 0xfe) >> 1)  ==  (a + b + 1) >> 1 per byte. */
126 #define PAVGB_MMX(rega, regb, regr, regfe) \
127 "movq " #rega ", " #regr " \n\t"\
128 "por " #regb ", " #regr " \n\t"\
129 "pxor " #rega ", " #regb " \n\t"\
130 "pand " #regfe "," #regb " \n\t"\
131 "psrlq $1, " #regb " \n\t"\
132 "psubb " #regb ", " #regr " \n\t"
134 // mm6 is supposed to contain 0xfefefefefefefefe
/* Paired variant of PAVGB_MMX_NO_RND: averages (rega,regb)->regr and
 * (regc,regd)->regp in one interleaved sequence; mm6 holds the 0xFE mask. */
135 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
136 "movq " #rega ", " #regr " \n\t"\
137 "movq " #regc ", " #regp " \n\t"\
138 "pand " #regb ", " #regr " \n\t"\
139 "pand " #regd ", " #regp " \n\t"\
140 "pxor " #rega ", " #regb " \n\t"\
141 "pxor " #regc ", " #regd " \n\t"\
142 "pand %%mm6, " #regb " \n\t"\
143 "pand %%mm6, " #regd " \n\t"\
144 "psrlq $1, " #regb " \n\t"\
145 "psrlq $1, " #regd " \n\t"\
146 "paddb " #regb ", " #regr " \n\t"\
147 "paddb " #regd ", " #regp " \n\t"
/* Paired variant of PAVGB_MMX (rounding): (a|b) - (((a^b)&0xfe)>>1), twice. */
149 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
150 "movq " #rega ", " #regr " \n\t"\
151 "movq " #regc ", " #regp " \n\t"\
152 "por " #regb ", " #regr " \n\t"\
153 "por " #regd ", " #regp " \n\t"\
154 "pxor " #rega ", " #regb " \n\t"\
155 "pxor " #regc ", " #regd " \n\t"\
156 "pand %%mm6, " #regb " \n\t"\
157 "pand %%mm6, " #regd " \n\t"\
158 "psrlq $1, " #regd " \n\t"\
159 "psrlq $1, " #regb " \n\t"\
160 "psubb " #regb ", " #regr " \n\t"\
161 "psubb " #regd ", " #regp " \n\t"
163 /***********************************/
164 /* MMX no rounding */
/* Instantiate the rnd/avg templates for each instruction-set flavour.
 * DEF names the generated functions, SET_RND/PAVGB*/"/"OP_AVG select the
 * averaging primitives used inside the template.
 * NOTE(review): the #undef lines between instantiations and the definition of
 * MOVQ_WONE are not visible in this excerpt -- confirm against the full file. */
165 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
166 #define SET_RND MOVQ_WONE
167 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
168 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
169 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
171 #include "dsputil_mmx_rnd_template.c"
177 /***********************************/
/* MMX rounding flavour */
180 #define DEF(x, y) x ## _ ## y ##_mmx
181 #define SET_RND MOVQ_WTWO
182 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
183 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
185 #include "dsputil_mmx_rnd_template.c"
193 /***********************************/
/* 3DNow! flavour: uses the native pavgusb averaging instruction */
196 #define DEF(x) x ## _3dnow
197 #define PAVGB "pavgusb"
200 #include "dsputil_mmx_avg_template.c"
206 /***********************************/
/* MMX2 flavour: uses the native pavgb averaging instruction */
209 #define DEF(x) x ## _mmx2
211 /* Introduced only in MMX2 set */
212 #define PAVGB "pavgb"
215 #include "dsputil_mmx_avg_template.c"
/* put and put_no_rnd are identical for plain copies, and the MMX copy
 * routines serve all flavours -- alias them instead of duplicating code. */
221 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
222 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
223 #define put_pixels16_mmx2 put_pixels16_mmx
224 #define put_pixels8_mmx2 put_pixels8_mmx
225 #define put_pixels4_mmx2 put_pixels4_mmx
226 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
227 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
228 #define put_pixels16_3dnow put_pixels16_mmx
229 #define put_pixels8_3dnow put_pixels8_mmx
230 #define put_pixels4_3dnow put_pixels4_mmx
231 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
232 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
234 /***********************************/
/* Convert an 8x8 block of 16-bit DCT coefficients to 8-bit pixels,
 * clamping each value to 0..255 (packuswb saturates signed words to
 * unsigned bytes), writing 4 rows per asm statement.
 * NOTE(review): the function body's opening brace, the local declarations
 * (pix, p) and the "__asm__ volatile (" openers are not visible in this
 * excerpt; the two asm statements below handle rows 0-3 and rows 4-7. */
237 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
242 /* read the pixels */
/* rows 0-3: %3 is the block memory operand, offsets are in bytes (8 = one row of 4 words) */
247 "movq %3, %%mm0 \n\t"
248 "movq 8%3, %%mm1 \n\t"
249 "movq 16%3, %%mm2 \n\t"
250 "movq 24%3, %%mm3 \n\t"
251 "movq 32%3, %%mm4 \n\t"
252 "movq 40%3, %%mm5 \n\t"
253 "movq 48%3, %%mm6 \n\t"
254 "movq 56%3, %%mm7 \n\t"
/* saturate 16-bit words down to unsigned bytes (the clamp) */
255 "packuswb %%mm1, %%mm0 \n\t"
256 "packuswb %%mm3, %%mm2 \n\t"
257 "packuswb %%mm5, %%mm4 \n\t"
258 "packuswb %%mm7, %%mm6 \n\t"
/* store 4 rows: base, +stride, +2*stride, +3*stride (%2 holds line_size*3) */
259 "movq %%mm0, (%0) \n\t"
260 "movq %%mm2, (%0, %1) \n\t"
261 "movq %%mm4, (%0, %1, 2) \n\t"
262 "movq %%mm6, (%0, %2) \n\t"
263 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
268 // if here would be an exact copy of the code above
269 // compiler would generate some very strange code
/* rows 4-7: same pattern, but %3 is a register pointer, so explicit (reg) addressing */
272 "movq (%3), %%mm0 \n\t"
273 "movq 8(%3), %%mm1 \n\t"
274 "movq 16(%3), %%mm2 \n\t"
275 "movq 24(%3), %%mm3 \n\t"
276 "movq 32(%3), %%mm4 \n\t"
277 "movq 40(%3), %%mm5 \n\t"
278 "movq 48(%3), %%mm6 \n\t"
279 "movq 56(%3), %%mm7 \n\t"
280 "packuswb %%mm1, %%mm0 \n\t"
281 "packuswb %%mm3, %%mm2 \n\t"
282 "packuswb %%mm5, %%mm4 \n\t"
283 "packuswb %%mm7, %%mm6 \n\t"
284 "movq %%mm0, (%0) \n\t"
285 "movq %%mm2, (%0, %1) \n\t"
286 "movq %%mm4, (%0, %1, 2) \n\t"
287 "movq %%mm6, (%0, %2) \n\t"
288 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/* Store one half (4 rows) of a signed-clamped 8x8 block:
 * packsswb saturates the 16-bit coefficients to signed bytes (-128..127),
 * then paddb with mm0 (= ff_pb_80) biases them into the unsigned 0..255
 * range.  %0 = dst, %1 = 3*line_skip, %2 = block, %3 = line_skip. */
292 #define put_signed_pixels_clamped_mmx_half(off) \
293 "movq "#off"(%2), %%mm1 \n\t"\
294 "movq 16+"#off"(%2), %%mm2 \n\t"\
295 "movq 32+"#off"(%2), %%mm3 \n\t"\
296 "movq 48+"#off"(%2), %%mm4 \n\t"\
297 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
298 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
299 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
300 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
301 "paddb %%mm0, %%mm1 \n\t"\
302 "paddb %%mm0, %%mm2 \n\t"\
303 "paddb %%mm0, %%mm3 \n\t"\
304 "paddb %%mm0, %%mm4 \n\t"\
305 "movq %%mm1, (%0) \n\t"\
306 "movq %%mm2, (%0, %3) \n\t"\
307 "movq %%mm3, (%0, %3, 2) \n\t"\
308 "movq %%mm4, (%0, %1) \n\t"
/* 8x8 signed-clamped store: rows 0-3 from block[0..63], then advance dst by
 * 4 strides and do rows 4-7 from block[64..127] (offset 64 bytes).
 * NOTE(review): the declaration of line_skip3 (output %1) and the
 * "__asm__ volatile (" opener are not visible in this excerpt -- confirm. */
310 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
312 x86_reg line_skip = line_size;
316 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
317 "lea (%3, %3, 2), %1 \n\t"
318 put_signed_pixels_clamped_mmx_half(0)
319 "lea (%0, %3, 4), %0 \n\t"
320 put_signed_pixels_clamped_mmx_half(64)
321 :"+&r" (pixels), "=&r" (line_skip3)
322 :"r" (block), "r"(line_skip)
/* Add a block of 16-bit DCT coefficients to existing 8-bit pixels with
 * saturation: pixels are zero-extended to words (punpck with mm7, which is
 * presumably zeroed in code not visible here -- confirm), added with signed
 * saturation (paddsw), then packed back with unsigned saturation (packuswb).
 * NOTE(review): the function opener, locals (pix, i) and the loop handling
 * 2 rows per iteration are not visible in this excerpt. */
326 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
332 /* read the pixels */
/* mm0..mm3 = 16 coefficients; %0/%1 = two rows of 8 destination pixels */
339 "movq (%2), %%mm0 \n\t"
340 "movq 8(%2), %%mm1 \n\t"
341 "movq 16(%2), %%mm2 \n\t"
342 "movq 24(%2), %%mm3 \n\t"
343 "movq %0, %%mm4 \n\t"
344 "movq %1, %%mm6 \n\t"
/* widen row 1 bytes to words and add with saturation */
345 "movq %%mm4, %%mm5 \n\t"
346 "punpcklbw %%mm7, %%mm4 \n\t"
347 "punpckhbw %%mm7, %%mm5 \n\t"
348 "paddsw %%mm4, %%mm0 \n\t"
349 "paddsw %%mm5, %%mm1 \n\t"
/* widen row 2 bytes to words and add with saturation */
350 "movq %%mm6, %%mm5 \n\t"
351 "punpcklbw %%mm7, %%mm6 \n\t"
352 "punpckhbw %%mm7, %%mm5 \n\t"
353 "paddsw %%mm6, %%mm2 \n\t"
354 "paddsw %%mm5, %%mm3 \n\t"
/* clamp back to 0..255 and store both rows */
355 "packuswb %%mm1, %%mm0 \n\t"
356 "packuswb %%mm3, %%mm2 \n\t"
357 "movq %%mm0, %0 \n\t"
358 "movq %%mm2, %1 \n\t"
359 :"+m"(*pix), "+m"(*(pix+line_size))
/* Copy a 4-wide block, 4 rows per loop iteration (2 rows per movd pair).
 * REG_a caches 2*line_size for the row advance.
 * NOTE(review): the asm opener, the loop label and the decrement/branch on h
 * are not visible in this excerpt -- the "+g"(h) operand implies h counts
 * the remaining rows. */
367 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
370 "lea (%3, %3), %%"REG_a" \n\t"
373 "movd (%1), %%mm0 \n\t"
374 "movd (%1, %3), %%mm1 \n\t"
375 "movd %%mm0, (%2) \n\t"
376 "movd %%mm1, (%2, %3) \n\t"
377 "add %%"REG_a", %1 \n\t"
378 "add %%"REG_a", %2 \n\t"
379 "movd (%1), %%mm0 \n\t"
380 "movd (%1, %3), %%mm1 \n\t"
381 "movd %%mm0, (%2) \n\t"
382 "movd %%mm1, (%2, %3) \n\t"
383 "add %%"REG_a", %1 \n\t"
384 "add %%"REG_a", %2 \n\t"
387 : "+g"(h), "+r" (pixels), "+r" (block)
388 : "r"((x86_reg)line_size)
/* Copy an 8-wide block, 4 rows per loop iteration; same structure as
 * put_pixels4_mmx but with 8-byte movq transfers.
 * NOTE(review): asm opener, loop label and branch are not visible here. */
393 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
396 "lea (%3, %3), %%"REG_a" \n\t"
399 "movq (%1), %%mm0 \n\t"
400 "movq (%1, %3), %%mm1 \n\t"
401 "movq %%mm0, (%2) \n\t"
402 "movq %%mm1, (%2, %3) \n\t"
403 "add %%"REG_a", %1 \n\t"
404 "add %%"REG_a", %2 \n\t"
405 "movq (%1), %%mm0 \n\t"
406 "movq (%1, %3), %%mm1 \n\t"
407 "movq %%mm0, (%2) \n\t"
408 "movq %%mm1, (%2, %3) \n\t"
409 "add %%"REG_a", %1 \n\t"
410 "add %%"REG_a", %2 \n\t"
413 : "+g"(h), "+r" (pixels), "+r" (block)
414 : "r"((x86_reg)line_size)
/* Copy a 16-wide block with MMX: two movq (8+8 bytes) per row, 4 rows per
 * loop iteration.  NOTE(review): asm opener, loop label and branch are not
 * visible in this excerpt. */
419 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
422 "lea (%3, %3), %%"REG_a" \n\t"
425 "movq (%1), %%mm0 \n\t"
426 "movq 8(%1), %%mm4 \n\t"
427 "movq (%1, %3), %%mm1 \n\t"
428 "movq 8(%1, %3), %%mm5 \n\t"
429 "movq %%mm0, (%2) \n\t"
430 "movq %%mm4, 8(%2) \n\t"
431 "movq %%mm1, (%2, %3) \n\t"
432 "movq %%mm5, 8(%2, %3) \n\t"
433 "add %%"REG_a", %1 \n\t"
434 "add %%"REG_a", %2 \n\t"
435 "movq (%1), %%mm0 \n\t"
436 "movq 8(%1), %%mm4 \n\t"
437 "movq (%1, %3), %%mm1 \n\t"
438 "movq 8(%1, %3), %%mm5 \n\t"
439 "movq %%mm0, (%2) \n\t"
440 "movq %%mm4, 8(%2) \n\t"
441 "movq %%mm1, (%2, %3) \n\t"
442 "movq %%mm5, 8(%2, %3) \n\t"
443 "add %%"REG_a", %1 \n\t"
444 "add %%"REG_a", %2 \n\t"
447 : "+g"(h), "+r" (pixels), "+r" (block)
448 : "r"((x86_reg)line_size)
/* Copy a 16-wide block with SSE2: 4 rows per iteration using unaligned
 * loads (movdqu -- source may be misaligned) and aligned stores (movdqa --
 * destination is assumed 16-byte aligned).  %4 holds 3*line_size. */
453 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
457 "movdqu (%1), %%xmm0 \n\t"
458 "movdqu (%1,%3), %%xmm1 \n\t"
459 "movdqu (%1,%3,2), %%xmm2 \n\t"
460 "movdqu (%1,%4), %%xmm3 \n\t"
461 "lea (%1,%3,4), %1 \n\t"
462 "movdqa %%xmm0, (%2) \n\t"
463 "movdqa %%xmm1, (%2,%3) \n\t"
464 "movdqa %%xmm2, (%2,%3,2) \n\t"
465 "movdqa %%xmm3, (%2,%4) \n\t"
467 "lea (%2,%3,4), %2 \n\t"
469 : "+g"(h), "+r" (pixels), "+r" (block)
470 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* Average a 16-wide block into the destination with SSE2: load 4 source
 * rows unaligned, pavgb with the existing destination rows (rounded byte
 * average), store back aligned.  %4 holds 3*line_size. */
475 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
479 "movdqu (%1), %%xmm0 \n\t"
480 "movdqu (%1,%3), %%xmm1 \n\t"
481 "movdqu (%1,%3,2), %%xmm2 \n\t"
482 "movdqu (%1,%4), %%xmm3 \n\t"
483 "lea (%1,%3,4), %1 \n\t"
484 "pavgb (%2), %%xmm0 \n\t"
485 "pavgb (%2,%3), %%xmm1 \n\t"
486 "pavgb (%2,%3,2), %%xmm2 \n\t"
487 "pavgb (%2,%4), %%xmm3 \n\t"
488 "movdqa %%xmm0, (%2) \n\t"
489 "movdqa %%xmm1, (%2,%3) \n\t"
490 "movdqa %%xmm2, (%2,%3,2) \n\t"
491 "movdqa %%xmm3, (%2,%4) \n\t"
493 "lea (%2,%3,4), %2 \n\t"
495 : "+g"(h), "+r" (pixels), "+r" (block)
496 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* Generate a function that zeroes n consecutive 64-coefficient DCT blocks
 * (128 bytes each) with MMX, 32 bytes per loop iteration.  The pointer is
 * biased by +128*n and REG_a starts negative (from %1, not visible here)
 * and counts up to zero -- NOTE(review): the loop label, the branch and the
 * second input operand are not visible in this excerpt. */
501 #define CLEAR_BLOCKS(name,n) \
502 static void name(DCTELEM *blocks)\
505 "pxor %%mm7, %%mm7 \n\t"\
506 "mov %1, %%"REG_a" \n\t"\
508 "movq %%mm7, (%0, %%"REG_a") \n\t"\
509 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
510 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
511 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
512 "add $32, %%"REG_a" \n\t"\
514 : : "r" (((uint8_t *)blocks)+128*n),\
/* clear_blocks_mmx zeroes 6 blocks (one macroblock), clear_block_mmx one */
519 CLEAR_BLOCKS(clear_blocks_mmx, 6)
520 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 128-byte DCT block with SSE: eight 16-byte aligned stores.
 * movaps requires the block to be 16-byte aligned. */
522 static void clear_block_sse(DCTELEM *block)
525 "xorps %%xmm0, %%xmm0 \n"
526 "movaps %%xmm0, (%0) \n"
527 "movaps %%xmm0, 16(%0) \n"
528 "movaps %%xmm0, 32(%0) \n"
529 "movaps %%xmm0, 48(%0) \n"
530 "movaps %%xmm0, 64(%0) \n"
531 "movaps %%xmm0, 80(%0) \n"
532 "movaps %%xmm0, 96(%0) \n"
533 "movaps %%xmm0, 112(%0) \n"
/* Zero 6 consecutive 128-byte DCT blocks with SSE, 128 bytes per loop
 * iteration.  Same negative-index loop structure as CLEAR_BLOCKS;
 * NOTE(review): loop label, branch and second operand not visible here. */
539 static void clear_blocks_sse(DCTELEM *blocks)
542 "xorps %%xmm0, %%xmm0 \n"
543 "mov %1, %%"REG_a" \n"
545 "movaps %%xmm0, (%0, %%"REG_a") \n"
546 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
550 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
551 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
552 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
553 "add $128, %%"REG_a" \n"
555 : : "r" (((uint8_t *)blocks)+128*6),
/* dst[i] += src[i] bytewise (wrapping paddb, no saturation), 16 bytes per
 * MMX iteration; the scalar tail loop (visible at line 582) handles the
 * remaining w%16 bytes.  NOTE(review): the asm opener, loop counter setup
 * and the tail loop header are not visible in this excerpt. */
561 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
566 "movq (%1, %0), %%mm0 \n\t"
567 "movq (%2, %0), %%mm1 \n\t"
568 "paddb %%mm0, %%mm1 \n\t"
569 "movq %%mm1, (%2, %0) \n\t"
570 "movq 8(%1, %0), %%mm0 \n\t"
571 "movq 8(%2, %0), %%mm1 \n\t"
572 "paddb %%mm0, %%mm1 \n\t"
573 "movq %%mm1, 8(%2, %0) \n\t"
579 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
/* scalar tail for the last few bytes */
582 dst[i+0] += src[i+0];
/* HuffYUV median prediction using cmov (x86 scalar, not MMX):
 * dst[i] = diff[i] + median(left, top[i], left + top[i] - topleft).
 * l/tl carry the running left and top-left values between calls; only the
 * low byte matters (masked with 0xff).  The arrays are addressed as
 * base+w with a negative index %4 counting up to 0.
 * NOTE(review): most of the asm body (the cmp/cmov ladder computing the
 * median) is not visible in this excerpt -- do not restructure without the
 * full file. */
586 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
589 int l = *left & 0xff;
590 int tl = *left_top & 0xff;
595 "movzbl (%3,%4), %2 \n"
608 "add (%6,%4), %b0 \n"
609 "mov %b0, (%5,%4) \n"
612 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
613 :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/* H.263 deblocking filter core, operating on four 8-pixel rows passed as
 * memory operands %0..%3 (two rows above and two below the block edge) with
 * the filter strength in %4 and the 0xFC mask in %5.  Computes the filter
 * delta d = clip((p0-p1)*4 + (p2-p3)) per the H.263 deblocking formula,
 * limits it by strength, and produces the corrected rows in mm3 (row %1),
 * mm4 (row %2), mm5 (row %0) and mm6 (row %3) for the caller to store.
 * Comments are kept outside the macro so the backslash continuations stay
 * intact.  NOTE(review): verify register-to-row mapping against the callers
 * ("// 5 3 4 6" at line 723) before changing any instruction order. */
620 #define H263_LOOP_FILTER \
621 "pxor %%mm7, %%mm7 \n\t"\
622 "movq %0, %%mm0 \n\t"\
623 "movq %0, %%mm1 \n\t"\
624 "movq %3, %%mm2 \n\t"\
625 "movq %3, %%mm3 \n\t"\
626 "punpcklbw %%mm7, %%mm0 \n\t"\
627 "punpckhbw %%mm7, %%mm1 \n\t"\
628 "punpcklbw %%mm7, %%mm2 \n\t"\
629 "punpckhbw %%mm7, %%mm3 \n\t"\
630 "psubw %%mm2, %%mm0 \n\t"\
631 "psubw %%mm3, %%mm1 \n\t"\
632 "movq %1, %%mm2 \n\t"\
633 "movq %1, %%mm3 \n\t"\
634 "movq %2, %%mm4 \n\t"\
635 "movq %2, %%mm5 \n\t"\
636 "punpcklbw %%mm7, %%mm2 \n\t"\
637 "punpckhbw %%mm7, %%mm3 \n\t"\
638 "punpcklbw %%mm7, %%mm4 \n\t"\
639 "punpckhbw %%mm7, %%mm5 \n\t"\
640 "psubw %%mm2, %%mm4 \n\t"\
641 "psubw %%mm3, %%mm5 \n\t"\
642 "psllw $2, %%mm4 \n\t"\
643 "psllw $2, %%mm5 \n\t"\
644 "paddw %%mm0, %%mm4 \n\t"\
645 "paddw %%mm1, %%mm5 \n\t"\
646 "pxor %%mm6, %%mm6 \n\t"\
647 "pcmpgtw %%mm4, %%mm6 \n\t"\
648 "pcmpgtw %%mm5, %%mm7 \n\t"\
649 "pxor %%mm6, %%mm4 \n\t"\
650 "pxor %%mm7, %%mm5 \n\t"\
651 "psubw %%mm6, %%mm4 \n\t"\
652 "psubw %%mm7, %%mm5 \n\t"\
653 "psrlw $3, %%mm4 \n\t"\
654 "psrlw $3, %%mm5 \n\t"\
655 "packuswb %%mm5, %%mm4 \n\t"\
656 "packsswb %%mm7, %%mm6 \n\t"\
657 "pxor %%mm7, %%mm7 \n\t"\
658 "movd %4, %%mm2 \n\t"\
659 "punpcklbw %%mm2, %%mm2 \n\t"\
660 "punpcklbw %%mm2, %%mm2 \n\t"\
661 "punpcklbw %%mm2, %%mm2 \n\t"\
662 "psubusb %%mm4, %%mm2 \n\t"\
663 "movq %%mm2, %%mm3 \n\t"\
664 "psubusb %%mm4, %%mm3 \n\t"\
665 "psubb %%mm3, %%mm2 \n\t"\
666 "movq %1, %%mm3 \n\t"\
667 "movq %2, %%mm4 \n\t"\
668 "pxor %%mm6, %%mm3 \n\t"\
669 "pxor %%mm6, %%mm4 \n\t"\
670 "paddusb %%mm2, %%mm3 \n\t"\
671 "psubusb %%mm2, %%mm4 \n\t"\
672 "pxor %%mm6, %%mm3 \n\t"\
673 "pxor %%mm6, %%mm4 \n\t"\
674 "paddusb %%mm2, %%mm2 \n\t"\
675 "packsswb %%mm1, %%mm0 \n\t"\
676 "pcmpgtb %%mm0, %%mm7 \n\t"\
677 "pxor %%mm7, %%mm0 \n\t"\
678 "psubb %%mm7, %%mm0 \n\t"\
679 "movq %%mm0, %%mm1 \n\t"\
680 "psubusb %%mm2, %%mm0 \n\t"\
681 "psubb %%mm0, %%mm1 \n\t"\
682 "pand %5, %%mm1 \n\t"\
683 "psrlw $2, %%mm1 \n\t"\
684 "pxor %%mm7, %%mm1 \n\t"\
685 "psubb %%mm7, %%mm1 \n\t"\
686 "movq %0, %%mm5 \n\t"\
687 "movq %3, %%mm6 \n\t"\
688 "psubb %%mm1, %%mm5 \n\t"\
689 "paddb %%mm1, %%mm6 \n\t"
/* Vertical (horizontal-edge) H.263 loop filter: runs H263_LOOP_FILTER on
 * four rows straddling the block edge (src-2*stride .. src+1*stride) and
 * stores the filtered rows from mm3/mm4/mm5/mm6 back in place.
 * NOTE(review): the "__asm__ volatile (" opener and the H263_LOOP_FILTER
 * invocation line are not visible in this excerpt. */
691 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
692 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
693 const int strength= ff_h263_loop_filter_strength[qscale];
699 "movq %%mm3, %1 \n\t"
700 "movq %%mm4, %2 \n\t"
701 "movq %%mm5, %0 \n\t"
702 "movq %%mm6, %3 \n\t"
703 : "+m" (*(uint64_t*)(src - 2*stride)),
704 "+m" (*(uint64_t*)(src - 1*stride)),
705 "+m" (*(uint64_t*)(src + 0*stride)),
706 "+m" (*(uint64_t*)(src + 1*stride))
707 : "g" (2*strength), "m"(ff_pb_FC)
/* Horizontal (vertical-edge) H.263 loop filter: transposes the 8x4 column
 * region into a temp buffer, applies H263_LOOP_FILTER on the transposed
 * rows, then transposes the filtered result (mm3..mm6) back into the image
 * with punpck/movd stores, four pixels per row.
 * NOTE(review): the asm statement boundaries and parts of the operand lists
 * are not visible in this excerpt. */
712 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
713 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
714 const int strength= ff_h263_loop_filter_strength[qscale];
715 DECLARE_ALIGNED(8, uint64_t, temp)[4];
716 uint8_t *btemp= (uint8_t*)temp;
/* gather the 8x4 edge region column-wise into btemp */
720 transpose4x4(btemp , src , 8, stride);
721 transpose4x4(btemp+4, src + 4*stride, 8, stride);
723 H263_LOOP_FILTER // 5 3 4 6
729 : "g" (2*strength), "m"(ff_pb_FC)
/* transpose filtered rows (order 5,3,4,6 per the comment above) back */
733 "movq %%mm5, %%mm1 \n\t"
734 "movq %%mm4, %%mm0 \n\t"
735 "punpcklbw %%mm3, %%mm5 \n\t"
736 "punpcklbw %%mm6, %%mm4 \n\t"
737 "punpckhbw %%mm3, %%mm1 \n\t"
738 "punpckhbw %%mm6, %%mm0 \n\t"
739 "movq %%mm5, %%mm3 \n\t"
740 "movq %%mm1, %%mm6 \n\t"
741 "punpcklwd %%mm4, %%mm5 \n\t"
742 "punpcklwd %%mm0, %%mm1 \n\t"
743 "punpckhwd %%mm4, %%mm3 \n\t"
744 "punpckhwd %%mm0, %%mm6 \n\t"
/* scatter 4 bytes per row: rows 0-3 via %0, rows 4-7 via %1 (%2=stride, %3=3*stride) */
745 "movd %%mm5, (%0) \n\t"
746 "punpckhdq %%mm5, %%mm5 \n\t"
747 "movd %%mm5, (%0,%2) \n\t"
748 "movd %%mm3, (%0,%2,2) \n\t"
749 "punpckhdq %%mm3, %%mm3 \n\t"
750 "movd %%mm3, (%0,%3) \n\t"
751 "movd %%mm1, (%1) \n\t"
752 "punpckhdq %%mm1, %%mm1 \n\t"
753 "movd %%mm1, (%1,%2) \n\t"
754 "movd %%mm6, (%1,%2,2) \n\t"
755 "punpckhdq %%mm6, %%mm6 \n\t"
756 "movd %%mm6, (%1,%3) \n\t"
758 "r" (src + 4*stride),
759 "r" ((x86_reg) stride ),
760 "r" ((x86_reg)(3*stride))
765 /* draw the edges of width 'w' of an image of size width, height
766 this mmx version can only handle w==8 || w==16 */
/* Replicates border pixels outward: left/right columns first (broadcasting
 * the edge byte with punpck), then top/bottom rows (copying whole edge
 * rows, 4 at a time).  NOTE(review): the loop headers, the w==8 / w==16
 * branch selecting between the two left/right variants, and the asm
 * openers are not visible in this excerpt. */
767 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
769 uint8_t *ptr, *last_line;
772 last_line = buf + (height - 1) * wrap;
/* left/right, w==8 variant: broadcast first byte into the 8 bytes before
 * each row, and last byte into the 8 bytes after */
779 "movd (%0), %%mm0 \n\t"
780 "punpcklbw %%mm0, %%mm0 \n\t"
781 "punpcklwd %%mm0, %%mm0 \n\t"
782 "punpckldq %%mm0, %%mm0 \n\t"
783 "movq %%mm0, -8(%0) \n\t"
784 "movq -8(%0, %2), %%mm1 \n\t"
785 "punpckhbw %%mm1, %%mm1 \n\t"
786 "punpckhwd %%mm1, %%mm1 \n\t"
787 "punpckhdq %%mm1, %%mm1 \n\t"
788 "movq %%mm1, (%0, %2) \n\t"
793 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
/* left/right, w==16 variant: same broadcast, stored twice (16 bytes) */
800 "movd (%0), %%mm0 \n\t"
801 "punpcklbw %%mm0, %%mm0 \n\t"
802 "punpcklwd %%mm0, %%mm0 \n\t"
803 "punpckldq %%mm0, %%mm0 \n\t"
804 "movq %%mm0, -8(%0) \n\t"
805 "movq %%mm0, -16(%0) \n\t"
806 "movq -8(%0, %2), %%mm1 \n\t"
807 "punpckhbw %%mm1, %%mm1 \n\t"
808 "punpckhwd %%mm1, %%mm1 \n\t"
809 "punpckhdq %%mm1, %%mm1 \n\t"
810 "movq %%mm1, (%0, %2) \n\t"
811 "movq %%mm1, 8(%0, %2) \n\t"
816 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
820 /* top and bottom (and hopefully also the corners) */
821 if (sides&EDGE_TOP) {
822 for(i = 0; i < h; i += 4) {
823 ptr= buf - (i + 1) * wrap - w;
/* copy the first image row (at %1+%0) into 4 rows above it */
826 "movq (%1, %0), %%mm0 \n\t"
827 "movq %%mm0, (%0) \n\t"
828 "movq %%mm0, (%0, %2) \n\t"
829 "movq %%mm0, (%0, %2, 2) \n\t"
830 "movq %%mm0, (%0, %3) \n\t"
835 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
840 if (sides&EDGE_BOTTOM) {
841 for(i = 0; i < h; i += 4) {
842 ptr= last_line + (i + 1) * wrap - w;
/* copy the last image row into 4 rows below it */
845 "movq (%1, %0), %%mm0 \n\t"
846 "movq %%mm0, (%0) \n\t"
847 "movq %%mm0, (%0, %2) \n\t"
848 "movq %%mm0, (%0, %2, 2) \n\t"
849 "movq %%mm0, (%0, %3) \n\t"
854 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/* One output row of the MPEG-4 qpel vertical lowpass filter with taps
 * (1, -5, 20, 20, -5, 1)/32, computed here as
 * (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 where x1..x4 are the symmetric
 * pair sums; the result is packed to bytes and written via OP.
 * m3..m6 are caller-held row registers; mm4/mm5/mm6 are used as scratch.
 * Comments stay outside the macro to preserve the backslash continuations;
 * the per-line trailing comments below document each step already. */
860 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
861 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
862 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
863 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
864 "movq "#in7", " #m3 " \n\t" /* d */\
865 "movq "#in0", %%mm5 \n\t" /* D */\
866 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
867 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
868 "movq "#in1", %%mm5 \n\t" /* C */\
869 "movq "#in2", %%mm6 \n\t" /* B */\
870 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
871 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
872 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
873 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
874 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
875 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
876 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
877 "psraw $5, %%mm5 \n\t"\
878 "packuswb %%mm5, %%mm5 \n\t"\
879 OP(%%mm5, out, %%mm7, d)
881 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
882 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
886 "pxor %%mm7, %%mm7 \n\t"\
888 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
889 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
890 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
891 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
892 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
893 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
894 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
895 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
896 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
897 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
898 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
899 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
900 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
901 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
902 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
903 "paddw %%mm3, %%mm5 \n\t" /* b */\
904 "paddw %%mm2, %%mm6 \n\t" /* c */\
905 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
906 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
907 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
908 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
909 "paddw %%mm4, %%mm0 \n\t" /* a */\
910 "paddw %%mm1, %%mm5 \n\t" /* d */\
911 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
912 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
913 "paddw %6, %%mm6 \n\t"\
914 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
915 "psraw $5, %%mm0 \n\t"\
916 "movq %%mm0, %5 \n\t"\
917 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
919 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
920 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
921 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
922 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
923 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
924 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
925 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
926 "paddw %%mm0, %%mm2 \n\t" /* b */\
927 "paddw %%mm5, %%mm3 \n\t" /* c */\
928 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
929 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
930 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
931 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
932 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
933 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
934 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
935 "paddw %%mm2, %%mm1 \n\t" /* a */\
936 "paddw %%mm6, %%mm4 \n\t" /* d */\
937 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
938 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
939 "paddw %6, %%mm1 \n\t"\
940 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
941 "psraw $5, %%mm3 \n\t"\
942 "movq %5, %%mm1 \n\t"\
943 "packuswb %%mm3, %%mm1 \n\t"\
944 OP_MMX2(%%mm1, (%1),%%mm4, q)\
945 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
947 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
948 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
949 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
950 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
951 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
952 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
953 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
954 "paddw %%mm1, %%mm5 \n\t" /* b */\
955 "paddw %%mm4, %%mm0 \n\t" /* c */\
956 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
957 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
958 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
959 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
960 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
961 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
962 "paddw %%mm3, %%mm2 \n\t" /* d */\
963 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
964 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
965 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
966 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
967 "paddw %%mm2, %%mm6 \n\t" /* a */\
968 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
969 "paddw %6, %%mm0 \n\t"\
970 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
971 "psraw $5, %%mm0 \n\t"\
972 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
974 "paddw %%mm5, %%mm3 \n\t" /* a */\
975 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
976 "paddw %%mm4, %%mm6 \n\t" /* b */\
977 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
978 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
979 "paddw %%mm1, %%mm4 \n\t" /* c */\
980 "paddw %%mm2, %%mm5 \n\t" /* d */\
981 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
982 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
983 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
984 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
985 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
986 "paddw %6, %%mm4 \n\t"\
987 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
988 "psraw $5, %%mm4 \n\t"\
989 "packuswb %%mm4, %%mm0 \n\t"\
990 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
996 : "+a"(src), "+c"(dst), "+D"(h)\
997 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1002 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1005 /* quick HACK, XXX FIXME MUST be optimized */\
1008 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1009 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1010 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1011 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1012 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1013 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1014 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1015 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1016 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1017 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1018 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1019 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1020 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1021 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1022 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1023 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1025 "movq (%0), %%mm0 \n\t"\
1026 "movq 8(%0), %%mm1 \n\t"\
1027 "paddw %2, %%mm0 \n\t"\
1028 "paddw %2, %%mm1 \n\t"\
1029 "psraw $5, %%mm0 \n\t"\
1030 "psraw $5, %%mm1 \n\t"\
1031 "packuswb %%mm1, %%mm0 \n\t"\
1032 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1033 "movq 16(%0), %%mm0 \n\t"\
1034 "movq 24(%0), %%mm1 \n\t"\
1035 "paddw %2, %%mm0 \n\t"\
1036 "paddw %2, %%mm1 \n\t"\
1037 "psraw $5, %%mm0 \n\t"\
1038 "psraw $5, %%mm1 \n\t"\
1039 "packuswb %%mm1, %%mm0 \n\t"\
1040 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1041 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1049 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1051 "pxor %%mm7, %%mm7 \n\t"\
1053 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1054 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1055 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1056 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1057 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1058 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1059 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1060 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1061 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1062 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1063 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1064 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1065 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1066 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1067 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1068 "paddw %%mm3, %%mm5 \n\t" /* b */\
1069 "paddw %%mm2, %%mm6 \n\t" /* c */\
1070 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1071 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1072 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1073 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1074 "paddw %%mm4, %%mm0 \n\t" /* a */\
1075 "paddw %%mm1, %%mm5 \n\t" /* d */\
1076 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1077 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1078 "paddw %5, %%mm6 \n\t"\
1079 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1080 "psraw $5, %%mm0 \n\t"\
1081 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1083 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1084 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1085 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1086 "paddw %%mm5, %%mm1 \n\t" /* a */\
1087 "paddw %%mm6, %%mm2 \n\t" /* b */\
1088 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1089 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1090 "paddw %%mm6, %%mm3 \n\t" /* c */\
1091 "paddw %%mm5, %%mm4 \n\t" /* d */\
1092 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1093 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1094 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1095 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1096 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1097 "paddw %5, %%mm1 \n\t"\
1098 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1099 "psraw $5, %%mm3 \n\t"\
1100 "packuswb %%mm3, %%mm0 \n\t"\
1101 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1107 : "+a"(src), "+c"(dst), "+d"(h)\
1108 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1113 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1116 /* quick HACK, XXX FIXME MUST be optimized */\
1119 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1120 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1121 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1122 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1123 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1124 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1125 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1126 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1128 "movq (%0), %%mm0 \n\t"\
1129 "movq 8(%0), %%mm1 \n\t"\
1130 "paddw %2, %%mm0 \n\t"\
1131 "paddw %2, %%mm1 \n\t"\
1132 "psraw $5, %%mm0 \n\t"\
1133 "psraw $5, %%mm1 \n\t"\
1134 "packuswb %%mm1, %%mm0 \n\t"\
1135 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1136 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1144 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1146 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1147 uint64_t temp[17*4];\
1148 uint64_t *temp_ptr= temp;\
1153 "pxor %%mm7, %%mm7 \n\t"\
1155 "movq (%0), %%mm0 \n\t"\
1156 "movq (%0), %%mm1 \n\t"\
1157 "movq 8(%0), %%mm2 \n\t"\
1158 "movq 8(%0), %%mm3 \n\t"\
1159 "punpcklbw %%mm7, %%mm0 \n\t"\
1160 "punpckhbw %%mm7, %%mm1 \n\t"\
1161 "punpcklbw %%mm7, %%mm2 \n\t"\
1162 "punpckhbw %%mm7, %%mm3 \n\t"\
1163 "movq %%mm0, (%1) \n\t"\
1164 "movq %%mm1, 17*8(%1) \n\t"\
1165 "movq %%mm2, 2*17*8(%1) \n\t"\
1166 "movq %%mm3, 3*17*8(%1) \n\t"\
1171 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1172 : "r" ((x86_reg)srcStride)\
1179 /*FIXME reorder for speed */\
1181 /*"pxor %%mm7, %%mm7 \n\t"*/\
1183 "movq (%0), %%mm0 \n\t"\
1184 "movq 8(%0), %%mm1 \n\t"\
1185 "movq 16(%0), %%mm2 \n\t"\
1186 "movq 24(%0), %%mm3 \n\t"\
1187 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1188 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1190 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1192 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1194 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1195 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1197 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1198 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1200 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1201 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1203 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1204 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1206 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1208 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1210 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1211 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1213 "add $136, %0 \n\t"\
1218 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1219 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
1224 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1225 uint64_t temp[9*2];\
1226 uint64_t *temp_ptr= temp;\
1231 "pxor %%mm7, %%mm7 \n\t"\
1233 "movq (%0), %%mm0 \n\t"\
1234 "movq (%0), %%mm1 \n\t"\
1235 "punpcklbw %%mm7, %%mm0 \n\t"\
1236 "punpckhbw %%mm7, %%mm1 \n\t"\
1237 "movq %%mm0, (%1) \n\t"\
1238 "movq %%mm1, 9*8(%1) \n\t"\
1243 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1244 : "r" ((x86_reg)srcStride)\
1251 /*FIXME reorder for speed */\
1253 /*"pxor %%mm7, %%mm7 \n\t"*/\
1255 "movq (%0), %%mm0 \n\t"\
1256 "movq 8(%0), %%mm1 \n\t"\
1257 "movq 16(%0), %%mm2 \n\t"\
1258 "movq 24(%0), %%mm3 \n\t"\
1259 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1260 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1262 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1264 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1266 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1268 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1270 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1271 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1278 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1279 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
1284 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1285 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
1288 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1290 uint8_t * const half= (uint8_t*)temp;\
1291 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1292 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1295 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1296 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1299 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1301 uint8_t * const half= (uint8_t*)temp;\
1302 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1303 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
1306 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1308 uint8_t * const half= (uint8_t*)temp;\
1309 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1310 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1313 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1314 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1317 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1319 uint8_t * const half= (uint8_t*)temp;\
1320 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1321 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
1323 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1324 uint64_t half[8 + 9];\
1325 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1326 uint8_t * const halfHV= ((uint8_t*)half);\
1327 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1328 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1329 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1330 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1332 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1333 uint64_t half[8 + 9];\
1334 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1335 uint8_t * const halfHV= ((uint8_t*)half);\
1336 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1337 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1338 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1339 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1341 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1342 uint64_t half[8 + 9];\
1343 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1344 uint8_t * const halfHV= ((uint8_t*)half);\
1345 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1346 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1347 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1348 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1350 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1351 uint64_t half[8 + 9];\
1352 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1353 uint8_t * const halfHV= ((uint8_t*)half);\
1354 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1355 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1356 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1357 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1359 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1360 uint64_t half[8 + 9];\
1361 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1362 uint8_t * const halfHV= ((uint8_t*)half);\
1363 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1364 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1365 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1367 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1368 uint64_t half[8 + 9];\
1369 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1370 uint8_t * const halfHV= ((uint8_t*)half);\
1371 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1372 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1373 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1375 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1376 uint64_t half[8 + 9];\
1377 uint8_t * const halfH= ((uint8_t*)half);\
1378 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1379 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1380 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1382 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1383 uint64_t half[8 + 9];\
1384 uint8_t * const halfH= ((uint8_t*)half);\
1385 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1386 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1387 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1389 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1391 uint8_t * const halfH= ((uint8_t*)half);\
1392 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1393 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1395 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1396 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
1399 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1401 uint8_t * const half= (uint8_t*)temp;\
1402 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1403 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1406 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1407 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1410 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1412 uint8_t * const half= (uint8_t*)temp;\
1413 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1414 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
1417 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1419 uint8_t * const half= (uint8_t*)temp;\
1420 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1421 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1424 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1425 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1428 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1430 uint8_t * const half= (uint8_t*)temp;\
1431 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1432 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
1434 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1435 uint64_t half[16*2 + 17*2];\
1436 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1437 uint8_t * const halfHV= ((uint8_t*)half);\
1438 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1439 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1440 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1441 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1443 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1444 uint64_t half[16*2 + 17*2];\
1445 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1446 uint8_t * const halfHV= ((uint8_t*)half);\
1447 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1448 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1449 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1450 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1452 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1453 uint64_t half[16*2 + 17*2];\
1454 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1455 uint8_t * const halfHV= ((uint8_t*)half);\
1456 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1457 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1458 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1459 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1461 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1462 uint64_t half[16*2 + 17*2];\
1463 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1464 uint8_t * const halfHV= ((uint8_t*)half);\
1465 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1466 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1467 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1468 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1470 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1471 uint64_t half[16*2 + 17*2];\
1472 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1473 uint8_t * const halfHV= ((uint8_t*)half);\
1474 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1475 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1476 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1478 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1479 uint64_t half[16*2 + 17*2];\
1480 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1481 uint8_t * const halfHV= ((uint8_t*)half);\
1482 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1483 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1484 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1486 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1487 uint64_t half[17*2];\
1488 uint8_t * const halfH= ((uint8_t*)half);\
1489 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1490 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1491 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1493 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1494 uint64_t half[17*2];\
1495 uint8_t * const halfH= ((uint8_t*)half);\
1496 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1497 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1498 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1500 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1501 uint64_t half[17*2];\
1502 uint8_t * const halfH= ((uint8_t*)half);\
1503 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1504 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* PUT_OP: emit a plain store of result register 'a' to destination 'b' with a
 * move of width 'size' ("temp" is unused here; it is kept so all OP macros
 * share the same argument list). */
1507 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* AVG_3DNOW_OP: load the current destination 'b' into 'temp', average it with
 * the result register 'a' using the 3DNow! unsigned byte average (pavgusb),
 * then store the averaged value back to 'b'. */
1508 #define AVG_3DNOW_OP(a,b,temp, size) \
1509 "mov" #size " " #b ", " #temp " \n\t"\
1510 "pavgusb " #temp ", " #a " \n\t"\
1511 "mov" #size " " #a ", " #b " \n\t"
/* AVG_MMX2_OP: same as AVG_3DNOW_OP but uses the MMX2/SSE integer byte
 * average instruction (pavgb) instead of the 3DNow! pavgusb. */
1512 #define AVG_MMX2_OP(a,b,temp, size) \
1513 "mov" #size " " #b ", " #temp " \n\t"\
1514 "pavgb " #temp ", " #a " \n\t"\
1515 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the MPEG-4 qpel functions.  QPEL_BASE emits the shared lowpass
 * helpers; QPEL_OP emits the per-CPU-flavour (mmx2 / 3dnow) wrapper set.
 * The ROUNDER constant is ff_pw_16 for the rounding variants and ff_pw_15
 * for the "no_rnd" (no-rounding) variants; the OP macro selects between a
 * plain store (put) and an average with the destination (avg). */
1517 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1518 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1519 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1520 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1521 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1522 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1523 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1524 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1525 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1527 /***********************************/
1528 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
1530 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1531 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1532 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
1534 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1535 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1536 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
1539 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
1540 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1541 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1542 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
1543 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1544 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1545 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1546 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1547 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1548 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1549 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1550 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1552 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1553 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
1555 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1556 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1557 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1558 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1559 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1560 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1561 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1562 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the fast bilinear ("2tap") qpel functions for 16x16 and 8x8
 * blocks, in put and avg flavours, for MMX2 and 3DNow!.  These are the
 * non-spec-compliant fast-path functions mentioned above (-lavdopts fast). */
1564 QPEL_2TAP(put_, 16, mmx2)
1565 QPEL_2TAP(avg_, 16, mmx2)
1566 QPEL_2TAP(put_, 8, mmx2)
1567 QPEL_2TAP(avg_, 8, mmx2)
1568 QPEL_2TAP(put_, 16, 3dnow)
1569 QPEL_2TAP(avg_, 16, 3dnow)
1570 QPEL_2TAP(put_, 8, 3dnow)
1571 QPEL_2TAP(avg_, 8, 3dnow)
1575 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1576 x86_reg linesize, x86_reg start_y,
1577 x86_reg end_y, x86_reg block_h,
1578 x86_reg start_x, x86_reg end_x,
1580 extern emu_edge_core_func ff_emu_edge_core_mmx;
1581 extern emu_edge_core_func ff_emu_edge_core_sse;
1583 static av_always_inline
1584 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1585 int block_w, int block_h,
1586 int src_x, int src_y, int w, int h,
1587 emu_edge_core_func *core_fn)
1589 int start_y, start_x, end_y, end_x, src_y_add=0;
1592 src_y_add = h-1-src_y;
1594 }else if(src_y<=-block_h){
1595 src_y_add = 1-block_h-src_y;
1601 }else if(src_x<=-block_w){
1602 src+= (1-block_w-src_x);
1606 start_y= FFMAX(0, -src_y);
1607 start_x= FFMAX(0, -src_x);
1608 end_y= FFMIN(block_h, h-src_y);
1609 end_x= FFMIN(block_w, w-src_x);
1610 assert(start_x < end_x && block_w > 0);
1611 assert(start_y < end_y && block_h > 0);
1613 // fill in the to-be-copied part plus all above/below
1614 src += (src_y_add+start_y)*linesize + start_x;
1616 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
1621 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1622 int block_w, int block_h,
1623 int src_x, int src_y, int w, int h)
1625 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1626 w, h, &ff_emu_edge_core_mmx);
1630 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1631 int block_w, int block_h,
1632 int src_x, int src_y, int w, int h)
1634 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1635 w, h, &ff_emu_edge_core_sse);
1637 #endif /* HAVE_YASM */
1639 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1640 int linesize, int block_w, int block_h,
1641 int src_x, int src_y, int w, int h);
1643 static av_always_inline
1644 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1645 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1646 emulated_edge_mc_func *emu_edge_fn)
1649 const int ix = ox>>(16+shift);
1650 const int iy = oy>>(16+shift);
1651 const int oxs = ox>>4;
1652 const int oys = oy>>4;
1653 const int dxxs = dxx>>4;
1654 const int dxys = dxy>>4;
1655 const int dyxs = dyx>>4;
1656 const int dyys = dyy>>4;
1657 const uint16_t r4[4] = {r,r,r,r};
1658 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1659 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1660 const uint64_t shift2 = 2*shift;
1661 uint8_t edge_buf[(h+1)*stride];
1664 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1665 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1666 const int dxh = dxy*(h-1);
1667 const int dyw = dyx*(w-1);
1668 if( // non-constant fullpel offset (3% of blocks)
1669 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1670 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1671 // uses more than 16 bits of subpel mv (only at huge resolution)
1672 || (dxx|dxy|dyx|dyy)&15 )
1674 //FIXME could still use mmx for some of the rows
1675 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1679 src += ix + iy*stride;
1680 if( (unsigned)ix >= width-w ||
1681 (unsigned)iy >= height-h )
1683 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
1688 "movd %0, %%mm6 \n\t"
1689 "pxor %%mm7, %%mm7 \n\t"
1690 "punpcklwd %%mm6, %%mm6 \n\t"
1691 "punpcklwd %%mm6, %%mm6 \n\t"
1695 for(x=0; x<w; x+=4){
1696 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1697 oxs - dxys + dxxs*(x+1),
1698 oxs - dxys + dxxs*(x+2),
1699 oxs - dxys + dxxs*(x+3) };
1700 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1701 oys - dyys + dyxs*(x+1),
1702 oys - dyys + dyxs*(x+2),
1703 oys - dyys + dyxs*(x+3) };
1707 "movq %0, %%mm4 \n\t"
1708 "movq %1, %%mm5 \n\t"
1709 "paddw %2, %%mm4 \n\t"
1710 "paddw %3, %%mm5 \n\t"
1711 "movq %%mm4, %0 \n\t"
1712 "movq %%mm5, %1 \n\t"
1713 "psrlw $12, %%mm4 \n\t"
1714 "psrlw $12, %%mm5 \n\t"
1715 : "+m"(*dx4), "+m"(*dy4)
1716 : "m"(*dxy4), "m"(*dyy4)
1720 "movq %%mm6, %%mm2 \n\t"
1721 "movq %%mm6, %%mm1 \n\t"
1722 "psubw %%mm4, %%mm2 \n\t"
1723 "psubw %%mm5, %%mm1 \n\t"
1724 "movq %%mm2, %%mm0 \n\t"
1725 "movq %%mm4, %%mm3 \n\t"
1726 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1727 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1728 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1729 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1731 "movd %4, %%mm5 \n\t"
1732 "movd %3, %%mm4 \n\t"
1733 "punpcklbw %%mm7, %%mm5 \n\t"
1734 "punpcklbw %%mm7, %%mm4 \n\t"
1735 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1736 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1738 "movd %2, %%mm5 \n\t"
1739 "movd %1, %%mm4 \n\t"
1740 "punpcklbw %%mm7, %%mm5 \n\t"
1741 "punpcklbw %%mm7, %%mm4 \n\t"
1742 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1743 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1744 "paddw %5, %%mm1 \n\t"
1745 "paddw %%mm3, %%mm2 \n\t"
1746 "paddw %%mm1, %%mm0 \n\t"
1747 "paddw %%mm2, %%mm0 \n\t"
1749 "psrlw %6, %%mm0 \n\t"
1750 "packuswb %%mm0, %%mm0 \n\t"
1751 "movd %%mm0, %0 \n\t"
1753 : "=m"(dst[x+y*stride])
1754 : "m"(src[0]), "m"(src[1]),
1755 "m"(src[stride]), "m"(src[stride+1]),
1756 "m"(*r4), "m"(shift2)
1766 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1767 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1769 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1770 width, height, &emulated_edge_mc_mmx);
1773 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1774 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1776 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1777 width, height, &emulated_edge_mc_sse);
1780 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1781 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1783 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1784 width, height, &ff_emulated_edge_mc_8);
1788 #define PREFETCH(name, op) \
1789 static void name(void *mem, int stride, int h){\
1790 const uint8_t *p= mem;\
1792 __asm__ volatile(#op" %0" :: "m"(*p));\
1796 PREFETCH(prefetch_mmx2, prefetcht0)
1797 PREFETCH(prefetch_3dnow, prefetch)
1800 #include "h264_qpel_mmx.c"
/* Prototypes for the 8-bit H.264 chroma motion-compensation primitives.
 * Declarations only: the implementations live in external assembly sources.
 * "_rnd" variants use the spec rounding; width 8/4/2 selects the block size;
 * the suffix names the minimum instruction set (mmx / mmx2 / 3dnow / ssse3). */
1802 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1803 int stride, int h, int x, int y);
1804 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1805 int stride, int h, int x, int y);
1806 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1807 int stride, int h, int x, int y);
1809 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1810 int stride, int h, int x, int y);
1811 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1812 int stride, int h, int x, int y);
1813 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1814 int stride, int h, int x, int y);
1816 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1817 int stride, int h, int x, int y);
1818 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1819 int stride, int h, int x, int y);
1821 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1822 int stride, int h, int x, int y);
1823 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1824 int stride, int h, int x, int y);
1826 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1827 int stride, int h, int x, int y);
1828 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1829 int stride, int h, int x, int y);
/* CHROMA_MC: declare one high-bit-depth H.264 chroma MC prototype named
 * ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>, sharing the common
 * (dst, src, stride, h, x, y) signature used by the 8-bit variants above. */
1831 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1832 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1833 (uint8_t *dst, uint8_t *src,\
1834 int stride, int h, int x, int y);
/* Declare the 10-bit chroma MC variants: 2/4-pixel widths for mmxext,
 * 8-pixel width for sse2 and avx, each in put and avg flavours. */
1836 CHROMA_MC(put, 2, 10, mmxext)
1837 CHROMA_MC(avg, 2, 10, mmxext)
1838 CHROMA_MC(put, 4, 10, mmxext)
1839 CHROMA_MC(avg, 4, 10, mmxext)
1840 CHROMA_MC(put, 8, 10, sse2)
1841 CHROMA_MC(avg, 8, 10, sse2)
1842 CHROMA_MC(put, 8, 10, avx)
1843 CHROMA_MC(avg, 8, 10, avx)
1846 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1847 put_pixels8_mmx(dst, src, stride, 8);
1849 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1850 avg_pixels8_mmx(dst, src, stride, 8);
1852 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1853 put_pixels16_mmx(dst, src, stride, 16);
1855 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1856 avg_pixels16_mmx(dst, src, stride, 16);
1860 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1861 put_pixels8_mmx(dst, src, stride, 8);
1863 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1864 avg_pixels8_mmx2(dst, src, stride, 8);
1867 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* Wrappers that pair an IDCT kernel with the clamped put/add store step,
 * matching the DSPContext idct_put/idct_add callback signature:
 *   idct_put: IDCT the block, then clamp and WRITE the result to dest.
 *   idct_add: IDCT the block, then clamp and ADD the result onto dest.
 * One put/add pair per IDCT flavour: libmpeg2 MMX, libmpeg2 MMX2 (mmxext),
 * Xvid MMX and Xvid MMX2. */
1870 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1872     ff_mmx_idct (block);
1873     ff_put_pixels_clamped_mmx(block, dest, line_size);
1875 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1877     ff_mmx_idct (block);
1878     ff_add_pixels_clamped_mmx(block, dest, line_size);
1880 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1882     ff_mmxext_idct (block);
1883     ff_put_pixels_clamped_mmx(block, dest, line_size);
1885 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1887     ff_mmxext_idct (block);
1888     ff_add_pixels_clamped_mmx(block, dest, line_size);
1891 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1893     ff_idct_xvid_mmx (block);
1894     ff_put_pixels_clamped_mmx(block, dest, line_size);
1896 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1898     ff_idct_xvid_mmx (block);
1899     ff_add_pixels_clamped_mmx(block, dest, line_size);
1901 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
1903     ff_idct_xvid_mmx2 (block);
1904     ff_put_pixels_clamped_mmx(block, dest, line_size);
1906 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
1908     ff_idct_xvid_mmx2 (block);
1909     ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Vorbis inverse channel coupling, 3DNow! version: rewrites the
 * magnitude/angle channel pair in place, two floats per iteration.
 * The comments on the pfadd/pfsub lines give the scalar equivalent.
 * mm7 is kept as all-zero for the sign comparisons. */
1912 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
1915     __asm__ volatile("pxor %%mm7, %%mm7":);
1916     for(i=0; i<blocksize; i+=2) {
1918             "movq       %0, %%mm0 \n\t"
1919             "movq       %1, %%mm1 \n\t"
1920             "movq    %%mm0, %%mm2 \n\t"
1921             "movq    %%mm1, %%mm3 \n\t"
1922             "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
1923             "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
1924             "pslld     $31, %%mm2 \n\t" // keep only the sign bit
1925             "pxor    %%mm2, %%mm1 \n\t"
1926             "movq    %%mm3, %%mm4 \n\t"
1927             "pand    %%mm1, %%mm3 \n\t"
1928             "pandn   %%mm1, %%mm4 \n\t"
1929             "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1930             "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
1931             "movq    %%mm3, %1    \n\t"
1932             "movq    %%mm0, %0    \n\t"
1933             :"+m"(mag[i]), "+m"(ang[i])
    /* femms: leave the MMX/3DNow! state so later x87 FPU code is safe. */
1937     __asm__ volatile("femms");
/* Vorbis inverse channel coupling, SSE version: same transform as the
 * 3DNow! routine above but four floats per iteration.  xmm5 is preloaded
 * with the 0x80000000 sign-bit mask (ff_pdw_80000000). */
1939 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
1944         "movaps  %0, %%xmm5 \n\t"
1945         ::"m"(ff_pdw_80000000[0])
1947     for(i=0; i<blocksize; i+=4) {
1949             "movaps  %0,     %%xmm0 \n\t"
1950             "movaps  %1,     %%xmm1 \n\t"
1951             "xorps   %%xmm2, %%xmm2 \n\t"
1952             "xorps   %%xmm3, %%xmm3 \n\t"
1953             "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
1954             "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
1955             "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
1956             "xorps   %%xmm2, %%xmm1 \n\t"
1957             "movaps  %%xmm3, %%xmm4 \n\t"
1958             "andps   %%xmm1, %%xmm3 \n\t"
1959             "andnps  %%xmm1, %%xmm4 \n\t"
1960             "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1961             "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
1962             "movaps  %%xmm3, %1     \n\t"
1963             "movaps  %%xmm0, %0     \n\t"
1964             :"+m"(mag[i]), "+m"(ang[i])
/* AC-3 5-channel downmix kernel (SSE) used by ac3_downmix_sse below.
 * 'mono' / 'stereo' are pasted as either an asm line or nothing, selecting
 * the 5->1 or 5->2 output variant from one template.  Channel planes are
 * 0x400 bytes (256 floats) apart; xmm5/6/7 hold the broadcast matrix
 * coefficients taken from offsets 0, 8 and 24 of the matrix.
 * NOTE(review): assumes the layout of 'samples'/'matrix' set up by the
 * caller ac3_downmix_sse — do not reuse outside that context. */
1973 #define MIX5(mono,stereo)\
1975     "movss           0(%2), %%xmm5 \n"\
1976     "movss           8(%2), %%xmm6 \n"\
1977     "movss          24(%2), %%xmm7 \n"\
1978     "shufps     $0, %%xmm5, %%xmm5 \n"\
1979     "shufps     $0, %%xmm6, %%xmm6 \n"\
1980     "shufps     $0, %%xmm7, %%xmm7 \n"\
1982     "movaps       (%0,%1), %%xmm0 \n"\
1983     "movaps  0x400(%0,%1), %%xmm1 \n"\
1984     "movaps  0x800(%0,%1), %%xmm2 \n"\
1985     "movaps  0xc00(%0,%1), %%xmm3 \n"\
1986     "movaps 0x1000(%0,%1), %%xmm4 \n"\
1987     "mulps         %%xmm5, %%xmm0 \n"\
1988     "mulps         %%xmm6, %%xmm1 \n"\
1989     "mulps         %%xmm5, %%xmm2 \n"\
1990     "mulps         %%xmm7, %%xmm3 \n"\
1991     "mulps         %%xmm7, %%xmm4 \n"\
1992  stereo("addps         %%xmm1, %%xmm0 \n")\
1993     "addps         %%xmm1, %%xmm2 \n"\
1994     "addps         %%xmm3, %%xmm0 \n"\
1995     "addps         %%xmm4, %%xmm2 \n"\
1996    mono("addps         %%xmm2, %%xmm0 \n")\
1997     "movaps  %%xmm0,      (%0,%1) \n"\
1998  stereo("movaps  %%xmm2, 0x400(%0,%1) \n")\
2002     :"r"(samples[0]+len), "r"(matrix)\
2003     :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2004                   "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
/* Generic AC-3 downmix kernel (SSE) for channel/output-count combinations
 * not covered by MIX5: inner loop walks the input channel planes (1024
 * bytes apart) and accumulates each against the broadcast matrix rows in
 * matrix_simd.  'stereo' pastes the second-output-channel lines or nothing. */
2008 #define MIX_MISC(stereo)\
2011         "movaps  (%3,%0), %%xmm0     \n"\
2012  stereo("movaps   %%xmm0, %%xmm1     \n")\
2013         "mulps    %%xmm4, %%xmm0     \n"\
2014  stereo("mulps    %%xmm5, %%xmm1     \n")\
2015         "lea 1024(%3,%0), %1         \n"\
2018         "movaps     (%1), %%xmm2     \n"\
2019  stereo("movaps   %%xmm2, %%xmm3     \n")\
2020         "mulps   (%4,%2), %%xmm2     \n"\
2021  stereo("mulps 16(%4,%2), %%xmm3     \n")\
2022         "addps    %%xmm2, %%xmm0     \n"\
2023  stereo("addps    %%xmm3, %%xmm1     \n")\
2027         "movaps  %%xmm0, (%3,%0)     \n"\
2028  stereo("movaps  %%xmm1, 1024(%3,%0) \n")\
2031     :"+&r"(i), "=&r"(j), "=&r"(k)\
2032     :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
/* AC-3 downmix dispatcher: picks the fast MIX5 5->2 / 5->1 kernels when the
 * matrix has the standard symmetric shape (checked by bit-comparing the
 * coefficients through an int alias, matrix_cmp — exact bitwise equality,
 * not float compare), otherwise broadcasts each matrix row into the
 * 16-byte-aligned matrix_simd table and runs the generic MIX_MISC kernel. */
2036 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2038     int (*matrix_cmp)[2] = (int(*)[2])matrix;
2041     i = -len*sizeof(float);
2042     if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2044     } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2047         DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2048         j = 2*in_ch*sizeof(float);
    /* Broadcast each scalar coefficient pair to a 4-float lane pair. */
2052             "movss     (%2,%0), %%xmm4      \n"
2053             "movss    4(%2,%0), %%xmm5      \n"
2054             "shufps         $0, %%xmm4, %%xmm4 \n"
2055             "shufps         $0, %%xmm5, %%xmm5 \n"
2056             "movaps     %%xmm4,   (%1,%0,4) \n"
2057             "movaps     %%xmm5, 16(%1,%0,4) \n"
2060             :"r"(matrix_simd), "r"(matrix)
/* dst[i] = src0[i] * src1[i], elementwise.  3DNow! version processes 4
 * floats (two movq pairs) per iteration; the loop index counts byte
 * offsets downward from the end of the arrays.
 * NOTE(review): assumes len is a multiple of the unroll width and the
 * pointers are suitably aligned — standard contract for these DSP hooks. */
2071 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2072     x86_reg i = (len-4)*4;
2075         "movq    (%2,%0), %%mm0 \n\t"
2076         "movq   8(%2,%0), %%mm1 \n\t"
2077         "pfmul   (%3,%0), %%mm0 \n\t"
2078         "pfmul  8(%3,%0), %%mm1 \n\t"
2079         "movq   %%mm0,  (%1,%0) \n\t"
2080         "movq   %%mm1, 8(%1,%0) \n\t"
2085         :"r"(dst), "r"(src0), "r"(src1)
/* SSE version of the same elementwise multiply: 8 floats per iteration. */
2089 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2090     x86_reg i = (len-8)*4;
2093         "movaps    (%2,%0), %%xmm0 \n\t"
2094         "movaps  16(%2,%0), %%xmm1 \n\t"
2095         "mulps     (%3,%0), %%xmm0 \n\t"
2096         "mulps   16(%3,%0), %%xmm1 \n\t"
2097         "movaps  %%xmm0,   (%1,%0) \n\t"
2098         "movaps  %%xmm1, 16(%1,%0) \n\t"
2102         :"r"(dst), "r"(src0), "r"(src1)
/* dst[i] = src0[i] * src1[len-1-i]: multiply src0 by src1 read backwards.
 * 3DNow!Ext version — pswapd swaps the two floats in an mm register to
 * realize the reversal; src1 is advanced forward while the byte offset i
 * walks dst/src0 from the end. */
2107 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2108     x86_reg i = len*4-16;
2111         "pswapd   8(%1), %%mm0 \n\t"
2112         "pswapd    (%1), %%mm1 \n\t"
2113         "pfmul  (%3,%0), %%mm0 \n\t"
2114         "pfmul 8(%3,%0), %%mm1 \n\t"
2115         "movq  %%mm0,  (%2,%0) \n\t"
2116         "movq  %%mm1, 8(%2,%0) \n\t"
2120         :"+r"(i), "+r"(src1)
2121         :"r"(dst), "r"(src0)
    /* femms: leave the MMX/3DNow! state before returning. */
2123     __asm__ volatile("femms");
/* SSE version: shufps $0x1b reverses the 4 floats within each xmm. */
2125 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2126     x86_reg i = len*4-32;
2129         "movaps        16(%1), %%xmm0 \n\t"
2130         "movaps          (%1), %%xmm1 \n\t"
2131         "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2132         "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2133         "mulps        (%3,%0), %%xmm0 \n\t"
2134         "mulps      16(%3,%0), %%xmm1 \n\t"
2135         "movaps  %%xmm0,   (%2,%0)    \n\t"
2136         "movaps  %%xmm1, 16(%2,%0)    \n\t"
2140         :"+r"(i), "+r"(src1)
2141         :"r"(dst), "r"(src0)
/* dst[i] = src0[i]*src1[i] + src2[i] (fused multiply-accumulate over the
 * arrays).  3DNow! version, 4 floats per iteration. */
2145 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2146                                   const float *src2, int len){
2147     x86_reg i = (len-4)*4;
2150         "movq   (%2,%0), %%mm0 \n\t"
2151         "movq  8(%2,%0), %%mm1 \n\t"
2152         "pfmul  (%3,%0), %%mm0 \n\t"
2153         "pfmul 8(%3,%0), %%mm1 \n\t"
2154         "pfadd  (%4,%0), %%mm0 \n\t"
2155         "pfadd 8(%4,%0), %%mm1 \n\t"
2156         "movq  %%mm0,   (%1,%0) \n\t"
2157         "movq  %%mm1,  8(%1,%0) \n\t"
2161         :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
    /* femms: leave the MMX/3DNow! state before returning. */
2164     __asm__ volatile("femms");
/* SSE version of the same multiply-accumulate: 8 floats per iteration. */
2166 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2167                                 const float *src2, int len){
2168     x86_reg i = (len-8)*4;
2171         "movaps   (%2,%0), %%xmm0 \n\t"
2172         "movaps 16(%2,%0), %%xmm1 \n\t"
2173         "mulps    (%3,%0), %%xmm0 \n\t"
2174         "mulps  16(%3,%0), %%xmm1 \n\t"
2175         "addps    (%4,%0), %%xmm0 \n\t"
2176         "addps  16(%4,%0), %%xmm1 \n\t"
2177         "movaps %%xmm0,   (%1,%0) \n\t"
2178         "movaps %%xmm1, 16(%1,%0) \n\t"
2182         :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* Windowed overlap-add used by MDCT-based codecs:
 *   dst[i]       = src0[i]*win[len+i] - src1[j]*win[len+j] ... (and the
 *   dst[len+j]     mirrored combination; see the per-line scalar comments)
 * walking i forward and j backward simultaneously.  3DNow!Ext version,
 * pswapd provides the reversed-order loads.  Requires HAVE_6REGS (six
 * addressing registers) — see the #endif closing that guard below. */
2188 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2189                                       const float *win, int len){
2191     x86_reg j = len*4-8;
2194         "pswapd  (%5,%1), %%mm1 \n"
2195         "movq    (%5,%0), %%mm0 \n"
2196         "pswapd  (%4,%1), %%mm5 \n"
2197         "movq    (%3,%0), %%mm4 \n"
2198         "movq      %%mm0, %%mm2 \n"
2199         "movq      %%mm1, %%mm3 \n"
2200         "pfmul     %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2201         "pfmul     %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2202         "pfmul     %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2203         "pfmul     %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2204         "pfadd     %%mm3, %%mm2 \n"
2205         "pfsub     %%mm0, %%mm1 \n"
2206         "pswapd    %%mm2, %%mm2 \n"
2207         "movq      %%mm1, (%2,%0) \n"
2208         "movq      %%mm2, (%2,%1) \n"
2214         :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/* SSE version of the windowed overlap-add: 4 floats per direction per
 * iteration, shufps $0x1b giving the in-register reversal. */
2218 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2219                                    const float *win, int len){
2221     x86_reg j = len*4-16;
2224         "movaps       (%5,%1), %%xmm1 \n"
2225         "movaps       (%5,%0), %%xmm0 \n"
2226         "movaps       (%4,%1), %%xmm5 \n"
2227         "movaps       (%3,%0), %%xmm4 \n"
2228         "shufps $0x1b, %%xmm1, %%xmm1 \n"
2229         "shufps $0x1b, %%xmm5, %%xmm5 \n"
2230         "movaps        %%xmm0, %%xmm2 \n"
2231         "movaps        %%xmm1, %%xmm3 \n"
2232         "mulps         %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2233         "mulps         %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2234         "mulps         %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2235         "mulps         %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2236         "addps         %%xmm3, %%xmm2 \n"
2237         "subps         %%xmm0, %%xmm1 \n"
2238         "shufps $0x1b, %%xmm2, %%xmm2 \n"
2239         "movaps        %%xmm1, (%2,%0) \n"
2240         "movaps        %%xmm2, (%2,%1) \n"
2245         :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2248 #endif /* HAVE_6REGS */
/* Clamp every float in src to [min, max] and store into dst (SSE).
 * min/max are broadcast into xmm4/xmm5 once; the loop then processes
 * 16 floats per iteration with maxps/minps.
 * NOTE(review): assumes len is a multiple of 16 and 16-byte alignment,
 * per the movaps accesses — confirm against the DSPContext contract. */
2250 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2253     x86_reg i = (len-16)*4;
2255         "movss  %3, %%xmm4 \n"
2256         "movss  %4, %%xmm5 \n"
2257         "shufps $0, %%xmm4, %%xmm4 \n"
2258         "shufps $0, %%xmm5, %%xmm5 \n"
2260         "movaps    (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2261         "movaps  16(%2,%0), %%xmm1 \n\t"
2262         "movaps  32(%2,%0), %%xmm2 \n\t"
2263         "movaps  48(%2,%0), %%xmm3 \n\t"
2264         "maxps      %%xmm4, %%xmm0 \n\t"
2265         "maxps      %%xmm4, %%xmm1 \n\t"
2266         "maxps      %%xmm4, %%xmm2 \n\t"
2267         "maxps      %%xmm4, %%xmm3 \n\t"
2268         "minps      %%xmm5, %%xmm0 \n\t"
2269         "minps      %%xmm5, %%xmm1 \n\t"
2270         "minps      %%xmm5, %%xmm2 \n\t"
2271         "minps      %%xmm5, %%xmm3 \n\t"
2272         "movaps  %%xmm0,   (%1,%0) \n\t"
2273         "movaps  %%xmm1, 16(%1,%0) \n\t"
2274         "movaps  %%xmm2, 32(%1,%0) \n\t"
2275         "movaps  %%xmm3, 48(%1,%0) \n\t"
2279         :"r"(dst), "r"(src), "m"(min), "m"(max)
/* Prototypes for routines implemented in external (yasm) assembly files;
 * they are wired into the DSPContext by the dsputil_init_* helpers below. */
/* VP3/Theora IDCT and loop filters. */
2284 void ff_vp3_idct_mmx(int16_t *input_data);
2285 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2286 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2288 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2290 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2291 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2293 void ff_vp3_idct_sse2(int16_t *input_data);
2294 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2295 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
/* int16 scalar products (plain and multiply-add variants). */
2297 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2298 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2299 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2300 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2301 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
/* Windowing of int16 samples; the _ba variants are the bit-exact forms
 * selected under CODEC_FLAG_BITEXACT, _atom is tuned for Atom CPUs. */
2303 void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
2304                                       const int16_t *window, unsigned int len);
2305 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2306                                       const int16_t *window, unsigned int len);
2307 void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
2308                                       const int16_t *window, unsigned int len);
2309 void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
2310                                       const int16_t *window, unsigned int len);
2311 void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
2312                                       const int16_t *window, unsigned int len);
2313 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2314                                       const int16_t *window, unsigned int len);
/* 32-bit byte swapping of buffers. */
2316 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
2317 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
/* HuffYUV prediction helpers. */
2319 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2320 int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2321 int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2323 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
/* int32 clipping; the _int_sse2 variant is the Atom-friendly form. */
2325 void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t min,
2326                                    int32_t max, unsigned int len);
2327 void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src, int32_t min,
2328                                    int32_t max, unsigned int len);
2329 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
2330                                    int32_t max, unsigned int len);
2331 void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
2332                                    int32_t max, unsigned int len);
2334 extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
2335                                                 const float *src1, int len);
2336 extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
2337                                                 const float *src1, int len);
/* Fill all 16 quarter-pel MC entries (mc00..mc33) of one pixels_tab row.
 * PREFIX is "" for local static functions or "ff_" for external asm. */
2339 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX)                             \
2341     c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2342     c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2343     c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2344     c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2345     c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2346     c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2347     c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2348     c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2349     c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2350     c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2351     c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2352     c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2353     c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2354     c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2355     c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2356     c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Fill the 4 half-pel MC entries (plain, x2, y2, xy2) of one row. */
2359 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2361     c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
2362     c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
2363     c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
2364     c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU;\
/* Set one (x,y) qpel position for both 16x16 and 8x8 H.264 tables. */
2367 #define H264_QPEL_FUNCS(x, y, CPU) \
2369     c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU; \
2370     c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU; \
2371     c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU; \
2372     c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU; \
/* Same, but for the external 10-bit-depth asm implementations (ff_*_10_*). */
2375 #define H264_QPEL_FUNCS_10(x, y, CPU) \
2377     c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU; \
2378     c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU; \
2379     c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU; \
2380     c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU; \
/* Install baseline MMX implementations into the DSPContext.  Only 8-bit
 * content gets the pixel-level routines; clamped put/add are always set. */
2383 static void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2385     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2387     c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
2388     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2389     c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
2391     if (!high_bit_depth) {
2392         c->clear_block  = clear_block_mmx;
2393         c->clear_blocks = clear_blocks_mmx;
2394         c->draw_edges   = draw_edges_mmx;
    /* Half-pel tables: row 0 = 16x16 blocks, row 1 = 8x8 blocks. */
2396         SET_HPEL_FUNCS(put, 0, 16, mmx);
2397         SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2398         SET_HPEL_FUNCS(avg, 0, 16, mmx);
2399         SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2400         SET_HPEL_FUNCS(put, 1, 8, mmx);
2401         SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2402         SET_HPEL_FUNCS(avg, 1, 8, mmx);
2403         SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2406 #if ARCH_X86_32 || !HAVE_YASM
2409 #if ARCH_X86_32 && HAVE_YASM
2410     if (!high_bit_depth)
2411         c->emulated_edge_mc = emulated_edge_mc_mmx;
2414     c->add_bytes = add_bytes_mmx;
2416     if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2417         c->h263_v_loop_filter = h263_v_loop_filter_mmx;
2418         c->h263_h_loop_filter = h263_h_loop_filter_mmx;
2422     if (!high_bit_depth && CONFIG_H264CHROMA) {
2423         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_mmx_rnd;
2424         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
2427     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
/* Upgrade the DSPContext with MMX2 (mmxext) implementations; overrides
 * entries previously set by dsputil_init_mmx where a faster version exists. */
2432 static void dsputil_init_mmx2(DSPContext *c, AVCodecContext *avctx,
2435     const int bit_depth = avctx->bits_per_raw_sample;
2436     const int high_bit_depth = bit_depth > 8;
2438     c->prefetch = prefetch_mmx2;
2440     if (!high_bit_depth) {
2441         c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2442         c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2444         c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2445         c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2446         c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2448         c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2449         c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2451         c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2452         c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2453         c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
    /* The rounding-free variants below are not bit-exact, so skip them
     * when the caller demands bit-exact output. */
2456     if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
2457         if (!high_bit_depth) {
2458             c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2459             c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2460             c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2461             c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2463             c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2464             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2467         if (CONFIG_VP3_DECODER && HAVE_YASM) {
2468             c->vp3_v_loop_filter = ff_vp3_v_loop_filter_mmx2;
2469             c->vp3_h_loop_filter = ff_vp3_h_loop_filter_mmx2;
2472     if (CONFIG_VP3_DECODER && HAVE_YASM) {
2473         c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
    /* VP3/Theora needs the "exact" rounding variants regardless. */
2476     if (CONFIG_VP3_DECODER
2477         && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2478         c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2479         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2482     SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
2483     SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
2484     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2485     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
2486     SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
2487     SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );
2489     if (!high_bit_depth) {
2490         SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2491         SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
2492         SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
2493         SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2494         SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
2495         SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
2496     } else if (bit_depth == 10) {
2499         SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2500         SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2501         SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
2502         SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
2504         SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
2505         SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
2509     SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2510     SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
2511     SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2512     SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );
2515     if (!high_bit_depth && CONFIG_H264CHROMA) {
2516         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_mmx2_rnd;
2517         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmx2;
2518         c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmx2;
2519         c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmx2;
2521     if (bit_depth == 10 && CONFIG_H264CHROMA) {
2522         c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
2523         c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
2524         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
2525         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
2528     c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2530     c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2531     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
    /* Choose the bit-exact windowing variant under CODEC_FLAG_BITEXACT. */
2533     if (avctx->flags & CODEC_FLAG_BITEXACT) {
2534         c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2536         c->apply_window_int16 = ff_apply_window_int16_mmxext;
/* Install 3DNow! implementations (AMD CPUs); mirrors the MMX2 setup above
 * with the 3dnow-flavoured routines, plus the 3DNow! float kernels. */
2541 static void dsputil_init_3dnow(DSPContext *c, AVCodecContext *avctx,
2544     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2546     c->prefetch = prefetch_3dnow;
2548     if (!high_bit_depth) {
2549         c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2550         c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2552         c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2553         c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2554         c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2556         c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2557         c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2559         c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2560         c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2561         c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
    /* Non-bit-exact rounding-free variants, skipped under BITEXACT. */
2563         if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
2564             c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2565             c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2566             c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2567             c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2569             c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2570             c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2574     if (CONFIG_VP3_DECODER
2575         && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2576         c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2577         c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2580     SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
2581     SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
2582     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2583     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
2584     SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
2585     SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );
2587     if (!high_bit_depth) {
2588         SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2589         SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
2590         SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
2591         SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2592         SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
2593         SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
2596     SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2597     SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
2598     SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2599     SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );
2602     if (!high_bit_depth && CONFIG_H264CHROMA) {
2603         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_3dnow_rnd;
2604         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
2608     c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2609     c->vector_fmul = vector_fmul_3dnow;
2610     c->vector_fmul_add = vector_fmul_add_3dnow;
2613     c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
/* 3DNow!Ext additions: only the pswapd-based reverse/window float kernels. */
2617 static void dsputil_init_3dnow2(DSPContext *c, AVCodecContext *avctx,
2620     c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2622     c->vector_fmul_window = vector_fmul_window_3dnow2;
/* Install SSE implementations: aligned block clears (when XvMC cannot hand
 * us unaligned blocks) and the SSE float vector kernels. */
2626 static void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2628     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2630     if (!high_bit_depth) {
2631         if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2632         /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2633             c->clear_block  = clear_block_sse;
2634             c->clear_blocks = clear_blocks_sse;
2638     c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2639     c->ac3_downmix = ac3_downmix_sse;
2640     c->vector_fmul = vector_fmul_sse;
2641     c->vector_fmul_reverse = vector_fmul_reverse_sse;
    /* On 3DNow! CPUs keep the (presumably faster there) 3DNow! fmul_add. */
2643     if (!(mm_flags & AV_CPU_FLAG_3DNOW))
2644         c->vector_fmul_add = vector_fmul_add_sse;
2647     c->vector_fmul_window = vector_fmul_window_sse;
2650     c->vector_clipf = vector_clipf_sse;
2653     c->scalarproduct_float          = ff_scalarproduct_float_sse;
2654     c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;
2656     if (!high_bit_depth)
2657         c->emulated_edge_mc = emulated_edge_mc_sse;
/* Install SSE2 implementations; some pixel routines are only enabled on
 * non-AMD CPUs (see inline comment), 10-bit H.264 gets dedicated kernels. */
2662 static void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
2665     const int bit_depth = avctx->bits_per_raw_sample;
2666     const int high_bit_depth = bit_depth > 8;
2668     if (mm_flags & AV_CPU_FLAG_3DNOW) {
2669         // these functions are slower than mmx on AMD, but faster on Intel
2670         if (!high_bit_depth) {
2671             c->put_pixels_tab[0][0]        = put_pixels16_sse2;
2672             c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2673             c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
2674             H264_QPEL_FUNCS(0, 0, sse2);
2678     if (!high_bit_depth) {
2679         H264_QPEL_FUNCS(0, 1, sse2);
2680         H264_QPEL_FUNCS(0, 2, sse2);
2681         H264_QPEL_FUNCS(0, 3, sse2);
2682         H264_QPEL_FUNCS(1, 1, sse2);
2683         H264_QPEL_FUNCS(1, 2, sse2);
2684         H264_QPEL_FUNCS(1, 3, sse2);
2685         H264_QPEL_FUNCS(2, 1, sse2);
2686         H264_QPEL_FUNCS(2, 2, sse2);
2687         H264_QPEL_FUNCS(2, 3, sse2);
2688         H264_QPEL_FUNCS(3, 1, sse2);
2689         H264_QPEL_FUNCS(3, 2, sse2);
2690         H264_QPEL_FUNCS(3, 3, sse2);
2694     if (bit_depth == 10) {
2695         SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2696         SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
2697         SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2698         SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
2699         H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
2700         H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
2701         H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
2703         if (CONFIG_H264CHROMA) {
2704             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
2705             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
2709     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
2710     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
    /* Atom prefers the integer-domain clip variant. */
2711     if (mm_flags & AV_CPU_FLAG_ATOM) {
2712         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
2714         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2716     if (avctx->flags & CODEC_FLAG_BITEXACT) {
2717         c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2718     } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2719         c->apply_window_int16 = ff_apply_window_int16_sse2;
2721     c->bswap_buf = ff_bswap32_buf_sse2;
/* Install SSSE3 implementations: H.264 qpel/chroma, HuffYUV left
 * prediction, int16 windowing (with an Atom-specific variant) and bswap. */
2725 static void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
2729     const int high_bit_depth = avctx->bits_per_raw_sample > 8;
2730     const int bit_depth = avctx->bits_per_raw_sample;
2732     if (!high_bit_depth) {
2733         H264_QPEL_FUNCS(1, 0, ssse3);
2734         H264_QPEL_FUNCS(1, 1, ssse3);
2735         H264_QPEL_FUNCS(1, 2, ssse3);
2736         H264_QPEL_FUNCS(1, 3, ssse3);
2737         H264_QPEL_FUNCS(2, 0, ssse3);
2738         H264_QPEL_FUNCS(2, 1, ssse3);
2739         H264_QPEL_FUNCS(2, 2, ssse3);
2740         H264_QPEL_FUNCS(2, 3, ssse3);
2741         H264_QPEL_FUNCS(3, 0, ssse3);
2742         H264_QPEL_FUNCS(3, 1, ssse3);
2743         H264_QPEL_FUNCS(3, 2, ssse3);
2744         H264_QPEL_FUNCS(3, 3, ssse3);
2747     else if (bit_depth == 10) {
2748         H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
2749         H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
2750         H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
2752     if (!high_bit_depth && CONFIG_H264CHROMA) {
2753         c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_ssse3_rnd;
2754         c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_ssse3_rnd;
2755         c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
2756         c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
2758     c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2759     if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2760         c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2762     if (mm_flags & AV_CPU_FLAG_ATOM) {
2763         c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2765         c->apply_window_int16 = ff_apply_window_int16_ssse3;
    /* Skip the SSSE3 madd variant on CPUs where unaligned/cache-split
     * accesses make it slower (see inline "cachesplit" note). */
2767     if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2768         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2770     c->bswap_buf = ff_bswap32_buf_ssse3;
/* SSE4 additions: only the pminsd/pmaxsd-based int32 clip. */
2775 static void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
2779     c->vector_clip_int32 = ff_vector_clip_int32_sse4;
/* AVX additions (compiled only when both AVX asm and yasm are available):
 * 10-bit H.264 qpel/chroma and the float butterfly interleave. */
2783 static void dsputil_init_avx(DSPContext *c, AVCodecContext *avctx, int mm_flags)
2785 #if HAVE_AVX && HAVE_YASM
2786     const int bit_depth = avctx->bits_per_raw_sample;
2788     if (bit_depth == 10) {
2789         // AVX implies !cache64.
2790         // TODO: Port cache(32|64) detection from x264.
2791         H264_QPEL_FUNCS_10(1, 0, sse2);
2792         H264_QPEL_FUNCS_10(2, 0, sse2);
2793         H264_QPEL_FUNCS_10(3, 0, sse2);
2795         if (CONFIG_H264CHROMA) {
2796             c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
2797             c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
2800     c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
2804 void ff_dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2806 int mm_flags = av_get_cpu_flags();
2808 if (avctx->dsp_mask) {
2809 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2810 mm_flags |= (avctx->dsp_mask & 0xffff);
2812 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2816 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2817 if (mm_flags & AV_CPU_FLAG_MMX)
2818 av_log(avctx, AV_LOG_INFO, " mmx");
2819 if (mm_flags & AV_CPU_FLAG_MMX2)
2820 av_log(avctx, AV_LOG_INFO, " mmx2");
2821 if (mm_flags & AV_CPU_FLAG_3DNOW)
2822 av_log(avctx, AV_LOG_INFO, " 3dnow");
2823 if (mm_flags & AV_CPU_FLAG_SSE)
2824 av_log(avctx, AV_LOG_INFO, " sse");
2825 if (mm_flags & AV_CPU_FLAG_SSE2)
2826 av_log(avctx, AV_LOG_INFO, " sse2");
2827 av_log(avctx, AV_LOG_INFO, "\n");
2830 if (mm_flags & AV_CPU_FLAG_MMX) {
2831 const int idct_algo= avctx->idct_algo;
2833 if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
2834 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2835 c->idct_put= ff_simple_idct_put_mmx;
2836 c->idct_add= ff_simple_idct_add_mmx;
2837 c->idct = ff_simple_idct_mmx;
2838 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2840 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2841 if(mm_flags & AV_CPU_FLAG_MMX2){
2842 c->idct_put= ff_libmpeg2mmx2_idct_put;
2843 c->idct_add= ff_libmpeg2mmx2_idct_add;
2844 c->idct = ff_mmxext_idct;
2846 c->idct_put= ff_libmpeg2mmx_idct_put;
2847 c->idct_add= ff_libmpeg2mmx_idct_add;
2848 c->idct = ff_mmx_idct;
2850 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2852 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2853 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2854 if(mm_flags & AV_CPU_FLAG_SSE2){
2855 c->idct_put= ff_vp3_idct_put_sse2;
2856 c->idct_add= ff_vp3_idct_add_sse2;
2857 c->idct = ff_vp3_idct_sse2;
2858 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2860 c->idct_put= ff_vp3_idct_put_mmx;
2861 c->idct_add= ff_vp3_idct_add_mmx;
2862 c->idct = ff_vp3_idct_mmx;
2863 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2865 }else if(idct_algo==FF_IDCT_CAVS){
2866 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2867 }else if(idct_algo==FF_IDCT_XVIDMMX){
2868 if(mm_flags & AV_CPU_FLAG_SSE2){
2869 c->idct_put= ff_idct_xvid_sse2_put;
2870 c->idct_add= ff_idct_xvid_sse2_add;
2871 c->idct = ff_idct_xvid_sse2;
2872 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2873 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2874 c->idct_put= ff_idct_xvid_mmx2_put;
2875 c->idct_add= ff_idct_xvid_mmx2_add;
2876 c->idct = ff_idct_xvid_mmx2;
2878 c->idct_put= ff_idct_xvid_mmx_put;
2879 c->idct_add= ff_idct_xvid_mmx_add;
2880 c->idct = ff_idct_xvid_mmx;
2885 dsputil_init_mmx(c, avctx, mm_flags);
2888 if (mm_flags & AV_CPU_FLAG_MMX2)
2889 dsputil_init_mmx2(c, avctx, mm_flags);
2891 if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
2892 dsputil_init_3dnow(c, avctx, mm_flags);
2894 if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT))
2895 dsputil_init_3dnow2(c, avctx, mm_flags);
2897 if (HAVE_SSE && (mm_flags & AV_CPU_FLAG_SSE))
2898 dsputil_init_sse(c, avctx, mm_flags);
2900 if (mm_flags & AV_CPU_FLAG_SSE2)
2901 dsputil_init_sse2(c, avctx, mm_flags);
2903 if (mm_flags & AV_CPU_FLAG_SSSE3)
2904 dsputil_init_ssse3(c, avctx, mm_flags);
2906 if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE)
2907 dsputil_init_sse4(c, avctx, mm_flags);
2909 if (mm_flags & AV_CPU_FLAG_AVX)
2910 dsputil_init_avx(c, avctx, mm_flags);
2912 if (CONFIG_ENCODERS)
2913 ff_dsputilenc_init_mmx(c, avctx);