2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of Libav.
8 * Libav is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * Libav is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with Libav; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
38 /* pixel operations */
/*
 * Packed constants used as memory operands by the SIMD code in this file.
 *  - ff_pw_<N>:  every 16-bit word holds the decimal value N.
 *  - ff_pb_<XX>: every byte holds the hexadecimal value 0xXX.
 *  - ff_bone / ff_wtwo: 0x01 in every byte / 0x0002 in every 16-bit word.
 *  - ff_pdw_80000000: 0x80000000 in every 32-bit lane.
 *  - ff_pd_1 / ff_pd_2: pairs of doubles.
 * Entries declared with 8-byte alignment hold a single 64-bit pattern;
 * entries declared with 16-byte alignment repeat the same pattern in both
 * 64-bit initializers.
 */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 {0x8000000080000000ULL, 0x8000000080000000ULL};
/* 16-bit word constants (name suffix is the decimal value) */
45 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1 ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
52 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
/* byte constants (name suffix is the hexadecimal value) */
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
75 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
79 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
/* packed double-precision constants */
82 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
83 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Emit a ".p2align 3" directive, i.e. align the following code to 8 bytes. */
85 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* Clear register regd by XOR-ing it with itself. */
86 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
/* Fill regd with 0xFE in every byte: pcmpeqd of a register with itself sets
 * all bits, and the per-byte self-add then turns each 0xFF into 0xFE. */
88 #define MOVQ_BFE(regd) \
90 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
91 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Memory-operand variants: load the ff_bone / ff_wtwo constants into regd.
 * NOTE(review): MOVQ_BONE and MOVQ_WTWO are defined a second time below; the
 * preprocessor conditional that selects one pair is not visible in this
 * excerpt. */
94 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
95 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
97 // for shared library it's better to use this way for accessing constants
// Register-only MOVQ_BONE: all-ones shifted right by 15 leaves 0x0001 in every
// word; packing those words to bytes leaves 0x01 in every byte.
99 #define MOVQ_BONE(regd) \
101 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
102 "psrlw $15, %%" #regd " \n\t" \
103 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
// Register-only MOVQ_WTWO: 0x0001 per word shifted left by one gives 0x0002.
105 #define MOVQ_WTWO(regd) \
107 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
108 "psrlw $15, %%" #regd " \n\t" \
109 "psllw $1, %%" #regd " \n\t"::)
// Byte-wise average of rega and regb, rounding down:
//   regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1)
// The 0xfe mask clears the low bit of every byte so that the 64-bit shift
// cannot move a bit from one byte into its neighbour.
113 // using regr as temporary and for the output result
114 // first argument is unmodified and second is trashed
115 // regfe is supposed to contain 0xfefefefefefefefe
116 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
117 "movq " #rega ", " #regr " \n\t"\
118 "pand " #regb ", " #regr " \n\t"\
119 "pxor " #rega ", " #regb " \n\t"\
120 "pand " #regfe "," #regb " \n\t"\
121 "psrlq $1, " #regb " \n\t"\
122 "paddb " #regb ", " #regr " \n\t"
// Byte-wise average of rega and regb, rounding up:
//   regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1)
// rega is only read, regb is overwritten, regr receives the result and
// regfe must hold 0xfe in every byte (it masks the bit that the 64-bit
// shift would otherwise carry into the neighbouring byte).
124 #define PAVGB_MMX(rega, regb, regr, regfe) \
125 "movq " #rega ", " #regr " \n\t"\
126 "por " #regb ", " #regr " \n\t"\
127 "pxor " #rega ", " #regb " \n\t"\
128 "pand " #regfe "," #regb " \n\t"\
129 "psrlq $1, " #regb " \n\t"\
130 "psubb " #regb ", " #regr " \n\t"
// Two interleaved round-down byte averages:
//   regr = avg_down(rega, regb)   and   regp = avg_down(regc, regd)
// each computed as (x & y) + (((x ^ y) & 0xfe) >> 1).
// rega and regc are only read; regb and regd are overwritten.
132 // mm6 is supposed to contain 0xfefefefefefefefe
133 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
134 "movq " #rega ", " #regr " \n\t"\
135 "movq " #regc ", " #regp " \n\t"\
136 "pand " #regb ", " #regr " \n\t"\
137 "pand " #regd ", " #regp " \n\t"\
138 "pxor " #rega ", " #regb " \n\t"\
139 "pxor " #regc ", " #regd " \n\t"\
140 "pand %%mm6, " #regb " \n\t"\
141 "pand %%mm6, " #regd " \n\t"\
142 "psrlq $1, " #regb " \n\t"\
143 "psrlq $1, " #regd " \n\t"\
144 "paddb " #regb ", " #regr " \n\t"\
145 "paddb " #regd ", " #regp " \n\t"
// Two interleaved round-up byte averages:
//   regr = avg_up(rega, regb)   and   regp = avg_up(regc, regd)
// each computed as (x | y) - (((x ^ y) & 0xfe) >> 1).
// rega and regc are only read; regb and regd are overwritten.
// The mask is taken from mm6, which must hold 0xfe in every byte.
147 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
148 "movq " #rega ", " #regr " \n\t"\
149 "movq " #regc ", " #regp " \n\t"\
150 "por " #regb ", " #regr " \n\t"\
151 "por " #regd ", " #regp " \n\t"\
152 "pxor " #rega ", " #regb " \n\t"\
153 "pxor " #regc ", " #regd " \n\t"\
154 "pand %%mm6, " #regb " \n\t"\
155 "pand %%mm6, " #regd " \n\t"\
156 "psrlq $1, " #regd " \n\t"\
157 "psrlq $1, " #regb " \n\t"\
158 "psubb " #regb ", " #regr " \n\t"\
159 "psubb " #regd ", " #regp " \n\t"
/*
 * The sections below instantiate two shared templates several times. Each
 * section first defines DEF() (how generated function names are spelled) and
 * the averaging primitives the template expands, then includes the template.
 * NOTE(review): DEF, SET_RND, PAVGBP and PAVGB are defined more than once;
 * the #undef lines that must sit between the sections are not visible in
 * this excerpt.
 */
161 /***********************************/
162 /* MMX no rounding */
/* Names become <x>_no_rnd_<y>_mmx; averages use the round-down macros,
 * while OP_AVG uses the round-up PAVGB_MMX. */
163 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
164 #define SET_RND MOVQ_WONE
165 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
166 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
167 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
169 #include "dsputil_mmx_rnd_template.c"
175 /***********************************/
/* Second instantiation of the same template: names become <x>_<y>_mmx and
 * the averages use the round-up macros. */
178 #define DEF(x, y) x ## _ ## y ##_mmx
179 #define SET_RND MOVQ_WTWO
180 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
181 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
183 #include "dsputil_mmx_rnd_template.c"
191 /***********************************/
/* 3DNow! instantiation of the averaging template: PAVGB is the single
 * instruction mnemonic "pavgusb" and names get a _3dnow suffix. */
194 #define DEF(x) x ## _3dnow
195 #define PAVGB "pavgusb"
198 #include "dsputil_mmx_avg_template.c"
204 /***********************************/
/* MMX2 instantiation of the averaging template: PAVGB is the "pavgb"
 * instruction and names get a _mmx2 suffix. */
207 #define DEF(x) x ## _mmx2
209 /* Introduced only in MMX2 set */
210 #define PAVGB "pavgb"
213 #include "dsputil_mmx_avg_template.c"
/*
 * Name aliases: every no_rnd, _mmx2 and _3dnow flavour of the plain
 * put_pixels{4,8,16} copy resolves (directly or through another alias in
 * this list) to the corresponding plain MMX function.
 * NOTE(review): presumably safe because a straight copy performs no
 * averaging, so neither rounding mode nor the newer instruction sets can
 * change the result - confirm against the users of these names.
 */
219 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
220 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
221 #define put_pixels16_mmx2 put_pixels16_mmx
222 #define put_pixels8_mmx2 put_pixels8_mmx
223 #define put_pixels4_mmx2 put_pixels4_mmx
224 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
225 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
226 #define put_pixels16_3dnow put_pixels16_mmx
227 #define put_pixels8_3dnow put_pixels8_mmx
228 #define put_pixels4_3dnow put_pixels4_mmx
229 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
230 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
232 /***********************************/
/*
 * Store a block of 16-bit coefficients as pixels, clamping each value to the
 * unsigned byte range.
 *
 * block:     source coefficients, read as 16-bit words
 * pixels:    destination picture rows
 * line_size: byte distance between destination rows
 *
 * Each of the two asm statements reads 64 bytes (four rows of eight words),
 * packs them to bytes with unsigned saturation (packuswb) and writes four
 * 8-byte rows at pix, pix + line_size, pix + 2*line_size and
 * pix + 3*line_size.
 * NOTE(review): the declarations of pix / p and the code advancing them
 * between the two statements are not visible in this excerpt.
 */
235 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
240 /* read the pixels */
/* first four rows: the block is addressed through the memory operand *p */
245 "movq %3, %%mm0 \n\t"
246 "movq 8%3, %%mm1 \n\t"
247 "movq 16%3, %%mm2 \n\t"
248 "movq 24%3, %%mm3 \n\t"
249 "movq 32%3, %%mm4 \n\t"
250 "movq 40%3, %%mm5 \n\t"
251 "movq 48%3, %%mm6 \n\t"
252 "movq 56%3, %%mm7 \n\t"
253 "packuswb %%mm1, %%mm0 \n\t"
254 "packuswb %%mm3, %%mm2 \n\t"
255 "packuswb %%mm5, %%mm4 \n\t"
256 "packuswb %%mm7, %%mm6 \n\t"
257 "movq %%mm0, (%0) \n\t"
258 "movq %%mm2, (%0, %1) \n\t"
259 "movq %%mm4, (%0, %1, 2) \n\t"
260 "movq %%mm6, (%0, %2) \n\t"
261 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
266 // If an exact copy of the code above were placed here, the
267 // compiler would generate some very strange code,
// so this second copy addresses the block through a register holding p.
270 "movq (%3), %%mm0 \n\t"
271 "movq 8(%3), %%mm1 \n\t"
272 "movq 16(%3), %%mm2 \n\t"
273 "movq 24(%3), %%mm3 \n\t"
274 "movq 32(%3), %%mm4 \n\t"
275 "movq 40(%3), %%mm5 \n\t"
276 "movq 48(%3), %%mm6 \n\t"
277 "movq 56(%3), %%mm7 \n\t"
278 "packuswb %%mm1, %%mm0 \n\t"
279 "packuswb %%mm3, %%mm2 \n\t"
280 "packuswb %%mm5, %%mm4 \n\t"
281 "packuswb %%mm7, %%mm6 \n\t"
282 "movq %%mm0, (%0) \n\t"
283 "movq %%mm2, (%0, %1) \n\t"
284 "movq %%mm4, (%0, %1, 2) \n\t"
285 "movq %%mm6, (%0, %2) \n\t"
286 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/*
 * Asm fragment that converts four rows of eight signed 16-bit coefficients,
 * starting at byte offset `off` of the block (%2), into four rows of pixels.
 * Each row is packed to signed bytes with saturation (packsswb) and then
 * biased by the byte constant in mm0 (the caller loads ff_pb_80, mapping
 * -128..127 onto 0..255).
 * Operands as bound by the caller: %0 = destination, %1 = 3 * line stride,
 * %2 = block, %3 = line stride. Uses mm1-mm4 as scratch.
 */
290 #define put_signed_pixels_clamped_mmx_half(off) \
291 "movq "#off"(%2), %%mm1 \n\t"\
292 "movq 16+"#off"(%2), %%mm2 \n\t"\
293 "movq 32+"#off"(%2), %%mm3 \n\t"\
294 "movq 48+"#off"(%2), %%mm4 \n\t"\
295 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
296 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
297 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
298 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
299 "paddb %%mm0, %%mm1 \n\t"\
300 "paddb %%mm0, %%mm2 \n\t"\
301 "paddb %%mm0, %%mm3 \n\t"\
302 "paddb %%mm0, %%mm4 \n\t"\
303 "movq %%mm1, (%0) \n\t"\
304 "movq %%mm2, (%0, %3) \n\t"\
305 "movq %%mm3, (%0, %3, 2) \n\t"\
306 "movq %%mm4, (%0, %1) \n\t"
/*
 * Store a block of signed 16-bit coefficients as pixels: each value is
 * saturated to a signed byte and then offset by 0x80.
 *
 * block:     source coefficients
 * pixels:    destination picture rows
 * line_size: byte distance between destination rows
 *
 * Two "half" fragments are emitted, for block byte offsets 0 and 64, with
 * the destination pointer advanced by four rows in between.
 */
308 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
310 x86_reg line_skip = line_size;
/* mm0 = 0x80 in every byte: the bias added after signed saturation */
314 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
/* %1 = 3 * line_skip, used to address the fourth row of each half */
315 "lea (%3, %3, 2), %1 \n\t"
316 put_signed_pixels_clamped_mmx_half(0)
/* advance the destination by four rows */
317 "lea (%0, %3, 4), %0 \n\t"
318 put_signed_pixels_clamped_mmx_half(64)
319 :"+&r" (pixels), "=&r" (line_skip3)
320 :"r" (block), "r"(line_skip)
/*
 * Add a block of signed 16-bit coefficients to the existing pixels and
 * store the result clamped to the unsigned byte range.
 *
 * block:     coefficients to add
 * pixels:    picture rows, read and written in place
 * line_size: byte distance between rows
 *
 * The visible asm statement handles two 8-pixel rows (pix and
 * pix + line_size): pixels are widened to words against mm7, added with
 * signed saturation (paddsw) and packed back with unsigned saturation.
 * NOTE(review): mm7 must be zero here; its initialisation, the loop around
 * the statement and the remaining operands are not visible in this excerpt.
 */
324 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
330 /* read the pixels */
337 "movq (%2), %%mm0 \n\t"
338 "movq 8(%2), %%mm1 \n\t"
339 "movq 16(%2), %%mm2 \n\t"
340 "movq 24(%2), %%mm3 \n\t"
341 "movq %0, %%mm4 \n\t"
342 "movq %1, %%mm6 \n\t"
343 "movq %%mm4, %%mm5 \n\t"
344 "punpcklbw %%mm7, %%mm4 \n\t"
345 "punpckhbw %%mm7, %%mm5 \n\t"
346 "paddsw %%mm4, %%mm0 \n\t"
347 "paddsw %%mm5, %%mm1 \n\t"
348 "movq %%mm6, %%mm5 \n\t"
349 "punpcklbw %%mm7, %%mm6 \n\t"
350 "punpckhbw %%mm7, %%mm5 \n\t"
351 "paddsw %%mm6, %%mm2 \n\t"
352 "paddsw %%mm5, %%mm3 \n\t"
353 "packuswb %%mm1, %%mm0 \n\t"
354 "packuswb %%mm3, %%mm2 \n\t"
355 "movq %%mm0, %0 \n\t"
356 "movq %%mm2, %1 \n\t"
357 :"+m"(*pix), "+m"(*(pix+line_size))
/*
 * Copy rows of 4 bytes from pixels to block; both use the row stride
 * line_size.
 *
 * The visible instruction sequence copies four rows (two pairs), advancing
 * both pointers by 2 * line_size (held in REG_a) after each pair.
 * NOTE(review): the loop control on h (operand %0) is not visible in this
 * excerpt; h is presumably expected to be a multiple of 4.
 */
365 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
368 "lea (%3, %3), %%"REG_a" \n\t"
371 "movd (%1), %%mm0 \n\t"
372 "movd (%1, %3), %%mm1 \n\t"
373 "movd %%mm0, (%2) \n\t"
374 "movd %%mm1, (%2, %3) \n\t"
375 "add %%"REG_a", %1 \n\t"
376 "add %%"REG_a", %2 \n\t"
377 "movd (%1), %%mm0 \n\t"
378 "movd (%1, %3), %%mm1 \n\t"
379 "movd %%mm0, (%2) \n\t"
380 "movd %%mm1, (%2, %3) \n\t"
381 "add %%"REG_a", %1 \n\t"
382 "add %%"REG_a", %2 \n\t"
385 : "+g"(h), "+r" (pixels), "+r" (block)
386 : "r"((x86_reg)line_size)
/*
 * Copy rows of 8 bytes from pixels to block; both use the row stride
 * line_size.
 *
 * The visible instruction sequence copies four rows (two pairs), advancing
 * both pointers by 2 * line_size (held in REG_a) after each pair.
 * NOTE(review): the loop control on h (operand %0) is not visible in this
 * excerpt; h is presumably expected to be a multiple of 4.
 */
391 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
394 "lea (%3, %3), %%"REG_a" \n\t"
397 "movq (%1), %%mm0 \n\t"
398 "movq (%1, %3), %%mm1 \n\t"
399 "movq %%mm0, (%2) \n\t"
400 "movq %%mm1, (%2, %3) \n\t"
401 "add %%"REG_a", %1 \n\t"
402 "add %%"REG_a", %2 \n\t"
403 "movq (%1), %%mm0 \n\t"
404 "movq (%1, %3), %%mm1 \n\t"
405 "movq %%mm0, (%2) \n\t"
406 "movq %%mm1, (%2, %3) \n\t"
407 "add %%"REG_a", %1 \n\t"
408 "add %%"REG_a", %2 \n\t"
411 : "+g"(h), "+r" (pixels), "+r" (block)
412 : "r"((x86_reg)line_size)
/*
 * Copy rows of 16 bytes from pixels to block using two 8-byte MMX moves per
 * row; both buffers use the row stride line_size.
 *
 * The visible instruction sequence copies four rows (two pairs), advancing
 * both pointers by 2 * line_size (held in REG_a) after each pair.
 * NOTE(review): the loop control on h (operand %0) is not visible in this
 * excerpt; h is presumably expected to be a multiple of 4.
 */
417 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
420 "lea (%3, %3), %%"REG_a" \n\t"
423 "movq (%1), %%mm0 \n\t"
424 "movq 8(%1), %%mm4 \n\t"
425 "movq (%1, %3), %%mm1 \n\t"
426 "movq 8(%1, %3), %%mm5 \n\t"
427 "movq %%mm0, (%2) \n\t"
428 "movq %%mm4, 8(%2) \n\t"
429 "movq %%mm1, (%2, %3) \n\t"
430 "movq %%mm5, 8(%2, %3) \n\t"
431 "add %%"REG_a", %1 \n\t"
432 "add %%"REG_a", %2 \n\t"
433 "movq (%1), %%mm0 \n\t"
434 "movq 8(%1), %%mm4 \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq 8(%1, %3), %%mm5 \n\t"
437 "movq %%mm0, (%2) \n\t"
438 "movq %%mm4, 8(%2) \n\t"
439 "movq %%mm1, (%2, %3) \n\t"
440 "movq %%mm5, 8(%2, %3) \n\t"
441 "add %%"REG_a", %1 \n\t"
442 "add %%"REG_a", %2 \n\t"
445 : "+g"(h), "+r" (pixels), "+r" (block)
446 : "r"((x86_reg)line_size)
/*
 * Copy rows of 16 bytes from pixels to block, four rows per pass, using one
 * SSE2 register per row.
 *
 * Loads use movdqu, so pixels may be unaligned; stores use movdqa, so block
 * and line_size must keep every destination row 16-byte aligned.
 * Operand %4 is 3 * line_size and addresses the fourth row of each group.
 * NOTE(review): the loop control on h (operand %0) is not visible in this
 * excerpt; h is presumably expected to be a multiple of 4.
 */
451 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
455 "movdqu (%1), %%xmm0 \n\t"
456 "movdqu (%1,%3), %%xmm1 \n\t"
457 "movdqu (%1,%3,2), %%xmm2 \n\t"
458 "movdqu (%1,%4), %%xmm3 \n\t"
459 "movdqa %%xmm0, (%2) \n\t"
460 "movdqa %%xmm1, (%2,%3) \n\t"
461 "movdqa %%xmm2, (%2,%3,2) \n\t"
462 "movdqa %%xmm3, (%2,%4) \n\t"
/* step both pointers down by four rows */
464 "lea (%1,%3,4), %1 \n\t"
465 "lea (%2,%3,4), %2 \n\t"
467 : "+g"(h), "+r" (pixels), "+r" (block)
468 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/*
 * Average rows of 16 bytes from pixels into block (block = avg(block,
 * pixels) per byte, via pavgb), four rows per pass.
 *
 * Loads from pixels use movdqu and may be unaligned; block is used both as
 * a pavgb memory operand and as a movdqa destination, so every block row
 * must be 16-byte aligned.
 * Operand %4 is 3 * line_size and addresses the fourth row of each group.
 * NOTE(review): the loop control on h (operand %0) is not visible in this
 * excerpt; h is presumably expected to be a multiple of 4.
 */
473 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
477 "movdqu (%1), %%xmm0 \n\t"
478 "movdqu (%1,%3), %%xmm1 \n\t"
479 "movdqu (%1,%3,2), %%xmm2 \n\t"
480 "movdqu (%1,%4), %%xmm3 \n\t"
481 "pavgb (%2), %%xmm0 \n\t"
482 "pavgb (%2,%3), %%xmm1 \n\t"
483 "pavgb (%2,%3,2), %%xmm2 \n\t"
484 "pavgb (%2,%4), %%xmm3 \n\t"
485 "movdqa %%xmm0, (%2) \n\t"
486 "movdqa %%xmm1, (%2,%3) \n\t"
487 "movdqa %%xmm2, (%2,%3,2) \n\t"
488 "movdqa %%xmm3, (%2,%4) \n\t"
/* step both pointers down by four rows */
490 "lea (%1,%3,4), %1 \n\t"
491 "lea (%2,%3,4), %2 \n\t"
493 : "+g"(h), "+r" (pixels), "+r" (block)
494 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/*
 * CLEAR_BLOCKS(name, n) defines `static void name(DCTELEM *blocks)`, which
 * zero-fills coefficient storage with MMX stores of a cleared mm7.
 * Operand %0 is the address 128*n bytes past `blocks`; REG_a is loaded from
 * operand %1 and advanced by 32 after each group of four 8-byte stores.
 * NOTE(review): operand %1 and the loop branch are not visible in this
 * excerpt; presumably REG_a starts at -128*n and counts up to zero.
 * Instantiated below for 6 blocks (clear_blocks_mmx) and for 1 block
 * (clear_block_mmx).
 */
499 #define CLEAR_BLOCKS(name,n) \
500 static void name(DCTELEM *blocks)\
503 "pxor %%mm7, %%mm7 \n\t"\
504 "mov %1, %%"REG_a" \n\t"\
506 "movq %%mm7, (%0, %%"REG_a") \n\t"\
507 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
508 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
509 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
510 "add $32, %%"REG_a" \n\t"\
512 : : "r" (((uint8_t *)blocks)+128*n),\
517 CLEAR_BLOCKS(clear_blocks_mmx, 6)
518 CLEAR_BLOCKS(clear_block_mmx, 1)
/*
 * Zero 128 bytes starting at block with eight 16-byte movaps stores of a
 * cleared xmm0. movaps requires the destination to be 16-byte aligned.
 */
520 static void clear_block_sse(DCTELEM *block)
523 "xorps %%xmm0, %%xmm0 \n"
524 "movaps %%xmm0, (%0) \n"
525 "movaps %%xmm0, 16(%0) \n"
526 "movaps %%xmm0, 32(%0) \n"
527 "movaps %%xmm0, 48(%0) \n"
528 "movaps %%xmm0, 64(%0) \n"
529 "movaps %%xmm0, 80(%0) \n"
530 "movaps %%xmm0, 96(%0) \n"
531 "movaps %%xmm0, 112(%0) \n"
/*
 * Zero coefficient storage in 128-byte steps with movaps stores of a
 * cleared xmm0. Operand %0 is the address 128*6 bytes past `blocks`; REG_a
 * is loaded from operand %1 and advanced by 128 per pass.
 * movaps requires the destination to be 16-byte aligned.
 * NOTE(review): operand %1 and the loop branch are not visible in this
 * excerpt; presumably REG_a starts at -128*6 and counts up to zero.
 */
537 static void clear_blocks_sse(DCTELEM *blocks)
540 "xorps %%xmm0, %%xmm0 \n"
541 "mov %1, %%"REG_a" \n"
543 "movaps %%xmm0, (%0, %%"REG_a") \n"
544 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
545 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
546 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
550 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
551 "add $128, %%"REG_a" \n"
553 : : "r" (((uint8_t *)blocks)+128*6),
/*
 * dst[i] += src[i] for w bytes, with per-byte wrap-around (paddb).
 *
 * The asm part handles 16 bytes per pass (two 8-byte adds at index %0 and
 * %0 + 8) and is bounded by w - 15; the scalar statement at the end handles
 * the bytes the vector part leaves over.
 * NOTE(review): the index variable, the asm loop branch and the header of
 * the scalar tail loop are not visible in this excerpt.
 */
559 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
564 "movq (%1, %0), %%mm0 \n\t"
565 "movq (%2, %0), %%mm1 \n\t"
566 "paddb %%mm0, %%mm1 \n\t"
567 "movq %%mm1, (%2, %0) \n\t"
568 "movq 8(%1, %0), %%mm0 \n\t"
569 "movq 8(%2, %0), %%mm1 \n\t"
570 "paddb %%mm0, %%mm1 \n\t"
571 "movq %%mm1, 8(%2, %0) \n\t"
577 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
580 dst[i+0] += src[i+0];
/*
 * dst[i] = src1[i] + src2[i] for w bytes, with per-byte wrap-around (paddb).
 *
 * The asm part handles 16 bytes per pass and is bounded by w - 15; the
 * scalar statement at the end handles the bytes the vector part leaves over.
 * NOTE(review): the index variable, the asm loop branch and the header of
 * the scalar tail loop are not visible in this excerpt.
 */
583 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
588 "movq (%2, %0), %%mm0 \n\t"
589 "movq 8(%2, %0), %%mm1 \n\t"
590 "paddb (%3, %0), %%mm0 \n\t"
591 "paddb 8(%3, %0), %%mm1 \n\t"
592 "movq %%mm0, (%1, %0) \n\t"
593 "movq %%mm1, 8(%1, %0) \n\t"
599 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
602 dst[i] = src1[i] + src2[i];
/*
 * Scalar (general-purpose register) prediction loop over w bytes of
 * dst / top / diff, carrying the running values *left and *left_top.
 *
 * Only fragments of the asm body are visible in this excerpt: a byte is
 * loaded through (%3,%4) into t, the diff byte is added to the low byte of
 * l, and that low byte is stored to dst. The three base pointers are passed
 * pre-offset by w and indexed with w2.
 * NOTE(review): the median selection itself, the initialisation of w2 and
 * the write-back of *left / *left_top are not visible; presumably w2 runs
 * from -w up to 0 - confirm against the complete file.
 */
606 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
/* only the low 8 bits of the carried-in values are used */
609 int l = *left & 0xff;
610 int tl = *left_top & 0xff;
615 "movzbl (%3,%4), %2 \n"
628 "add (%6,%4), %b0 \n"
629 "mov %b0, (%5,%4) \n"
632 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
633 :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/*
 * Asm fragment implementing the core of the H.263 loop filter on four
 * 8-byte lines.
 *
 * Operands expected from the enclosing asm statement:
 *   %0..%3  the four input lines (8 bytes each, memory operands)
 *   %4      2 * filter strength (low byte is broadcast to all bytes)
 *   %5      byte mask applied before the final shift (callers in this file
 *           pass ff_pb_FC)
 *
 * It first forms, per pixel and in 16-bit precision,
 *   (%0 - %3) + 4 * (%2 - %1),
 * takes its magnitude >> 3 and the sign, derives the strength-limited
 * correction, and applies it to the two inner lines; a second, smaller
 * correction is applied to the two outer lines.
 *
 * Results are left in registers, not stored: mm5 = new %0, mm3 = new %1,
 * mm4 = new %2, mm6 = new %3. All of mm0-mm7 are overwritten.
 */
640 #define H263_LOOP_FILTER \
641 "pxor %%mm7, %%mm7 \n\t"\
642 "movq %0, %%mm0 \n\t"\
643 "movq %0, %%mm1 \n\t"\
644 "movq %3, %%mm2 \n\t"\
645 "movq %3, %%mm3 \n\t"\
646 "punpcklbw %%mm7, %%mm0 \n\t"\
647 "punpckhbw %%mm7, %%mm1 \n\t"\
648 "punpcklbw %%mm7, %%mm2 \n\t"\
649 "punpckhbw %%mm7, %%mm3 \n\t"\
650 "psubw %%mm2, %%mm0 \n\t"\
651 "psubw %%mm3, %%mm1 \n\t"\
652 "movq %1, %%mm2 \n\t"\
653 "movq %1, %%mm3 \n\t"\
654 "movq %2, %%mm4 \n\t"\
655 "movq %2, %%mm5 \n\t"\
656 "punpcklbw %%mm7, %%mm2 \n\t"\
657 "punpckhbw %%mm7, %%mm3 \n\t"\
658 "punpcklbw %%mm7, %%mm4 \n\t"\
659 "punpckhbw %%mm7, %%mm5 \n\t"\
660 "psubw %%mm2, %%mm4 \n\t"\
661 "psubw %%mm3, %%mm5 \n\t"\
662 "psllw $2, %%mm4 \n\t"\
663 "psllw $2, %%mm5 \n\t"\
664 "paddw %%mm0, %%mm4 \n\t"\
665 "paddw %%mm1, %%mm5 \n\t"\
666 "pxor %%mm6, %%mm6 \n\t"\
667 "pcmpgtw %%mm4, %%mm6 \n\t"\
668 "pcmpgtw %%mm5, %%mm7 \n\t"\
669 "pxor %%mm6, %%mm4 \n\t"\
670 "pxor %%mm7, %%mm5 \n\t"\
671 "psubw %%mm6, %%mm4 \n\t"\
672 "psubw %%mm7, %%mm5 \n\t"\
673 "psrlw $3, %%mm4 \n\t"\
674 "psrlw $3, %%mm5 \n\t"\
675 "packuswb %%mm5, %%mm4 \n\t"\
676 "packsswb %%mm7, %%mm6 \n\t"\
677 "pxor %%mm7, %%mm7 \n\t"\
678 "movd %4, %%mm2 \n\t"\
679 "punpcklbw %%mm2, %%mm2 \n\t"\
680 "punpcklbw %%mm2, %%mm2 \n\t"\
681 "punpcklbw %%mm2, %%mm2 \n\t"\
682 "psubusb %%mm4, %%mm2 \n\t"\
683 "movq %%mm2, %%mm3 \n\t"\
684 "psubusb %%mm4, %%mm3 \n\t"\
685 "psubb %%mm3, %%mm2 \n\t"\
686 "movq %1, %%mm3 \n\t"\
687 "movq %2, %%mm4 \n\t"\
688 "pxor %%mm6, %%mm3 \n\t"\
689 "pxor %%mm6, %%mm4 \n\t"\
690 "paddusb %%mm2, %%mm3 \n\t"\
691 "psubusb %%mm2, %%mm4 \n\t"\
692 "pxor %%mm6, %%mm3 \n\t"\
693 "pxor %%mm6, %%mm4 \n\t"\
694 "paddusb %%mm2, %%mm2 \n\t"\
695 "packsswb %%mm1, %%mm0 \n\t"\
696 "pcmpgtb %%mm0, %%mm7 \n\t"\
697 "pxor %%mm7, %%mm0 \n\t"\
698 "psubb %%mm7, %%mm0 \n\t"\
699 "movq %%mm0, %%mm1 \n\t"\
700 "psubusb %%mm2, %%mm0 \n\t"\
701 "psubb %%mm0, %%mm1 \n\t"\
702 "pand %5, %%mm1 \n\t"\
703 "psrlw $2, %%mm1 \n\t"\
704 "pxor %%mm7, %%mm1 \n\t"\
705 "psubb %%mm7, %%mm1 \n\t"\
706 "movq %0, %%mm5 \n\t"\
707 "movq %3, %%mm6 \n\t"\
708 "psubb %%mm1, %%mm5 \n\t"\
709 "paddb %%mm1, %%mm6 \n\t"
/*
 * H.263 loop filter working on four vertically adjacent 8-byte lines:
 * src - 2*stride, src - stride, src and src + stride.
 *
 * src:    points at the third of the four lines
 * stride: byte distance between lines
 * qscale: index into ff_h263_loop_filter_strength[]
 *
 * The body is compiled only when an H.263 decoder or encoder is configured.
 * The four visible stores write the filtered lines back in place from the
 * registers mm5, mm3, mm4 and mm6.
 * NOTE(review): the filter core that produces those registers (presumably
 * an H263_LOOP_FILTER expansion just before the stores) is not visible in
 * this excerpt.
 */
711 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
712 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
713 const int strength= ff_h263_loop_filter_strength[qscale];
719 "movq %%mm3, %1 \n\t"
720 "movq %%mm4, %2 \n\t"
721 "movq %%mm5, %0 \n\t"
722 "movq %%mm6, %3 \n\t"
723 : "+m" (*(uint64_t*)(src - 2*stride)),
724 "+m" (*(uint64_t*)(src - 1*stride)),
725 "+m" (*(uint64_t*)(src + 0*stride)),
726 "+m" (*(uint64_t*)(src + 1*stride))
727 : "g" (2*strength), "m"(ff_pb_FC)
/*
 * H.263 loop filter for the other edge direction: the pixels are transposed
 * into a temporary buffer so that the same line-based filter core can be
 * used, and the result is transposed back while storing.
 *
 * src:    picture data at the edge to filter
 * stride: byte distance between picture rows
 * qscale: index into ff_h263_loop_filter_strength[]
 *
 * The body is compiled only when an H.263 decoder or encoder is configured.
 * NOTE(review): any adjustment of src before the transposes, and the
 * output operands of the first asm statement, are not visible in this
 * excerpt.
 */
732 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
733 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
734 const int strength= ff_h263_loop_filter_strength[qscale];
/* 32-byte scratch area: four lines of 8 bytes */
735 DECLARE_ALIGNED(8, uint64_t, temp)[4];
736 uint8_t *btemp= (uint8_t*)temp;
/* two 4x4 transposes (rows 0-3 and rows 4-7 of src) fill the scratch area,
 * written with a destination stride of 8 */
740 transpose4x4(btemp , src , 8, stride);
741 transpose4x4(btemp+4, src + 4*stride, 8, stride);
/* filtered lines end up in mm5, mm3, mm4, mm6 (in line order) */
743 H263_LOOP_FILTER // 5 3 4 6
749 : "g" (2*strength), "m"(ff_pb_FC)
/* transpose the four result registers back and store 4 bytes to each of
 * eight picture rows: four rows based at %0 and four based at %1 */
753 "movq %%mm5, %%mm1 \n\t"
754 "movq %%mm4, %%mm0 \n\t"
755 "punpcklbw %%mm3, %%mm5 \n\t"
756 "punpcklbw %%mm6, %%mm4 \n\t"
757 "punpckhbw %%mm3, %%mm1 \n\t"
758 "punpckhbw %%mm6, %%mm0 \n\t"
759 "movq %%mm5, %%mm3 \n\t"
760 "movq %%mm1, %%mm6 \n\t"
761 "punpcklwd %%mm4, %%mm5 \n\t"
762 "punpcklwd %%mm0, %%mm1 \n\t"
763 "punpckhwd %%mm4, %%mm3 \n\t"
764 "punpckhwd %%mm0, %%mm6 \n\t"
765 "movd %%mm5, (%0) \n\t"
766 "punpckhdq %%mm5, %%mm5 \n\t"
767 "movd %%mm5, (%0,%2) \n\t"
768 "movd %%mm3, (%0,%2,2) \n\t"
769 "punpckhdq %%mm3, %%mm3 \n\t"
770 "movd %%mm3, (%0,%3) \n\t"
771 "movd %%mm1, (%1) \n\t"
772 "punpckhdq %%mm1, %%mm1 \n\t"
773 "movd %%mm1, (%1,%2) \n\t"
774 "movd %%mm6, (%1,%2,2) \n\t"
775 "punpckhdq %%mm6, %%mm6 \n\t"
776 "movd %%mm6, (%1,%3) \n\t"
778 "r" (src + 4*stride),
779 "r" ((x86_reg) stride ),
780 "r" ((x86_reg)(3*stride))
785 /* draw the edges of width 'w' of an image of size width, height
786 this mmx version can only handle w==8 || w==16 */
/*
 * Replicate the border pixels of an image into the padding around it.
 *
 * buf:    top-left pixel of the image
 * wrap:   byte distance between rows
 * width:  image width in pixels
 * height: image height in rows
 * w:      horizontal border width (left/right padding, in pixels)
 * h:      vertical border height (top/bottom padding, in rows)
 * sides:  EDGE_TOP and/or EDGE_BOTTOM select which vertical borders to fill
 *
 * Left/right: for every image row the first pixel is broadcast into the
 * bytes just before the row and the last pixel into the bytes just after
 * it (8 bytes per side in the first asm variant, 16 in the second).
 * Top/bottom: the first (resp. last) image row, including its already
 * filled left/right padding, is copied into the rows above (resp. below),
 * four rows per loop iteration.
 *
 * Fix: the bottom-edge loop used the horizontal width `w` as its row count;
 * it now uses `h`, like the top-edge loop, so the right number of rows is
 * filled when w != h.
 *
 * NOTE(review): the selection between the two left/right variants and the
 * asm loop labels/branches are not visible in this excerpt.
 */
787 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
789 uint8_t *ptr, *last_line;
792 last_line = buf + (height - 1) * wrap;
/* left and right, 8-byte border: broadcast the first byte of the row to the
 * 8 bytes before it and the last byte of the row to the 8 bytes after it */
799 "movd (%0), %%mm0 \n\t"
800 "punpcklbw %%mm0, %%mm0 \n\t"
801 "punpcklwd %%mm0, %%mm0 \n\t"
802 "punpckldq %%mm0, %%mm0 \n\t"
803 "movq %%mm0, -8(%0) \n\t"
804 "movq -8(%0, %2), %%mm1 \n\t"
805 "punpckhbw %%mm1, %%mm1 \n\t"
806 "punpckhwd %%mm1, %%mm1 \n\t"
807 "punpckhdq %%mm1, %%mm1 \n\t"
808 "movq %%mm1, (%0, %2) \n\t"
813 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
/* left and right, 16-byte border: same broadcast, two stores per side */
820 "movd (%0), %%mm0 \n\t"
821 "punpcklbw %%mm0, %%mm0 \n\t"
822 "punpcklwd %%mm0, %%mm0 \n\t"
823 "punpckldq %%mm0, %%mm0 \n\t"
824 "movq %%mm0, -8(%0) \n\t"
825 "movq %%mm0, -16(%0) \n\t"
826 "movq -8(%0, %2), %%mm1 \n\t"
827 "punpckhbw %%mm1, %%mm1 \n\t"
828 "punpckhwd %%mm1, %%mm1 \n\t"
829 "punpckhdq %%mm1, %%mm1 \n\t"
830 "movq %%mm1, (%0, %2) \n\t"
831 "movq %%mm1, 8(%0, %2) \n\t"
836 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
840 /* top and bottom (and hopefully also the corners) */
841 if (sides&EDGE_TOP) {
842 for(i = 0; i < h; i += 4) {
/* ptr: start (including left padding) of row -(i+1); operand %1 is the
 * offset from ptr back to the same column of image row 0 */
843 ptr= buf - (i + 1) * wrap - w;
846 "movq (%1, %0), %%mm0 \n\t"
847 "movq %%mm0, (%0) \n\t"
848 "movq %%mm0, (%0, %2) \n\t"
849 "movq %%mm0, (%0, %2, 2) \n\t"
850 "movq %%mm0, (%0, %3) \n\t"
855 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
860 if (sides&EDGE_BOTTOM) {
/* iterate over the h border rows below the image (was: i < w) */
861 for(i = 0; i < h; i += 4) {
/* ptr: start (including left padding) of the (i+1)-th row below the image;
 * operand %1 is the offset from ptr back to the same column of last_line */
862 ptr= last_line + (i + 1) * wrap - w;
865 "movq (%1, %0), %%mm0 \n\t"
866 "movq %%mm0, (%0) \n\t"
867 "movq %%mm0, (%0, %2) \n\t"
868 "movq %%mm0, (%0, %2, 2) \n\t"
869 "movq %%mm0, (%0, %3) \n\t"
874 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/*
 * PAETH(cpu, abs3) defines add_png_paeth_prediction_<cpu>(dst, src, top, w,
 * bpp). The visible asm works on four pixels at a time (movd loads widened
 * to 16-bit words against a zeroed mm7), combines values loaded from dst
 * and top through compare/mask selection, adds the bytes loaded from src
 * and stores four result bytes to dst.
 * Visible input operands, in order: dst, top, src, bpp, end.
 *
 * NOTE(review): this excerpt is incomplete - the local declarations, the
 * output operand (%0, presumably the running byte offset), the place where
 * the `abs3` argument is expanded and the loop control are not visible.
 * The two instruction groups just before the instantiations (one built on
 * psubw/pmaxsw, one on pabsw) look like the bodies of ABS3_MMX2 and
 * ABS3_SSSE3, whose #define lines are also not visible: each replaces mm3,
 * mm4 and mm5 by their absolute values. Confirm against the complete file.
 */
880 #define PAETH(cpu, abs3)\
881 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
886 "pxor %%mm7, %%mm7 \n"\
887 "movd (%1,%0), %%mm0 \n"\
888 "movd (%2,%0), %%mm1 \n"\
889 "punpcklbw %%mm7, %%mm0 \n"\
890 "punpcklbw %%mm7, %%mm1 \n"\
893 "movq %%mm1, %%mm2 \n"\
894 "movd (%2,%0), %%mm1 \n"\
895 "movq %%mm2, %%mm3 \n"\
896 "punpcklbw %%mm7, %%mm1 \n"\
897 "movq %%mm2, %%mm4 \n"\
898 "psubw %%mm1, %%mm3 \n"\
899 "psubw %%mm0, %%mm4 \n"\
900 "movq %%mm3, %%mm5 \n"\
901 "paddw %%mm4, %%mm5 \n"\
903 "movq %%mm4, %%mm6 \n"\
904 "pminsw %%mm5, %%mm6 \n"\
905 "pcmpgtw %%mm6, %%mm3 \n"\
906 "pcmpgtw %%mm5, %%mm4 \n"\
907 "movq %%mm4, %%mm6 \n"\
908 "pand %%mm3, %%mm4 \n"\
909 "pandn %%mm3, %%mm6 \n"\
910 "pandn %%mm0, %%mm3 \n"\
911 "movd (%3,%0), %%mm0 \n"\
912 "pand %%mm1, %%mm6 \n"\
913 "pand %%mm4, %%mm2 \n"\
914 "punpcklbw %%mm7, %%mm0 \n"\
916 "paddw %%mm6, %%mm0 \n"\
917 "paddw %%mm2, %%mm3 \n"\
918 "paddw %%mm3, %%mm0 \n"\
919 "pand %%mm5, %%mm0 \n"\
920 "movq %%mm0, %%mm3 \n"\
921 "packuswb %%mm3, %%mm3 \n"\
922 "movd %%mm3, (%1,%0) \n"\
927 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
934 "psubw %%mm5, %%mm7 \n"\
935 "pmaxsw %%mm7, %%mm5 \n"\
936 "pxor %%mm6, %%mm6 \n"\
937 "pxor %%mm7, %%mm7 \n"\
938 "psubw %%mm3, %%mm6 \n"\
939 "psubw %%mm4, %%mm7 \n"\
940 "pmaxsw %%mm6, %%mm3 \n"\
941 "pmaxsw %%mm7, %%mm4 \n"\
942 "pxor %%mm7, %%mm7 \n"
945 "pabsw %%mm3, %%mm3 \n"\
946 "pabsw %%mm4, %%mm4 \n"\
947 "pabsw %%mm5, %%mm5 \n"
/* instantiate the predictor once per absolute-value implementation */
949 PAETH(mmx2, ABS3_MMX2)
951 PAETH(ssse3, ABS3_SSSE3)
/*
 * Asm fragment computing one output group of the vertical quarter-pel
 * lowpass filter:
 *   out = clip_to_byte((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5)
 * where x1..x4 are sums of sample pairs at increasing distance.
 *
 * m3..m6:   registers holding the nearest sample groups; m3 is overwritten
 *           (first with x1, then with the value loaded from in7)
 * rnd:      operand holding the rounding constant
 * in0..in2, in7: memory operands for the remaining sample groups
 * out:      destination operand handed to OP
 * OP:       store/average macro, invoked as OP(%%mm5, out, %%mm7, d)
 *
 * The pw_20 and pw_3 parameters are not referenced by the body; the
 * constants are taken from ff_pw_20 and ff_pw_3 directly.
 * mm4, mm5 and mm6 are hard-coded scratch registers.
 */
954 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
955 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
956 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
957 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
958 "movq "#in7", " #m3 " \n\t" /* d */\
959 "movq "#in0", %%mm5 \n\t" /* D */\
960 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
961 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
962 "movq "#in1", %%mm5 \n\t" /* C */\
963 "movq "#in2", %%mm6 \n\t" /* B */\
964 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
965 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
966 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
967 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
968 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
969 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rounder */\
970 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
971 "psraw $5, %%mm5 \n\t"\
972 "packuswb %%mm5, %%mm5 \n\t"\
973 OP(%%mm5, out, %%mm7, d)
975 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
976 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
980 "pxor %%mm7, %%mm7 \n\t"\
982 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
983 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
984 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
985 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
986 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
987 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
988 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
989 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
990 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
991 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
992 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
993 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
994 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
995 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
996 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
997 "paddw %%mm3, %%mm5 \n\t" /* b */\
998 "paddw %%mm2, %%mm6 \n\t" /* c */\
999 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1000 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1001 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1002 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1003 "paddw %%mm4, %%mm0 \n\t" /* a */\
1004 "paddw %%mm1, %%mm5 \n\t" /* d */\
1005 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1006 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1007 "paddw %6, %%mm6 \n\t"\
1008 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1009 "psraw $5, %%mm0 \n\t"\
1010 "movq %%mm0, %5 \n\t"\
1011 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1013 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1014 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1015 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1016 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1017 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1018 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1019 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1020 "paddw %%mm0, %%mm2 \n\t" /* b */\
1021 "paddw %%mm5, %%mm3 \n\t" /* c */\
1022 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1023 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1024 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1025 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1026 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1027 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1028 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1029 "paddw %%mm2, %%mm1 \n\t" /* a */\
1030 "paddw %%mm6, %%mm4 \n\t" /* d */\
1031 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1032 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1033 "paddw %6, %%mm1 \n\t"\
1034 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1035 "psraw $5, %%mm3 \n\t"\
1036 "movq %5, %%mm1 \n\t"\
1037 "packuswb %%mm3, %%mm1 \n\t"\
1038 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1039 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1041 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1042 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1043 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1044 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1045 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1046 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1047 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1048 "paddw %%mm1, %%mm5 \n\t" /* b */\
1049 "paddw %%mm4, %%mm0 \n\t" /* c */\
1050 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1051 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1052 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1053 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1054 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1055 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1056 "paddw %%mm3, %%mm2 \n\t" /* d */\
1057 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1058 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1059 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1060 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1061 "paddw %%mm2, %%mm6 \n\t" /* a */\
1062 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1063 "paddw %6, %%mm0 \n\t"\
1064 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1065 "psraw $5, %%mm0 \n\t"\
1066 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1068 "paddw %%mm5, %%mm3 \n\t" /* a */\
1069 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1070 "paddw %%mm4, %%mm6 \n\t" /* b */\
1071 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1072 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1073 "paddw %%mm1, %%mm4 \n\t" /* c */\
1074 "paddw %%mm2, %%mm5 \n\t" /* d */\
1075 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
1076 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1077 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1078 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1079 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1080 "paddw %6, %%mm4 \n\t"\
1081 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1082 "psraw $5, %%mm4 \n\t"\
1083 "packuswb %%mm4, %%mm0 \n\t"\
1084 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1090 : "+a"(src), "+c"(dst), "+D"(h)\
1091 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
/* Horizontal qpel lowpass for 16-pixel-wide rows, 3DNow! flavour. */\
/* The filter taps are evaluated in plain C into temp[] (weights 20, -6, 3, -1 */\
/* applied to symmetric sample pairs); the asm only adds ROUNDER, shifts right */\
/* by 5, saturates to unsigned bytes and hands the result to OP_3DNOW. */\
1096 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1099 /* quick HACK, XXX FIXME MUST be optimized */\
/* General form: */\
/* temp[i] = 20*(s[i]+s[i+1]) - 6*(s[i-1]+s[i+2]) + 3*(s[i-2]+s[i+3]) - (s[i-3]+s[i+4]) */\
/* Near both ends the indices that would leave src[0..16] are folded back */\
/* inside the row (see temp[0..2] and temp[13..15]). */\
1102 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1103 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1104 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1105 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1106 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1107 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1108 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1109 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1110 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1111 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1112 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1113 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1114 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1115 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1116 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1117 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
/* First 16 bytes of temp (read as packed 16-bit words): round, >>5, */\
/* pack with unsigned saturation, store 8 bytes at (%1) through OP_3DNOW. */\
1119 "movq (%0), %%mm0 \n\t"\
1120 "movq 8(%0), %%mm1 \n\t"\
1121 "paddw %2, %%mm0 \n\t"\
1122 "paddw %2, %%mm1 \n\t"\
1123 "psraw $5, %%mm0 \n\t"\
1124 "psraw $5, %%mm1 \n\t"\
1125 "packuswb %%mm1, %%mm0 \n\t"\
1126 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
/* Next 16 bytes of temp: same sequence, stored at 8(%1). */\
1127 "movq 16(%0), %%mm0 \n\t"\
1128 "movq 24(%0), %%mm1 \n\t"\
1129 "paddw %2, %%mm0 \n\t"\
1130 "paddw %2, %%mm1 \n\t"\
1131 "psraw $5, %%mm0 \n\t"\
1132 "psraw $5, %%mm1 \n\t"\
1133 "packuswb %%mm1, %%mm0 \n\t"\
1134 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
/* Operands: %0 = temp, %1 = dst, %2 = ROUNDER (read from memory). */\
1135 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
/* Horizontal qpel lowpass for 8-pixel-wide rows, done entirely in MMX2 asm. */\
/* Each output word is (20a - 6b + 3c - d + ROUNDER) >> 5, where a, b, c, d */\
/* are the sample-pair sums annotated on the paddw lines below. pshufw is used */\
/* to build the operands whose samples are repeated at the row ends (see the */\
/* letter patterns in the per-line comments). */\
1143 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
/* mm7 is kept at zero and serves as the high half for byte->word unpacking. */\
1145 "pxor %%mm7, %%mm7 \n\t"\
/* ---- outputs 0..3, built from source bytes A..H ---- */\
1147 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1148 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1149 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1150 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1151 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1152 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1153 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1154 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1155 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1156 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1157 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1158 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1159 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1160 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1161 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1162 "paddw %%mm3, %%mm5 \n\t" /* b */\
1163 "paddw %%mm2, %%mm6 \n\t" /* c */\
1164 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1165 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1166 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1167 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1168 "paddw %%mm4, %%mm0 \n\t" /* a */\
1169 "paddw %%mm1, %%mm5 \n\t" /* d */\
1170 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1171 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1172 "paddw %5, %%mm6 \n\t"\
1173 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1174 "psraw $5, %%mm0 \n\t"\
1175 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
/* ---- outputs 4..7: four more bytes are fetched from offset 5 ---- */\
1177 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1178 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1179 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1180 "paddw %%mm5, %%mm1 \n\t" /* a */\
1181 "paddw %%mm6, %%mm2 \n\t" /* b */\
1182 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1183 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1184 "paddw %%mm6, %%mm3 \n\t" /* c */\
1185 "paddw %%mm5, %%mm4 \n\t" /* d */\
1186 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1187 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1188 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1189 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1190 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1191 "paddw %5, %%mm1 \n\t"\
1192 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1193 "psraw $5, %%mm3 \n\t"\
/* Pack both halves to 8 unsigned-saturated bytes and store them at (%1). */\
1194 "packuswb %%mm3, %%mm0 \n\t"\
1195 OP_MMX2(%%mm0, (%1), %%mm4, q)\
/* Operands: %0 = src (eax), %1 = dst (ecx), %2 = h (edx), */\
/* %3 = srcStride (esi), %4 = dstStride (edi), %5 = ROUNDER in memory. */\
/* ff_pw_20 / ff_pw_3 are referenced through MANGLE() instead of operands. */\
1201 : "+a"(src), "+c"(dst), "+d"(h)\
1202 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
/* Horizontal qpel lowpass for 8-pixel-wide rows, 3DNow! flavour. */\
/* As in the 16-wide variant, the taps (20, -6, 3, -1 on symmetric pairs) are */\
/* computed in C into temp[]; the asm only rounds, shifts by 5, saturates and */\
/* stores through OP_3DNOW. */\
1207 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1210 /* quick HACK, XXX FIXME MUST be optimized */\
/* temp[i] = 20*(s[i]+s[i+1]) - 6*(s[i-1]+s[i+2]) + 3*(s[i-2]+s[i+3]) - (s[i-3]+s[i+4]) */\
/* with indices outside src[0..8] folded back inside (temp[0..2], temp[5..7]). */\
1213 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1214 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1215 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1216 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1217 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1218 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1219 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1220 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
/* 16 bytes of temp (read as packed 16-bit words): round, >>5, pack, store. */\
1222 "movq (%0), %%mm0 \n\t"\
1223 "movq 8(%0), %%mm1 \n\t"\
1224 "paddw %2, %%mm0 \n\t"\
1225 "paddw %2, %%mm1 \n\t"\
1226 "psraw $5, %%mm0 \n\t"\
1227 "psraw $5, %%mm1 \n\t"\
1228 "packuswb %%mm1, %%mm0 \n\t"\
1229 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
/* Operands: %0 = temp, %1 = dst, %2 = ROUNDER (read from memory). */\
1230 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)
 *
 * Generates the vertical lowpass helpers and the qpel8/qpel16 mcXY entry
 * points for one (operation, instruction-set) combination.
 *   OPNAME  - prefix of every generated function (and of the final store op).
 *   ROUNDER - constant passed to the asm as a memory operand for rounding.
 *   RND     - infix pasted into the names of the intermediate "put" helpers.
 *   OP      - store macro forwarded to QPEL_V_LOW.
 *   MMX     - suffix selecting the helper family (pasted into every name).
 * In mcXY the visible bodies use X for the horizontal and Y for the vertical
 * quarter-sample position (0 = none, 2 = filter only, 1/3 = filter blended
 * with the nearer full/shifted sample).
 * NOTE(review): pixels*_l2_* and the *_h_lowpass_* helpers are defined
 * elsewhere; pixels*_l2_* presumably blends its two source arguments.
 */
1238 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
/* ---- vertical lowpass, 16 columns ---- */\
/* temp holds four strips of 17 qwords each (4 columns of 16-bit words per */\
/* strip). */\
1240 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1241 uint64_t temp[17*4];\
1242 uint64_t *temp_ptr= temp;\
/* Stage 1: read 16 source bytes, zero-extend them to words (mm7 = 0) and */\
/* scatter the four 4-word groups into the four strips, 17*8 bytes apart. */\
1247 "pxor %%mm7, %%mm7 \n\t"\
1249 "movq (%0), %%mm0 \n\t"\
1250 "movq (%0), %%mm1 \n\t"\
1251 "movq 8(%0), %%mm2 \n\t"\
1252 "movq 8(%0), %%mm3 \n\t"\
1253 "punpcklbw %%mm7, %%mm0 \n\t"\
1254 "punpckhbw %%mm7, %%mm1 \n\t"\
1255 "punpcklbw %%mm7, %%mm2 \n\t"\
1256 "punpckhbw %%mm7, %%mm3 \n\t"\
1257 "movq %%mm0, (%1) \n\t"\
1258 "movq %%mm1, 17*8(%1) \n\t"\
1259 "movq %%mm2, 2*17*8(%1) \n\t"\
1260 "movq %%mm3, 3*17*8(%1) \n\t"\
/* Operands: %0 = src, %1 = temp_ptr, %2 = count, %3 = srcStride. */\
1265 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1266 : "r" ((x86_reg)srcStride)\
1273 /*FIXME reorder for speed */\
/* Stage 2: for one strip, preload four consecutive qwords and run 16 */\
/* QPEL_V_LOW steps; stores alternate between (%1) and (%1, %3). The source */\
/* offsets passed to the first and last steps repeat nearby rows instead of */\
/* running outside the strip (QPEL_V_LOW itself is defined elsewhere). */\
1275 /*"pxor %%mm7, %%mm7 \n\t"*/\
1277 "movq (%0), %%mm0 \n\t"\
1278 "movq 8(%0), %%mm1 \n\t"\
1279 "movq 16(%0), %%mm2 \n\t"\
1280 "movq 24(%0), %%mm3 \n\t"\
1281 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1282 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1284 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1286 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1288 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1289 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1291 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1292 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1294 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1295 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1297 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1298 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1300 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1302 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1304 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1305 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
/* 136 = 17*8: step to the next strip of temp. */\
1307 "add $136, %0 \n\t"\
/* Operands: %0 = temp_ptr, %1 = dst, %2 = count, %3 = dstStride, */\
/* %4 = 2*dstStride, %5 = ROUNDER, %6 = 4-14*dstStride. */\
1312 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1313 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
/* ---- vertical lowpass, 8 columns ---- */\
/* Same scheme with two strips of 9 qwords each and 8 QPEL_V_LOW steps. */\
1318 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1319 uint64_t temp[9*2];\
1320 uint64_t *temp_ptr= temp;\
/* Stage 1: 8 source bytes -> two groups of 4 words, stored 9*8 bytes apart. */\
1325 "pxor %%mm7, %%mm7 \n\t"\
1327 "movq (%0), %%mm0 \n\t"\
1328 "movq (%0), %%mm1 \n\t"\
1329 "punpcklbw %%mm7, %%mm0 \n\t"\
1330 "punpckhbw %%mm7, %%mm1 \n\t"\
1331 "movq %%mm0, (%1) \n\t"\
1332 "movq %%mm1, 9*8(%1) \n\t"\
/* Operands: %0 = src, %1 = temp_ptr, %2 = count, %3 = srcStride. */\
1337 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1338 : "r" ((x86_reg)srcStride)\
1345 /*FIXME reorder for speed */\
/* Stage 2: eight filter steps per strip, edge offsets repeated as above. */\
1347 /*"pxor %%mm7, %%mm7 \n\t"*/\
1349 "movq (%0), %%mm0 \n\t"\
1350 "movq 8(%0), %%mm1 \n\t"\
1351 "movq 16(%0), %%mm2 \n\t"\
1352 "movq 24(%0), %%mm3 \n\t"\
1353 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1354 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1356 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1358 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1360 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1362 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1364 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1365 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
/* Operands: %0 = temp_ptr, %1 = dst, %2 = count, %3 = dstStride, */\
/* %4 = 2*dstStride, %5 = ROUNDER, %6 = 4-6*dstStride. */\
1372 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1373 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
/* ================= 8x8 entry points ================= */\
/* mc00: no filtering, forwards to the plain 8-wide pixel op (8 rows). */\
1378 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1379 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
/* mc10: horizontal lowpass into a stride-8 scratch block, then l2 with src. */\
1382 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1384 uint8_t * const half= (uint8_t*)temp;\
1385 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1386 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
/* mc20: horizontal lowpass written straight to dst. */\
1389 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1390 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
/* mc30: like mc10, but the l2 partner is src+1 (right neighbour). */\
1393 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1395 uint8_t * const half= (uint8_t*)temp;\
1396 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1397 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
/* mc01: vertical lowpass into scratch, then l2 with src. */\
1400 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1402 uint8_t * const half= (uint8_t*)temp;\
1403 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1404 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
/* mc02: vertical lowpass written straight to dst. */\
1407 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1408 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
/* mc03: like mc01, but the l2 partner is src+stride (row below). */\
1411 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1413 uint8_t * const half= (uint8_t*)temp;\
1414 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1415 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
/* Diagonal cases. Scratch layout: halfHV occupies the first 64 bytes, */\
/* halfH (9 rows of 8 bytes) follows at byte offset 64. */\
/* mc11: H-lowpass (9 rows) -> l2 with src -> V-lowpass -> l2(halfH, halfHV). */\
1417 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1418 uint64_t half[8 + 9];\
1419 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1420 uint8_t * const halfHV= ((uint8_t*)half);\
1421 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1422 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1423 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1424 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc31: as mc11 with src+1 in the first l2 step. */\
1426 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1427 uint64_t half[8 + 9];\
1428 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1429 uint8_t * const halfHV= ((uint8_t*)half);\
1430 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1431 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1432 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1433 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc13: as mc11, final l2 uses halfH+8 (one scratch row further down). */\
1435 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1436 uint64_t half[8 + 9];\
1437 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1438 uint8_t * const halfHV= ((uint8_t*)half);\
1439 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1440 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1441 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1442 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc33: src+1 in the first l2 step and halfH+8 in the final one. */\
1444 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1445 uint64_t half[8 + 9];\
1446 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1447 uint8_t * const halfHV= ((uint8_t*)half);\
1448 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1449 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1450 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1451 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc21: H-lowpass -> V-lowpass, final l2 of halfH with halfHV. */\
1453 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1454 uint64_t half[8 + 9];\
1455 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1456 uint8_t * const halfHV= ((uint8_t*)half);\
1457 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1458 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1459 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc23: as mc21 with halfH+8 in the final l2. */\
1461 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1462 uint64_t half[8 + 9];\
1463 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1464 uint8_t * const halfHV= ((uint8_t*)half);\
1465 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1466 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1467 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc12: H-lowpass -> l2 with src -> V-lowpass written to dst with OPNAME. */\
1469 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1470 uint64_t half[8 + 9];\
1471 uint8_t * const halfH= ((uint8_t*)half);\
1472 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1473 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1474 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* mc32: as mc12 with src+1 in the l2 step. */\
1476 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1477 uint64_t half[8 + 9];\
1478 uint8_t * const halfH= ((uint8_t*)half);\
1479 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1480 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1481 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* mc22: H-lowpass (9 rows) followed by V-lowpass into dst. */\
1483 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1485 uint8_t * const halfH= ((uint8_t*)half);\
1486 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1487 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* ================= 16x16 entry points ================= */\
/* Same structure as the 8x8 family with 16-wide helpers and stride-16 scratch. */\
/* mc00: no filtering, forwards to the plain 16-wide pixel op (16 rows). */\
1489 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1490 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
/* mc10: horizontal lowpass into scratch, then l2 with src. */\
1493 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1495 uint8_t * const half= (uint8_t*)temp;\
1496 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1497 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
/* mc20: horizontal lowpass written straight to dst. */\
1500 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1501 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
/* mc30: like mc10, l2 partner is src+1. */\
1504 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1506 uint8_t * const half= (uint8_t*)temp;\
1507 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1508 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
/* mc01: vertical lowpass into scratch, then l2 with src. */\
1511 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1513 uint8_t * const half= (uint8_t*)temp;\
1514 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1515 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
/* mc02: vertical lowpass written straight to dst. */\
1518 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1519 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
/* mc03: like mc01, l2 partner is src+stride. */\
1522 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1524 uint8_t * const half= (uint8_t*)temp;\
1525 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1526 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
/* Diagonal cases. Scratch layout: halfHV occupies the first 256 bytes, */\
/* halfH (17 rows of 16 bytes) follows at byte offset 256. */\
/* mc11: H-lowpass (17 rows) -> l2 with src -> V-lowpass -> l2(halfH, halfHV). */\
1528 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1529 uint64_t half[16*2 + 17*2];\
1530 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1531 uint8_t * const halfHV= ((uint8_t*)half);\
1532 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1533 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1534 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1535 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc31: as mc11 with src+1 in the first l2 step. */\
1537 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1538 uint64_t half[16*2 + 17*2];\
1539 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1540 uint8_t * const halfHV= ((uint8_t*)half);\
1541 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1542 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1543 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1544 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc13: as mc11, final l2 uses halfH+16 (one scratch row further down). */\
1546 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1547 uint64_t half[16*2 + 17*2];\
1548 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1549 uint8_t * const halfHV= ((uint8_t*)half);\
1550 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1551 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1552 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1553 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc33: src+1 in the first l2 step and halfH+16 in the final one. */\
1555 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1556 uint64_t half[16*2 + 17*2];\
1557 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1558 uint8_t * const halfHV= ((uint8_t*)half);\
1559 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1560 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1561 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1562 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc21: H-lowpass -> V-lowpass, final l2 of halfH with halfHV. */\
1564 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1565 uint64_t half[16*2 + 17*2];\
1566 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1567 uint8_t * const halfHV= ((uint8_t*)half);\
1568 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1569 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1570 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc23: as mc21 with halfH+16 in the final l2. */\
1572 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1573 uint64_t half[16*2 + 17*2];\
1574 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1575 uint8_t * const halfHV= ((uint8_t*)half);\
1576 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1577 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1578 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc12: H-lowpass -> l2 with src -> V-lowpass written to dst with OPNAME. */\
1580 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1581 uint64_t half[17*2];\
1582 uint8_t * const halfH= ((uint8_t*)half);\
1583 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1584 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1585 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* mc32: as mc12 with src+1 in the l2 step. */\
1587 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1588 uint64_t half[17*2];\
1589 uint8_t * const halfH= ((uint8_t*)half);\
1590 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1591 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1592 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* mc22: H-lowpass (17 rows) followed by V-lowpass into dst. */\
1594 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1595 uint64_t half[17*2];\
1596 uint8_t * const halfH= ((uint8_t*)half);\
1597 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1598 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store op "put": emits a single mov<size> from a to b; temp is unused. */
1601 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* Store op "avg" for 3DNow!: load b into temp, pavgusb it into a, then
 * write a back to b. */
1602 #define AVG_3DNOW_OP(a,b,temp, size) \
1603 "mov" #size " " #b ", " #temp " \n\t"\
1604 "pavgusb " #temp ", " #a " \n\t"\
1605 "mov" #size " " #a ", " #b " \n\t"
/* Store op "avg" for MMX2: same load / average / store sequence as the
 * 3DNow! variant, using pavgb instead of pavgusb. */
1606 #define AVG_MMX2_OP(a,b,temp, size) \
1607 "mov" #size " " #b ", " #temp " \n\t"\
1608 "pavgb " #temp ", " #a " \n\t"\
1609 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the qpel helpers: put_ and avg_ use ff_pw_16 as rounder,
 * put_no_rnd_ uses ff_pw_15. QPEL_BASE receives one store op per instruction
 * set (the arguments here line up with OP_MMX2 / OP_3DNOW used in its body);
 * QPEL_OP is expanded once for the 3dnow suffix and once for mmx2. */
1611 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1612 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1613 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1614 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1615 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1616 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1617 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1618 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1619 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1621 /***********************************/
1622 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/* Defines <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX> as a direct forward to
 * <OPNAME>pixels<SIZE><HPEL>(dst, src, stride, SIZE). */
1624 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1625 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1626 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
/* Defines <OPNAME>2tap_qpel<SIZE>_mc<XY>_<MMX> as a call to the _l3_ helper
 * with the source advanced by S0 and S1, S2 passed through as its two
 * trailing arguments (the helper itself is defined elsewhere). */
1628 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1629 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1630 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
/* Generates the full set of 2-tap qpel mcXY functions for one
 * (OPNAME, SIZE, MMX) combination. */
1633 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
/* Half-sample positions map onto the x2 / y2 / xy2 pixel ops; note that */\
/* mc22 always uses the _xy2_mmx variant regardless of MMX. */\
1634 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1635 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1636 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
/* Aliases: mc00 reuses the regular qpel mc00, mc21 reuses 2tap mc20 and */\
/* mc12 reuses 2tap mc02. */\
1637 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1638 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1639 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1640 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1641 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1642 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
/* mc32: y2 op applied one sample to the right (src+1). */\
1643 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1644 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
/* mc23: x2 op applied one row down (src+stride). */\
1646 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1647 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
/* Remaining positions go through the _l3_ helper: (XY, S0, S1, S2). */\
1649 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1650 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1651 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1652 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1653 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1654 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1655 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1656 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Expand the 2-tap family for put_/avg_, sizes 16 and 8, for both the mmx2
 * and the 3dnow suffix. */
1658 QPEL_2TAP(put_, 16, mmx2)
1659 QPEL_2TAP(avg_, 16, mmx2)
1660 QPEL_2TAP(put_, 8, mmx2)
1661 QPEL_2TAP(avg_, 8, mmx2)
1662 QPEL_2TAP(put_, 16, 3dnow)
1663 QPEL_2TAP(avg_, 16, 3dnow)
1664 QPEL_2TAP(put_, 8, 3dnow)
1665 QPEL_2TAP(avg_, 8, 3dnow)
/* No-op function: takes nothing, does nothing, returns immediately. */
1669 static void just_return(void) { return; }
/* Function type of the edge-emulation cores. All geometry arguments are
 * passed as x86_reg. The two externs below declare the MMX and SSE cores;
 * no bodies for them appear here. */
1673 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1674 x86_reg linesize, x86_reg start_y,
1675 x86_reg end_y, x86_reg block_h,
1676 x86_reg start_x, x86_reg end_x,
1678 extern emu_edge_core_func ff_emu_edge_core_mmx;
1679 extern emu_edge_core_func ff_emu_edge_core_sse;
/*
 * Common front end for edge emulation: works out which part of a
 * block_w x block_h block at (src_x, src_y) overlaps a w x h picture and
 * hands the job to core_fn.
 *
 * buf      - destination buffer passed on to core_fn
 * src      - pointer to the block's nominal top-left source sample
 * linesize - line size used for row addressing of src
 * core_fn  - core routine that performs the actual copy/replication
 */
1681 static av_always_inline
1682 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1683 int block_w, int block_h,
1684 int src_x, int src_y, int w, int h,
1685 emu_edge_core_func *core_fn)
1687 int start_y, start_x, end_y, end_x, src_y_add=0;
/* Blocks that do not touch the picture at all are pulled back to its border:
 * vertically by recording a row offset in src_y_add, horizontally by moving
 * src directly. NOTE(review): the conditions guarding the h-1-src_y branch
 * are on lines not visible here; presumably src_y >= h. */
1690 src_y_add = h-1-src_y;
1692 }else if(src_y<=-block_h){
1693 src_y_add = 1-block_h-src_y;
1699 }else if(src_x<=-block_w){
1700 src+= (1-block_w-src_x);
/* [start, end) is the sub-rectangle of the block that lies inside the
 * picture; the asserts require it to be non-empty. */
1704 start_y= FFMAX(0, -src_y);
1705 start_x= FFMAX(0, -src_x);
1706 end_y= FFMIN(block_h, h-src_y);
1707 end_x= FFMIN(block_w, w-src_x);
1708 assert(start_x < end_x && block_w > 0);
1709 assert(start_y < end_y && block_h > 0);
1711 // fill in the to-be-copied part plus all above/below
/* Advance src to the first sample that actually exists in the picture. */
1712 src += (src_y_add+start_y)*linesize + start_x;
1714 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
/* Edge emulation using the MMX core: forwards all arguments to
 * emulated_edge_mc() with ff_emu_edge_core_mmx as core_fn. */
1719 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1720 int block_w, int block_h,
1721 int src_x, int src_y, int w, int h)
1723 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1724 w, h, &ff_emu_edge_core_mmx);
/* Edge emulation using the SSE core: forwards all arguments to
 * emulated_edge_mc() with ff_emu_edge_core_sse as core_fn. */
1728 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1729 int block_w, int block_h,
1730 int src_x, int src_y, int w, int h)
1732 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1733 w, h, &ff_emu_edge_core_sse);
1735 #endif /* HAVE_YASM */
/* Function type of an edge-emulation entry point as consumed by gmc():
 * same parameter list as emulated_edge_mc_mmx / emulated_edge_mc_sse. */
1737 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1738 int linesize, int block_w, int block_h,
1739 int src_x, int src_y, int w, int h);
/*
 * MMX global motion compensation with bilinear interpolation.
 *
 * ox/oy are the fixed-point source offsets and dxx/dxy/dyx/dyy their
 * per-column / per-row increments (16+shift fractional bits, as used by the
 * shifts below); r is the rounding term added before the final shift.
 * emu_edge_fn is called when the referenced area is not fully inside the
 * width x height picture. Cases the 16-bit MMX arithmetic cannot handle are
 * delegated to ff_gmc_c().
 * NOTE(review): `w`, `x` and `y` are declared on lines not visible here.
 */
1741 static av_always_inline
1742 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1743 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1744 emulated_edge_mc_func *emu_edge_fn)
/* Integer part of the offset. */
1747 const int ix = ox>>(16+shift);
1748 const int iy = oy>>(16+shift);
/* Offsets and increments with the low 4 bits dropped, for 16-bit lanes. */
1749 const int oxs = ox>>4;
1750 const int oys = oy>>4;
1751 const int dxxs = dxx>>4;
1752 const int dxys = dxy>>4;
1753 const int dyxs = dyx>>4;
1754 const int dyys = dyy>>4;
/* Four-lane copies of r and of the per-row increments, read by the asm. */
1755 const uint16_t r4[4] = {r,r,r,r};
1756 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1757 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1758 const uint64_t shift2 = 2*shift;
/* Variable-length scratch buffer handed to emu_edge_fn. */
1759 uint8_t edge_buf[(h+1)*stride];
/* Total drift of the offset across the block (w-1 columns, h-1 rows). */
1762 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1763 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1764 const int dxh = dxy*(h-1);
1765 const int dyw = dyx*(w-1);
1766 if( // non-constant fullpel offset (3% of blocks)
1767 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1768 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1769 // uses more than 16 bits of subpel mv (only at huge resolution)
1770 || (dxx|dxy|dyx|dyy)&15 )
1772 //FIXME could still use mmx for some of the rows
1773 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
/* Move to the integer source position; the unsigned compares also catch
 * negative ix / iy. */
1777 src += ix + iy*stride;
1778 if( (unsigned)ix >= width-w ||
1779 (unsigned)iy >= height-h )
1781 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* Load the asm's first operand into mm6 and replicate its low word into all
 * four lanes; mm7 is zeroed for later byte->word unpacking. (The operand
 * list of this asm statement is not visible here; the comments further down
 * refer to the mm6 value as "s".) */
1786 "movd %0, %%mm6 \n\t"
1787 "pxor %%mm7, %%mm7 \n\t"
1788 "punpcklwd %%mm6, %%mm6 \n\t"
1789 "punpcklwd %%mm6, %%mm6 \n\t"
/* Four output columns per iteration; dx4 / dy4 hold the running sub-sample
 * positions of those columns as 16-bit values. */
1793 for(x=0; x<w; x+=4){
1794 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1795 oxs - dxys + dxxs*(x+1),
1796 oxs - dxys + dxxs*(x+2),
1797 oxs - dxys + dxxs*(x+3) };
1798 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1799 oys - dyys + dyxs*(x+1),
1800 oys - dyys + dyxs*(x+2),
1801 oys - dyys + dyxs*(x+3) };
/* Step dx4 / dy4 by the per-row increments, write them back, and keep the
 * top 4 bits of each lane in mm4 (dx) and mm5 (dy). */
1805 "movq %0, %%mm4 \n\t"
1806 "movq %1, %%mm5 \n\t"
1807 "paddw %2, %%mm4 \n\t"
1808 "paddw %3, %%mm5 \n\t"
1809 "movq %%mm4, %0 \n\t"
1810 "movq %%mm5, %1 \n\t"
1811 "psrlw $12, %%mm4 \n\t"
1812 "psrlw $12, %%mm5 \n\t"
1813 : "+m"(*dx4), "+m"(*dy4)
1814 : "m"(*dxy4), "m"(*dyy4)
/* Build the four bilinear weights from s, dx and dy. */
1818 "movq %%mm6, %%mm2 \n\t"
1819 "movq %%mm6, %%mm1 \n\t"
1820 "psubw %%mm4, %%mm2 \n\t"
1821 "psubw %%mm5, %%mm1 \n\t"
1822 "movq %%mm2, %%mm0 \n\t"
1823 "movq %%mm4, %%mm3 \n\t"
1824 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1825 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1826 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1827 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
/* Multiply each weight with its neighbour sample (4 bytes each, widened to
 * words): lower row first, then upper row. */
1829 "movd %4, %%mm5 \n\t"
1830 "movd %3, %%mm4 \n\t"
1831 "punpcklbw %%mm7, %%mm5 \n\t"
1832 "punpcklbw %%mm7, %%mm4 \n\t"
1833 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1834 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1836 "movd %2, %%mm5 \n\t"
1837 "movd %1, %%mm4 \n\t"
1838 "punpcklbw %%mm7, %%mm5 \n\t"
1839 "punpcklbw %%mm7, %%mm4 \n\t"
1840 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1841 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
/* Sum the four products plus the rounding term r4, shift right by shift2,
 * pack to bytes and store 4 output pixels. */
1842 "paddw %5, %%mm1 \n\t"
1843 "paddw %%mm3, %%mm2 \n\t"
1844 "paddw %%mm1, %%mm0 \n\t"
1845 "paddw %%mm2, %%mm0 \n\t"
1847 "psrlw %6, %%mm0 \n\t"
1848 "packuswb %%mm0, %%mm0 \n\t"
1849 "movd %%mm0, %0 \n\t"
/* Operands: %0 = dst[x+y*stride]; %1..%4 = src[0], src[1], src[stride],
 * src[stride+1]; %5 = r4; %6 = shift2. */
1851 : "=m"(dst[x+y*stride])
1852 : "m"(src[0]), "m"(src[1]),
1853 "m"(src[stride]), "m"(src[stride+1]),
1854 "m"(*r4), "m"(shift2)
/* gmc() bound to emulated_edge_mc_mmx as its edge-emulation routine. */
1864 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1865 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1867 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1868 width, height, &emulated_edge_mc_mmx);
/* gmc() bound to emulated_edge_mc_sse as its edge-emulation routine. */
1871 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1872 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1874 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1875 width, height, &emulated_edge_mc_sse);
/* Alternate gmc_mmx definition: gmc() bound to ff_emulated_edge_mc_8.
 * NOTE(review): this shares its name with the definition above, so the two
 * are presumably selected by a preprocessor conditional not visible here. */
1878 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1879 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1881 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1882 width, height, &ff_emulated_edge_mc_8);
/* Defines a function name(mem, stride, h) that issues the instruction `op`
 * with the byte at p (initialised from mem) as its memory operand.
 * NOTE(review): how stride and h are used is on lines not visible here. */
1886 #define PREFETCH(name, op) \
1887 static void name(void *mem, int stride, int h){\
1888 const uint8_t *p= mem;\
1890 __asm__ volatile(#op" %0" :: "m"(*p));\
/* prefetch_mmx2 uses the prefetcht0 instruction, prefetch_3dnow uses prefetch. */
1894 PREFETCH(prefetch_mmx2, prefetcht0)
1895 PREFETCH(prefetch_3dnow, prefetch)
1898 #include "h264_qpel_mmx.c"
/* Prototypes of the 8-bit chroma motion-compensation routines (H.264 and
 * RV40 variants, block widths 8, 4 and 2). They are only declared here; all
 * share the signature (dst, src, stride, h, x, y). */
1900 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1901 int stride, int h, int x, int y);
1902 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
1903 int stride, int h, int x, int y);
1904 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1905 int stride, int h, int x, int y);
1906 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
1907 int stride, int h, int x, int y);
1908 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1909 int stride, int h, int x, int y);
1910 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
1911 int stride, int h, int x, int y);
/* Width-4 variants. */
1913 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1914 int stride, int h, int x, int y);
1915 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1916 int stride, int h, int x, int y);
1917 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1918 int stride, int h, int x, int y);
1919 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1920 int stride, int h, int x, int y);
1921 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1922 int stride, int h, int x, int y);
1923 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1924 int stride, int h, int x, int y);
/* Width-2 variants (H.264, MMX2 only). */
1926 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1927 int stride, int h, int x, int y);
1928 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1929 int stride, int h, int x, int y);
/* SSSE3 variants. */
1931 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1932 int stride, int h, int x, int y);
1933 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1934 int stride, int h, int x, int y);
1936 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1937 int stride, int h, int x, int y);
1938 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1939 int stride, int h, int x, int y);
/* CHROMA_MC(OP, NUM, DEPTH, OPT): declares the prototype
 *     void ff_<OP>_h264_chroma_mc<NUM>_<DEPTH>_<OPT>(dst, src, stride, h, x, y);
 * using the same signature as the 8-bit chroma MC prototypes. It only emits a
 * declaration; the definitions live elsewhere. */
1941 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1942 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1943 (uint8_t *dst, uint8_t *src,\
1944 int stride, int h, int x, int y);
/* Declarations with DEPTH = 10; dsputil_init_mmx() installs these when
 * bit_depth == 10. */
1946 CHROMA_MC(put, 2, 10, mmxext)
1947 CHROMA_MC(avg, 2, 10, mmxext)
1948 CHROMA_MC(put, 4, 10, mmxext)
1949 CHROMA_MC(avg, 4, 10, mmxext)
1950 CHROMA_MC(put, 8, 10, sse2)
1951 CHROMA_MC(avg, 8, 10, sse2)
1952 CHROMA_MC(put, 8, 10, avx)
1953 CHROMA_MC(avg, 8, 10, avx)
/* CAVS qpel "mc00" entry points. Each one forwards to the matching put/avg
 * pixel helper with the height fixed to the block size (8 or 16).
 * Note that although these functions are named *_mmx2, the visible calls go
 * to the *_mmx helpers. */

/* 8x8 put: forwards to put_pixels8_mmx with h = 8. */
1956 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1957 put_pixels8_mmx(dst, src, stride, 8);
/* 8x8 avg: forwards to avg_pixels8_mmx with h = 8. */
1959 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1960 avg_pixels8_mmx(dst, src, stride, 8);
/* 16x16 put: forwards to put_pixels16_mmx with h = 16. */
1962 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1963 put_pixels16_mmx(dst, src, stride, 16);
/* 16x16 avg: forwards to avg_pixels16_mmx with h = 16. */
1965 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1966 avg_pixels16_mmx(dst, src, stride, 16);
/* VC-1 mspel "mc00" entry points: forward to the 8-wide put/avg pixel helpers
 * with h = 8. The `rnd` parameter is not referenced in the visible calls. */

/* put: forwards to put_pixels8_mmx. */
1970 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1971 put_pixels8_mmx(dst, src, stride, 8);
/* avg: forwards to avg_pixels8_mmx2. */
1973 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1974 avg_pixels8_mmx2(dst, src, stride, 8);
1977 /* XXX: those functions should be suppressed ASAP when all IDCTs are
/* IDCT "put"/"add" wrappers. Every wrapper follows the same two-step shape:
 *   1. run an IDCT routine on `block` (presumably transforming it in place,
 *      since the same buffer is then read as the source -- TODO confirm);
 *   2. call ff_put_pixels_clamped_mmx (put) or ff_add_pixels_clamped_mmx (add)
 *      with (block, dest, line_size).
 * dsputil_init_mmx() installs these as c->idct_put / c->idct_add. */

/* libmpeg2 IDCT, MMX version: put. */
1980 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1982 ff_mmx_idct (block);
1983 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2 IDCT, MMX version: add. */
1985 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1987 ff_mmx_idct (block);
1988 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2 IDCT, MMXEXT version: put. */
1990 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1992 ff_mmxext_idct (block);
1993 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2 IDCT, MMXEXT version: add. */
1995 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1997 ff_mmxext_idct (block);
1998 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT, MMX version: put. */
2001 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
2003 ff_idct_xvid_mmx (block);
2004 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT, MMX version: add. */
2006 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
2008 ff_idct_xvid_mmx (block);
2009 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT, MMX2 version: put. */
2011 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2013 ff_idct_xvid_mmx2 (block);
2014 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid IDCT, MMX2 version: add. */
2016 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2018 ff_idct_xvid_mmx2 (block);
2019 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* 3DNow! Vorbis inverse channel coupling: rewrites mag[] and ang[] in place,
 * two floats per loop iteration (one 64-bit MMX register each).
 *
 * Per element, with S = 0x80000000 where m >= 0 (else 0) and a' = a ^ S:
 *     ang = m + (a >= 0 ? a' : 0)
 *     mag = m - (a >= 0 ? 0  : a')
 *
 * mm7 is zeroed once before the loop and serves as the 0.0 comparand.
 * The asm operands are AT&T order (source, destination), so
 * "pfcmpge %mm7, %mm2" tests mm2 >= mm7.
 * femms is issued after the loop to leave MMX state. */
2022 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2025 __asm__ volatile("pxor %%mm7, %%mm7":);
2026 for(i=0; i<blocksize; i+=2) {
2028 "movq %0, %%mm0 \n\t"
2029 "movq %1, %%mm1 \n\t"
2030 "movq %%mm0, %%mm2 \n\t"
2031 "movq %%mm1, %%mm3 \n\t"
2032 "pfcmpge %%mm7, %%mm2 \n\t" // mask: 0.0 <= m
2033 "pfcmpge %%mm7, %%mm3 \n\t" // mask: 0.0 <= a
2034 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2035 "pxor %%mm2, %%mm1 \n\t"
2036 "movq %%mm3, %%mm4 \n\t"
2037 "pand %%mm1, %%mm3 \n\t"
2038 "pandn %%mm1, %%mm4 \n\t"
2039 "pfadd %%mm0, %%mm3 \n\t" // ang = m + ((a >= 0) & (a ^ S))
2040 "pfsub %%mm4, %%mm0 \n\t" // mag = m - ((a < 0) & (a ^ S))
2041 "movq %%mm3, %1 \n\t"
2042 "movq %%mm0, %0 \n\t"
2043 :"+m"(mag[i]), "+m"(ang[i])
2047 __asm__ volatile("femms");
/* SSE Vorbis inverse channel coupling: rewrites mag[] and ang[] in place,
 * four floats per loop iteration.
 *
 * xmm5 is loaded once with ff_pdw_80000000 (sign-bit mask in every lane).
 * Per element, with S = 0x80000000 where m >= 0 (else 0) and a' = a ^ S:
 *     ang = m + (a >= 0 ? a' : 0)
 *     mag = m - (a >= 0 ? 0  : a')
 *
 * The asm operands are AT&T order (source, destination), so
 * "cmpleps %xmm0, %xmm2" with xmm2 == 0 tests 0.0 <= xmm0.
 * movaps is used on mag[i]/ang[i], which requires 16-byte alignment of both
 * arrays (NOTE(review): alignment is assumed from the instruction choice). */
2049 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2054 "movaps %0, %%xmm5 \n\t"
2055 ::"m"(ff_pdw_80000000[0])
2057 for(i=0; i<blocksize; i+=4) {
2059 "movaps %0, %%xmm0 \n\t"
2060 "movaps %1, %%xmm1 \n\t"
2061 "xorps %%xmm2, %%xmm2 \n\t"
2062 "xorps %%xmm3, %%xmm3 \n\t"
2063 "cmpleps %%xmm0, %%xmm2 \n\t" // mask: 0.0 <= m
2064 "cmpleps %%xmm1, %%xmm3 \n\t" // mask: 0.0 <= a
2065 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2066 "xorps %%xmm2, %%xmm1 \n\t"
2067 "movaps %%xmm3, %%xmm4 \n\t"
2068 "andps %%xmm1, %%xmm3 \n\t"
2069 "andnps %%xmm1, %%xmm4 \n\t"
2070 "addps %%xmm0, %%xmm3 \n\t" // ang = m + ((a >= 0) & (a ^ S))
2071 "subps %%xmm4, %%xmm0 \n\t" // mag = m - ((a < 0) & (a ^ S))
2072 "movaps %%xmm3, %1 \n\t"
2073 "movaps %%xmm0, %0 \n\t"
2074 :"+m"(mag[i]), "+m"(ang[i])
/* MIX5(mono, stereo): inline-asm body for the 5-channel AC-3 downmix special
 * cases, expanded inside ac3_downmix_sse().
 *
 * Operands as used by the visible asm: %1 is samples[0]+len and %2 is matrix;
 * %0 is the byte offset added to %1 (presumably the loop counter `i` declared
 * on a line not visible here -- TODO confirm).
 *
 * Setup: three coefficients are read from byte offsets 0, 8 and 24 of
 * `matrix` and broadcast to all lanes of xmm5, xmm6 and xmm7.
 * Per step: four floats are loaded from each of five channel rows spaced
 * 0x400 bytes apart, scaled (rows 0 and 2 by xmm5, row 1 by xmm6, rows 3 and
 * 4 by xmm7) and summed.
 *   - Instructions wrapped in stereo(...) add row 1 into the first sum and
 *     store the second sum at +0x400.
 *   - The instruction wrapped in mono(...) folds the second sum into the
 *     first so that a single output row is written.
 * `mono` and `stereo` are macro arguments applied to instruction strings;
 * presumably the caller passes helpers that keep or drop their argument.
 * All eight xmm registers are declared clobbered through XMM_CLOBBERS. */
2083 #define MIX5(mono,stereo)\
2085 "movss 0(%2), %%xmm5 \n"\
2086 "movss 8(%2), %%xmm6 \n"\
2087 "movss 24(%2), %%xmm7 \n"\
2088 "shufps $0, %%xmm5, %%xmm5 \n"\
2089 "shufps $0, %%xmm6, %%xmm6 \n"\
2090 "shufps $0, %%xmm7, %%xmm7 \n"\
2092 "movaps (%0,%1), %%xmm0 \n"\
2093 "movaps 0x400(%0,%1), %%xmm1 \n"\
2094 "movaps 0x800(%0,%1), %%xmm2 \n"\
2095 "movaps 0xc00(%0,%1), %%xmm3 \n"\
2096 "movaps 0x1000(%0,%1), %%xmm4 \n"\
2097 "mulps %%xmm5, %%xmm0 \n"\
2098 "mulps %%xmm6, %%xmm1 \n"\
2099 "mulps %%xmm5, %%xmm2 \n"\
2100 "mulps %%xmm7, %%xmm3 \n"\
2101 "mulps %%xmm7, %%xmm4 \n"\
2102 stereo("addps %%xmm1, %%xmm0 \n")\
2103 "addps %%xmm1, %%xmm2 \n"\
2104 "addps %%xmm3, %%xmm0 \n"\
2105 "addps %%xmm4, %%xmm2 \n"\
2106 mono("addps %%xmm2, %%xmm0 \n")\
2107 "movaps %%xmm0, (%0,%1) \n"\
2108 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2112 :"r"(samples[0]+len), "r"(matrix)\
2113 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2114 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
/* MIX_MISC(stereo): inline-asm body for the generic AC-3 downmix path,
 * expanded inside ac3_downmix_sse().
 *
 * Operands: %0 = i (read/write), %1 = j and %2 = k (early-clobber scratch),
 * %3 = samples[0]+len, %4 = matrix_simd+in_ch, %5 = -32*(in_ch-1).
 *
 * Visible steps: load four floats of the first channel at (%3,%0) and scale
 * them by xmm4 (and by xmm5 into xmm1 for the stereo output); point %1 at the
 * next channel row (+1024 bytes); for further rows, multiply the samples at
 * (%1) by the pre-broadcast coefficients at (%4,%2) and 16(%4,%2) and
 * accumulate; finally store xmm0 back at (%3,%0) and, for stereo, xmm1 at
 * +1024 bytes.
 * NOTE(review): the loop labels, counter updates and the initialisation of
 * xmm4/xmm5 and %2 are on lines not visible in this excerpt.
 * Instructions wrapped in stereo(...) exist only for the two-output case. */
2118 #define MIX_MISC(stereo)\
2121 "movaps (%3,%0), %%xmm0 \n"\
2122 stereo("movaps %%xmm0, %%xmm1 \n")\
2123 "mulps %%xmm4, %%xmm0 \n"\
2124 stereo("mulps %%xmm5, %%xmm1 \n")\
2125 "lea 1024(%3,%0), %1 \n"\
2128 "movaps (%1), %%xmm2 \n"\
2129 stereo("movaps %%xmm2, %%xmm3 \n")\
2130 "mulps (%4,%2), %%xmm2 \n"\
2131 stereo("mulps 16(%4,%2), %%xmm3 \n")\
2132 "addps %%xmm2, %%xmm0 \n"\
2133 stereo("addps %%xmm3, %%xmm1 \n")\
2137 "movaps %%xmm0, (%3,%0) \n"\
2138 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2141 :"+&r"(i), "=&r"(j), "=&r"(k)\
2142 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
/* SSE AC-3 downmix: mixes `in_ch` input channels of `samples` (rows of 256
 * floats) down to `out_ch` channels using the coefficient table `matrix`
 * (one [2]-float row per input channel).
 *
 * Three paths are selected below:
 *   - 5 inputs -> 2 outputs with a specific zero/equality pattern in matrix;
 *   - 5 inputs -> 1 output with matching coefficient pairs;
 *   - otherwise a generic path that first expands the coefficients into the
 *     16-byte aligned matrix_simd table.
 * NOTE(review): the statements executed inside each branch are only partly
 * visible in this excerpt (presumably the MIX5 / MIX_MISC expansions). */
2146 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
/* View the float coefficients as ints so they can be tested for all-zero bits
 * and compared bitwise with integer operators.
 * NOTE(review): this reads float storage through an int lvalue, which is a
 * strict-aliasing concern -- confirm the build disables such optimisations. */
2148 int (*matrix_cmp)[2] = (int(*)[2])matrix;
/* Negative byte offset; the asm addresses samples relative to samples[0]+len. */
2151 i = -len*sizeof(float);
/* 5->2 special case: matrix[0][1], [2][0], [3][1], [4][0] must be zero bits,
 * and matrix[1][0]==matrix[1][1], matrix[0][0]==matrix[2][1] bitwise. */
2152 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
/* 5->1 special case: channels 0/2 and 3/4 share a coefficient. */
2154 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
/* Generic path: each scalar coefficient is broadcast to a 4-float vector. */
2157 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
/* Byte size of the in_ch coefficient pairs in `matrix`. */
2158 j = 2*in_ch*sizeof(float);
// Load one coefficient pair from matrix (%2 + %0), splat each value across a
// register and store both vectors into matrix_simd (%1 + 4 * %0).
2162 "movss (%2,%0), %%xmm4 \n"
2163 "movss 4(%2,%0), %%xmm5 \n"
2164 "shufps $0, %%xmm4, %%xmm4 \n"
2165 "shufps $0, %%xmm5, %%xmm5 \n"
2166 "movaps %%xmm4, (%1,%0,4) \n"
2167 "movaps %%xmm5, 16(%1,%0,4) \n"
2170 :"r"(matrix_simd), "r"(matrix)
/* 3DNow! element-wise product: dst[k] = src0[k] * src1[k].
 * The visible asm handles four floats per step (two 64-bit registers) at byte
 * offset %0 from each array; %1 = dst, %2 = src0, %3 = src1.
 * `i` starts at the byte offset of the last group of four floats.
 * NOTE(review): the loop label, counter update and any trailing femms are on
 * lines not visible in this excerpt; `len` is presumably a multiple of 4. */
2181 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2182 x86_reg i = (len-4)*4;
2185 "movq (%2,%0), %%mm0 \n\t"
2186 "movq 8(%2,%0), %%mm1 \n\t"
2187 "pfmul (%3,%0), %%mm0 \n\t"
2188 "pfmul 8(%3,%0), %%mm1 \n\t"
2189 "movq %%mm0, (%1,%0) \n\t"
2190 "movq %%mm1, 8(%1,%0) \n\t"
2195 :"r"(dst), "r"(src0), "r"(src1)
/* SSE element-wise product: dst[k] = src0[k] * src1[k].
 * The visible asm handles eight floats per step (two xmm registers) at byte
 * offset %0 from each array; %1 = dst, %2 = src0, %3 = src1.
 * `i` starts at the byte offset of the last group of eight floats.
 * movaps/mulps with memory operands require 16-byte aligned arrays.
 * NOTE(review): the loop label and counter update are on lines not visible in
 * this excerpt; `len` is presumably a multiple of 8. */
2199 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2200 x86_reg i = (len-8)*4;
2203 "movaps (%2,%0), %%xmm0 \n\t"
2204 "movaps 16(%2,%0), %%xmm1 \n\t"
2205 "mulps (%3,%0), %%xmm0 \n\t"
2206 "mulps 16(%3,%0), %%xmm1 \n\t"
2207 "movaps %%xmm0, (%1,%0) \n\t"
2208 "movaps %%xmm1, 16(%1,%0) \n\t"
2212 :"r"(dst), "r"(src0), "r"(src1)
/* Extended-3DNow! product with one operand reversed: multiplies src0 by src1
 * read in reverse order and stores the result in dst.
 * Operands: %0 = i (byte offset into dst/src0), %1 = src1 (read/write, so the
 * asm is allowed to advance it), %2 = dst, %3 = src0.
 * Each step loads 16 bytes from src1 as two halves in swapped order, with
 * pswapd exchanging the two floats inside each half, which reverses the four
 * floats before the multiply.
 * femms is issued afterwards to leave MMX state.
 * NOTE(review): the loop label and the updates of %0/%1 are on lines not
 * visible in this excerpt. */
2217 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2218 x86_reg i = len*4-16;
2221 "pswapd 8(%1), %%mm0 \n\t"
2222 "pswapd (%1), %%mm1 \n\t"
2223 "pfmul (%3,%0), %%mm0 \n\t"
2224 "pfmul 8(%3,%0), %%mm1 \n\t"
2225 "movq %%mm0, (%2,%0) \n\t"
2226 "movq %%mm1, 8(%2,%0) \n\t"
2230 :"+r"(i), "+r"(src1)
2231 :"r"(dst), "r"(src0)
2233 __asm__ volatile("femms");
/* SSE product with one operand reversed: multiplies src0 by src1 read in
 * reverse order and stores the result in dst.
 * Operands: %0 = i (byte offset into dst/src0), %1 = src1 (read/write, so the
 * asm is allowed to advance it), %2 = dst, %3 = src0.
 * Each step loads 32 bytes from src1 as two vectors in swapped order, and
 * "shufps $0x1b" reverses the four floats inside each vector, so eight floats
 * are reversed before the multiply.
 * NOTE(review): the loop label and the updates of %0/%1 are on lines not
 * visible in this excerpt. */
2235 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2236 x86_reg i = len*4-32;
2239 "movaps 16(%1), %%xmm0 \n\t"
2240 "movaps (%1), %%xmm1 \n\t"
2241 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2242 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2243 "mulps (%3,%0), %%xmm0 \n\t"
2244 "mulps 16(%3,%0), %%xmm1 \n\t"
2245 "movaps %%xmm0, (%2,%0) \n\t"
2246 "movaps %%xmm1, 16(%2,%0) \n\t"
2250 :"+r"(i), "+r"(src1)
2251 :"r"(dst), "r"(src0)
/* 3DNow! fused multiply-add over arrays: dst[k] = src0[k] * src1[k] + src2[k].
 * The visible asm handles four floats per step at byte offset %0;
 * %1 = dst, %2 = src0, %3 = src1, %4 = src2.
 * `i` starts at the byte offset of the last group of four floats.
 * femms is issued afterwards to leave MMX state.
 * NOTE(review): the loop label and counter update are on lines not visible in
 * this excerpt; `len` is presumably a multiple of 4. */
2255 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2256 const float *src2, int len){
2257 x86_reg i = (len-4)*4;
2260 "movq (%2,%0), %%mm0 \n\t"
2261 "movq 8(%2,%0), %%mm1 \n\t"
2262 "pfmul (%3,%0), %%mm0 \n\t"
2263 "pfmul 8(%3,%0), %%mm1 \n\t"
2264 "pfadd (%4,%0), %%mm0 \n\t"
2265 "pfadd 8(%4,%0), %%mm1 \n\t"
2266 "movq %%mm0, (%1,%0) \n\t"
2267 "movq %%mm1, 8(%1,%0) \n\t"
2271 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2274 __asm__ volatile("femms");
/* SSE fused multiply-add over arrays: dst[k] = src0[k] * src1[k] + src2[k].
 * The visible asm handles eight floats per step at byte offset %0;
 * %1 = dst, %2 = src0, %3 = src1, %4 = src2.
 * `i` starts at the byte offset of the last group of eight floats.
 * The aligned memory operands require 16-byte aligned arrays.
 * NOTE(review): the loop label and counter update are on lines not visible in
 * this excerpt; `len` is presumably a multiple of 8. */
2276 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2277 const float *src2, int len){
2278 x86_reg i = (len-8)*4;
2281 "movaps (%2,%0), %%xmm0 \n\t"
2282 "movaps 16(%2,%0), %%xmm1 \n\t"
2283 "mulps (%3,%0), %%xmm0 \n\t"
2284 "mulps 16(%3,%0), %%xmm1 \n\t"
2285 "addps (%4,%0), %%xmm0 \n\t"
2286 "addps 16(%4,%0), %%xmm1 \n\t"
2287 "movaps %%xmm0, (%1,%0) \n\t"
2288 "movaps %%xmm1, 16(%1,%0) \n\t"
2292 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* Extended-3DNow! windowed overlap: combines src0 and src1 with the window
 * `win` and writes 2*len outputs around dst+len, working from both ends
 * towards the middle, two floats per step on each side.
 *
 * Input operands: %2 = dst+len, %3 = src0+len, %4 = src1, %5 = win+len.
 * %0 and %1 are the two byte offsets used on the low and high side
 * (presumably `i` and `j`; `i` is declared on a line not visible here).
 * pswapd reverses the float pair read at offset %1 so it lines up with the
 * pair read at offset %0, and the sum is swapped back before being stored.
 * NOTE(review): the loop label, offset updates and any femms are on lines not
 * visible in this excerpt. */
2298 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2299 const float *win, int len){
2301 x86_reg j = len*4-8;
2304 "pswapd (%5,%1), %%mm1 \n"
2305 "movq (%5,%0), %%mm0 \n"
2306 "pswapd (%4,%1), %%mm5 \n"
2307 "movq (%3,%0), %%mm4 \n"
2308 "movq %%mm0, %%mm2 \n"
2309 "movq %%mm1, %%mm3 \n"
2310 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2311 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2312 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2313 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2314 "pfadd %%mm3, %%mm2 \n"
2315 "pfsub %%mm0, %%mm1 \n"
2316 "pswapd %%mm2, %%mm2 \n"
2317 "movq %%mm1, (%2,%0) \n"
2318 "movq %%mm2, (%2,%1) \n"
2324 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/* SSE windowed overlap: combines src0 and src1 with the window `win` and
 * writes 2*len outputs around dst+len, working from both ends towards the
 * middle, four floats per step on each side.
 *
 * Input operands: %2 = dst+len, %3 = src0+len, %4 = src1, %5 = win+len.
 * %0 and %1 are the two byte offsets used on the low and high side
 * (presumably `i` and `j`; `i` is declared on a line not visible here).
 * "shufps $0x1b" reverses the four floats read at offset %1 so they line up
 * with the vector read at offset %0, and the sum is reversed back before
 * being stored.
 * NOTE(review): the loop label and offset updates are on lines not visible in
 * this excerpt. */
2328 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2329 const float *win, int len){
2331 x86_reg j = len*4-16;
2334 "movaps (%5,%1), %%xmm1 \n"
2335 "movaps (%5,%0), %%xmm0 \n"
2336 "movaps (%4,%1), %%xmm5 \n"
2337 "movaps (%3,%0), %%xmm4 \n"
2338 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2339 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2340 "movaps %%xmm0, %%xmm2 \n"
2341 "movaps %%xmm1, %%xmm3 \n"
2342 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2343 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2344 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2345 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2346 "addps %%xmm3, %%xmm2 \n"
2347 "subps %%xmm0, %%xmm1 \n"
2348 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2349 "movaps %%xmm1, (%2,%0) \n"
2350 "movaps %%xmm2, (%2,%1) \n"
2355 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2358 #endif /* HAVE_6REGS */
/* SSE clamp: dst[k] = src[k] limited to the range [min, max].
 * `min` and `max` are read from memory (%3, %4) and broadcast to every lane
 * of xmm4 and xmm5. Each step then processes 16 floats (four xmm registers):
 * maxps against xmm4 applies the lower bound, minps against xmm5 the upper
 * bound, and the results are stored at the same byte offset in dst.
 * Operands: %1 = dst, %2 = src; %0 is the byte offset (presumably `i`).
 * `i` starts at the byte offset of the last group of 16 floats.
 * NOTE(review): the rest of the signature, the loop label and the counter
 * update are on lines not visible in this excerpt; `len` is presumably a
 * multiple of 16 and the arrays 16-byte aligned (movaps). */
2360 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2363 x86_reg i = (len-16)*4;
2365 "movss %3, %%xmm4 \n"
2366 "movss %4, %%xmm5 \n"
2367 "shufps $0, %%xmm4, %%xmm4 \n"
2368 "shufps $0, %%xmm5, %%xmm5 \n"
2370 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2371 "movaps 16(%2,%0), %%xmm1 \n\t"
2372 "movaps 32(%2,%0), %%xmm2 \n\t"
2373 "movaps 48(%2,%0), %%xmm3 \n\t"
2374 "maxps %%xmm4, %%xmm0 \n\t"
2375 "maxps %%xmm4, %%xmm1 \n\t"
2376 "maxps %%xmm4, %%xmm2 \n\t"
2377 "maxps %%xmm4, %%xmm3 \n\t"
2378 "minps %%xmm5, %%xmm0 \n\t"
2379 "minps %%xmm5, %%xmm1 \n\t"
2380 "minps %%xmm5, %%xmm2 \n\t"
2381 "minps %%xmm5, %%xmm3 \n\t"
2382 "movaps %%xmm0, (%1,%0) \n\t"
2383 "movaps %%xmm1, 16(%1,%0) \n\t"
2384 "movaps %%xmm2, 32(%1,%0) \n\t"
2385 "movaps %%xmm3, 48(%1,%0) \n\t"
2389 :"r"(dst), "r"(src), "m"(min), "m"(max)
/* Prototypes for routines defined outside this file (presumably in external
 * assembly -- TODO confirm). dsputil_init_mmx() assigns them to DSPContext
 * function pointers according to the detected CPU flags. */

/* VP3 IDCT (MMX), DC-only add (MMX2) and loop filters (MMX2). */
2394 void ff_vp3_idct_mmx(int16_t *input_data);
2395 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2396 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2398 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2400 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2401 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
/* VP3 IDCT, SSE2 versions. */
2403 void ff_vp3_idct_sse2(int16_t *input_data);
2404 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2405 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
/* int16 scalar product, and scalar product combined with multiply-add. */
2407 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2408 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2409 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2410 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2411 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
/* int16 windowing. The *_ba variants are the ones selected when
 * CODEC_FLAG_BITEXACT is set; *_atom is selected on AV_CPU_FLAG_ATOM. */
2413 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2414 const int16_t *window, unsigned int len);
2415 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2416 const int16_t *window, unsigned int len);
2417 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2418 const int16_t *window, unsigned int len);
2419 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2420 const int16_t *window, unsigned int len);
2421 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2422 const int16_t *window, unsigned int len);
2423 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2424 const int16_t *window, unsigned int len);
/* HuffYUV median and left prediction. */
2426 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2427 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2428 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
/* float scalar product. */
2430 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
/* int32 clipping to [min, max]; the sse2_int variant is the one selected on
 * AV_CPU_FLAG_ATOM. */
2432 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
2433 int32_t max, unsigned int len);
2434 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
2435 int32_t max, unsigned int len);
2436 void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min,
2437 int32_t max, unsigned int len);
2438 void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min,
2439 int32_t max, unsigned int len);
2441 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2443 int mm_flags = av_get_cpu_flags();
2444 const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
2445 const int bit_depth = avctx->bits_per_raw_sample;
2447 if (avctx->dsp_mask) {
2448 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2449 mm_flags |= (avctx->dsp_mask & 0xffff);
2451 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2455 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2456 if (mm_flags & AV_CPU_FLAG_MMX)
2457 av_log(avctx, AV_LOG_INFO, " mmx");
2458 if (mm_flags & AV_CPU_FLAG_MMX2)
2459 av_log(avctx, AV_LOG_INFO, " mmx2");
2460 if (mm_flags & AV_CPU_FLAG_3DNOW)
2461 av_log(avctx, AV_LOG_INFO, " 3dnow");
2462 if (mm_flags & AV_CPU_FLAG_SSE)
2463 av_log(avctx, AV_LOG_INFO, " sse");
2464 if (mm_flags & AV_CPU_FLAG_SSE2)
2465 av_log(avctx, AV_LOG_INFO, " sse2");
2466 av_log(avctx, AV_LOG_INFO, "\n");
2469 if (mm_flags & AV_CPU_FLAG_MMX) {
2470 const int idct_algo= avctx->idct_algo;
2472 if(avctx->lowres==0){
2473 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2474 c->idct_put= ff_simple_idct_put_mmx;
2475 c->idct_add= ff_simple_idct_add_mmx;
2476 c->idct = ff_simple_idct_mmx;
2477 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2479 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2480 if(mm_flags & AV_CPU_FLAG_MMX2){
2481 c->idct_put= ff_libmpeg2mmx2_idct_put;
2482 c->idct_add= ff_libmpeg2mmx2_idct_add;
2483 c->idct = ff_mmxext_idct;
2485 c->idct_put= ff_libmpeg2mmx_idct_put;
2486 c->idct_add= ff_libmpeg2mmx_idct_add;
2487 c->idct = ff_mmx_idct;
2489 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2491 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2492 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2493 if(mm_flags & AV_CPU_FLAG_SSE2){
2494 c->idct_put= ff_vp3_idct_put_sse2;
2495 c->idct_add= ff_vp3_idct_add_sse2;
2496 c->idct = ff_vp3_idct_sse2;
2497 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2499 c->idct_put= ff_vp3_idct_put_mmx;
2500 c->idct_add= ff_vp3_idct_add_mmx;
2501 c->idct = ff_vp3_idct_mmx;
2502 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2504 }else if(idct_algo==FF_IDCT_CAVS){
2505 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2506 }else if(idct_algo==FF_IDCT_XVIDMMX){
2507 if(mm_flags & AV_CPU_FLAG_SSE2){
2508 c->idct_put= ff_idct_xvid_sse2_put;
2509 c->idct_add= ff_idct_xvid_sse2_add;
2510 c->idct = ff_idct_xvid_sse2;
2511 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2512 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2513 c->idct_put= ff_idct_xvid_mmx2_put;
2514 c->idct_add= ff_idct_xvid_mmx2_add;
2515 c->idct = ff_idct_xvid_mmx2;
2517 c->idct_put= ff_idct_xvid_mmx_put;
2518 c->idct_add= ff_idct_xvid_mmx_add;
2519 c->idct = ff_idct_xvid_mmx;
2524 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2525 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2526 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2527 if (!high_bit_depth) {
2528 c->clear_block = clear_block_mmx;
2529 c->clear_blocks = clear_blocks_mmx;
2530 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2531 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2532 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2533 c->clear_block = clear_block_sse;
2534 c->clear_blocks = clear_blocks_sse;
2538 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2539 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2540 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2541 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2542 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2544 if (!high_bit_depth) {
2545 SET_HPEL_FUNCS(put, 0, 16, mmx);
2546 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2547 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2548 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2549 SET_HPEL_FUNCS(put, 1, 8, mmx);
2550 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2551 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2552 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2555 #if ARCH_X86_32 || !HAVE_YASM
2558 #if ARCH_X86_32 && HAVE_YASM
2559 if (!high_bit_depth)
2560 c->emulated_edge_mc = emulated_edge_mc_mmx;
2563 c->add_bytes= add_bytes_mmx;
2564 c->add_bytes_l2= add_bytes_l2_mmx;
2566 if (!high_bit_depth)
2567 c->draw_edges = draw_edges_mmx;
2569 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2570 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2571 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2575 if (!high_bit_depth) {
2576 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2577 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2580 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2581 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2583 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2586 if (mm_flags & AV_CPU_FLAG_MMX2) {
2587 c->prefetch = prefetch_mmx2;
2589 if (!high_bit_depth) {
2590 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2591 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2593 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2594 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2595 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2597 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2598 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2600 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2601 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2602 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2605 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2606 if (!high_bit_depth) {
2607 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2608 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2609 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2610 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2611 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2612 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2615 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2616 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2617 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2620 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2621 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2624 if (CONFIG_VP3_DECODER
2625 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2626 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2627 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2630 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2631 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2632 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2633 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2634 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2635 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2636 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2637 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2638 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2639 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2640 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2641 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2642 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2643 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2644 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2645 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2646 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
2648 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2649 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2650 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2651 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2652 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2653 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
2655 if (!high_bit_depth) {
2656 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2657 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2658 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2659 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2660 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
2661 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
2663 else if (bit_depth == 10) {
2666 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2667 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2668 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2669 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2671 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2672 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2676 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2677 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2678 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2679 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2682 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2683 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2685 if (!high_bit_depth) {
2686 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2687 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2688 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2689 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2691 if (bit_depth == 10) {
2692 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
2693 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
2694 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
2695 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
2698 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2701 if( mm_flags&AV_CPU_FLAG_3DNOW )
2702 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2705 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2706 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2707 c->prefetch = prefetch_3dnow;
2709 if (!high_bit_depth) {
2710 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2711 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2713 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2714 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2715 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2717 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2718 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2720 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2721 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2722 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2724 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2725 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2726 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2727 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2728 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2729 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2730 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2734 if (CONFIG_VP3_DECODER
2735 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2736 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2737 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2740 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2741 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2742 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2743 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2744 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2745 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2747 if (!high_bit_depth) {
2748 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2749 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2750 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2751 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2752 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2753 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2756 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2757 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2758 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2759 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2762 if (!high_bit_depth) {
2763 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2764 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2767 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2768 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2773 #define H264_QPEL_FUNCS(x, y, CPU)\
2774 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2775 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2776 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2777 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2778 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2779 // these functions are slower than mmx on AMD, but faster on Intel
2780 if (!high_bit_depth) {
2781 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2782 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2783 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2784 H264_QPEL_FUNCS(0, 0, sse2);
2787 if(mm_flags & AV_CPU_FLAG_SSE2){
2788 if (!high_bit_depth) {
2789 H264_QPEL_FUNCS(0, 1, sse2);
2790 H264_QPEL_FUNCS(0, 2, sse2);
2791 H264_QPEL_FUNCS(0, 3, sse2);
2792 H264_QPEL_FUNCS(1, 1, sse2);
2793 H264_QPEL_FUNCS(1, 2, sse2);
2794 H264_QPEL_FUNCS(1, 3, sse2);
2795 H264_QPEL_FUNCS(2, 1, sse2);
2796 H264_QPEL_FUNCS(2, 2, sse2);
2797 H264_QPEL_FUNCS(2, 3, sse2);
2798 H264_QPEL_FUNCS(3, 1, sse2);
2799 H264_QPEL_FUNCS(3, 2, sse2);
2800 H264_QPEL_FUNCS(3, 3, sse2);
2803 #define H264_QPEL_FUNCS_10(x, y, CPU)\
2804 c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
2805 c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
2806 c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
2807 c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
2808 if (bit_depth == 10) {
2809 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2810 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2811 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2812 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2813 H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
2814 H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
2815 H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
2817 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
2818 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
2823 if(mm_flags & AV_CPU_FLAG_SSSE3){
2824 if (!high_bit_depth) {
2825 H264_QPEL_FUNCS(1, 0, ssse3);
2826 H264_QPEL_FUNCS(1, 1, ssse3);
2827 H264_QPEL_FUNCS(1, 2, ssse3);
2828 H264_QPEL_FUNCS(1, 3, ssse3);
2829 H264_QPEL_FUNCS(2, 0, ssse3);
2830 H264_QPEL_FUNCS(2, 1, ssse3);
2831 H264_QPEL_FUNCS(2, 2, ssse3);
2832 H264_QPEL_FUNCS(2, 3, ssse3);
2833 H264_QPEL_FUNCS(3, 0, ssse3);
2834 H264_QPEL_FUNCS(3, 1, ssse3);
2835 H264_QPEL_FUNCS(3, 2, ssse3);
2836 H264_QPEL_FUNCS(3, 3, ssse3);
2839 else if (bit_depth == 10) {
2840 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
2841 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
2842 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
2845 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
2847 if (!high_bit_depth) {
2848 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2849 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2850 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2851 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2853 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2854 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2855 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2860 if(mm_flags & AV_CPU_FLAG_3DNOW){
2861 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2862 c->vector_fmul = vector_fmul_3dnow;
2864 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2865 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2867 c->vector_fmul_window = vector_fmul_window_3dnow2;
2870 if(mm_flags & AV_CPU_FLAG_MMX2){
2872 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2873 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2874 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2875 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2877 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2881 if(mm_flags & AV_CPU_FLAG_SSE){
2882 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2883 c->ac3_downmix = ac3_downmix_sse;
2884 c->vector_fmul = vector_fmul_sse;
2885 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2886 c->vector_fmul_add = vector_fmul_add_sse;
2888 c->vector_fmul_window = vector_fmul_window_sse;
2890 c->vector_clipf = vector_clipf_sse;
2892 c->scalarproduct_float = ff_scalarproduct_float_sse;
2895 if(mm_flags & AV_CPU_FLAG_3DNOW)
2896 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
2897 if(mm_flags & AV_CPU_FLAG_SSE2){
2899 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2900 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2901 if (mm_flags & AV_CPU_FLAG_ATOM) {
2902 c->vector_clip_int32 = ff_vector_clip_int32_sse2_int;
2904 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2906 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2907 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2909 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2910 c->apply_window_int16 = ff_apply_window_int16_sse2;
2914 if (!high_bit_depth)
2915 c->emulated_edge_mc = emulated_edge_mc_sse;
2919 if (mm_flags & AV_CPU_FLAG_SSSE3) {
2921 if (mm_flags & AV_CPU_FLAG_ATOM) {
2922 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2924 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2926 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2927 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2932 if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
2934 c->vector_clip_int32 = ff_vector_clip_int32_sse41;
2938 #if HAVE_AVX && HAVE_YASM
2939 if (mm_flags & AV_CPU_FLAG_AVX) {
2940 if (bit_depth == 10) {
2941 //AVX implies !cache64.
2942 //TODO: Port cache(32|64) detection from x264.
2943 H264_QPEL_FUNCS_10(1, 0, sse2)
2944 H264_QPEL_FUNCS_10(2, 0, sse2)
2945 H264_QPEL_FUNCS_10(3, 0, sse2)
2947 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
2948 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
2954 if (CONFIG_ENCODERS)
2955 dsputilenc_init_mmx(c, avctx);
2958 // for speed testing
2959 get_pixels = just_return;
2960 put_pixels_clamped = just_return;
2961 add_pixels_clamped = just_return;
2963 pix_abs16x16 = just_return;
2964 pix_abs16x16_x2 = just_return;
2965 pix_abs16x16_y2 = just_return;
2966 pix_abs16x16_xy2 = just_return;
2968 put_pixels_tab[0] = just_return;
2969 put_pixels_tab[1] = just_return;
2970 put_pixels_tab[2] = just_return;
2971 put_pixels_tab[3] = just_return;
2973 put_no_rnd_pixels_tab[0] = just_return;
2974 put_no_rnd_pixels_tab[1] = just_return;
2975 put_no_rnd_pixels_tab[2] = just_return;
2976 put_no_rnd_pixels_tab[3] = just_return;
2978 avg_pixels_tab[0] = just_return;
2979 avg_pixels_tab[1] = just_return;
2980 avg_pixels_tab[2] = just_return;
2981 avg_pixels_tab[3] = just_return;
2983 avg_no_rnd_pixels_tab[0] = just_return;
2984 avg_no_rnd_pixels_tab[1] = just_return;
2985 avg_no_rnd_pixels_tab[2] = just_return;
2986 avg_no_rnd_pixels_tab[3] = just_return;
2988 //av_fdct = just_return;
2989 //ff_idct = just_return;