/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
#include "diracdsp_mmx.h"
/* pixel operations */
DECLARE_ALIGNED(8,  const uint64_t, ff_bone) = 0x0101010101010101ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_wtwo) = 0x0002000200020002ULL;

DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
{0x8000000080000000ULL, 0x8000000080000000ULL};

DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1  ) = {0x0001000100010001ULL, 0x0001000100010001ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_2  ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_3  ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_4  ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_5  ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_8  ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_9  ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_15 ) =  0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_20 ) =  0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_42 ) =  0x002A002A002A002AULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_53 ) =  0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_96 ) =  0x0060006000600060ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_128) =  0x0080008000800080ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pw_255) =  0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_512) = {0x0200020002000200ULL, 0x0200020002000200ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pw_1019)= {0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL};

DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_0  ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_1  ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_3  ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_4  ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_7  ) =  0x0707070707070707ULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_1F ) =  0x1F1F1F1F1F1F1F1FULL;
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_3F ) =  0x3F3F3F3F3F3F3F3FULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_81 ) =  0x8181818181818181ULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
DECLARE_ALIGNED(8,  const uint64_t, ff_pb_FC ) =  0xFCFCFCFCFCFCFCFCULL;
DECLARE_ALIGNED(16, const xmm_reg,  ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};

DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
    "paddb %%" #regd ", %%" #regd " \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared libraries it is better to synthesize these constants in
// registers instead of loading them through a relocation
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "packuswb %%" #regd ", %%" #regd " \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
    "psrlw $15, %%" #regd " \n\t" \
    "psllw $1, %%" #regd " \n\t"::)
#endif
// regr is used as a temporary and holds the output result
// the first argument is unmodified and the second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
// (see the scalar sketch after PAVGB_MMX below)
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe ", " #regb " \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr,  regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX_NO_RND(a, b, c, e)
#define OP_AVG(a, b, c, e)       PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef OP_AVG

/***********************************/
/* MMX rounding */

#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e)        PAVGB_MMX(a, b, c, e)

#include "dsputil_mmx_rnd_template.c"

#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
/***********************************/
/* 3Dnow specific */

#define DEF(x) x ## _3dnow
#define PAVGB "pavgusb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB

/***********************************/
/* MMX2 specific */

#define DEF(x) x ## _mmx2

/* Introduced only in MMX2 set */
#define PAVGB "pavgb"

#include "dsputil_mmx_avg_template.c"

#undef DEF
#undef PAVGB
#define put_no_rnd_pixels16_mmx put_pixels16_mmx
#define put_no_rnd_pixels8_mmx put_pixels8_mmx
#define put_pixels16_mmx2 put_pixels16_mmx
#define put_pixels8_mmx2 put_pixels8_mmx
#define put_pixels4_mmx2 put_pixels4_mmx
#define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
#define put_pixels16_3dnow put_pixels16_mmx
#define put_pixels8_3dnow put_pixels8_mmx
#define put_pixels4_3dnow put_pixels4_mmx
#define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
#define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
235 /***********************************/
238 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
243 /* read the pixels */
248 "movq %3, %%mm0 \n\t"
249 "movq 8%3, %%mm1 \n\t"
250 "movq 16%3, %%mm2 \n\t"
251 "movq 24%3, %%mm3 \n\t"
252 "movq 32%3, %%mm4 \n\t"
253 "movq 40%3, %%mm5 \n\t"
254 "movq 48%3, %%mm6 \n\t"
255 "movq 56%3, %%mm7 \n\t"
256 "packuswb %%mm1, %%mm0 \n\t"
257 "packuswb %%mm3, %%mm2 \n\t"
258 "packuswb %%mm5, %%mm4 \n\t"
259 "packuswb %%mm7, %%mm6 \n\t"
260 "movq %%mm0, (%0) \n\t"
261 "movq %%mm2, (%0, %1) \n\t"
262 "movq %%mm4, (%0, %1, 2) \n\t"
263 "movq %%mm6, (%0, %2) \n\t"
264 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
269 // if here would be an exact copy of the code above
270 // compiler would generate some very strange code
273 "movq (%3), %%mm0 \n\t"
274 "movq 8(%3), %%mm1 \n\t"
275 "movq 16(%3), %%mm2 \n\t"
276 "movq 24(%3), %%mm3 \n\t"
277 "movq 32(%3), %%mm4 \n\t"
278 "movq 40(%3), %%mm5 \n\t"
279 "movq 48(%3), %%mm6 \n\t"
280 "movq 56(%3), %%mm7 \n\t"
281 "packuswb %%mm1, %%mm0 \n\t"
282 "packuswb %%mm3, %%mm2 \n\t"
283 "packuswb %%mm5, %%mm4 \n\t"
284 "packuswb %%mm7, %%mm6 \n\t"
285 "movq %%mm0, (%0) \n\t"
286 "movq %%mm2, (%0, %1) \n\t"
287 "movq %%mm4, (%0, %1, 2) \n\t"
288 "movq %%mm6, (%0, %2) \n\t"
289 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq    "#off"(%2), %%mm1      \n\t"\
    "movq 16+"#off"(%2), %%mm2      \n\t"\
    "movq 32+"#off"(%2), %%mm3      \n\t"\
    "movq 48+"#off"(%2), %%mm4      \n\t"\
    "packsswb  8+"#off"(%2), %%mm1  \n\t"\
    "packsswb 24+"#off"(%2), %%mm2  \n\t"\
    "packsswb 40+"#off"(%2), %%mm3  \n\t"\
    "packsswb 56+"#off"(%2), %%mm4  \n\t"\
    "paddb %%mm0, %%mm1             \n\t"\
    "paddb %%mm0, %%mm2             \n\t"\
    "paddb %%mm0, %%mm3             \n\t"\
    "paddb %%mm0, %%mm4             \n\t"\
    "movq %%mm1, (%0)               \n\t"\
    "movq %%mm2, (%0, %3)           \n\t"\
    "movq %%mm3, (%0, %3, 2)        \n\t"\
    "movq %%mm4, (%0, %1)           \n\t"
void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    x86_reg line_skip = line_size;
    x86_reg line_skip3;

    __asm__ volatile(
        "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
        "lea (%3, %3, 2), %1            \n\t"
        put_signed_pixels_clamped_mmx_half(0)
        "lea (%0, %3, 4), %0            \n\t"
        put_signed_pixels_clamped_mmx_half(64)
        :"+&r" (pixels), "=&r" (line_skip3)
        :"r" (block), "r"(line_skip)
        :"memory");
}
void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
{
    const DCTELEM *p;
    uint8_t *pix;
    int i;

    /* read the pixels */
    p = block;
    pix = pixels;
    MOVQ_ZERO(mm7);
    i = 4;
    do {
        __asm__ volatile(
            "movq   (%2), %%mm0     \n\t"
            "movq   8(%2), %%mm1    \n\t"
            "movq   16(%2), %%mm2   \n\t"
            "movq   24(%2), %%mm3   \n\t"
            "movq   %0, %%mm4       \n\t"
            "movq   %1, %%mm6       \n\t"
            "movq   %%mm4, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm4 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm4, %%mm0    \n\t"
            "paddsw %%mm5, %%mm1    \n\t"
            "movq   %%mm6, %%mm5    \n\t"
            "punpcklbw %%mm7, %%mm6 \n\t"
            "punpckhbw %%mm7, %%mm5 \n\t"
            "paddsw %%mm6, %%mm2    \n\t"
            "paddsw %%mm5, %%mm3    \n\t"
            "packuswb %%mm1, %%mm0  \n\t"
            "packuswb %%mm3, %%mm2  \n\t"
            "movq   %%mm0, %0       \n\t"
            "movq   %%mm2, %1       \n\t"
            :"+m"(*pix), "+m"(*(pix+line_size))
            :"r"(p)
            :"memory");
        pix += line_size*2;
        p += 16;
    } while (--i);
}
static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"    \n\t"
        ".p2align 3                 \n\t"
        "1:                         \n\t"
        "movd (%1), %%mm0           \n\t"
        "movd (%1, %3), %%mm1       \n\t"
        "movd %%mm0, (%2)           \n\t"
        "movd %%mm1, (%2, %3)       \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        "movd (%1), %%mm0           \n\t"
        "movd (%1, %3), %%mm1       \n\t"
        "movd %%mm0, (%2)           \n\t"
        "movd %%mm1, (%2, %3)       \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"    \n\t"
        ".p2align 3                 \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "lea (%3, %3), %%"REG_a"    \n\t"
        ".p2align 3                 \n\t"
        "1:                         \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        "movq (%1), %%mm0           \n\t"
        "movq 8(%1), %%mm4          \n\t"
        "movq (%1, %3), %%mm1       \n\t"
        "movq 8(%1, %3), %%mm5      \n\t"
        "movq %%mm0, (%2)           \n\t"
        "movq %%mm4, 8(%2)          \n\t"
        "movq %%mm1, (%2, %3)       \n\t"
        "movq %%mm5, 8(%2, %3)      \n\t"
        "add %%"REG_a", %1          \n\t"
        "add %%"REG_a", %2          \n\t"
        "subl $4, %0                \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((x86_reg)line_size)
        : "%"REG_a, "memory"
        );
}
static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "1:                         \n\t"
        "movdqu (%1), %%xmm0        \n\t"
        "movdqu (%1,%3), %%xmm1     \n\t"
        "movdqu (%1,%3,2), %%xmm2   \n\t"
        "movdqu (%1,%4), %%xmm3     \n\t"
        "lea (%1,%3,4), %1          \n\t"
        "movdqa %%xmm0, (%2)        \n\t"
        "movdqa %%xmm1, (%2,%3)     \n\t"
        "movdqa %%xmm2, (%2,%3,2)   \n\t"
        "movdqa %%xmm3, (%2,%4)     \n\t"
        "subl $4, %0                \n\t"
        "lea (%2,%3,4), %2          \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory"
        );
}
static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
{
    __asm__ volatile(
        "1:                         \n\t"
        "movdqu (%1), %%xmm0        \n\t"
        "movdqu (%1,%3), %%xmm1     \n\t"
        "movdqu (%1,%3,2), %%xmm2   \n\t"
        "movdqu (%1,%4), %%xmm3     \n\t"
        "lea (%1,%3,4), %1          \n\t"
        "pavgb  (%2), %%xmm0        \n\t"
        "pavgb  (%2,%3), %%xmm1     \n\t"
        "pavgb  (%2,%3,2), %%xmm2   \n\t"
        "pavgb  (%2,%4), %%xmm3     \n\t"
        "movdqa %%xmm0, (%2)        \n\t"
        "movdqa %%xmm1, (%2,%3)     \n\t"
        "movdqa %%xmm2, (%2,%3,2)   \n\t"
        "movdqa %%xmm3, (%2,%4)     \n\t"
        "subl $4, %0                \n\t"
        "lea (%2,%3,4), %2          \n\t"
        "jnz 1b                     \n\t"
        : "+g"(h), "+r" (pixels),  "+r" (block)
        : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
        : "memory"
        );
}
#define CLEAR_BLOCKS(name,n) \
static void name(DCTELEM *blocks)\
{\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7              \n\t"\
        "mov     %1, %%"REG_a"          \n\t"\
        "1:                             \n\t"\
        "movq %%mm7, (%0, %%"REG_a")    \n\t"\
        "movq %%mm7, 8(%0, %%"REG_a")   \n\t"\
        "movq %%mm7, 16(%0, %%"REG_a")  \n\t"\
        "movq %%mm7, 24(%0, %%"REG_a")  \n\t"\
        "add $32, %%"REG_a"             \n\t"\
        " js 1b                         \n\t"\
        : : "r" (((uint8_t *)blocks)+128*n),\
            "i" (-128*n)\
        : "%"REG_a\
        );\
}
CLEAR_BLOCKS(clear_blocks_mmx, 6)
CLEAR_BLOCKS(clear_block_mmx, 1)
static void clear_block_sse(DCTELEM *block)
{
    __asm__ volatile(
        "xorps  %%xmm0, %%xmm0  \n"
        "movaps %%xmm0,    (%0) \n"
        "movaps %%xmm0,  16(%0) \n"
        "movaps %%xmm0,  32(%0) \n"
        "movaps %%xmm0,  48(%0) \n"
        "movaps %%xmm0,  64(%0) \n"
        "movaps %%xmm0,  80(%0) \n"
        "movaps %%xmm0,  96(%0) \n"
        "movaps %%xmm0, 112(%0) \n"
        :: "r"(block)
        : "memory"
    );
}
static void clear_blocks_sse(DCTELEM *blocks)
{
    __asm__ volatile(
        "xorps  %%xmm0, %%xmm0              \n"
        "mov     %1, %%"REG_a"              \n"
        "1:                                 \n"
        "movaps %%xmm0,    (%0, %%"REG_a")  \n"
        "movaps %%xmm0,  16(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  32(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  48(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  64(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  80(%0, %%"REG_a")  \n"
        "movaps %%xmm0,  96(%0, %%"REG_a")  \n"
        "movaps %%xmm0, 112(%0, %%"REG_a")  \n"
        "add $128, %%"REG_a"                \n"
        " js 1b                             \n"
        : : "r" (((uint8_t *)blocks)+128*6),
            "i" (-128*6)
        : "%"REG_a
    );
}
static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
    x86_reg i=0;
    __asm__ volatile(
        "jmp 2f                     \n\t"
        "1:                         \n\t"
        "movq  (%1, %0), %%mm0      \n\t"
        "movq  (%2, %0), %%mm1      \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, (%2, %0)       \n\t"
        "movq 8(%1, %0), %%mm0      \n\t"
        "movq 8(%2, %0), %%mm1      \n\t"
        "paddb %%mm0, %%mm1         \n\t"
        "movq %%mm1, 8(%2, %0)      \n\t"
        "add $16, %0                \n\t"
        "2:                         \n\t"
        "cmp %3, %0                 \n\t"
        " js 1b                     \n\t"
        : "+r" (i)
        : "r"(src), "r"(dst), "r"((x86_reg)w-15)
    );
    for(; i<w; i++)
        dst[i+0] += src[i+0];
}
static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
    x86_reg w2 = -w;
    x86_reg x;
    int l = *left & 0xff;
    int tl = *left_top & 0xff;
    int t;
    __asm__ volatile(
        "mov    %7, %3          \n"
        "1:                     \n"
        "movzbl (%3,%4), %2     \n"
        "mov    %2, %k3         \n"
        "sub   %b1, %b3         \n"
        "add   %b0, %b3         \n" // t-tl+l
        "mov    %2, %1          \n" // t
        "cmp    %0, %2          \n"
        "cmovg  %0, %2          \n" // FFMAX(l, t)
        "cmovg  %1, %0          \n" // FFMIN(l, t)
        "cmp   %k3, %0          \n"
        "cmovg %k3, %0          \n" // clip to t-tl+l
        "mov    %7, %3          \n"
        "cmp    %2, %0          \n"
        "cmovl  %2, %0          \n"
        "add (%6,%4), %b0       \n"
        "mov   %b0, (%5,%4)     \n"
        "inc    %4              \n"
        "jl 1b                  \n"
        :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
        :"r"(dst+w), "r"(diff+w), "rm"(top+w)
        :"memory");
    *left = l;
    *left_top = tl;
}
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7          \n\t"\
    "movq  %0, %%mm0            \n\t"\
    "movq  %0, %%mm1            \n\t"\
    "movq  %3, %%mm2            \n\t"\
    "movq  %3, %%mm3            \n\t"\
    "punpcklbw %%mm7, %%mm0     \n\t"\
    "punpckhbw %%mm7, %%mm1     \n\t"\
    "punpcklbw %%mm7, %%mm2     \n\t"\
    "punpckhbw %%mm7, %%mm3     \n\t"\
    "psubw %%mm2, %%mm0         \n\t"\
    "psubw %%mm3, %%mm1         \n\t"\
    "movq  %1, %%mm2            \n\t"\
    "movq  %1, %%mm3            \n\t"\
    "movq  %2, %%mm4            \n\t"\
    "movq  %2, %%mm5            \n\t"\
    "punpcklbw %%mm7, %%mm2     \n\t"\
    "punpckhbw %%mm7, %%mm3     \n\t"\
    "punpcklbw %%mm7, %%mm4     \n\t"\
    "punpckhbw %%mm7, %%mm5     \n\t"\
    "psubw %%mm2, %%mm4         \n\t"\
    "psubw %%mm3, %%mm5         \n\t"\
    "psllw $2, %%mm4            \n\t"\
    "psllw $2, %%mm5            \n\t"\
    "paddw %%mm0, %%mm4         \n\t"\
    "paddw %%mm1, %%mm5         \n\t"\
    "pxor %%mm6, %%mm6          \n\t"\
    "pcmpgtw %%mm4, %%mm6       \n\t"\
    "pcmpgtw %%mm5, %%mm7       \n\t"\
    "pxor %%mm6, %%mm4          \n\t"\
    "pxor %%mm7, %%mm5          \n\t"\
    "psubw %%mm6, %%mm4         \n\t"\
    "psubw %%mm7, %%mm5         \n\t"\
    "psrlw $3, %%mm4            \n\t"\
    "psrlw $3, %%mm5            \n\t"\
    "packuswb %%mm5, %%mm4      \n\t"\
    "packsswb %%mm7, %%mm6      \n\t"\
    "pxor %%mm7, %%mm7          \n\t"\
    "movd %4, %%mm2             \n\t"\
    "punpcklbw %%mm2, %%mm2     \n\t"\
    "punpcklbw %%mm2, %%mm2     \n\t"\
    "punpcklbw %%mm2, %%mm2     \n\t"\
    "psubusb %%mm4, %%mm2       \n\t"\
    "movq %%mm2, %%mm3          \n\t"\
    "psubusb %%mm4, %%mm3       \n\t"\
    "psubb %%mm3, %%mm2         \n\t"\
    "movq %1, %%mm3             \n\t"\
    "movq %2, %%mm4             \n\t"\
    "pxor %%mm6, %%mm3          \n\t"\
    "pxor %%mm6, %%mm4          \n\t"\
    "paddusb %%mm2, %%mm3       \n\t"\
    "psubusb %%mm2, %%mm4       \n\t"\
    "pxor %%mm6, %%mm3          \n\t"\
    "pxor %%mm6, %%mm4          \n\t"\
    "paddusb %%mm2, %%mm2       \n\t"\
    "packsswb %%mm1, %%mm0      \n\t"\
    "pcmpgtb %%mm0, %%mm7       \n\t"\
    "pxor %%mm7, %%mm0          \n\t"\
    "psubb %%mm7, %%mm0         \n\t"\
    "movq %%mm0, %%mm1          \n\t"\
    "psubusb %%mm2, %%mm0       \n\t"\
    "psubb %%mm0, %%mm1         \n\t"\
    "pand %5, %%mm1             \n\t"\
    "psrlw $2, %%mm1            \n\t"\
    "pxor %%mm7, %%mm1          \n\t"\
    "psubb %%mm7, %%mm1         \n\t"\
    "movq %0, %%mm5             \n\t"\
    "movq %3, %%mm6             \n\t"\
    "psubb %%mm1, %%mm5         \n\t"\
    "paddb %%mm1, %%mm6         \n\t"
static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];

    __asm__ volatile(

        H263_LOOP_FILTER

        "movq %%mm3, %1             \n\t"
        "movq %%mm4, %2             \n\t"
        "movq %%mm5, %0             \n\t"
        "movq %%mm6, %3             \n\t"
        : "+m" (*(uint64_t*)(src - 2*stride)),
          "+m" (*(uint64_t*)(src - 1*stride)),
          "+m" (*(uint64_t*)(src + 0*stride)),
          "+m" (*(uint64_t*)(src + 1*stride))
        : "g" (2*strength), "m"(ff_pb_FC)
    );
    }
}
static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
    if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
    const int strength= ff_h263_loop_filter_strength[qscale];
    DECLARE_ALIGNED(8, uint64_t, temp)[4];
    uint8_t *btemp= (uint8_t*)temp;

    src -= 2;

    transpose4x4(btemp  , src           , 8, stride);
    transpose4x4(btemp+4, src + 4*stride, 8, stride);
    __asm__ volatile(
        H263_LOOP_FILTER // 5 3 4 6

        : "+m" (temp[0]),
          "+m" (temp[1]),
          "+m" (temp[2]),
          "+m" (temp[3])
        : "g" (2*strength), "m"(ff_pb_FC)
    );

    __asm__ volatile(
        "movq %%mm5, %%mm1          \n\t"
        "movq %%mm4, %%mm0          \n\t"
        "punpcklbw %%mm3, %%mm5     \n\t"
        "punpcklbw %%mm6, %%mm4     \n\t"
        "punpckhbw %%mm3, %%mm1     \n\t"
        "punpckhbw %%mm6, %%mm0     \n\t"
        "movq %%mm5, %%mm3          \n\t"
        "movq %%mm1, %%mm6          \n\t"
        "punpcklwd %%mm4, %%mm5     \n\t"
        "punpcklwd %%mm0, %%mm1     \n\t"
        "punpckhwd %%mm4, %%mm3     \n\t"
        "punpckhwd %%mm0, %%mm6     \n\t"
        "movd %%mm5, (%0)           \n\t"
        "punpckhdq %%mm5, %%mm5     \n\t"
        "movd %%mm5, (%0,%2)        \n\t"
        "movd %%mm3, (%0,%2,2)      \n\t"
        "punpckhdq %%mm3, %%mm3     \n\t"
        "movd %%mm3, (%0,%3)        \n\t"
        "movd %%mm1, (%1)           \n\t"
        "punpckhdq %%mm1, %%mm1     \n\t"
        "movd %%mm1, (%1,%2)        \n\t"
        "movd %%mm6, (%1,%2,2)      \n\t"
        "punpckhdq %%mm6, %%mm6     \n\t"
        "movd %%mm6, (%1,%3)        \n\t"
        :: "r" (src),
           "r" (src + 4*stride),
           "r" ((x86_reg)   stride ),
           "r" ((x86_reg)(3*stride))
        : "memory"
    );
    }
}
/* Draw the edges of width 'w' of an image of size width, height.
   This MMX version can only handle w == 8 or w == 16. */
static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
{
    uint8_t *ptr, *last_line;
    int i;

    last_line = buf + (height - 1) * wrap;
    /* left and right */
    ptr = buf;
    if(w==8)
    {
        __asm__ volatile(
            "1:                         \n\t"
            "movd (%0), %%mm0           \n\t"
            "punpcklbw %%mm0, %%mm0     \n\t"
            "punpcklwd %%mm0, %%mm0     \n\t"
            "punpckldq %%mm0, %%mm0     \n\t"
            "movq %%mm0, -8(%0)         \n\t"
            "movq -8(%0, %2), %%mm1     \n\t"
            "punpckhbw %%mm1, %%mm1     \n\t"
            "punpckhwd %%mm1, %%mm1     \n\t"
            "punpckhdq %%mm1, %%mm1     \n\t"
            "movq %%mm1, (%0, %2)       \n\t"
            "add %1, %0                 \n\t"
            "cmp %3, %0                 \n\t"
            " jb 1b                     \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
        );
    }
    else
    {
        __asm__ volatile(
            "1:                         \n\t"
            "movd (%0), %%mm0           \n\t"
            "punpcklbw %%mm0, %%mm0     \n\t"
            "punpcklwd %%mm0, %%mm0     \n\t"
            "punpckldq %%mm0, %%mm0     \n\t"
            "movq %%mm0, -8(%0)         \n\t"
            "movq %%mm0, -16(%0)        \n\t"
            "movq -8(%0, %2), %%mm1     \n\t"
            "punpckhbw %%mm1, %%mm1     \n\t"
            "punpckhwd %%mm1, %%mm1     \n\t"
            "punpckhdq %%mm1, %%mm1     \n\t"
            "movq %%mm1, (%0, %2)       \n\t"
            "movq %%mm1, 8(%0, %2)      \n\t"
            "add %1, %0                 \n\t"
            "cmp %3, %0                 \n\t"
            " jb 1b                     \n\t"
            : "+r" (ptr)
            : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
        );
    }

    /* top and bottom (and hopefully also the corners) */
    if (sides&EDGE_TOP) {
        for(i = 0; i < h; i += 4) {
            ptr= buf - (i + 1) * wrap - w;
            __asm__ volatile(
                "1:                         \n\t"
                "movq (%1, %0), %%mm0       \n\t"
                "movq %%mm0, (%0)           \n\t"
                "movq %%mm0, (%0, %2)       \n\t"
                "movq %%mm0, (%0, %2, 2)    \n\t"
                "movq %%mm0, (%0, %3)       \n\t"
                "add $8, %0                 \n\t"
                "cmp %4, %0                 \n\t"
                " jb 1b                     \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
            );
        }
    }

    if (sides&EDGE_BOTTOM) {
        for(i = 0; i < h; i += 4) {
            ptr= last_line + (i + 1) * wrap - w;
            __asm__ volatile(
                "1:                         \n\t"
                "movq (%1, %0), %%mm0       \n\t"
                "movq %%mm0, (%0)           \n\t"
                "movq %%mm0, (%0, %2)       \n\t"
                "movq %%mm0, (%0, %2, 2)    \n\t"
                "movq %%mm0, (%0, %3)       \n\t"
                "add $8, %0                 \n\t"
                "cmp %4, %0                 \n\t"
                " jb 1b                     \n\t"
                : "+r" (ptr)
                : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
            );
        }
    }
}
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "           \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4   \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4            \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "             \n\t" /* d */\
    "movq "#in0", %%mm5               \n\t" /* D */\
    "paddw " #m3 ", %%mm5             \n\t" /* x4 */\
    "psubw %%mm5, %%mm4               \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5               \n\t" /* C */\
    "movq "#in2", %%mm6               \n\t" /* B */\
    "paddw " #m6 ", %%mm5             \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6             \n\t" /* x2 */\
    "paddw %%mm6, %%mm6               \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5               \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5  \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4            \n\t" /* 20x1 - x4 + rounder */\
    "paddw %%mm4, %%mm5               \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                  \n\t"\
    "packuswb %%mm5, %%mm5            \n\t"\
    OP(%%mm5, out, %%mm7, d)
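
/* QPEL_V_LOW evaluates one output row of the MPEG-4 quarter-pel 8-tap
 * half-pel interpolation, with x1..x4 the sums of the symmetric tap pairs:
 *
 *   out = av_clip_uint8((20*x1 - 6*x2 + 3*x3 - x4 + rounder) >> 5)
 *
 * i.e. the tap set (-1, 3, -6, 20, 20, -6, 3, -1)/32; the final unsigned
 * clip is provided by packuswb. */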
#define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    uint64_t temp;\
\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %6, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        "movq %%mm0, %5                   \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        "movq 5(%0), %%mm0                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm5                \n\t" /* FGHIJKLM */\
        "movq %%mm0, %%mm6                \n\t" /* FGHIJKLM */\
        "psrlq $8, %%mm0                  \n\t" /* GHIJKLM0 */\
        "psrlq $16, %%mm5                 \n\t" /* HIJKLM00 */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0G0H0I0J */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0H0I0J0K */\
        "paddw %%mm0, %%mm2               \n\t" /* b */\
        "paddw %%mm5, %%mm3               \n\t" /* c */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "movq %%mm6, %%mm2                \n\t" /* FGHIJKLM */\
        "psrlq $24, %%mm6                 \n\t" /* IJKLM000 */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0F0G0H0I */\
        "punpcklbw %%mm7, %%mm6           \n\t" /* 0I0J0K0L */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "paddw %%mm2, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm4               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "psubw %%mm4, %%mm3               \n\t" /* - 6b +3c - d */\
        "paddw %6, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b +3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "movq %5, %%mm1                   \n\t"\
        "packuswb %%mm3, %%mm1            \n\t"\
        OP_MMX2(%%mm1, (%1),%%mm4, q)\
        /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
\
        "movq 9(%0), %%mm1                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm4                \n\t" /* JKLMNOPQ */\
        "movq %%mm1, %%mm3                \n\t" /* JKLMNOPQ */\
        "psrlq $8, %%mm1                  \n\t" /* KLMNOPQ0 */\
        "psrlq $16, %%mm4                 \n\t" /* LMNOPQ00 */\
        "punpcklbw %%mm7, %%mm1           \n\t" /* 0K0L0M0N */\
        "punpcklbw %%mm7, %%mm4           \n\t" /* 0L0M0N0O */\
        "paddw %%mm1, %%mm5               \n\t" /* b */\
        "paddw %%mm4, %%mm0               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm0               \n\t" /* c - 2b */\
        "movq %%mm3, %%mm5                \n\t" /* JKLMNOPQ */\
        "psrlq $24, %%mm3                 \n\t" /* MNOPQ000 */\
        "pmullw "MANGLE(ff_pw_3)", %%mm0  \n\t" /* 3c - 6b */\
        "punpcklbw %%mm7, %%mm3           \n\t" /* 0M0N0O0P */\
        "paddw %%mm3, %%mm2               \n\t" /* d */\
        "psubw %%mm2, %%mm0               \n\t" /* -6b + 3c - d */\
        "movq %%mm5, %%mm2                \n\t" /* JKLMNOPQ */\
        "punpcklbw %%mm7, %%mm2           \n\t" /* 0J0K0L0M */\
        "punpckhbw %%mm7, %%mm5           \n\t" /* 0N0O0P0Q */\
        "paddw %%mm2, %%mm6               \n\t" /* a */\
        "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
        "paddw %6, %%mm0                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
\
        "paddw %%mm5, %%mm3               \n\t" /* a */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0O0P0Q0Q */\
        "paddw %%mm4, %%mm6               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm4       \n\t" /* 0P0Q0Q0P */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0Q0Q0P0O */\
        "paddw %%mm1, %%mm4               \n\t" /* c */\
        "paddw %%mm2, %%mm5               \n\t" /* d */\
        "paddw %%mm6, %%mm6               \n\t" /* 2b */\
        "psubw %%mm6, %%mm4               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm4  \n\t" /* 3c - 6b */\
        "psubw %%mm5, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %6, %%mm4                  \n\t"\
        "paddw %%mm3, %%mm4               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm4                  \n\t"\
        "packuswb %%mm4, %%mm0            \n\t"\
        OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+D"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
        : "memory"\
    );\
}\
static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[16];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
        temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
        temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
        temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
        temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
        temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
        temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
        temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
        temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
        __asm__ volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            "movq 16(%0), %%mm0         \n\t"\
            "movq 24(%0), %%mm1         \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7                \n\t"\
        "1:                               \n\t"\
        "movq  (%0), %%mm0                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm1                \n\t" /* ABCDEFGH */\
        "movq %%mm0, %%mm2                \n\t" /* ABCDEFGH */\
        "punpcklbw %%mm7, %%mm0           \n\t" /* 0A0B0C0D */\
        "punpckhbw %%mm7, %%mm1           \n\t" /* 0E0F0G0H */\
        "pshufw $0x90, %%mm0, %%mm5       \n\t" /* 0A0A0B0C */\
        "pshufw $0x41, %%mm0, %%mm6       \n\t" /* 0B0A0A0B */\
        "movq %%mm2, %%mm3                \n\t" /* ABCDEFGH */\
        "movq %%mm2, %%mm4                \n\t" /* ABCDEFGH */\
        "psllq $8, %%mm2                  \n\t" /* 0ABCDEFG */\
        "psllq $16, %%mm3                 \n\t" /* 00ABCDEF */\
        "psllq $24, %%mm4                 \n\t" /* 000ABCDE */\
        "punpckhbw %%mm7, %%mm2           \n\t" /* 0D0E0F0G */\
        "punpckhbw %%mm7, %%mm3           \n\t" /* 0C0D0E0F */\
        "punpckhbw %%mm7, %%mm4           \n\t" /* 0B0C0D0E */\
        "paddw %%mm3, %%mm5               \n\t" /* b */\
        "paddw %%mm2, %%mm6               \n\t" /* c */\
        "paddw %%mm5, %%mm5               \n\t" /* 2b */\
        "psubw %%mm5, %%mm6               \n\t" /* c - 2b */\
        "pshufw $0x06, %%mm0, %%mm5       \n\t" /* 0C0B0A0A */\
        "pmullw "MANGLE(ff_pw_3)", %%mm6  \n\t" /* 3c - 6b */\
        "paddw %%mm4, %%mm0               \n\t" /* a */\
        "paddw %%mm1, %%mm5               \n\t" /* d */\
        "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
        "psubw %%mm5, %%mm0               \n\t" /* 20a - d */\
        "paddw %5, %%mm6                  \n\t"\
        "paddw %%mm6, %%mm0               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm0                  \n\t"\
        /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
\
        "movd 5(%0), %%mm5                \n\t" /* FGHI */\
        "punpcklbw %%mm7, %%mm5           \n\t" /* 0F0G0H0I */\
        "pshufw $0xF9, %%mm5, %%mm6       \n\t" /* 0G0H0I0I */\
        "paddw %%mm5, %%mm1               \n\t" /* a */\
        "paddw %%mm6, %%mm2               \n\t" /* b */\
        "pshufw $0xBE, %%mm5, %%mm6       \n\t" /* 0H0I0I0H */\
        "pshufw $0x6F, %%mm5, %%mm5       \n\t" /* 0I0I0H0G */\
        "paddw %%mm6, %%mm3               \n\t" /* c */\
        "paddw %%mm5, %%mm4               \n\t" /* d */\
        "paddw %%mm2, %%mm2               \n\t" /* 2b */\
        "psubw %%mm2, %%mm3               \n\t" /* c - 2b */\
        "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
        "pmullw "MANGLE(ff_pw_3)", %%mm3  \n\t" /* 3c - 6b */\
        "psubw %%mm4, %%mm3               \n\t" /* -6b + 3c - d */\
        "paddw %5, %%mm1                  \n\t"\
        "paddw %%mm1, %%mm3               \n\t" /* 20a - 6b + 3c - d */\
        "psraw $5, %%mm3                  \n\t"\
        "packuswb %%mm3, %%mm0            \n\t"\
        OP_MMX2(%%mm0, (%1), %%mm4, q)\
        "add %3, %0                       \n\t"\
        "add %4, %1                       \n\t"\
        "decl %2                          \n\t"\
        " jnz 1b                          \n\t"\
        : "+a"(src), "+c"(dst), "+d"(h)\
        : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
        : "memory"\
    );\
}\
static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
    int i;\
    int16_t temp[8];\
    /* quick HACK, XXX FIXME MUST be optimized */\
    for(i=0; i<h; i++)\
    {\
        temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
        temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
        temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
        temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
        temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
        temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
        temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
        temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
        __asm__ volatile(\
            "movq (%0), %%mm0           \n\t"\
            "movq 8(%0), %%mm1          \n\t"\
            "paddw %2, %%mm0            \n\t"\
            "paddw %2, %%mm1            \n\t"\
            "psraw $5, %%mm0            \n\t"\
            "psraw $5, %%mm1            \n\t"\
            "packuswb %%mm1, %%mm0      \n\t"\
            OP_3DNOW(%%mm0, (%1), %%mm1, q)\
            :: "r"(temp), "r"(dst), "m"(ROUNDER)\
            : "memory"\
        );\
        dst+=dstStride;\
        src+=srcStride;\
    }\
}\
#define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
\
static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[17*4];\
    uint64_t *temp_ptr= temp;\
    int count= 17;\
\
    /*FIXME unroll */\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq (%0), %%mm1           \n\t"\
        "movq 8(%0), %%mm2          \n\t"\
        "movq 8(%0), %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "movq %%mm0, (%1)           \n\t"\
        "movq %%mm1, 17*8(%1)       \n\t"\
        "movq %%mm2, 2*17*8(%1)     \n\t"\
        "movq %%mm3, 3*17*8(%1)     \n\t"\
        "add $8, %1                 \n\t"\
        "add %3, %0                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((x86_reg)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count= 4;\
\
    /*FIXME reorder for speed */\
    __asm__ volatile(\
        /*"pxor %%mm7, %%mm7      \n\t"*/\
        "1:                       \n\t"\
        "movq (%0), %%mm0         \n\t"\
        "movq 8(%0), %%mm1        \n\t"\
        "movq 16(%0), %%mm2       \n\t"\
        "movq 24(%0), %%mm3       \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
        \
        "add $136, %0             \n\t"\
        "add %6, %1               \n\t"\
        "decl %2                  \n\t"\
        " jnz 1b                  \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
        : "memory"\
    );\
}\
static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    uint64_t temp[9*2];\
    uint64_t *temp_ptr= temp;\
    int count= 9;\
\
    /*FIXME unroll */\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "1:                         \n\t"\
        "movq (%0), %%mm0           \n\t"\
        "movq (%0), %%mm1           \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "movq %%mm0, (%1)           \n\t"\
        "movq %%mm1, 9*8(%1)        \n\t"\
        "add $8, %1                 \n\t"\
        "add %3, %0                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+r" (src), "+r" (temp_ptr), "+r"(count)\
        : "r" ((x86_reg)srcStride)\
        : "memory"\
    );\
\
    temp_ptr= temp;\
    count= 2;\
\
    /*FIXME reorder for speed */\
    __asm__ volatile(\
        /*"pxor %%mm7, %%mm7      \n\t"*/\
        "1:                       \n\t"\
        "movq (%0), %%mm0         \n\t"\
        "movq 8(%0), %%mm1        \n\t"\
        "movq 16(%0), %%mm2       \n\t"\
        "movq 24(%0), %%mm3       \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
        \
        QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
        "add %4, %1               \n\t"\
        QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
        QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
        \
        "add $72, %0              \n\t"\
        "add %6, %1               \n\t"\
        "decl %2                  \n\t"\
        " jnz 1b                  \n\t"\
        \
        : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
        : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
        : "memory"\
    );\
}\
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
}\
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[8];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
    OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
}\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half) + 64;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
    OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
}\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[8 + 9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[9];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
    OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
}\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
}\
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t temp[32];\
    uint8_t * const half= (uint8_t*)temp;\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
    OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
}\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[16*2 + 17*2];\
    uint8_t * const halfH= ((uint8_t*)half) + 256;\
    uint8_t * const halfHV= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
    OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
}\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    uint64_t half[17*2];\
    uint8_t * const halfH= ((uint8_t*)half);\
    put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
    OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
}
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgusb " #temp ", " #a "      \n\t"\
    "mov" #size " " #a ", " #b "    \n\t"
#define AVG_MMX2_OP(a,b,temp, size) \
    "mov" #size " " #b ", " #temp " \n\t"\
    "pavgb " #temp ", " #a "        \n\t"\
    "mov" #size " " #a ", " #b "    \n\t"

QPEL_BASE(put_       , ff_pw_16, _       , PUT_OP, PUT_OP)
QPEL_BASE(avg_       , ff_pw_16, _       , AVG_MMX2_OP, AVG_3DNOW_OP)
QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, 3dnow)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_3DNOW_OP, 3dnow)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
QPEL_OP(put_       , ff_pw_16, _       , PUT_OP, mmx2)
QPEL_OP(avg_       , ff_pw_16, _       , AVG_MMX2_OP, mmx2)
QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
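
/* Naming convention for the generated motion-compensation functions:
 * OPNAME qpelW_mcXY, where W is the block width and X/Y are the quarter-pel
 * phase (0..3) of the motion vector in the horizontal/vertical direction.
 * mc00 is the fullpel copy and mc20/mc02 the pure half-pel horizontal/
 * vertical cases; the remaining positions are built by averaging the
 * appropriate h/v lowpass intermediates. */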
/***********************************/
/* bilinear qpel: not compliant with any spec, only for -lavdopts fast */

#define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
}
#define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
}

#define QPEL_2TAP(OPNAME, SIZE, MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
                          OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
                          OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
                          OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
}\
static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
    OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
}\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0,         1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1,        -1,       0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0,         stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride,   -stride,  0)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0,         stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1,         stride, -1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride,   -stride,  1)\
QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\

QPEL_2TAP(put_, 16, mmx2)
QPEL_2TAP(avg_, 16, mmx2)
QPEL_2TAP(put_,  8, mmx2)
QPEL_2TAP(avg_,  8, mmx2)
QPEL_2TAP(put_, 16, 3dnow)
QPEL_2TAP(avg_, 16, 3dnow)
QPEL_2TAP(put_,  8, 3dnow)
QPEL_2TAP(avg_,  8, 3dnow)
#if HAVE_YASM
typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
                                 x86_reg linesize, x86_reg start_y,
                                 x86_reg end_y, x86_reg block_h,
                                 x86_reg start_x, x86_reg end_x,
                                 x86_reg block_w);
extern emu_edge_core_func ff_emu_edge_core_mmx;
extern emu_edge_core_func ff_emu_edge_core_sse;
static av_always_inline
void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
                      int block_w, int block_h,
                      int src_x, int src_y, int w, int h,
                      emu_edge_core_func *core_fn)
{
    int start_y, start_x, end_y, end_x, src_y_add=0;

    if(src_y>= h){
        src_y_add = h-1-src_y;
        src_y=h-1;
    }else if(src_y<=-block_h){
        src_y_add = 1-block_h-src_y;
        src_y=1-block_h;
    }
    if(src_x>= w){
        src+= (w-1-src_x);
        src_x=w-1;
    }else if(src_x<=-block_w){
        src+= (1-block_w-src_x);
        src_x=1-block_w;
    }

    start_y= FFMAX(0, -src_y);
    start_x= FFMAX(0, -src_x);
    end_y= FFMIN(block_h, h-src_y);
    end_x= FFMIN(block_w, w-src_x);
    assert(start_x < end_x && block_w > 0);
    assert(start_y < end_y && block_h > 0);

    // fill in the to-be-copied part plus all above/below
    src += (src_y_add+start_y)*linesize + start_x;
    buf += start_x;
    core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
}
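
/* Worked example of the clamping above (a sketch): for a 16-pixel-wide
 * picture (w=16) with block_w=8 and src_x=-3, start_x becomes 3 and end_x 8,
 * so the core copies the in-picture columns and replicates the leftmost of
 * them into the remaining border columns of buf; src_y/src_y_add do the
 * same vertically while keeping src pointing inside the picture. */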
#if ARCH_X86_32
static av_noinline
void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h,
                          int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_mmx);
}
#endif

static av_noinline
void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
                          int block_w, int block_h,
                          int src_x, int src_y, int w, int h)
{
    emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
                     w, h, &ff_emu_edge_core_sse);
}
#endif /* HAVE_YASM */
typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
                                    int linesize, int block_w, int block_h,
                                    int src_x, int src_y, int w, int h);
static av_always_inline
void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
         int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
         emulated_edge_mc_func *emu_edge_fn)
{
    const int w    = 8;
    const int ix   = ox>>(16+shift);
    const int iy   = oy>>(16+shift);
    const int oxs  = ox>>4;
    const int oys  = oy>>4;
    const int dxxs = dxx>>4;
    const int dxys = dxy>>4;
    const int dyxs = dyx>>4;
    const int dyys = dyy>>4;
    const uint16_t r4[4]   = {r,r,r,r};
    const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
    const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
    const uint64_t shift2 = 2*shift;
    uint8_t edge_buf[(h+1)*stride];
    int x, y;

    const int dxw = (dxx-(1<<(16+shift)))*(w-1);
    const int dyh = (dyy-(1<<(16+shift)))*(h-1);
    const int dxh = dxy*(h-1);
    const int dyw = dyx*(w-1);
    if( // non-constant fullpel offset (3% of blocks)
        ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
         (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
        // uses more than 16 bits of subpel mv (only at huge resolution)
        || (dxx|dxy|dyx|dyy)&15 )
    {
        //FIXME could still use mmx for some of the rows
        ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
        return;
    }

    src += ix + iy*stride;
    if( (unsigned)ix >= width-w ||
        (unsigned)iy >= height-h )
    {
        emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
        src = edge_buf;
    }
1689 "movd %0, %%mm6 \n\t"
1690 "pxor %%mm7, %%mm7 \n\t"
1691 "punpcklwd %%mm6, %%mm6 \n\t"
1692 "punpcklwd %%mm6, %%mm6 \n\t"
1696 for(x=0; x<w; x+=4){
1697 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1698 oxs - dxys + dxxs*(x+1),
1699 oxs - dxys + dxxs*(x+2),
1700 oxs - dxys + dxxs*(x+3) };
1701 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1702 oys - dyys + dyxs*(x+1),
1703 oys - dyys + dyxs*(x+2),
1704 oys - dyys + dyxs*(x+3) };
1708 "movq %0, %%mm4 \n\t"
1709 "movq %1, %%mm5 \n\t"
1710 "paddw %2, %%mm4 \n\t"
1711 "paddw %3, %%mm5 \n\t"
1712 "movq %%mm4, %0 \n\t"
1713 "movq %%mm5, %1 \n\t"
1714 "psrlw $12, %%mm4 \n\t"
1715 "psrlw $12, %%mm5 \n\t"
1716 : "+m"(*dx4), "+m"(*dy4)
1717 : "m"(*dxy4), "m"(*dyy4)
1721 "movq %%mm6, %%mm2 \n\t"
1722 "movq %%mm6, %%mm1 \n\t"
1723 "psubw %%mm4, %%mm2 \n\t"
1724 "psubw %%mm5, %%mm1 \n\t"
1725 "movq %%mm2, %%mm0 \n\t"
1726 "movq %%mm4, %%mm3 \n\t"
1727 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1728 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1729 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1730 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1732 "movd %4, %%mm5 \n\t"
1733 "movd %3, %%mm4 \n\t"
1734 "punpcklbw %%mm7, %%mm5 \n\t"
1735 "punpcklbw %%mm7, %%mm4 \n\t"
1736 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1737 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1739 "movd %2, %%mm5 \n\t"
1740 "movd %1, %%mm4 \n\t"
1741 "punpcklbw %%mm7, %%mm5 \n\t"
1742 "punpcklbw %%mm7, %%mm4 \n\t"
1743 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1744 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1745 "paddw %5, %%mm1 \n\t"
1746 "paddw %%mm3, %%mm2 \n\t"
1747 "paddw %%mm1, %%mm0 \n\t"
1748 "paddw %%mm2, %%mm0 \n\t"
1750 "psrlw %6, %%mm0 \n\t"
1751 "packuswb %%mm0, %%mm0 \n\t"
1752 "movd %%mm0, %0 \n\t"
1754 : "=m"(dst[x+y*stride])
1755 : "m"(src[0]), "m"(src[1]),
1756 "m"(src[stride]), "m"(src[stride+1]),
1757 "m"(*r4), "m"(shift2)
#if HAVE_YASM
#if ARCH_X86_32
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_mmx);
}
#endif
static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &emulated_edge_mc_sse);
}
#else
static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
                    int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
{
    gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
        width, height, &ff_emulated_edge_mc_8);
}
#endif
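
/* The PREFETCH() template below generates one tiny function per CPU hint
 * (prefetcht0 for MMX2, 3DNow!'s prefetch). The loop just touches the first
 * byte of each row ahead of the motion-compensation code; the "m"(*p)
 * operand only tells the compiler which memory the hint refers to. */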
#define PREFETCH(name, op) \
static void name(void *mem, int stride, int h){\
    const uint8_t *p= mem;\
    do{\
        __asm__ volatile(#op" %0" :: "m"(*p));\
        p+= stride;\
    }while(--h);\
}

PREFETCH(prefetch_mmx2,  prefetcht0)
PREFETCH(prefetch_3dnow, prefetch)
#undef PREFETCH

#include "h264_qpel_mmx.c"

void ff_put_h264_chroma_mc8_mmx_rnd   (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd  (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc4_mmx       (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2      (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3     (uint8_t *dst, uint8_t *src,
                                       int stride, int h, int x, int y);

#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
                            (uint8_t *dst, uint8_t *src,\
                             int stride, int h, int x, int y);

CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)

/* CAVS specific */
void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels8_mmx(dst, src, stride, 8);
}
void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    put_pixels16_mmx(dst, src, stride, 16);
}
void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
    avg_pixels16_mmx(dst, src, stride, 16);
}

/* VC1 specific */
void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    put_pixels8_mmx(dst, src, stride, 8);
}
void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
    avg_pixels8_mmx2(dst, src, stride, 8);
}

/* only used in VP3/5/6 */
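/* "no_rnd" means the average truncates instead of rounding up: per byte,
 * dst[i] = (a[i] + b[i]) >> 1 rather than (a[i] + b[i] + 1) >> 1, which is
 * the exact behaviour the VP3 family requires (see PAVGBP_MMX_NO_RND). */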
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
//    START_TIMER
    MOVQ_BFE(mm6);
    __asm__ volatile(
        "1:                             \n\t"
        "movq   (%1), %%mm0             \n\t"
        "movq   (%2), %%mm1             \n\t"
        "movq   (%1,%4), %%mm2          \n\t"
        "movq   (%2,%4), %%mm3          \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3)             \n\t"
        "movq   %%mm5, (%3,%4)          \n\t"

        "movq   (%1,%4,2), %%mm0        \n\t"
        "movq   (%2,%4,2), %%mm1        \n\t"
        "movq   (%1,%5), %%mm2          \n\t"
        "movq   (%2,%5), %%mm3          \n\t"
        "lea    (%1,%4,4), %1           \n\t"
        "lea    (%2,%4,4), %2           \n\t"
        PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4,   %%mm2, %%mm3, %%mm5)
        "movq   %%mm4, (%3,%4,2)        \n\t"
        "movq   %%mm5, (%3,%5)          \n\t"
        "lea    (%3,%4,4), %3           \n\t"
        "subl   $4, %0                  \n\t"
        "jnz    1b                      \n\t"
        :"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
        :"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
        :"memory");
//    STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}
static void put_vp_no_rnd_pixels16_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, int stride, int h)
{
    put_vp_no_rnd_pixels8_l2_mmx(dst,   a,   b,   stride, h);
    put_vp_no_rnd_pixels8_l2_mmx(dst+8, a+8, b+8, stride, h);
}

#if CONFIG_DIRAC_DECODER
#define DIRAC_PIXOP(OPNAME, EXT)\
void ff_ ## OPNAME ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
    OPNAME ## _pixels16_ ## EXT(dst   , src[0]   , stride, h);\
    OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}

DIRAC_PIXOP(put, mmx)
DIRAC_PIXOP(avg, mmx)
DIRAC_PIXOP(avg, mmx2)

void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    put_pixels16_sse2(dst   , src[0]   , stride, h);
    put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
    avg_pixels16_sse2(dst   , src[0]   , stride, h);
    avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
#endif

/* XXX: those functions should be suppressed ASAP when all IDCTs are
   converted */
#if CONFIG_GPL
static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmx_idct (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_mmxext_idct (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
#endif
static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
{
    ff_idct_xvid_mmx2 (block);
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
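
/* Scalar reference for the two SIMD versions below; this mirrors the
 * generic C vorbis_inverse_coupling in dsputil.c and is here only for
 * readability:
 *
 *     if (mag[i] > 0) {
 *         if (ang[i] > 0) ang[i]  = mag[i] - ang[i];
 *         else          { float t = ang[i]; ang[i] = mag[i]; mag[i] += t; }
 *     } else {
 *         if (ang[i] > 0) ang[i] += mag[i];
 *         else          { float t = ang[i]; ang[i] = mag[i]; mag[i] -= t; }
 *     }
 */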
static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
{
    int i;
    __asm__ volatile("pxor %%mm7, %%mm7":);
    for(i=0; i<blocksize; i+=2) {
        __asm__ volatile(
            "movq       %0, %%mm0 \n\t"
            "movq       %1, %%mm1 \n\t"
            "movq    %%mm0, %%mm2 \n\t"
            "movq    %%mm1, %%mm3 \n\t"
            "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
            "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
            "pslld     $31, %%mm2 \n\t" // keep only the sign bit
            "pxor    %%mm2, %%mm1 \n\t"
            "movq    %%mm3, %%mm4 \n\t"
            "pand    %%mm1, %%mm3 \n\t"
            "pandn   %%mm1, %%mm4 \n\t"
            "pfadd   %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "pfsub   %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movq    %%mm3, %1    \n\t"
            "movq    %%mm0, %0    \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
    __asm__ volatile("femms");
}
static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
{
    int i;

    __asm__ volatile(
        "movaps  %0, %%xmm5 \n\t"
        ::"m"(ff_pdw_80000000[0])
    );
    for(i=0; i<blocksize; i+=4) {
        __asm__ volatile(
            "movaps      %0, %%xmm0 \n\t"
            "movaps      %1, %%xmm1 \n\t"
            "xorps   %%xmm2, %%xmm2 \n\t"
            "xorps   %%xmm3, %%xmm3 \n\t"
            "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
            "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
            "andps   %%xmm5, %%xmm2 \n\t" // keep only the sign bit
            "xorps   %%xmm2, %%xmm1 \n\t"
            "movaps  %%xmm3, %%xmm4 \n\t"
            "andps   %%xmm1, %%xmm3 \n\t"
            "andnps  %%xmm1, %%xmm4 \n\t"
            "addps   %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
            "subps   %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
            "movaps  %%xmm3, %1     \n\t"
            "movaps  %%xmm0, %0     \n\t"
            :"+m"(mag[i]), "+m"(ang[i])
            ::"memory"
        );
    }
}

#define IF1(x) x
#define IF0(x)
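
/* MIX5 is the fast path for the common 5.1 -> stereo/mono AC-3 downmix.
 * Once ac3_downmix_sse() has verified that the matrix collapses to three
 * distinct gains (front, centre, surround), the stereo case is roughly
 *     L = front*ch0 + centre*ch1 + surround*ch3
 *     R = front*ch2 + centre*ch1 + surround*ch4
 * The mono()/stereo() parameters receive IF1/IF0, which splice statements
 * in or out so that a single body serves both output layouts. */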
#define MIX5(mono,stereo)\
    __asm__ volatile(\
        "movss           0(%2), %%xmm5 \n"\
        "movss           8(%2), %%xmm6 \n"\
        "movss          24(%2), %%xmm7 \n"\
        "shufps     $0, %%xmm5, %%xmm5 \n"\
        "shufps     $0, %%xmm6, %%xmm6 \n"\
        "shufps     $0, %%xmm7, %%xmm7 \n"\
        "1: \n"\
        "movaps       (%0,%1), %%xmm0 \n"\
        "movaps  0x400(%0,%1), %%xmm1 \n"\
        "movaps  0x800(%0,%1), %%xmm2 \n"\
        "movaps  0xc00(%0,%1), %%xmm3 \n"\
        "movaps 0x1000(%0,%1), %%xmm4 \n"\
        "mulps         %%xmm5, %%xmm0 \n"\
        "mulps         %%xmm6, %%xmm1 \n"\
        "mulps         %%xmm5, %%xmm2 \n"\
        "mulps         %%xmm7, %%xmm3 \n"\
        "mulps         %%xmm7, %%xmm4 \n"\
 stereo("addps         %%xmm1, %%xmm0 \n")\
        "addps         %%xmm1, %%xmm2 \n"\
        "addps         %%xmm3, %%xmm0 \n"\
        "addps         %%xmm4, %%xmm2 \n"\
   mono("addps         %%xmm2, %%xmm0 \n")\
        "movaps        %%xmm0, (%0,%1) \n"\
 stereo("movaps        %%xmm2, 0x400(%0,%1) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i)\
        :"r"(samples[0]+len), "r"(matrix)\
        :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
                      "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
         "memory"\
    );

#define MIX_MISC(stereo)\
    __asm__ volatile(\
        "1: \n"\
        "movaps     (%3,%0), %%xmm0 \n"\
 stereo("movaps      %%xmm0, %%xmm1 \n")\
        "mulps       %%xmm4, %%xmm0 \n"\
 stereo("mulps       %%xmm5, %%xmm1 \n")\
        "lea    1024(%3,%0), %1     \n"\
        "mov             %5, %2     \n"\
        "2: \n"\
        "movaps        (%1), %%xmm2 \n"\
 stereo("movaps      %%xmm2, %%xmm3 \n")\
        "mulps      (%4,%2), %%xmm2 \n"\
 stereo("mulps    16(%4,%2), %%xmm3 \n")\
        "addps       %%xmm2, %%xmm0 \n"\
 stereo("addps       %%xmm3, %%xmm1 \n")\
        "add          $1024, %1     \n"\
        "add            $32, %2     \n"\
        "jl 2b \n"\
        "movaps      %%xmm0, (%3,%0) \n"\
 stereo("movaps      %%xmm1, 1024(%3,%0) \n")\
        "add $16, %0 \n"\
        "jl 1b \n"\
        :"+&r"(i), "=&r"(j), "=&r"(k)\
        :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
        :"memory"\
    );
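
/* Note on the int casts below: matrix_cmp aliases the float coefficients as
 * int so they can be compared bitwise; identical bit patterns guarantee
 * equal floats, which is all the MIX5 fast path needs. Otherwise the generic
 * MIX_MISC loop is used, with each input channel's two output gains
 * broadcast to 4-wide vectors in matrix_simd. */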
static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
{
    int (*matrix_cmp)[2] = (int(*)[2])matrix;
    intptr_t i,j,k;

    i = -len*sizeof(float);
    if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
        MIX5(IF0,IF1);
    } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
        MIX5(IF1,IF0);
    } else {
        DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
        j = 2*in_ch*sizeof(float);
        __asm__ volatile(
            "1: \n"
            "sub $8, %0 \n"
            "movss     (%2,%0), %%xmm4 \n"
            "movss    4(%2,%0), %%xmm5 \n"
            "shufps $0, %%xmm4, %%xmm4 \n"
            "shufps $0, %%xmm5, %%xmm5 \n"
            "movaps %%xmm4,   (%1,%0,4) \n"
            "movaps %%xmm5, 16(%1,%0,4) \n"
            "jg 1b \n"
            :"+&r"(j)
            :"r"(matrix_simd), "r"(matrix)
            :"memory"
        );
        if(out_ch == 2) {
            MIX_MISC(IF1);
        } else {
            MIX_MISC(IF0);
        }
    }
}

static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = (len-4)*4;
    __asm__ volatile(
        "1: \n\t"
        "movq    (%2,%0), %%mm0 \n\t"
        "movq   8(%2,%0), %%mm1 \n\t"
        "pfmul   (%3,%0), %%mm0 \n\t"
        "pfmul  8(%3,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub   $16, %0 \n\t"
        "jge    1b \n\t"
        "femms \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1)
        :"memory"
    );
}
static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = (len-8)*4;
    __asm__ volatile(
        "1: \n\t"
        "movaps    (%2,%0), %%xmm0 \n\t"
        "movaps  16(%2,%0), %%xmm1 \n\t"
        "mulps     (%3,%0), %%xmm0 \n\t"
        "mulps   16(%3,%0), %%xmm1 \n\t"
        "movaps  %%xmm0,   (%1,%0) \n\t"
        "movaps  %%xmm1, 16(%1,%0) \n\t"
        "sub   $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1)
        :"memory"
    );
}
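
/* Both "reverse" variants below compute dst[i] = src0[i] * src1[len-1-i];
 * pswapd (3DNow!ext) and shufps $0x1b (SSE) perform the element reversal. */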
static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = len*4-16;
    __asm__ volatile(
        "1: \n\t"
        "pswapd   8(%1), %%mm0 \n\t"
        "pswapd    (%1), %%mm1 \n\t"
        "pfmul  (%3,%0), %%mm0 \n\t"
        "pfmul 8(%3,%0), %%mm1 \n\t"
        "movq  %%mm0,  (%2,%0) \n\t"
        "movq  %%mm1, 8(%2,%0) \n\t"
        "add   $16, %1 \n\t"
        "sub   $16, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory");
    __asm__ volatile("femms");
}
static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
    x86_reg i = len*4-32;
    __asm__ volatile(
        "1: \n\t"
        "movaps        16(%1), %%xmm0 \n\t"
        "movaps          (%1), %%xmm1 \n\t"
        "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
        "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
        "mulps        (%3,%0), %%xmm0 \n\t"
        "mulps      16(%3,%0), %%xmm1 \n\t"
        "movaps     %%xmm0,   (%2,%0) \n\t"
        "movaps     %%xmm1, 16(%2,%0) \n\t"
        "add   $32, %1 \n\t"
        "sub   $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i), "+r"(src1)
        :"r"(dst), "r"(src0)
        :"memory");
}
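
/* vector_fmul_add: dst[i] = src0[i]*src1[i] + src2[i], processed four
 * floats per iteration with 3DNow! and eight with SSE. */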
static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
                                  const float *src2, int len){
    x86_reg i = (len-4)*4;
    __asm__ volatile(
        "1: \n\t"
        "movq    (%2,%0), %%mm0 \n\t"
        "movq   8(%2,%0), %%mm1 \n\t"
        "pfmul   (%3,%0), %%mm0 \n\t"
        "pfmul  8(%3,%0), %%mm1 \n\t"
        "pfadd   (%4,%0), %%mm0 \n\t"
        "pfadd  8(%4,%0), %%mm1 \n\t"
        "movq   %%mm0,  (%1,%0) \n\t"
        "movq   %%mm1, 8(%1,%0) \n\t"
        "sub   $16, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
        :"memory"
    );
    __asm__ volatile("femms");
}
static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
                                const float *src2, int len){
    x86_reg i = (len-8)*4;
    __asm__ volatile(
        "1: \n\t"
        "movaps   (%2,%0), %%xmm0 \n\t"
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "mulps    (%3,%0), %%xmm0 \n\t"
        "mulps  16(%3,%0), %%xmm1 \n\t"
        "addps    (%4,%0), %%xmm0 \n\t"
        "addps  16(%4,%0), %%xmm1 \n\t"
        "movaps %%xmm0,   (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "sub   $32, %0 \n\t"
        "jge    1b \n\t"
        :"+r"(i)
        :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
        :"memory"
    );
}

#if HAVE_6REGS
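/* MDCT overlap-add windowing. Both versions below are equivalent to the
 * scalar loop (cf. ff_vector_fmul_window_c):
 *     dst += len; win += len; src0 += len;
 *     for (i = -len, j = len-1; i < 0; i++, j--) {
 *         float s0 = src0[i], s1 = src1[j], wi = win[i], wj = win[j];
 *         dst[i] = s0*wj - s1*wi;
 *         dst[j] = s0*wi + s1*wj;
 *     }
 */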
static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
                                      const float *win, int len){
    x86_reg i = -len*4;
    x86_reg j = len*4-8;
    __asm__ volatile(
        "1: \n"
        "pswapd (%5,%1), %%mm1 \n"
        "movq   (%5,%0), %%mm0 \n"
        "pswapd (%4,%1), %%mm5 \n"
        "movq   (%3,%0), %%mm4 \n"
        "movq     %%mm0, %%mm2 \n"
        "movq     %%mm1, %%mm3 \n"
        "pfmul    %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
        "pfmul    %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
        "pfmul    %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
        "pfmul    %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
        "pfadd    %%mm3, %%mm2 \n"
        "pfsub    %%mm0, %%mm1 \n"
        "pswapd   %%mm2, %%mm2 \n"
        "movq     %%mm1, (%2,%0) \n"
        "movq     %%mm2, (%2,%1) \n"
        "sub $8, %1 \n"
        "add $8, %0 \n"
        "jl 1b \n"
        "femms \n"
        :"+r"(i), "+r"(j)
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
    );
}
static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
                                   const float *win, int len){
    x86_reg i = -len*4;
    x86_reg j = len*4-16;
    __asm__ volatile(
        "1: \n"
        "movaps       (%5,%1), %%xmm1 \n"
        "movaps       (%5,%0), %%xmm0 \n"
        "movaps       (%4,%1), %%xmm5 \n"
        "movaps       (%3,%0), %%xmm4 \n"
        "shufps $0x1b, %%xmm1, %%xmm1 \n"
        "shufps $0x1b, %%xmm5, %%xmm5 \n"
        "movaps        %%xmm0, %%xmm2 \n"
        "movaps        %%xmm1, %%xmm3 \n"
        "mulps         %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
        "mulps         %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
        "mulps         %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
        "mulps         %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
        "addps         %%xmm3, %%xmm2 \n"
        "subps         %%xmm0, %%xmm1 \n"
        "shufps $0x1b, %%xmm2, %%xmm2 \n"
        "movaps        %%xmm1, (%2,%0) \n"
        "movaps        %%xmm2, (%2,%1) \n"
        "sub $16, %1 \n"
        "add $16, %0 \n"
        "jl 1b \n"
        :"+r"(i), "+r"(j)
        :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
    );
}
#endif /* HAVE_6REGS */
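
/* vector_clipf: dst[i] = FFMIN(FFMAX(src[i], min), max). Using maxps/minps
 * avoids data-dependent branches; the loop below handles 16 floats per
 * iteration and therefore assumes len is a multiple of 16, per the
 * DSPContext contract. */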
static void vector_clipf_sse(float *dst, const float *src, float min, float max,
                             int len)
{
    x86_reg i = (len-16)*4;
    __asm__ volatile(
        "movss          %3, %%xmm4 \n"
        "movss          %4, %%xmm5 \n"
        "shufps $0, %%xmm4, %%xmm4 \n"
        "shufps $0, %%xmm5, %%xmm5 \n"
        "1: \n\t"
        "movaps   (%2,%0), %%xmm0 \n\t" // 3/1 on intel
        "movaps 16(%2,%0), %%xmm1 \n\t"
        "movaps 32(%2,%0), %%xmm2 \n\t"
        "movaps 48(%2,%0), %%xmm3 \n\t"
        "maxps     %%xmm4, %%xmm0 \n\t"
        "maxps     %%xmm4, %%xmm1 \n\t"
        "maxps     %%xmm4, %%xmm2 \n\t"
        "maxps     %%xmm4, %%xmm3 \n\t"
        "minps     %%xmm5, %%xmm0 \n\t"
        "minps     %%xmm5, %%xmm1 \n\t"
        "minps     %%xmm5, %%xmm2 \n\t"
        "minps     %%xmm5, %%xmm3 \n\t"
        "movaps %%xmm0,   (%1,%0) \n\t"
        "movaps %%xmm1, 16(%1,%0) \n\t"
        "movaps %%xmm2, 32(%1,%0) \n\t"
        "movaps %%xmm3, 48(%1,%0) \n\t"
        "sub   $64, %0 \n\t"
        "jge    1b \n\t"
        :"+&r"(i)
        :"r"(dst), "r"(src), "m"(min), "m"(max)
        :"memory"
    );
}

void ff_vp3_idct_mmx(int16_t *input_data);
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);

void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);

void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);

void ff_vp3_idct_sse2(int16_t *input_data);
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);

int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);

void ff_apply_window_int16_mmxext    (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2      (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2_ba   (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3     (int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
                                      const int16_t *window, unsigned int len);

void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
int  ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
int  ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);

float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);

void ff_vector_clip_int32_mmx     (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2    (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4    (int32_t *dst, const int32_t *src, int32_t min,
                                   int32_t max, unsigned int len);

extern void ff_butterflies_float_interleave_sse(float *dst, const float *src0,
                                                const float *src1, int len);
extern void ff_butterflies_float_interleave_avx(float *dst, const float *src0,
                                                const float *src1, int len);
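
/* Everything above only defines or declares kernels; dsputil_init_mmx() is
 * the single dispatch point. It reads the CPU flags once and overwrites the
 * C defaults in the DSPContext with the fastest applicable versions, so the
 * per-call code never has to test CPU features itself. */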
void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
{
    int mm_flags = av_get_cpu_flags();
    const int high_bit_depth = avctx->bits_per_raw_sample > 8;
    const int bit_depth = avctx->bits_per_raw_sample;

    if (avctx->dsp_mask) {
        if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
            mm_flags |= (avctx->dsp_mask & 0xffff);
        else
            mm_flags &= ~(avctx->dsp_mask & 0xffff);
    }

#if 0
    av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
    if (mm_flags & AV_CPU_FLAG_MMX)
        av_log(avctx, AV_LOG_INFO, " mmx");
    if (mm_flags & AV_CPU_FLAG_MMX2)
        av_log(avctx, AV_LOG_INFO, " mmx2");
    if (mm_flags & AV_CPU_FLAG_3DNOW)
        av_log(avctx, AV_LOG_INFO, " 3dnow");
    if (mm_flags & AV_CPU_FLAG_SSE)
        av_log(avctx, AV_LOG_INFO, " sse");
    if (mm_flags & AV_CPU_FLAG_SSE2)
        av_log(avctx, AV_LOG_INFO, " sse2");
    av_log(avctx, AV_LOG_INFO, "\n");
#endif

    if (mm_flags & AV_CPU_FLAG_MMX) {
        const int idct_algo= avctx->idct_algo;

        if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
            if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
                c->idct_put= ff_simple_idct_put_mmx;
                c->idct_add= ff_simple_idct_add_mmx;
                c->idct    = ff_simple_idct_mmx;
                c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
#if CONFIG_GPL
            }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
                if(mm_flags & AV_CPU_FLAG_MMX2){
                    c->idct_put= ff_libmpeg2mmx2_idct_put;
                    c->idct_add= ff_libmpeg2mmx2_idct_add;
                    c->idct    = ff_mmxext_idct;
                }else{
                    c->idct_put= ff_libmpeg2mmx_idct_put;
                    c->idct_add= ff_libmpeg2mmx_idct_add;
                    c->idct    = ff_mmx_idct;
                }
                c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
#endif
            }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
                     idct_algo==FF_IDCT_VP3 && HAVE_YASM){
                if(mm_flags & AV_CPU_FLAG_SSE2){
                    c->idct_put= ff_vp3_idct_put_sse2;
                    c->idct_add= ff_vp3_idct_add_sse2;
                    c->idct    = ff_vp3_idct_sse2;
                    c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
                }else{
                    c->idct_put= ff_vp3_idct_put_mmx;
                    c->idct_add= ff_vp3_idct_add_mmx;
                    c->idct    = ff_vp3_idct_mmx;
                    c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
                }
            }else if(idct_algo==FF_IDCT_CAVS){
                c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
            }else if(idct_algo==FF_IDCT_XVIDMMX){
                if(mm_flags & AV_CPU_FLAG_SSE2){
                    c->idct_put= ff_idct_xvid_sse2_put;
                    c->idct_add= ff_idct_xvid_sse2_add;
                    c->idct    = ff_idct_xvid_sse2;
                    c->idct_permutation_type= FF_SSE2_IDCT_PERM;
                }else if(mm_flags & AV_CPU_FLAG_MMX2){
                    c->idct_put= ff_idct_xvid_mmx2_put;
                    c->idct_add= ff_idct_xvid_mmx2_add;
                    c->idct    = ff_idct_xvid_mmx2;
                }else{
                    c->idct_put= ff_idct_xvid_mmx_put;
                    c->idct_add= ff_idct_xvid_mmx_add;
                    c->idct    = ff_idct_xvid_mmx;
                }
            }
        }

        c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
        c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
        c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
        if (!high_bit_depth) {
            c->clear_block  = clear_block_mmx;
            c->clear_blocks = clear_blocks_mmx;
            if ((mm_flags & AV_CPU_FLAG_SSE) &&
                !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
                /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
                c->clear_block  = clear_block_sse;
                c->clear_blocks = clear_blocks_sse;
            }
        }

#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
        c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _     ## CPU; \
        c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_  ## CPU; \
        c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_  ## CPU; \
        c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU

        if (!high_bit_depth) {
            SET_HPEL_FUNCS(put, 0, 16, mmx);
            SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
            SET_HPEL_FUNCS(avg, 0, 16, mmx);
            SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
            SET_HPEL_FUNCS(put, 1, 8, mmx);
            SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
            SET_HPEL_FUNCS(avg, 1, 8, mmx);
            SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
        }

#if ARCH_X86_32 || !HAVE_YASM
        c->gmc= gmc_mmx;
#endif
#if ARCH_X86_32 && HAVE_YASM
        if (!high_bit_depth)
            c->emulated_edge_mc = emulated_edge_mc_mmx;
#endif

        c->add_bytes= add_bytes_mmx;

        if (!high_bit_depth)
            c->draw_edges = draw_edges_mmx;

        c->put_no_rnd_pixels_l2[0]= put_vp_no_rnd_pixels16_l2_mmx;
        c->put_no_rnd_pixels_l2[1]= put_vp_no_rnd_pixels8_l2_mmx;

        if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
            c->h263_v_loop_filter= h263_v_loop_filter_mmx;
            c->h263_h_loop_filter= h263_h_loop_filter_mmx;
        }

#if HAVE_YASM
        if (!high_bit_depth && CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
            c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
        }

        c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif

        if (mm_flags & AV_CPU_FLAG_MMX2) {
            c->prefetch = prefetch_mmx2;

            if (!high_bit_depth) {
                c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
                c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;

                c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
                c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
                c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;

                c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
                c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;

                c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
                c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
                c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
            }

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                if (!high_bit_depth) {
                    c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
                    c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
                    c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
                    c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
                    c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
                    c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
                }

                if (CONFIG_VP3_DECODER && HAVE_YASM) {
                    c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
                    c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
                }
            }
            if (CONFIG_VP3_DECODER && HAVE_YASM) {
                c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
            }

            if (CONFIG_VP3_DECODER
                && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
            }

#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
            c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
            c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU

            SET_QPEL_FUNCS(put_qpel,        0, 16, mmx2, );
            SET_QPEL_FUNCS(put_qpel,        1,  8, mmx2, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_qpel,        0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_qpel,        1,  8, mmx2, );

            if (!high_bit_depth) {
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
                SET_QPEL_FUNCS(put_h264_qpel, 1,  8, mmx2, );
                SET_QPEL_FUNCS(put_h264_qpel, 2,  4, mmx2, );
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
                SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, mmx2, );
                SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, mmx2, );
            }
            else if (bit_depth == 10) {
#if HAVE_YASM
#if !ARCH_X86_64
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
                SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_mmxext, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_mmxext, ff_);
#endif
                SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 10_mmxext, ff_);
                SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 10_mmxext, ff_);
#endif
            }

            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, mmx2, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, mmx2, );

#if HAVE_YASM
            if (!high_bit_depth && CONFIG_H264CHROMA) {
                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
                c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
            }
            if (bit_depth == 10 && CONFIG_H264CHROMA) {
                c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
                c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
                c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
            }

            c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
#endif
#if HAVE_7REGS
            if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
                c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
#endif

        } else if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
            c->prefetch = prefetch_3dnow;

            if (!high_bit_depth) {
                c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
                c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;

                c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
                c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
                c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;

                c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
                c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;

                c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
                c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
                c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
            }

            if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
                c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
                c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
                c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
                c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
            }

            if (CONFIG_VP3_DECODER
                && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
                c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
                c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
            }

            SET_QPEL_FUNCS(put_qpel,        0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_qpel,        1,  8, 3dnow, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_qpel,        0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_qpel,        1,  8, 3dnow, );

            if (!high_bit_depth) {
                SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
                SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 3dnow, );
                SET_QPEL_FUNCS(put_h264_qpel, 2,  4, 3dnow, );
                SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
                SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 3dnow, );
                SET_QPEL_FUNCS(avg_h264_qpel, 2,  4, 3dnow, );
            }

            SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(put_2tap_qpel, 1,  8, 3dnow, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
            SET_QPEL_FUNCS(avg_2tap_qpel, 1,  8, 3dnow, );

#if HAVE_YASM
            if (!high_bit_depth && CONFIG_H264CHROMA) {
                c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
                c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
            }
#endif
        }
    }

#define H264_QPEL_FUNCS(x, y, CPU)\
    c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
    c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
    c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;

    if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
        // these functions are slower than mmx on AMD, but faster on Intel
        if (!high_bit_depth) {
            c->put_pixels_tab[0][0]        = put_pixels16_sse2;
            c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
            c->avg_pixels_tab[0][0]        = avg_pixels16_sse2;
            H264_QPEL_FUNCS(0, 0, sse2);
        }
    }

    if(mm_flags & AV_CPU_FLAG_SSE2){
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(0, 1, sse2);
            H264_QPEL_FUNCS(0, 2, sse2);
            H264_QPEL_FUNCS(0, 3, sse2);
            H264_QPEL_FUNCS(1, 1, sse2);
            H264_QPEL_FUNCS(1, 2, sse2);
            H264_QPEL_FUNCS(1, 3, sse2);
            H264_QPEL_FUNCS(2, 1, sse2);
            H264_QPEL_FUNCS(2, 2, sse2);
            H264_QPEL_FUNCS(2, 3, sse2);
            H264_QPEL_FUNCS(3, 1, sse2);
            H264_QPEL_FUNCS(3, 2, sse2);
            H264_QPEL_FUNCS(3, 3, sse2);
        }

#define H264_QPEL_FUNCS_10(x, y, CPU)\
        c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
        c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
        c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
        c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;

        if (bit_depth == 10) {
            SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(put_h264_qpel, 1,  8, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
            SET_QPEL_FUNCS(avg_h264_qpel, 1,  8, 10_sse2, ff_);
            H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
            H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
            H264_QPEL_FUNCS_10(3, 0, sse2_cache64)

            if (CONFIG_H264CHROMA) {
                c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
                c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
            }
        }
    }

    if(mm_flags & AV_CPU_FLAG_SSSE3){
        if (!high_bit_depth) {
            H264_QPEL_FUNCS(1, 0, ssse3);
            H264_QPEL_FUNCS(1, 1, ssse3);
            H264_QPEL_FUNCS(1, 2, ssse3);
            H264_QPEL_FUNCS(1, 3, ssse3);
            H264_QPEL_FUNCS(2, 0, ssse3);
            H264_QPEL_FUNCS(2, 1, ssse3);
            H264_QPEL_FUNCS(2, 2, ssse3);
            H264_QPEL_FUNCS(2, 3, ssse3);
            H264_QPEL_FUNCS(3, 0, ssse3);
            H264_QPEL_FUNCS(3, 1, ssse3);
            H264_QPEL_FUNCS(3, 2, ssse3);
            H264_QPEL_FUNCS(3, 3, ssse3);
        }
#if HAVE_YASM
        else if (bit_depth == 10) {
            H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
            H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
            H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
        }
        if (!high_bit_depth && CONFIG_H264CHROMA) {
            c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
            c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
            c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
            c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
        }
        c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
        if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
            c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
#endif
    }

    if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW)) {
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
        c->vector_fmul = vector_fmul_3dnow;
    }
    if (HAVE_AMD3DNOWEXT && (mm_flags & AV_CPU_FLAG_3DNOWEXT)) {
        c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
#if HAVE_6REGS
        c->vector_fmul_window = vector_fmul_window_3dnow2;
#endif
    }
    if(mm_flags & AV_CPU_FLAG_MMX2){
#if HAVE_YASM
        c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
        if (avctx->flags & CODEC_FLAG_BITEXACT) {
            c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
        } else {
            c->apply_window_int16 = ff_apply_window_int16_mmxext;
        }
#endif
    }
    if(mm_flags & AV_CPU_FLAG_SSE){
        c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
        c->ac3_downmix = ac3_downmix_sse;
        c->vector_fmul = vector_fmul_sse;
        c->vector_fmul_reverse = vector_fmul_reverse_sse;
        c->vector_fmul_add = vector_fmul_add_sse;
#if HAVE_6REGS
        c->vector_fmul_window = vector_fmul_window_sse;
#endif
        c->vector_clipf = vector_clipf_sse;
#if HAVE_YASM
        c->scalarproduct_float = ff_scalarproduct_float_sse;
        c->butterflies_float_interleave = ff_butterflies_float_interleave_sse;

        if (!high_bit_depth)
            c->emulated_edge_mc = emulated_edge_mc_sse;
        c->gmc = gmc_sse;
#endif
    }
    if (HAVE_AMD3DNOW && (mm_flags & AV_CPU_FLAG_3DNOW))
        c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
    if(mm_flags & AV_CPU_FLAG_SSE2){
#if HAVE_YASM
        c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
        c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
        if (mm_flags & AV_CPU_FLAG_ATOM) {
            c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
        } else {
            c->vector_clip_int32 = ff_vector_clip_int32_sse2;
        }
        if (avctx->flags & CODEC_FLAG_BITEXACT) {
            c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
        } else {
            if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
                c->apply_window_int16 = ff_apply_window_int16_sse2;
            }
        }
#endif
    }

    if (mm_flags & AV_CPU_FLAG_SSSE3) {
#if HAVE_YASM
        if (mm_flags & AV_CPU_FLAG_ATOM) {
            c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
        } else {
            c->apply_window_int16 = ff_apply_window_int16_ssse3;
        }
        if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
            c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
        }
#endif
    }

    if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
#if HAVE_YASM
        c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif
    }

#if HAVE_AVX && HAVE_YASM
    if (mm_flags & AV_CPU_FLAG_AVX) {
        if (bit_depth == 10) {
            //AVX implies !cache64.
            //TODO: Port cache(32|64) detection from x264.
            H264_QPEL_FUNCS_10(1, 0, sse2)
            H264_QPEL_FUNCS_10(2, 0, sse2)
            H264_QPEL_FUNCS_10(3, 0, sse2)

            if (CONFIG_H264CHROMA) {
                c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
                c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
            }
        }
        c->butterflies_float_interleave = ff_butterflies_float_interleave_avx;
    }
#endif

    if (CONFIG_ENCODERS)
        dsputilenc_init_mmx(c, avctx);
}