2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of Libav.
8 * Libav is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * Libav is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with Libav; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
38 /* pixel operations */
/* ff_bone holds 0x01 in each of its eight bytes; ff_wtwo holds 0x0002 in each
 * of its four 16-bit words. They are the memory operands read by the
 * memory-loading forms of MOVQ_BONE / MOVQ_WTWO. */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
/* Two 64-bit halves, i.e. four 32-bit lanes, each lane equal to 0x80000000. */
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 {0x8000000080000000ULL, 0x8000000080000000ULL};
/* ff_pw_N: packed 16-bit word constants. Every 16-bit lane of the initializer
 * holds N (N is decimal in the identifier, hexadecimal in the value, e.g.
 * ff_pw_20 -> 0x0014). The uint64_t variants carry four lanes; the xmm_reg
 * variants are initialized with two identical 64-bit halves (eight lanes). */
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
52 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
/* ff_pb_XX: packed byte constants. Every byte of the initializer equals the
 * hexadecimal value XX from the identifier (e.g. ff_pb_1F -> 0x1F in each
 * byte). uint64_t variants hold 8 bytes; xmm_reg variants are initialized with
 * two identical 64-bit halves (16 bytes). */
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
75 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
79 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
/* ff_pd_N: pairs of doubles, both elements equal to N. */
82 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
83 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* JUMPALIGN(): emits the assembler directive ".p2align 3" (align the following
 * code to a 2^3 = 8 byte boundary). */
85 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
/* MOVQ_ZERO(regd): clears register regd by XOR-ing it with itself. regd is
 * pasted into the instruction text, so it must be a bare register name. */
86 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
/* MOVQ_BFE(regd): fills regd with 0xFE in every byte without touching memory.
 * pcmpeqd of a register with itself yields all ones (0xFF bytes); paddb of the
 * register with itself then gives 0xFF + 0xFF = 0xFE (mod 256) per byte. */
88 #define MOVQ_BFE(regd) \
90 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
91 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Memory-loading forms: regd is loaded with the 64-bit constant ff_bone
 * (bytes of 0x01) or ff_wtwo (words of 0x0002) through an "m" operand.
 * NOTE(review): the same macro names are defined again further down in a
 * computed form; the preprocessor condition choosing between them is not
 * among the visible lines - confirm. */
94 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
95 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
97 // for shared library it's better to use this way for accessing constants
// Computed form of MOVQ_BONE: all-ones (pcmpeqd) -> each word shifted right by
// 15 gives 0x0001 per word -> packuswb of the register with itself gives 0x01
// in every byte, i.e. the same value as ff_bone, with no memory operand.
99 #define MOVQ_BONE(regd) \
101 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
102 "psrlw $15, %%" #regd " \n\t" \
103 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
// Computed form of MOVQ_WTWO: all-ones -> 0x0001 per word (psrlw 15) ->
// 0x0002 per word (psllw 1), i.e. the same value as ff_wtwo.
105 #define MOVQ_WTWO(regd) \
107 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
108 "psrlw $15, %%" #regd " \n\t" \
109 "psllw $1, %%" #regd " \n\t"::)
// PAVGB_MMX_NO_RND: per-byte average of rega and regb, rounded down.
// Computes regr = (rega & regb) + (((rega ^ regb) & regfe) >> 1); masking with
// 0xfe before the 64-bit shift keeps bits from leaking between adjacent bytes.
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
// The arguments are pasted verbatim into the asm text (operands as written by
// the caller, e.g. %%mm0).
116 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
117 "movq " #rega ", " #regr " \n\t"\
118 "pand " #regb ", " #regr " \n\t"\
119 "pxor " #rega ", " #regb " \n\t"\
120 "pand " #regfe "," #regb " \n\t"\
121 "psrlq $1, " #regb " \n\t"\
122 "paddb " #regb ", " #regr " \n\t"
// PAVGB_MMX: per-byte average of rega and regb, rounded up.
// Computes regr = (rega | regb) - (((rega ^ regb) & regfe) >> 1).
// Same operand conventions as PAVGB_MMX_NO_RND: rega is preserved, regb is
// overwritten, regr receives the result, regfe must hold 0xfe in every byte.
124 #define PAVGB_MMX(rega, regb, regr, regfe) \
125 "movq " #rega ", " #regr " \n\t"\
126 "por " #regb ", " #regr " \n\t"\
127 "pxor " #rega ", " #regb " \n\t"\
128 "pand " #regfe "," #regb " \n\t"\
129 "psrlq $1, " #regb " \n\t"\
130 "psubb " #regb ", " #regr " \n\t"
// PAVGBP_MMX_NO_RND: two round-down byte averages interleaved in one sequence:
//   regr = avg_down(rega, regb)   and   regp = avg_down(regc, regd)
// using the same and/xor/shift formula as PAVGB_MMX_NO_RND.
// rega and regc are preserved; regb and regd are overwritten.
// mm6 is supposed to contain 0xfefefefefefefefe
133 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
134 "movq " #rega ", " #regr " \n\t"\
135 "movq " #regc ", " #regp " \n\t"\
136 "pand " #regb ", " #regr " \n\t"\
137 "pand " #regd ", " #regp " \n\t"\
138 "pxor " #rega ", " #regb " \n\t"\
139 "pxor " #regc ", " #regd " \n\t"\
140 "pand %%mm6, " #regb " \n\t"\
141 "pand %%mm6, " #regd " \n\t"\
142 "psrlq $1, " #regb " \n\t"\
143 "psrlq $1, " #regd " \n\t"\
144 "paddb " #regb ", " #regr " \n\t"\
145 "paddb " #regd ", " #regp " \n\t"
// PAVGBP_MMX: two round-up byte averages interleaved in one sequence:
//   regr = avg_up(rega, regb)   and   regp = avg_up(regc, regd)
// using the or/xor/shift/subtract formula of PAVGB_MMX.
// rega and regc are preserved; regb and regd are overwritten.
// The mask register is hard-coded: mm6 must contain 0xfefefefefefefefe.
147 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
148 "movq " #rega ", " #regr " \n\t"\
149 "movq " #regc ", " #regp " \n\t"\
150 "por " #regb ", " #regr " \n\t"\
151 "por " #regd ", " #regp " \n\t"\
152 "pxor " #rega ", " #regb " \n\t"\
153 "pxor " #regc ", " #regd " \n\t"\
154 "pand %%mm6, " #regb " \n\t"\
155 "pand %%mm6, " #regd " \n\t"\
156 "psrlq $1, " #regd " \n\t"\
157 "psrlq $1, " #regb " \n\t"\
158 "psubb " #regb ", " #regr " \n\t"\
159 "psubb " #regd ", " #regp " \n\t"
161 /***********************************/
162 /* MMX no rounding */
/* Configure and expand dsputil_mmx_rnd_template.c for the no-rounding case:
 * DEF(x, y) pastes the name x_no_rnd_y_mmx, PAVGB/PAVGBP map to the
 * round-down averaging macros, and OP_AVG maps to the round-up PAVGB_MMX.
 * NOTE(review): MOVQ_WONE (used for SET_RND) is not defined among the visible
 * lines of this file - confirm where it comes from. */
163 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
164 #define SET_RND MOVQ_WONE
165 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
166 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
167 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
169 #include "dsputil_mmx_rnd_template.c"
175 /***********************************/
/* Second expansion of dsputil_mmx_rnd_template.c, this time for the rounding
 * case: DEF(x, y) pastes the name x_y_mmx, SET_RND uses MOVQ_WTWO, and
 * PAVGB/PAVGBP map to the round-up averaging macros. */
178 #define DEF(x, y) x ## _ ## y ##_mmx
179 #define SET_RND MOVQ_WTWO
180 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
181 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
183 #include "dsputil_mmx_rnd_template.c"
191 /***********************************/
/* Expand dsputil_mmx_avg_template.c with names suffixed _3dnow; here PAVGB is
 * the instruction mnemonic string "pavgusb" rather than a macro sequence. */
194 #define DEF(x) x ## _3dnow
195 #define PAVGB "pavgusb"
198 #include "dsputil_mmx_avg_template.c"
204 /***********************************/
/* Expand dsputil_mmx_avg_template.c a second time with names suffixed _mmx2
 * and PAVGB set to the instruction mnemonic string "pavgb". */
207 #define DEF(x) x ## _mmx2
209 /* Introduced only in MMX2 set */
210 #define PAVGB "pavgb"
213 #include "dsputil_mmx_avg_template.c"
/* Name aliases: the no_rnd, _mmx2 and _3dnow variants of the plain
 * put_pixelsN functions all resolve to the put_pixelsN_mmx implementations
 * defined in this file. Those functions only copy rows (no averaging), so the
 * rounding mode makes no difference to them. */
219 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
220 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
221 #define put_pixels16_mmx2 put_pixels16_mmx
222 #define put_pixels8_mmx2 put_pixels8_mmx
223 #define put_pixels4_mmx2 put_pixels4_mmx
224 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
225 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
226 #define put_pixels16_3dnow put_pixels16_mmx
227 #define put_pixels8_3dnow put_pixels8_mmx
228 #define put_pixels4_3dnow put_pixels4_mmx
229 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
230 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
232 /***********************************/
/**
 * Store 16-bit coefficients as unsigned 8-bit pixels with saturation.
 *
 * Each of the two asm sequences loads 64 bytes of coefficients into mm0-mm7,
 * narrows register pairs from signed 16-bit words to unsigned bytes with
 * saturation (packuswb), and writes four 8-byte rows at pix, pix + line_size,
 * pix + 2*line_size and pix + 3*line_size. The first sequence addresses the
 * coefficients through a memory operand (*p), the second through a register
 * holding p.
 *
 * NOTE(review): p and pix are presumably derived from block and pixels, and
 * advanced between the two sequences so that eight rows are written in total;
 * those statements are not among the visible lines - confirm.
 *
 * @param block     source coefficients
 * @param pixels    destination pixels
 * @param line_size byte distance between destination rows
 */
235 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
240 /* read the pixels */
245 "movq %3, %%mm0 \n\t"
246 "movq 8%3, %%mm1 \n\t"
247 "movq 16%3, %%mm2 \n\t"
248 "movq 24%3, %%mm3 \n\t"
249 "movq 32%3, %%mm4 \n\t"
250 "movq 40%3, %%mm5 \n\t"
251 "movq 48%3, %%mm6 \n\t"
252 "movq 56%3, %%mm7 \n\t"
253 "packuswb %%mm1, %%mm0 \n\t"
254 "packuswb %%mm3, %%mm2 \n\t"
255 "packuswb %%mm5, %%mm4 \n\t"
256 "packuswb %%mm7, %%mm6 \n\t"
257 "movq %%mm0, (%0) \n\t"
258 "movq %%mm2, (%0, %1) \n\t"
259 "movq %%mm4, (%0, %1, 2) \n\t"
260 "movq %%mm6, (%0, %2) \n\t"
261 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
266 // if here would be an exact copy of the code above
267 // compiler would generate some very strange code
270 "movq (%3), %%mm0 \n\t"
271 "movq 8(%3), %%mm1 \n\t"
272 "movq 16(%3), %%mm2 \n\t"
273 "movq 24(%3), %%mm3 \n\t"
274 "movq 32(%3), %%mm4 \n\t"
275 "movq 40(%3), %%mm5 \n\t"
276 "movq 48(%3), %%mm6 \n\t"
277 "movq 56(%3), %%mm7 \n\t"
278 "packuswb %%mm1, %%mm0 \n\t"
279 "packuswb %%mm3, %%mm2 \n\t"
280 "packuswb %%mm5, %%mm4 \n\t"
281 "packuswb %%mm7, %%mm6 \n\t"
282 "movq %%mm0, (%0) \n\t"
283 "movq %%mm2, (%0, %1) \n\t"
284 "movq %%mm4, (%0, %1, 2) \n\t"
285 "movq %%mm6, (%0, %2) \n\t"
286 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/* put_signed_pixels_clamped_mmx_half(off): asm fragment handling four rows.
 * Reads 64 bytes of 16-bit coefficients starting at byte offset `off` from the
 * pointer in operand %2, narrows each 16-byte row to 8 signed bytes with
 * saturation (packsswb), adds the per-byte bias held in mm0, and stores the
 * rows at (%0), (%0 + %3), (%0 + 2*%3) and (%0 + %1).
 * Expects: %0 = destination, %1 = 3 * row stride, %2 = coefficient pointer,
 * %3 = row stride, mm0 = bias. Uses mm1-mm4 as scratch. */
290 #define put_signed_pixels_clamped_mmx_half(off) \
291 "movq "#off"(%2), %%mm1 \n\t"\
292 "movq 16+"#off"(%2), %%mm2 \n\t"\
293 "movq 32+"#off"(%2), %%mm3 \n\t"\
294 "movq 48+"#off"(%2), %%mm4 \n\t"\
295 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
296 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
297 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
298 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
299 "paddb %%mm0, %%mm1 \n\t"\
300 "paddb %%mm0, %%mm2 \n\t"\
301 "paddb %%mm0, %%mm3 \n\t"\
302 "paddb %%mm0, %%mm4 \n\t"\
303 "movq %%mm1, (%0) \n\t"\
304 "movq %%mm2, (%0, %3) \n\t"\
305 "movq %%mm3, (%0, %3, 2) \n\t"\
306 "movq %%mm4, (%0, %1) \n\t"
/**
 * Store signed 16-bit coefficients as 8-bit pixels biased by 128.
 *
 * mm0 is loaded with ff_pb_80 (0x80 in every byte) and line_skip3 is computed
 * as 3 * line_skip with lea. The half-block fragment is then run twice: once
 * for coefficient byte offset 0, and - after advancing pixels by four rows
 * (lea (%0, %3, 4)) - once for byte offset 64. Each run clamps to the signed
 * byte range and adds 0x80, so eight rows of eight bytes are written.
 *
 * @param block     source coefficients
 * @param pixels    destination pixels (modified inside the asm as a
 *                  read-write operand)
 * @param line_size byte distance between destination rows
 */
308 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
310 x86_reg line_skip = line_size;
314 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
315 "lea (%3, %3, 2), %1 \n\t"
316 put_signed_pixels_clamped_mmx_half(0)
317 "lea (%0, %3, 4), %0 \n\t"
318 put_signed_pixels_clamped_mmx_half(64)
319 :"+&r" (pixels), "=&r" (line_skip3)
320 :"r" (block), "r"(line_skip)
/**
 * Add 16-bit coefficients to existing 8-bit pixels with saturation.
 *
 * The visible asm handles two destination rows per pass: it loads 32 bytes of
 * coefficients from (%2), loads the two pixel rows (*pix and
 * *(pix + line_size)), widens the pixel bytes to words by interleaving with
 * mm7, adds with signed saturation (paddsw), narrows back to unsigned bytes
 * with saturation (packuswb) and writes both rows back in place.
 *
 * NOTE(review): mm7 must be zero for the widening to be a zero-extension, and
 * the pass is presumably repeated to cover all rows; the zeroing, the loop and
 * the pointer updates are not among the visible lines - confirm.
 *
 * @param block     coefficients to add
 * @param pixels    pixels updated in place
 * @param line_size byte distance between pixel rows
 */
324 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
330 /* read the pixels */
337 "movq (%2), %%mm0 \n\t"
338 "movq 8(%2), %%mm1 \n\t"
339 "movq 16(%2), %%mm2 \n\t"
340 "movq 24(%2), %%mm3 \n\t"
341 "movq %0, %%mm4 \n\t"
342 "movq %1, %%mm6 \n\t"
343 "movq %%mm4, %%mm5 \n\t"
344 "punpcklbw %%mm7, %%mm4 \n\t"
345 "punpckhbw %%mm7, %%mm5 \n\t"
346 "paddsw %%mm4, %%mm0 \n\t"
347 "paddsw %%mm5, %%mm1 \n\t"
348 "movq %%mm6, %%mm5 \n\t"
349 "punpcklbw %%mm7, %%mm6 \n\t"
350 "punpckhbw %%mm7, %%mm5 \n\t"
351 "paddsw %%mm6, %%mm2 \n\t"
352 "paddsw %%mm5, %%mm3 \n\t"
353 "packuswb %%mm1, %%mm0 \n\t"
354 "packuswb %%mm3, %%mm2 \n\t"
355 "movq %%mm0, %0 \n\t"
356 "movq %%mm2, %1 \n\t"
357 :"+m"(*pix), "+m"(*(pix+line_size))
/**
 * Copy rows of 4 bytes from pixels to block.
 *
 * REG_a is set to 2 * line_size. The visible sequence copies the rows at
 * offset 0 and line_size with 4-byte movd loads/stores, advances both pointers
 * by 2 * line_size, and does the same again (four rows per sequence).
 *
 * NOTE(review): h is bound as a read-write operand; presumably it is the row
 * count driving a loop whose control instructions are not among the visible
 * lines - confirm, including whether h must be a multiple of 4.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte distance between rows (used for both pointers)
 * @param h         see note above
 */
365 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
368 "lea (%3, %3), %%"REG_a" \n\t"
371 "movd (%1), %%mm0 \n\t"
372 "movd (%1, %3), %%mm1 \n\t"
373 "movd %%mm0, (%2) \n\t"
374 "movd %%mm1, (%2, %3) \n\t"
375 "add %%"REG_a", %1 \n\t"
376 "add %%"REG_a", %2 \n\t"
377 "movd (%1), %%mm0 \n\t"
378 "movd (%1, %3), %%mm1 \n\t"
379 "movd %%mm0, (%2) \n\t"
380 "movd %%mm1, (%2, %3) \n\t"
381 "add %%"REG_a", %1 \n\t"
382 "add %%"REG_a", %2 \n\t"
385 : "+g"(h), "+r" (pixels), "+r" (block)
386 : "r"((x86_reg)line_size)
/**
 * Copy rows of 8 bytes from pixels to block.
 *
 * Same structure as put_pixels4_mmx but with 8-byte movq transfers: REG_a is
 * 2 * line_size, and the visible sequence copies two rows, advances both
 * pointers by 2 * line_size, then copies two more rows and advances again.
 *
 * NOTE(review): h is bound as a read-write operand; presumably it is the row
 * count driving a loop whose control instructions are not among the visible
 * lines - confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte distance between rows (used for both pointers)
 * @param h         see note above
 */
391 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
394 "lea (%3, %3), %%"REG_a" \n\t"
397 "movq (%1), %%mm0 \n\t"
398 "movq (%1, %3), %%mm1 \n\t"
399 "movq %%mm0, (%2) \n\t"
400 "movq %%mm1, (%2, %3) \n\t"
401 "add %%"REG_a", %1 \n\t"
402 "add %%"REG_a", %2 \n\t"
403 "movq (%1), %%mm0 \n\t"
404 "movq (%1, %3), %%mm1 \n\t"
405 "movq %%mm0, (%2) \n\t"
406 "movq %%mm1, (%2, %3) \n\t"
407 "add %%"REG_a", %1 \n\t"
408 "add %%"REG_a", %2 \n\t"
411 : "+g"(h), "+r" (pixels), "+r" (block)
412 : "r"((x86_reg)line_size)
/**
 * Copy rows of 16 bytes from pixels to block using MMX registers.
 *
 * Each row is moved as two 8-byte halves (offsets 0 and 8). The visible
 * sequence copies the rows at offset 0 and line_size, advances both pointers
 * by 2 * line_size (REG_a), then copies two more rows and advances again.
 *
 * NOTE(review): h is bound as a read-write operand; presumably it is the row
 * count driving a loop whose control instructions are not among the visible
 * lines - confirm.
 *
 * @param block     destination
 * @param pixels    source
 * @param line_size byte distance between rows (used for both pointers)
 * @param h         see note above
 */
417 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
420 "lea (%3, %3), %%"REG_a" \n\t"
423 "movq (%1), %%mm0 \n\t"
424 "movq 8(%1), %%mm4 \n\t"
425 "movq (%1, %3), %%mm1 \n\t"
426 "movq 8(%1, %3), %%mm5 \n\t"
427 "movq %%mm0, (%2) \n\t"
428 "movq %%mm4, 8(%2) \n\t"
429 "movq %%mm1, (%2, %3) \n\t"
430 "movq %%mm5, 8(%2, %3) \n\t"
431 "add %%"REG_a", %1 \n\t"
432 "add %%"REG_a", %2 \n\t"
433 "movq (%1), %%mm0 \n\t"
434 "movq 8(%1), %%mm4 \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq 8(%1, %3), %%mm5 \n\t"
437 "movq %%mm0, (%2) \n\t"
438 "movq %%mm4, 8(%2) \n\t"
439 "movq %%mm1, (%2, %3) \n\t"
440 "movq %%mm5, 8(%2, %3) \n\t"
441 "add %%"REG_a", %1 \n\t"
442 "add %%"REG_a", %2 \n\t"
445 : "+g"(h), "+r" (pixels), "+r" (block)
446 : "r"((x86_reg)line_size)
/**
 * Copy rows of 16 bytes from pixels to block using SSE2 registers.
 *
 * The visible sequence moves four rows (offsets 0, line_size, 2*line_size and
 * 3*line_size, the last via operand %4) and then advances both pointers by
 * 4 * line_size. Loads use movdqu, so pixels may be unaligned; stores use
 * movdqa, so every destination row must be 16-byte aligned.
 *
 * NOTE(review): h is bound as a read-write operand; presumably it is the row
 * count driving a loop whose control instructions are not among the visible
 * lines - confirm.
 *
 * @param block     destination (16-byte aligned rows)
 * @param pixels    source
 * @param line_size byte distance between rows (used for both pointers)
 * @param h         see note above
 */
451 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
455 "movdqu (%1), %%xmm0 \n\t"
456 "movdqu (%1,%3), %%xmm1 \n\t"
457 "movdqu (%1,%3,2), %%xmm2 \n\t"
458 "movdqu (%1,%4), %%xmm3 \n\t"
459 "movdqa %%xmm0, (%2) \n\t"
460 "movdqa %%xmm1, (%2,%3) \n\t"
461 "movdqa %%xmm2, (%2,%3,2) \n\t"
462 "movdqa %%xmm3, (%2,%4) \n\t"
464 "lea (%1,%3,4), %1 \n\t"
465 "lea (%2,%3,4), %2 \n\t"
467 : "+g"(h), "+r" (pixels), "+r" (block)
468 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/**
 * Average rows of 16 bytes from pixels into block using SSE2 registers.
 *
 * For four rows per visible sequence: load the source row (movdqu, may be
 * unaligned), average it byte-wise with the current destination row using
 * pavgb (which rounds up), and store the result back to the destination
 * (movdqa). Both the pavgb memory operand and the store require 16-byte
 * aligned destination rows. Both pointers then advance by 4 * line_size.
 *
 * NOTE(review): h is bound as a read-write operand; presumably it is the row
 * count driving a loop whose control instructions are not among the visible
 * lines - confirm.
 *
 * @param block     destination, read and updated in place (16-byte aligned rows)
 * @param pixels    source
 * @param line_size byte distance between rows (used for both pointers)
 * @param h         see note above
 */
473 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
477 "movdqu (%1), %%xmm0 \n\t"
478 "movdqu (%1,%3), %%xmm1 \n\t"
479 "movdqu (%1,%3,2), %%xmm2 \n\t"
480 "movdqu (%1,%4), %%xmm3 \n\t"
481 "pavgb (%2), %%xmm0 \n\t"
482 "pavgb (%2,%3), %%xmm1 \n\t"
483 "pavgb (%2,%3,2), %%xmm2 \n\t"
484 "pavgb (%2,%4), %%xmm3 \n\t"
485 "movdqa %%xmm0, (%2) \n\t"
486 "movdqa %%xmm1, (%2,%3) \n\t"
487 "movdqa %%xmm2, (%2,%3,2) \n\t"
488 "movdqa %%xmm3, (%2,%4) \n\t"
490 "lea (%1,%3,4), %1 \n\t"
491 "lea (%2,%3,4), %2 \n\t"
493 : "+g"(h), "+r" (pixels), "+r" (block)
494 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* CLEAR_BLOCKS(name, n): defines `static void name(DCTELEM *blocks)` which
 * zero-fills memory with MMX stores. mm7 is cleared, REG_a is loaded from
 * operand %1 and used as an index relative to operand %0, which points 128*n
 * bytes past the start of blocks; each pass writes 32 zero bytes and adds 32
 * to REG_a.
 * NOTE(review): operand %1 and the loop branch are not among the visible
 * lines; presumably the index starts at -128*n and runs up to 0 so that
 * exactly 128*n bytes are cleared - confirm.
 * Instantiated after the macro as clear_blocks_mmx (n = 6) and
 * clear_block_mmx (n = 1). */
499 #define CLEAR_BLOCKS(name,n) \
500 static void name(DCTELEM *blocks)\
503 "pxor %%mm7, %%mm7 \n\t"\
504 "mov %1, %%"REG_a" \n\t"\
506 "movq %%mm7, (%0, %%"REG_a") \n\t"\
507 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
508 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
509 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
510 "add $32, %%"REG_a" \n\t"\
512 : : "r" (((uint8_t *)blocks)+128*n),\
517 CLEAR_BLOCKS(clear_blocks_mmx, 6)
518 CLEAR_BLOCKS(clear_block_mmx, 1)
/**
 * Zero 128 bytes starting at block using SSE stores.
 *
 * xmm0 is cleared with xorps and written with eight 16-byte movaps stores at
 * offsets 0..112. movaps requires block to be 16-byte aligned.
 *
 * @param block memory to clear (16-byte aligned)
 */
520 static void clear_block_sse(DCTELEM *block)
523 "xorps %%xmm0, %%xmm0 \n"
524 "movaps %%xmm0, (%0) \n"
525 "movaps %%xmm0, 16(%0) \n"
526 "movaps %%xmm0, 32(%0) \n"
527 "movaps %%xmm0, 48(%0) \n"
528 "movaps %%xmm0, 64(%0) \n"
529 "movaps %%xmm0, 80(%0) \n"
530 "movaps %%xmm0, 96(%0) \n"
531 "movaps %%xmm0, 112(%0) \n"
/**
 * Zero a run of blocks using SSE stores.
 *
 * Operand %0 points 128*6 bytes past the start of blocks; REG_a is loaded from
 * operand %1 and used as an index relative to %0. Each pass writes 128 zero
 * bytes with eight movaps stores (16-byte alignment required) and adds 128 to
 * REG_a.
 *
 * NOTE(review): operand %1 and the loop branch are not among the visible
 * lines; presumably the index starts at -128*6 and runs up to 0, clearing
 * 768 bytes - confirm.
 *
 * @param blocks memory to clear (16-byte aligned)
 */
537 static void clear_blocks_sse(DCTELEM *blocks)
540 "xorps %%xmm0, %%xmm0 \n"
541 "mov %1, %%"REG_a" \n"
543 "movaps %%xmm0, (%0, %%"REG_a") \n"
544 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
545 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
546 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
550 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
551 "add $128, %%"REG_a" \n"
553 : : "r" (((uint8_t *)blocks)+128*6),
/**
 * dst[i] += src[i] for w bytes, with wrap-around byte arithmetic.
 *
 * The asm adds 16 bytes per pass (two 8-byte paddb operations) indexed by
 * operand %0, with src in %1 and dst in %2; results are written back to dst.
 * The scalar statement at the end performs the same addition for a single
 * byte.
 *
 * NOTE(review): the w-15 operand is presumably the limit for the 16-byte
 * passes and the scalar statement presumably sits in a loop covering the
 * remaining bytes; the index operand and both loop headers are not among the
 * visible lines - confirm.
 */
559 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
564 "movq (%1, %0), %%mm0 \n\t"
565 "movq (%2, %0), %%mm1 \n\t"
566 "paddb %%mm0, %%mm1 \n\t"
567 "movq %%mm1, (%2, %0) \n\t"
568 "movq 8(%1, %0), %%mm0 \n\t"
569 "movq 8(%2, %0), %%mm1 \n\t"
570 "paddb %%mm0, %%mm1 \n\t"
571 "movq %%mm1, 8(%2, %0) \n\t"
577 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
580 dst[i+0] += src[i+0];
/**
 * dst[i] = src1[i] + src2[i] for w bytes, with wrap-around byte arithmetic.
 *
 * The asm handles 16 bytes per pass: two 8-byte loads from src1 (%2), paddb
 * with the matching bytes of src2 (%3), and two 8-byte stores to dst (%1),
 * all indexed by operand %0. The scalar statement at the end computes the
 * same sum for a single byte.
 *
 * NOTE(review): the w-15 operand is presumably the limit for the 16-byte
 * passes and the scalar statement presumably sits in a loop covering the
 * remaining bytes; the index operand and both loop headers are not among the
 * visible lines - confirm.
 */
583 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
588 "movq (%2, %0), %%mm0 \n\t"
589 "movq 8(%2, %0), %%mm1 \n\t"
590 "paddb (%3, %0), %%mm0 \n\t"
591 "paddb 8(%3, %0), %%mm1 \n\t"
592 "movq %%mm0, (%1, %0) \n\t"
593 "movq %%mm1, 8(%1, %0) \n\t"
599 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
602 dst[i] = src1[i] + src2[i];
605 #if HAVE_7REGS && HAVE_TEN_OPERANDS
/**
 * Median-prediction reconstruction, compiled only when HAVE_7REGS and
 * HAVE_TEN_OPERANDS are both set.
 *
 * Visible behavior: l and tl start as the low 8 bits of *left and *left_top.
 * The asm addresses dst, diff and top through pointers pre-offset by w and
 * indexed by w2; it loads a byte with zero extension, adds the diff byte
 * (%6,%4) to the low byte of l, and stores that byte to dst (%5,%4).
 *
 * NOTE(review): the instructions that select the predictor, the loop control,
 * the initial value of w2 and any write-back to *left / *left_top are not
 * among the visible lines - confirm against the complete function.
 */
606 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
609 int l = *left & 0xff;
610 int tl = *left_top & 0xff;
615 "movzbl (%3,%4), %2 \n"
628 "add (%6,%4), %b0 \n"
629 "mov %b0, (%5,%4) \n"
632 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
633 :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/* H263_LOOP_FILTER: asm fragment filtering four 8-byte lines held in memory
 * operands %0, %1, %2, %3 (in that spatial order).
 *   %4 = 2 * strength (loaded with movd and broadcast to every byte)
 *   %5 = byte mask (callers pass ff_pb_FC), applied before a 2-bit word shift
 * Outline of the visible arithmetic:
 *   - the 16-bit difference (%0 - %3) + 4 * (%2 - %1) is formed, its sign mask
 *     and absolute value are taken, and the magnitude is divided by 8;
 *   - that magnitude is limited against 2 * strength with saturating
 *     subtractions, and the limited correction is applied with opposite signs
 *     to the inner lines %1 and %2 (saturating byte add/sub);
 *   - a second, smaller correction derived from (%0 - %3) is subtracted from
 *     %0 and added to %3.
 * Nothing is stored by the fragment itself: the new values are left in
 *   mm5 (for %0), mm3 (for %1), mm4 (for %2), mm6 (for %3)
 * and the caller writes them out. All of mm0-mm7 are overwritten. */
640 #define H263_LOOP_FILTER \
641 "pxor %%mm7, %%mm7 \n\t"\
642 "movq %0, %%mm0 \n\t"\
643 "movq %0, %%mm1 \n\t"\
644 "movq %3, %%mm2 \n\t"\
645 "movq %3, %%mm3 \n\t"\
646 "punpcklbw %%mm7, %%mm0 \n\t"\
647 "punpckhbw %%mm7, %%mm1 \n\t"\
648 "punpcklbw %%mm7, %%mm2 \n\t"\
649 "punpckhbw %%mm7, %%mm3 \n\t"\
650 "psubw %%mm2, %%mm0 \n\t"\
651 "psubw %%mm3, %%mm1 \n\t"\
652 "movq %1, %%mm2 \n\t"\
653 "movq %1, %%mm3 \n\t"\
654 "movq %2, %%mm4 \n\t"\
655 "movq %2, %%mm5 \n\t"\
656 "punpcklbw %%mm7, %%mm2 \n\t"\
657 "punpckhbw %%mm7, %%mm3 \n\t"\
658 "punpcklbw %%mm7, %%mm4 \n\t"\
659 "punpckhbw %%mm7, %%mm5 \n\t"\
660 "psubw %%mm2, %%mm4 \n\t"\
661 "psubw %%mm3, %%mm5 \n\t"\
662 "psllw $2, %%mm4 \n\t"\
663 "psllw $2, %%mm5 \n\t"\
664 "paddw %%mm0, %%mm4 \n\t"\
665 "paddw %%mm1, %%mm5 \n\t"\
666 "pxor %%mm6, %%mm6 \n\t"\
667 "pcmpgtw %%mm4, %%mm6 \n\t"\
668 "pcmpgtw %%mm5, %%mm7 \n\t"\
669 "pxor %%mm6, %%mm4 \n\t"\
670 "pxor %%mm7, %%mm5 \n\t"\
671 "psubw %%mm6, %%mm4 \n\t"\
672 "psubw %%mm7, %%mm5 \n\t"\
673 "psrlw $3, %%mm4 \n\t"\
674 "psrlw $3, %%mm5 \n\t"\
675 "packuswb %%mm5, %%mm4 \n\t"\
676 "packsswb %%mm7, %%mm6 \n\t"\
677 "pxor %%mm7, %%mm7 \n\t"\
678 "movd %4, %%mm2 \n\t"\
679 "punpcklbw %%mm2, %%mm2 \n\t"\
680 "punpcklbw %%mm2, %%mm2 \n\t"\
681 "punpcklbw %%mm2, %%mm2 \n\t"\
682 "psubusb %%mm4, %%mm2 \n\t"\
683 "movq %%mm2, %%mm3 \n\t"\
684 "psubusb %%mm4, %%mm3 \n\t"\
685 "psubb %%mm3, %%mm2 \n\t"\
686 "movq %1, %%mm3 \n\t"\
687 "movq %2, %%mm4 \n\t"\
688 "pxor %%mm6, %%mm3 \n\t"\
689 "pxor %%mm6, %%mm4 \n\t"\
690 "paddusb %%mm2, %%mm3 \n\t"\
691 "psubusb %%mm2, %%mm4 \n\t"\
692 "pxor %%mm6, %%mm3 \n\t"\
693 "pxor %%mm6, %%mm4 \n\t"\
694 "paddusb %%mm2, %%mm2 \n\t"\
695 "packsswb %%mm1, %%mm0 \n\t"\
696 "pcmpgtb %%mm0, %%mm7 \n\t"\
697 "pxor %%mm7, %%mm0 \n\t"\
698 "psubb %%mm7, %%mm0 \n\t"\
699 "movq %%mm0, %%mm1 \n\t"\
700 "psubusb %%mm2, %%mm0 \n\t"\
701 "psubb %%mm0, %%mm1 \n\t"\
702 "pand %5, %%mm1 \n\t"\
703 "psrlw $2, %%mm1 \n\t"\
704 "pxor %%mm7, %%mm1 \n\t"\
705 "psubb %%mm7, %%mm1 \n\t"\
706 "movq %0, %%mm5 \n\t"\
707 "movq %3, %%mm6 \n\t"\
708 "psubb %%mm1, %%mm5 \n\t"\
709 "paddb %%mm1, %%mm6 \n\t"
/**
 * H.263 loop filter across a horizontal edge, 8 pixels wide.
 *
 * Active only when CONFIG_H263_DECODER or CONFIG_H263_ENCODER is set. The
 * four rows src - 2*stride, src - stride, src and src + stride are bound as
 * 8-byte read-write memory operands %0..%3; the filter results are written
 * back from mm5 -> %0, mm3 -> %1, mm4 -> %2 and mm6 -> %3.
 *
 * @param src    pointer to the first row below the edge
 * @param stride byte distance between rows
 * @param qscale index into ff_h263_loop_filter_strength; twice the looked-up
 *               strength is passed to the asm
 */
711 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
712 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
713 const int strength= ff_h263_loop_filter_strength[qscale];
719 "movq %%mm3, %1 \n\t"
720 "movq %%mm4, %2 \n\t"
721 "movq %%mm5, %0 \n\t"
722 "movq %%mm6, %3 \n\t"
723 : "+m" (*(uint64_t*)(src - 2*stride)),
724 "+m" (*(uint64_t*)(src - 1*stride)),
725 "+m" (*(uint64_t*)(src + 0*stride)),
726 "+m" (*(uint64_t*)(src + 1*stride))
727 : "g" (2*strength), "m"(ff_pb_FC)
/**
 * H.263 loop filter across a vertical edge, 8 rows tall.
 *
 * Active only when CONFIG_H263_DECODER or CONFIG_H263_ENCODER is set. Two
 * transpose4x4 calls (rows 0-3 and rows 4-7 of src) fill the 32-byte temp
 * buffer with destination stride 8, so the same H263_LOOP_FILTER fragment
 * used for the vertical case can run on it. The filtered registers
 * (mm5, mm3, mm4, mm6) are then interleaved back (byte and word unpacks) and
 * written out as 4-byte movd stores: four rows starting at operand %0 and
 * four rows starting at src + 4*stride, using stride and 3*stride as offsets.
 *
 * NOTE(review): any adjustment of src before the transposes and the memory
 * operands bound to temp are not among the visible lines - confirm.
 *
 * @param src    pointer at the edge position in the first row
 * @param stride byte distance between rows
 * @param qscale index into ff_h263_loop_filter_strength; twice the looked-up
 *               strength is passed to the asm
 */
732 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
733 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
734 const int strength= ff_h263_loop_filter_strength[qscale];
735 DECLARE_ALIGNED(8, uint64_t, temp)[4];
736 uint8_t *btemp= (uint8_t*)temp;
740 transpose4x4(btemp , src , 8, stride);
741 transpose4x4(btemp+4, src + 4*stride, 8, stride);
743 H263_LOOP_FILTER // 5 3 4 6
749 : "g" (2*strength), "m"(ff_pb_FC)
753 "movq %%mm5, %%mm1 \n\t"
754 "movq %%mm4, %%mm0 \n\t"
755 "punpcklbw %%mm3, %%mm5 \n\t"
756 "punpcklbw %%mm6, %%mm4 \n\t"
757 "punpckhbw %%mm3, %%mm1 \n\t"
758 "punpckhbw %%mm6, %%mm0 \n\t"
759 "movq %%mm5, %%mm3 \n\t"
760 "movq %%mm1, %%mm6 \n\t"
761 "punpcklwd %%mm4, %%mm5 \n\t"
762 "punpcklwd %%mm0, %%mm1 \n\t"
763 "punpckhwd %%mm4, %%mm3 \n\t"
764 "punpckhwd %%mm0, %%mm6 \n\t"
765 "movd %%mm5, (%0) \n\t"
766 "punpckhdq %%mm5, %%mm5 \n\t"
767 "movd %%mm5, (%0,%2) \n\t"
768 "movd %%mm3, (%0,%2,2) \n\t"
769 "punpckhdq %%mm3, %%mm3 \n\t"
770 "movd %%mm3, (%0,%3) \n\t"
771 "movd %%mm1, (%1) \n\t"
772 "punpckhdq %%mm1, %%mm1 \n\t"
773 "movd %%mm1, (%1,%2) \n\t"
774 "movd %%mm6, (%1,%2,2) \n\t"
775 "punpckhdq %%mm6, %%mm6 \n\t"
776 "movd %%mm6, (%1,%3) \n\t"
778 "r" (src + 4*stride),
779 "r" ((x86_reg) stride ),
780 "r" ((x86_reg)(3*stride))
785 /* draw the edges of width 'w' of an image of size width, height
786 this mmx version can only handle w==8 || w==16 */
/* Visible structure:
 *  - Left/right borders, two asm variants. Both broadcast the first byte of a
 *    row to 8 bytes and store it immediately left of the row, and broadcast
 *    the last byte of the row (top byte of the 8 bytes ending at ptr + width)
 *    and store it immediately right of the row. The first variant writes 8
 *    bytes per side, the second 16 bytes per side. ptr + wrap*height is passed
 *    as the end address.
 *  - Top border (sides & EDGE_TOP): for i = 0, 4, ... < w, the row starting at
 *    buf - w is copied 8 bytes at a time into the four rows ptr, ptr - wrap,
 *    ptr - 2*wrap, ptr - 3*wrap, where ptr = buf - (i + 1)*wrap - w.
 *  - Bottom border (sides & EDGE_BOTTOM): the same using last_line, positive
 *    wrap offsets and ptr = last_line + (i + 1)*wrap - w.
 *    The end address for both is ptr + width + 2*w, so the copied span covers
 *    the left and right borders as well.
 * NOTE(review): the selection between the 8- and 16-byte variants and the
 * per-row / per-column loop control are not among the visible lines -
 * confirm. */
787 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
789 uint8_t *ptr, *last_line;
792 last_line = buf + (height - 1) * wrap;
799 "movd (%0), %%mm0 \n\t"
800 "punpcklbw %%mm0, %%mm0 \n\t"
801 "punpcklwd %%mm0, %%mm0 \n\t"
802 "punpckldq %%mm0, %%mm0 \n\t"
803 "movq %%mm0, -8(%0) \n\t"
804 "movq -8(%0, %2), %%mm1 \n\t"
805 "punpckhbw %%mm1, %%mm1 \n\t"
806 "punpckhwd %%mm1, %%mm1 \n\t"
807 "punpckhdq %%mm1, %%mm1 \n\t"
808 "movq %%mm1, (%0, %2) \n\t"
813 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
820 "movd (%0), %%mm0 \n\t"
821 "punpcklbw %%mm0, %%mm0 \n\t"
822 "punpcklwd %%mm0, %%mm0 \n\t"
823 "punpckldq %%mm0, %%mm0 \n\t"
824 "movq %%mm0, -8(%0) \n\t"
825 "movq %%mm0, -16(%0) \n\t"
826 "movq -8(%0, %2), %%mm1 \n\t"
827 "punpckhbw %%mm1, %%mm1 \n\t"
828 "punpckhwd %%mm1, %%mm1 \n\t"
829 "punpckhdq %%mm1, %%mm1 \n\t"
830 "movq %%mm1, (%0, %2) \n\t"
831 "movq %%mm1, 8(%0, %2) \n\t"
836 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
840 /* top and bottom (and hopefully also the corners) */
841 if (sides&EDGE_TOP) {
842 for(i = 0; i < w; i += 4) {
843 ptr= buf - (i + 1) * wrap - w;
846 "movq (%1, %0), %%mm0 \n\t"
847 "movq %%mm0, (%0) \n\t"
848 "movq %%mm0, (%0, %2) \n\t"
849 "movq %%mm0, (%0, %2, 2) \n\t"
850 "movq %%mm0, (%0, %3) \n\t"
855 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
860 if (sides&EDGE_BOTTOM) {
861 for(i = 0; i < w; i += 4) {
862 ptr= last_line + (i + 1) * wrap - w;
865 "movq (%1, %0), %%mm0 \n\t"
866 "movq %%mm0, (%0) \n\t"
867 "movq %%mm0, (%0, %2) \n\t"
868 "movq %%mm0, (%0, %2, 2) \n\t"
869 "movq %%mm0, (%0, %3) \n\t"
874 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/* PAETH(cpu, abs3): defines add_png_paeth_prediction_<cpu>(dst, src, top, w,
 * bpp). abs3 is an asm fragment supplied by the instantiation.
 * Visible behavior of the generated function: with mm7 = 0, 4 bytes at a time
 * are loaded (movd) from dst, top and src at a common index and widened to
 * 16-bit words; differences between the neighbouring values are formed,
 * comparison masks (pcmpgtw / pand / pandn) select one of the candidate
 * words, the src words are added to the selection, and the result is masked,
 * packed back to bytes and stored to dst with movd.
 * Two abs3 fragments follow the function body: one builds absolute values of
 * mm3, mm4 and mm5 as max(x, 0 - x) using psubw/pmaxsw and leaves mm7 zeroed
 * again; the other uses pabsw on the same three registers. The macro is
 * instantiated for mmx2 and ssse3.
 * NOTE(review): the index/loop operands, the point where abs3 is expanded,
 * the source of the mask in mm5 and the #define lines naming the two abs3
 * fragments are not among the visible lines - confirm. */
880 #define PAETH(cpu, abs3)\
881 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
886 "pxor %%mm7, %%mm7 \n"\
887 "movd (%1,%0), %%mm0 \n"\
888 "movd (%2,%0), %%mm1 \n"\
889 "punpcklbw %%mm7, %%mm0 \n"\
890 "punpcklbw %%mm7, %%mm1 \n"\
893 "movq %%mm1, %%mm2 \n"\
894 "movd (%2,%0), %%mm1 \n"\
895 "movq %%mm2, %%mm3 \n"\
896 "punpcklbw %%mm7, %%mm1 \n"\
897 "movq %%mm2, %%mm4 \n"\
898 "psubw %%mm1, %%mm3 \n"\
899 "psubw %%mm0, %%mm4 \n"\
900 "movq %%mm3, %%mm5 \n"\
901 "paddw %%mm4, %%mm5 \n"\
903 "movq %%mm4, %%mm6 \n"\
904 "pminsw %%mm5, %%mm6 \n"\
905 "pcmpgtw %%mm6, %%mm3 \n"\
906 "pcmpgtw %%mm5, %%mm4 \n"\
907 "movq %%mm4, %%mm6 \n"\
908 "pand %%mm3, %%mm4 \n"\
909 "pandn %%mm3, %%mm6 \n"\
910 "pandn %%mm0, %%mm3 \n"\
911 "movd (%3,%0), %%mm0 \n"\
912 "pand %%mm1, %%mm6 \n"\
913 "pand %%mm4, %%mm2 \n"\
914 "punpcklbw %%mm7, %%mm0 \n"\
916 "paddw %%mm6, %%mm0 \n"\
917 "paddw %%mm2, %%mm3 \n"\
918 "paddw %%mm3, %%mm0 \n"\
919 "pand %%mm5, %%mm0 \n"\
920 "movq %%mm0, %%mm3 \n"\
921 "packuswb %%mm3, %%mm3 \n"\
922 "movd %%mm3, (%1,%0) \n"\
927 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
934 "psubw %%mm5, %%mm7 \n"\
935 "pmaxsw %%mm7, %%mm5 \n"\
936 "pxor %%mm6, %%mm6 \n"\
937 "pxor %%mm7, %%mm7 \n"\
938 "psubw %%mm3, %%mm6 \n"\
939 "psubw %%mm4, %%mm7 \n"\
940 "pmaxsw %%mm6, %%mm3 \n"\
941 "pmaxsw %%mm7, %%mm4 \n"\
942 "pxor %%mm7, %%mm7 \n"
945 "pabsw %%mm3, %%mm3 \n"\
946 "pabsw %%mm4, %%mm4 \n"\
947 "pabsw %%mm5, %%mm5 \n"
949 PAETH(mmx2, ABS3_MMX2)
951 PAETH(ssse3, ABS3_SSSE3)
/* QPEL_V_LOW: asm fragment producing one output of the 8-tap quarter-pel
 * lowpass filter,
 *     (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5
 * packed to unsigned bytes with saturation and handed to OP together with
 * `out`, %%mm7 and the size suffix d.
 * The four tap sums are formed as
 *     x1 = m3 + m4,   x2 = in2 + m5,   x3 = in1 + m6,   x4 = in0 + in7.
 * Side effects: m3 is overwritten (reloaded with in7); mm4, mm5 and mm6 are
 * used as scratch, so they must not be passed where a preserved value is
 * needed after its last read.
 * The pw_20 and pw_3 parameters are not referenced by the body; the constants
 * are taken directly from ff_pw_20 and ff_pw_3 via MANGLE(). */
954 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
955 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
956 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
957 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
958 "movq "#in7", " #m3 " \n\t" /* d */\
959 "movq "#in0", %%mm5 \n\t" /* D */\
960 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
961 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
962 "movq "#in1", %%mm5 \n\t" /* C */\
963 "movq "#in2", %%mm6 \n\t" /* B */\
964 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
965 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
966 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
967 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
968 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
969 "paddw " #rnd ", %%mm4 \n\t" /* 20x1 - x4 + rnd */\
970 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
971 "psraw $5, %%mm5 \n\t"\
972 "packuswb %%mm5, %%mm5 \n\t"\
973 OP(%%mm5, out, %%mm7, d)
975 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
976 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
980 "pxor %%mm7, %%mm7 \n\t"\
982 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
983 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
984 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
985 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
986 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
987 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
988 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
989 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
990 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
991 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
992 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
993 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
994 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
995 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
996 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
997 "paddw %%mm3, %%mm5 \n\t" /* b */\
998 "paddw %%mm2, %%mm6 \n\t" /* c */\
999 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1000 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1001 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1002 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1003 "paddw %%mm4, %%mm0 \n\t" /* a */\
1004 "paddw %%mm1, %%mm5 \n\t" /* d */\
1005 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1006 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1007 "paddw %6, %%mm6 \n\t"\
1008 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1009 "psraw $5, %%mm0 \n\t"\
1010 "movq %%mm0, %5 \n\t"\
1011 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1013 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1014 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1015 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1016 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1017 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1018 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1019 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1020 "paddw %%mm0, %%mm2 \n\t" /* b */\
1021 "paddw %%mm5, %%mm3 \n\t" /* c */\
1022 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1023 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1024 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1025 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1026 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1027 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1028 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1029 "paddw %%mm2, %%mm1 \n\t" /* a */\
1030 "paddw %%mm6, %%mm4 \n\t" /* d */\
1031 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1032 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1033 "paddw %6, %%mm1 \n\t"\
1034 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1035 "psraw $5, %%mm3 \n\t"\
1036 "movq %5, %%mm1 \n\t"\
1037 "packuswb %%mm3, %%mm1 \n\t"\
1038 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1039 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1041 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1042 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1043 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1044 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1045 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1046 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1047 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1048 "paddw %%mm1, %%mm5 \n\t" /* b */\
1049 "paddw %%mm4, %%mm0 \n\t" /* c */\
1050 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1051 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1052 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1053 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1054 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1055 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1056 "paddw %%mm3, %%mm2 \n\t" /* d */\
1057 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1058 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1059 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1060 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1061 "paddw %%mm2, %%mm6 \n\t" /* a */\
1062 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1063 "paddw %6, %%mm0 \n\t"\
1064 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1065 "psraw $5, %%mm0 \n\t"\
1066 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1068 "paddw %%mm5, %%mm3 \n\t" /* a */\
1069 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1070 "paddw %%mm4, %%mm6 \n\t" /* b */\
1071 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1072 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1073 "paddw %%mm1, %%mm4 \n\t" /* c */\
1074 "paddw %%mm2, %%mm5 \n\t" /* d */\
1075 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
1076 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1077 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1078 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1079 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1080 "paddw %6, %%mm4 \n\t"\
1081 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1082 "psraw $5, %%mm4 \n\t"\
1083 "packuswb %%mm4, %%mm0 \n\t"\
1084 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1090 : "+a"(src), "+c"(dst), "+D"(h)\
1091 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1096 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1099 /* quick HACK, XXX FIXME MUST be optimized */\
1102 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1103 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1104 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1105 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1106 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1107 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1108 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1109 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1110 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1111 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1112 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1113 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1114 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1115 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1116 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1117 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1119 "movq (%0), %%mm0 \n\t"\
1120 "movq 8(%0), %%mm1 \n\t"\
1121 "paddw %2, %%mm0 \n\t"\
1122 "paddw %2, %%mm1 \n\t"\
1123 "psraw $5, %%mm0 \n\t"\
1124 "psraw $5, %%mm1 \n\t"\
1125 "packuswb %%mm1, %%mm0 \n\t"\
1126 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1127 "movq 16(%0), %%mm0 \n\t"\
1128 "movq 24(%0), %%mm1 \n\t"\
1129 "paddw %2, %%mm0 \n\t"\
1130 "paddw %2, %%mm1 \n\t"\
1131 "psraw $5, %%mm0 \n\t"\
1132 "psraw $5, %%mm1 \n\t"\
1133 "packuswb %%mm1, %%mm0 \n\t"\
1134 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1135 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1143 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1145 "pxor %%mm7, %%mm7 \n\t"\
1147 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1148 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1149 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1150 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1151 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1152 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1153 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1154 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1155 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1156 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1157 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1158 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1159 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1160 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1161 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1162 "paddw %%mm3, %%mm5 \n\t" /* b */\
1163 "paddw %%mm2, %%mm6 \n\t" /* c */\
1164 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1165 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1166 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1167 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1168 "paddw %%mm4, %%mm0 \n\t" /* a */\
1169 "paddw %%mm1, %%mm5 \n\t" /* d */\
1170 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1171 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1172 "paddw %5, %%mm6 \n\t"\
1173 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1174 "psraw $5, %%mm0 \n\t"\
1175 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1177 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1178 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1179 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1180 "paddw %%mm5, %%mm1 \n\t" /* a */\
1181 "paddw %%mm6, %%mm2 \n\t" /* b */\
1182 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1183 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1184 "paddw %%mm6, %%mm3 \n\t" /* c */\
1185 "paddw %%mm5, %%mm4 \n\t" /* d */\
1186 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1187 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1188 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1189 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1190 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1191 "paddw %5, %%mm1 \n\t"\
1192 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1193 "psraw $5, %%mm3 \n\t"\
1194 "packuswb %%mm3, %%mm0 \n\t"\
1195 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1201 : "+a"(src), "+c"(dst), "+d"(h)\
1202 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1207 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1210 /* quick HACK, XXX FIXME MUST be optimized */\
1213 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1214 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1215 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1216 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1217 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1218 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1219 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1220 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1222 "movq (%0), %%mm0 \n\t"\
1223 "movq 8(%0), %%mm1 \n\t"\
1224 "paddw %2, %%mm0 \n\t"\
1225 "paddw %2, %%mm1 \n\t"\
1226 "psraw $5, %%mm0 \n\t"\
1227 "psraw $5, %%mm1 \n\t"\
1228 "packuswb %%mm1, %%mm0 \n\t"\
1229 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1230 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
/*
 * QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)
 *
 * Expands to the MPEG-4 quarter-pel vertical lowpass filters and to the
 * qpel8 / qpel16 mcXY motion-compensation functions for one configuration.
 *   OPNAME  - prefix pasted onto every generated function name
 *   ROUNDER - constant handed to the filter asm as a memory operand
 *   RND     - infix pasted into the names of the "put" helpers that build
 *             the intermediate planes
 *   OP      - store macro forwarded to QPEL_V_LOW
 *   MMX     - suffix of the generated functions and of the helpers they call
 * In the mcXY names, X and Y are presumably the horizontal and vertical
 * quarter-sample offsets (TODO confirm against the callers).
 */
1238 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
/* 16-wide vertical lowpass: unpack the source to words in temp, then filter temp with QPEL_V_LOW. */\
1240 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1241 uint64_t temp[17*4];\
1242 uint64_t *temp_ptr= temp;\
/* Pass 1: widen 16 source bytes to 16-bit words (unpack against the zeroed mm7) and store them as four column groups, 17*8 bytes apart. */\
1247 "pxor %%mm7, %%mm7 \n\t"\
1249 "movq (%0), %%mm0 \n\t"\
1250 "movq (%0), %%mm1 \n\t"\
1251 "movq 8(%0), %%mm2 \n\t"\
1252 "movq 8(%0), %%mm3 \n\t"\
1253 "punpcklbw %%mm7, %%mm0 \n\t"\
1254 "punpckhbw %%mm7, %%mm1 \n\t"\
1255 "punpcklbw %%mm7, %%mm2 \n\t"\
1256 "punpckhbw %%mm7, %%mm3 \n\t"\
1257 "movq %%mm0, (%1) \n\t"\
1258 "movq %%mm1, 17*8(%1) \n\t"\
1259 "movq %%mm2, 2*17*8(%1) \n\t"\
1260 "movq %%mm3, 3*17*8(%1) \n\t"\
1265 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1266 : "r" ((x86_reg)srcStride)\
/* Pass 2: %0 = temp_ptr, %1 = dst, %3 = dstStride, %5 = ROUNDER. mm7 is relied on to still be zero from pass 1 (the pxor below is commented out). */\
1273 /*FIXME reorder for speed */\
1275 /*"pxor %%mm7, %%mm7 \n\t"*/\
1277 "movq (%0), %%mm0 \n\t"\
1278 "movq 8(%0), %%mm1 \n\t"\
1279 "movq 16(%0), %%mm2 \n\t"\
1280 "movq 24(%0), %%mm3 \n\t"\
/* The first and last invocations repeat offsets (16, 8, 0 at the top; 128, 120, 112 at the bottom) - presumably edge mirroring of the filter taps; confirm against QPEL_V_LOW. */\
1281 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1282 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1284 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1286 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1288 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1289 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1291 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1292 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1294 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1295 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1297 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1298 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1300 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1302 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1304 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1305 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
/* 136 = 17*8: step temp_ptr to the next column group written by pass 1. */\
1307 "add $136, %0 \n\t"\
1312 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1313 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
/* 8-wide vertical lowpass: same two-pass scheme with two column groups, 9*8 bytes apart. */\
1318 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1319 uint64_t temp[9*2];\
1320 uint64_t *temp_ptr= temp;\
/* Pass 1: widen 8 source bytes to words; low half to (%1), high half to 9*8(%1). */\
1325 "pxor %%mm7, %%mm7 \n\t"\
1327 "movq (%0), %%mm0 \n\t"\
1328 "movq (%0), %%mm1 \n\t"\
1329 "punpcklbw %%mm7, %%mm0 \n\t"\
1330 "punpckhbw %%mm7, %%mm1 \n\t"\
1331 "movq %%mm0, (%1) \n\t"\
1332 "movq %%mm1, 9*8(%1) \n\t"\
1337 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1338 : "r" ((x86_reg)srcStride)\
/* Pass 2: eight QPEL_V_LOW invocations; the offsets repeat at both ends (16, 8, 0 and 64, 56, 48) as in the 16-wide version. */\
1345 /*FIXME reorder for speed */\
1347 /*"pxor %%mm7, %%mm7 \n\t"*/\
1349 "movq (%0), %%mm0 \n\t"\
1350 "movq 8(%0), %%mm1 \n\t"\
1351 "movq 16(%0), %%mm2 \n\t"\
1352 "movq 24(%0), %%mm3 \n\t"\
1353 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1354 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1356 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1358 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1360 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1362 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1364 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1365 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1372 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1373 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
/* ---- 8x8 entry points ---- */\
/* mc00: forwards to the plain 8-wide pixel op, 8 rows. */\
1378 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1379 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
/* mc10: horizontal lowpass into a scratch plane (stride 8), then pixels8_l2 of src and that plane. */\
1382 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1384 uint8_t * const half= (uint8_t*)temp;\
1385 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1386 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
/* mc20: horizontal lowpass written straight to dst. */\
1389 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1390 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
/* mc30: as mc10, but pixels8_l2 reads src+1. */\
1393 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1395 uint8_t * const half= (uint8_t*)temp;\
1396 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1397 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
/* mc01: vertical lowpass into a scratch plane, then pixels8_l2 of src and that plane. */\
1400 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1402 uint8_t * const half= (uint8_t*)temp;\
1403 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1404 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
/* mc02: vertical lowpass written straight to dst. */\
1407 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1408 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
/* mc03: as mc01, but pixels8_l2 reads src+stride. */\
1411 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1413 uint8_t * const half= (uint8_t*)temp;\
1414 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1415 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
/* mc11: halfH = 9-row horizontal lowpass, merged in place with src by pixels8_l2; halfHV = vertical lowpass of halfH; dst = pixels8_l2(halfH, halfHV). halfHV uses the first 64 bytes of half, halfH the rest. */\
1417 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1418 uint64_t half[8 + 9];\
1419 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1420 uint8_t * const halfHV= ((uint8_t*)half);\
1421 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1422 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1423 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1424 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc31: as mc11, but halfH is merged with src+1. */\
1426 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1427 uint64_t half[8 + 9];\
1428 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1429 uint8_t * const halfHV= ((uint8_t*)half);\
1430 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1431 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1432 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1433 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc13: as mc11, but the final pixels8_l2 reads halfH+8, i.e. one 8-byte row further down. */\
1435 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1436 uint64_t half[8 + 9];\
1437 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1438 uint8_t * const halfHV= ((uint8_t*)half);\
1439 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1440 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1441 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1442 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc33: as mc31, but the final pixels8_l2 reads halfH+8. */\
1444 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1445 uint64_t half[8 + 9];\
1446 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1447 uint8_t * const halfHV= ((uint8_t*)half);\
1448 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1449 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1450 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1451 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc21: halfH = horizontal lowpass (no merge with src); halfHV = vertical lowpass of halfH; dst = pixels8_l2(halfH, halfHV). */\
1453 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1454 uint64_t half[8 + 9];\
1455 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1456 uint8_t * const halfHV= ((uint8_t*)half);\
1457 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1458 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1459 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
/* mc23: as mc21, but the final pixels8_l2 reads halfH+8. */\
1461 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1462 uint64_t half[8 + 9];\
1463 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1464 uint8_t * const halfHV= ((uint8_t*)half);\
1465 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1466 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1467 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* mc12: halfH = horizontal lowpass merged with src; its vertical lowpass goes straight to dst. */\
1469 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1470 uint64_t half[8 + 9];\
1471 uint8_t * const halfH= ((uint8_t*)half);\
1472 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1473 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1474 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* mc32: as mc12, but halfH is merged with src+1. */\
1476 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1477 uint64_t half[8 + 9];\
1478 uint8_t * const halfH= ((uint8_t*)half);\
1479 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1480 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1481 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* mc22: horizontal lowpass into halfH, then vertical lowpass of halfH into dst. */\
1483 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1485 uint8_t * const halfH= ((uint8_t*)half);\
1486 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1487 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* ---- 16x16 entry points: same structure as the 8x8 ones, with 16-wide helpers, 17-row halfH and a scratch stride of 16 ---- */\
/* mc00: forwards to the plain 16-wide pixel op, 16 rows. */\
1489 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1490 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
/* mc10: horizontal lowpass into a scratch plane, then pixels16_l2 of src and that plane. */\
1493 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1495 uint8_t * const half= (uint8_t*)temp;\
1496 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1497 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
/* mc20: horizontal lowpass written straight to dst. */\
1500 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1501 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
/* mc30: as mc10, but pixels16_l2 reads src+1. */\
1504 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1506 uint8_t * const half= (uint8_t*)temp;\
1507 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1508 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
/* mc01: vertical lowpass into a scratch plane, then pixels16_l2 of src and that plane. */\
1511 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1513 uint8_t * const half= (uint8_t*)temp;\
1514 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1515 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
/* mc02: vertical lowpass written straight to dst. */\
1518 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1519 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
/* mc03: as mc01, but pixels16_l2 reads src+stride. */\
1522 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1524 uint8_t * const half= (uint8_t*)temp;\
1525 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1526 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
/* mc11: halfH = 17-row horizontal lowpass merged with src; halfHV = vertical lowpass of halfH; dst = pixels16_l2(halfH, halfHV). halfHV uses the first 256 bytes of half, halfH the rest. */\
1528 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1529 uint64_t half[16*2 + 17*2];\
1530 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1531 uint8_t * const halfHV= ((uint8_t*)half);\
1532 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1533 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1534 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1535 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc31: as mc11, but halfH is merged with src+1. */\
1537 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1538 uint64_t half[16*2 + 17*2];\
1539 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1540 uint8_t * const halfHV= ((uint8_t*)half);\
1541 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1542 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1543 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1544 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc13: as mc11, but the final pixels16_l2 reads halfH+16, i.e. one 16-byte row further down. */\
1546 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1547 uint64_t half[16*2 + 17*2];\
1548 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1549 uint8_t * const halfHV= ((uint8_t*)half);\
1550 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1551 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1552 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1553 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc33: as mc31, but the final pixels16_l2 reads halfH+16. */\
1555 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1556 uint64_t half[16*2 + 17*2];\
1557 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1558 uint8_t * const halfHV= ((uint8_t*)half);\
1559 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1560 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1561 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1562 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc21: halfH = horizontal lowpass (no merge with src); halfHV = vertical lowpass of halfH; dst = pixels16_l2(halfH, halfHV). */\
1564 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1565 uint64_t half[16*2 + 17*2];\
1566 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1567 uint8_t * const halfHV= ((uint8_t*)half);\
1568 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1569 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1570 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
/* mc23: as mc21, but the final pixels16_l2 reads halfH+16. */\
1572 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1573 uint64_t half[16*2 + 17*2];\
1574 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1575 uint8_t * const halfHV= ((uint8_t*)half);\
1576 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1577 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1578 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* mc12: halfH = horizontal lowpass merged with src; its vertical lowpass goes straight to dst. */\
1580 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1581 uint64_t half[17*2];\
1582 uint8_t * const halfH= ((uint8_t*)half);\
1583 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1584 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1585 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* mc32: as mc12, but halfH is merged with src+1. */\
1587 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1588 uint64_t half[17*2];\
1589 uint8_t * const halfH= ((uint8_t*)half);\
1590 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1591 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1592 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* mc22: horizontal lowpass into halfH, then vertical lowpass of halfH into dst. */\
1594 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1595 uint64_t half[17*2];\
1596 uint8_t * const halfH= ((uint8_t*)half);\
1597 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1598 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/*
 * Store op for the "put" variants: emits one "mov<size> reg, mem"
 * instruction as an asm string fragment.  The scratch argument is accepted
 * only so that all store ops share one parameter list; it is not used.
 */
#define PUT_OP(reg, mem, scratch, size) \
    "mov" #size " " #reg ", " #mem " \n\t"
/*
 * Store op for the 3DNow! "avg" variants, emitted as asm string fragments:
 *   1. load the current destination into scratch,
 *   2. pavgusb scratch into reg,
 *   3. write reg back to the destination.
 */
#define AVG_3DNOW_OP(reg, mem, scratch, size) \
    "mov" #size " " #mem ", " #scratch " \n\t" \
    "pavgusb " #scratch ", " #reg " \n\t" \
    "mov" #size " " #reg ", " #mem " \n\t"
/*
 * Store op for the MMX2 "avg" variants, emitted as asm string fragments:
 *   1. load the current destination into scratch,
 *   2. pavgb scratch into reg,
 *   3. write reg back to the destination.
 */
#define AVG_MMX2_OP(reg, mem, scratch, size) \
    "mov" #size " " #mem ", " #scratch " \n\t" \
    "pavgb " #scratch ", " #reg " \n\t" \
    "mov" #size " " #reg ", " #mem " \n\t"
/*
 * Instantiate the qpel function families.
 * QPEL_BASE is expanded once per operation (put, avg, put_no_rnd) and takes
 * both an MMX2 and a 3DNow! store op; QPEL_OP is expanded once per
 * operation and CPU suffix (3dnow, mmx2).  The put and avg variants pass
 * ff_pw_16 as ROUNDER, the no_rnd variants ff_pw_15.
 */
1611 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1612 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1613 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1614 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1615 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1616 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1617 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1618 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1619 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1621 /***********************************/
1622 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/*
 * Defines OPNAME 2tap_qpel SIZE _mc XY _ MMX as a direct call to the pixel
 * routine OPNAME pixels SIZE HPEL, processing SIZE rows.
 */
1624 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1625 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1626 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
/*
 * Defines OPNAME 2tap_qpel SIZE _mc XY _ MMX as a call to the matching
 * _l3_ helper: the source pointer is advanced by S0 and S1, S2 are passed
 * through as the helper's last two arguments (their meaning is defined by
 * the _l3_ helper, which is not part of this macro).
 */
1628 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1629 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1630 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
/*
 * Generates the full set of 2-tap qpel mcXY entry points for one
 * OPNAME / SIZE / MMX combination:
 *   mc20, mc02, mc22     - direct calls to the _x2_, _y2_ and _xy2_ pixel ops
 *   mc00, mc21, mc12     - function-pointer aliases of existing functions
 *   mc32, mc23           - _y2_ / _x2_ pixel ops on a shifted source
 *   remaining positions  - QPEL_2TAP_L3 with per-position offsets
 */
1633 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
1634 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1635 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
/* mc22 always uses the _xy2_mmx routine, whatever the MMX argument is. */\
1636 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
/* mc00 reuses the regular qpel mc00; mc21 and mc12 reuse the 2-tap mc20 and mc02. */\
1637 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1638 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1639 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1640 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1641 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1642 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
/* mc32: the _y2_ op applied one pixel to the right (src+1). */\
1643 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1644 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
/* mc23: the _x2_ op applied one row down (src+stride). */\
1646 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1647 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
/* Arguments after the position are S0 (added to src), S1 and S2 of QPEL_2TAP_L3. */\
1649 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1650 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1651 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1652 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1653 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1654 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1655 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1656 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the 2-tap qpel functions: put and avg, sizes 16 and 8, for the mmx2 and 3dnow suffixes. */
1658 QPEL_2TAP(put_, 16, mmx2)
1659 QPEL_2TAP(avg_, 16, mmx2)
1660 QPEL_2TAP(put_, 8, mmx2)
1661 QPEL_2TAP(avg_, 8, mmx2)
1662 QPEL_2TAP(put_, 16, 3dnow)
1663 QPEL_2TAP(avg_, 16, 3dnow)
1664 QPEL_2TAP(put_, 8, 3dnow)
1665 QPEL_2TAP(avg_, 8, 3dnow)
/* No-op with a void(void) signature: takes nothing and does nothing. */
static void just_return(void)
{
}
/*
 * Signature of the edge-emulation core routines.  emulated_edge_mc() calls
 * them with the destination buffer, the adjusted source pointer, the line
 * size and the start/end bounds of the block region it computed.
 */
1673 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1674 x86_reg linesize, x86_reg start_y,
1675 x86_reg end_y, x86_reg block_h,
1676 x86_reg start_x, x86_reg end_x,
/* MMX and SSE implementations of the core routine; declared here, defined outside this file section. */
1678 extern emu_edge_core_func ff_emu_edge_core_mmx;
1679 extern emu_edge_core_func ff_emu_edge_core_sse;
/*
 * Shared body of the emulated_edge_mc_* wrappers.
 *
 * From the block position (src_x, src_y), the block size (block_w, block_h)
 * and the picture size (w, h) it derives start_x/end_x and start_y/end_y,
 * the part of the block that overlaps the picture, moves src to the first
 * overlapping sample and hands everything to core_fn together with buf.
 */
1681 static av_always_inline
1682 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1683 int block_w, int block_h,
1684 int src_x, int src_y, int w, int h,
1685 emu_edge_core_func *core_fn)
/* src_y_add is a row offset that is applied to src further down. */
1687 int start_y, start_x, end_y, end_x, src_y_add=0;
1690 src_y_add = h-1-src_y;
/* Block lies entirely above the picture (src_y + block_h <= 0). */
1692 }else if(src_y<=-block_h){
1693 src_y_add = 1-block_h-src_y;
/* Block lies entirely left of the picture (src_x + block_w <= 0). */
1699 }else if(src_x<=-block_w){
1700 src+= (1-block_w-src_x);
/* Intersection of the block with the picture, in block-relative coordinates. */
1704 start_y= FFMAX(0, -src_y);
1705 start_x= FFMAX(0, -src_x);
1706 end_y= FFMIN(block_h, h-src_y);
1707 end_x= FFMIN(block_w, w-src_x);
/* The intersection must be non-empty in both directions. */
1708 assert(start_x < end_x && block_w > 0);
1709 assert(start_y < end_y && block_h > 0);
1711 // fill in the to-be-copied part plus all above/below
1712 src += (src_y_add+start_y)*linesize + start_x;
1714 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
/* Edge emulation using the MMX core: forwards all arguments to emulated_edge_mc() with ff_emu_edge_core_mmx. */
1719 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1720 int block_w, int block_h,
1721 int src_x, int src_y, int w, int h)
1723 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1724 w, h, &ff_emu_edge_core_mmx);
/* Edge emulation using the SSE core: forwards all arguments to emulated_edge_mc() with ff_emu_edge_core_sse. */
1728 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1729 int block_w, int block_h,
1730 int src_x, int src_y, int w, int h)
1732 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1733 w, h, &ff_emu_edge_core_sse);
1735 #endif /* HAVE_YASM */
/*
 * Signature shared by the edge-emulation entry points that gmc() accepts as
 * its emu_edge_fn argument.
 *   dst              - destination buffer
 *   src, linesize    - source pointer and its line size
 *   block_w, block_h - size of the block to produce
 *   src_x, src_y     - position of the block
 *   w, h             - picture dimensions
 */
typedef void emulated_edge_mc_func(uint8_t *dst,
                                   const uint8_t *src,
                                   int linesize,
                                   int block_w, int block_h,
                                   int src_x, int src_y,
                                   int w, int h);
/*
 * Global motion compensation, MMX path.
 *
 * Writes an h-row block to dst by bilinear interpolation of src.  The
 * sampling position starts at (ox, oy) and changes by (dxx, dyx) per column
 * and (dxy, dyy) per row; shift gives the number of fractional bits and r
 * is added before the final right shift.
 *
 * Falls back to ff_gmc_c() when the full-pel part of the position is not
 * constant over the block, or when any delta has one of its low 4 bits set.
 * When the block touches the picture border, emu_edge_fn builds an
 * edge-extended copy in edge_buf first.
 */
1741 static av_always_inline
1742 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1743 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1744 emulated_edge_mc_func *emu_edge_fn)
/* Full-pel part of the start position. */
1747 const int ix = ox>>(16+shift);
1748 const int iy = oy>>(16+shift);
/* Position and deltas with the low 4 bits dropped, so the MMX code can work on 16-bit words. */
1749 const int oxs = ox>>4;
1750 const int oys = oy>>4;
1751 const int dxxs = dxx>>4;
1752 const int dxys = dxy>>4;
1753 const int dyxs = dyx>>4;
1754 const int dyys = dyy>>4;
/* Four-word copies used as memory operands by the asm below. */
1755 const uint16_t r4[4] = {r,r,r,r};
1756 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1757 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1758 const uint64_t shift2 = 2*shift;
/* NOTE(review): variable-length stack buffer sized from h and stride - confirm both are bounded (and stride is positive) at the call sites. */
1759 uint8_t edge_buf[(h+1)*stride];
/* Drift of the sampling position across the block: dxw/dyw over the width, dxh/dyh over the height. */
1762 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1763 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1764 const int dxh = dxy*(h-1);
1765 const int dyw = dyx*(w-1);
1766 if( // non-constant fullpel offset (3% of blocks)
1767 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1768 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1769 // uses more than 16 bits of subpel mv (only at huge resolution)
1770 || (dxx|dxy|dyx|dyy)&15 )
1772 //FIXME could still use mmx for some of the rows
1773 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1777 src += ix + iy*stride;
/* NOTE(review): width-w and height-h are converted to unsigned for these comparisons; if width < w or height < h they wrap to large values and the test cannot fire - confirm callers guarantee width >= w and height >= h. */
1778 if( (unsigned)ix >= width-w ||
1779 (unsigned)iy >= height-h )
1781 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* Broadcast operand %0 into all four words of mm6 (the "s" of the comments below) and clear mm7. */
1786 "movd %0, %%mm6 \n\t"
1787 "pxor %%mm7, %%mm7 \n\t"
1788 "punpcklwd %%mm6, %%mm6 \n\t"
1789 "punpcklwd %%mm6, %%mm6 \n\t"
/* Four output pixels per iteration; dx4/dy4 hold the per-column positions and start one row step (dxys/dyys) early because the asm adds that step before use. */
1793 for(x=0; x<w; x+=4){
1794 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1795 oxs - dxys + dxxs*(x+1),
1796 oxs - dxys + dxxs*(x+2),
1797 oxs - dxys + dxxs*(x+3) };
1798 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1799 oys - dyys + dyxs*(x+1),
1800 oys - dyys + dyxs*(x+2),
1801 oys - dyys + dyxs*(x+3) };
/* Advance dx4/dy4 by one row step, store them back, and keep the top 4 bits of each word in mm4 (dx) and mm5 (dy) as interpolation weights. */
1805 "movq %0, %%mm4 \n\t"
1806 "movq %1, %%mm5 \n\t"
1807 "paddw %2, %%mm4 \n\t"
1808 "paddw %3, %%mm5 \n\t"
1809 "movq %%mm4, %0 \n\t"
1810 "movq %%mm5, %1 \n\t"
1811 "psrlw $12, %%mm4 \n\t"
1812 "psrlw $12, %%mm5 \n\t"
1813 : "+m"(*dx4), "+m"(*dy4)
1814 : "m"(*dxy4), "m"(*dyy4)
/* Bilinear blend: compute the four weight products, multiply them with the four neighbouring source samples, add r4, shift right by shift2, pack to bytes and store four pixels. */
1818 "movq %%mm6, %%mm2 \n\t"
1819 "movq %%mm6, %%mm1 \n\t"
1820 "psubw %%mm4, %%mm2 \n\t"
1821 "psubw %%mm5, %%mm1 \n\t"
1822 "movq %%mm2, %%mm0 \n\t"
1823 "movq %%mm4, %%mm3 \n\t"
1824 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1825 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1826 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1827 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1829 "movd %4, %%mm5 \n\t"
1830 "movd %3, %%mm4 \n\t"
1831 "punpcklbw %%mm7, %%mm5 \n\t"
1832 "punpcklbw %%mm7, %%mm4 \n\t"
1833 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1834 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1836 "movd %2, %%mm5 \n\t"
1837 "movd %1, %%mm4 \n\t"
1838 "punpcklbw %%mm7, %%mm5 \n\t"
1839 "punpcklbw %%mm7, %%mm4 \n\t"
1840 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1841 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1842 "paddw %5, %%mm1 \n\t"
1843 "paddw %%mm3, %%mm2 \n\t"
1844 "paddw %%mm1, %%mm0 \n\t"
1845 "paddw %%mm2, %%mm0 \n\t"
1847 "psrlw %6, %%mm0 \n\t"
1848 "packuswb %%mm0, %%mm0 \n\t"
1849 "movd %%mm0, %0 \n\t"
1851 : "=m"(dst[x+y*stride])
1852 : "m"(src[0]), "m"(src[1]),
1853 "m"(src[stride]), "m"(src[stride+1]),
1854 "m"(*r4), "m"(shift2)
/* gmc() with emulated_edge_mc_mmx as the edge-emulation routine. */
1864 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1865 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1867 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1868 width, height, &emulated_edge_mc_mmx);
/* gmc() with emulated_edge_mc_sse as the edge-emulation routine. */
1871 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1872 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1874 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1875 width, height, &emulated_edge_mc_sse);
/*
 * gmc() with the generic ff_emulated_edge_mc as the edge-emulation routine.
 * Same name as the gmc_mmx defined earlier, so presumably only one of the
 * two is compiled - TODO confirm the surrounding preprocessor conditionals.
 */
1878 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1879 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1881 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1882 width, height, &ff_emulated_edge_mc);
/*
 * Defines a function name(mem, stride, h) that issues the instruction `op`
 * with the byte at p as its memory operand, p starting at mem.
 */
1886 #define PREFETCH(name, op) \
1887 static void name(void *mem, int stride, int h){\
1888 const uint8_t *p= mem;\
1890 __asm__ volatile(#op" %0" :: "m"(*p));\
/* prefetch_mmx2 uses the prefetcht0 instruction, prefetch_3dnow uses prefetch. */
1894 PREFETCH(prefetch_mmx2, prefetcht0)
1895 PREFETCH(prefetch_3dnow, prefetch)
1898 #include "h264_qpel_mmx.c"
/*
 * Prototypes of the H.264 / RV40 chroma motion-compensation routines.
 * Only the declarations live here.  All of them share one signature:
 *   (dst, src, stride, h, x, y)
 * The groups below follow the mcN part of the function names.
 */

/* mc8: MMX put, MMX2 avg, 3DNow! avg */
void ff_put_h264_chroma_mc8_mmx_rnd(uint8_t *dst, uint8_t *src,
                                    int stride, int h, int x, int y);
void ff_put_rv40_chroma_mc8_mmx(uint8_t *dst, uint8_t *src,
                                int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_mmx2_rnd(uint8_t *dst, uint8_t *src,
                                     int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmx2(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_3dnow_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

/* mc4: MMX put, MMX2 avg, 3DNow! avg */
void ff_put_h264_chroma_mc4_mmx(uint8_t *dst, uint8_t *src,
                                int stride, int h, int x, int y);
void ff_put_rv40_chroma_mc4_mmx(uint8_t *dst, uint8_t *src,
                                int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmx2(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmx2(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

/* mc2: MMX2 only */
void ff_put_h264_chroma_mc2_mmx2(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmx2(uint8_t *dst, uint8_t *src,
                                 int stride, int h, int x, int y);

/* SSSE3 variants: put */
void ff_put_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);

/* SSSE3 variants: avg */
void ff_avg_h264_chroma_mc8_ssse3_rnd(uint8_t *dst, uint8_t *src,
                                      int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3(uint8_t *dst, uint8_t *src,
                                  int stride, int h, int x, int y);
/* CAVS qpel 8x8, position mc00: forwards to put_pixels8_mmx() with a
 * fixed height of 8 rows. */
1943 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1944 put_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel 8x8, position mc00, averaging variant: forwards to
 * avg_pixels8_mmx() with a fixed height of 8 rows.
 * NOTE(review): the wrapper is named *_mmx2 but calls the plain _mmx helper. */
1946 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1947 avg_pixels8_mmx(dst, src, stride, 8);
/* CAVS qpel 16x16, position mc00: forwards to put_pixels16_mmx() with a
 * fixed height of 16 rows. */
1949 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1950 put_pixels16_mmx(dst, src, stride, 16);
/* CAVS qpel 16x16, position mc00, averaging variant: forwards to
 * avg_pixels16_mmx() with a fixed height of 16 rows.
 * NOTE(review): the wrapper is named *_mmx2 but calls the plain _mmx helper. */
1952 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1953 avg_pixels16_mmx(dst, src, stride, 16);
/* VC-1 mspel, position mc00: forwards to put_pixels8_mmx() with a fixed
 * height of 8 rows. The rnd argument is not used by the visible body. */
1957 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1958 put_pixels8_mmx(dst, src, stride, 8);
/* VC-1 mspel, position mc00, averaging variant: forwards to
 * avg_pixels8_mmx2() with a fixed height of 8 rows. The rnd argument is
 * not used by the visible body. */
1960 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1961 avg_pixels8_mmx2(dst, src, stride, 8);
1964 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */
/* IDCT + store: calls ff_mmx_idct() on block, then hands the same block to
 * ff_put_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
1967 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1969 ff_mmx_idct (block);
1970 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* IDCT + add: calls ff_mmx_idct() on block, then hands the same block to
 * ff_add_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
1972 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1974 ff_mmx_idct (block);
1975 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* IDCT + store: calls ff_mmxext_idct() on block, then hands the same block
 * to ff_put_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
1977 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1979 ff_mmxext_idct (block);
1980 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* IDCT + add: calls ff_mmxext_idct() on block, then hands the same block
 * to ff_add_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
1982 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1984 ff_mmxext_idct (block);
1985 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* IDCT + store: calls ff_idct_xvid_mmx() on block, then hands the same
 * block to ff_put_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
1988 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1990 ff_idct_xvid_mmx (block);
1991 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* IDCT + add: calls ff_idct_xvid_mmx() on block, then hands the same
 * block to ff_add_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
1993 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1995 ff_idct_xvid_mmx (block);
1996 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* IDCT + store: calls ff_idct_xvid_mmx2() on block, then hands the same
 * block to ff_put_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
1998 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
2000 ff_idct_xvid_mmx2 (block);
2001 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* IDCT + add: calls ff_idct_xvid_mmx2() on block, then hands the same
 * block to ff_add_pixels_clamped_mmx() together with dest and line_size.
 * NOTE(review): presumably the IDCT transforms block in place -- confirm. */
2003 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2005 ff_idct_xvid_mmx2 (block);
2006 ff_add_pixels_clamped_mmx(block, dest, line_size);
/**
 * 3DNow! version of the Vorbis inverse channel coupling.
 *
 * mm7 is zeroed once and serves as the 0.0 comparand. Every loop iteration
 * handles two floats (i advances by 2 and each movq moves 64 bits):
 * mag[i..i+1] and ang[i..i+1] are loaded, recombined through the compare
 * masks as described by the per-instruction comments, and written back in
 * place (both arrays are "+m" read/write operands).
 * femms is executed at the end to leave the MMX/3DNow! state.
 *
 * NOTE(review): pfcmpge dst,src yields the mask of (dst >= src), i.e. the
 * masks are "0.0 <= m" and "0.0 <= a"; the inline "<=" comments seem to be
 * written with the operands swapped -- confirm against the AMD manual.
 * NOTE(review): blocksize is presumably a multiple of 2 -- verify callers.
 */
2009 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2012 __asm__ volatile("pxor %%mm7, %%mm7":);
2013 for(i=0; i<blocksize; i+=2) {
2015 "movq %0, %%mm0 \n\t"
2016 "movq %1, %%mm1 \n\t"
2017 "movq %%mm0, %%mm2 \n\t"
2018 "movq %%mm1, %%mm3 \n\t"
2019 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2020 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2021 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2022 "pxor %%mm2, %%mm1 \n\t"
2023 "movq %%mm3, %%mm4 \n\t"
2024 "pand %%mm1, %%mm3 \n\t"
2025 "pandn %%mm1, %%mm4 \n\t"
2026 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2027 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2028 "movq %%mm3, %1 \n\t"
2029 "movq %%mm0, %0 \n\t"
2030 :"+m"(mag[i]), "+m"(ang[i])
2034 __asm__ volatile("femms");
/**
 * SSE version of the Vorbis inverse channel coupling.
 *
 * xmm5 is loaded once with ff_pdw_80000000 (the per-lane sign-bit mask
 * defined at the top of the file). Every loop iteration handles four
 * floats (i advances by 4): mag[i..i+3] and ang[i..i+3] are loaded with
 * movaps, recombined through the compare masks as described by the
 * per-instruction comments, and written back in place.
 *
 * movaps requires 16-byte aligned addresses, so mag and ang must be
 * 16-byte aligned.
 * NOTE(review): cmpleps src,dst yields the mask of (dst <= src) with dst
 * zeroed beforehand, i.e. "0.0 <= m" and "0.0 <= a"; the inline "<="
 * comments seem to be written with the operands swapped -- confirm.
 * NOTE(review): blocksize is presumably a multiple of 4 -- verify callers.
 */
2036 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2041 "movaps %0, %%xmm5 \n\t"
2042 ::"m"(ff_pdw_80000000[0])
2044 for(i=0; i<blocksize; i+=4) {
2046 "movaps %0, %%xmm0 \n\t"
2047 "movaps %1, %%xmm1 \n\t"
2048 "xorps %%xmm2, %%xmm2 \n\t"
2049 "xorps %%xmm3, %%xmm3 \n\t"
2050 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2051 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2052 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2053 "xorps %%xmm2, %%xmm1 \n\t"
2054 "movaps %%xmm3, %%xmm4 \n\t"
2055 "andps %%xmm1, %%xmm3 \n\t"
2056 "andnps %%xmm1, %%xmm4 \n\t"
2057 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2058 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2059 "movaps %%xmm3, %1 \n\t"
2060 "movaps %%xmm0, %0 \n\t"
2061 :"+m"(mag[i]), "+m"(ang[i])
/*
 * MIX5(mono, stereo): inline-asm kernel for the 5-input-channel downmix
 * special cases. `mono` and `stereo` are macros that wrap the asm lines
 * belonging to only one of the two output layouts.
 *
 * Visible data flow:
 *  - the floats at byte offsets 0, 8 and 24 of operand %2 are broadcast
 *    (movss + shufps $0) into xmm5, xmm6 and xmm7;
 *  - five vectors of 4 floats are loaded from (%0,%1) at a spacing of
 *    0x400 bytes (256 floats, the row size of samples[][256]);
 *  - vectors 0 and 2 are scaled by xmm5, vector 1 by xmm6, vectors 3 and 4
 *    by xmm7, then summed;
 *  - the result is stored to (%0,%1) and, for stereo, the second output
 *    to 0x400(%0,%1).
 * NOTE(review): the output-operand list and the loop control are not
 * visible in this excerpt; presumably %0 is the running byte index i,
 * %1 is samples[0]+len and %2 is matrix -- confirm.
 */
2070 #define MIX5(mono,stereo)\
2072 "movss 0(%2), %%xmm5 \n"\
2073 "movss 8(%2), %%xmm6 \n"\
2074 "movss 24(%2), %%xmm7 \n"\
2075 "shufps $0, %%xmm5, %%xmm5 \n"\
2076 "shufps $0, %%xmm6, %%xmm6 \n"\
2077 "shufps $0, %%xmm7, %%xmm7 \n"\
2079 "movaps (%0,%1), %%xmm0 \n"\
2080 "movaps 0x400(%0,%1), %%xmm1 \n"\
2081 "movaps 0x800(%0,%1), %%xmm2 \n"\
2082 "movaps 0xc00(%0,%1), %%xmm3 \n"\
2083 "movaps 0x1000(%0,%1), %%xmm4 \n"\
2084 "mulps %%xmm5, %%xmm0 \n"\
2085 "mulps %%xmm6, %%xmm1 \n"\
2086 "mulps %%xmm5, %%xmm2 \n"\
2087 "mulps %%xmm7, %%xmm3 \n"\
2088 "mulps %%xmm7, %%xmm4 \n"\
2089 stereo("addps %%xmm1, %%xmm0 \n")\
2090 "addps %%xmm1, %%xmm2 \n"\
2091 "addps %%xmm3, %%xmm0 \n"\
2092 "addps %%xmm4, %%xmm2 \n"\
2093 mono("addps %%xmm2, %%xmm0 \n")\
2094 "movaps %%xmm0, (%0,%1) \n"\
2095 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2099 :"r"(samples[0]+len), "r"(matrix)\
2100 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2101 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
/*
 * MIX_MISC(stereo): generic inline-asm downmix kernel. `stereo` is a macro
 * wrapping the asm lines needed only when a second output channel exists.
 *
 * Operands (from the visible constraint lists):
 *   %0 = i (read/write), %1 = j, %2 = k (scratch outputs),
 *   %3 = samples[0]+len, %4 = matrix_simd+in_ch,
 *   %5 = (intptr_t)-32*(in_ch-1).
 *
 * Visible data flow: a vector is loaded from (%3,%0) and scaled by xmm4
 * (and by xmm5 into xmm1 for stereo); %1 is pointed 1024 bytes further
 * (lea 1024(%3,%0)); vectors read through %1 are multiplied by the
 * coefficients at (%4,%2) / 16(%4,%2) and accumulated; the sums are stored
 * to (%3,%0) and, for stereo, to 1024(%3,%0).
 * NOTE(review): the loop labels/branches and the setup of xmm4/xmm5 and %2
 * are not visible in this excerpt; presumably there is an inner loop over
 * the remaining input channels and an outer loop over samples -- confirm.
 */
2105 #define MIX_MISC(stereo)\
2108 "movaps (%3,%0), %%xmm0 \n"\
2109 stereo("movaps %%xmm0, %%xmm1 \n")\
2110 "mulps %%xmm4, %%xmm0 \n"\
2111 stereo("mulps %%xmm5, %%xmm1 \n")\
2112 "lea 1024(%3,%0), %1 \n"\
2115 "movaps (%1), %%xmm2 \n"\
2116 stereo("movaps %%xmm2, %%xmm3 \n")\
2117 "mulps (%4,%2), %%xmm2 \n"\
2118 stereo("mulps 16(%4,%2), %%xmm3 \n")\
2119 "addps %%xmm2, %%xmm0 \n"\
2120 stereo("addps %%xmm3, %%xmm1 \n")\
2124 "movaps %%xmm0, (%3,%0) \n"\
2125 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2128 :"+&r"(i), "=&r"(j), "=&r"(k)\
2129 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
/**
 * SSE downmix of AC-3 channel buffers.
 *
 * @param samples  channel buffers, 256 floats per channel
 * @param matrix   downmix coefficients, two floats per input channel
 * @param out_ch   number of output channels
 * @param in_ch    number of input channels
 * @param len      number of samples per channel to process
 *
 * matrix is also viewed as int pairs (matrix_cmp) so that coefficients can
 * be tested for zero / equality with integer bit operations. i is set to
 * -len*sizeof(float), a negative byte offset relative to samples[0]+len.
 *
 * Three cases are distinguished:
 *  - in_ch == 5, out_ch == 2 and the matrix has zero bits in [0][1],
 *    [2][0], [3][1], [4][0] with [1][0] == [1][1] and [0][0] == [2][1];
 *  - in_ch == 5, out_ch == 1 with [0][0] == [2][0] and [3][0] == [4][0];
 *  - otherwise every coefficient is broadcast to four lanes (movss +
 *    shufps $0) and stored into the 16-byte aligned matrix_simd table,
 *    starting from j = 2*in_ch*sizeof(float).
 * NOTE(review): the statements executed in each branch are not visible in
 * this excerpt; presumably the first two invoke MIX5 and the last one
 * MIX_MISC -- confirm.
 */
2133 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2135 int (*matrix_cmp)[2] = (int(*)[2])matrix;
2138 i = -len*sizeof(float);
2139 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2141 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2144 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2145 j = 2*in_ch*sizeof(float);
2149 "movss (%2,%0), %%xmm4 \n"
2150 "movss 4(%2,%0), %%xmm5 \n"
2151 "shufps $0, %%xmm4, %%xmm4 \n"
2152 "shufps $0, %%xmm5, %%xmm5 \n"
2153 "movaps %%xmm4, (%1,%0,4) \n"
2154 "movaps %%xmm5, 16(%1,%0,4) \n"
2157 :"r"(matrix_simd), "r"(matrix)
/**
 * 3DNow! element-wise product: dst[k] = src0[k] * src1[k].
 *
 * i starts at (len-4)*4, the byte offset of the last group of four floats;
 * each pass loads four floats of src0 (two movq), multiplies them by src1
 * with pfmul and stores the result to dst at the same offset.
 * NOTE(review): the output-operand list and loop control are not visible
 * in this excerpt; presumably %0 is i and the loop walks downwards in
 * steps of 16 bytes, so len should be a multiple of 4 -- confirm.
 */
2168 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2169 x86_reg i = (len-4)*4;
2172 "movq (%2,%0), %%mm0 \n\t"
2173 "movq 8(%2,%0), %%mm1 \n\t"
2174 "pfmul (%3,%0), %%mm0 \n\t"
2175 "pfmul 8(%3,%0), %%mm1 \n\t"
2176 "movq %%mm0, (%1,%0) \n\t"
2177 "movq %%mm1, 8(%1,%0) \n\t"
2182 :"r"(dst), "r"(src0), "r"(src1)
/**
 * SSE element-wise product: dst[k] = src0[k] * src1[k].
 *
 * i starts at (len-8)*4, the byte offset of the last group of eight
 * floats; each pass loads eight floats of src0 (two movaps), multiplies
 * them by src1 with mulps and stores the result to dst at the same offset.
 * movaps/mulps memory operands require 16-byte aligned addresses, so all
 * three arrays must be 16-byte aligned.
 * NOTE(review): the output-operand list and loop control are not visible
 * in this excerpt; presumably %0 is i and the loop walks downwards in
 * steps of 32 bytes, so len should be a multiple of 8 -- confirm.
 */
2186 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2187 x86_reg i = (len-8)*4;
2190 "movaps (%2,%0), %%xmm0 \n\t"
2191 "movaps 16(%2,%0), %%xmm1 \n\t"
2192 "mulps (%3,%0), %%xmm0 \n\t"
2193 "mulps 16(%3,%0), %%xmm1 \n\t"
2194 "movaps %%xmm0, (%1,%0) \n\t"
2195 "movaps %%xmm1, 16(%1,%0) \n\t"
2199 :"r"(dst), "r"(src0), "r"(src1)
/**
 * Extended-3DNow! product with one operand reversed: dst and src0 are
 * indexed by the byte offset i (starting at len*4-16, the last group of
 * four floats) while src1 is read through its own pointer.
 *
 * Operands: %0 = i, %1 = src1 (both read/write), %2 = dst, %3 = src0.
 * Each pass loads four floats from src1 with pswapd (which swaps the two
 * floats of each 64-bit half, and the halves are taken in 8(%1), (%1)
 * order), multiplies them by src0 at offset i and stores to dst at
 * offset i. femms is executed afterwards to leave the MMX/3DNow! state.
 * NOTE(review): the pointer/index updates and the loop branch are not
 * visible in this excerpt; presumably src1 advances by 16 bytes while i
 * decreases by 16 -- confirm.
 */
2204 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2205 x86_reg i = len*4-16;
2208 "pswapd 8(%1), %%mm0 \n\t"
2209 "pswapd (%1), %%mm1 \n\t"
2210 "pfmul (%3,%0), %%mm0 \n\t"
2211 "pfmul 8(%3,%0), %%mm1 \n\t"
2212 "movq %%mm0, (%2,%0) \n\t"
2213 "movq %%mm1, 8(%2,%0) \n\t"
2217 :"+r"(i), "+r"(src1)
2218 :"r"(dst), "r"(src0)
2220 __asm__ volatile("femms");
/**
 * SSE product with one operand reversed: dst and src0 are indexed by the
 * byte offset i (starting at len*4-32, the last group of eight floats)
 * while src1 is read through its own pointer.
 *
 * Operands: %0 = i, %1 = src1 (both read/write), %2 = dst, %3 = src0.
 * Each pass loads eight floats from src1 (16(%1) first, then (%1)),
 * reverses the four lanes of each vector with shufps $0x1b, multiplies by
 * src0 at offset i and stores to dst at offset i.
 * movaps/mulps memory operands require 16-byte aligned addresses.
 * NOTE(review): the pointer/index updates and the loop branch are not
 * visible in this excerpt; presumably src1 advances by 32 bytes while i
 * decreases by 32 -- confirm.
 */
2222 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2223 x86_reg i = len*4-32;
2226 "movaps 16(%1), %%xmm0 \n\t"
2227 "movaps (%1), %%xmm1 \n\t"
2228 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2229 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2230 "mulps (%3,%0), %%xmm0 \n\t"
2231 "mulps 16(%3,%0), %%xmm1 \n\t"
2232 "movaps %%xmm0, (%2,%0) \n\t"
2233 "movaps %%xmm1, 16(%2,%0) \n\t"
2237 :"+r"(i), "+r"(src1)
2238 :"r"(dst), "r"(src0)
/**
 * 3DNow! multiply-add: dst[k] = src0[k] * src1[k] + src2[k].
 *
 * i starts at (len-4)*4, the byte offset of the last group of four floats;
 * each pass loads four floats of src0, multiplies by src1 (pfmul), adds
 * src2 (pfadd) and stores to dst at the same offset. femms is executed
 * afterwards to leave the MMX/3DNow! state.
 * NOTE(review): the output-operand list and loop control are not visible
 * in this excerpt; presumably %0 is i and the loop walks downwards in
 * steps of 16 bytes, so len should be a multiple of 4 -- confirm.
 */
2242 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2243 const float *src2, int len){
2244 x86_reg i = (len-4)*4;
2247 "movq (%2,%0), %%mm0 \n\t"
2248 "movq 8(%2,%0), %%mm1 \n\t"
2249 "pfmul (%3,%0), %%mm0 \n\t"
2250 "pfmul 8(%3,%0), %%mm1 \n\t"
2251 "pfadd (%4,%0), %%mm0 \n\t"
2252 "pfadd 8(%4,%0), %%mm1 \n\t"
2253 "movq %%mm0, (%1,%0) \n\t"
2254 "movq %%mm1, 8(%1,%0) \n\t"
2258 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2261 __asm__ volatile("femms");
/**
 * SSE multiply-add: dst[k] = src0[k] * src1[k] + src2[k].
 *
 * i starts at (len-8)*4, the byte offset of the last group of eight
 * floats; each pass loads eight floats of src0, multiplies by src1
 * (mulps), adds src2 (addps) and stores to dst at the same offset.
 * movaps/mulps/addps memory operands require 16-byte aligned addresses,
 * so all four arrays must be 16-byte aligned.
 * NOTE(review): the output-operand list and loop control are not visible
 * in this excerpt; presumably %0 is i and the loop walks downwards in
 * steps of 32 bytes, so len should be a multiple of 8 -- confirm.
 */
2263 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2264 const float *src2, int len){
2265 x86_reg i = (len-8)*4;
2268 "movaps (%2,%0), %%xmm0 \n\t"
2269 "movaps 16(%2,%0), %%xmm1 \n\t"
2270 "mulps (%3,%0), %%xmm0 \n\t"
2271 "mulps 16(%3,%0), %%xmm1 \n\t"
2272 "addps (%4,%0), %%xmm0 \n\t"
2273 "addps 16(%4,%0), %%xmm1 \n\t"
2274 "movaps %%xmm0, (%1,%0) \n\t"
2275 "movaps %%xmm1, 16(%1,%0) \n\t"
2279 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/**
 * Extended-3DNow! windowed overlap of src0 and src1 into dst.
 *
 * The asm is handed pointers to the middle of the buffers (dst+len,
 * src0+len, win+len) plus src1, and addresses them with two byte offsets:
 * %0 and %1. j starts at len*4-8, the last pair of floats.
 * Each pass loads two floats per operand; the values read at offset %1 are
 * loaded with pswapd (swapped order). Four products are formed as listed
 * in the per-instruction comments; their difference is stored at
 * (dst+len)+%0 and their sum, swapped back with pswapd, at (dst+len)+%1.
 * NOTE(review): the declaration of the first index, the output-operand
 * list and the loop control are not visible in this excerpt; presumably
 * %0 is a negative offset i moving up while %1 = j moves down -- confirm.
 */
2285 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2286 const float *win, int len){
2288 x86_reg j = len*4-8;
2291 "pswapd (%5,%1), %%mm1 \n"
2292 "movq (%5,%0), %%mm0 \n"
2293 "pswapd (%4,%1), %%mm5 \n"
2294 "movq (%3,%0), %%mm4 \n"
2295 "movq %%mm0, %%mm2 \n"
2296 "movq %%mm1, %%mm3 \n"
2297 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2298 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2299 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2300 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2301 "pfadd %%mm3, %%mm2 \n"
2302 "pfsub %%mm0, %%mm1 \n"
2303 "pswapd %%mm2, %%mm2 \n"
2304 "movq %%mm1, (%2,%0) \n"
2305 "movq %%mm2, (%2,%1) \n"
2311 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/**
 * SSE windowed overlap of src0 and src1 into dst.
 *
 * The asm is handed pointers to the middle of the buffers (dst+len,
 * src0+len, win+len) plus src1, and addresses them with two byte offsets:
 * %0 and %1. j starts at len*4-16, the last group of four floats.
 * Each pass loads four floats per operand; the vectors read at offset %1
 * are lane-reversed with shufps $0x1b. Four products are formed as listed
 * in the per-instruction comments; their difference is stored at
 * (dst+len)+%0 and their sum, reversed back, at (dst+len)+%1.
 * movaps requires 16-byte aligned addresses.
 * NOTE(review): the declaration of the first index, the output-operand
 * list and the loop control are not visible in this excerpt; presumably
 * %0 is a negative offset i moving up while %1 = j moves down -- confirm.
 */
2315 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2316 const float *win, int len){
2318 x86_reg j = len*4-16;
2321 "movaps (%5,%1), %%xmm1 \n"
2322 "movaps (%5,%0), %%xmm0 \n"
2323 "movaps (%4,%1), %%xmm5 \n"
2324 "movaps (%3,%0), %%xmm4 \n"
2325 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2326 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2327 "movaps %%xmm0, %%xmm2 \n"
2328 "movaps %%xmm1, %%xmm3 \n"
2329 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2330 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2331 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2332 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2333 "addps %%xmm3, %%xmm2 \n"
2334 "subps %%xmm0, %%xmm1 \n"
2335 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2336 "movaps %%xmm1, (%2,%0) \n"
2337 "movaps %%xmm2, (%2,%1) \n"
2342 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2345 #endif /* HAVE_6REGS */
/**
 * SSE clip of a float vector: dst[k] = min(max(src[k], min), max).
 *
 * min and max are passed as memory operands, broadcast to all four lanes
 * of xmm4 and xmm5 (movss + shufps $0). i starts at (len-16)*4, the byte
 * offset of the last group of sixteen floats; each pass loads sixteen
 * floats from src, applies maxps with the lower bound and minps with the
 * upper bound, and stores them to dst at the same offset.
 * movaps requires 16-byte aligned addresses, so src and dst must be
 * 16-byte aligned.
 * NOTE(review): the rest of the parameter list, the output-operand list
 * and the loop control are not visible in this excerpt; presumably %0 is i
 * and the loop walks downwards in steps of 64 bytes, so len should be a
 * multiple of 16 -- confirm.
 */
2347 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2350 x86_reg i = (len-16)*4;
2352 "movss %3, %%xmm4 \n"
2353 "movss %4, %%xmm5 \n"
2354 "shufps $0, %%xmm4, %%xmm4 \n"
2355 "shufps $0, %%xmm5, %%xmm5 \n"
2357 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2358 "movaps 16(%2,%0), %%xmm1 \n\t"
2359 "movaps 32(%2,%0), %%xmm2 \n\t"
2360 "movaps 48(%2,%0), %%xmm3 \n\t"
2361 "maxps %%xmm4, %%xmm0 \n\t"
2362 "maxps %%xmm4, %%xmm1 \n\t"
2363 "maxps %%xmm4, %%xmm2 \n\t"
2364 "maxps %%xmm4, %%xmm3 \n\t"
2365 "minps %%xmm5, %%xmm0 \n\t"
2366 "minps %%xmm5, %%xmm1 \n\t"
2367 "minps %%xmm5, %%xmm2 \n\t"
2368 "minps %%xmm5, %%xmm3 \n\t"
2369 "movaps %%xmm0, (%1,%0) \n\t"
2370 "movaps %%xmm1, 16(%1,%0) \n\t"
2371 "movaps %%xmm2, 32(%1,%0) \n\t"
2372 "movaps %%xmm3, 48(%1,%0) \n\t"
2376 :"r"(dst), "r"(src), "m"(min), "m"(max)
/*
 * Prototypes of the VP3 routines referenced by dsputil_init_mmx(): the
 * MMX and SSE2 IDCT (plain, put and add variants), the MMX2 DC-only add
 * and the MMX2 vertical/horizontal loop filters. They are only declared
 * here; the definitions live outside this excerpt (presumably in yasm
 * assembly, since their use is guarded by HAVE_YASM -- confirm).
 */
2381 void ff_vp3_idct_mmx(int16_t *input_data);
2382 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2383 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2385 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2387 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2388 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2390 void ff_vp3_idct_sse2(int16_t *input_data);
2391 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2392 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
/*
 * Prototypes of the int16 scalar-product routines (MMX2, SSE2 and SSSE3
 * variants), defined outside this excerpt. The *_and_madd_* variants take
 * a writable v1 plus v3 and mul.
 * NOTE(review): presumably they return the dot product of v1 and v2 while
 * updating v1 with v3 scaled by mul -- verify against the implementation.
 */
2394 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2395 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2396 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2397 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2398 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
/*
 * Prototypes of the int16 windowing routines, defined outside this
 * excerpt. All share the signature (output, input, window, len).
 * dsputil_init_mmx() selects the *_ba variants when CODEC_FLAG_BITEXACT is
 * set and the *_atom variant when AV_CPU_FLAG_ATOM is set.
 */
2400 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2401 const int16_t *window, unsigned int len);
2402 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2403 const int16_t *window, unsigned int len);
2404 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2405 const int16_t *window, unsigned int len);
2406 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2407 const int16_t *window, unsigned int len);
2408 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2409 const int16_t *window, unsigned int len);
2410 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2411 const int16_t *window, unsigned int len);
/*
 * Prototypes of the HuffYUV prediction routines (median prediction for
 * MMX2, left prediction for SSSE3 and "SSE4"), defined outside this
 * excerpt. dsputil_init_mmx() installs them into
 * c->add_hfyu_median_prediction and c->add_hfyu_left_prediction.
 */
2413 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2414 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2415 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
/* Prototype of the SSE float scalar product, defined outside this excerpt;
 * installed into c->scalarproduct_float by dsputil_init_mmx(). */
2417 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2419 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2421 int mm_flags = av_get_cpu_flags();
2422 const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
2424 if (avctx->dsp_mask) {
2425 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2426 mm_flags |= (avctx->dsp_mask & 0xffff);
2428 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2432 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2433 if (mm_flags & AV_CPU_FLAG_MMX)
2434 av_log(avctx, AV_LOG_INFO, " mmx");
2435 if (mm_flags & AV_CPU_FLAG_MMX2)
2436 av_log(avctx, AV_LOG_INFO, " mmx2");
2437 if (mm_flags & AV_CPU_FLAG_3DNOW)
2438 av_log(avctx, AV_LOG_INFO, " 3dnow");
2439 if (mm_flags & AV_CPU_FLAG_SSE)
2440 av_log(avctx, AV_LOG_INFO, " sse");
2441 if (mm_flags & AV_CPU_FLAG_SSE2)
2442 av_log(avctx, AV_LOG_INFO, " sse2");
2443 av_log(avctx, AV_LOG_INFO, "\n");
2446 if (mm_flags & AV_CPU_FLAG_MMX) {
2447 const int idct_algo= avctx->idct_algo;
2449 if(avctx->lowres==0){
2450 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2451 c->idct_put= ff_simple_idct_put_mmx;
2452 c->idct_add= ff_simple_idct_add_mmx;
2453 c->idct = ff_simple_idct_mmx;
2454 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2456 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2457 if(mm_flags & AV_CPU_FLAG_MMX2){
2458 c->idct_put= ff_libmpeg2mmx2_idct_put;
2459 c->idct_add= ff_libmpeg2mmx2_idct_add;
2460 c->idct = ff_mmxext_idct;
2462 c->idct_put= ff_libmpeg2mmx_idct_put;
2463 c->idct_add= ff_libmpeg2mmx_idct_add;
2464 c->idct = ff_mmx_idct;
2466 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2468 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2469 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2470 if(mm_flags & AV_CPU_FLAG_SSE2){
2471 c->idct_put= ff_vp3_idct_put_sse2;
2472 c->idct_add= ff_vp3_idct_add_sse2;
2473 c->idct = ff_vp3_idct_sse2;
2474 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2476 c->idct_put= ff_vp3_idct_put_mmx;
2477 c->idct_add= ff_vp3_idct_add_mmx;
2478 c->idct = ff_vp3_idct_mmx;
2479 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2481 }else if(idct_algo==FF_IDCT_CAVS){
2482 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2483 }else if(idct_algo==FF_IDCT_XVIDMMX){
2484 if(mm_flags & AV_CPU_FLAG_SSE2){
2485 c->idct_put= ff_idct_xvid_sse2_put;
2486 c->idct_add= ff_idct_xvid_sse2_add;
2487 c->idct = ff_idct_xvid_sse2;
2488 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2489 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2490 c->idct_put= ff_idct_xvid_mmx2_put;
2491 c->idct_add= ff_idct_xvid_mmx2_add;
2492 c->idct = ff_idct_xvid_mmx2;
2494 c->idct_put= ff_idct_xvid_mmx_put;
2495 c->idct_add= ff_idct_xvid_mmx_add;
2496 c->idct = ff_idct_xvid_mmx;
2501 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2502 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2503 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2504 if (!high_bit_depth) {
2505 c->clear_block = clear_block_mmx;
2506 c->clear_blocks = clear_blocks_mmx;
2507 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2508 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2509 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2510 c->clear_block = clear_block_sse;
2511 c->clear_blocks = clear_blocks_sse;
2515 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2516 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2517 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2518 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2519 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2521 if (!high_bit_depth) {
2522 SET_HPEL_FUNCS(put, 0, 16, mmx);
2523 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2524 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2525 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2526 SET_HPEL_FUNCS(put, 1, 8, mmx);
2527 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2528 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2529 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2532 #if ARCH_X86_32 || !HAVE_YASM
2535 #if ARCH_X86_32 && HAVE_YASM
2536 if (!high_bit_depth)
2537 c->emulated_edge_mc = emulated_edge_mc_mmx;
2540 c->add_bytes= add_bytes_mmx;
2541 c->add_bytes_l2= add_bytes_l2_mmx;
2543 if (!high_bit_depth)
2544 c->draw_edges = draw_edges_mmx;
2546 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2547 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2548 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2552 if (!high_bit_depth) {
2553 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2554 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2557 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2558 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2561 if (mm_flags & AV_CPU_FLAG_MMX2) {
2562 c->prefetch = prefetch_mmx2;
2564 if (!high_bit_depth) {
2565 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2566 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2568 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2569 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2570 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2572 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2573 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2575 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2576 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2577 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2580 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2581 if (!high_bit_depth) {
2582 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2583 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2584 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2585 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2586 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2587 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2590 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2591 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2592 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2595 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2596 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2599 if (CONFIG_VP3_DECODER
2600 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2601 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2602 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2605 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2606 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
2607 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
2608 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
2609 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
2610 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
2611 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
2612 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
2613 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
2614 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
2615 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
2616 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
2617 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
2618 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
2619 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
2620 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
2621 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
2623 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
2624 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
2625 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
2626 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
2627 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
2628 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
2630 if (!high_bit_depth) {
2631 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
2632 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
2633 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
2634 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
2635 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
2636 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
2639 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
2640 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
2641 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
2642 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2645 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2646 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2648 if (!high_bit_depth) {
2649 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2650 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2651 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2652 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2655 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2657 #if HAVE_7REGS && HAVE_TEN_OPERANDS
2658 if( mm_flags&AV_CPU_FLAG_3DNOW )
2659 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2662 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2663 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2664 c->prefetch = prefetch_3dnow;
2666 if (!high_bit_depth) {
2667 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2668 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2670 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2671 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2672 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2674 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2675 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2677 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2678 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2679 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2681 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2682 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2683 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2684 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2685 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2686 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2687 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2691 if (CONFIG_VP3_DECODER
2692 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2693 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2694 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2697 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
2698 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
2699 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
2700 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
2701 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
2702 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
2704 if (!high_bit_depth) {
2705 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
2706 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
2707 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
2708 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
2709 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
2710 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
2713 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
2714 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
2715 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
2716 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
2719 if (!high_bit_depth) {
2720 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2721 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2724 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2725 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2730 #define H264_QPEL_FUNCS(x, y, CPU)\
2731 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2732 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2733 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2734 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2735 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2736 // these functions are slower than mmx on AMD, but faster on Intel
2737 if (!high_bit_depth) {
2738 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2739 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2740 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2741 H264_QPEL_FUNCS(0, 0, sse2);
2744 if(mm_flags & AV_CPU_FLAG_SSE2){
2745 if (!high_bit_depth) {
2746 H264_QPEL_FUNCS(0, 1, sse2);
2747 H264_QPEL_FUNCS(0, 2, sse2);
2748 H264_QPEL_FUNCS(0, 3, sse2);
2749 H264_QPEL_FUNCS(1, 1, sse2);
2750 H264_QPEL_FUNCS(1, 2, sse2);
2751 H264_QPEL_FUNCS(1, 3, sse2);
2752 H264_QPEL_FUNCS(2, 1, sse2);
2753 H264_QPEL_FUNCS(2, 2, sse2);
2754 H264_QPEL_FUNCS(2, 3, sse2);
2755 H264_QPEL_FUNCS(3, 1, sse2);
2756 H264_QPEL_FUNCS(3, 2, sse2);
2757 H264_QPEL_FUNCS(3, 3, sse2);
2761 if(mm_flags & AV_CPU_FLAG_SSSE3){
2762 if (!high_bit_depth) {
2763 H264_QPEL_FUNCS(1, 0, ssse3);
2764 H264_QPEL_FUNCS(1, 1, ssse3);
2765 H264_QPEL_FUNCS(1, 2, ssse3);
2766 H264_QPEL_FUNCS(1, 3, ssse3);
2767 H264_QPEL_FUNCS(2, 0, ssse3);
2768 H264_QPEL_FUNCS(2, 1, ssse3);
2769 H264_QPEL_FUNCS(2, 2, ssse3);
2770 H264_QPEL_FUNCS(2, 3, ssse3);
2771 H264_QPEL_FUNCS(3, 0, ssse3);
2772 H264_QPEL_FUNCS(3, 1, ssse3);
2773 H264_QPEL_FUNCS(3, 2, ssse3);
2774 H264_QPEL_FUNCS(3, 3, ssse3);
2776 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
2778 if (!high_bit_depth) {
2779 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2780 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2781 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2782 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2784 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2785 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2786 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2791 if(mm_flags & AV_CPU_FLAG_3DNOW){
2792 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2793 c->vector_fmul = vector_fmul_3dnow;
2795 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2796 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2798 c->vector_fmul_window = vector_fmul_window_3dnow2;
2801 if(mm_flags & AV_CPU_FLAG_MMX2){
2803 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2804 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2805 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2806 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2808 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2812 if(mm_flags & AV_CPU_FLAG_SSE){
2813 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2814 c->ac3_downmix = ac3_downmix_sse;
2815 c->vector_fmul = vector_fmul_sse;
2816 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2817 c->vector_fmul_add = vector_fmul_add_sse;
2819 c->vector_fmul_window = vector_fmul_window_sse;
2821 c->vector_clipf = vector_clipf_sse;
2823 c->scalarproduct_float = ff_scalarproduct_float_sse;
2826 if(mm_flags & AV_CPU_FLAG_3DNOW)
2827 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
2828 if(mm_flags & AV_CPU_FLAG_SSE2){
2830 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2831 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2832 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2833 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2835 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2836 c->apply_window_int16 = ff_apply_window_int16_sse2;
2840 if (!high_bit_depth)
2841 c->emulated_edge_mc = emulated_edge_mc_sse;
2845 if (mm_flags & AV_CPU_FLAG_SSSE3) {
2847 if (mm_flags & AV_CPU_FLAG_ATOM) {
2848 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2850 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2852 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2853 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2859 if (CONFIG_ENCODERS)
2860 dsputilenc_init_mmx(c, avctx);
2863 // for speed testing
2864 get_pixels = just_return;
2865 put_pixels_clamped = just_return;
2866 add_pixels_clamped = just_return;
2868 pix_abs16x16 = just_return;
2869 pix_abs16x16_x2 = just_return;
2870 pix_abs16x16_y2 = just_return;
2871 pix_abs16x16_xy2 = just_return;
2873 put_pixels_tab[0] = just_return;
2874 put_pixels_tab[1] = just_return;
2875 put_pixels_tab[2] = just_return;
2876 put_pixels_tab[3] = just_return;
2878 put_no_rnd_pixels_tab[0] = just_return;
2879 put_no_rnd_pixels_tab[1] = just_return;
2880 put_no_rnd_pixels_tab[2] = just_return;
2881 put_no_rnd_pixels_tab[3] = just_return;
2883 avg_pixels_tab[0] = just_return;
2884 avg_pixels_tab[1] = just_return;
2885 avg_pixels_tab[2] = just_return;
2886 avg_pixels_tab[3] = just_return;
2888 avg_no_rnd_pixels_tab[0] = just_return;
2889 avg_no_rnd_pixels_tab[1] = just_return;
2890 avg_no_rnd_pixels_tab[2] = just_return;
2891 avg_no_rnd_pixels_tab[3] = just_return;
2893 //av_fdct = just_return;
2894 //ff_idct = just_return;