/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 */
#include "libavutil/cpu.h"
#include "libavutil/x86_cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/h264dsp.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/simple_idct.h"
#include "libavcodec/ac3dec.h"
#include "dsputil_mmx.h"
#include "idct_xvid.h"
38 /* pixel operations */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 {0x8000000080000000ULL, 0x8000000080000000ULL};
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
51 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
61 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
81 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
82 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Small helpers to load per-routine SIMD constants into an MMX register.
 * The stray extraction line numbers were removed and the missing
 * `__asm__ volatile (` lines and PIC guards restored. */
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd)  __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)

/* Set every byte of regd to 0x02 via -1 -> (-1)+(-1) per byte. */
#define MOVQ_BFE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "   \n\t" \
    "paddb %%" #regd ", %%" #regd "     \n\t" ::)

#ifndef PIC
#define MOVQ_BONE(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
#define MOVQ_WTWO(regd)  __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
#else
// for shared library it's better to use this way for accessing constants
// pcmpeqd -> -1
#define MOVQ_BONE(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "   \n\t" \
    "psrlw $15, %%" #regd "             \n\t" \
    "packuswb %%" #regd ", %%" #regd "  \n\t" ::)

#define MOVQ_WTWO(regd) \
    __asm__ volatile ( \
    "pcmpeqd %%" #regd ", %%" #regd "   \n\t" \
    "psrlw $15, %%" #regd "             \n\t" \
    "psllw $1, %%" #regd "              \n\t"::)
#endif
// using regr as temporary and for the output result
// first argument is unmodifed and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
/* Byte-wise average without rounding: (a & b) + ((a ^ b & ~1) >> 1). */
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"

/* Byte-wise average with rounding: (a | b) - ((a ^ b & ~1) >> 1). */
#define PAVGB_MMX(rega, regb, regr, regfe) \
    "movq " #rega ", " #regr "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pand " #regfe "," #regb "  \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"

// mm6 is supposed to contain 0xfefefefefefefefe
/* Two parallel no-round averages; mm6 holds the 0xFE mask. */
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "pand " #regb ", " #regr "  \n\t"\
    "pand " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "paddb " #regb ", " #regr " \n\t"\
    "paddb " #regd ", " #regp " \n\t"

/* Two parallel rounding averages; mm6 holds the 0xFE mask. */
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
    "movq " #rega ", " #regr "  \n\t"\
    "movq " #regc ", " #regp "  \n\t"\
    "por  " #regb ", " #regr "  \n\t"\
    "por  " #regd ", " #regp "  \n\t"\
    "pxor " #rega ", " #regb "  \n\t"\
    "pxor " #regc ", " #regd "  \n\t"\
    "pand %%mm6, " #regb "      \n\t"\
    "pand %%mm6, " #regd "      \n\t"\
    "psrlq $1, " #regd "        \n\t"\
    "psrlq $1, " #regb "        \n\t"\
    "psubb " #regb ", " #regr " \n\t"\
    "psubb " #regd ", " #regp " \n\t"
160 /***********************************/
161 /* MMX no rounding */
162 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
163 #define SET_RND MOVQ_WONE
164 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
165 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
166 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
168 #include "dsputil_mmx_rnd_template.c"
174 /***********************************/
177 #define DEF(x, y) x ## _ ## y ##_mmx
178 #define SET_RND MOVQ_WTWO
179 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
180 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
182 #include "dsputil_mmx_rnd_template.c"
190 /***********************************/
193 #define DEF(x) x ## _3dnow
194 #define PAVGB "pavgusb"
197 #include "dsputil_mmx_avg_template.c"
203 /***********************************/
206 #define DEF(x) x ## _mmx2
208 /* Introduced only in MMX2 set */
209 #define PAVGB "pavgb"
212 #include "dsputil_mmx_avg_template.c"
218 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
219 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
220 #define put_pixels16_mmx2 put_pixels16_mmx
221 #define put_pixels8_mmx2 put_pixels8_mmx
222 #define put_pixels4_mmx2 put_pixels4_mmx
223 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
224 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
225 #define put_pixels16_3dnow put_pixels16_mmx
226 #define put_pixels8_3dnow put_pixels8_mmx
227 #define put_pixels4_3dnow put_pixels4_mmx
228 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
229 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
231 /***********************************/
234 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
239 /* read the pixels */
244 "movq %3, %%mm0 \n\t"
245 "movq 8%3, %%mm1 \n\t"
246 "movq 16%3, %%mm2 \n\t"
247 "movq 24%3, %%mm3 \n\t"
248 "movq 32%3, %%mm4 \n\t"
249 "movq 40%3, %%mm5 \n\t"
250 "movq 48%3, %%mm6 \n\t"
251 "movq 56%3, %%mm7 \n\t"
252 "packuswb %%mm1, %%mm0 \n\t"
253 "packuswb %%mm3, %%mm2 \n\t"
254 "packuswb %%mm5, %%mm4 \n\t"
255 "packuswb %%mm7, %%mm6 \n\t"
256 "movq %%mm0, (%0) \n\t"
257 "movq %%mm2, (%0, %1) \n\t"
258 "movq %%mm4, (%0, %1, 2) \n\t"
259 "movq %%mm6, (%0, %2) \n\t"
260 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
265 // if here would be an exact copy of the code above
266 // compiler would generate some very strange code
269 "movq (%3), %%mm0 \n\t"
270 "movq 8(%3), %%mm1 \n\t"
271 "movq 16(%3), %%mm2 \n\t"
272 "movq 24(%3), %%mm3 \n\t"
273 "movq 32(%3), %%mm4 \n\t"
274 "movq 40(%3), %%mm5 \n\t"
275 "movq 48(%3), %%mm6 \n\t"
276 "movq 56(%3), %%mm7 \n\t"
277 "packuswb %%mm1, %%mm0 \n\t"
278 "packuswb %%mm3, %%mm2 \n\t"
279 "packuswb %%mm5, %%mm4 \n\t"
280 "packuswb %%mm7, %%mm6 \n\t"
281 "movq %%mm0, (%0) \n\t"
282 "movq %%mm2, (%0, %1) \n\t"
283 "movq %%mm4, (%0, %1, 2) \n\t"
284 "movq %%mm6, (%0, %2) \n\t"
285 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/* Store 4 rows of one half of a signed-clamped 8x8 block: pack words to
 * signed bytes, bias by mm0 (preloaded with 0x80 bytes) to get unsigned
 * pixels. %0 = dst, %1 = 3*stride, %2 = coefficients, %3 = stride. */
#define put_signed_pixels_clamped_mmx_half(off) \
    "movq          "#off"(%2), %%mm1    \n\t"\
    "movq     16 + "#off"(%2), %%mm2    \n\t"\
    "movq     32 + "#off"(%2), %%mm3    \n\t"\
    "movq     48 + "#off"(%2), %%mm4    \n\t"\
    "packsswb  8 + "#off"(%2), %%mm1    \n\t"\
    "packsswb 24 + "#off"(%2), %%mm2    \n\t"\
    "packsswb 40 + "#off"(%2), %%mm3    \n\t"\
    "packsswb 56 + "#off"(%2), %%mm4    \n\t"\
    "paddb %%mm0, %%mm1                 \n\t"\
    "paddb %%mm0, %%mm2                 \n\t"\
    "paddb %%mm0, %%mm3                 \n\t"\
    "paddb %%mm0, %%mm4                 \n\t"\
    "movq %%mm1, (%0)                   \n\t"\
    "movq %%mm2, (%0, %3)               \n\t"\
    "movq %%mm3, (%0, %3, 2)            \n\t"\
    "movq %%mm4, (%0, %1)               \n\t"
307 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
309 x86_reg line_skip = line_size;
313 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
314 "lea (%3, %3, 2), %1 \n\t"
315 put_signed_pixels_clamped_mmx_half(0)
316 "lea (%0, %3, 4), %0 \n\t"
317 put_signed_pixels_clamped_mmx_half(64)
318 :"+&r" (pixels), "=&r" (line_skip3)
319 :"r" (block), "r"(line_skip)
323 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
329 /* read the pixels */
336 "movq (%2), %%mm0 \n\t"
337 "movq 8(%2), %%mm1 \n\t"
338 "movq 16(%2), %%mm2 \n\t"
339 "movq 24(%2), %%mm3 \n\t"
340 "movq %0, %%mm4 \n\t"
341 "movq %1, %%mm6 \n\t"
342 "movq %%mm4, %%mm5 \n\t"
343 "punpcklbw %%mm7, %%mm4 \n\t"
344 "punpckhbw %%mm7, %%mm5 \n\t"
345 "paddsw %%mm4, %%mm0 \n\t"
346 "paddsw %%mm5, %%mm1 \n\t"
347 "movq %%mm6, %%mm5 \n\t"
348 "punpcklbw %%mm7, %%mm6 \n\t"
349 "punpckhbw %%mm7, %%mm5 \n\t"
350 "paddsw %%mm6, %%mm2 \n\t"
351 "paddsw %%mm5, %%mm3 \n\t"
352 "packuswb %%mm1, %%mm0 \n\t"
353 "packuswb %%mm3, %%mm2 \n\t"
354 "movq %%mm0, %0 \n\t"
355 "movq %%mm2, %1 \n\t"
356 :"+m"(*pix), "+m"(*(pix+line_size))
364 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
367 "lea (%3, %3), %%"REG_a" \n\t"
370 "movd (%1), %%mm0 \n\t"
371 "movd (%1, %3), %%mm1 \n\t"
372 "movd %%mm0, (%2) \n\t"
373 "movd %%mm1, (%2, %3) \n\t"
374 "add %%"REG_a", %1 \n\t"
375 "add %%"REG_a", %2 \n\t"
376 "movd (%1), %%mm0 \n\t"
377 "movd (%1, %3), %%mm1 \n\t"
378 "movd %%mm0, (%2) \n\t"
379 "movd %%mm1, (%2, %3) \n\t"
380 "add %%"REG_a", %1 \n\t"
381 "add %%"REG_a", %2 \n\t"
384 : "+g"(h), "+r" (pixels), "+r" (block)
385 : "r"((x86_reg)line_size)
390 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
393 "lea (%3, %3), %%"REG_a" \n\t"
396 "movq (%1), %%mm0 \n\t"
397 "movq (%1, %3), %%mm1 \n\t"
398 "movq %%mm0, (%2) \n\t"
399 "movq %%mm1, (%2, %3) \n\t"
400 "add %%"REG_a", %1 \n\t"
401 "add %%"REG_a", %2 \n\t"
402 "movq (%1), %%mm0 \n\t"
403 "movq (%1, %3), %%mm1 \n\t"
404 "movq %%mm0, (%2) \n\t"
405 "movq %%mm1, (%2, %3) \n\t"
406 "add %%"REG_a", %1 \n\t"
407 "add %%"REG_a", %2 \n\t"
410 : "+g"(h), "+r" (pixels), "+r" (block)
411 : "r"((x86_reg)line_size)
416 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
419 "lea (%3, %3), %%"REG_a" \n\t"
422 "movq (%1), %%mm0 \n\t"
423 "movq 8(%1), %%mm4 \n\t"
424 "movq (%1, %3), %%mm1 \n\t"
425 "movq 8(%1, %3), %%mm5 \n\t"
426 "movq %%mm0, (%2) \n\t"
427 "movq %%mm4, 8(%2) \n\t"
428 "movq %%mm1, (%2, %3) \n\t"
429 "movq %%mm5, 8(%2, %3) \n\t"
430 "add %%"REG_a", %1 \n\t"
431 "add %%"REG_a", %2 \n\t"
432 "movq (%1), %%mm0 \n\t"
433 "movq 8(%1), %%mm4 \n\t"
434 "movq (%1, %3), %%mm1 \n\t"
435 "movq 8(%1, %3), %%mm5 \n\t"
436 "movq %%mm0, (%2) \n\t"
437 "movq %%mm4, 8(%2) \n\t"
438 "movq %%mm1, (%2, %3) \n\t"
439 "movq %%mm5, 8(%2, %3) \n\t"
440 "add %%"REG_a", %1 \n\t"
441 "add %%"REG_a", %2 \n\t"
444 : "+g"(h), "+r" (pixels), "+r" (block)
445 : "r"((x86_reg)line_size)
450 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
454 "movdqu (%1), %%xmm0 \n\t"
455 "movdqu (%1,%3), %%xmm1 \n\t"
456 "movdqu (%1,%3,2), %%xmm2 \n\t"
457 "movdqu (%1,%4), %%xmm3 \n\t"
458 "movdqa %%xmm0, (%2) \n\t"
459 "movdqa %%xmm1, (%2,%3) \n\t"
460 "movdqa %%xmm2, (%2,%3,2) \n\t"
461 "movdqa %%xmm3, (%2,%4) \n\t"
463 "lea (%1,%3,4), %1 \n\t"
464 "lea (%2,%3,4), %2 \n\t"
466 : "+g"(h), "+r" (pixels), "+r" (block)
467 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
472 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
476 "movdqu (%1), %%xmm0 \n\t"
477 "movdqu (%1,%3), %%xmm1 \n\t"
478 "movdqu (%1,%3,2), %%xmm2 \n\t"
479 "movdqu (%1,%4), %%xmm3 \n\t"
480 "pavgb (%2), %%xmm0 \n\t"
481 "pavgb (%2,%3), %%xmm1 \n\t"
482 "pavgb (%2,%3,2), %%xmm2 \n\t"
483 "pavgb (%2,%4), %%xmm3 \n\t"
484 "movdqa %%xmm0, (%2) \n\t"
485 "movdqa %%xmm1, (%2,%3) \n\t"
486 "movdqa %%xmm2, (%2,%3,2) \n\t"
487 "movdqa %%xmm3, (%2,%4) \n\t"
489 "lea (%1,%3,4), %1 \n\t"
490 "lea (%2,%3,4), %2 \n\t"
492 : "+g"(h), "+r" (pixels), "+r" (block)
493 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
498 #define CLEAR_BLOCKS(name,n) \
499 static void name(DCTELEM *blocks)\
502 "pxor %%mm7, %%mm7 \n\t"\
503 "mov %1, %%"REG_a" \n\t"\
505 "movq %%mm7, (%0, %%"REG_a") \n\t"\
506 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
507 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
508 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
509 "add $32, %%"REG_a" \n\t"\
511 : : "r" (((uint8_t *)blocks)+128*n),\
516 CLEAR_BLOCKS(clear_blocks_mmx, 6)
517 CLEAR_BLOCKS(clear_block_mmx, 1)
519 static void clear_block_sse(DCTELEM *block)
522 "xorps %%xmm0, %%xmm0 \n"
523 "movaps %%xmm0, (%0) \n"
524 "movaps %%xmm0, 16(%0) \n"
525 "movaps %%xmm0, 32(%0) \n"
526 "movaps %%xmm0, 48(%0) \n"
527 "movaps %%xmm0, 64(%0) \n"
528 "movaps %%xmm0, 80(%0) \n"
529 "movaps %%xmm0, 96(%0) \n"
530 "movaps %%xmm0, 112(%0) \n"
536 static void clear_blocks_sse(DCTELEM *blocks)
539 "xorps %%xmm0, %%xmm0 \n"
540 "mov %1, %%"REG_a" \n"
542 "movaps %%xmm0, (%0, %%"REG_a") \n"
543 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
544 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
545 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
546 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
550 "add $128, %%"REG_a" \n"
552 : : "r" (((uint8_t *)blocks)+128*6),
558 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
563 "movq (%1, %0), %%mm0 \n\t"
564 "movq (%2, %0), %%mm1 \n\t"
565 "paddb %%mm0, %%mm1 \n\t"
566 "movq %%mm1, (%2, %0) \n\t"
567 "movq 8(%1, %0), %%mm0 \n\t"
568 "movq 8(%2, %0), %%mm1 \n\t"
569 "paddb %%mm0, %%mm1 \n\t"
570 "movq %%mm1, 8(%2, %0) \n\t"
576 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
579 dst[i+0] += src[i+0];
582 static void add_bytes_l2_mmx(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w){
587 "movq (%2, %0), %%mm0 \n\t"
588 "movq 8(%2, %0), %%mm1 \n\t"
589 "paddb (%3, %0), %%mm0 \n\t"
590 "paddb 8(%3, %0), %%mm1 \n\t"
591 "movq %%mm0, (%1, %0) \n\t"
592 "movq %%mm1, 8(%1, %0) \n\t"
598 : "r"(dst), "r"(src1), "r"(src2), "r"((x86_reg)w-15)
601 dst[i] = src1[i] + src2[i];
604 #if HAVE_7REGS && HAVE_TEN_OPERANDS
/* HuffYUV median predictor using x86 cmov instead of branches.
 * NOTE(review): this fragment is damaged by extraction — stray leading
 * line numbers are baked into every line and most of the asm body is
 * missing; restore from a pristine copy before building. The visible
 * constraints show l/tl carrying the left and top-left pixel values
 * across the row, with dst/diff/top indexed from their row ends by a
 * negative counter (presumably w2) — confirm against upstream. */
605 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
608     int l = *left & 0xff;
609     int tl = *left_top & 0xff;
614         "movzbl (%3,%4), %2 \n"
627         "add (%6,%4), %b0 \n"
628         "mov %b0, (%5,%4) \n"
631         :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
632         :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/* H.263 deblocking loop filter core. Operates on 4 rows of 8 pixels
 * held in memory operands %0..%3 (rows -2..+1 relative to the edge),
 * with %4 = 2*strength and %5 = ff_pb_FC. On exit mm5/mm6 hold the two
 * filtered middle rows and mm3/mm4 the outer rows for the caller to
 * store. */
#define H263_LOOP_FILTER \
    "pxor %%mm7, %%mm7              \n\t"\
    "movq  %0, %%mm0                \n\t"\
    "movq  %0, %%mm1                \n\t"\
    "movq  %3, %%mm2                \n\t"\
    "movq  %3, %%mm3                \n\t"\
    "punpcklbw %%mm7, %%mm0         \n\t"\
    "punpckhbw %%mm7, %%mm1         \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "psubw %%mm2, %%mm0             \n\t"\
    "psubw %%mm3, %%mm1             \n\t"\
    "movq  %1, %%mm2                \n\t"\
    "movq  %1, %%mm3                \n\t"\
    "movq  %2, %%mm4                \n\t"\
    "movq  %2, %%mm5                \n\t"\
    "punpcklbw %%mm7, %%mm2         \n\t"\
    "punpckhbw %%mm7, %%mm3         \n\t"\
    "punpcklbw %%mm7, %%mm4         \n\t"\
    "punpckhbw %%mm7, %%mm5         \n\t"\
    "psubw %%mm2, %%mm4             \n\t"\
    "psubw %%mm3, %%mm5             \n\t"\
    "psllw $2, %%mm4                \n\t"\
    "psllw $2, %%mm5                \n\t"\
    "paddw %%mm0, %%mm4             \n\t"\
    "paddw %%mm1, %%mm5             \n\t"\
    "pxor %%mm6, %%mm6              \n\t"\
    "pcmpgtw %%mm4, %%mm6           \n\t"\
    "pcmpgtw %%mm5, %%mm7           \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "pxor %%mm7, %%mm5              \n\t"\
    "psubw %%mm6, %%mm4             \n\t"\
    "psubw %%mm7, %%mm5             \n\t"\
    "psrlw $3, %%mm4                \n\t"\
    "psrlw $3, %%mm5                \n\t"\
    "packuswb %%mm5, %%mm4          \n\t"\
    "packsswb %%mm7, %%mm6          \n\t"\
    "pxor %%mm7, %%mm7              \n\t"\
    "movd %4, %%mm2                 \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "punpcklbw %%mm2, %%mm2         \n\t"\
    "psubusb %%mm4, %%mm2           \n\t"\
    "movq %%mm2, %%mm3              \n\t"\
    "psubusb %%mm4, %%mm3           \n\t"\
    "psubb %%mm3, %%mm2             \n\t"\
    "movq %1, %%mm3                 \n\t"\
    "movq %2, %%mm4                 \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm3           \n\t"\
    "psubusb %%mm2, %%mm4           \n\t"\
    "pxor %%mm6, %%mm3              \n\t"\
    "pxor %%mm6, %%mm4              \n\t"\
    "paddusb %%mm2, %%mm2           \n\t"\
    "packsswb %%mm1, %%mm0          \n\t"\
    "pcmpgtb %%mm0, %%mm7           \n\t"\
    "pxor %%mm7, %%mm0              \n\t"\
    "psubb %%mm7, %%mm0             \n\t"\
    "movq %%mm0, %%mm1              \n\t"\
    "psubusb %%mm2, %%mm0           \n\t"\
    "psubb %%mm0, %%mm1             \n\t"\
    "pand %5, %%mm1                 \n\t"\
    "psrlw $2, %%mm1                \n\t"\
    "pxor %%mm7, %%mm1              \n\t"\
    "psubb %%mm7, %%mm1             \n\t"\
    "movq %0, %%mm5                 \n\t"\
    "movq %3, %%mm6                 \n\t"\
    "psubb %%mm1, %%mm5             \n\t"\
    "paddb %%mm1, %%mm6             \n\t"
710 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
711 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
712 const int strength= ff_h263_loop_filter_strength[qscale];
718 "movq %%mm3, %1 \n\t"
719 "movq %%mm4, %2 \n\t"
720 "movq %%mm5, %0 \n\t"
721 "movq %%mm6, %3 \n\t"
722 : "+m" (*(uint64_t*)(src - 2*stride)),
723 "+m" (*(uint64_t*)(src - 1*stride)),
724 "+m" (*(uint64_t*)(src + 0*stride)),
725 "+m" (*(uint64_t*)(src + 1*stride))
726 : "g" (2*strength), "m"(ff_pb_FC)
731 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
732 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
733 const int strength= ff_h263_loop_filter_strength[qscale];
734 DECLARE_ALIGNED(8, uint64_t, temp)[4];
735 uint8_t *btemp= (uint8_t*)temp;
739 transpose4x4(btemp , src , 8, stride);
740 transpose4x4(btemp+4, src + 4*stride, 8, stride);
742 H263_LOOP_FILTER // 5 3 4 6
748 : "g" (2*strength), "m"(ff_pb_FC)
752 "movq %%mm5, %%mm1 \n\t"
753 "movq %%mm4, %%mm0 \n\t"
754 "punpcklbw %%mm3, %%mm5 \n\t"
755 "punpcklbw %%mm6, %%mm4 \n\t"
756 "punpckhbw %%mm3, %%mm1 \n\t"
757 "punpckhbw %%mm6, %%mm0 \n\t"
758 "movq %%mm5, %%mm3 \n\t"
759 "movq %%mm1, %%mm6 \n\t"
760 "punpcklwd %%mm4, %%mm5 \n\t"
761 "punpcklwd %%mm0, %%mm1 \n\t"
762 "punpckhwd %%mm4, %%mm3 \n\t"
763 "punpckhwd %%mm0, %%mm6 \n\t"
764 "movd %%mm5, (%0) \n\t"
765 "punpckhdq %%mm5, %%mm5 \n\t"
766 "movd %%mm5, (%0,%2) \n\t"
767 "movd %%mm3, (%0,%2,2) \n\t"
768 "punpckhdq %%mm3, %%mm3 \n\t"
769 "movd %%mm3, (%0,%3) \n\t"
770 "movd %%mm1, (%1) \n\t"
771 "punpckhdq %%mm1, %%mm1 \n\t"
772 "movd %%mm1, (%1,%2) \n\t"
773 "movd %%mm6, (%1,%2,2) \n\t"
774 "punpckhdq %%mm6, %%mm6 \n\t"
775 "movd %%mm6, (%1,%3) \n\t"
777 "r" (src + 4*stride),
778 "r" ((x86_reg) stride ),
779 "r" ((x86_reg)(3*stride))
784 /* draw the edges of width 'w' of an image of size width, height
785 this mmx version can only handle w==8 || w==16 */
786 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
788 uint8_t *ptr, *last_line;
791 last_line = buf + (height - 1) * wrap;
798 "movd (%0), %%mm0 \n\t"
799 "punpcklbw %%mm0, %%mm0 \n\t"
800 "punpcklwd %%mm0, %%mm0 \n\t"
801 "punpckldq %%mm0, %%mm0 \n\t"
802 "movq %%mm0, -8(%0) \n\t"
803 "movq -8(%0, %2), %%mm1 \n\t"
804 "punpckhbw %%mm1, %%mm1 \n\t"
805 "punpckhwd %%mm1, %%mm1 \n\t"
806 "punpckhdq %%mm1, %%mm1 \n\t"
807 "movq %%mm1, (%0, %2) \n\t"
812 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
819 "movd (%0), %%mm0 \n\t"
820 "punpcklbw %%mm0, %%mm0 \n\t"
821 "punpcklwd %%mm0, %%mm0 \n\t"
822 "punpckldq %%mm0, %%mm0 \n\t"
823 "movq %%mm0, -8(%0) \n\t"
824 "movq %%mm0, -16(%0) \n\t"
825 "movq -8(%0, %2), %%mm1 \n\t"
826 "punpckhbw %%mm1, %%mm1 \n\t"
827 "punpckhwd %%mm1, %%mm1 \n\t"
828 "punpckhdq %%mm1, %%mm1 \n\t"
829 "movq %%mm1, (%0, %2) \n\t"
830 "movq %%mm1, 8(%0, %2) \n\t"
835 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
839 /* top and bottom (and hopefully also the corners) */
840 if (sides&EDGE_TOP) {
841 for(i = 0; i < w; i += 4) {
842 ptr= buf - (i + 1) * wrap - w;
845 "movq (%1, %0), %%mm0 \n\t"
846 "movq %%mm0, (%0) \n\t"
847 "movq %%mm0, (%0, %2) \n\t"
848 "movq %%mm0, (%0, %2, 2) \n\t"
849 "movq %%mm0, (%0, %3) \n\t"
854 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
859 if (sides&EDGE_BOTTOM) {
860 for(i = 0; i < w; i += 4) {
861 ptr= last_line + (i + 1) * wrap - w;
864 "movq (%1, %0), %%mm0 \n\t"
865 "movq %%mm0, (%0) \n\t"
866 "movq %%mm0, (%0, %2) \n\t"
867 "movq %%mm0, (%0, %2, 2) \n\t"
868 "movq %%mm0, (%0, %3) \n\t"
873 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/* PNG Paeth predictor (add_png_paeth_prediction_{mmx2,ssse3}).
 * The abs3 argument supplies the |p-a|,|p-b|,|p-c| computation:
 * ABS3_MMX2 uses psubw/pmaxsw, ABS3_SSSE3 uses pabsw.
 * NOTE(review): this fragment is damaged by extraction — stray leading
 * line numbers are baked into every line, and the function prologue,
 * loop-control lines, the abs3 insertion point, the remaining asm
 * operands, and the #define/#if lines framing ABS3_MMX2/ABS3_SSSE3 are
 * missing. Restore this region from a pristine copy before building;
 * left byte-identical here. */
879 #define PAETH(cpu, abs3)\
880 static void add_png_paeth_prediction_##cpu(uint8_t *dst, uint8_t *src, uint8_t *top, int w, int bpp)\
885 "pxor %%mm7, %%mm7 \n"\
886 "movd (%1,%0), %%mm0 \n"\
887 "movd (%2,%0), %%mm1 \n"\
888 "punpcklbw %%mm7, %%mm0 \n"\
889 "punpcklbw %%mm7, %%mm1 \n"\
892 "movq %%mm1, %%mm2 \n"\
893 "movd (%2,%0), %%mm1 \n"\
894 "movq %%mm2, %%mm3 \n"\
895 "punpcklbw %%mm7, %%mm1 \n"\
896 "movq %%mm2, %%mm4 \n"\
897 "psubw %%mm1, %%mm3 \n"\
898 "psubw %%mm0, %%mm4 \n"\
899 "movq %%mm3, %%mm5 \n"\
900 "paddw %%mm4, %%mm5 \n"\
902 "movq %%mm4, %%mm6 \n"\
903 "pminsw %%mm5, %%mm6 \n"\
904 "pcmpgtw %%mm6, %%mm3 \n"\
905 "pcmpgtw %%mm5, %%mm4 \n"\
906 "movq %%mm4, %%mm6 \n"\
907 "pand %%mm3, %%mm4 \n"\
908 "pandn %%mm3, %%mm6 \n"\
909 "pandn %%mm0, %%mm3 \n"\
910 "movd (%3,%0), %%mm0 \n"\
911 "pand %%mm1, %%mm6 \n"\
912 "pand %%mm4, %%mm2 \n"\
913 "punpcklbw %%mm7, %%mm0 \n"\
915 "paddw %%mm6, %%mm0 \n"\
916 "paddw %%mm2, %%mm3 \n"\
917 "paddw %%mm3, %%mm0 \n"\
918 "pand %%mm5, %%mm0 \n"\
919 "movq %%mm0, %%mm3 \n"\
920 "packuswb %%mm3, %%mm3 \n"\
921 "movd %%mm3, (%1,%0) \n"\
926 :"r"(dst), "r"(top), "r"(src), "r"((x86_reg)bpp), "g"(end),\
933 "psubw %%mm5, %%mm7 \n"\
934 "pmaxsw %%mm7, %%mm5 \n"\
935 "pxor %%mm6, %%mm6 \n"\
936 "pxor %%mm7, %%mm7 \n"\
937 "psubw %%mm3, %%mm6 \n"\
938 "psubw %%mm4, %%mm7 \n"\
939 "pmaxsw %%mm6, %%mm3 \n"\
940 "pmaxsw %%mm7, %%mm4 \n"\
941 "pxor %%mm7, %%mm7 \n"
944 "pabsw %%mm3, %%mm3 \n"\
945 "pabsw %%mm4, %%mm4 \n"\
946 "pabsw %%mm5, %%mm5 \n"
948 PAETH(mmx2, ABS3_MMX2)
950 PAETH(ssse3, ABS3_SSSE3)
/* One output row of the MPEG-4 qpel vertical 6-tap lowpass filter:
 * out = clip((20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5) where x1..x4 are
 * the symmetric tap sums built from the m3..m6 registers and the
 * in0..in7 memory rows. Clobbers mm4/mm5/mm6; OP stores via mm7. */
#define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
    "paddw " #m4 ", " #m3 "             \n\t" /* x1 */\
    "movq "MANGLE(ff_pw_20)", %%mm4     \n\t" /* 20 */\
    "pmullw " #m3 ", %%mm4              \n\t" /* 20x1 */\
    "movq "#in7", " #m3 "               \n\t" /* d */\
    "movq "#in0", %%mm5                 \n\t" /* D */\
    "paddw " #m3 ", %%mm5               \n\t" /* x4 */\
    "psubw %%mm5, %%mm4                 \n\t" /* 20x1 - x4 */\
    "movq "#in1", %%mm5                 \n\t" /* C */\
    "movq "#in2", %%mm6                 \n\t" /* B */\
    "paddw " #m6 ", %%mm5               \n\t" /* x3 */\
    "paddw " #m5 ", %%mm6               \n\t" /* x2 */\
    "paddw %%mm6, %%mm6                 \n\t" /* 2x2 */\
    "psubw %%mm6, %%mm5                 \n\t" /* -2x2 + x3 */\
    "pmullw "MANGLE(ff_pw_3)", %%mm5    \n\t" /* -6x2 + 3x3 */\
    "paddw " #rnd ", %%mm4              \n\t" /* x2 */\
    "paddw %%mm4, %%mm5                 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
    "psraw $5, %%mm5                    \n\t"\
    "packuswb %%mm5, %%mm5              \n\t"\
    OP(%%mm5, out, %%mm7, d)
974 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
975 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
979 "pxor %%mm7, %%mm7 \n\t"\
981 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
982 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
983 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
984 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
985 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
986 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
987 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
988 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
989 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
990 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
991 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
992 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
993 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
994 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
995 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
996 "paddw %%mm3, %%mm5 \n\t" /* b */\
997 "paddw %%mm2, %%mm6 \n\t" /* c */\
998 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
999 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1000 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1001 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1002 "paddw %%mm4, %%mm0 \n\t" /* a */\
1003 "paddw %%mm1, %%mm5 \n\t" /* d */\
1004 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1005 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1006 "paddw %6, %%mm6 \n\t"\
1007 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1008 "psraw $5, %%mm0 \n\t"\
1009 "movq %%mm0, %5 \n\t"\
1010 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1012 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
1013 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
1014 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
1015 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
1016 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
1017 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
1018 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
1019 "paddw %%mm0, %%mm2 \n\t" /* b */\
1020 "paddw %%mm5, %%mm3 \n\t" /* c */\
1021 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1022 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1023 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
1024 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
1025 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
1026 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
1027 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1028 "paddw %%mm2, %%mm1 \n\t" /* a */\
1029 "paddw %%mm6, %%mm4 \n\t" /* d */\
1030 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1031 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
1032 "paddw %6, %%mm1 \n\t"\
1033 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
1034 "psraw $5, %%mm3 \n\t"\
1035 "movq %5, %%mm1 \n\t"\
1036 "packuswb %%mm3, %%mm1 \n\t"\
1037 OP_MMX2(%%mm1, (%1),%%mm4, q)\
1038 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
1040 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
1041 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
1042 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
1043 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
1044 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
1045 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
1046 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
1047 "paddw %%mm1, %%mm5 \n\t" /* b */\
1048 "paddw %%mm4, %%mm0 \n\t" /* c */\
1049 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1050 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
1051 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
1052 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
1053 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
1054 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
1055 "paddw %%mm3, %%mm2 \n\t" /* d */\
1056 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
1057 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
1058 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
1059 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
1060 "paddw %%mm2, %%mm6 \n\t" /* a */\
1061 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
1062 "paddw %6, %%mm0 \n\t"\
1063 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1064 "psraw $5, %%mm0 \n\t"\
1065 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
1067 "paddw %%mm5, %%mm3 \n\t" /* a */\
1068 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
1069 "paddw %%mm4, %%mm6 \n\t" /* b */\
1070 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
1071 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
1072 "paddw %%mm1, %%mm4 \n\t" /* c */\
1073 "paddw %%mm2, %%mm5 \n\t" /* d */\
1074 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
1075 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
1076 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
1077 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
1078 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
1079 "paddw %6, %%mm4 \n\t"\
1080 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
1081 "psraw $5, %%mm4 \n\t"\
1082 "packuswb %%mm4, %%mm0 \n\t"\
1083 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
1089 : "+a"(src), "+c"(dst), "+D"(h)\
1090 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1095 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1098 /* quick HACK, XXX FIXME MUST be optimized */\
1101 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1102 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1103 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1104 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1105 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1106 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1107 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1108 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1109 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1110 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1111 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1112 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1113 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1114 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1115 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1116 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1118 "movq (%0), %%mm0 \n\t"\
1119 "movq 8(%0), %%mm1 \n\t"\
1120 "paddw %2, %%mm0 \n\t"\
1121 "paddw %2, %%mm1 \n\t"\
1122 "psraw $5, %%mm0 \n\t"\
1123 "psraw $5, %%mm1 \n\t"\
1124 "packuswb %%mm1, %%mm0 \n\t"\
1125 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1126 "movq 16(%0), %%mm0 \n\t"\
1127 "movq 24(%0), %%mm1 \n\t"\
1128 "paddw %2, %%mm0 \n\t"\
1129 "paddw %2, %%mm1 \n\t"\
1130 "psraw $5, %%mm0 \n\t"\
1131 "psraw $5, %%mm1 \n\t"\
1132 "packuswb %%mm1, %%mm0 \n\t"\
1133 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1134 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1142 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1144 "pxor %%mm7, %%mm7 \n\t"\
1146 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1147 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1148 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1149 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1150 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1151 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1152 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1153 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1154 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1155 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1156 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1157 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1158 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1159 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1160 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1161 "paddw %%mm3, %%mm5 \n\t" /* b */\
1162 "paddw %%mm2, %%mm6 \n\t" /* c */\
1163 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1164 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1165 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1166 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1167 "paddw %%mm4, %%mm0 \n\t" /* a */\
1168 "paddw %%mm1, %%mm5 \n\t" /* d */\
1169 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1170 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1171 "paddw %5, %%mm6 \n\t"\
1172 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1173 "psraw $5, %%mm0 \n\t"\
1174 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1176 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1177 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1178 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1179 "paddw %%mm5, %%mm1 \n\t" /* a */\
1180 "paddw %%mm6, %%mm2 \n\t" /* b */\
1181 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1182 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1183 "paddw %%mm6, %%mm3 \n\t" /* c */\
1184 "paddw %%mm5, %%mm4 \n\t" /* d */\
1185 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1186 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1187 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1188 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1189 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1190 "paddw %5, %%mm1 \n\t"\
1191 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1192 "psraw $5, %%mm3 \n\t"\
1193 "packuswb %%mm3, %%mm0 \n\t"\
1194 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1200 : "+a"(src), "+c"(dst), "+d"(h)\
1201 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1206 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1209 /* quick HACK, XXX FIXME MUST be optimized */\
1212 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1213 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1214 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1215 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1216 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1217 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1218 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1219 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1221 "movq (%0), %%mm0 \n\t"\
1222 "movq 8(%0), %%mm1 \n\t"\
1223 "paddw %2, %%mm0 \n\t"\
1224 "paddw %2, %%mm1 \n\t"\
1225 "psraw $5, %%mm0 \n\t"\
1226 "psraw $5, %%mm1 \n\t"\
1227 "packuswb %%mm1, %%mm0 \n\t"\
1228 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1229 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1237 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
/* Vertical MPEG-4 qpel (20,-6,3,-1) lowpass over a 16-wide block, two    */\
/* passes: pass 1 unpacks 17 source rows from bytes to words into temp[]  */\
/* (four 17*8-byte column planes), pass 2 runs the 6-tap filter down each */\
/* column via QPEL_V_LOW and stores through OP.  NOTE(review): some asm   */\
/* lines (loop labels, clobbers, closing braces) are elided in this       */\
/* excerpt; comments cover only the visible statements.                   */\
1239 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1240 uint64_t temp[17*4];\
1241 uint64_t *temp_ptr= temp;\
/* pass 1: byte->word unpack of both 8-pixel halves of each source row */\
1246 "pxor %%mm7, %%mm7 \n\t"\
1248 "movq (%0), %%mm0 \n\t"\
1249 "movq (%0), %%mm1 \n\t"\
1250 "movq 8(%0), %%mm2 \n\t"\
1251 "movq 8(%0), %%mm3 \n\t"\
1252 "punpcklbw %%mm7, %%mm0 \n\t"\
1253 "punpckhbw %%mm7, %%mm1 \n\t"\
1254 "punpcklbw %%mm7, %%mm2 \n\t"\
1255 "punpckhbw %%mm7, %%mm3 \n\t"\
1256 "movq %%mm0, (%1) \n\t"\
1257 "movq %%mm1, 17*8(%1) \n\t"\
1258 "movq %%mm2, 2*17*8(%1) \n\t"\
1259 "movq %%mm3, 3*17*8(%1) \n\t"\
1264 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1265 : "r" ((x86_reg)srcStride)\
1272 /*FIXME reorder for speed */\
1274 /*"pxor %%mm7, %%mm7 \n\t"*/\
1276 "movq (%0), %%mm0 \n\t"\
1277 "movq 8(%0), %%mm1 \n\t"\
1278 "movq 16(%0), %%mm2 \n\t"\
1279 "movq 24(%0), %%mm3 \n\t"\
/* pass 2: 6-tap filter down each column; repeated offsets near the */\
/* first/last rows implement the mirrored edge clamping of the taps */\
1280 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1281 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1283 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1285 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1287 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1288 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1290 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1291 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1293 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1294 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1296 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1297 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1299 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1301 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1303 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1304 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1), OP)\
1306 "add $136, %0 \n\t"\
1311 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1312 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
/* 8-wide version of the vertical qpel lowpass: unpack 9 rows to words   */\
/* in temp[] (two 9*8-byte column planes), then filter each column with  */\
/* QPEL_V_LOW; edge taps are mirrored via the repeated offsets.          */\
1317 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1318 uint64_t temp[9*2];\
1319 uint64_t *temp_ptr= temp;\
/* pass 1: byte->word unpack of each source row */\
1324 "pxor %%mm7, %%mm7 \n\t"\
1326 "movq (%0), %%mm0 \n\t"\
1327 "movq (%0), %%mm1 \n\t"\
1328 "punpcklbw %%mm7, %%mm0 \n\t"\
1329 "punpckhbw %%mm7, %%mm1 \n\t"\
1330 "movq %%mm0, (%1) \n\t"\
1331 "movq %%mm1, 9*8(%1) \n\t"\
1336 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1337 : "r" ((x86_reg)srcStride)\
1344 /*FIXME reorder for speed */\
1346 /*"pxor %%mm7, %%mm7 \n\t"*/\
1348 "movq (%0), %%mm0 \n\t"\
1349 "movq 8(%0), %%mm1 \n\t"\
1350 "movq 16(%0), %%mm2 \n\t"\
1351 "movq 24(%0), %%mm3 \n\t"\
/* pass 2: 6-tap filter down each column */\
1352 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1353 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1355 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1357 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1359 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1361 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1363 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1364 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1371 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1372 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
/* qpel8_mcXY dispatchers.  XY encodes the quarter-pel position:          */\
/* X = horizontal (0..3 quarter steps), Y = vertical.  Half-pel planes    */\
/* are built with the h/v lowpass filters into stack temp buffers, then   */\
/* averaged with pixels8_l2 to reach quarter-pel positions.               */\
1377 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1378 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
1381 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1383 uint8_t * const half= (uint8_t*)temp;\
1384 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1385 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1388 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1389 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1392 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1394 uint8_t * const half= (uint8_t*)temp;\
1395 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1396 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
1399 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1401 uint8_t * const half= (uint8_t*)temp;\
1402 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1403 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1406 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1407 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1410 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1412 uint8_t * const half= (uint8_t*)temp;\
1413 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1414 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
/* diagonal positions: halfH is the 8x9 horizontally filtered plane      */\
/* (offset 64 bytes into 'half'), halfHV the fully h+v filtered plane    */\
1416 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1417 uint64_t half[8 + 9];\
1418 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1419 uint8_t * const halfHV= ((uint8_t*)half);\
1420 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1421 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1422 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1423 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1425 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1426 uint64_t half[8 + 9];\
1427 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1428 uint8_t * const halfHV= ((uint8_t*)half);\
1429 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1430 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1431 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1432 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1434 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1435 uint64_t half[8 + 9];\
1436 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1437 uint8_t * const halfHV= ((uint8_t*)half);\
1438 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1439 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1440 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1441 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1443 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1444 uint64_t half[8 + 9];\
1445 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1446 uint8_t * const halfHV= ((uint8_t*)half);\
1447 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1448 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1449 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1450 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1452 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1453 uint64_t half[8 + 9];\
1454 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1455 uint8_t * const halfHV= ((uint8_t*)half);\
1456 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1457 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1458 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1460 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1461 uint64_t half[8 + 9];\
1462 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1463 uint8_t * const halfHV= ((uint8_t*)half);\
1464 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1465 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1466 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1468 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1469 uint64_t half[8 + 9];\
1470 uint8_t * const halfH= ((uint8_t*)half);\
1471 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1472 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1473 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1475 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1476 uint64_t half[8 + 9];\
1477 uint8_t * const halfH= ((uint8_t*)half);\
1478 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1479 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1480 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1482 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1484 uint8_t * const halfH= ((uint8_t*)half);\
1485 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1486 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* qpel16_mcXY dispatchers: 16x16 counterparts of the qpel8 set above.   */\
/* halfH (offset 256 into 'half') is the 16x17 h-filtered plane; halfHV  */\
/* the h+v filtered plane; pixels16_l2 blends planes to quarter-pel.     */\
1488 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1489 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
1492 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1494 uint8_t * const half= (uint8_t*)temp;\
1495 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1496 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1499 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1500 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1503 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1505 uint8_t * const half= (uint8_t*)temp;\
1506 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1507 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
1510 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1512 uint8_t * const half= (uint8_t*)temp;\
1513 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1514 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1517 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1518 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1521 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1523 uint8_t * const half= (uint8_t*)temp;\
1524 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1525 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
1527 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1528 uint64_t half[16*2 + 17*2];\
1529 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1530 uint8_t * const halfHV= ((uint8_t*)half);\
1531 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1532 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1533 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1534 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1536 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1537 uint64_t half[16*2 + 17*2];\
1538 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1539 uint8_t * const halfHV= ((uint8_t*)half);\
1540 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1541 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1542 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1543 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1545 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1546 uint64_t half[16*2 + 17*2];\
1547 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1548 uint8_t * const halfHV= ((uint8_t*)half);\
1549 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1550 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1551 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1552 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1554 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1555 uint64_t half[16*2 + 17*2];\
1556 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1557 uint8_t * const halfHV= ((uint8_t*)half);\
1558 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1559 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1560 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1561 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1563 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1564 uint64_t half[16*2 + 17*2];\
1565 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1566 uint8_t * const halfHV= ((uint8_t*)half);\
1567 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1568 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1569 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1571 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1572 uint64_t half[16*2 + 17*2];\
1573 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1574 uint8_t * const halfHV= ((uint8_t*)half);\
1575 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1576 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1577 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1579 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1580 uint64_t half[17*2];\
1581 uint8_t * const halfH= ((uint8_t*)half);\
1582 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1583 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1584 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1586 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1587 uint64_t half[17*2];\
1588 uint8_t * const halfH= ((uint8_t*)half);\
1589 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1590 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1591 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1593 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1594 uint64_t half[17*2];\
1595 uint8_t * const halfH= ((uint8_t*)half);\
1596 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1597 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store-operator asm text macros plugged into the qpel templates:
 * PUT_OP simply stores register a to memory b; the AVG variants load the
 * existing destination into temp, average it with a (pavgusb on 3DNow!,
 * pavgb on MMX2 -- both round-up byte averages), then store back. */
1600 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
1601 #define AVG_3DNOW_OP(a,b,temp, size) \
1602 "mov" #size " " #b ", " #temp " \n\t"\
1603 "pavgusb " #temp ", " #a " \n\t"\
1604 "mov" #size " " #a ", " #b " \n\t"
1605 #define AVG_MMX2_OP(a,b,temp, size) \
1606 "mov" #size " " #b ", " #temp " \n\t"\
1607 "pavgb " #temp ", " #a " \n\t"\
1608 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the qpel function families: rounding 'put'/'avg' and the
 * no-rounding 'put' (ff_pw_15 rounder) for both mmx2 and 3dnow, which
 * differ only in the averaging store operator selected above. */
1610 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1611 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1612 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1613 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1614 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1615 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1616 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1617 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1618 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1620 /***********************************/
1621 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/* Fast non-compliant "2-tap" qpel: quarter-pel positions are approximated
 * with plain hpel averages (QPEL_2TAP_XY) or a 3-point blend of shifted
 * sources (QPEL_2TAP_L3 with offsets S0 and deltas S1/S2). */
1623 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1624 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1625 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
1627 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1628 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1629 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
1632 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
/* half-pel positions map straight onto the hpel primitives */\
1633 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1634 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1635 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
/* mc21/mc12 alias the pure half-pel functions via function pointers */\
1636 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1637 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1638 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1639 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1640 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1641 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1642 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1643 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1645 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1646 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
/* remaining quarter-pel positions via the 3-point blend */\
1648 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1649 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1650 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1651 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1652 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1653 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1654 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1655 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the fast 2-tap qpel set for put/avg at 16x16 and 8x8,
 * for both mmx2 and 3dnow. */
1657 QPEL_2TAP(put_, 16, mmx2)
1658 QPEL_2TAP(avg_, 16, mmx2)
1659 QPEL_2TAP(put_, 8, mmx2)
1660 QPEL_2TAP(avg_, 8, mmx2)
1661 QPEL_2TAP(put_, 16, 3dnow)
1662 QPEL_2TAP(avg_, 16, 3dnow)
1663 QPEL_2TAP(put_, 8, 3dnow)
1664 QPEL_2TAP(avg_, 8, 3dnow)
1668 static void just_return(void) { return; }
/* Signature of the yasm edge-emulation cores (MMX and SSE variants
 * implemented in external assembly).  NOTE(review): the final parameter
 * of the typedef is elided in this excerpt. */
1672 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1673 x86_reg linesize, x86_reg start_y,
1674 x86_reg end_y, x86_reg block_h,
1675 x86_reg start_x, x86_reg end_x,
1677 extern emu_edge_core_func ff_emu_edge_core_mmx;
1678 extern emu_edge_core_func ff_emu_edge_core_sse;
/* Copy a block_w x block_h region at (src_x,src_y) of a w x h picture
 * into buf, replicating edge pixels where the region lies outside the
 * picture, then hand off to the asm core (core_fn) which does the fill.
 * NOTE(review): the if(...) guards around the src_y/src_x clamping are
 * elided in this excerpt -- the visible assignments are the clamp-to-edge
 * branches for coordinates beyond the bottom/top (resp. right/left). */
1680 static av_always_inline
1681 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1682 int block_w, int block_h,
1683 int src_x, int src_y, int w, int h,
1684 emu_edge_core_func *core_fn)
1686 int start_y, start_x, end_y, end_x, src_y_add=0;
/* block entirely below the picture: read from the last row */
1689 src_y_add = h-1-src_y;
1691 }else if(src_y<=-block_h){
/* block entirely above the picture: read from the first row */
1692 src_y_add = 1-block_h-src_y;
1698 }else if(src_x<=-block_w){
1699 src+= (1-block_w-src_x);
/* portion of the block that overlaps the picture */
1703 start_y= FFMAX(0, -src_y);
1704 start_x= FFMAX(0, -src_x);
1705 end_y= FFMIN(block_h, h-src_y);
1706 end_x= FFMIN(block_w, w-src_x);
1707 assert(start_x < end_x && block_w > 0);
1708 assert(start_y < end_y && block_h > 0);
1710 // fill in the to-be-copied part plus all above/below
1711 src += (src_y_add+start_y)*linesize + start_x;
1713 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
/* Thin wrappers binding emulated_edge_mc() to the MMX asm core. */
1718 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1719 int block_w, int block_h,
1720 int src_x, int src_y, int w, int h)
1722 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1723 w, h, &ff_emu_edge_core_mmx);
/* Same, using the SSE asm core. */
1727 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1728 int block_w, int block_h,
1729 int src_x, int src_y, int w, int h)
1731 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1732 w, h, &ff_emu_edge_core_sse);
1734 #endif /* HAVE_YASM */
/* C-level edge-emulation callback type used by gmc() below. */
1736 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1737 int linesize, int block_w, int block_h,
1738 int src_x, int src_y, int w, int h);
/* MPEG-4 global motion compensation (affine warp + bilinear resample)
 * for one block, MMX.  (ox,oy) is the fixed-point start position,
 * dxx/dxy/dyx/dyy the per-pixel increments, shift the fullpel precision,
 * r the rounder and width/height the source clip rectangle.  Falls back
 * to ff_gmc_c() for non-constant fullpel offsets or >16-bit subpel mvs,
 * and routes reads through emu_edge_fn() when the block leaves the
 * picture.  NOTE(review): braces, the block-width constant 'w', the y/x
 * loop heads and some asm boundaries are elided in this excerpt;
 * comments are limited to the visible statements. */
1740 static av_always_inline
1741 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1742 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1743 emulated_edge_mc_func *emu_edge_fn)
/* fullpel start position */
1746 const int ix = ox>>(16+shift);
1747 const int iy = oy>>(16+shift);
/* 12-bit subpel versions of the start offset and increments */
1748 const int oxs = ox>>4;
1749 const int oys = oy>>4;
1750 const int dxxs = dxx>>4;
1751 const int dxys = dxy>>4;
1752 const int dyxs = dyx>>4;
1753 const int dyys = dyy>>4;
1754 const uint16_t r4[4] = {r,r,r,r};
1755 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1756 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1757 const uint64_t shift2 = 2*shift;
1758 uint8_t edge_buf[(h+1)*stride];
1761 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1762 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1763 const int dxh = dxy*(h-1);
1764 const int dyw = dyx*(w-1);
1765 if( // non-constant fullpel offset (3% of blocks)
1766 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1767 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1768 // uses more than 16 bits of subpel mv (only at huge resolution)
1769 || (dxx|dxy|dyx|dyy)&15 )
1771 //FIXME could still use mmx for some of the rows
1772 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1776 src += ix + iy*stride;
/* redirect reads through the edge-replicated buffer when out of picture */
1777 if( (unsigned)ix >= width-w ||
1778 (unsigned)iy >= height-h )
1780 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* broadcast the bilinear scale constant into all 4 words of mm6 */
1785 "movd %0, %%mm6 \n\t"
1786 "pxor %%mm7, %%mm7 \n\t"
1787 "punpcklwd %%mm6, %%mm6 \n\t"
1788 "punpcklwd %%mm6, %%mm6 \n\t"
1792 for(x=0; x<w; x+=4){
/* subpel x/y coordinates for 4 adjacent output pixels */
1793 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1794 oxs - dxys + dxxs*(x+1),
1795 oxs - dxys + dxxs*(x+2),
1796 oxs - dxys + dxxs*(x+3) };
1797 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1798 oys - dyys + dyxs*(x+1),
1799 oys - dyys + dyxs*(x+2),
1800 oys - dyys + dyxs*(x+3) };
/* advance the coordinates by one row and extract the fractions */
1804 "movq %0, %%mm4 \n\t"
1805 "movq %1, %%mm5 \n\t"
1806 "paddw %2, %%mm4 \n\t"
1807 "paddw %3, %%mm5 \n\t"
1808 "movq %%mm4, %0 \n\t"
1809 "movq %%mm5, %1 \n\t"
1810 "psrlw $12, %%mm4 \n\t"
1811 "psrlw $12, %%mm5 \n\t"
1812 : "+m"(*dx4), "+m"(*dy4)
1813 : "m"(*dxy4), "m"(*dyy4)
/* bilinear weights: (s-dx)(s-dy), dx*dy, (s-dx)*dy, dx*(s-dy) */
1817 "movq %%mm6, %%mm2 \n\t"
1818 "movq %%mm6, %%mm1 \n\t"
1819 "psubw %%mm4, %%mm2 \n\t"
1820 "psubw %%mm5, %%mm1 \n\t"
1821 "movq %%mm2, %%mm0 \n\t"
1822 "movq %%mm4, %%mm3 \n\t"
1823 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1824 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1825 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1826 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
/* weight the four neighbouring source pixels */
1828 "movd %4, %%mm5 \n\t"
1829 "movd %3, %%mm4 \n\t"
1830 "punpcklbw %%mm7, %%mm5 \n\t"
1831 "punpcklbw %%mm7, %%mm4 \n\t"
1832 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1833 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1835 "movd %2, %%mm5 \n\t"
1836 "movd %1, %%mm4 \n\t"
1837 "punpcklbw %%mm7, %%mm5 \n\t"
1838 "punpcklbw %%mm7, %%mm4 \n\t"
1839 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1840 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
1841 "paddw %5, %%mm1 \n\t"
1842 "paddw %%mm3, %%mm2 \n\t"
1843 "paddw %%mm1, %%mm0 \n\t"
1844 "paddw %%mm2, %%mm0 \n\t"
/* add rounder, normalise by 2*shift, pack back to bytes and store */
1846 "psrlw %6, %%mm0 \n\t"
1847 "packuswb %%mm0, %%mm0 \n\t"
1848 "movd %%mm0, %0 \n\t"
1850 : "=m"(dst[x+y*stride])
1851 : "m"(src[0]), "m"(src[1]),
1852 "m"(src[stride]), "m"(src[stride+1]),
1853 "m"(*r4), "m"(shift2)
/* gmc() bound to the yasm MMX edge-emulation core (HAVE_YASM path). */
1863 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1864 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1866 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1867 width, height, &emulated_edge_mc_mmx);
/* gmc() bound to the yasm SSE edge-emulation core. */
1870 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1871 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1873 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1874 width, height, &emulated_edge_mc_sse);
/* Fallback (no yasm): gmc() using the generic C edge emulation. */
1877 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1878 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1880 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1881 width, height, &ff_emulated_edge_mc);
/* Generate a cache-prefetch helper issuing 'op' (prefetcht0 for MMX2,
 * 3DNow! prefetch) on each row of the block.  NOTE(review): the loop over
 * h and the row-advance are elided in this excerpt. */
1885 #define PREFETCH(name, op) \
1886 static void name(void *mem, int stride, int h){\
1887 const uint8_t *p= mem;\
1889 __asm__ volatile(#op" %0" :: "m"(*p));\
1893 PREFETCH(prefetch_mmx2, prefetcht0)
1894 PREFETCH(prefetch_3dnow, prefetch)
1897 #include "h264_qpel_mmx.c"
/*
 * Prototypes for H.264 / RV40 chroma motion-compensation routines
 * implemented in external assembly. The _rnd variants use H.264 rounding;
 * RV40 variants use its own bias. Sizes mc8/mc4/mc2 are the chroma block
 * widths; per-CPU variants (mmx, mmx2, 3dnow, ssse3) are wired up by the
 * init code based on detected CPU flags.
 */
1899 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1900 int stride, int h, int x, int y);
1901 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
1902 int stride, int h, int x, int y);
1903 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1904 int stride, int h, int x, int y);
1905 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
1906 int stride, int h, int x, int y);
1907 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1908 int stride, int h, int x, int y);
1909 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
1910 int stride, int h, int x, int y);
1912 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1913 int stride, int h, int x, int y);
1914 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1915 int stride, int h, int x, int y);
1916 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1917 int stride, int h, int x, int y);
1918 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1919 int stride, int h, int x, int y);
1920 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1921 int stride, int h, int x, int y);
1922 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1923 int stride, int h, int x, int y);
1925 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1926 int stride, int h, int x, int y);
1927 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1928 int stride, int h, int x, int y);
1930 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1931 int stride, int h, int x, int y);
1932 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1933 int stride, int h, int x, int y);
1935 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1936 int stride, int h, int x, int y);
1937 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1938 int stride, int h, int x, int y);
/*
 * CAVS and VC-1 full-pel (mc00) motion compensation: no subpel filtering
 * is needed, so these simply forward to the plain MMX/MMX2 8x8 / 16x16
 * copy and average primitives. The VC-1 `rnd` parameter is unused at the
 * full-pel position.
 * NOTE(review): the closing braces of these wrappers are missing from this
 * extract (embedded line numbers are non-contiguous).
 */
1942 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1943 put_pixels8_mmx(dst, src, stride, 8);
1945 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1946 avg_pixels8_mmx(dst, src, stride, 8);
1948 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1949 put_pixels16_mmx(dst, src, stride, 16);
1951 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1952 avg_pixels16_mmx(dst, src, stride, 16);
1956 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1957 put_pixels8_mmx(dst, src, stride, 8);
1959 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1960 avg_pixels8_mmx2(dst, src, stride, 8);
/*
 * IDCT glue: run the selected inverse transform in-place on `block`, then
 * clamp the result and either store it into (put) or add it onto (add) the
 * destination picture via the shared MMX clamped-pixel helpers. The XXX
 * note below is from the original: these wrappers should disappear once
 * all IDCTs write dest/line_size directly.
 * NOTE(review): opening/closing braces of each wrapper are missing from
 * this extract (embedded line numbers are non-contiguous).
 */
1963 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1966 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1968 ff_mmx_idct (block);
1969 ff_put_pixels_clamped_mmx(block, dest, line_size);
1971 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1973 ff_mmx_idct (block);
1974 ff_add_pixels_clamped_mmx(block, dest, line_size);
1976 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1978 ff_mmxext_idct (block);
1979 ff_put_pixels_clamped_mmx(block, dest, line_size);
1981 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1983 ff_mmxext_idct (block);
1984 ff_add_pixels_clamped_mmx(block, dest, line_size);
1987 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1989 ff_idct_xvid_mmx (block);
1990 ff_put_pixels_clamped_mmx(block, dest, line_size);
1992 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1994 ff_idct_xvid_mmx (block);
1995 ff_add_pixels_clamped_mmx(block, dest, line_size);
1997 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
1999 ff_idct_xvid_mmx2 (block);
2000 ff_put_pixels_clamped_mmx(block, dest, line_size);
2002 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
2004 ff_idct_xvid_mmx2 (block);
2005 ff_add_pixels_clamped_mmx(block, dest, line_size);
/*
 * Vorbis inverse channel coupling, 3DNow! version: converts the
 * (magnitude, angle) pair into the two actual channel spectra, two floats
 * per iteration. mm7 is zeroed once up front as the 0.0 operand for the
 * pfcmpge sign tests; "femms" at the end restores the FPU state after
 * using MMX registers. The per-line comments are from the original.
 * NOTE(review): the inner __asm__ open and the loop's closing lines are
 * missing from this extract; body is incomplete as shown.
 */
2008 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
2011 __asm__ volatile("pxor %%mm7, %%mm7":);
2012 for(i=0; i<blocksize; i+=2) {
2014 "movq %0, %%mm0 \n\t"
2015 "movq %1, %%mm1 \n\t"
2016 "movq %%mm0, %%mm2 \n\t"
2017 "movq %%mm1, %%mm3 \n\t"
2018 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
2019 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
2020 "pslld $31, %%mm2 \n\t" // keep only the sign bit
2021 "pxor %%mm2, %%mm1 \n\t"
2022 "movq %%mm3, %%mm4 \n\t"
2023 "pand %%mm1, %%mm3 \n\t"
2024 "pandn %%mm1, %%mm4 \n\t"
2025 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2026 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2027 "movq %%mm3, %1 \n\t"
2028 "movq %%mm0, %0 \n\t"
2029 :"+m"(mag[i]), "+m"(ang[i])
2033 __asm__ volatile("femms");
/*
 * Vorbis inverse channel coupling, SSE version: same transform as the
 * 3DNow! variant but four floats per iteration. xmm5 is preloaded with
 * ff_pdw_80000000 (per-lane sign-bit mask) and reused across the loop.
 * NOTE(review): the __asm__ open/close lines of the loop body are missing
 * from this extract; body is incomplete as shown.
 */
2035 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
2040 "movaps %0, %%xmm5 \n\t"
2041 ::"m"(ff_pdw_80000000[0])
2043 for(i=0; i<blocksize; i+=4) {
2045 "movaps %0, %%xmm0 \n\t"
2046 "movaps %1, %%xmm1 \n\t"
2047 "xorps %%xmm2, %%xmm2 \n\t"
2048 "xorps %%xmm3, %%xmm3 \n\t"
2049 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
2050 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
2051 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
2052 "xorps %%xmm2, %%xmm1 \n\t"
2053 "movaps %%xmm3, %%xmm4 \n\t"
2054 "andps %%xmm1, %%xmm3 \n\t"
2055 "andnps %%xmm1, %%xmm4 \n\t"
2056 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
2057 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
2058 "movaps %%xmm3, %1 \n\t"
2059 "movaps %%xmm0, %0 \n\t"
2060 :"+m"(mag[i]), "+m"(ang[i])
/*
 * MIX5(mono,stereo): AC-3 5-channel downmix inner loop (SSE). Broadcasts
 * three matrix coefficients (byte offsets 0, 8, 24 of the matrix) into
 * xmm5-xmm7, then multiply-accumulates the five channel planes, which are
 * laid out 0x400 bytes (256 floats) apart. The `mono`/`stereo` macro
 * arguments expand extra instructions for the 1- and 2-output cases;
 * results are written back in place over the first (and second) plane.
 * NOTE(review): some lines of the original macro (loop label/counter) are
 * missing from this extract; do not edit without the complete body.
 */
2069 #define MIX5(mono,stereo)\
2071 "movss 0(%2), %%xmm5 \n"\
2072 "movss 8(%2), %%xmm6 \n"\
2073 "movss 24(%2), %%xmm7 \n"\
2074 "shufps $0, %%xmm5, %%xmm5 \n"\
2075 "shufps $0, %%xmm6, %%xmm6 \n"\
2076 "shufps $0, %%xmm7, %%xmm7 \n"\
2078 "movaps (%0,%1), %%xmm0 \n"\
2079 "movaps 0x400(%0,%1), %%xmm1 \n"\
2080 "movaps 0x800(%0,%1), %%xmm2 \n"\
2081 "movaps 0xc00(%0,%1), %%xmm3 \n"\
2082 "movaps 0x1000(%0,%1), %%xmm4 \n"\
2083 "mulps %%xmm5, %%xmm0 \n"\
2084 "mulps %%xmm6, %%xmm1 \n"\
2085 "mulps %%xmm5, %%xmm2 \n"\
2086 "mulps %%xmm7, %%xmm3 \n"\
2087 "mulps %%xmm7, %%xmm4 \n"\
2088 stereo("addps %%xmm1, %%xmm0 \n")\
2089 "addps %%xmm1, %%xmm2 \n"\
2090 "addps %%xmm3, %%xmm0 \n"\
2091 "addps %%xmm4, %%xmm2 \n"\
2092 mono("addps %%xmm2, %%xmm0 \n")\
2093 "movaps %%xmm0, (%0,%1) \n"\
2094 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2098 :"r"(samples[0]+len), "r"(matrix)\
2099 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2100 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
/*
 * MIX_MISC(stereo): generic AC-3 downmix for arbitrary matrices (SSE).
 * For each group of 4 samples, walks all input channels (planes 1024
 * bytes apart) multiplying by the per-channel broadcast coefficients in
 * matrix_simd, accumulating into one (or, with `stereo`, two) outputs.
 * The channel loop counter %2 is a negative byte offset counted up to 0.
 * NOTE(review): the loop labels/jump lines of the original macro are
 * missing from this extract; do not edit without the complete body.
 */
2104 #define MIX_MISC(stereo)\
2107 "movaps (%3,%0), %%xmm0 \n"\
2108 stereo("movaps %%xmm0, %%xmm1 \n")\
2109 "mulps %%xmm4, %%xmm0 \n"\
2110 stereo("mulps %%xmm5, %%xmm1 \n")\
2111 "lea 1024(%3,%0), %1 \n"\
2114 "movaps (%1), %%xmm2 \n"\
2115 stereo("movaps %%xmm2, %%xmm3 \n")\
2116 "mulps (%4,%2), %%xmm2 \n"\
2117 stereo("mulps 16(%4,%2), %%xmm3 \n")\
2118 "addps %%xmm2, %%xmm0 \n"\
2119 stereo("addps %%xmm3, %%xmm1 \n")\
2123 "movaps %%xmm0, (%3,%0) \n"\
2124 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2127 :"+&r"(i), "=&r"(j), "=&r"(k)\
2128 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
/*
 * AC-3 downmix dispatcher (SSE). Detects the common 5->2 and 5->1 layouts
 * by comparing matrix coefficients bitwise as ints (exact-equality checks
 * on float patterns, so no FP compare quirks) and runs the specialized
 * MIX5 kernel for them; otherwise broadcasts each coefficient into a
 * 16-byte-aligned SIMD table and falls through to the generic MIX_MISC
 * path. `i` is initialized to -len*sizeof(float) so the kernels can count
 * a negative byte offset up to zero.
 * NOTE(review): the MIX5/MIX_MISC invocation lines and loop close are
 * missing from this extract; body is incomplete as shown.
 */
2132 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2134 int (*matrix_cmp)[2] = (int(*)[2])matrix;
2137 i = -len*sizeof(float);
2138 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2140 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2143 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2144 j = 2*in_ch*sizeof(float);
2148 "movss (%2,%0), %%xmm4 \n"
2149 "movss 4(%2,%0), %%xmm5 \n"
2150 "shufps $0, %%xmm4, %%xmm4 \n"
2151 "shufps $0, %%xmm5, %%xmm5 \n"
2152 "movaps %%xmm4, (%1,%0,4) \n"
2153 "movaps %%xmm5, 16(%1,%0,4) \n"
2156 :"r"(matrix_simd), "r"(matrix)
/*
 * dst[i] = src0[i] * src1[i] for all i, 3DNow! version: two mm registers
 * (4 floats) per iteration, iterating from the end of the arrays down via
 * the negative-counting byte index i = (len-4)*4.
 * NOTE(review): the asm loop label/decrement and the closing "femms" are
 * missing from this extract; body is incomplete as shown.
 */
2167 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2168 x86_reg i = (len-4)*4;
2171 "movq (%2,%0), %%mm0 \n\t"
2172 "movq 8(%2,%0), %%mm1 \n\t"
2173 "pfmul (%3,%0), %%mm0 \n\t"
2174 "pfmul 8(%3,%0), %%mm1 \n\t"
2175 "movq %%mm0, (%1,%0) \n\t"
2176 "movq %%mm1, 8(%1,%0) \n\t"
2181 :"r"(dst), "r"(src0), "r"(src1)
/*
 * dst[i] = src0[i] * src1[i] for all i, SSE version: two xmm registers
 * (8 floats) per iteration, 16-byte-aligned loads/stores; byte index i
 * starts at (len-8)*4 and counts down.
 * NOTE(review): the asm loop label/decrement lines are missing from this
 * extract; body is incomplete as shown.
 */
2185 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2186 x86_reg i = (len-8)*4;
2189 "movaps (%2,%0), %%xmm0 \n\t"
2190 "movaps 16(%2,%0), %%xmm1 \n\t"
2191 "mulps (%3,%0), %%xmm0 \n\t"
2192 "mulps 16(%3,%0), %%xmm1 \n\t"
2193 "movaps %%xmm0, (%1,%0) \n\t"
2194 "movaps %%xmm1, 16(%1,%0) \n\t"
2198 :"r"(dst), "r"(src0), "r"(src1)
/*
 * dst[i] = src0[i] * src1[len-1-i], 3DNow!ext version. src1 is walked
 * forward (pointer incremented, pairs reversed with pswapd) while dst/src0
 * are indexed from the end down via byte index i = len*4-16; "femms"
 * restores FPU state afterwards.
 * NOTE(review): the asm loop label and pointer-advance lines are missing
 * from this extract; body is incomplete as shown.
 */
2203 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2204 x86_reg i = len*4-16;
2207 "pswapd 8(%1), %%mm0 \n\t"
2208 "pswapd (%1), %%mm1 \n\t"
2209 "pfmul (%3,%0), %%mm0 \n\t"
2210 "pfmul 8(%3,%0), %%mm1 \n\t"
2211 "movq %%mm0, (%2,%0) \n\t"
2212 "movq %%mm1, 8(%2,%0) \n\t"
2216 :"+r"(i), "+r"(src1)
2217 :"r"(dst), "r"(src0)
2219 __asm__ volatile("femms");
/*
 * dst[i] = src0[i] * src1[len-1-i], SSE version. src1 is walked forward
 * and each 4-float group is reversed with shufps $0x1b (lane order 3,2,1,0)
 * while dst/src0 are indexed from the end down via i = len*4-32 (8 floats
 * per iteration).
 * NOTE(review): the asm loop label and pointer-advance lines are missing
 * from this extract; body is incomplete as shown.
 */
2221 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2222 x86_reg i = len*4-32;
2225 "movaps 16(%1), %%xmm0 \n\t"
2226 "movaps (%1), %%xmm1 \n\t"
2227 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2228 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2229 "mulps (%3,%0), %%xmm0 \n\t"
2230 "mulps 16(%3,%0), %%xmm1 \n\t"
2231 "movaps %%xmm0, (%2,%0) \n\t"
2232 "movaps %%xmm1, 16(%2,%0) \n\t"
2236 :"+r"(i), "+r"(src1)
2237 :"r"(dst), "r"(src0)
/*
 * dst[i] = src0[i] * src1[i] + src2[i], 3DNow! version: fused multiply-add
 * over 4 floats per iteration with a negative-counting byte index; ends
 * with "femms" to restore the FPU state.
 * NOTE(review): the asm loop label/decrement lines are missing from this
 * extract; body is incomplete as shown.
 */
2241 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2242 const float *src2, int len){
2243 x86_reg i = (len-4)*4;
2246 "movq (%2,%0), %%mm0 \n\t"
2247 "movq 8(%2,%0), %%mm1 \n\t"
2248 "pfmul (%3,%0), %%mm0 \n\t"
2249 "pfmul 8(%3,%0), %%mm1 \n\t"
2250 "pfadd (%4,%0), %%mm0 \n\t"
2251 "pfadd 8(%4,%0), %%mm1 \n\t"
2252 "movq %%mm0, (%1,%0) \n\t"
2253 "movq %%mm1, 8(%1,%0) \n\t"
2257 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2260 __asm__ volatile("femms");
/*
 * dst[i] = src0[i] * src1[i] + src2[i], SSE version: 8 floats per
 * iteration, aligned loads/stores, negative-counting byte index starting
 * at (len-8)*4.
 * NOTE(review): the asm loop label/decrement lines are missing from this
 * extract; body is incomplete as shown.
 */
2262 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2263 const float *src2, int len){
2264 x86_reg i = (len-8)*4;
2267 "movaps (%2,%0), %%xmm0 \n\t"
2268 "movaps 16(%2,%0), %%xmm1 \n\t"
2269 "mulps (%3,%0), %%xmm0 \n\t"
2270 "mulps 16(%3,%0), %%xmm1 \n\t"
2271 "addps (%4,%0), %%xmm0 \n\t"
2272 "addps 16(%4,%0), %%xmm1 \n\t"
2273 "movaps %%xmm0, (%1,%0) \n\t"
2274 "movaps %%xmm1, 16(%1,%0) \n\t"
2278 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/*
 * MDCT overlap-add windowing, 3DNow!ext version: combines src0 and a
 * reversed src1 through the symmetric window, writing both halves of dst
 * (forward half via index %0 counting toward 0, mirrored half via %1
 * counting down). pswapd reverses the 2-float groups of the mirrored
 * operands. The per-line comments are from the original.
 * NOTE(review): the asm loop labels, index updates, and closing "femms"
 * are missing from this extract; body is incomplete as shown.
 */
2284 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2285 const float *win, int len){
2287 x86_reg j = len*4-8;
2290 "pswapd (%5,%1), %%mm1 \n"
2291 "movq (%5,%0), %%mm0 \n"
2292 "pswapd (%4,%1), %%mm5 \n"
2293 "movq (%3,%0), %%mm4 \n"
2294 "movq %%mm0, %%mm2 \n"
2295 "movq %%mm1, %%mm3 \n"
2296 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2297 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2298 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2299 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2300 "pfadd %%mm3, %%mm2 \n"
2301 "pfsub %%mm0, %%mm1 \n"
2302 "pswapd %%mm2, %%mm2 \n"
2303 "movq %%mm1, (%2,%0) \n"
2304 "movq %%mm2, (%2,%1) \n"
2310 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/*
 * MDCT overlap-add windowing, SSE version: same computation as the
 * 3DNow!ext variant but 4 floats per group, with shufps $0x1b performing
 * the within-group reversal of the mirrored operands. The per-line
 * comments are from the original.
 * NOTE(review): the asm loop labels and index updates are missing from
 * this extract; body is incomplete as shown.
 */
2314 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2315 const float *win, int len){
2317 x86_reg j = len*4-16;
2320 "movaps (%5,%1), %%xmm1 \n"
2321 "movaps (%5,%0), %%xmm0 \n"
2322 "movaps (%4,%1), %%xmm5 \n"
2323 "movaps (%3,%0), %%xmm4 \n"
2324 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2325 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2326 "movaps %%xmm0, %%xmm2 \n"
2327 "movaps %%xmm1, %%xmm3 \n"
2328 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2329 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2330 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2331 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2332 "addps %%xmm3, %%xmm2 \n"
2333 "subps %%xmm0, %%xmm1 \n"
2334 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2335 "movaps %%xmm1, (%2,%0) \n"
2336 "movaps %%xmm2, (%2,%1) \n"
2341 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2344 #endif /* HAVE_6REGS */
/*
 * Clamp each float of src into [min, max] with maxps/minps, SSE version:
 * min and max are broadcast once into xmm4/xmm5, then 16 floats are
 * processed per iteration with aligned loads/stores, indexing from the end
 * down (i = (len-16)*4). The "3/1 on intel" note is original timing info.
 * NOTE(review): the asm loop label/decrement lines are missing from this
 * extract; body is incomplete as shown.
 */
2346 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2349 x86_reg i = (len-16)*4;
2351 "movss %3, %%xmm4 \n"
2352 "movss %4, %%xmm5 \n"
2353 "shufps $0, %%xmm4, %%xmm4 \n"
2354 "shufps $0, %%xmm5, %%xmm5 \n"
2356 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2357 "movaps 16(%2,%0), %%xmm1 \n\t"
2358 "movaps 32(%2,%0), %%xmm2 \n\t"
2359 "movaps 48(%2,%0), %%xmm3 \n\t"
2360 "maxps %%xmm4, %%xmm0 \n\t"
2361 "maxps %%xmm4, %%xmm1 \n\t"
2362 "maxps %%xmm4, %%xmm2 \n\t"
2363 "maxps %%xmm4, %%xmm3 \n\t"
2364 "minps %%xmm5, %%xmm0 \n\t"
2365 "minps %%xmm5, %%xmm1 \n\t"
2366 "minps %%xmm5, %%xmm2 \n\t"
2367 "minps %%xmm5, %%xmm3 \n\t"
2368 "movaps %%xmm0, (%1,%0) \n\t"
2369 "movaps %%xmm1, 16(%1,%0) \n\t"
2370 "movaps %%xmm2, 32(%1,%0) \n\t"
2371 "movaps %%xmm3, 48(%1,%0) \n\t"
2375 :"r"(dst), "r"(src), "m"(min), "m"(max)
/*
 * Prototypes for VP3 IDCT/loop-filter, int16 scalar products, windowing,
 * HuffYUV prediction and float scalar-product routines implemented in
 * external assembly; dsputil_init_mmx() installs them per CPU capability.
 * The _ba window variants are bit-accurate (selected under
 * CODEC_FLAG_BITEXACT); _atom is an Atom-tuned SSSE3 build.
 */
2380 void ff_vp3_idct_mmx(int16_t *input_data);
2381 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2382 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2384 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2386 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2387 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2389 void ff_vp3_idct_sse2(int16_t *input_data);
2390 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2391 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2393 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2394 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2395 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2396 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2397 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2399 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2400 const int16_t *window, unsigned int len);
2401 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2402 const int16_t *window, unsigned int len);
2403 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2404 const int16_t *window, unsigned int len);
2405 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2406 const int16_t *window, unsigned int len);
2407 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2408 const int16_t *window, unsigned int len);
2409 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2410 const int16_t *window, unsigned int len);
2412 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2413 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2414 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2416 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2418 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2420 int mm_flags = av_get_cpu_flags();
2421 const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
2423 if (avctx->dsp_mask) {
2424 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2425 mm_flags |= (avctx->dsp_mask & 0xffff);
2427 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2431 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2432 if (mm_flags & AV_CPU_FLAG_MMX)
2433 av_log(avctx, AV_LOG_INFO, " mmx");
2434 if (mm_flags & AV_CPU_FLAG_MMX2)
2435 av_log(avctx, AV_LOG_INFO, " mmx2");
2436 if (mm_flags & AV_CPU_FLAG_3DNOW)
2437 av_log(avctx, AV_LOG_INFO, " 3dnow");
2438 if (mm_flags & AV_CPU_FLAG_SSE)
2439 av_log(avctx, AV_LOG_INFO, " sse");
2440 if (mm_flags & AV_CPU_FLAG_SSE2)
2441 av_log(avctx, AV_LOG_INFO, " sse2");
2442 av_log(avctx, AV_LOG_INFO, "\n");
2445 if (mm_flags & AV_CPU_FLAG_MMX) {
2446 const int idct_algo= avctx->idct_algo;
2448 if(avctx->lowres==0){
2449 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2450 c->idct_put= ff_simple_idct_put_mmx;
2451 c->idct_add= ff_simple_idct_add_mmx;
2452 c->idct = ff_simple_idct_mmx;
2453 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2455 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2456 if(mm_flags & AV_CPU_FLAG_MMX2){
2457 c->idct_put= ff_libmpeg2mmx2_idct_put;
2458 c->idct_add= ff_libmpeg2mmx2_idct_add;
2459 c->idct = ff_mmxext_idct;
2461 c->idct_put= ff_libmpeg2mmx_idct_put;
2462 c->idct_add= ff_libmpeg2mmx_idct_add;
2463 c->idct = ff_mmx_idct;
2465 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2467 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2468 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2469 if(mm_flags & AV_CPU_FLAG_SSE2){
2470 c->idct_put= ff_vp3_idct_put_sse2;
2471 c->idct_add= ff_vp3_idct_add_sse2;
2472 c->idct = ff_vp3_idct_sse2;
2473 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2475 c->idct_put= ff_vp3_idct_put_mmx;
2476 c->idct_add= ff_vp3_idct_add_mmx;
2477 c->idct = ff_vp3_idct_mmx;
2478 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2480 }else if(idct_algo==FF_IDCT_CAVS){
2481 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2482 }else if(idct_algo==FF_IDCT_XVIDMMX){
2483 if(mm_flags & AV_CPU_FLAG_SSE2){
2484 c->idct_put= ff_idct_xvid_sse2_put;
2485 c->idct_add= ff_idct_xvid_sse2_add;
2486 c->idct = ff_idct_xvid_sse2;
2487 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2488 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2489 c->idct_put= ff_idct_xvid_mmx2_put;
2490 c->idct_add= ff_idct_xvid_mmx2_add;
2491 c->idct = ff_idct_xvid_mmx2;
2493 c->idct_put= ff_idct_xvid_mmx_put;
2494 c->idct_add= ff_idct_xvid_mmx_add;
2495 c->idct = ff_idct_xvid_mmx;
2500 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2501 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2502 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2503 if (!high_bit_depth) {
2504 c->clear_block = clear_block_mmx;
2505 c->clear_blocks = clear_blocks_mmx;
2506 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2507 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2508 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2509 c->clear_block = clear_block_sse;
2510 c->clear_blocks = clear_blocks_sse;
2514 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2515 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2516 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2517 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2518 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2520 if (!high_bit_depth) {
2521 SET_HPEL_FUNCS(put, 0, 16, mmx);
2522 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2523 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2524 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2525 SET_HPEL_FUNCS(put, 1, 8, mmx);
2526 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2527 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2528 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2531 #if ARCH_X86_32 || !HAVE_YASM
2534 #if ARCH_X86_32 && HAVE_YASM
2535 if (!high_bit_depth)
2536 c->emulated_edge_mc = emulated_edge_mc_mmx;
2539 c->add_bytes= add_bytes_mmx;
2540 c->add_bytes_l2= add_bytes_l2_mmx;
2542 if (!high_bit_depth)
2543 c->draw_edges = draw_edges_mmx;
2545 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2546 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2547 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2551 if (!high_bit_depth) {
2552 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2553 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2556 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2557 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2560 if (mm_flags & AV_CPU_FLAG_MMX2) {
2561 c->prefetch = prefetch_mmx2;
2563 if (!high_bit_depth) {
2564 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2565 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2567 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2568 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2569 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2571 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2572 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2574 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2575 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2576 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2579 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2580 if (!high_bit_depth) {
2581 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2582 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2583 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2584 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2585 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2586 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2589 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2590 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2591 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2594 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2595 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2598 if (CONFIG_VP3_DECODER
2599 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2600 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2601 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2604 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2605 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
2606 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
2607 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
2608 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
2609 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
2610 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
2611 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
2612 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
2613 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
2614 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
2615 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
2616 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
2617 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
2618 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
2619 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
2620 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
2622 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
2623 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
2624 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
2625 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
2626 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
2627 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
2629 if (!high_bit_depth) {
2630 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
2631 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
2632 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
2633 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
2634 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
2635 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
2638 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
2639 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
2640 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
2641 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2644 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2645 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2647 if (!high_bit_depth) {
2648 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2649 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2650 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2651 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2654 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2656 #if HAVE_7REGS && HAVE_TEN_OPERANDS
2657 if( mm_flags&AV_CPU_FLAG_3DNOW )
2658 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2661 c->add_png_paeth_prediction= add_png_paeth_prediction_mmx2;
2662 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2663 c->prefetch = prefetch_3dnow;
2665 if (!high_bit_depth) {
2666 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2667 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2669 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2670 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2671 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2673 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2674 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2676 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2677 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2678 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2680 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2681 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2682 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2683 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2684 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2685 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2686 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2690 if (CONFIG_VP3_DECODER
2691 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2692 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2693 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2696 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
2697 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
2698 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
2699 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
2700 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
2701 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
2703 if (!high_bit_depth) {
2704 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
2705 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
2706 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
2707 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
2708 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
2709 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
2712 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
2713 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
2714 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
2715 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
2718 if (!high_bit_depth) {
2719 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2720 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2723 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2724 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2729 #define H264_QPEL_FUNCS(x, y, CPU)\
2730 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2731 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2732 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2733 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2734 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2735 // these functions are slower than mmx on AMD, but faster on Intel
2736 if (!high_bit_depth) {
2737 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2738 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2739 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2740 H264_QPEL_FUNCS(0, 0, sse2);
2743 if(mm_flags & AV_CPU_FLAG_SSE2){
2744 if (!high_bit_depth) {
2745 H264_QPEL_FUNCS(0, 1, sse2);
2746 H264_QPEL_FUNCS(0, 2, sse2);
2747 H264_QPEL_FUNCS(0, 3, sse2);
2748 H264_QPEL_FUNCS(1, 1, sse2);
2749 H264_QPEL_FUNCS(1, 2, sse2);
2750 H264_QPEL_FUNCS(1, 3, sse2);
2751 H264_QPEL_FUNCS(2, 1, sse2);
2752 H264_QPEL_FUNCS(2, 2, sse2);
2753 H264_QPEL_FUNCS(2, 3, sse2);
2754 H264_QPEL_FUNCS(3, 1, sse2);
2755 H264_QPEL_FUNCS(3, 2, sse2);
2756 H264_QPEL_FUNCS(3, 3, sse2);
2760 if(mm_flags & AV_CPU_FLAG_SSSE3){
2761 if (!high_bit_depth) {
2762 H264_QPEL_FUNCS(1, 0, ssse3);
2763 H264_QPEL_FUNCS(1, 1, ssse3);
2764 H264_QPEL_FUNCS(1, 2, ssse3);
2765 H264_QPEL_FUNCS(1, 3, ssse3);
2766 H264_QPEL_FUNCS(2, 0, ssse3);
2767 H264_QPEL_FUNCS(2, 1, ssse3);
2768 H264_QPEL_FUNCS(2, 2, ssse3);
2769 H264_QPEL_FUNCS(2, 3, ssse3);
2770 H264_QPEL_FUNCS(3, 0, ssse3);
2771 H264_QPEL_FUNCS(3, 1, ssse3);
2772 H264_QPEL_FUNCS(3, 2, ssse3);
2773 H264_QPEL_FUNCS(3, 3, ssse3);
2775 c->add_png_paeth_prediction= add_png_paeth_prediction_ssse3;
2777 if (!high_bit_depth) {
2778 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2779 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2780 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2781 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2783 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2784 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2785 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2790 if(mm_flags & AV_CPU_FLAG_3DNOW){
2791 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2792 c->vector_fmul = vector_fmul_3dnow;
2794 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2795 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2797 c->vector_fmul_window = vector_fmul_window_3dnow2;
2800 if(mm_flags & AV_CPU_FLAG_MMX2){
2802 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2803 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2804 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2805 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2807 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2811 if(mm_flags & AV_CPU_FLAG_SSE){
2812 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2813 c->ac3_downmix = ac3_downmix_sse;
2814 c->vector_fmul = vector_fmul_sse;
2815 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2816 c->vector_fmul_add = vector_fmul_add_sse;
2818 c->vector_fmul_window = vector_fmul_window_sse;
2820 c->vector_clipf = vector_clipf_sse;
2822 c->scalarproduct_float = ff_scalarproduct_float_sse;
2825 if(mm_flags & AV_CPU_FLAG_3DNOW)
2826 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
2827 if(mm_flags & AV_CPU_FLAG_SSE2){
2829 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2830 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2831 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2832 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2834 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2835 c->apply_window_int16 = ff_apply_window_int16_sse2;
2839 if (!high_bit_depth)
2840 c->emulated_edge_mc = emulated_edge_mc_sse;
2844 if (mm_flags & AV_CPU_FLAG_SSSE3) {
2846 if (mm_flags & AV_CPU_FLAG_ATOM) {
2847 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2849 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2851 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2852 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2858 if (CONFIG_ENCODERS)
2859 dsputilenc_init_mmx(c, avctx);
2862 // for speed testing
2863 get_pixels = just_return;
2864 put_pixels_clamped = just_return;
2865 add_pixels_clamped = just_return;
2867 pix_abs16x16 = just_return;
2868 pix_abs16x16_x2 = just_return;
2869 pix_abs16x16_y2 = just_return;
2870 pix_abs16x16_xy2 = just_return;
2872 put_pixels_tab[0] = just_return;
2873 put_pixels_tab[1] = just_return;
2874 put_pixels_tab[2] = just_return;
2875 put_pixels_tab[3] = just_return;
2877 put_no_rnd_pixels_tab[0] = just_return;
2878 put_no_rnd_pixels_tab[1] = just_return;
2879 put_no_rnd_pixels_tab[2] = just_return;
2880 put_no_rnd_pixels_tab[3] = just_return;
2882 avg_pixels_tab[0] = just_return;
2883 avg_pixels_tab[1] = just_return;
2884 avg_pixels_tab[2] = just_return;
2885 avg_pixels_tab[3] = just_return;
2887 avg_no_rnd_pixels_tab[0] = just_return;
2888 avg_no_rnd_pixels_tab[1] = just_return;
2889 avg_no_rnd_pixels_tab[2] = just_return;
2890 avg_no_rnd_pixels_tab[3] = just_return;
2892 //av_fdct = just_return;
2893 //ff_idct = just_return;