2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
38 /* pixel operations */
/* Packed-arithmetic constants used by the inline asm below.
 * Naming: ff_pw_N  = eight (xmm) / four (mmx) packed 16-bit words of value N,
 *         ff_pb_XX = packed 8-bit bytes of value 0xXX,
 *         ff_pd_N  = packed doubles of value N.
 * 8-byte-aligned uint64_t constants feed MMX code; 16-byte-aligned xmm_reg
 * pairs feed SSE/SSE2 code. */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 {0x8000000080000000ULL, 0x8000000080000000ULL};
/* packed 16-bit word constants */
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2 ) = {0x0002000200020002ULL, 0x0002000200020002ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
51 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
52 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
55 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
56 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
59 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
61 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
63 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
66 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
/* packed 8-bit byte constants */
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
71 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
74 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
75 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
76 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
78 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
79 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
80 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
/* packed double-precision constants for float DSP paths */
82 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
83 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Small helper macros that load per-register constants into MMX registers.
 * JUMPALIGN aligns the following loop head; MOVQ_ZERO clears a register. */
85 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
86 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
/* MOVQ_BFE: synthesize 0xFEFE...FE (all-ones doubled) without a memory load. */
88 #define MOVQ_BFE(regd) \
90 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
91 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Memory-load variants of MOVQ_BONE / MOVQ_WTWO (load from ff_bone / ff_wtwo). */
94 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
95 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
97 // for shared library it's better to use this way for accessing constants
/* NOTE(review): MOVQ_BONE/MOVQ_WTWO are defined twice here; the original file
 * presumably selects one variant with a PIC-related #if that is not visible in
 * this dump — confirm against upstream before editing. These PIC-safe variants
 * synthesize the constants in-register instead of referencing memory. */
99 #define MOVQ_BONE(regd) \
101 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
102 "psrlw $15, %%" #regd " \n\t" \
103 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
105 #define MOVQ_WTWO(regd) \
107 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
108 "psrlw $15, %%" #regd " \n\t" \
109 "psllw $1, %%" #regd " \n\t"::)
113 // using regr as temporary and for the output result
114 // first argument is unmodifed and second is trashed
115 // regfe is supposed to contain 0xfefefefefefefefe
/* PAVGB_MMX_NO_RND: byte-wise average rounding down:
 * (a & b) + (((a ^ b) & 0xfe) >> 1). MMX has no pavgb, so it is emulated. */
116 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
117 "movq " #rega ", " #regr " \n\t"\
118 "pand " #regb ", " #regr " \n\t"\
119 "pxor " #rega ", " #regb " \n\t"\
120 "pand " #regfe "," #regb " \n\t"\
121 "psrlq $1, " #regb " \n\t"\
122 "paddb " #regb ", " #regr " \n\t"
/* PAVGB_MMX: byte-wise average rounding up:
 * (a | b) - (((a ^ b) & 0xfe) >> 1). */
124 #define PAVGB_MMX(rega, regb, regr, regfe) \
125 "movq " #rega ", " #regr " \n\t"\
126 "por " #regb ", " #regr " \n\t"\
127 "pxor " #rega ", " #regb " \n\t"\
128 "pand " #regfe "," #regb " \n\t"\
129 "psrlq $1, " #regb " \n\t"\
130 "psubb " #regb ", " #regr " \n\t"
132 // mm6 is supposed to contain 0xfefefefefefefefe
/* PAVGBP_MMX_NO_RND: two independent no-round byte averages in one sequence
 * (a,b -> r and c,d -> p); interleaved for pairing on old CPUs. */
133 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
134 "movq " #rega ", " #regr " \n\t"\
135 "movq " #regc ", " #regp " \n\t"\
136 "pand " #regb ", " #regr " \n\t"\
137 "pand " #regd ", " #regp " \n\t"\
138 "pxor " #rega ", " #regb " \n\t"\
139 "pxor " #regc ", " #regd " \n\t"\
140 "pand %%mm6, " #regb " \n\t"\
141 "pand %%mm6, " #regd " \n\t"\
142 "psrlq $1, " #regb " \n\t"\
143 "psrlq $1, " #regd " \n\t"\
144 "paddb " #regb ", " #regr " \n\t"\
145 "paddb " #regd ", " #regp " \n\t"
/* PAVGBP_MMX: two independent rounding-up byte averages in one sequence. */
147 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
148 "movq " #rega ", " #regr " \n\t"\
149 "movq " #regc ", " #regp " \n\t"\
150 "por " #regb ", " #regr " \n\t"\
151 "por " #regd ", " #regp " \n\t"\
152 "pxor " #rega ", " #regb " \n\t"\
153 "pxor " #regc ", " #regd " \n\t"\
154 "pand %%mm6, " #regb " \n\t"\
155 "pand %%mm6, " #regd " \n\t"\
156 "psrlq $1, " #regd " \n\t"\
157 "psrlq $1, " #regb " \n\t"\
158 "psubb " #regb ", " #regr " \n\t"\
159 "psubb " #regd ", " #regp " \n\t"
161 /***********************************/
162 /* MMX no rounding */
/* Instantiate the put/avg pixel templates for each flavor by redefining the
 * DEF / SET_RND / PAVGB(P) hooks before each template #include. NOTE(review):
 * the matching #undef lines are not visible in this dump — presumed present
 * in the original between sections. */
163 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
164 #define SET_RND MOVQ_WONE
165 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
166 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
167 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
169 #include "dsputil_mmx_rnd_template.c"
175 /***********************************/
/* MMX rounding variants */
178 #define DEF(x, y) x ## _ ## y ##_mmx
179 #define SET_RND MOVQ_WTWO
180 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
181 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
183 #include "dsputil_mmx_rnd_template.c"
191 /***********************************/
/* 3DNow! variants use the native pavgusb instruction */
194 #define DEF(x) x ## _3dnow
195 #define PAVGB "pavgusb"
198 #include "dsputil_mmx_avg_template.c"
204 /***********************************/
/* MMX2 variants use the native pavgb instruction */
207 #define DEF(x) x ## _mmx2
209 /* Introduced only in MMX2 set */
210 #define PAVGB "pavgb"
213 #include "dsputil_mmx_avg_template.c"
/* put is identical across flavors and to the no-rounding case, so alias
 * instead of duplicating code. */
219 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
220 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
221 #define put_pixels16_mmx2 put_pixels16_mmx
222 #define put_pixels8_mmx2 put_pixels8_mmx
223 #define put_pixels4_mmx2 put_pixels4_mmx
224 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
225 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
226 #define put_pixels16_3dnow put_pixels16_mmx
227 #define put_pixels8_3dnow put_pixels8_mmx
228 #define put_pixels4_3dnow put_pixels4_mmx
229 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
230 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
232 /***********************************/
/* Store an 8x8 block of DCT coefficients as clamped (unsigned-saturated)
 * 8-bit pixels. packuswb performs the [0,255] clamping. Processes 4 rows per
 * asm statement using line_size and 3*line_size as offsets. NOTE(review):
 * this dump is missing the statement lines between the two asm bodies
 * (pointer advance for the second half) — do not edit without the full file. */
235 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
240 /* read the pixels */
245 "movq %3, %%mm0 \n\t"
246 "movq 8%3, %%mm1 \n\t"
247 "movq 16%3, %%mm2 \n\t"
248 "movq 24%3, %%mm3 \n\t"
249 "movq 32%3, %%mm4 \n\t"
250 "movq 40%3, %%mm5 \n\t"
251 "movq 48%3, %%mm6 \n\t"
252 "movq 56%3, %%mm7 \n\t"
253 "packuswb %%mm1, %%mm0 \n\t"
254 "packuswb %%mm3, %%mm2 \n\t"
255 "packuswb %%mm5, %%mm4 \n\t"
256 "packuswb %%mm7, %%mm6 \n\t"
257 "movq %%mm0, (%0) \n\t"
258 "movq %%mm2, (%0, %1) \n\t"
259 "movq %%mm4, (%0, %1, 2) \n\t"
260 "movq %%mm6, (%0, %2) \n\t"
261 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
266 // if here would be an exact copy of the code above
267 // compiler would generate some very strange code
/* Second half: same operation, but with the source passed as a register
 * operand ("r"(p)) instead of a memory operand, to avoid the pathological
 * code generation mentioned above. */
270 "movq (%3), %%mm0 \n\t"
271 "movq 8(%3), %%mm1 \n\t"
272 "movq 16(%3), %%mm2 \n\t"
273 "movq 24(%3), %%mm3 \n\t"
274 "movq 32(%3), %%mm4 \n\t"
275 "movq 40(%3), %%mm5 \n\t"
276 "movq 48(%3), %%mm6 \n\t"
277 "movq 56(%3), %%mm7 \n\t"
278 "packuswb %%mm1, %%mm0 \n\t"
279 "packuswb %%mm3, %%mm2 \n\t"
280 "packuswb %%mm5, %%mm4 \n\t"
281 "packuswb %%mm7, %%mm6 \n\t"
282 "movq %%mm0, (%0) \n\t"
283 "movq %%mm2, (%0, %1) \n\t"
284 "movq %%mm4, (%0, %1, 2) \n\t"
285 "movq %%mm6, (%0, %2) \n\t"
286 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/* Emit asm for one half (4 rows / 64 bytes) of signed-clamped pixel store:
 * packs 16-bit coefficients with signed saturation (packsswb), then biases by
 * +128 via mm0 (preloaded with ff_pb_80) to map [-128,127] -> [0,255].
 * Operands: %0 = dst pixels, %1 = 3*line_skip, %2 = block, %3 = line_skip. */
290 #define put_signed_pixels_clamped_mmx_half(off) \
291 "movq "#off"(%2), %%mm1 \n\t"\
292 "movq 16+"#off"(%2), %%mm2 \n\t"\
293 "movq 32+"#off"(%2), %%mm3 \n\t"\
294 "movq 48+"#off"(%2), %%mm4 \n\t"\
295 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
296 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
297 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
298 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
299 "paddb %%mm0, %%mm1 \n\t"\
300 "paddb %%mm0, %%mm2 \n\t"\
301 "paddb %%mm0, %%mm3 \n\t"\
302 "paddb %%mm0, %%mm4 \n\t"\
303 "movq %%mm1, (%0) \n\t"\
304 "movq %%mm2, (%0, %3) \n\t"\
305 "movq %%mm3, (%0, %3, 2) \n\t"\
306 "movq %%mm4, (%0, %1) \n\t"
/* Store an 8x8 block of signed DCT coefficients as 8-bit pixels with signed
 * saturation plus a +128 bias (see put_signed_pixels_clamped_mmx_half).
 * NOTE(review): line_skip3 (output %1) is declared on a line missing from
 * this dump — confirm against the full file. */
308 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
310 x86_reg line_skip = line_size;
314 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
315 "lea (%3, %3, 2), %1 \n\t"
316 put_signed_pixels_clamped_mmx_half(0)
317 "lea (%0, %3, 4), %0 \n\t"
318 put_signed_pixels_clamped_mmx_half(64)
319 :"+&r" (pixels), "=&r" (line_skip3)
320 :"r" (block), "r"(line_skip)
/* Add an 8x8 block of DCT coefficients to existing pixels with unsigned
 * saturation: pixels are widened to 16 bit (punpck with mm7, presumably
 * zeroed on a line missing from this dump), summed with paddsw, and
 * re-packed with packuswb. The visible asm covers two rows; the enclosing
 * loop/pointer advance lines are not visible here. */
324 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
330 /* read the pixels */
337 "movq (%2), %%mm0 \n\t"
338 "movq 8(%2), %%mm1 \n\t"
339 "movq 16(%2), %%mm2 \n\t"
340 "movq 24(%2), %%mm3 \n\t"
341 "movq %0, %%mm4 \n\t"
342 "movq %1, %%mm6 \n\t"
343 "movq %%mm4, %%mm5 \n\t"
344 "punpcklbw %%mm7, %%mm4 \n\t"
345 "punpckhbw %%mm7, %%mm5 \n\t"
346 "paddsw %%mm4, %%mm0 \n\t"
347 "paddsw %%mm5, %%mm1 \n\t"
348 "movq %%mm6, %%mm5 \n\t"
349 "punpcklbw %%mm7, %%mm6 \n\t"
350 "punpckhbw %%mm7, %%mm5 \n\t"
351 "paddsw %%mm6, %%mm2 \n\t"
352 "paddsw %%mm5, %%mm3 \n\t"
353 "packuswb %%mm1, %%mm0 \n\t"
354 "packuswb %%mm3, %%mm2 \n\t"
355 "movq %%mm0, %0 \n\t"
356 "movq %%mm2, %1 \n\t"
357 :"+m"(*pix), "+m"(*(pix+line_size))
/* Copy a 4xh block: 4 rows per asm iteration using movd, advancing src and
 * dst by 2*line_size (REG_a) twice. The loop-control asm lines (label,
 * "subl $4, %0", jump) are not visible in this dump. */
365 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
368 "lea (%3, %3), %%"REG_a" \n\t"
371 "movd (%1), %%mm0 \n\t"
372 "movd (%1, %3), %%mm1 \n\t"
373 "movd %%mm0, (%2) \n\t"
374 "movd %%mm1, (%2, %3) \n\t"
375 "add %%"REG_a", %1 \n\t"
376 "add %%"REG_a", %2 \n\t"
377 "movd (%1), %%mm0 \n\t"
378 "movd (%1, %3), %%mm1 \n\t"
379 "movd %%mm0, (%2) \n\t"
380 "movd %%mm1, (%2, %3) \n\t"
381 "add %%"REG_a", %1 \n\t"
382 "add %%"REG_a", %2 \n\t"
385 : "+g"(h), "+r" (pixels), "+r" (block)
386 : "r"((x86_reg)line_size)
/* Copy an 8xh block: same structure as put_pixels4_mmx but with movq
 * (8 bytes per row). 4 rows handled per visible asm iteration. */
391 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
394 "lea (%3, %3), %%"REG_a" \n\t"
397 "movq (%1), %%mm0 \n\t"
398 "movq (%1, %3), %%mm1 \n\t"
399 "movq %%mm0, (%2) \n\t"
400 "movq %%mm1, (%2, %3) \n\t"
401 "add %%"REG_a", %1 \n\t"
402 "add %%"REG_a", %2 \n\t"
403 "movq (%1), %%mm0 \n\t"
404 "movq (%1, %3), %%mm1 \n\t"
405 "movq %%mm0, (%2) \n\t"
406 "movq %%mm1, (%2, %3) \n\t"
407 "add %%"REG_a", %1 \n\t"
408 "add %%"REG_a", %2 \n\t"
411 : "+g"(h), "+r" (pixels), "+r" (block)
412 : "r"((x86_reg)line_size)
/* Copy a 16xh block: two movq per row (offsets 0 and 8), 4 rows per visible
 * asm iteration, advancing by 2*line_size (REG_a) twice. */
417 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
420 "lea (%3, %3), %%"REG_a" \n\t"
423 "movq (%1), %%mm0 \n\t"
424 "movq 8(%1), %%mm4 \n\t"
425 "movq (%1, %3), %%mm1 \n\t"
426 "movq 8(%1, %3), %%mm5 \n\t"
427 "movq %%mm0, (%2) \n\t"
428 "movq %%mm4, 8(%2) \n\t"
429 "movq %%mm1, (%2, %3) \n\t"
430 "movq %%mm5, 8(%2, %3) \n\t"
431 "add %%"REG_a", %1 \n\t"
432 "add %%"REG_a", %2 \n\t"
433 "movq (%1), %%mm0 \n\t"
434 "movq 8(%1), %%mm4 \n\t"
435 "movq (%1, %3), %%mm1 \n\t"
436 "movq 8(%1, %3), %%mm5 \n\t"
437 "movq %%mm0, (%2) \n\t"
438 "movq %%mm4, 8(%2) \n\t"
439 "movq %%mm1, (%2, %3) \n\t"
440 "movq %%mm5, 8(%2, %3) \n\t"
441 "add %%"REG_a", %1 \n\t"
442 "add %%"REG_a", %2 \n\t"
445 : "+g"(h), "+r" (pixels), "+r" (block)
446 : "r"((x86_reg)line_size)
/* SSE2 16xh copy: 4 rows per iteration, 16 bytes per row. Loads are
 * unaligned (movdqu) since source may be unaligned; stores are aligned
 * (movdqa) — destination blocks are 16-byte aligned by the callers'
 * convention in this file. %4 = 3*line_size. */
451 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
455 "movdqu (%1), %%xmm0 \n\t"
456 "movdqu (%1,%3), %%xmm1 \n\t"
457 "movdqu (%1,%3,2), %%xmm2 \n\t"
458 "movdqu (%1,%4), %%xmm3 \n\t"
459 "movdqa %%xmm0, (%2) \n\t"
460 "movdqa %%xmm1, (%2,%3) \n\t"
461 "movdqa %%xmm2, (%2,%3,2) \n\t"
462 "movdqa %%xmm3, (%2,%4) \n\t"
464 "lea (%1,%3,4), %1 \n\t"
465 "lea (%2,%3,4), %2 \n\t"
467 : "+g"(h), "+r" (pixels), "+r" (block)
468 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* SSE2 16xh average: like put_pixels16_sse2 but averages the loaded rows
 * with the destination (pavgb, rounding up) before storing back. */
473 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
477 "movdqu (%1), %%xmm0 \n\t"
478 "movdqu (%1,%3), %%xmm1 \n\t"
479 "movdqu (%1,%3,2), %%xmm2 \n\t"
480 "movdqu (%1,%4), %%xmm3 \n\t"
481 "pavgb (%2), %%xmm0 \n\t"
482 "pavgb (%2,%3), %%xmm1 \n\t"
483 "pavgb (%2,%3,2), %%xmm2 \n\t"
484 "pavgb (%2,%4), %%xmm3 \n\t"
485 "movdqa %%xmm0, (%2) \n\t"
486 "movdqa %%xmm1, (%2,%3) \n\t"
487 "movdqa %%xmm2, (%2,%3,2) \n\t"
488 "movdqa %%xmm3, (%2,%4) \n\t"
490 "lea (%1,%3,4), %1 \n\t"
491 "lea (%2,%3,4), %2 \n\t"
493 : "+g"(h), "+r" (pixels), "+r" (block)
494 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* Generate a function that zeroes n 64-coefficient DCT blocks (128 bytes
 * each) with MMX stores, 32 bytes per iteration. Instantiated below for
 * 6 blocks (clear_blocks_mmx) and 1 block (clear_block_mmx). NOTE(review):
 * the macro tail (loop branch, closing of the asm and function) is on lines
 * missing from this dump — no comments can be inserted mid-macro. */
499 #define CLEAR_BLOCKS(name,n) \
500 static void name(DCTELEM *blocks)\
503 "pxor %%mm7, %%mm7 \n\t"\
504 "mov %1, %%"REG_a" \n\t"\
506 "movq %%mm7, (%0, %%"REG_a") \n\t"\
507 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
508 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
509 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
510 "add $32, %%"REG_a" \n\t"\
512 : : "r" (((uint8_t *)blocks)+128*n),\
517 CLEAR_BLOCKS(clear_blocks_mmx, 6)
518 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 128-byte DCT block with eight aligned 16-byte SSE stores.
 * DCT blocks are 16-byte aligned, so movaps is safe. */
520 static void clear_block_sse(DCTELEM *block)
523 "xorps %%xmm0, %%xmm0 \n"
524 "movaps %%xmm0, (%0) \n"
525 "movaps %%xmm0, 16(%0) \n"
526 "movaps %%xmm0, 32(%0) \n"
527 "movaps %%xmm0, 48(%0) \n"
528 "movaps %%xmm0, 64(%0) \n"
529 "movaps %%xmm0, 80(%0) \n"
530 "movaps %%xmm0, 96(%0) \n"
531 "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive 128-byte DCT blocks (768 bytes total), 128 bytes per
 * loop iteration with aligned SSE stores. Uses a negative index (REG_a)
 * counting up toward zero; the loop-branch asm lines are not visible in
 * this dump. */
537 static void clear_blocks_sse(DCTELEM *blocks)
540 "xorps %%xmm0, %%xmm0 \n"
541 "mov %1, %%"REG_a" \n"
543 "movaps %%xmm0, (%0, %%"REG_a") \n"
544 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
545 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
546 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
550 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
551 "add $128, %%"REG_a" \n"
553 : : "r" (((uint8_t *)blocks)+128*6),
/* dst[i] += src[i] for w bytes: MMX handles 16 bytes per iteration (modulo
 * wraparound per byte via paddb); the visible scalar tail (dst[i+0] +=
 * src[i+0]) handles the remainder. NOTE(review): loop setup/branch lines and
 * the tail loop header are missing from this dump. */
559 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
564 "movq (%1, %0), %%mm0 \n\t"
565 "movq (%2, %0), %%mm1 \n\t"
566 "paddb %%mm0, %%mm1 \n\t"
567 "movq %%mm1, (%2, %0) \n\t"
568 "movq 8(%1, %0), %%mm0 \n\t"
569 "movq 8(%2, %0), %%mm1 \n\t"
570 "paddb %%mm0, %%mm1 \n\t"
571 "movq %%mm1, 8(%2, %0) \n\t"
577 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
580 dst[i+0] += src[i+0];
/* HuffYUV median prediction using cmov: dst[i] = diff[i] +
 * median(left, top, left + top - topleft), carrying left/topleft state in
 * *left / *left_top. Only fragments of the asm are visible in this dump;
 * the cmov-based median computation itself is on missing lines — treat this
 * block as read-only without the full file. */
584 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
587 int l = *left & 0xff;
588 int tl = *left_top & 0xff;
593 "movzbl (%3,%4), %2 \n"
606 "add (%6,%4), %b0 \n"
607 "mov %b0, (%5,%4) \n"
610 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
611 :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/* H.263 deblocking filter core (per Annex J) shared by the vertical and
 * horizontal filters below. Operands: %0..%3 are the four 8-pixel rows/cols
 * around the block edge, %4 = 2*strength, %5 = ff_pb_FC mask. On exit the
 * filtered values are left in mm3 (row1), mm4 (row2), mm5 (row0), mm6 (row3)
 * for the caller to store. Comments cannot be inserted inside the macro body
 * (backslash continuations); see ff_h263_loop_filter for the reference C
 * implementation of this arithmetic. */
618 #define H263_LOOP_FILTER \
619 "pxor %%mm7, %%mm7 \n\t"\
620 "movq %0, %%mm0 \n\t"\
621 "movq %0, %%mm1 \n\t"\
622 "movq %3, %%mm2 \n\t"\
623 "movq %3, %%mm3 \n\t"\
624 "punpcklbw %%mm7, %%mm0 \n\t"\
625 "punpckhbw %%mm7, %%mm1 \n\t"\
626 "punpcklbw %%mm7, %%mm2 \n\t"\
627 "punpckhbw %%mm7, %%mm3 \n\t"\
628 "psubw %%mm2, %%mm0 \n\t"\
629 "psubw %%mm3, %%mm1 \n\t"\
630 "movq %1, %%mm2 \n\t"\
631 "movq %1, %%mm3 \n\t"\
632 "movq %2, %%mm4 \n\t"\
633 "movq %2, %%mm5 \n\t"\
634 "punpcklbw %%mm7, %%mm2 \n\t"\
635 "punpckhbw %%mm7, %%mm3 \n\t"\
636 "punpcklbw %%mm7, %%mm4 \n\t"\
637 "punpckhbw %%mm7, %%mm5 \n\t"\
638 "psubw %%mm2, %%mm4 \n\t"\
639 "psubw %%mm3, %%mm5 \n\t"\
640 "psllw $2, %%mm4 \n\t"\
641 "psllw $2, %%mm5 \n\t"\
642 "paddw %%mm0, %%mm4 \n\t"\
643 "paddw %%mm1, %%mm5 \n\t"\
644 "pxor %%mm6, %%mm6 \n\t"\
645 "pcmpgtw %%mm4, %%mm6 \n\t"\
646 "pcmpgtw %%mm5, %%mm7 \n\t"\
647 "pxor %%mm6, %%mm4 \n\t"\
648 "pxor %%mm7, %%mm5 \n\t"\
649 "psubw %%mm6, %%mm4 \n\t"\
650 "psubw %%mm7, %%mm5 \n\t"\
651 "psrlw $3, %%mm4 \n\t"\
652 "psrlw $3, %%mm5 \n\t"\
653 "packuswb %%mm5, %%mm4 \n\t"\
654 "packsswb %%mm7, %%mm6 \n\t"\
655 "pxor %%mm7, %%mm7 \n\t"\
656 "movd %4, %%mm2 \n\t"\
657 "punpcklbw %%mm2, %%mm2 \n\t"\
658 "punpcklbw %%mm2, %%mm2 \n\t"\
659 "punpcklbw %%mm2, %%mm2 \n\t"\
660 "psubusb %%mm4, %%mm2 \n\t"\
661 "movq %%mm2, %%mm3 \n\t"\
662 "psubusb %%mm4, %%mm3 \n\t"\
663 "psubb %%mm3, %%mm2 \n\t"\
664 "movq %1, %%mm3 \n\t"\
665 "movq %2, %%mm4 \n\t"\
666 "pxor %%mm6, %%mm3 \n\t"\
667 "pxor %%mm6, %%mm4 \n\t"\
668 "paddusb %%mm2, %%mm3 \n\t"\
669 "psubusb %%mm2, %%mm4 \n\t"\
670 "pxor %%mm6, %%mm3 \n\t"\
671 "pxor %%mm6, %%mm4 \n\t"\
672 "paddusb %%mm2, %%mm2 \n\t"\
673 "packsswb %%mm1, %%mm0 \n\t"\
674 "pcmpgtb %%mm0, %%mm7 \n\t"\
675 "pxor %%mm7, %%mm0 \n\t"\
676 "psubb %%mm7, %%mm0 \n\t"\
677 "movq %%mm0, %%mm1 \n\t"\
678 "psubusb %%mm2, %%mm0 \n\t"\
679 "psubb %%mm0, %%mm1 \n\t"\
680 "pand %5, %%mm1 \n\t"\
681 "psrlw $2, %%mm1 \n\t"\
682 "pxor %%mm7, %%mm1 \n\t"\
683 "psubb %%mm7, %%mm1 \n\t"\
684 "movq %0, %%mm5 \n\t"\
685 "movq %3, %%mm6 \n\t"\
686 "psubb %%mm1, %%mm5 \n\t"\
687 "paddb %%mm1, %%mm6 \n\t"
/* Filter a horizontal block edge: run H263_LOOP_FILTER on the four rows
 * straddling the edge (src-2*stride .. src+1*stride) and store the filtered
 * rows from mm3/mm4/mm5/mm6. Compiled away unless an H.263 codec is enabled. */
689 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
690 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
691 const int strength= ff_h263_loop_filter_strength[qscale];
697 "movq %%mm3, %1 \n\t"
698 "movq %%mm4, %2 \n\t"
699 "movq %%mm5, %0 \n\t"
700 "movq %%mm6, %3 \n\t"
701 : "+m" (*(uint64_t*)(src - 2*stride)),
702 "+m" (*(uint64_t*)(src - 1*stride)),
703 "+m" (*(uint64_t*)(src + 0*stride)),
704 "+m" (*(uint64_t*)(src + 1*stride))
705 : "g" (2*strength), "m"(ff_pb_FC)
/* Filter a vertical block edge: transpose the 8x4 pixel strip into temp[],
 * run the shared H263_LOOP_FILTER on it, then transpose the filtered result
 * (in mm3/mm4/mm5/mm6) back into place with the punpck/movd sequence below. */
710 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
711 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
712 const int strength= ff_h263_loop_filter_strength[qscale];
713 DECLARE_ALIGNED(8, uint64_t, temp)[4];
714 uint8_t *btemp= (uint8_t*)temp;
718 transpose4x4(btemp , src , 8, stride);
719 transpose4x4(btemp+4, src + 4*stride, 8, stride);
721 H263_LOOP_FILTER // 5 3 4 6
727 : "g" (2*strength), "m"(ff_pb_FC)
/* Transpose the filtered 4x8 result back: interleave bytes/words, then
 * scatter each 4-byte column with movd at stride offsets. */
731 "movq %%mm5, %%mm1 \n\t"
732 "movq %%mm4, %%mm0 \n\t"
733 "punpcklbw %%mm3, %%mm5 \n\t"
734 "punpcklbw %%mm6, %%mm4 \n\t"
735 "punpckhbw %%mm3, %%mm1 \n\t"
736 "punpckhbw %%mm6, %%mm0 \n\t"
737 "movq %%mm5, %%mm3 \n\t"
738 "movq %%mm1, %%mm6 \n\t"
739 "punpcklwd %%mm4, %%mm5 \n\t"
740 "punpcklwd %%mm0, %%mm1 \n\t"
741 "punpckhwd %%mm4, %%mm3 \n\t"
742 "punpckhwd %%mm0, %%mm6 \n\t"
743 "movd %%mm5, (%0) \n\t"
744 "punpckhdq %%mm5, %%mm5 \n\t"
745 "movd %%mm5, (%0,%2) \n\t"
746 "movd %%mm3, (%0,%2,2) \n\t"
747 "punpckhdq %%mm3, %%mm3 \n\t"
748 "movd %%mm3, (%0,%3) \n\t"
749 "movd %%mm1, (%1) \n\t"
750 "punpckhdq %%mm1, %%mm1 \n\t"
751 "movd %%mm1, (%1,%2) \n\t"
752 "movd %%mm6, (%1,%2,2) \n\t"
753 "punpckhdq %%mm6, %%mm6 \n\t"
754 "movd %%mm6, (%1,%3) \n\t"
756 "r" (src + 4*stride),
757 "r" ((x86_reg) stride ),
758 "r" ((x86_reg)(3*stride))
763 /* draw the edges of width 'w' of an image of size width, height
764 this mmx version can only handle w==8 || w==16 */
/* Replicates border pixels outward: left/right columns first (splat edge
 * byte across w bytes with punpck), then top/bottom rows (copy whole edge
 * rows with movq). Loop headers, asm statement openers and the w==8/w==16
 * branch are on lines missing from this dump. */
765 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int h, int sides)
767 uint8_t *ptr, *last_line;
770 last_line = buf + (height - 1) * wrap;
/* left/right edges, w==8 variant: splat first/last byte of each row */
777 "movd (%0), %%mm0 \n\t"
778 "punpcklbw %%mm0, %%mm0 \n\t"
779 "punpcklwd %%mm0, %%mm0 \n\t"
780 "punpckldq %%mm0, %%mm0 \n\t"
781 "movq %%mm0, -8(%0) \n\t"
782 "movq -8(%0, %2), %%mm1 \n\t"
783 "punpckhbw %%mm1, %%mm1 \n\t"
784 "punpckhwd %%mm1, %%mm1 \n\t"
785 "punpckhdq %%mm1, %%mm1 \n\t"
786 "movq %%mm1, (%0, %2) \n\t"
791 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
/* left/right edges, w==16 variant: same splat, stored twice (16 bytes) */
798 "movd (%0), %%mm0 \n\t"
799 "punpcklbw %%mm0, %%mm0 \n\t"
800 "punpcklwd %%mm0, %%mm0 \n\t"
801 "punpckldq %%mm0, %%mm0 \n\t"
802 "movq %%mm0, -8(%0) \n\t"
803 "movq %%mm0, -16(%0) \n\t"
804 "movq -8(%0, %2), %%mm1 \n\t"
805 "punpckhbw %%mm1, %%mm1 \n\t"
806 "punpckhwd %%mm1, %%mm1 \n\t"
807 "punpckhdq %%mm1, %%mm1 \n\t"
808 "movq %%mm1, (%0, %2) \n\t"
809 "movq %%mm1, 8(%0, %2) \n\t"
814 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
818 /* top and bottom (and hopefully also the corners) */
819 if (sides&EDGE_TOP) {
820 for(i = 0; i < h; i += 4) {
821 ptr= buf - (i + 1) * wrap - w;
824 "movq (%1, %0), %%mm0 \n\t"
825 "movq %%mm0, (%0) \n\t"
826 "movq %%mm0, (%0, %2) \n\t"
827 "movq %%mm0, (%0, %2, 2) \n\t"
828 "movq %%mm0, (%0, %3) \n\t"
833 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
/* NOTE(review): this loop bounds on 'w' while the EDGE_TOP loop above bounds
 * on 'h' (the edge height). The asymmetry looks like a bug — the bottom edge
 * should presumably replicate 'h' rows, matching the top. Verify against
 * upstream draw_edges before changing. */
838 if (sides&EDGE_BOTTOM) {
839 for(i = 0; i < w; i += 4) {
840 ptr= last_line + (i + 1) * wrap - w;
843 "movq (%1, %0), %%mm0 \n\t"
844 "movq %%mm0, (%0) \n\t"
845 "movq %%mm0, (%0, %2) \n\t"
846 "movq %%mm0, (%0, %2, 2) \n\t"
847 "movq %%mm0, (%0, %3) \n\t"
852 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/* One output row of the MPEG-4 qpel 6-tap vertical filter:
 * out = clip((20*(m3+m4) - 6*(in1+m6) + 3*(in2+m5) - (in0+in7) + rnd) >> 5)
 * where m3..m6 hold previously-loaded rows and in0/in1/in2/in7 are memory
 * operands for the remaining taps. Uses mm4/mm5/mm6 as scratch; OP is the
 * put/avg store hook. Comments cannot go inside the macro body
 * (backslash continuations). */
858 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
859 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
860 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
861 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
862 "movq "#in7", " #m3 " \n\t" /* d */\
863 "movq "#in0", %%mm5 \n\t" /* D */\
864 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
865 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
866 "movq "#in1", %%mm5 \n\t" /* C */\
867 "movq "#in2", %%mm6 \n\t" /* B */\
868 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
869 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
870 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
871 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
872 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
873 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
874 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
875 "psraw $5, %%mm5 \n\t"\
876 "packuswb %%mm5, %%mm5 \n\t"\
877 OP(%%mm5, out, %%mm7, d)
879 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
880 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
884 "pxor %%mm7, %%mm7 \n\t"\
886 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
887 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
888 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
889 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
890 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
891 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
892 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
893 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
894 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
895 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
896 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
897 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
898 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
899 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
900 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
901 "paddw %%mm3, %%mm5 \n\t" /* b */\
902 "paddw %%mm2, %%mm6 \n\t" /* c */\
903 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
904 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
905 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
906 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
907 "paddw %%mm4, %%mm0 \n\t" /* a */\
908 "paddw %%mm1, %%mm5 \n\t" /* d */\
909 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
910 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
911 "paddw %6, %%mm6 \n\t"\
912 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
913 "psraw $5, %%mm0 \n\t"\
914 "movq %%mm0, %5 \n\t"\
915 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
917 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
918 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
919 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
920 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
921 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
922 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
923 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
924 "paddw %%mm0, %%mm2 \n\t" /* b */\
925 "paddw %%mm5, %%mm3 \n\t" /* c */\
926 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
927 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
928 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
929 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
930 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
931 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
932 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
933 "paddw %%mm2, %%mm1 \n\t" /* a */\
934 "paddw %%mm6, %%mm4 \n\t" /* d */\
935 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
936 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
937 "paddw %6, %%mm1 \n\t"\
938 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
939 "psraw $5, %%mm3 \n\t"\
940 "movq %5, %%mm1 \n\t"\
941 "packuswb %%mm3, %%mm1 \n\t"\
942 OP_MMX2(%%mm1, (%1),%%mm4, q)\
943 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
945 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
946 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
947 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
948 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
949 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
950 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
951 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
952 "paddw %%mm1, %%mm5 \n\t" /* b */\
953 "paddw %%mm4, %%mm0 \n\t" /* c */\
954 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
955 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
956 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
957 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
958 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
959 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
960 "paddw %%mm3, %%mm2 \n\t" /* d */\
961 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
962 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
963 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
964 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
965 "paddw %%mm2, %%mm6 \n\t" /* a */\
966 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
967 "paddw %6, %%mm0 \n\t"\
968 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
969 "psraw $5, %%mm0 \n\t"\
970 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
972 "paddw %%mm5, %%mm3 \n\t" /* a */\
973 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
974 "paddw %%mm4, %%mm6 \n\t" /* b */\
975 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
976 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
977 "paddw %%mm1, %%mm4 \n\t" /* c */\
978 "paddw %%mm2, %%mm5 \n\t" /* d */\
979 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
980 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
981 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
982 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
983 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
984 "paddw %6, %%mm4 \n\t"\
985 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
986 "psraw $5, %%mm4 \n\t"\
987 "packuswb %%mm4, %%mm0 \n\t"\
988 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
994 : "+a"(src), "+c"(dst), "+D"(h)\
995 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
1000 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1003 /* quick HACK, XXX FIXME MUST be optimized */\
1006 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1007 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1008 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1009 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1010 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1011 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1012 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1013 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1014 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1015 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1016 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1017 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1018 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1019 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1020 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1021 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1023 "movq (%0), %%mm0 \n\t"\
1024 "movq 8(%0), %%mm1 \n\t"\
1025 "paddw %2, %%mm0 \n\t"\
1026 "paddw %2, %%mm1 \n\t"\
1027 "psraw $5, %%mm0 \n\t"\
1028 "psraw $5, %%mm1 \n\t"\
1029 "packuswb %%mm1, %%mm0 \n\t"\
1030 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1031 "movq 16(%0), %%mm0 \n\t"\
1032 "movq 24(%0), %%mm1 \n\t"\
1033 "paddw %2, %%mm0 \n\t"\
1034 "paddw %2, %%mm1 \n\t"\
1035 "psraw $5, %%mm0 \n\t"\
1036 "psraw $5, %%mm1 \n\t"\
1037 "packuswb %%mm1, %%mm0 \n\t"\
1038 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1039 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1047 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1049 "pxor %%mm7, %%mm7 \n\t"\
1051 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1052 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1053 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1054 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1055 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1056 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1057 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1058 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1059 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1060 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1061 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1062 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1063 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1064 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1065 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1066 "paddw %%mm3, %%mm5 \n\t" /* b */\
1067 "paddw %%mm2, %%mm6 \n\t" /* c */\
1068 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1069 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1070 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1071 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1072 "paddw %%mm4, %%mm0 \n\t" /* a */\
1073 "paddw %%mm1, %%mm5 \n\t" /* d */\
1074 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1075 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1076 "paddw %5, %%mm6 \n\t"\
1077 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1078 "psraw $5, %%mm0 \n\t"\
1079 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1081 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1082 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1083 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1084 "paddw %%mm5, %%mm1 \n\t" /* a */\
1085 "paddw %%mm6, %%mm2 \n\t" /* b */\
1086 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1087 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1088 "paddw %%mm6, %%mm3 \n\t" /* c */\
1089 "paddw %%mm5, %%mm4 \n\t" /* d */\
1090 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1091 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1092 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1093 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1094 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1095 "paddw %5, %%mm1 \n\t"\
1096 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1097 "psraw $5, %%mm3 \n\t"\
1098 "packuswb %%mm3, %%mm0 \n\t"\
1099 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1105 : "+a"(src), "+c"(dst), "+d"(h)\
1106 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1111 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1114 /* quick HACK, XXX FIXME MUST be optimized */\
1117 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1118 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1119 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1120 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1121 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1122 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1123 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1124 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1126 "movq (%0), %%mm0 \n\t"\
1127 "movq 8(%0), %%mm1 \n\t"\
1128 "paddw %2, %%mm0 \n\t"\
1129 "paddw %2, %%mm1 \n\t"\
1130 "psraw $5, %%mm0 \n\t"\
1131 "psraw $5, %%mm1 \n\t"\
1132 "packuswb %%mm1, %%mm0 \n\t"\
1133 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1134 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
/* QPEL_OP: instantiates the full set of MPEG-4 quarter-pel MC functions for */\
/* one rounding constant (ROUNDER/RND) and one instruction set suffix (MMX). */\
/* NOTE(review): listing has elided lines (numbering gaps); code kept verbatim. */\
1142 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
/* 16-wide vertical lowpass. Pass 1: unpack source bytes to 16-bit words into */\
/* temp[] laid out as 4 column groups of 17 rows (offsets 17*8 apart). */\
1144 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1145 uint64_t temp[17*4];\
1146 uint64_t *temp_ptr= temp;\
1151 "pxor %%mm7, %%mm7 \n\t"\
1153 "movq (%0), %%mm0 \n\t"\
1154 "movq (%0), %%mm1 \n\t"\
1155 "movq 8(%0), %%mm2 \n\t"\
1156 "movq 8(%0), %%mm3 \n\t"\
1157 "punpcklbw %%mm7, %%mm0 \n\t"\
1158 "punpckhbw %%mm7, %%mm1 \n\t"\
1159 "punpcklbw %%mm7, %%mm2 \n\t"\
1160 "punpckhbw %%mm7, %%mm3 \n\t"\
1161 "movq %%mm0, (%1) \n\t"\
1162 "movq %%mm1, 17*8(%1) \n\t"\
1163 "movq %%mm2, 2*17*8(%1) \n\t"\
1164 "movq %%mm3, 3*17*8(%1) \n\t"\
1169 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1170 : "r" ((x86_reg)srcStride)\
/* Pass 2: vertical filtering over the word rows via QPEL_V_LOW (presumably */\
/* the (20,-6,3,-1) 6-tap filter given the ff_pw_20/ff_pw_3 operands - see */\
/* the commented constraints below; QPEL_V_LOW is defined earlier in file). */\
1177 /*FIXME reorder for speed */\
1179 /*"pxor %%mm7, %%mm7 \n\t"*/\
1181 "movq (%0), %%mm0 \n\t"\
1182 "movq 8(%0), %%mm1 \n\t"\
1183 "movq 16(%0), %%mm2 \n\t"\
1184 "movq 24(%0), %%mm3 \n\t"\
1185 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1186 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1188 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1190 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1192 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1193 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1195 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1196 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1198 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1199 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1201 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1202 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1204 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
/* Bottom rows: offsets clamp at the last source row (128/120/112) to */\
/* replicate the edge instead of reading past the 17-row temp buffer. */\
1206 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1208 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1209 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
/* 17 rows * 8 bytes = 136: advance to the next column group of temp. */\
1211 "add $136, %0 \n\t"\
1216 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1217 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
/* 8-wide vertical lowpass: same two-pass scheme as the 16-wide variant but */\
/* with 2 column groups of 9 rows in temp[] (offsets 9*8 apart). */\
1222 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1223 uint64_t temp[9*2];\
1224 uint64_t *temp_ptr= temp;\
/* Pass 1: unpack 8 source bytes per row into low/high word halves. */\
1229 "pxor %%mm7, %%mm7 \n\t"\
1231 "movq (%0), %%mm0 \n\t"\
1232 "movq (%0), %%mm1 \n\t"\
1233 "punpcklbw %%mm7, %%mm0 \n\t"\
1234 "punpckhbw %%mm7, %%mm1 \n\t"\
1235 "movq %%mm0, (%1) \n\t"\
1236 "movq %%mm1, 9*8(%1) \n\t"\
1241 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1242 : "r" ((x86_reg)srcStride)\
/* Pass 2: vertical filter; bottom rows clamp offsets (64/56/48) to */\
/* replicate the last source row at the block edge. */\
1249 /*FIXME reorder for speed */\
1251 /*"pxor %%mm7, %%mm7 \n\t"*/\
1253 "movq (%0), %%mm0 \n\t"\
1254 "movq 8(%0), %%mm1 \n\t"\
1255 "movq 16(%0), %%mm2 \n\t"\
1256 "movq 24(%0), %%mm3 \n\t"\
1257 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1258 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1260 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1262 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1264 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1266 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1268 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1269 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1276 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1277 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
/* qpel8 mcXY wrappers, edge positions: X/Y in the name give the quarter-pel */\
/* phase (0..3) horizontally/vertically. mc00 is a plain copy; mc10/mc30 */\
/* average the halfpel-filtered block with src (or src+1); mc20 is the pure */\
/* halfpel filter. mc01/mc03/mc02 are the vertical counterparts. */\
1282 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1283 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
1286 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1288 uint8_t * const half= (uint8_t*)temp;\
1289 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1290 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1293 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1294 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1297 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1299 uint8_t * const half= (uint8_t*)temp;\
1300 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1301 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
1304 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1306 uint8_t * const half= (uint8_t*)temp;\
1307 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1308 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1311 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1312 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1315 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1317 uint8_t * const half= (uint8_t*)temp;\
1318 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1319 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
/* qpel8 corner positions (mc11/mc31/mc13/mc33): build the horizontally */\
/* filtered block (halfH, 9 rows), blend it with the shifted source, run the */\
/* vertical filter on the result (halfHV), then blend halfH with halfHV. */\
1321 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1322 uint64_t half[8 + 9];\
1323 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1324 uint8_t * const halfHV= ((uint8_t*)half);\
1325 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1326 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1327 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1328 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1330 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1331 uint64_t half[8 + 9];\
1332 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1333 uint8_t * const halfHV= ((uint8_t*)half);\
1334 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
/* src+1: right-side horizontal phase. */\
1335 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1336 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1337 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1339 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1340 uint64_t half[8 + 9];\
1341 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1342 uint8_t * const halfHV= ((uint8_t*)half);\
1343 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1344 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1345 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
/* halfH+8: skip the first row for the bottom vertical phase. */\
1346 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1348 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1349 uint64_t half[8 + 9];\
1350 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1351 uint8_t * const halfHV= ((uint8_t*)half);\
1352 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1353 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1354 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1355 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
/* qpel8 mid positions: mc21/mc23 use H filter then V filter without the */\
/* source blend; mc12/mc32 blend H with src then run V straight into dst; */\
/* mc22 is H filter followed directly by V filter (the full halfpel center). */\
1357 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1358 uint64_t half[8 + 9];\
1359 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1360 uint8_t * const halfHV= ((uint8_t*)half);\
1361 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1362 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1363 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1365 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1366 uint64_t half[8 + 9];\
1367 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1368 uint8_t * const halfHV= ((uint8_t*)half);\
1369 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1370 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1371 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1373 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1374 uint64_t half[8 + 9];\
1375 uint8_t * const halfH= ((uint8_t*)half);\
1376 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1377 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1378 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1380 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1381 uint64_t half[8 + 9];\
1382 uint8_t * const halfH= ((uint8_t*)half);\
1383 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1384 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1385 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1387 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1389 uint8_t * const halfH= ((uint8_t*)half);\
1390 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1391 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
/* qpel16 mcXY wrappers, edge positions: identical structure to the qpel8 */\
/* ones above, with 16x16 blocks and 16-byte-wide intermediates. */\
1393 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1394 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
1397 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1399 uint8_t * const half= (uint8_t*)temp;\
1400 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1401 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1404 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1405 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1408 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1410 uint8_t * const half= (uint8_t*)temp;\
1411 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1412 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
1415 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1417 uint8_t * const half= (uint8_t*)temp;\
1418 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1419 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1422 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1423 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1426 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1428 uint8_t * const half= (uint8_t*)temp;\
1429 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1430 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
/* qpel16 corner positions: halfH holds 17 filtered rows of 16 bytes */\
/* (offset +256 into the scratch array); halfHV the H+V filtered block. */\
1432 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1433 uint64_t half[16*2 + 17*2];\
1434 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1435 uint8_t * const halfHV= ((uint8_t*)half);\
1436 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1437 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1438 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1439 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1441 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1442 uint64_t half[16*2 + 17*2];\
1443 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1444 uint8_t * const halfHV= ((uint8_t*)half);\
1445 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1446 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1447 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1448 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1450 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1451 uint64_t half[16*2 + 17*2];\
1452 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1453 uint8_t * const halfHV= ((uint8_t*)half);\
1454 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1455 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1456 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
/* halfH+16: skip first row for the bottom vertical phase. */\
1457 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1459 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1460 uint64_t half[16*2 + 17*2];\
1461 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1462 uint8_t * const halfHV= ((uint8_t*)half);\
1463 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1464 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1465 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1466 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
/* qpel16 mid positions: see the qpel8 mc21/mc23/mc12/mc32/mc22 comments; */\
/* same dataflow scaled up to 16x16. */\
1468 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1469 uint64_t half[16*2 + 17*2];\
1470 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1471 uint8_t * const halfHV= ((uint8_t*)half);\
1472 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1473 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1474 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1476 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1477 uint64_t half[16*2 + 17*2];\
1478 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1479 uint8_t * const halfHV= ((uint8_t*)half);\
1480 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1481 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1482 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1484 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1485 uint64_t half[17*2];\
1486 uint8_t * const halfH= ((uint8_t*)half);\
1487 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1488 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1489 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1491 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1492 uint64_t half[17*2];\
1493 uint8_t * const halfH= ((uint8_t*)half);\
1494 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1495 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1496 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1498 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1499 uint64_t half[17*2];\
1500 uint8_t * const halfH= ((uint8_t*)half);\
1501 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1502 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store primitives plugged into the QPEL macros as OP: PUT writes the */\
/* result to the destination; the AVG variants average with it first. */\
1505 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* 3DNow! average via pavgusb. NOTE(review): assumed to match pavgb rounding
   for this use - confirm against the AMD 3DNow! reference. */
1506 #define AVG_3DNOW_OP(a,b,temp, size) \
1507 "mov" #size " " #b ", " #temp " \n\t"\
1508 "pavgusb " #temp ", " #a " \n\t"\
1509 "mov" #size " " #a ", " #b " \n\t"
/* MMX2/SSE-integer average via pavgb. */
1510 #define AVG_MMX2_OP(a,b,temp, size) \
1511 "mov" #size " " #b ", " #temp " \n\t"\
1512 "pavgb " #temp ", " #a " \n\t"\
1513 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate put/avg and no-rounding qpel variants. ff_pw_16 vs ff_pw_15
   selects the rounding constant; the _no_rnd_ forms round down. */
1515 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1516 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1517 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1518 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1519 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1520 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1521 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1522 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1523 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1525 /***********************************/
1526 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/* Fast 2-tap (bilinear) qpel approximations. QPEL_2TAP_XY maps a quarter-pel
   position directly to a halfpel helper; QPEL_2TAP_L3 maps it to a 3-point
   blend (_l3_) of src+S0 with the S1/S2 offset taps. */
1528 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1529 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1530 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
1532 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1533 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1534 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
/* One QPEL_2TAP expansion covers all 16 mcXY positions for a given size and
   instruction set; center/edge halfpel cases reuse existing function bodies
   via const function-pointer aliases (mc00/mc21/mc12). */
1537 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
1538 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1539 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1540 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
1541 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1542 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1543 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1544 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1545 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1546 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1547 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1548 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1550 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1551 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
1553 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1554 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1555 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1556 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1557 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1558 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1559 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1560 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate the bilinear qpel set for both block sizes and both ISAs. */\
1562 QPEL_2TAP(put_, 16, mmx2)
1563 QPEL_2TAP(avg_, 16, mmx2)
1564 QPEL_2TAP(put_, 8, mmx2)
1565 QPEL_2TAP(avg_, 8, mmx2)
1566 QPEL_2TAP(put_, 16, 3dnow)
1567 QPEL_2TAP(avg_, 16, 3dnow)
1568 QPEL_2TAP(put_, 8, 3dnow)
1569 QPEL_2TAP(avg_, 8, 3dnow)
/* Intentionally empty function: a harmless target usable as a placeholder
   (e.g. for function pointers that must point at something callable). */
static void just_return(void)
{
}
1577 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1578 x86_reg linesize, x86_reg start_y,
1579 x86_reg end_y, x86_reg block_h,
1580 x86_reg start_x, x86_reg end_x,
1582 extern emu_edge_core_func ff_emu_edge_core_mmx;
1583 extern emu_edge_core_func ff_emu_edge_core_sse;
/* Clamp an out-of-picture motion-compensation read: adjust src so only valid
   picture lines/columns are read, then let the asm core (core_fn) replicate
   the edges into buf. NOTE(review): several lines of this listing are elided
   (numbering gaps); the if(...) heads pairing with the else-if branches below
   are among them. Code kept verbatim. */
1585 static av_always_inline
1586 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1587 int block_w, int block_h,
1588 int src_x, int src_y, int w, int h,
1589 emu_edge_core_func *core_fn)
1591 int start_y, start_x, end_y, end_x, src_y_add=0;
/* Block entirely below the picture: pin reads to the last line. */
1594 src_y_add = h-1-src_y;
1596 }else if(src_y<=-block_h){
/* Block entirely above the picture: pin reads to the first line. */
1597 src_y_add = 1-block_h-src_y;
1603 }else if(src_x<=-block_w){
/* Block entirely left of the picture: shift src to column 0. */
1604 src+= (1-block_w-src_x);
/* Intersection of the requested block with the picture. */
1608 start_y= FFMAX(0, -src_y);
1609 start_x= FFMAX(0, -src_x);
1610 end_y= FFMIN(block_h, h-src_y);
1611 end_x= FFMIN(block_w, w-src_x);
1612 assert(start_x < end_x && block_w > 0);
1613 assert(start_y < end_y && block_h > 0);
1615 // fill in the to-be-copied part plus all above/below
1616 src += (src_y_add+start_y)*linesize + start_x;
1618 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
/* Public entry points: bind emulated_edge_mc to the MMX or SSE asm core. */
1623 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1624 int block_w, int block_h,
1625 int src_x, int src_y, int w, int h)
1627 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1628 w, h, &ff_emu_edge_core_mmx);
/* Same as above but using the SSE edge core. */
1632 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1633 int block_w, int block_h,
1634 int src_x, int src_y, int w, int h)
1636 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1637 w, h, &ff_emu_edge_core_sse);
1639 #endif /* HAVE_YASM */
1641 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1642 int linesize, int block_w, int block_h,
1643 int src_x, int src_y, int w, int h);
/* MMX global motion compensation (MPEG-4 GMC): bilinear interpolation with a
   per-block affine motion field. Falls back to the C version for non-constant
   fullpel offsets or >16-bit subpel vectors. NOTE(review): this listing has
   elided lines (numbering gaps, missing braces/loop heads); code kept verbatim. */
1645 static av_always_inline
1646 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1647 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1648 emulated_edge_mc_func *emu_edge_fn)
/* Integer (fullpel) part of the start offset. */
1651 const int ix = ox>>(16+shift);
1652 const int iy = oy>>(16+shift);
/* Subpel parts and deltas, rescaled by 4 bits for 16-bit word arithmetic. */
1653 const int oxs = ox>>4;
1654 const int oys = oy>>4;
1655 const int dxxs = dxx>>4;
1656 const int dxys = dxy>>4;
1657 const int dyxs = dyx>>4;
1658 const int dyys = dyy>>4;
/* Broadcast constants for the MMX inner loop (rounder and row deltas). */
1659 const uint16_t r4[4] = {r,r,r,r};
1660 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1661 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1662 const uint64_t shift2 = 2*shift;
/* VLA scratch for edge emulation when the block leaves the picture. */
1663 uint8_t edge_buf[(h+1)*stride];
1666 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1667 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1668 const int dxh = dxy*(h-1);
1669 const int dyw = dyx*(w-1);
1670 if( // non-constant fullpel offset (3% of blocks)
1671 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1672 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1673 // uses more than 16 bits of subpel mv (only at huge resolution)
1674 || (dxx|dxy|dyx|dyy)&15 )
1676 //FIXME could still use mmx for some of the rows
1677 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
1681 src += ix + iy*stride;
/* Unsigned compare catches both negative and too-large offsets at once. */
1682 if( (unsigned)ix >= width-w ||
1683 (unsigned)iy >= height-h )
1685 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* mm6 = broadcast interpolation scale, mm7 = 0 for byte unpacking. */
1690 "movd %0, %%mm6 \n\t"
1691 "pxor %%mm7, %%mm7 \n\t"
1692 "punpcklwd %%mm6, %%mm6 \n\t"
1693 "punpcklwd %%mm6, %%mm6 \n\t"
/* Process 4 output pixels per iteration. */
1697 for(x=0; x<w; x+=4){
1698 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1699 oxs - dxys + dxxs*(x+1),
1700 oxs - dxys + dxxs*(x+2),
1701 oxs - dxys + dxxs*(x+3) };
1702 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1703 oys - dyys + dyxs*(x+1),
1704 oys - dyys + dyxs*(x+2),
1705 oys - dyys + dyxs*(x+3) };
/* Advance the subpel accumulators by one row; keep the fractional part. */
1709 "movq %0, %%mm4 \n\t"
1710 "movq %1, %%mm5 \n\t"
1711 "paddw %2, %%mm4 \n\t"
1712 "paddw %3, %%mm5 \n\t"
1713 "movq %%mm4, %0 \n\t"
1714 "movq %%mm5, %1 \n\t"
1715 "psrlw $12, %%mm4 \n\t"
1716 "psrlw $12, %%mm5 \n\t"
1717 : "+m"(*dx4), "+m"(*dy4)
1718 : "m"(*dxy4), "m"(*dyy4)
/* Bilinear weights from the fractional offsets dx (mm4) and dy (mm5). */
1722 "movq %%mm6, %%mm2 \n\t"
1723 "movq %%mm6, %%mm1 \n\t"
1724 "psubw %%mm4, %%mm2 \n\t"
1725 "psubw %%mm5, %%mm1 \n\t"
1726 "movq %%mm2, %%mm0 \n\t"
1727 "movq %%mm4, %%mm3 \n\t"
1728 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1729 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1730 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1731 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
/* Weight the four neighboring source pixels. */
1733 "movd %4, %%mm5 \n\t"
1734 "movd %3, %%mm4 \n\t"
1735 "punpcklbw %%mm7, %%mm5 \n\t"
1736 "punpcklbw %%mm7, %%mm4 \n\t"
1737 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1738 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1740 "movd %2, %%mm5 \n\t"
1741 "movd %1, %%mm4 \n\t"
1742 "punpcklbw %%mm7, %%mm5 \n\t"
1743 "punpcklbw %%mm7, %%mm4 \n\t"
1744 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1745 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
/* Sum, add rounder r4, shift down by 2*shift, pack to bytes and store. */
1746 "paddw %5, %%mm1 \n\t"
1747 "paddw %%mm3, %%mm2 \n\t"
1748 "paddw %%mm1, %%mm0 \n\t"
1749 "paddw %%mm2, %%mm0 \n\t"
1751 "psrlw %6, %%mm0 \n\t"
1752 "packuswb %%mm0, %%mm0 \n\t"
1753 "movd %%mm0, %0 \n\t"
1755 : "=m"(dst[x+y*stride])
1756 : "m"(src[0]), "m"(src[1]),
1757 "m"(src[stride]), "m"(src[stride+1]),
1758 "m"(*r4), "m"(shift2)
/* GMC entry point using the MMX edge-emulation core. */
1768 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1769 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1771 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1772 width, height, &emulated_edge_mc_mmx);
/* GMC entry point using the SSE edge-emulation core. */
1775 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1776 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1778 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1779 width, height, &emulated_edge_mc_sse);
/* Fallback (no yasm cores): GMC bound to the generic C edge emulation. */
1782 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1783 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1785 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1786 width, height, &ff_emulated_edge_mc);
/* Generate a prefetch helper issuing the given prefetch instruction; one
   variant per ISA (prefetcht0 for MMX2, 3DNow! prefetch otherwise). */
1790 #define PREFETCH(name, op) \
1791 static void name(void *mem, int stride, int h){\
1792 const uint8_t *p= mem;\
1794 __asm__ volatile(#op" %0" :: "m"(*p));\
1798 PREFETCH(prefetch_mmx2, prefetcht0)
1799 PREFETCH(prefetch_3dnow, prefetch)
1802 #include "h264_qpel_mmx.c"
1804 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1805 int stride, int h, int x, int y);
1806 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
1807 int stride, int h, int x, int y);
1808 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1809 int stride, int h, int x, int y);
1810 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
1811 int stride, int h, int x, int y);
1812 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1813 int stride, int h, int x, int y);
1814 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
1815 int stride, int h, int x, int y);
1817 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1818 int stride, int h, int x, int y);
1819 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1820 int stride, int h, int x, int y);
1821 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1822 int stride, int h, int x, int y);
1823 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1824 int stride, int h, int x, int y);
1825 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1826 int stride, int h, int x, int y);
1827 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1828 int stride, int h, int x, int y);
1830 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1831 int stride, int h, int x, int y);
1832 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1833 int stride, int h, int x, int y);
1835 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1836 int stride, int h, int x, int y);
1837 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1838 int stride, int h, int x, int y);
1840 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1841 int stride, int h, int x, int y);
1842 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1843 int stride, int h, int x, int y);
/* Prototype generator for the high-bit-depth chroma MC asm functions
   (put/avg, block size NUM, bit depth DEPTH, optimization OPT). */
1845 #define CHROMA_MC(OP, NUM, DEPTH, OPT) \
1846 void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
1847 (uint8_t *dst, uint8_t *src,\
1848 int stride, int h, int x, int y);
/* 10-bit chroma MC declarations. */
1850 CHROMA_MC(put, 2, 10, mmxext)
1851 CHROMA_MC(avg, 2, 10, mmxext)
1852 CHROMA_MC(put, 4, 10, mmxext)
1853 CHROMA_MC(avg, 4, 10, mmxext)
1854 CHROMA_MC(put, 8, 10, sse2)
1855 CHROMA_MC(avg, 8, 10, sse2)
1856 CHROMA_MC(put, 8, 10, avx)
1857 CHROMA_MC(avg, 8, 10, avx)
/* CAVS fullpel (mc00) positions are plain pixel copies/averages; reuse the
   generic MMX pixel primitives. */
1860 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1861 put_pixels8_mmx(dst, src, stride, 8);
1863 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1864 avg_pixels8_mmx(dst, src, stride, 8);
1866 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1867 put_pixels16_mmx(dst, src, stride, 16);
1869 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1870 avg_pixels16_mmx(dst, src, stride, 16);
/* VC-1 fullpel mspel positions: rnd is unused for the copy/average case. */
1874 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1875 put_pixels8_mmx(dst, src, stride, 8);
1877 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1878 avg_pixels8_mmx2(dst, src, stride, 8);
1881 /* XXX: those functions should be suppressed ASAP when all IDCTs are converted */
1884 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1886 ff_mmx_idct (block);
1887 ff_put_pixels_clamped_mmx(block, dest, line_size);
1889 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1891 ff_mmx_idct (block);
1892 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2 MMXEXT IDCT + put: same as the MMX variant but uses the
 * mmxext-accelerated transform. */
1894 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1896 ff_mmxext_idct (block);
1897 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* libmpeg2 MMXEXT IDCT + add: mmxext transform, then clamped add into dest. */
1899 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1901 ff_mmxext_idct (block);
1902 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX IDCT + put: transform `block` in place, then clamp and store. */
1905 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1907 ff_idct_xvid_mmx (block);
1908 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX IDCT + add: transform `block` in place, then clamped add to dest. */
1910 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1912 ff_idct_xvid_mmx (block);
1913 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX2 IDCT + put: mmx2 transform, then clamp and store into dest. */
1915 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
1917 ff_idct_xvid_mmx2 (block);
1918 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX2 IDCT + add: mmx2 transform, then clamped add into dest. */
1920 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
1922 ff_idct_xvid_mmx2 (block);
1923 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Vorbis inverse channel coupling, 3DNow! version.
 * Rewrites mag[]/ang[] in place, two floats per iteration; mm7 is zeroed
 * once up front as the 0.0 reference for the pfcmpge sign tests (see the
 * per-instruction comments below).  femms at the end releases the MMX
 * register state back to the x87 FPU. */
1926 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
1929 __asm__ volatile("pxor %%mm7, %%mm7":); /* mm7 = 0.0 comparison reference */
1930 for(i=0; i<blocksize; i+=2) {
1932 "movq %0, %%mm0 \n\t"
1933 "movq %1, %%mm1 \n\t"
1934 "movq %%mm0, %%mm2 \n\t"
1935 "movq %%mm1, %%mm3 \n\t"
1936 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
1937 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
1938 "pslld $31, %%mm2 \n\t" // keep only the sign bit
1939 "pxor %%mm2, %%mm1 \n\t"
1940 "movq %%mm3, %%mm4 \n\t"
1941 "pand %%mm1, %%mm3 \n\t"
1942 "pandn %%mm1, %%mm4 \n\t"
1943 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1944 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
1945 "movq %%mm3, %1 \n\t"
1946 "movq %%mm0, %0 \n\t"
1947 :"+m"(mag[i]), "+m"(ang[i])
1951 __asm__ volatile("femms"); /* leave MMX state */
/* Vorbis inverse channel coupling, SSE version (four floats per iteration).
 * xmm5 is preloaded with 0x80000000 in every lane (ff_pdw_80000000) and used
 * as the sign-bit mask; the algorithm mirrors the 3DNow! variant above, with
 * cmpleps/andps replacing pfcmpge/pslld. mag/ang are read/written with
 * movaps, so both buffers are assumed 16-byte aligned. */
1953 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
1958 "movaps %0, %%xmm5 \n\t"
1959 ::"m"(ff_pdw_80000000[0])
1961 for(i=0; i<blocksize; i+=4) {
1963 "movaps %0, %%xmm0 \n\t"
1964 "movaps %1, %%xmm1 \n\t"
1965 "xorps %%xmm2, %%xmm2 \n\t"
1966 "xorps %%xmm3, %%xmm3 \n\t"
1967 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
1968 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
1969 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
1970 "xorps %%xmm2, %%xmm1 \n\t"
1971 "movaps %%xmm3, %%xmm4 \n\t"
1972 "andps %%xmm1, %%xmm3 \n\t"
1973 "andnps %%xmm1, %%xmm4 \n\t"
1974 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1975 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
1976 "movaps %%xmm3, %1 \n\t"
1977 "movaps %%xmm0, %0 \n\t"
1978 :"+m"(mag[i]), "+m"(ang[i])
/* AC-3 downmix kernel for the specialized 5-channel -> mono/stereo cases.
 * The mono()/stereo() hook macros expand their argument only in the
 * corresponding variant (and to nothing in the other), so one body yields
 * both kernels.  Three coefficients are loaded from matrix (byte offsets
 * 0, 8, 24) and broadcast across xmm5/6/7; the five input channel planes
 * are read at 0x400-byte (1024-byte, i.e. 256-float) intervals from
 * samples[0]+len, and the mixed result is written back over channel 0
 * (and channel 1 in the stereo variant).  All xmm registers used are
 * declared via XMM_CLOBBERS. */
1987 #define MIX5(mono,stereo)\
1989 "movss 0(%2), %%xmm5 \n"\
1990 "movss 8(%2), %%xmm6 \n"\
1991 "movss 24(%2), %%xmm7 \n"\
1992 "shufps $0, %%xmm5, %%xmm5 \n"\
1993 "shufps $0, %%xmm6, %%xmm6 \n"\
1994 "shufps $0, %%xmm7, %%xmm7 \n"\
1996 "movaps (%0,%1), %%xmm0 \n"\
1997 "movaps 0x400(%0,%1), %%xmm1 \n"\
1998 "movaps 0x800(%0,%1), %%xmm2 \n"\
1999 "movaps 0xc00(%0,%1), %%xmm3 \n"\
2000 "movaps 0x1000(%0,%1), %%xmm4 \n"\
2001 "mulps %%xmm5, %%xmm0 \n"\
2002 "mulps %%xmm6, %%xmm1 \n"\
2003 "mulps %%xmm5, %%xmm2 \n"\
2004 "mulps %%xmm7, %%xmm3 \n"\
2005 "mulps %%xmm7, %%xmm4 \n"\
2006 stereo("addps %%xmm1, %%xmm0 \n")\
2007 "addps %%xmm1, %%xmm2 \n"\
2008 "addps %%xmm3, %%xmm0 \n"\
2009 "addps %%xmm4, %%xmm2 \n"\
2010 mono("addps %%xmm2, %%xmm0 \n")\
2011 "movaps %%xmm0, (%0,%1) \n"\
2012 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2016 :"r"(samples[0]+len), "r"(matrix)\
2017 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2018 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
/* Generic AC-3 downmix kernel used when no specialized case matches.
 * For each group of four samples of channel 0 (at %3+%0), the inner loop
 * (counter %2, stepping a negative offset of -32*(in_ch-1) up to zero)
 * walks the remaining channels at 1024-byte intervals (lea 1024(%3,%0))
 * and accumulates them scaled by coefficient vectors from matrix_simd
 * (%4); stereo() lines maintain a second accumulator pair (xmm1/xmm3,
 * coefficients at +16) for the second output channel.  Results overwrite
 * channel 0 and, in the stereo variant, channel 1 (offset 1024). */
2022 #define MIX_MISC(stereo)\
2025 "movaps (%3,%0), %%xmm0 \n"\
2026 stereo("movaps %%xmm0, %%xmm1 \n")\
2027 "mulps %%xmm4, %%xmm0 \n"\
2028 stereo("mulps %%xmm5, %%xmm1 \n")\
2029 "lea 1024(%3,%0), %1 \n"\
2032 "movaps (%1), %%xmm2 \n"\
2033 stereo("movaps %%xmm2, %%xmm3 \n")\
2034 "mulps (%4,%2), %%xmm2 \n"\
2035 stereo("mulps 16(%4,%2), %%xmm3 \n")\
2036 "addps %%xmm2, %%xmm0 \n"\
2037 stereo("addps %%xmm3, %%xmm1 \n")\
2041 "movaps %%xmm0, (%3,%0) \n"\
2042 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2045 :"+&r"(i), "=&r"(j), "=&r"(k)\
2046 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
/* AC-3 downmix dispatcher (SSE).
 * Compares downmix coefficients bitwise through the int alias matrix_cmp to
 * detect the common symmetric 5->2 and 5->1 layouts and route them to the
 * specialized MIX5 kernels; otherwise each coefficient pair is broadcast
 * into the 16-byte-aligned matrix_simd table for the generic MIX_MISC
 * kernel.  Sample planes are 256 floats each; i starts at -len*4 so the
 * kernels index upward toward samples[0]+len.
 * NOTE(review): the float->int reinterpretation via the cast pointer is a
 * strict-aliasing gray area — presumably relied upon with the project's
 * build flags; confirm before changing compiler settings. */
2050 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2052 int (*matrix_cmp)[2] = (int(*)[2])matrix;
2055 i = -len*sizeof(float);
2056 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2058 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2061 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2062 j = 2*in_ch*sizeof(float);
2066 "movss (%2,%0), %%xmm4 \n"
2067 "movss 4(%2,%0), %%xmm5 \n"
2068 "shufps $0, %%xmm4, %%xmm4 \n"
2069 "shufps $0, %%xmm5, %%xmm5 \n"
2070 "movaps %%xmm4, (%1,%0,4) \n"
2071 "movaps %%xmm5, 16(%1,%0,4) \n"
2074 :"r"(matrix_simd), "r"(matrix)
/* Element-wise multiply dst[i] = src0[i] * src1[i] (3DNow!).
 * Each pass handles four floats (two mm registers, 16 bytes); i is a byte
 * offset initialized to the last 16-byte group, (len-4)*4. */
2085 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2086 x86_reg i = (len-4)*4;
2089 "movq (%2,%0), %%mm0 \n\t"
2090 "movq 8(%2,%0), %%mm1 \n\t"
2091 "pfmul (%3,%0), %%mm0 \n\t"
2092 "pfmul 8(%3,%0), %%mm1 \n\t"
2093 "movq %%mm0, (%1,%0) \n\t"
2094 "movq %%mm1, 8(%1,%0) \n\t"
2099 :"r"(dst), "r"(src0), "r"(src1)
/* Element-wise multiply dst[i] = src0[i] * src1[i] (SSE).
 * Eight floats (two xmm registers, 32 bytes) per pass; movaps implies all
 * three buffers must be 16-byte aligned.  i starts at (len-8)*4. */
2103 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2104 x86_reg i = (len-8)*4;
2107 "movaps (%2,%0), %%xmm0 \n\t"
2108 "movaps 16(%2,%0), %%xmm1 \n\t"
2109 "mulps (%3,%0), %%xmm0 \n\t"
2110 "mulps 16(%3,%0), %%xmm1 \n\t"
2111 "movaps %%xmm0, (%1,%0) \n\t"
2112 "movaps %%xmm1, 16(%1,%0) \n\t"
2116 :"r"(dst), "r"(src0), "r"(src1)
/* dst[i] = src0[i] * src1[len-1-i] (3DNow!Ext).
 * src1 is walked forward (the "+r"(src1) operand advances it) while i
 * indexes src0/dst from the end; pswapd reverses the float pair inside
 * each quadword so the src1 elements line up in reversed order.
 * femms at the end releases the MMX state. */
2121 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2122 x86_reg i = len*4-16;
2125 "pswapd 8(%1), %%mm0 \n\t"
2126 "pswapd (%1), %%mm1 \n\t"
2127 "pfmul (%3,%0), %%mm0 \n\t"
2128 "pfmul 8(%3,%0), %%mm1 \n\t"
2129 "movq %%mm0, (%2,%0) \n\t"
2130 "movq %%mm1, 8(%2,%0) \n\t"
2134 :"+r"(i), "+r"(src1)
2135 :"r"(dst), "r"(src0)
2137 __asm__ volatile("femms");
/* dst[i] = src0[i] * src1[len-1-i] (SSE).
 * src1 is advanced forward while i indexes src0/dst from the end;
 * shufps $0x1b reverses the four floats within each xmm register so the
 * src1 elements match up in reversed order.  movaps requires 16-byte
 * alignment of all buffers. */
2139 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2140 x86_reg i = len*4-32;
2143 "movaps 16(%1), %%xmm0 \n\t"
2144 "movaps (%1), %%xmm1 \n\t"
2145 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2146 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2147 "mulps (%3,%0), %%xmm0 \n\t"
2148 "mulps 16(%3,%0), %%xmm1 \n\t"
2149 "movaps %%xmm0, (%2,%0) \n\t"
2150 "movaps %%xmm1, 16(%2,%0) \n\t"
2154 :"+r"(i), "+r"(src1)
2155 :"r"(dst), "r"(src0)
/* dst[i] = src0[i] * src1[i] + src2[i] (3DNow!).
 * Four floats (two mm registers) per pass; i starts at the last 16-byte
 * group, (len-4)*4.  femms at the end releases the MMX state. */
2159 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2160 const float *src2, int len){
2161 x86_reg i = (len-4)*4;
2164 "movq (%2,%0), %%mm0 \n\t"
2165 "movq 8(%2,%0), %%mm1 \n\t"
2166 "pfmul (%3,%0), %%mm0 \n\t"
2167 "pfmul 8(%3,%0), %%mm1 \n\t"
2168 "pfadd (%4,%0), %%mm0 \n\t"
2169 "pfadd 8(%4,%0), %%mm1 \n\t"
2170 "movq %%mm0, (%1,%0) \n\t"
2171 "movq %%mm1, 8(%1,%0) \n\t"
2175 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2178 __asm__ volatile("femms");
/* dst[i] = src0[i] * src1[i] + src2[i] (SSE).
 * Eight floats (two xmm registers) per pass; movaps implies 16-byte
 * alignment of all four buffers.  i starts at (len-8)*4. */
2180 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2181 const float *src2, int len){
2182 x86_reg i = (len-8)*4;
2185 "movaps (%2,%0), %%xmm0 \n\t"
2186 "movaps 16(%2,%0), %%xmm1 \n\t"
2187 "mulps (%3,%0), %%xmm0 \n\t"
2188 "mulps 16(%3,%0), %%xmm1 \n\t"
2189 "addps (%4,%0), %%xmm0 \n\t"
2190 "addps 16(%4,%0), %%xmm1 \n\t"
2191 "movaps %%xmm0, (%1,%0) \n\t"
2192 "movaps %%xmm1, 16(%1,%0) \n\t"
2196 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* Overlap-add windowing (3DNow!Ext), used for MDCT overlap.
 * Walks dst/src0/win from the middle outward using two byte offsets: %0
 * (negative, forward half) and j (positive, mirrored half).  pswapd
 * reverses float pairs so the mirrored operands line up; see the existing
 * per-instruction comments for the exact combination written to the two
 * halves of dst. */
2202 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2203 const float *win, int len){
2205 x86_reg j = len*4-8;
2208 "pswapd (%5,%1), %%mm1 \n"
2209 "movq (%5,%0), %%mm0 \n"
2210 "pswapd (%4,%1), %%mm5 \n"
2211 "movq (%3,%0), %%mm4 \n"
2212 "movq %%mm0, %%mm2 \n"
2213 "movq %%mm1, %%mm3 \n"
2214 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2215 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2216 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2217 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2218 "pfadd %%mm3, %%mm2 \n"
2219 "pfsub %%mm0, %%mm1 \n"
2220 "pswapd %%mm2, %%mm2 \n"
2221 "movq %%mm1, (%2,%0) \n"
2222 "movq %%mm2, (%2,%1) \n"
2228 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/* Overlap-add windowing (SSE), four floats per register.
 * Same scheme as the 3DNow! variant above, with shufps $0x1b reversing the
 * four lanes instead of pswapd; movaps requires 16-byte alignment.  The
 * existing per-instruction comments describe the combination stored into
 * the two halves of dst. */
2232 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2233 const float *win, int len){
2235 x86_reg j = len*4-16;
2238 "movaps (%5,%1), %%xmm1 \n"
2239 "movaps (%5,%0), %%xmm0 \n"
2240 "movaps (%4,%1), %%xmm5 \n"
2241 "movaps (%3,%0), %%xmm4 \n"
2242 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2243 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2244 "movaps %%xmm0, %%xmm2 \n"
2245 "movaps %%xmm1, %%xmm3 \n"
2246 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2247 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2248 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2249 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2250 "addps %%xmm3, %%xmm2 \n"
2251 "subps %%xmm0, %%xmm1 \n"
2252 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2253 "movaps %%xmm1, (%2,%0) \n"
2254 "movaps %%xmm2, (%2,%1) \n"
2259 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2262 #endif /* HAVE_6REGS */
/* Clamp each float of src into [min, max] and store to dst (SSE).
 * min/max are loaded from memory operands and broadcast to all four lanes
 * of xmm4/xmm5; each pass clips 16 floats (four xmm registers) using
 * maxps (lower bound) then minps (upper bound).  movaps implies 16-byte
 * alignment of src and dst; i starts at the last 64-byte group. */
2264 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2267 x86_reg i = (len-16)*4;
2269 "movss %3, %%xmm4 \n"
2270 "movss %4, %%xmm5 \n"
2271 "shufps $0, %%xmm4, %%xmm4 \n"
2272 "shufps $0, %%xmm5, %%xmm5 \n"
2274 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2275 "movaps 16(%2,%0), %%xmm1 \n\t"
2276 "movaps 32(%2,%0), %%xmm2 \n\t"
2277 "movaps 48(%2,%0), %%xmm3 \n\t"
2278 "maxps %%xmm4, %%xmm0 \n\t"
2279 "maxps %%xmm4, %%xmm1 \n\t"
2280 "maxps %%xmm4, %%xmm2 \n\t"
2281 "maxps %%xmm4, %%xmm3 \n\t"
2282 "minps %%xmm5, %%xmm0 \n\t"
2283 "minps %%xmm5, %%xmm1 \n\t"
2284 "minps %%xmm5, %%xmm2 \n\t"
2285 "minps %%xmm5, %%xmm3 \n\t"
2286 "movaps %%xmm0, (%1,%0) \n\t"
2287 "movaps %%xmm1, 16(%1,%0) \n\t"
2288 "movaps %%xmm2, 32(%1,%0) \n\t"
2289 "movaps %%xmm3, 48(%1,%0) \n\t"
2293 :"r"(dst), "r"(src), "m"(min), "m"(max)
2298 void ff_vp3_idct_mmx(int16_t *input_data);
2299 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2300 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2302 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2304 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2305 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2307 void ff_vp3_idct_sse2(int16_t *input_data);
2308 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2309 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2311 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2312 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2313 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2314 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2315 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2317 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2318 const int16_t *window, unsigned int len);
2319 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2320 const int16_t *window, unsigned int len);
2321 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2322 const int16_t *window, unsigned int len);
2323 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2324 const int16_t *window, unsigned int len);
2325 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2326 const int16_t *window, unsigned int len);
2327 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2328 const int16_t *window, unsigned int len);
2330 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2331 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2332 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2334 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2336 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, int32_t min,
2337 int32_t max, unsigned int len);
2338 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, int32_t min,
2339 int32_t max, unsigned int len);
2340 void ff_vector_clip_int32_sse2_int(int32_t *dst, const int32_t *src, int32_t min,
2341 int32_t max, unsigned int len);
2342 void ff_vector_clip_int32_sse41 (int32_t *dst, const int32_t *src, int32_t min,
2343 int32_t max, unsigned int len);
2345 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2347 int mm_flags = av_get_cpu_flags();
2348 const int high_bit_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
2349 const int bit_depth = avctx->bits_per_raw_sample;
2351 if (avctx->dsp_mask) {
2352 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2353 mm_flags |= (avctx->dsp_mask & 0xffff);
2355 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2359 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2360 if (mm_flags & AV_CPU_FLAG_MMX)
2361 av_log(avctx, AV_LOG_INFO, " mmx");
2362 if (mm_flags & AV_CPU_FLAG_MMX2)
2363 av_log(avctx, AV_LOG_INFO, " mmx2");
2364 if (mm_flags & AV_CPU_FLAG_3DNOW)
2365 av_log(avctx, AV_LOG_INFO, " 3dnow");
2366 if (mm_flags & AV_CPU_FLAG_SSE)
2367 av_log(avctx, AV_LOG_INFO, " sse");
2368 if (mm_flags & AV_CPU_FLAG_SSE2)
2369 av_log(avctx, AV_LOG_INFO, " sse2");
2370 av_log(avctx, AV_LOG_INFO, "\n");
2373 if (mm_flags & AV_CPU_FLAG_MMX) {
2374 const int idct_algo= avctx->idct_algo;
2376 if(avctx->lowres==0){
2377 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2378 c->idct_put= ff_simple_idct_put_mmx;
2379 c->idct_add= ff_simple_idct_add_mmx;
2380 c->idct = ff_simple_idct_mmx;
2381 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2383 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2384 if(mm_flags & AV_CPU_FLAG_MMX2){
2385 c->idct_put= ff_libmpeg2mmx2_idct_put;
2386 c->idct_add= ff_libmpeg2mmx2_idct_add;
2387 c->idct = ff_mmxext_idct;
2389 c->idct_put= ff_libmpeg2mmx_idct_put;
2390 c->idct_add= ff_libmpeg2mmx_idct_add;
2391 c->idct = ff_mmx_idct;
2393 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2395 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2396 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2397 if(mm_flags & AV_CPU_FLAG_SSE2){
2398 c->idct_put= ff_vp3_idct_put_sse2;
2399 c->idct_add= ff_vp3_idct_add_sse2;
2400 c->idct = ff_vp3_idct_sse2;
2401 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2403 c->idct_put= ff_vp3_idct_put_mmx;
2404 c->idct_add= ff_vp3_idct_add_mmx;
2405 c->idct = ff_vp3_idct_mmx;
2406 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2408 }else if(idct_algo==FF_IDCT_CAVS){
2409 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2410 }else if(idct_algo==FF_IDCT_XVIDMMX){
2411 if(mm_flags & AV_CPU_FLAG_SSE2){
2412 c->idct_put= ff_idct_xvid_sse2_put;
2413 c->idct_add= ff_idct_xvid_sse2_add;
2414 c->idct = ff_idct_xvid_sse2;
2415 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2416 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2417 c->idct_put= ff_idct_xvid_mmx2_put;
2418 c->idct_add= ff_idct_xvid_mmx2_add;
2419 c->idct = ff_idct_xvid_mmx2;
2421 c->idct_put= ff_idct_xvid_mmx_put;
2422 c->idct_add= ff_idct_xvid_mmx_add;
2423 c->idct = ff_idct_xvid_mmx;
2428 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2429 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2430 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2431 if (!high_bit_depth) {
2432 c->clear_block = clear_block_mmx;
2433 c->clear_blocks = clear_blocks_mmx;
2434 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2435 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2436 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2437 c->clear_block = clear_block_sse;
2438 c->clear_blocks = clear_blocks_sse;
2442 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2443 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2444 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2445 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2446 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2448 if (!high_bit_depth) {
2449 SET_HPEL_FUNCS(put, 0, 16, mmx);
2450 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2451 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2452 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2453 SET_HPEL_FUNCS(put, 1, 8, mmx);
2454 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2455 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2456 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2459 #if ARCH_X86_32 || !HAVE_YASM
2462 #if ARCH_X86_32 && HAVE_YASM
2463 if (!high_bit_depth)
2464 c->emulated_edge_mc = emulated_edge_mc_mmx;
2467 c->add_bytes= add_bytes_mmx;
2469 if (!high_bit_depth)
2470 c->draw_edges = draw_edges_mmx;
2472 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2473 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2474 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2478 if (!high_bit_depth) {
2479 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2480 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2483 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2484 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2486 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
2489 if (mm_flags & AV_CPU_FLAG_MMX2) {
2490 c->prefetch = prefetch_mmx2;
2492 if (!high_bit_depth) {
2493 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2494 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2496 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2497 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2498 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2500 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2501 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2503 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2504 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2505 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2508 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2509 if (!high_bit_depth) {
2510 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2511 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2512 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2513 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2514 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2515 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2518 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2519 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2520 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2523 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2524 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2527 if (CONFIG_VP3_DECODER
2528 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2529 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2530 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2533 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
2534 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
2535 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
2536 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
2537 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
2538 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
2539 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
2540 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
2541 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
2542 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
2543 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
2544 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
2545 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
2546 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
2547 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
2548 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
2549 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU
2551 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2, );
2552 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2, );
2553 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2, );
2554 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2, );
2555 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2, );
2556 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2, );
2558 if (!high_bit_depth) {
2559 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2, );
2560 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2, );
2561 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2, );
2562 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2, );
2563 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2, );
2564 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2, );
2566 else if (bit_depth == 10) {
2569 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
2570 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
2571 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
2572 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
2574 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
2575 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
2579 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2, );
2580 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2, );
2581 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2, );
2582 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2, );
2585 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2586 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2588 if (!high_bit_depth) {
2589 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2590 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2591 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2592 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2594 if (bit_depth == 10) {
2595 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_10_mmxext;
2596 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_10_mmxext;
2597 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_10_mmxext;
2598 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_10_mmxext;
2601 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2604 if( mm_flags&AV_CPU_FLAG_3DNOW )
2605 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2608 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2609 c->prefetch = prefetch_3dnow;
2611 if (!high_bit_depth) {
2612 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2613 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2615 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2616 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2617 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2619 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2620 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2622 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2623 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2624 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2626 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2627 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2628 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2629 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2630 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2631 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2632 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2636 if (CONFIG_VP3_DECODER
2637 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2638 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2639 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2642 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow, );
2643 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow, );
2644 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow, );
2645 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow, );
2646 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow, );
2647 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow, );
2649 if (!high_bit_depth) {
2650 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow, );
2651 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow, );
2652 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow, );
2653 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow, );
2654 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow, );
2655 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow, );
2658 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow, );
2659 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow, );
2660 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow, );
2661 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow, );
2664 if (!high_bit_depth) {
2665 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2666 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2669 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2670 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2675 #define H264_QPEL_FUNCS(x, y, CPU)\
2676 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2677 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2678 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2679 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2680 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2681 // these functions are slower than mmx on AMD, but faster on Intel
2682 if (!high_bit_depth) {
2683 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2684 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2685 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2686 H264_QPEL_FUNCS(0, 0, sse2);
2689 if(mm_flags & AV_CPU_FLAG_SSE2){
2690 if (!high_bit_depth) {
2691 H264_QPEL_FUNCS(0, 1, sse2);
2692 H264_QPEL_FUNCS(0, 2, sse2);
2693 H264_QPEL_FUNCS(0, 3, sse2);
2694 H264_QPEL_FUNCS(1, 1, sse2);
2695 H264_QPEL_FUNCS(1, 2, sse2);
2696 H264_QPEL_FUNCS(1, 3, sse2);
2697 H264_QPEL_FUNCS(2, 1, sse2);
2698 H264_QPEL_FUNCS(2, 2, sse2);
2699 H264_QPEL_FUNCS(2, 3, sse2);
2700 H264_QPEL_FUNCS(3, 1, sse2);
2701 H264_QPEL_FUNCS(3, 2, sse2);
2702 H264_QPEL_FUNCS(3, 3, sse2);
2705 #define H264_QPEL_FUNCS_10(x, y, CPU)\
2706 c->put_h264_qpel_pixels_tab[0][x+y*4] = ff_put_h264_qpel16_mc##x##y##_10_##CPU;\
2707 c->put_h264_qpel_pixels_tab[1][x+y*4] = ff_put_h264_qpel8_mc##x##y##_10_##CPU;\
2708 c->avg_h264_qpel_pixels_tab[0][x+y*4] = ff_avg_h264_qpel16_mc##x##y##_10_##CPU;\
2709 c->avg_h264_qpel_pixels_tab[1][x+y*4] = ff_avg_h264_qpel8_mc##x##y##_10_##CPU;
2710 if (bit_depth == 10) {
2711 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
2712 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
2713 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
2714 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
2715 H264_QPEL_FUNCS_10(1, 0, sse2_cache64)
2716 H264_QPEL_FUNCS_10(2, 0, sse2_cache64)
2717 H264_QPEL_FUNCS_10(3, 0, sse2_cache64)
2719 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_sse2;
2720 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_sse2;
2725 if(mm_flags & AV_CPU_FLAG_SSSE3){
2726 if (!high_bit_depth) {
2727 H264_QPEL_FUNCS(1, 0, ssse3);
2728 H264_QPEL_FUNCS(1, 1, ssse3);
2729 H264_QPEL_FUNCS(1, 2, ssse3);
2730 H264_QPEL_FUNCS(1, 3, ssse3);
2731 H264_QPEL_FUNCS(2, 0, ssse3);
2732 H264_QPEL_FUNCS(2, 1, ssse3);
2733 H264_QPEL_FUNCS(2, 2, ssse3);
2734 H264_QPEL_FUNCS(2, 3, ssse3);
2735 H264_QPEL_FUNCS(3, 0, ssse3);
2736 H264_QPEL_FUNCS(3, 1, ssse3);
2737 H264_QPEL_FUNCS(3, 2, ssse3);
2738 H264_QPEL_FUNCS(3, 3, ssse3);
2741 else if (bit_depth == 10) {
2742 H264_QPEL_FUNCS_10(1, 0, ssse3_cache64)
2743 H264_QPEL_FUNCS_10(2, 0, ssse3_cache64)
2744 H264_QPEL_FUNCS_10(3, 0, ssse3_cache64)
2746 if (!high_bit_depth) {
2747 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2748 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2749 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2750 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2752 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2753 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2754 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2759 if(mm_flags & AV_CPU_FLAG_3DNOW){
2760 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2761 c->vector_fmul = vector_fmul_3dnow;
2763 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2764 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2766 c->vector_fmul_window = vector_fmul_window_3dnow2;
2769 if(mm_flags & AV_CPU_FLAG_MMX2){
2771 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2772 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2773 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2774 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2776 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2780 if(mm_flags & AV_CPU_FLAG_SSE){
2781 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2782 c->ac3_downmix = ac3_downmix_sse;
2783 c->vector_fmul = vector_fmul_sse;
2784 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2785 c->vector_fmul_add = vector_fmul_add_sse;
2787 c->vector_fmul_window = vector_fmul_window_sse;
2789 c->vector_clipf = vector_clipf_sse;
2791 c->scalarproduct_float = ff_scalarproduct_float_sse;
2794 if(mm_flags & AV_CPU_FLAG_3DNOW)
2795 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
2796 if(mm_flags & AV_CPU_FLAG_SSE2){
2798 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2799 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2800 if (mm_flags & AV_CPU_FLAG_ATOM) {
2801 c->vector_clip_int32 = ff_vector_clip_int32_sse2_int;
2803 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
2805 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2806 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2808 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2809 c->apply_window_int16 = ff_apply_window_int16_sse2;
2813 if (!high_bit_depth)
2814 c->emulated_edge_mc = emulated_edge_mc_sse;
2818 if (mm_flags & AV_CPU_FLAG_SSSE3) {
2820 if (mm_flags & AV_CPU_FLAG_ATOM) {
2821 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2823 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2825 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2826 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2831 if (mm_flags & AV_CPU_FLAG_SSE4 && HAVE_SSE) {
2833 c->vector_clip_int32 = ff_vector_clip_int32_sse41;
2837 #if HAVE_AVX && HAVE_YASM
2838 if (mm_flags & AV_CPU_FLAG_AVX) {
2839 if (bit_depth == 10) {
2840 //AVX implies !cache64.
2841 //TODO: Port cache(32|64) detection from x264.
2842 H264_QPEL_FUNCS_10(1, 0, sse2)
2843 H264_QPEL_FUNCS_10(2, 0, sse2)
2844 H264_QPEL_FUNCS_10(3, 0, sse2)
2846 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_10_avx;
2847 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_10_avx;
2853 if (CONFIG_ENCODERS)
2854 dsputilenc_init_mmx(c, avctx);
2857 // for speed testing
2858 get_pixels = just_return;
2859 put_pixels_clamped = just_return;
2860 add_pixels_clamped = just_return;
2862 pix_abs16x16 = just_return;
2863 pix_abs16x16_x2 = just_return;
2864 pix_abs16x16_y2 = just_return;
2865 pix_abs16x16_xy2 = just_return;
2867 put_pixels_tab[0] = just_return;
2868 put_pixels_tab[1] = just_return;
2869 put_pixels_tab[2] = just_return;
2870 put_pixels_tab[3] = just_return;
2872 put_no_rnd_pixels_tab[0] = just_return;
2873 put_no_rnd_pixels_tab[1] = just_return;
2874 put_no_rnd_pixels_tab[2] = just_return;
2875 put_no_rnd_pixels_tab[3] = just_return;
2877 avg_pixels_tab[0] = just_return;
2878 avg_pixels_tab[1] = just_return;
2879 avg_pixels_tab[2] = just_return;
2880 avg_pixels_tab[3] = just_return;
2882 avg_no_rnd_pixels_tab[0] = just_return;
2883 avg_no_rnd_pixels_tab[1] = just_return;
2884 avg_no_rnd_pixels_tab[2] = just_return;
2885 avg_no_rnd_pixels_tab[3] = just_return;
2887 //av_fdct = just_return;
2888 //ff_idct = just_return;