2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * This file is part of FFmpeg.
8 * FFmpeg is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU Lesser General Public
10 * License as published by the Free Software Foundation; either
11 * version 2.1 of the License, or (at your option) any later version.
13 * FFmpeg is distributed in the hope that it will be useful,
14 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 * Lesser General Public License for more details.
18 * You should have received a copy of the GNU Lesser General Public
19 * License along with FFmpeg; if not, write to the Free Software
20 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
25 #include "libavutil/cpu.h"
26 #include "libavutil/x86_cpu.h"
27 #include "libavcodec/dsputil.h"
28 #include "libavcodec/h264dsp.h"
29 #include "libavcodec/mpegvideo.h"
30 #include "libavcodec/simple_idct.h"
31 #include "libavcodec/ac3dec.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
/*
 * NOTE(review): every line in this file carries a stray leading number
 * (original line numbers baked in by a paste/extraction) and many structural
 * lines are missing, so the file cannot compile as-is. Restore from the
 * upstream FFmpeg dsputil_mmx.c before building — TODO confirm against
 * upstream.
 */
/* Aligned SIMD constants referenced by the inline-asm kernels below
 * (via "m" operands or MANGLE()).  8-byte-aligned uint64_t values feed
 * MMX code; 16-byte-aligned xmm_reg pairs feed SSE/SSE2 code.
 * Naming: ff_pw_* = packed 16-bit words, ff_pb_* = packed bytes,
 * ff_pdw_* = packed 32-bit dwords, ff_pd_* = packed doubles. */
38 /* pixel operations */
39 DECLARE_ALIGNED(8, const uint64_t, ff_bone) = 0x0101010101010101ULL;
40 DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
42 DECLARE_ALIGNED(16, const uint64_t, ff_pdw_80000000)[2] =
43 {0x8000000080000000ULL, 0x8000000080000000ULL};
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_1 ) = 0x0001000100010001ULL;
46 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3 ) = {0x0003000300030003ULL, 0x0003000300030003ULL};
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4 ) = {0x0004000400040004ULL, 0x0004000400040004ULL};
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5 ) = {0x0005000500050005ULL, 0x0005000500050005ULL};
49 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8 ) = {0x0008000800080008ULL, 0x0008000800080008ULL};
50 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9 ) = {0x0009000900090009ULL, 0x0009000900090009ULL};
51 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15 ) = 0x000F000F000F000FULL;
52 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16 ) = {0x0010001000100010ULL, 0x0010001000100010ULL};
53 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17 ) = {0x0011001100110011ULL, 0x0011001100110011ULL};
54 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18 ) = {0x0012001200120012ULL, 0x0012001200120012ULL};
55 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20 ) = 0x0014001400140014ULL;
56 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_27 ) = {0x001B001B001B001BULL, 0x001B001B001B001BULL};
57 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_28 ) = {0x001C001C001C001CULL, 0x001C001C001C001CULL};
58 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32 ) = {0x0020002000200020ULL, 0x0020002000200020ULL};
59 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42 ) = 0x002A002A002A002AULL;
60 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53 ) = 0x0035003500350035ULL;
61 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_63 ) = {0x003F003F003F003FULL, 0x003F003F003F003FULL};
62 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64 ) = {0x0040004000400040ULL, 0x0040004000400040ULL};
63 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96 ) = 0x0060006000600060ULL;
64 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
65 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
67 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0 ) = {0x0000000000000000ULL, 0x0000000000000000ULL};
68 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1 ) = {0x0101010101010101ULL, 0x0101010101010101ULL};
69 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3 ) = {0x0303030303030303ULL, 0x0303030303030303ULL};
70 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_4 ) = {0x0404040404040404ULL, 0x0404040404040404ULL};
71 DECLARE_ALIGNED(8, const uint64_t, ff_pb_7 ) = 0x0707070707070707ULL;
72 DECLARE_ALIGNED(8, const uint64_t, ff_pb_1F ) = 0x1F1F1F1F1F1F1F1FULL;
73 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F ) = 0x3F3F3F3F3F3F3F3FULL;
74 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80 ) = {0x8080808080808080ULL, 0x8080808080808080ULL};
75 DECLARE_ALIGNED(8, const uint64_t, ff_pb_81 ) = 0x8181818181818181ULL;
76 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_A1 ) = {0xA1A1A1A1A1A1A1A1ULL, 0xA1A1A1A1A1A1A1A1ULL};
77 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8 ) = {0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL};
78 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC ) = 0xFCFCFCFCFCFCFCFCULL;
79 DECLARE_ALIGNED(16, const xmm_reg, ff_pb_FE ) = {0xFEFEFEFEFEFEFEFEULL, 0xFEFEFEFEFEFEFEFEULL};
81 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
82 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Small inline-asm helper macros that load a well-known constant into one
 * MMX register.  NOTE(review): the MOVQ_BFE body and the second pair of
 * MOVQ_BONE/MOVQ_WTWO definitions below are missing their
 * "__asm__ volatile (" opener lines, and the #if/#else/#endif guards that
 * must have selected between the memory-load and computed variants are
 * elided — confirm against upstream before building. */
84 #define JUMPALIGN() __asm__ volatile (".p2align 3"::)
85 #define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%" #regd ", %%" #regd ::)
/* MOVQ_BFE: materialize 0xFEFE...FE in regd (all-ones doubled byte-wise). */
87 #define MOVQ_BFE(regd) \
89 "pcmpeqd %%" #regd ", %%" #regd " \n\t"\
90 "paddb %%" #regd ", %%" #regd " \n\t" ::)
/* Memory-load variants: fetch the constant from the tables above. */
93 #define MOVQ_BONE(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_bone))
94 #define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%" #regd " \n\t" ::"m"(ff_wtwo))
96 // for shared libraries it is better to access constants this way
/* Computed variants (no memory reference, PIC-friendly):
 * all-ones -> psrlw 15 gives 0x0001 per word; packuswb spreads it to
 * 0x01 per byte (BONE); psllw 1 gives 0x0002 per word (WTWO). */
98 #define MOVQ_BONE(regd) \
100 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
101 "psrlw $15, %%" #regd " \n\t" \
102 "packuswb %%" #regd ", %%" #regd " \n\t" ::)
104 #define MOVQ_WTWO(regd) \
106 "pcmpeqd %%" #regd ", %%" #regd " \n\t" \
107 "psrlw $15, %%" #regd " \n\t" \
108 "psllw $1, %%" #regd " \n\t"::)
/* Software byte-average macros used before MMXEXT's native pavgb existed:
 * avg(a,b) is computed as (a AND b) + ((a XOR b)>>1) for the truncating
 * ("no rounding") form, and (a OR b) - ((a XOR b)>>1) for the rounding
 * form.  The 0xFE mask clears the bits that would shift across byte
 * boundaries in the 64-bit psrlq. */
112 // using regr as temporary and for the output result
113 // first argument is unmodified and second is trashed
114 // regfe is supposed to contain 0xfefefefefefefefe
115 #define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
116 "movq " #rega ", " #regr " \n\t"\
117 "pand " #regb ", " #regr " \n\t"\
118 "pxor " #rega ", " #regb " \n\t"\
119 "pand " #regfe "," #regb " \n\t"\
120 "psrlq $1, " #regb " \n\t"\
121 "paddb " #regb ", " #regr " \n\t"
/* Rounding single-pair average: (a|b) - (((a^b)&0xFE)>>1). */
123 #define PAVGB_MMX(rega, regb, regr, regfe) \
124 "movq " #rega ", " #regr " \n\t"\
125 "por " #regb ", " #regr " \n\t"\
126 "pxor " #rega ", " #regb " \n\t"\
127 "pand " #regfe "," #regb " \n\t"\
128 "psrlq $1, " #regb " \n\t"\
129 "psubb " #regb ", " #regr " \n\t"
/* Two-pair variants: average (rega,regb)->regr and (regc,regd)->regp in
 * one go; the 0xFE mask lives in mm6 rather than a macro argument. */
131 // mm6 is supposed to contain 0xfefefefefefefefe
132 #define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
133 "movq " #rega ", " #regr " \n\t"\
134 "movq " #regc ", " #regp " \n\t"\
135 "pand " #regb ", " #regr " \n\t"\
136 "pand " #regd ", " #regp " \n\t"\
137 "pxor " #rega ", " #regb " \n\t"\
138 "pxor " #regc ", " #regd " \n\t"\
139 "pand %%mm6, " #regb " \n\t"\
140 "pand %%mm6, " #regd " \n\t"\
141 "psrlq $1, " #regb " \n\t"\
142 "psrlq $1, " #regd " \n\t"\
143 "paddb " #regb ", " #regr " \n\t"\
144 "paddb " #regd ", " #regp " \n\t"
146 #define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
147 "movq " #rega ", " #regr " \n\t"\
148 "movq " #regc ", " #regp " \n\t"\
149 "por " #regb ", " #regr " \n\t"\
150 "por " #regd ", " #regp " \n\t"\
151 "pxor " #rega ", " #regb " \n\t"\
152 "pxor " #regc ", " #regd " \n\t"\
153 "pand %%mm6, " #regb " \n\t"\
154 "pand %%mm6, " #regd " \n\t"\
155 "psrlq $1, " #regd " \n\t"\
156 "psrlq $1, " #regb " \n\t"\
157 "psubb " #regb ", " #regr " \n\t"\
158 "psubb " #regd ", " #regp " \n\t"
/* Template instantiation section: the DEF/SET_RND/PAVGBP/PAVGB macros
 * parameterize dsputil_mmx_rnd_template.c and dsputil_mmx_avg_template.c,
 * which are #included repeatedly to stamp out the _mmx, _no_rnd_*_mmx,
 * _3dnow and _mmx2 pixel-copy/average function families.
 * NOTE(review): the #undef lines that normally separate the
 * instantiations are elided here — without them the repeated #defines
 * are redefinitions; confirm against upstream. */
160 /***********************************/
161 /* MMX no rounding */
162 #define DEF(x, y) x ## _no_rnd_ ## y ##_mmx
163 #define SET_RND MOVQ_WONE
164 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
165 #define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
166 #define OP_AVG(a, b, c, e) PAVGB_MMX(a, b, c, e)
168 #include "dsputil_mmx_rnd_template.c"
174 /***********************************/
/* MMX rounding variants. */
177 #define DEF(x, y) x ## _ ## y ##_mmx
178 #define SET_RND MOVQ_WTWO
179 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
180 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
182 #include "dsputil_mmx_rnd_template.c"
190 /***********************************/
/* 3DNow! variants use the native pavgusb byte average. */
193 #define DEF(x) x ## _3dnow
194 #define PAVGB "pavgusb"
197 #include "dsputil_mmx_avg_template.c"
203 /***********************************/
/* MMXEXT (MMX2) variants use the native pavgb byte average. */
206 #define DEF(x) x ## _mmx2
208 /* Introduced only in MMX2 set */
209 #define PAVGB "pavgb"
212 #include "dsputil_mmx_avg_template.c"
/* "put" needs no rounding distinction and is ISA-independent, so the
 * no-rnd / mmx2 / 3dnow names simply alias the plain MMX versions. */
218 #define put_no_rnd_pixels16_mmx put_pixels16_mmx
219 #define put_no_rnd_pixels8_mmx put_pixels8_mmx
220 #define put_pixels16_mmx2 put_pixels16_mmx
221 #define put_pixels8_mmx2 put_pixels8_mmx
222 #define put_pixels4_mmx2 put_pixels4_mmx
223 #define put_no_rnd_pixels16_mmx2 put_no_rnd_pixels16_mmx
224 #define put_no_rnd_pixels8_mmx2 put_no_rnd_pixels8_mmx
225 #define put_pixels16_3dnow put_pixels16_mmx
226 #define put_pixels8_3dnow put_pixels8_mmx
227 #define put_pixels4_3dnow put_pixels4_mmx
228 #define put_no_rnd_pixels16_3dnow put_no_rnd_pixels16_mmx
229 #define put_no_rnd_pixels8_3dnow put_no_rnd_pixels8_mmx
231 /***********************************/
/* ff_put_pixels_clamped_mmx: store a 8x8 block of 16-bit DCT coefficients
 * into an 8-bit pixel buffer, saturating each value to [0,255] via
 * packuswb.  Two asm bodies process 4 rows each: the first addresses the
 * block with an "m" operand, the second with a register pointer.
 * NOTE(review): the function braces, local declarations (pix/p), the
 * "__asm__ volatile(" openers, closing ");" lines and the inter-block
 * pointer advance are elided in this copy — restore from upstream. */
234 void ff_put_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
239 /* read the pixels */
244 "movq %3, %%mm0 \n\t"
245 "movq 8%3, %%mm1 \n\t"
246 "movq 16%3, %%mm2 \n\t"
247 "movq 24%3, %%mm3 \n\t"
248 "movq 32%3, %%mm4 \n\t"
249 "movq 40%3, %%mm5 \n\t"
250 "movq 48%3, %%mm6 \n\t"
251 "movq 56%3, %%mm7 \n\t"
/* Pack pairs of 4x16-bit rows into 8x8-bit rows with unsigned saturation. */
252 "packuswb %%mm1, %%mm0 \n\t"
253 "packuswb %%mm3, %%mm2 \n\t"
254 "packuswb %%mm5, %%mm4 \n\t"
255 "packuswb %%mm7, %%mm6 \n\t"
256 "movq %%mm0, (%0) \n\t"
257 "movq %%mm2, (%0, %1) \n\t"
258 "movq %%mm4, (%0, %1, 2) \n\t"
259 "movq %%mm6, (%0, %2) \n\t"
260 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "m"(*p)
265 // if this were an exact copy of the code above, the
266 // compiler would generate some very strange code
/* Second half: same packing for rows 4-7, source addressed via "r"(p). */
269 "movq (%3), %%mm0 \n\t"
270 "movq 8(%3), %%mm1 \n\t"
271 "movq 16(%3), %%mm2 \n\t"
272 "movq 24(%3), %%mm3 \n\t"
273 "movq 32(%3), %%mm4 \n\t"
274 "movq 40(%3), %%mm5 \n\t"
275 "movq 48(%3), %%mm6 \n\t"
276 "movq 56(%3), %%mm7 \n\t"
277 "packuswb %%mm1, %%mm0 \n\t"
278 "packuswb %%mm3, %%mm2 \n\t"
279 "packuswb %%mm5, %%mm4 \n\t"
280 "packuswb %%mm7, %%mm6 \n\t"
281 "movq %%mm0, (%0) \n\t"
282 "movq %%mm2, (%0, %1) \n\t"
283 "movq %%mm4, (%0, %1, 2) \n\t"
284 "movq %%mm6, (%0, %2) \n\t"
285 ::"r" (pix), "r" ((x86_reg)line_size), "r" ((x86_reg)line_size*3), "r"(p)
/* Emits asm for one half (4 rows) of put_signed_pixels_clamped: pack the
 * signed 16-bit coefficients at byte offset `off` with signed saturation,
 * then bias by +128 (mm0 holds ff_pb_80) to map into unsigned pixel range.
 * Operands (bound by the caller's asm): %0 = dst pixels, %1 = 3*line_skip,
 * %2 = block, %3 = line_skip — presumably; confirm against the caller. */
289 #define put_signed_pixels_clamped_mmx_half(off) \
290 "movq "#off"(%2), %%mm1 \n\t"\
291 "movq 16+"#off"(%2), %%mm2 \n\t"\
292 "movq 32+"#off"(%2), %%mm3 \n\t"\
293 "movq 48+"#off"(%2), %%mm4 \n\t"\
294 "packsswb 8+"#off"(%2), %%mm1 \n\t"\
295 "packsswb 24+"#off"(%2), %%mm2 \n\t"\
296 "packsswb 40+"#off"(%2), %%mm3 \n\t"\
297 "packsswb 56+"#off"(%2), %%mm4 \n\t"\
298 "paddb %%mm0, %%mm1 \n\t"\
299 "paddb %%mm0, %%mm2 \n\t"\
300 "paddb %%mm0, %%mm3 \n\t"\
301 "paddb %%mm0, %%mm4 \n\t"\
302 "movq %%mm1, (%0) \n\t"\
303 "movq %%mm2, (%0, %3) \n\t"\
304 "movq %%mm3, (%0, %3, 2) \n\t"\
305 "movq %%mm4, (%0, %1) \n\t"
/* ff_put_signed_pixels_clamped_mmx: store an 8x8 block of signed DCT
 * coefficients as pixels biased by +128, 4 rows per macro invocation.
 * NOTE(review): the function braces, the declaration of line_skip3, the
 * "__asm__ volatile(" opener and the closing ");" are elided here. */
307 void ff_put_signed_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
309 x86_reg line_skip = line_size;
/* mm0 = 0x80 per byte; %1 (line_skip3) is computed as 3*line_skip. */
313 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
314 "lea (%3, %3, 2), %1 \n\t"
315 put_signed_pixels_clamped_mmx_half(0)
316 "lea (%0, %3, 4), %0 \n\t"
317 put_signed_pixels_clamped_mmx_half(64)
318 :"+&r" (pixels), "=&r" (line_skip3)
319 :"r" (block), "r"(line_skip)
/* ff_add_pixels_clamped_mmx: add 16-bit DCT coefficients onto existing
 * 8-bit pixels with saturation (two rows per asm body shown).  The pixel
 * bytes are widened with punpck{l,h}bw against mm7 (presumably zeroed by
 * an elided MOVQ_ZERO), summed with paddsw, then re-packed unsigned.
 * NOTE(review): function braces, locals, the loop over all 8 rows, the
 * asm opener/closer and input operand list are elided in this copy. */
323 void ff_add_pixels_clamped_mmx(const DCTELEM *block, uint8_t *pixels, int line_size)
329 /* read the pixels */
336 "movq (%2), %%mm0 \n\t"
337 "movq 8(%2), %%mm1 \n\t"
338 "movq 16(%2), %%mm2 \n\t"
339 "movq 24(%2), %%mm3 \n\t"
340 "movq %0, %%mm4 \n\t"
341 "movq %1, %%mm6 \n\t"
342 "movq %%mm4, %%mm5 \n\t"
343 "punpcklbw %%mm7, %%mm4 \n\t"
344 "punpckhbw %%mm7, %%mm5 \n\t"
345 "paddsw %%mm4, %%mm0 \n\t"
346 "paddsw %%mm5, %%mm1 \n\t"
347 "movq %%mm6, %%mm5 \n\t"
348 "punpcklbw %%mm7, %%mm6 \n\t"
349 "punpckhbw %%mm7, %%mm5 \n\t"
350 "paddsw %%mm6, %%mm2 \n\t"
351 "paddsw %%mm5, %%mm3 \n\t"
352 "packuswb %%mm1, %%mm0 \n\t"
353 "packuswb %%mm3, %%mm2 \n\t"
354 "movq %%mm0, %0 \n\t"
355 "movq %%mm2, %1 \n\t"
356 :"+m"(*pix), "+m"(*(pix+line_size))
/* put_pixels4_mmx: copy a 4-wide block from pixels to block, 4 rows per
 * loop iteration (two movd pairs), stepping both pointers by 2*line_size
 * via REG_a.  NOTE(review): the asm opener, the loop label/branch on h,
 * and the closing paren are elided in this copy. */
364 static void put_pixels4_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
367 "lea (%3, %3), %%"REG_a" \n\t"
370 "movd (%1), %%mm0 \n\t"
371 "movd (%1, %3), %%mm1 \n\t"
372 "movd %%mm0, (%2) \n\t"
373 "movd %%mm1, (%2, %3) \n\t"
374 "add %%"REG_a", %1 \n\t"
375 "add %%"REG_a", %2 \n\t"
376 "movd (%1), %%mm0 \n\t"
377 "movd (%1, %3), %%mm1 \n\t"
378 "movd %%mm0, (%2) \n\t"
379 "movd %%mm1, (%2, %3) \n\t"
380 "add %%"REG_a", %1 \n\t"
381 "add %%"REG_a", %2 \n\t"
384 : "+g"(h), "+r" (pixels), "+r" (block)
385 : "r"((x86_reg)line_size)
/* put_pixels8_mmx: copy an 8-wide block, 4 rows per loop iteration using
 * movq.  Same structure as put_pixels4_mmx with 8-byte transfers.
 * NOTE(review): asm opener, loop label/branch and closer elided. */
390 static void put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
393 "lea (%3, %3), %%"REG_a" \n\t"
396 "movq (%1), %%mm0 \n\t"
397 "movq (%1, %3), %%mm1 \n\t"
398 "movq %%mm0, (%2) \n\t"
399 "movq %%mm1, (%2, %3) \n\t"
400 "add %%"REG_a", %1 \n\t"
401 "add %%"REG_a", %2 \n\t"
402 "movq (%1), %%mm0 \n\t"
403 "movq (%1, %3), %%mm1 \n\t"
404 "movq %%mm0, (%2) \n\t"
405 "movq %%mm1, (%2, %3) \n\t"
406 "add %%"REG_a", %1 \n\t"
407 "add %%"REG_a", %2 \n\t"
410 : "+g"(h), "+r" (pixels), "+r" (block)
411 : "r"((x86_reg)line_size)
/* put_pixels16_mmx: copy a 16-wide block, 4 rows per loop iteration —
 * each row needs two movq (offsets 0 and 8).
 * NOTE(review): asm opener, loop label/branch and closer elided. */
416 static void put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, int line_size, int h)
419 "lea (%3, %3), %%"REG_a" \n\t"
422 "movq (%1), %%mm0 \n\t"
423 "movq 8(%1), %%mm4 \n\t"
424 "movq (%1, %3), %%mm1 \n\t"
425 "movq 8(%1, %3), %%mm5 \n\t"
426 "movq %%mm0, (%2) \n\t"
427 "movq %%mm4, 8(%2) \n\t"
428 "movq %%mm1, (%2, %3) \n\t"
429 "movq %%mm5, 8(%2, %3) \n\t"
430 "add %%"REG_a", %1 \n\t"
431 "add %%"REG_a", %2 \n\t"
432 "movq (%1), %%mm0 \n\t"
433 "movq 8(%1), %%mm4 \n\t"
434 "movq (%1, %3), %%mm1 \n\t"
435 "movq 8(%1, %3), %%mm5 \n\t"
436 "movq %%mm0, (%2) \n\t"
437 "movq %%mm4, 8(%2) \n\t"
438 "movq %%mm1, (%2, %3) \n\t"
439 "movq %%mm5, 8(%2, %3) \n\t"
440 "add %%"REG_a", %1 \n\t"
441 "add %%"REG_a", %2 \n\t"
444 : "+g"(h), "+r" (pixels), "+r" (block)
445 : "r"((x86_reg)line_size)
/* put_pixels16_sse2: copy a 16-wide block 4 rows at a time; unaligned
 * loads (movdqu) from pixels, aligned stores (movdqa) to block — so
 * block is assumed 16-byte aligned.  %4 holds 3*line_size.
 * NOTE(review): asm opener, loop label/branch on h and closer elided. */
450 static void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
454 "movdqu (%1), %%xmm0 \n\t"
455 "movdqu (%1,%3), %%xmm1 \n\t"
456 "movdqu (%1,%3,2), %%xmm2 \n\t"
457 "movdqu (%1,%4), %%xmm3 \n\t"
458 "movdqa %%xmm0, (%2) \n\t"
459 "movdqa %%xmm1, (%2,%3) \n\t"
460 "movdqa %%xmm2, (%2,%3,2) \n\t"
461 "movdqa %%xmm3, (%2,%4) \n\t"
463 "lea (%1,%3,4), %1 \n\t"
464 "lea (%2,%3,4), %2 \n\t"
466 : "+g"(h), "+r" (pixels), "+r" (block)
467 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* avg_pixels16_sse2: like put_pixels16_sse2, but rounds-to-average the
 * source rows into the existing destination via pavgb before storing.
 * NOTE(review): asm opener, loop label/branch on h and closer elided. */
472 static void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, int line_size, int h)
476 "movdqu (%1), %%xmm0 \n\t"
477 "movdqu (%1,%3), %%xmm1 \n\t"
478 "movdqu (%1,%3,2), %%xmm2 \n\t"
479 "movdqu (%1,%4), %%xmm3 \n\t"
480 "pavgb (%2), %%xmm0 \n\t"
481 "pavgb (%2,%3), %%xmm1 \n\t"
482 "pavgb (%2,%3,2), %%xmm2 \n\t"
483 "pavgb (%2,%4), %%xmm3 \n\t"
484 "movdqa %%xmm0, (%2) \n\t"
485 "movdqa %%xmm1, (%2,%3) \n\t"
486 "movdqa %%xmm2, (%2,%3,2) \n\t"
487 "movdqa %%xmm3, (%2,%4) \n\t"
489 "lea (%1,%3,4), %1 \n\t"
490 "lea (%2,%3,4), %2 \n\t"
492 : "+g"(h), "+r" (pixels), "+r" (block)
493 : "r"((x86_reg)line_size), "r"((x86_reg)3L*line_size)
/* CLEAR_BLOCKS(name, n): generate a function that zeroes n consecutive
 * 64-entry DCTELEM blocks (128 bytes each) with mm7=0, writing 32 bytes
 * per iteration.  REG_a counts up from -128*n to 0 so the base operand
 * can point at the end of the region.
 * NOTE(review): the function's opening brace, the asm opener, the loop
 * label/branch, and the remaining operand/closer lines are elided. */
498 #define CLEAR_BLOCKS(name,n) \
499 static void name(DCTELEM *blocks)\
502 "pxor %%mm7, %%mm7 \n\t"\
503 "mov %1, %%"REG_a" \n\t"\
505 "movq %%mm7, (%0, %%"REG_a") \n\t"\
506 "movq %%mm7, 8(%0, %%"REG_a") \n\t"\
507 "movq %%mm7, 16(%0, %%"REG_a") \n\t"\
508 "movq %%mm7, 24(%0, %%"REG_a") \n\t"\
509 "add $32, %%"REG_a" \n\t"\
511 : : "r" (((uint8_t *)blocks)+128*n),\
/* Instantiations: 6 blocks (one macroblock) and a single block. */
516 CLEAR_BLOCKS(clear_blocks_mmx, 6)
517 CLEAR_BLOCKS(clear_block_mmx, 1)
/* clear_block_sse: zero one 128-byte DCT block with eight aligned 16-byte
 * stores (block must be 16-byte aligned for movaps).
 * NOTE(review): opening brace, asm opener and operand/closer elided. */
519 static void clear_block_sse(DCTELEM *block)
522 "xorps %%xmm0, %%xmm0 \n"
523 "movaps %%xmm0, (%0) \n"
524 "movaps %%xmm0, 16(%0) \n"
525 "movaps %%xmm0, 32(%0) \n"
526 "movaps %%xmm0, 48(%0) \n"
527 "movaps %%xmm0, 64(%0) \n"
528 "movaps %%xmm0, 80(%0) \n"
529 "movaps %%xmm0, 96(%0) \n"
530 "movaps %%xmm0, 112(%0) \n"
/* clear_blocks_sse: zero 6 consecutive 128-byte DCT blocks, 128 bytes per
 * loop iteration; same negative-offset counting trick as CLEAR_BLOCKS.
 * NOTE(review): opening brace, asm opener, loop label/branch and the
 * remaining operand/closer lines are elided in this copy. */
536 static void clear_blocks_sse(DCTELEM *blocks)
539 "xorps %%xmm0, %%xmm0 \n"
540 "mov %1, %%"REG_a" \n"
542 "movaps %%xmm0, (%0, %%"REG_a") \n"
543 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
544 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
545 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
546 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
547 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
548 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
549 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
550 "add $128, %%"REG_a" \n"
552 : : "r" (((uint8_t *)blocks)+128*6),
/* add_bytes_mmx: dst[i] += src[i] for i in [0,w), 16 bytes per MMX loop
 * iteration (wrapping byte adds via paddb), with a scalar tail loop for
 * the remaining (w & 15) bytes.
 * NOTE(review): the function braces, the counter setup, the asm opener,
 * the loop label/branch and the scalar tail's for-loop are elided. */
558 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w){
563 "movq (%1, %0), %%mm0 \n\t"
564 "movq (%2, %0), %%mm1 \n\t"
565 "paddb %%mm0, %%mm1 \n\t"
566 "movq %%mm1, (%2, %0) \n\t"
567 "movq 8(%1, %0), %%mm0 \n\t"
568 "movq 8(%2, %0), %%mm1 \n\t"
569 "paddb %%mm0, %%mm1 \n\t"
570 "movq %%mm1, 8(%2, %0) \n\t"
576 : "r"(src), "r"(dst), "r"((x86_reg)w-15)
/* Scalar tail for the last w%16 bytes. */
579 dst[i+0] += src[i+0];
/* add_hfyu_median_prediction_cmov: HuffYUV median predictor using cmov;
 * reconstructs dst from diff, the row above (top) and running left /
 * left-top state.  Guarded by HAVE_7REGS && HAVE_TEN_OPERANDS since the
 * asm needs many registers/operands.
 * NOTE(review): most of the asm body (the median computation between the
 * movzbl load and the final store) plus the function close and the store
 * of l/tl back through the pointers are elided in this copy. */
582 #if HAVE_7REGS && HAVE_TEN_OPERANDS
583 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) {
586 int l = *left & 0xff;
587 int tl = *left_top & 0xff;
592 "movzbl (%3,%4), %2 \n"
605 "add (%6,%4), %b0 \n"
606 "mov %b0, (%5,%4) \n"
609 :"+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
610 :"r"(dst+w), "r"(diff+w), "rm"(top+w)
/* H263_LOOP_FILTER: shared asm body for the H.263 in-loop deblocking
 * filter.  Operands: %0,%1,%2,%3 are the four 8-pixel rows (or transposed
 * columns) around the block edge, %4 is 2*strength, %5 is the ff_pb_FC
 * mask.  Computes d = (p0-p1)*? per the H.263 filter, clips it against
 * the strength, and leaves the filtered rows in mm3/mm4/mm5/mm6 for the
 * caller to store back. */
617 #define H263_LOOP_FILTER \
618 "pxor %%mm7, %%mm7 \n\t"\
619 "movq %0, %%mm0 \n\t"\
620 "movq %0, %%mm1 \n\t"\
621 "movq %3, %%mm2 \n\t"\
622 "movq %3, %%mm3 \n\t"\
623 "punpcklbw %%mm7, %%mm0 \n\t"\
624 "punpckhbw %%mm7, %%mm1 \n\t"\
625 "punpcklbw %%mm7, %%mm2 \n\t"\
626 "punpckhbw %%mm7, %%mm3 \n\t"\
627 "psubw %%mm2, %%mm0 \n\t"\
628 "psubw %%mm3, %%mm1 \n\t"\
629 "movq %1, %%mm2 \n\t"\
630 "movq %1, %%mm3 \n\t"\
631 "movq %2, %%mm4 \n\t"\
632 "movq %2, %%mm5 \n\t"\
633 "punpcklbw %%mm7, %%mm2 \n\t"\
634 "punpckhbw %%mm7, %%mm3 \n\t"\
635 "punpcklbw %%mm7, %%mm4 \n\t"\
636 "punpckhbw %%mm7, %%mm5 \n\t"\
637 "psubw %%mm2, %%mm4 \n\t"\
638 "psubw %%mm3, %%mm5 \n\t"\
639 "psllw $2, %%mm4 \n\t"\
640 "psllw $2, %%mm5 \n\t"\
641 "paddw %%mm0, %%mm4 \n\t"\
642 "paddw %%mm1, %%mm5 \n\t"\
643 "pxor %%mm6, %%mm6 \n\t"\
644 "pcmpgtw %%mm4, %%mm6 \n\t"\
645 "pcmpgtw %%mm5, %%mm7 \n\t"\
646 "pxor %%mm6, %%mm4 \n\t"\
647 "pxor %%mm7, %%mm5 \n\t"\
648 "psubw %%mm6, %%mm4 \n\t"\
649 "psubw %%mm7, %%mm5 \n\t"\
650 "psrlw $3, %%mm4 \n\t"\
651 "psrlw $3, %%mm5 \n\t"\
652 "packuswb %%mm5, %%mm4 \n\t"\
653 "packsswb %%mm7, %%mm6 \n\t"\
654 "pxor %%mm7, %%mm7 \n\t"\
655 "movd %4, %%mm2 \n\t"\
656 "punpcklbw %%mm2, %%mm2 \n\t"\
657 "punpcklbw %%mm2, %%mm2 \n\t"\
658 "punpcklbw %%mm2, %%mm2 \n\t"\
659 "psubusb %%mm4, %%mm2 \n\t"\
660 "movq %%mm2, %%mm3 \n\t"\
661 "psubusb %%mm4, %%mm3 \n\t"\
662 "psubb %%mm3, %%mm2 \n\t"\
663 "movq %1, %%mm3 \n\t"\
664 "movq %2, %%mm4 \n\t"\
665 "pxor %%mm6, %%mm3 \n\t"\
666 "pxor %%mm6, %%mm4 \n\t"\
667 "paddusb %%mm2, %%mm3 \n\t"\
668 "psubusb %%mm2, %%mm4 \n\t"\
669 "pxor %%mm6, %%mm3 \n\t"\
670 "pxor %%mm6, %%mm4 \n\t"\
671 "paddusb %%mm2, %%mm2 \n\t"\
672 "packsswb %%mm1, %%mm0 \n\t"\
673 "pcmpgtb %%mm0, %%mm7 \n\t"\
674 "pxor %%mm7, %%mm0 \n\t"\
675 "psubb %%mm7, %%mm0 \n\t"\
676 "movq %%mm0, %%mm1 \n\t"\
677 "psubusb %%mm2, %%mm0 \n\t"\
678 "psubb %%mm0, %%mm1 \n\t"\
679 "pand %5, %%mm1 \n\t"\
680 "psrlw $2, %%mm1 \n\t"\
681 "pxor %%mm7, %%mm1 \n\t"\
682 "psubb %%mm7, %%mm1 \n\t"\
683 "movq %0, %%mm5 \n\t"\
684 "movq %3, %%mm6 \n\t"\
685 "psubb %%mm1, %%mm5 \n\t"\
686 "paddb %%mm1, %%mm6 \n\t"
/* h263_v_loop_filter_mmx: vertical-edge H.263 deblocking — runs
 * H263_LOOP_FILTER on the four rows straddling the edge and stores the
 * filtered results (mm3..mm6) back to the two rows above and below.
 * NOTE(review): the asm opener and the H263_LOOP_FILTER invocation line
 * are elided in this copy; the visible part is only the store-back and
 * operand list. */
688 static void h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale){
689 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
690 const int strength= ff_h263_loop_filter_strength[qscale];
696 "movq %%mm3, %1 \n\t"
697 "movq %%mm4, %2 \n\t"
698 "movq %%mm5, %0 \n\t"
699 "movq %%mm6, %3 \n\t"
700 : "+m" (*(uint64_t*)(src - 2*stride)),
701 "+m" (*(uint64_t*)(src - 1*stride)),
702 "+m" (*(uint64_t*)(src + 0*stride)),
703 "+m" (*(uint64_t*)(src + 1*stride))
704 : "g" (2*strength), "m"(ff_pb_FC)
/* h263_h_loop_filter_mmx: horizontal-edge H.263 deblocking.  Transposes
 * the 8x4 region around the edge into a temp buffer, applies
 * H263_LOOP_FILTER on it, then re-transposes the filtered registers
 * (mm3..mm6) back into the image with the punpck/movd sequence below.
 * NOTE(review): the first asm's opener/output list and the second asm's
 * opener/first operand are elided in this copy. */
709 static void h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale){
710 if(CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
711 const int strength= ff_h263_loop_filter_strength[qscale];
712 DECLARE_ALIGNED(8, uint64_t, temp)[4];
713 uint8_t *btemp= (uint8_t*)temp;
717 transpose4x4(btemp , src , 8, stride);
718 transpose4x4(btemp+4, src + 4*stride, 8, stride);
720 H263_LOOP_FILTER // 5 3 4 6
726 : "g" (2*strength), "m"(ff_pb_FC)
/* Re-transpose the filtered columns and scatter them back as rows. */
730 "movq %%mm5, %%mm1 \n\t"
731 "movq %%mm4, %%mm0 \n\t"
732 "punpcklbw %%mm3, %%mm5 \n\t"
733 "punpcklbw %%mm6, %%mm4 \n\t"
734 "punpckhbw %%mm3, %%mm1 \n\t"
735 "punpckhbw %%mm6, %%mm0 \n\t"
736 "movq %%mm5, %%mm3 \n\t"
737 "movq %%mm1, %%mm6 \n\t"
738 "punpcklwd %%mm4, %%mm5 \n\t"
739 "punpcklwd %%mm0, %%mm1 \n\t"
740 "punpckhwd %%mm4, %%mm3 \n\t"
741 "punpckhwd %%mm0, %%mm6 \n\t"
742 "movd %%mm5, (%0) \n\t"
743 "punpckhdq %%mm5, %%mm5 \n\t"
744 "movd %%mm5, (%0,%2) \n\t"
745 "movd %%mm3, (%0,%2,2) \n\t"
746 "punpckhdq %%mm3, %%mm3 \n\t"
747 "movd %%mm3, (%0,%3) \n\t"
748 "movd %%mm1, (%1) \n\t"
749 "punpckhdq %%mm1, %%mm1 \n\t"
750 "movd %%mm1, (%1,%2) \n\t"
751 "movd %%mm6, (%1,%2,2) \n\t"
752 "punpckhdq %%mm6, %%mm6 \n\t"
753 "movd %%mm6, (%1,%3) \n\t"
755 "r" (src + 4*stride),
756 "r" ((x86_reg) stride ),
757 "r" ((x86_reg)(3*stride))
762 /* draw the edges of width 'w' of an image of size width, height
763 this mmx version can only handle w==8 || w==16 */
/* draw_edges_mmx: replicate the image border pixels outward to pad the
 * picture edges (left/right by byte-splatting the edge pixel, top/bottom
 * by copying whole edge rows), gated by the `sides` EDGE_* flags.
 * NOTE(review): the left/right branch selection on w (8 vs 16), the asm
 * openers, loop labels/branches and closers are all elided in this copy. */
764 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, int w, int sides)
766 uint8_t *ptr, *last_line;
769 last_line = buf + (height - 1) * wrap;
/* Left/right padding, w==8 variant: splat the first/last pixel of each
 * row into the 8 bytes outside the row. */
776 "movd (%0), %%mm0 \n\t"
777 "punpcklbw %%mm0, %%mm0 \n\t"
778 "punpcklwd %%mm0, %%mm0 \n\t"
779 "punpckldq %%mm0, %%mm0 \n\t"
780 "movq %%mm0, -8(%0) \n\t"
781 "movq -8(%0, %2), %%mm1 \n\t"
782 "punpckhbw %%mm1, %%mm1 \n\t"
783 "punpckhwd %%mm1, %%mm1 \n\t"
784 "punpckhdq %%mm1, %%mm1 \n\t"
785 "movq %%mm1, (%0, %2) \n\t"
790 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
/* w==16 variant: same splat but write 16 bytes on each side. */
797 "movd (%0), %%mm0 \n\t"
798 "punpcklbw %%mm0, %%mm0 \n\t"
799 "punpcklwd %%mm0, %%mm0 \n\t"
800 "punpckldq %%mm0, %%mm0 \n\t"
801 "movq %%mm0, -8(%0) \n\t"
802 "movq %%mm0, -16(%0) \n\t"
803 "movq -8(%0, %2), %%mm1 \n\t"
804 "punpckhbw %%mm1, %%mm1 \n\t"
805 "punpckhwd %%mm1, %%mm1 \n\t"
806 "punpckhdq %%mm1, %%mm1 \n\t"
807 "movq %%mm1, (%0, %2) \n\t"
808 "movq %%mm1, 8(%0, %2) \n\t"
813 : "r" ((x86_reg)wrap), "r" ((x86_reg)width), "r" (ptr + wrap*height)
817 /* top and bottom (and hopefully also the corners) */
818 if (sides&EDGE_TOP) {
819 for(i = 0; i < w; i += 4) {
820 ptr= buf - (i + 1) * wrap - w;
/* Copy the first image row upward into 4 padding rows at a time. */
823 "movq (%1, %0), %%mm0 \n\t"
824 "movq %%mm0, (%0) \n\t"
825 "movq %%mm0, (%0, %2) \n\t"
826 "movq %%mm0, (%0, %2, 2) \n\t"
827 "movq %%mm0, (%0, %3) \n\t"
832 : "r" ((x86_reg)buf - (x86_reg)ptr - w), "r" ((x86_reg)-wrap), "r" ((x86_reg)-wrap*3), "r" (ptr+width+2*w)
837 if (sides&EDGE_BOTTOM) {
838 for(i = 0; i < w; i += 4) {
839 ptr= last_line + (i + 1) * wrap - w;
/* Copy the last image row downward into 4 padding rows at a time. */
842 "movq (%1, %0), %%mm0 \n\t"
843 "movq %%mm0, (%0) \n\t"
844 "movq %%mm0, (%0, %2) \n\t"
845 "movq %%mm0, (%0, %2, 2) \n\t"
846 "movq %%mm0, (%0, %3) \n\t"
851 : "r" ((x86_reg)last_line - (x86_reg)ptr - w), "r" ((x86_reg)wrap), "r" ((x86_reg)wrap*3), "r" (ptr+width+2*w)
/* QPEL_V_LOW: one output row of the MPEG-4 quarter-pel vertical lowpass
 * filter, computing (20*x1 - 6*x2 + 3*x3 - x4 + rnd) >> 5 per pixel from
 * six input rows (the FIR taps 1,-3?,... follow the standard 6-tap
 * (1,-5,20,20,-5,1)-derived decomposition used throughout this file —
 * see the x1..x4 comments).  Clobbers mm4/mm5/mm6; result is written via
 * the OP() macro (put or avg). */
857 #define QPEL_V_LOW(m3,m4,m5,m6, pw_20, pw_3, rnd, in0, in1, in2, in7, out, OP)\
858 "paddw " #m4 ", " #m3 " \n\t" /* x1 */\
859 "movq "MANGLE(ff_pw_20)", %%mm4 \n\t" /* 20 */\
860 "pmullw " #m3 ", %%mm4 \n\t" /* 20x1 */\
861 "movq "#in7", " #m3 " \n\t" /* d */\
862 "movq "#in0", %%mm5 \n\t" /* D */\
863 "paddw " #m3 ", %%mm5 \n\t" /* x4 */\
864 "psubw %%mm5, %%mm4 \n\t" /* 20x1 - x4 */\
865 "movq "#in1", %%mm5 \n\t" /* C */\
866 "movq "#in2", %%mm6 \n\t" /* B */\
867 "paddw " #m6 ", %%mm5 \n\t" /* x3 */\
868 "paddw " #m5 ", %%mm6 \n\t" /* x2 */\
869 "paddw %%mm6, %%mm6 \n\t" /* 2x2 */\
870 "psubw %%mm6, %%mm5 \n\t" /* -2x2 + x3 */\
871 "pmullw "MANGLE(ff_pw_3)", %%mm5 \n\t" /* -6x2 + 3x3 */\
872 "paddw " #rnd ", %%mm4 \n\t" /* x2 */\
873 "paddw %%mm4, %%mm5 \n\t" /* 20x1 - 6x2 + 3x3 - x4 */\
874 "psraw $5, %%mm5 \n\t"\
875 "packuswb %%mm5, %%mm5 \n\t"\
876 OP(%%mm5, out, %%mm7, d)
878 #define QPEL_BASE(OPNAME, ROUNDER, RND, OP_MMX2, OP_3DNOW)\
879 static void OPNAME ## mpeg4_qpel16_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
883 "pxor %%mm7, %%mm7 \n\t"\
885 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
886 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
887 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
888 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
889 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
890 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
891 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
892 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
893 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
894 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
895 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
896 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
897 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
898 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
899 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
900 "paddw %%mm3, %%mm5 \n\t" /* b */\
901 "paddw %%mm2, %%mm6 \n\t" /* c */\
902 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
903 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
904 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
905 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
906 "paddw %%mm4, %%mm0 \n\t" /* a */\
907 "paddw %%mm1, %%mm5 \n\t" /* d */\
908 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
909 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
910 "paddw %6, %%mm6 \n\t"\
911 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
912 "psraw $5, %%mm0 \n\t"\
913 "movq %%mm0, %5 \n\t"\
914 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
916 "movq 5(%0), %%mm0 \n\t" /* FGHIJKLM */\
917 "movq %%mm0, %%mm5 \n\t" /* FGHIJKLM */\
918 "movq %%mm0, %%mm6 \n\t" /* FGHIJKLM */\
919 "psrlq $8, %%mm0 \n\t" /* GHIJKLM0 */\
920 "psrlq $16, %%mm5 \n\t" /* HIJKLM00 */\
921 "punpcklbw %%mm7, %%mm0 \n\t" /* 0G0H0I0J */\
922 "punpcklbw %%mm7, %%mm5 \n\t" /* 0H0I0J0K */\
923 "paddw %%mm0, %%mm2 \n\t" /* b */\
924 "paddw %%mm5, %%mm3 \n\t" /* c */\
925 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
926 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
927 "movq %%mm6, %%mm2 \n\t" /* FGHIJKLM */\
928 "psrlq $24, %%mm6 \n\t" /* IJKLM000 */\
929 "punpcklbw %%mm7, %%mm2 \n\t" /* 0F0G0H0I */\
930 "punpcklbw %%mm7, %%mm6 \n\t" /* 0I0J0K0L */\
931 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
932 "paddw %%mm2, %%mm1 \n\t" /* a */\
933 "paddw %%mm6, %%mm4 \n\t" /* d */\
934 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
935 "psubw %%mm4, %%mm3 \n\t" /* - 6b +3c - d */\
936 "paddw %6, %%mm1 \n\t"\
937 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b +3c - d */\
938 "psraw $5, %%mm3 \n\t"\
939 "movq %5, %%mm1 \n\t"\
940 "packuswb %%mm3, %%mm1 \n\t"\
941 OP_MMX2(%%mm1, (%1),%%mm4, q)\
942 /* mm0= GHIJ, mm2=FGHI, mm5=HIJK, mm6=IJKL, mm7=0 */\
944 "movq 9(%0), %%mm1 \n\t" /* JKLMNOPQ */\
945 "movq %%mm1, %%mm4 \n\t" /* JKLMNOPQ */\
946 "movq %%mm1, %%mm3 \n\t" /* JKLMNOPQ */\
947 "psrlq $8, %%mm1 \n\t" /* KLMNOPQ0 */\
948 "psrlq $16, %%mm4 \n\t" /* LMNOPQ00 */\
949 "punpcklbw %%mm7, %%mm1 \n\t" /* 0K0L0M0N */\
950 "punpcklbw %%mm7, %%mm4 \n\t" /* 0L0M0N0O */\
951 "paddw %%mm1, %%mm5 \n\t" /* b */\
952 "paddw %%mm4, %%mm0 \n\t" /* c */\
953 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
954 "psubw %%mm5, %%mm0 \n\t" /* c - 2b */\
955 "movq %%mm3, %%mm5 \n\t" /* JKLMNOPQ */\
956 "psrlq $24, %%mm3 \n\t" /* MNOPQ000 */\
957 "pmullw "MANGLE(ff_pw_3)", %%mm0 \n\t" /* 3c - 6b */\
958 "punpcklbw %%mm7, %%mm3 \n\t" /* 0M0N0O0P */\
959 "paddw %%mm3, %%mm2 \n\t" /* d */\
960 "psubw %%mm2, %%mm0 \n\t" /* -6b + 3c - d */\
961 "movq %%mm5, %%mm2 \n\t" /* JKLMNOPQ */\
962 "punpcklbw %%mm7, %%mm2 \n\t" /* 0J0K0L0M */\
963 "punpckhbw %%mm7, %%mm5 \n\t" /* 0N0O0P0Q */\
964 "paddw %%mm2, %%mm6 \n\t" /* a */\
965 "pmullw "MANGLE(ff_pw_20)", %%mm6 \n\t" /* 20a */\
966 "paddw %6, %%mm0 \n\t"\
967 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
968 "psraw $5, %%mm0 \n\t"\
969 /* mm1=KLMN, mm2=JKLM, mm3=MNOP, mm4=LMNO, mm5=NOPQ mm7=0 */\
971 "paddw %%mm5, %%mm3 \n\t" /* a */\
972 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0O0P0Q0Q */\
973 "paddw %%mm4, %%mm6 \n\t" /* b */\
974 "pshufw $0xBE, %%mm5, %%mm4 \n\t" /* 0P0Q0Q0P */\
975 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0Q0Q0P0O */\
976 "paddw %%mm1, %%mm4 \n\t" /* c */\
977 "paddw %%mm2, %%mm5 \n\t" /* d */\
978 "paddw %%mm6, %%mm6 \n\t" /* 2b */\
979 "psubw %%mm6, %%mm4 \n\t" /* c - 2b */\
980 "pmullw "MANGLE(ff_pw_20)", %%mm3 \n\t" /* 20a */\
981 "pmullw "MANGLE(ff_pw_3)", %%mm4 \n\t" /* 3c - 6b */\
982 "psubw %%mm5, %%mm3 \n\t" /* -6b + 3c - d */\
983 "paddw %6, %%mm4 \n\t"\
984 "paddw %%mm3, %%mm4 \n\t" /* 20a - 6b + 3c - d */\
985 "psraw $5, %%mm4 \n\t"\
986 "packuswb %%mm4, %%mm0 \n\t"\
987 OP_MMX2(%%mm0, 8(%1), %%mm4, q)\
993 : "+a"(src), "+c"(dst), "+D"(h)\
994 : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(temp), "m"(ROUNDER)\
999 static void OPNAME ## mpeg4_qpel16_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1002 /* quick HACK, XXX FIXME MUST be optimized */\
1005 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1006 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1007 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1008 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1009 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1010 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 9]);\
1011 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 9])*3 - (src[ 3]+src[10]);\
1012 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 9])*6 + (src[ 5]+src[10])*3 - (src[ 4]+src[11]);\
1013 temp[ 8]= (src[ 8]+src[ 9])*20 - (src[ 7]+src[10])*6 + (src[ 6]+src[11])*3 - (src[ 5]+src[12]);\
1014 temp[ 9]= (src[ 9]+src[10])*20 - (src[ 8]+src[11])*6 + (src[ 7]+src[12])*3 - (src[ 6]+src[13]);\
1015 temp[10]= (src[10]+src[11])*20 - (src[ 9]+src[12])*6 + (src[ 8]+src[13])*3 - (src[ 7]+src[14]);\
1016 temp[11]= (src[11]+src[12])*20 - (src[10]+src[13])*6 + (src[ 9]+src[14])*3 - (src[ 8]+src[15]);\
1017 temp[12]= (src[12]+src[13])*20 - (src[11]+src[14])*6 + (src[10]+src[15])*3 - (src[ 9]+src[16]);\
1018 temp[13]= (src[13]+src[14])*20 - (src[12]+src[15])*6 + (src[11]+src[16])*3 - (src[10]+src[16]);\
1019 temp[14]= (src[14]+src[15])*20 - (src[13]+src[16])*6 + (src[12]+src[16])*3 - (src[11]+src[15]);\
1020 temp[15]= (src[15]+src[16])*20 - (src[14]+src[16])*6 + (src[13]+src[15])*3 - (src[12]+src[14]);\
1022 "movq (%0), %%mm0 \n\t"\
1023 "movq 8(%0), %%mm1 \n\t"\
1024 "paddw %2, %%mm0 \n\t"\
1025 "paddw %2, %%mm1 \n\t"\
1026 "psraw $5, %%mm0 \n\t"\
1027 "psraw $5, %%mm1 \n\t"\
1028 "packuswb %%mm1, %%mm0 \n\t"\
1029 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1030 "movq 16(%0), %%mm0 \n\t"\
1031 "movq 24(%0), %%mm1 \n\t"\
1032 "paddw %2, %%mm0 \n\t"\
1033 "paddw %2, %%mm1 \n\t"\
1034 "psraw $5, %%mm0 \n\t"\
1035 "psraw $5, %%mm1 \n\t"\
1036 "packuswb %%mm1, %%mm0 \n\t"\
1037 OP_3DNOW(%%mm0, 8(%1), %%mm1, q)\
1038 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1046 static void OPNAME ## mpeg4_qpel8_h_lowpass_mmx2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1048 "pxor %%mm7, %%mm7 \n\t"\
1050 "movq (%0), %%mm0 \n\t" /* ABCDEFGH */\
1051 "movq %%mm0, %%mm1 \n\t" /* ABCDEFGH */\
1052 "movq %%mm0, %%mm2 \n\t" /* ABCDEFGH */\
1053 "punpcklbw %%mm7, %%mm0 \n\t" /* 0A0B0C0D */\
1054 "punpckhbw %%mm7, %%mm1 \n\t" /* 0E0F0G0H */\
1055 "pshufw $0x90, %%mm0, %%mm5 \n\t" /* 0A0A0B0C */\
1056 "pshufw $0x41, %%mm0, %%mm6 \n\t" /* 0B0A0A0B */\
1057 "movq %%mm2, %%mm3 \n\t" /* ABCDEFGH */\
1058 "movq %%mm2, %%mm4 \n\t" /* ABCDEFGH */\
1059 "psllq $8, %%mm2 \n\t" /* 0ABCDEFG */\
1060 "psllq $16, %%mm3 \n\t" /* 00ABCDEF */\
1061 "psllq $24, %%mm4 \n\t" /* 000ABCDE */\
1062 "punpckhbw %%mm7, %%mm2 \n\t" /* 0D0E0F0G */\
1063 "punpckhbw %%mm7, %%mm3 \n\t" /* 0C0D0E0F */\
1064 "punpckhbw %%mm7, %%mm4 \n\t" /* 0B0C0D0E */\
1065 "paddw %%mm3, %%mm5 \n\t" /* b */\
1066 "paddw %%mm2, %%mm6 \n\t" /* c */\
1067 "paddw %%mm5, %%mm5 \n\t" /* 2b */\
1068 "psubw %%mm5, %%mm6 \n\t" /* c - 2b */\
1069 "pshufw $0x06, %%mm0, %%mm5 \n\t" /* 0C0B0A0A */\
1070 "pmullw "MANGLE(ff_pw_3)", %%mm6 \n\t" /* 3c - 6b */\
1071 "paddw %%mm4, %%mm0 \n\t" /* a */\
1072 "paddw %%mm1, %%mm5 \n\t" /* d */\
1073 "pmullw "MANGLE(ff_pw_20)", %%mm0 \n\t" /* 20a */\
1074 "psubw %%mm5, %%mm0 \n\t" /* 20a - d */\
1075 "paddw %5, %%mm6 \n\t"\
1076 "paddw %%mm6, %%mm0 \n\t" /* 20a - 6b + 3c - d */\
1077 "psraw $5, %%mm0 \n\t"\
1078 /* mm1=EFGH, mm2=DEFG, mm3=CDEF, mm4=BCDE, mm7=0 */\
1080 "movd 5(%0), %%mm5 \n\t" /* FGHI */\
1081 "punpcklbw %%mm7, %%mm5 \n\t" /* 0F0G0H0I */\
1082 "pshufw $0xF9, %%mm5, %%mm6 \n\t" /* 0G0H0I0I */\
1083 "paddw %%mm5, %%mm1 \n\t" /* a */\
1084 "paddw %%mm6, %%mm2 \n\t" /* b */\
1085 "pshufw $0xBE, %%mm5, %%mm6 \n\t" /* 0H0I0I0H */\
1086 "pshufw $0x6F, %%mm5, %%mm5 \n\t" /* 0I0I0H0G */\
1087 "paddw %%mm6, %%mm3 \n\t" /* c */\
1088 "paddw %%mm5, %%mm4 \n\t" /* d */\
1089 "paddw %%mm2, %%mm2 \n\t" /* 2b */\
1090 "psubw %%mm2, %%mm3 \n\t" /* c - 2b */\
1091 "pmullw "MANGLE(ff_pw_20)", %%mm1 \n\t" /* 20a */\
1092 "pmullw "MANGLE(ff_pw_3)", %%mm3 \n\t" /* 3c - 6b */\
1093 "psubw %%mm4, %%mm3 \n\t" /* -6b + 3c - d */\
1094 "paddw %5, %%mm1 \n\t"\
1095 "paddw %%mm1, %%mm3 \n\t" /* 20a - 6b + 3c - d */\
1096 "psraw $5, %%mm3 \n\t"\
1097 "packuswb %%mm3, %%mm0 \n\t"\
1098 OP_MMX2(%%mm0, (%1), %%mm4, q)\
1104 : "+a"(src), "+c"(dst), "+d"(h)\
1105 : "S"((x86_reg)srcStride), "D"((x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER)\
1110 static void OPNAME ## mpeg4_qpel8_h_lowpass_3dnow(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
1113 /* quick HACK, XXX FIXME MUST be optimized */\
1116 temp[ 0]= (src[ 0]+src[ 1])*20 - (src[ 0]+src[ 2])*6 + (src[ 1]+src[ 3])*3 - (src[ 2]+src[ 4]);\
1117 temp[ 1]= (src[ 1]+src[ 2])*20 - (src[ 0]+src[ 3])*6 + (src[ 0]+src[ 4])*3 - (src[ 1]+src[ 5]);\
1118 temp[ 2]= (src[ 2]+src[ 3])*20 - (src[ 1]+src[ 4])*6 + (src[ 0]+src[ 5])*3 - (src[ 0]+src[ 6]);\
1119 temp[ 3]= (src[ 3]+src[ 4])*20 - (src[ 2]+src[ 5])*6 + (src[ 1]+src[ 6])*3 - (src[ 0]+src[ 7]);\
1120 temp[ 4]= (src[ 4]+src[ 5])*20 - (src[ 3]+src[ 6])*6 + (src[ 2]+src[ 7])*3 - (src[ 1]+src[ 8]);\
1121 temp[ 5]= (src[ 5]+src[ 6])*20 - (src[ 4]+src[ 7])*6 + (src[ 3]+src[ 8])*3 - (src[ 2]+src[ 8]);\
1122 temp[ 6]= (src[ 6]+src[ 7])*20 - (src[ 5]+src[ 8])*6 + (src[ 4]+src[ 8])*3 - (src[ 3]+src[ 7]);\
1123 temp[ 7]= (src[ 7]+src[ 8])*20 - (src[ 6]+src[ 8])*6 + (src[ 5]+src[ 7])*3 - (src[ 4]+src[ 6]);\
1125 "movq (%0), %%mm0 \n\t"\
1126 "movq 8(%0), %%mm1 \n\t"\
1127 "paddw %2, %%mm0 \n\t"\
1128 "paddw %2, %%mm1 \n\t"\
1129 "psraw $5, %%mm0 \n\t"\
1130 "psraw $5, %%mm1 \n\t"\
1131 "packuswb %%mm1, %%mm0 \n\t"\
1132 OP_3DNOW(%%mm0, (%1), %%mm1, q)\
1133 :: "r"(temp), "r"(dst), "m"(ROUNDER)\
1141 #define QPEL_OP(OPNAME, ROUNDER, RND, OP, MMX)\
1143 static void OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1144 uint64_t temp[17*4];\
1145 uint64_t *temp_ptr= temp;\
1150 "pxor %%mm7, %%mm7 \n\t"\
1152 "movq (%0), %%mm0 \n\t"\
1153 "movq (%0), %%mm1 \n\t"\
1154 "movq 8(%0), %%mm2 \n\t"\
1155 "movq 8(%0), %%mm3 \n\t"\
1156 "punpcklbw %%mm7, %%mm0 \n\t"\
1157 "punpckhbw %%mm7, %%mm1 \n\t"\
1158 "punpcklbw %%mm7, %%mm2 \n\t"\
1159 "punpckhbw %%mm7, %%mm3 \n\t"\
1160 "movq %%mm0, (%1) \n\t"\
1161 "movq %%mm1, 17*8(%1) \n\t"\
1162 "movq %%mm2, 2*17*8(%1) \n\t"\
1163 "movq %%mm3, 3*17*8(%1) \n\t"\
1168 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1169 : "r" ((x86_reg)srcStride)\
1176 /*FIXME reorder for speed */\
1178 /*"pxor %%mm7, %%mm7 \n\t"*/\
1180 "movq (%0), %%mm0 \n\t"\
1181 "movq 8(%0), %%mm1 \n\t"\
1182 "movq 16(%0), %%mm2 \n\t"\
1183 "movq 24(%0), %%mm3 \n\t"\
1184 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1185 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1187 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1189 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1191 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1192 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 72(%0), (%1, %3), OP)\
1194 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 80(%0), (%1), OP)\
1195 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 88(%0), (%1, %3), OP)\
1197 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 40(%0), 48(%0), 56(%0), 96(%0), (%1), OP)\
1198 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 48(%0), 56(%0), 64(%0),104(%0), (%1, %3), OP)\
1200 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 56(%0), 64(%0), 72(%0),112(%0), (%1), OP)\
1201 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 64(%0), 72(%0), 80(%0),120(%0), (%1, %3), OP)\
1203 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 72(%0), 80(%0), 88(%0),128(%0), (%1), OP)\
1205 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 80(%0), 88(%0), 96(%0),128(%0), (%1, %3), OP)\
1207 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 88(%0), 96(%0),104(%0),120(%0), (%1), OP)\
1208 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 96(%0),104(%0),112(%0),112(%0), (%1, %3), OP)\
1210 "add $136, %0 \n\t"\
1215 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1216 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-14*(x86_reg)dstStride)\
1221 static void OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
1222 uint64_t temp[9*2];\
1223 uint64_t *temp_ptr= temp;\
1228 "pxor %%mm7, %%mm7 \n\t"\
1230 "movq (%0), %%mm0 \n\t"\
1231 "movq (%0), %%mm1 \n\t"\
1232 "punpcklbw %%mm7, %%mm0 \n\t"\
1233 "punpckhbw %%mm7, %%mm1 \n\t"\
1234 "movq %%mm0, (%1) \n\t"\
1235 "movq %%mm1, 9*8(%1) \n\t"\
1240 : "+r" (src), "+r" (temp_ptr), "+r"(count)\
1241 : "r" ((x86_reg)srcStride)\
1248 /*FIXME reorder for speed */\
1250 /*"pxor %%mm7, %%mm7 \n\t"*/\
1252 "movq (%0), %%mm0 \n\t"\
1253 "movq 8(%0), %%mm1 \n\t"\
1254 "movq 16(%0), %%mm2 \n\t"\
1255 "movq 24(%0), %%mm3 \n\t"\
1256 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 16(%0), 8(%0), (%0), 32(%0), (%1), OP)\
1257 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 8(%0), (%0), (%0), 40(%0), (%1, %3), OP)\
1259 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, (%0), (%0), 8(%0), 48(%0), (%1), OP)\
1261 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, (%0), 8(%0), 16(%0), 56(%0), (%1, %3), OP)\
1263 QPEL_V_LOW(%%mm0, %%mm1, %%mm2, %%mm3, %5, %6, %5, 8(%0), 16(%0), 24(%0), 64(%0), (%1), OP)\
1265 QPEL_V_LOW(%%mm1, %%mm2, %%mm3, %%mm0, %5, %6, %5, 16(%0), 24(%0), 32(%0), 64(%0), (%1, %3), OP)\
1267 QPEL_V_LOW(%%mm2, %%mm3, %%mm0, %%mm1, %5, %6, %5, 24(%0), 32(%0), 40(%0), 56(%0), (%1), OP)\
1268 QPEL_V_LOW(%%mm3, %%mm0, %%mm1, %%mm2, %5, %6, %5, 32(%0), 40(%0), 48(%0), 48(%0), (%1, %3), OP)\
1275 : "+r"(temp_ptr), "+r"(dst), "+g"(count)\
1276 : "r"((x86_reg)dstStride), "r"(2*(x86_reg)dstStride), /*"m"(ff_pw_20), "m"(ff_pw_3),*/ "m"(ROUNDER), "g"(4-6*(x86_reg)dstStride)\
1281 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1282 OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);\
1285 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1287 uint8_t * const half= (uint8_t*)temp;\
1288 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1289 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1292 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1293 OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, stride, 8);\
1296 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1298 uint8_t * const half= (uint8_t*)temp;\
1299 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, stride, 8);\
1300 OPNAME ## pixels8_l2_ ## MMX(dst, src+1, half, stride, stride, 8);\
1303 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1305 uint8_t * const half= (uint8_t*)temp;\
1306 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1307 OPNAME ## pixels8_l2_ ## MMX(dst, src, half, stride, stride, 8);\
1310 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1311 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, stride, stride);\
1314 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1316 uint8_t * const half= (uint8_t*)temp;\
1317 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, 8, stride);\
1318 OPNAME ## pixels8_l2_ ## MMX(dst, src+stride, half, stride, stride, 8);\
1320 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1321 uint64_t half[8 + 9];\
1322 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1323 uint8_t * const halfHV= ((uint8_t*)half);\
1324 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1325 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1326 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1327 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1329 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1330 uint64_t half[8 + 9];\
1331 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1332 uint8_t * const halfHV= ((uint8_t*)half);\
1333 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1334 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1335 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1336 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1338 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1339 uint64_t half[8 + 9];\
1340 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1341 uint8_t * const halfHV= ((uint8_t*)half);\
1342 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1343 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1344 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1345 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1347 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1348 uint64_t half[8 + 9];\
1349 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1350 uint8_t * const halfHV= ((uint8_t*)half);\
1351 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1352 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1353 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1354 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1356 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1357 uint64_t half[8 + 9];\
1358 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1359 uint8_t * const halfHV= ((uint8_t*)half);\
1360 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1361 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1362 OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, stride, 8, 8);\
1364 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1365 uint64_t half[8 + 9];\
1366 uint8_t * const halfH= ((uint8_t*)half) + 64;\
1367 uint8_t * const halfHV= ((uint8_t*)half);\
1368 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1369 put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
1370 OPNAME ## pixels8_l2_ ## MMX(dst, halfH+8, halfHV, stride, 8, 8);\
1372 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1373 uint64_t half[8 + 9];\
1374 uint8_t * const halfH= ((uint8_t*)half);\
1375 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1376 put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, stride, 9);\
1377 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1379 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1380 uint64_t half[8 + 9];\
1381 uint8_t * const halfH= ((uint8_t*)half);\
1382 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1383 put ## RND ## pixels8_l2_ ## MMX(halfH, src+1, halfH, 8, stride, 9);\
1384 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1386 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1388 uint8_t * const halfH= ((uint8_t*)half);\
1389 put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, stride, 9);\
1390 OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, stride, 8);\
1392 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, int stride){\
1393 OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);\
1396 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1398 uint8_t * const half= (uint8_t*)temp;\
1399 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1400 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1403 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1404 OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, stride, stride, 16);\
1407 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1409 uint8_t * const half= (uint8_t*)temp;\
1410 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, stride, 16);\
1411 OPNAME ## pixels16_l2_ ## MMX(dst, src+1, half, stride, stride, 16);\
1414 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1416 uint8_t * const half= (uint8_t*)temp;\
1417 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1418 OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, stride, 16);\
1421 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1422 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, stride, stride);\
1425 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1427 uint8_t * const half= (uint8_t*)temp;\
1428 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, stride);\
1429 OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, stride, stride, 16);\
1431 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1432 uint64_t half[16*2 + 17*2];\
1433 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1434 uint8_t * const halfHV= ((uint8_t*)half);\
1435 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1436 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1437 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1438 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1440 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1441 uint64_t half[16*2 + 17*2];\
1442 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1443 uint8_t * const halfHV= ((uint8_t*)half);\
1444 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1445 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1446 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1447 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1449 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1450 uint64_t half[16*2 + 17*2];\
1451 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1452 uint8_t * const halfHV= ((uint8_t*)half);\
1453 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1454 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1455 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1456 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1458 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1459 uint64_t half[16*2 + 17*2];\
1460 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1461 uint8_t * const halfHV= ((uint8_t*)half);\
1462 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1463 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1464 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1465 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1467 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1468 uint64_t half[16*2 + 17*2];\
1469 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1470 uint8_t * const halfHV= ((uint8_t*)half);\
1471 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1472 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1473 OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, stride, 16, 16);\
1475 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1476 uint64_t half[16*2 + 17*2];\
1477 uint8_t * const halfH= ((uint8_t*)half) + 256;\
1478 uint8_t * const halfHV= ((uint8_t*)half);\
1479 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1480 put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, 16, 16);\
1481 OPNAME ## pixels16_l2_ ## MMX(dst, halfH+16, halfHV, stride, 16, 16);\
1483 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1484 uint64_t half[17*2];\
1485 uint8_t * const halfH= ((uint8_t*)half);\
1486 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1487 put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, stride, 17);\
1488 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1490 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1491 uint64_t half[17*2];\
1492 uint8_t * const halfH= ((uint8_t*)half);\
1493 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1494 put ## RND ## pixels16_l2_ ## MMX(halfH, src+1, halfH, 16, stride, 17);\
1495 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
1497 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1498 uint64_t half[17*2];\
1499 uint8_t * const halfH= ((uint8_t*)half);\
1500 put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, stride, 17);\
1501 OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, stride, 16);\
/* Store macro for "put" variants: plain move of register a into memory b
 * (temp is unused; size selects movd/movq). */
1504 #define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
/* Store macro for "avg" variants on 3DNow!: load the destination into temp,
 * byte-average with a via pavgusb (3DNow!), store the result back. */
1505 #define AVG_3DNOW_OP(a,b,temp, size) \
1506 "mov" #size " " #b ", " #temp " \n\t"\
1507 "pavgusb " #temp ", " #a " \n\t"\
1508 "mov" #size " " #a ", " #b " \n\t"
/* Same averaging store for MMXEXT, using the SSE-integer pavgb instruction. */
1509 #define AVG_MMX2_OP(a,b,temp, size) \
1510 "mov" #size " " #b ", " #temp " \n\t"\
1511 "pavgb " #temp ", " #a " \n\t"\
1512 "mov" #size " " #a ", " #b " \n\t"
/* Instantiate the MPEG-4 qpel lowpass functions:
 * put / avg / put_no_rnd variants, with ff_pw_16 resp. ff_pw_15 as the
 * rounding constant, for both the 3DNow! and MMXEXT code paths. */
1514 QPEL_BASE(put_ , ff_pw_16, _ , PUT_OP, PUT_OP)
1515 QPEL_BASE(avg_ , ff_pw_16, _ , AVG_MMX2_OP, AVG_3DNOW_OP)
1516 QPEL_BASE(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, PUT_OP)
1517 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, 3dnow)
1518 QPEL_OP(avg_ , ff_pw_16, _ , AVG_3DNOW_OP, 3dnow)
1519 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, 3dnow)
1520 QPEL_OP(put_ , ff_pw_16, _ , PUT_OP, mmx2)
1521 QPEL_OP(avg_ , ff_pw_16, _ , AVG_MMX2_OP, mmx2)
1522 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, PUT_OP, mmx2)
1524 /***********************************/
1525 /* bilinear qpel: not compliant to any spec, only for -lavdopts fast */
/* Map one qpel position (XY) directly onto a half-pel copy routine HPEL —
 * the "fast" 2-tap approximation is just the nearest hpel function. */
1527 #define QPEL_2TAP_XY(OPNAME, SIZE, MMX, XY, HPEL)\
1528 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1529 OPNAME ## pixels ## SIZE ## HPEL(dst, src, stride, SIZE);\
/* Map one qpel position onto the 3-point linear blend helper (_l3_),
 * with S0 as the base source offset and S1/S2 as the two tap offsets. */
1531 #define QPEL_2TAP_L3(OPNAME, SIZE, MMX, XY, S0, S1, S2)\
1532 static void OPNAME ## 2tap_qpel ## SIZE ## _mc ## XY ## _ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1533 OPNAME ## 2tap_qpel ## SIZE ## _l3_ ## MMX(dst, src+S0, stride, SIZE, S1, S2);\
/* Generate the full set of 2-tap ("bilinear", non-spec-compliant, used only
 * with -lavdopts fast) qpel motion-compensation entry points for a given
 * OPNAME (put_/avg_), block SIZE and cpu flavour MMX.  Positions that are
 * equivalent to another position are aliased through function-pointer
 * constants (mc00/mc21/mc12) instead of emitting duplicate code. */
1536 #define QPEL_2TAP(OPNAME, SIZE, MMX)\
1537 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 20, _x2_ ## MMX)\
1538 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 02, _y2_ ## MMX)\
1539 QPEL_2TAP_XY(OPNAME, SIZE, MMX, 22, _xy2_mmx)\
1540 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc00_ ## MMX =\
1541 OPNAME ## qpel ## SIZE ## _mc00_ ## MMX;\
1542 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc21_ ## MMX =\
1543 OPNAME ## 2tap_qpel ## SIZE ## _mc20_ ## MMX;\
1544 static const qpel_mc_func OPNAME ## 2tap_qpel ## SIZE ## _mc12_ ## MMX =\
1545 OPNAME ## 2tap_qpel ## SIZE ## _mc02_ ## MMX;\
1546 static void OPNAME ## 2tap_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1547 OPNAME ## pixels ## SIZE ## _y2_ ## MMX(dst, src+1, stride, SIZE);\
1549 static void OPNAME ## 2tap_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, int stride){\
1550 OPNAME ## pixels ## SIZE ## _x2_ ## MMX(dst, src+stride, stride, SIZE);\
1552 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 10, 0, 1, 0)\
1553 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 30, 1, -1, 0)\
1554 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 01, 0, stride, 0)\
1555 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 03, stride, -stride, 0)\
1556 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 11, 0, stride, 1)\
1557 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 31, 1, stride, -1)\
1558 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 13, stride, -stride, 1)\
1559 QPEL_2TAP_L3(OPNAME, SIZE, MMX, 33, stride+1, -stride, -1)\
/* Instantiate 2-tap qpel for put/avg x 16/8 pixel blocks x mmx2/3dnow. */
1561 QPEL_2TAP(put_, 16, mmx2)
1562 QPEL_2TAP(avg_, 16, mmx2)
1563 QPEL_2TAP(put_, 8, mmx2)
1564 QPEL_2TAP(avg_, 8, mmx2)
1565 QPEL_2TAP(put_, 16, 3dnow)
1566 QPEL_2TAP(avg_, 16, 3dnow)
1567 QPEL_2TAP(put_, 8, 3dnow)
1568 QPEL_2TAP(avg_, 8, 3dnow)
/* Deliberate no-op function; does nothing and returns immediately.
 * (Presumably used as a harmless function-pointer placeholder — confirm
 * against the callers in the full file.) */
static void just_return(void)
{
}
1576 typedef void emu_edge_core_func (uint8_t *buf, const uint8_t *src,
1577 x86_reg linesize, x86_reg start_y,
1578 x86_reg end_y, x86_reg block_h,
1579 x86_reg start_x, x86_reg end_x,
1581 extern emu_edge_core_func ff_emu_edge_core_mmx;
1582 extern emu_edge_core_func ff_emu_edge_core_sse;
/**
 * Shared edge-emulation wrapper: clamps the requested block against the
 * picture borders (width w, height h), adjusts src to the first copyable
 * line, and hands the real work to an assembly core (ff_emu_edge_core_*).
 * NOTE(review): the clamping "if" headers for src_y/src_x and the braces
 * are on lines not visible in this excerpt — verify exact conditions in
 * the full file.
 */
1584 static av_always_inline
1585 void emulated_edge_mc(uint8_t *buf, const uint8_t *src, int linesize,
1586 int block_w, int block_h,
1587 int src_x, int src_y, int w, int h,
1588 emu_edge_core_func *core_fn)
1590 int start_y, start_x, end_y, end_x, src_y_add=0;
1593 src_y_add = h-1-src_y;
1595 }else if(src_y<=-block_h){
1596 src_y_add = 1-block_h-src_y;
1602 }else if(src_x<=-block_w){
1603 src+= (1-block_w-src_x);
/* Portion of the block that lies inside the picture. */
1607 start_y= FFMAX(0, -src_y);
1608 start_x= FFMAX(0, -src_x);
1609 end_y= FFMIN(block_h, h-src_y);
1610 end_x= FFMIN(block_w, w-src_x);
1611 assert(start_x < end_x && block_w > 0);
1612 assert(start_y < end_y && block_h > 0);
1614 // fill in the to-be-copied part plus all above/below
1615 src += (src_y_add+start_y)*linesize + start_x;
1617 core_fn(buf, src, linesize, start_y, end_y, block_h, start_x, end_x, block_w);
/* MMX entry point: dispatch edge emulation to the MMX yasm core. */
1622 void emulated_edge_mc_mmx(uint8_t *buf, const uint8_t *src, int linesize,
1623 int block_w, int block_h,
1624 int src_x, int src_y, int w, int h)
1626 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1627 w, h, &ff_emu_edge_core_mmx);
/* SSE entry point: dispatch edge emulation to the SSE yasm core. */
1631 void emulated_edge_mc_sse(uint8_t *buf, const uint8_t *src, int linesize,
1632 int block_w, int block_h,
1633 int src_x, int src_y, int w, int h)
1635 emulated_edge_mc(buf, src, linesize, block_w, block_h, src_x, src_y,
1636 w, h, &ff_emu_edge_core_sse);
1638 #endif /* HAVE_YASM */
1640 typedef void emulated_edge_mc_func (uint8_t *dst, const uint8_t *src,
1641 int linesize, int block_w, int block_h,
1642 int src_x, int src_y, int w, int h);
/**
 * MMX global motion compensation: bilinearly interpolates src at
 * affine-transformed positions and writes h rows of output into dst.
 * Falls back to the C implementation (ff_gmc_c) when the fullpel offset
 * is not constant over the block or the sub-pel motion vector needs more
 * than 16 bits.  emu_edge_fn pads the source when the block can read
 * outside the width x height picture.
 */
1644 static av_always_inline
1645 void gmc(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1646 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height,
1647 emulated_edge_mc_func *emu_edge_fn)
/* Fullpel (integer) part of the start offset; *s values keep the top
 * fractional bits used by the 16-bit MMX arithmetic below. */
1650 const int ix = ox>>(16+shift);
1651 const int iy = oy>>(16+shift);
1652 const int oxs = ox>>4;
1653 const int oys = oy>>4;
1654 const int dxxs = dxx>>4;
1655 const int dxys = dxy>>4;
1656 const int dyxs = dyx>>4;
1657 const int dyys = dyy>>4;
/* Constants broadcast to 4x16-bit lanes for the 4-pixel inner loop. */
1658 const uint16_t r4[4] = {r,r,r,r};
1659 const uint16_t dxy4[4] = {dxys,dxys,dxys,dxys};
1660 const uint16_t dyy4[4] = {dyys,dyys,dyys,dyys};
1661 const uint64_t shift2 = 2*shift;
1662 uint8_t edge_buf[(h+1)*stride];
1665 const int dxw = (dxx-(1<<(16+shift)))*(w-1);
1666 const int dyh = (dyy-(1<<(16+shift)))*(h-1);
1667 const int dxh = dxy*(h-1);
1668 const int dyw = dyx*(w-1);
1669 if( // non-constant fullpel offset (3% of blocks)
1670 ((ox^(ox+dxw)) | (ox^(ox+dxh)) | (ox^(ox+dxw+dxh)) |
1671 (oy^(oy+dyw)) | (oy^(oy+dyh)) | (oy^(oy+dyw+dyh))) >> (16+shift)
1672 // uses more than 16 bits of subpel mv (only at huge resolution)
1673 || (dxx|dxy|dyx|dyy)&15 )
1675 //FIXME could still use mmx for some of the rows
1676 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, width, height);
/* Advance to the fullpel start; pad through emu_edge_fn if the block can
 * read outside the picture. */
1680 src += ix + iy*stride;
1681 if( (unsigned)ix >= width-w ||
1682 (unsigned)iy >= height-h )
1684 emu_edge_fn(edge_buf, src, stride, w+1, h+1, ix, iy, width, height);
/* mm6 <- a 16-bit value broadcast to all 4 words (operand list is on a
 * line not visible here); mm7 <- 0, used for byte unpacking. */
1689 "movd %0, %%mm6 \n\t"
1690 "pxor %%mm7, %%mm7 \n\t"
1691 "punpcklwd %%mm6, %%mm6 \n\t"
1692 "punpcklwd %%mm6, %%mm6 \n\t"
/* Process 4 output pixels per iteration; dx4/dy4 hold their sub-pel
 * coordinates. */
1696 for(x=0; x<w; x+=4){
1697 uint16_t dx4[4] = { oxs - dxys + dxxs*(x+0),
1698 oxs - dxys + dxxs*(x+1),
1699 oxs - dxys + dxxs*(x+2),
1700 oxs - dxys + dxxs*(x+3) };
1701 uint16_t dy4[4] = { oys - dyys + dyxs*(x+0),
1702 oys - dyys + dyxs*(x+1),
1703 oys - dyys + dyxs*(x+2),
1704 oys - dyys + dyxs*(x+3) };
/* Step the coordinates by one row and keep the top 4 fractional bits. */
1708 "movq %0, %%mm4 \n\t"
1709 "movq %1, %%mm5 \n\t"
1710 "paddw %2, %%mm4 \n\t"
1711 "paddw %3, %%mm5 \n\t"
1712 "movq %%mm4, %0 \n\t"
1713 "movq %%mm5, %1 \n\t"
1714 "psrlw $12, %%mm4 \n\t"
1715 "psrlw $12, %%mm5 \n\t"
1716 : "+m"(*dx4), "+m"(*dy4)
1717 : "m"(*dxy4), "m"(*dyy4)
/* Bilinear weights from the fractional parts (mm4=dx, mm5=dy, mm6=s). */
1721 "movq %%mm6, %%mm2 \n\t"
1722 "movq %%mm6, %%mm1 \n\t"
1723 "psubw %%mm4, %%mm2 \n\t"
1724 "psubw %%mm5, %%mm1 \n\t"
1725 "movq %%mm2, %%mm0 \n\t"
1726 "movq %%mm4, %%mm3 \n\t"
1727 "pmullw %%mm1, %%mm0 \n\t" // (s-dx)*(s-dy)
1728 "pmullw %%mm5, %%mm3 \n\t" // dx*dy
1729 "pmullw %%mm5, %%mm2 \n\t" // (s-dx)*dy
1730 "pmullw %%mm4, %%mm1 \n\t" // dx*(s-dy)
1732 "movd %4, %%mm5 \n\t"
1733 "movd %3, %%mm4 \n\t"
1734 "punpcklbw %%mm7, %%mm5 \n\t"
1735 "punpcklbw %%mm7, %%mm4 \n\t"
1736 "pmullw %%mm5, %%mm3 \n\t" // src[1,1] * dx*dy
1737 "pmullw %%mm4, %%mm2 \n\t" // src[0,1] * (s-dx)*dy
1739 "movd %2, %%mm5 \n\t"
1740 "movd %1, %%mm4 \n\t"
1741 "punpcklbw %%mm7, %%mm5 \n\t"
1742 "punpcklbw %%mm7, %%mm4 \n\t"
1743 "pmullw %%mm5, %%mm1 \n\t" // src[1,0] * dx*(s-dy)
1744 "pmullw %%mm4, %%mm0 \n\t" // src[0,0] * (s-dx)*(s-dy)
/* Sum the four weighted taps, add rounding r4, shift and pack to bytes. */
1745 "paddw %5, %%mm1 \n\t"
1746 "paddw %%mm3, %%mm2 \n\t"
1747 "paddw %%mm1, %%mm0 \n\t"
1748 "paddw %%mm2, %%mm0 \n\t"
1750 "psrlw %6, %%mm0 \n\t"
1751 "packuswb %%mm0, %%mm0 \n\t"
1752 "movd %%mm0, %0 \n\t"
1754 : "=m"(dst[x+y*stride])
1755 : "m"(src[0]), "m"(src[1]),
1756 "m"(src[stride]), "m"(src[stride+1]),
1757 "m"(*r4), "m"(shift2)
/* gmc entry point using the MMX yasm edge core. */
1767 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1768 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1770 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1771 width, height, &emulated_edge_mc_mmx);
/* gmc entry point using the SSE yasm edge core. */
1774 static void gmc_sse(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1775 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1777 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1778 width, height, &emulated_edge_mc_sse);
/* Fallback gmc entry point using the generic C edge emulation
 * (presumably the !HAVE_YASM branch — preprocessor lines not visible). */
1781 static void gmc_mmx(uint8_t *dst, uint8_t *src, int stride, int h, int ox, int oy,
1782 int dxx, int dxy, int dyx, int dyy, int shift, int r, int width, int height)
1784 gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
1785 width, height, &ff_emulated_edge_mc);
/* Generate a cache-prefetch helper named `name` that issues the given
 * prefetch instruction `op` on the block's memory. */
1789 #define PREFETCH(name, op) \
1790 static void name(void *mem, int stride, int h){\
1791 const uint8_t *p= mem;\
1793 __asm__ volatile(#op" %0" :: "m"(*p));\
/* prefetcht0 for MMXEXT/SSE CPUs, 3DNow! "prefetch" for AMD CPUs. */
1797 PREFETCH(prefetch_mmx2, prefetcht0)
1798 PREFETCH(prefetch_3dnow, prefetch)
1801 #include "h264_qpel_mmx.c"
1803 void ff_put_h264_chroma_mc8_mmx_rnd (uint8_t *dst, uint8_t *src,
1804 int stride, int h, int x, int y);
1805 void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
1806 int stride, int h, int x, int y);
1807 void ff_avg_h264_chroma_mc8_mmx2_rnd (uint8_t *dst, uint8_t *src,
1808 int stride, int h, int x, int y);
1809 void ff_avg_rv40_chroma_mc8_mmx2 (uint8_t *dst, uint8_t *src,
1810 int stride, int h, int x, int y);
1811 void ff_avg_h264_chroma_mc8_3dnow_rnd (uint8_t *dst, uint8_t *src,
1812 int stride, int h, int x, int y);
1813 void ff_avg_rv40_chroma_mc8_3dnow (uint8_t *dst, uint8_t *src,
1814 int stride, int h, int x, int y);
1816 void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1817 int stride, int h, int x, int y);
1818 void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
1819 int stride, int h, int x, int y);
1820 void ff_avg_h264_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1821 int stride, int h, int x, int y);
1822 void ff_avg_rv40_chroma_mc4_mmx2 (uint8_t *dst, uint8_t *src,
1823 int stride, int h, int x, int y);
1824 void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1825 int stride, int h, int x, int y);
1826 void ff_avg_rv40_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
1827 int stride, int h, int x, int y);
1829 void ff_put_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1830 int stride, int h, int x, int y);
1831 void ff_avg_h264_chroma_mc2_mmx2 (uint8_t *dst, uint8_t *src,
1832 int stride, int h, int x, int y);
1834 void ff_put_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1835 int stride, int h, int x, int y);
1836 void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1837 int stride, int h, int x, int y);
1839 void ff_avg_h264_chroma_mc8_ssse3_rnd (uint8_t *dst, uint8_t *src,
1840 int stride, int h, int x, int y);
1841 void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
1842 int stride, int h, int x, int y);
/* CAVS and VC-1 mc00 (zero motion) cases: a plain pixel copy/average,
 * forwarded to the generic MMX/MMXEXT pixel routines. */
1846 void ff_put_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1847 put_pixels8_mmx(dst, src, stride, 8);
1849 void ff_avg_cavs_qpel8_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1850 avg_pixels8_mmx(dst, src, stride, 8);
1852 void ff_put_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1853 put_pixels16_mmx(dst, src, stride, 16);
1855 void ff_avg_cavs_qpel16_mc00_mmx2(uint8_t *dst, uint8_t *src, int stride) {
1856 avg_pixels16_mmx(dst, src, stride, 16);
/* rnd is accepted for interface compatibility but unused here. */
1860 void ff_put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1861 put_pixels8_mmx(dst, src, stride, 8);
1863 void ff_avg_vc1_mspel_mc00_mmx2(uint8_t *dst, const uint8_t *src, int stride, int rnd) {
1864 avg_pixels8_mmx2(dst, src, stride, 8);
1867 /* XXX: those functions should be suppressed ASAP when all IDCTs are
1870 static void ff_libmpeg2mmx_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1872 ff_mmx_idct (block);
1873 ff_put_pixels_clamped_mmx(block, dest, line_size);
1875 static void ff_libmpeg2mmx_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1877 ff_mmx_idct (block);
1878 ff_add_pixels_clamped_mmx(block, dest, line_size);
1880 static void ff_libmpeg2mmx2_idct_put(uint8_t *dest, int line_size, DCTELEM *block)
1882 ff_mmxext_idct (block);
1883 ff_put_pixels_clamped_mmx(block, dest, line_size);
1885 static void ff_libmpeg2mmx2_idct_add(uint8_t *dest, int line_size, DCTELEM *block)
1887 ff_mmxext_idct (block);
1888 ff_add_pixels_clamped_mmx(block, dest, line_size);
1891 static void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, DCTELEM *block)
1893 ff_idct_xvid_mmx (block);
1894 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX IDCT + clamped add into 'dest'. */
1896 static void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, DCTELEM *block)
1898 ff_idct_xvid_mmx (block);
1899 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX2 IDCT + clamped store to 'dest'. */
1901 static void ff_idct_xvid_mmx2_put(uint8_t *dest, int line_size, DCTELEM *block)
1903 ff_idct_xvid_mmx2 (block);
1904 ff_put_pixels_clamped_mmx(block, dest, line_size);
/* Xvid MMX2 IDCT + clamped add into 'dest'. */
1906 static void ff_idct_xvid_mmx2_add(uint8_t *dest, int line_size, DCTELEM *block)
1908 ff_idct_xvid_mmx2 (block);
1909 ff_add_pixels_clamped_mmx(block, dest, line_size);
/* Vorbis inverse channel coupling, 3DNow! version: converts magnitude/angle
 * pairs back into the two channel spectra, two floats per iteration.
 * mm7 is preset to 0.0 and used by pfcmpge to derive sign masks; the per-line
 * asm comments describe the branchless sign logic.  'femms' at the end leaves
 * the FPU usable for normal float code again (assumes blocksize is a multiple
 * of 2 — the loop steps by 2 with no tail handling). */
1912 static void vorbis_inverse_coupling_3dnow(float *mag, float *ang, int blocksize)
1915 __asm__ volatile("pxor %%mm7, %%mm7":);
1916 for(i=0; i<blocksize; i+=2) {
1918 "movq %0, %%mm0 \n\t"
1919 "movq %1, %%mm1 \n\t"
1920 "movq %%mm0, %%mm2 \n\t"
1921 "movq %%mm1, %%mm3 \n\t"
1922 "pfcmpge %%mm7, %%mm2 \n\t" // m <= 0.0
1923 "pfcmpge %%mm7, %%mm3 \n\t" // a <= 0.0
1924 "pslld $31, %%mm2 \n\t" // keep only the sign bit
1925 "pxor %%mm2, %%mm1 \n\t"
1926 "movq %%mm3, %%mm4 \n\t"
1927 "pand %%mm1, %%mm3 \n\t"
1928 "pandn %%mm1, %%mm4 \n\t"
1929 "pfadd %%mm0, %%mm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1930 "pfsub %%mm4, %%mm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
1931 "movq %%mm3, %1 \n\t"
1932 "movq %%mm0, %0 \n\t"
1933 :"+m"(mag[i]), "+m"(ang[i])
1937 __asm__ volatile("femms");
/* Vorbis inverse channel coupling, SSE version: same math as the 3DNow!
 * variant but four floats per iteration.  xmm5 is preloaded with the
 * 0x80000000 sign-bit mask (ff_pdw_80000000) and kept across the loop;
 * cmpleps builds the <=0 masks that drive the branchless sign selection.
 * Requires 16-byte aligned mag/ang (movaps) and, by the loop structure,
 * blocksize to be a multiple of 4 — presumably guaranteed by the caller. */
1939 static void vorbis_inverse_coupling_sse(float *mag, float *ang, int blocksize)
1944 "movaps %0, %%xmm5 \n\t"
1945 ::"m"(ff_pdw_80000000[0])
1947 for(i=0; i<blocksize; i+=4) {
1949 "movaps %0, %%xmm0 \n\t"
1950 "movaps %1, %%xmm1 \n\t"
1951 "xorps %%xmm2, %%xmm2 \n\t"
1952 "xorps %%xmm3, %%xmm3 \n\t"
1953 "cmpleps %%xmm0, %%xmm2 \n\t" // m <= 0.0
1954 "cmpleps %%xmm1, %%xmm3 \n\t" // a <= 0.0
1955 "andps %%xmm5, %%xmm2 \n\t" // keep only the sign bit
1956 "xorps %%xmm2, %%xmm1 \n\t"
1957 "movaps %%xmm3, %%xmm4 \n\t"
1958 "andps %%xmm1, %%xmm3 \n\t"
1959 "andnps %%xmm1, %%xmm4 \n\t"
1960 "addps %%xmm0, %%xmm3 \n\t" // a = m + ((a<0) & (a ^ sign(m)))
1961 "subps %%xmm4, %%xmm0 \n\t" // m = m + ((a>0) & (a ^ sign(m)))
1962 "movaps %%xmm3, %1 \n\t"
1963 "movaps %%xmm0, %0 \n\t"
1964 :"+m"(mag[i]), "+m"(ang[i])
/* AC-3 downmix asm body for the common 5-channel case with a sparse matrix:
 * only three distinct coefficients are loaded (offsets 0, 8 and 24 of the
 * matrix, broadcast via shufps) and applied to the five channel planes that
 * live 0x400 bytes apart (256 floats each).  The 'mono'/'stereo' macro
 * parameters expand extra instructions for the respective output layout;
 * exactly one of them is active at an expansion site.
 * NOTE(review): no interior comments added — '//' inside a backslash-continued
 * macro would swallow the continuation. */
1973 #define MIX5(mono,stereo)\
1975 "movss 0(%2), %%xmm5 \n"\
1976 "movss 8(%2), %%xmm6 \n"\
1977 "movss 24(%2), %%xmm7 \n"\
1978 "shufps $0, %%xmm5, %%xmm5 \n"\
1979 "shufps $0, %%xmm6, %%xmm6 \n"\
1980 "shufps $0, %%xmm7, %%xmm7 \n"\
1982 "movaps (%0,%1), %%xmm0 \n"\
1983 "movaps 0x400(%0,%1), %%xmm1 \n"\
1984 "movaps 0x800(%0,%1), %%xmm2 \n"\
1985 "movaps 0xc00(%0,%1), %%xmm3 \n"\
1986 "movaps 0x1000(%0,%1), %%xmm4 \n"\
1987 "mulps %%xmm5, %%xmm0 \n"\
1988 "mulps %%xmm6, %%xmm1 \n"\
1989 "mulps %%xmm5, %%xmm2 \n"\
1990 "mulps %%xmm7, %%xmm3 \n"\
1991 "mulps %%xmm7, %%xmm4 \n"\
1992 stereo("addps %%xmm1, %%xmm0 \n")\
1993 "addps %%xmm1, %%xmm2 \n"\
1994 "addps %%xmm3, %%xmm0 \n"\
1995 "addps %%xmm4, %%xmm2 \n"\
1996 mono("addps %%xmm2, %%xmm0 \n")\
1997 "movaps %%xmm0, (%0,%1) \n"\
1998 stereo("movaps %%xmm2, 0x400(%0,%1) \n")\
2002 :"r"(samples[0]+len), "r"(matrix)\
2003 :XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
2004 "%xmm4", "%xmm5", "%xmm6", "%xmm7",)\
/* AC-3 downmix asm body for the generic matrix case: an inner loop walks the
 * remaining input channels (1024 bytes apart) and accumulates each one
 * multiplied by its per-channel coefficient pair from matrix_simd; the
 * 'stereo' parameter expands the second accumulator (xmm1/xmm3) for a
 * two-channel output.  Expects i/j/k loop registers and the precomputed
 * matrix_simd table from the expansion site (ac3_downmix_sse).
 * NOTE(review): no interior comments — backslash continuations. */
2008 #define MIX_MISC(stereo)\
2011 "movaps (%3,%0), %%xmm0 \n"\
2012 stereo("movaps %%xmm0, %%xmm1 \n")\
2013 "mulps %%xmm4, %%xmm0 \n"\
2014 stereo("mulps %%xmm5, %%xmm1 \n")\
2015 "lea 1024(%3,%0), %1 \n"\
2018 "movaps (%1), %%xmm2 \n"\
2019 stereo("movaps %%xmm2, %%xmm3 \n")\
2020 "mulps (%4,%2), %%xmm2 \n"\
2021 stereo("mulps 16(%4,%2), %%xmm3 \n")\
2022 "addps %%xmm2, %%xmm0 \n"\
2023 stereo("addps %%xmm3, %%xmm1 \n")\
2027 "movaps %%xmm0, (%3,%0) \n"\
2028 stereo("movaps %%xmm1, 1024(%3,%0) \n")\
2031 :"+&r"(i), "=&r"(j), "=&r"(k)\
2032 :"r"(samples[0]+len), "r"(matrix_simd+in_ch), "g"((intptr_t)-32*(in_ch-1))\
/* In-place AC-3 channel downmix (SSE).  Dispatches on the matrix shape:
 * the bit-pattern tests on matrix_cmp (the coefficients reinterpreted as
 * ints, so 0.0f and exact equality can be tested cheaply) detect the
 * sparse 5->2 and 5->1 layouts that the fast MIX5 path handles; otherwise
 * the generic path broadcasts every coefficient pair into the aligned
 * matrix_simd table (via shufps) for MIX_MISC.  Negative byte offset 'i'
 * counts up to zero, a common tail-pointer idiom in this file. */
2036 static void ac3_downmix_sse(float (*samples)[256], float (*matrix)[2], int out_ch, int in_ch, int len)
2038 int (*matrix_cmp)[2] = (int(*)[2])matrix;
2041 i = -len*sizeof(float);
2042 if(in_ch == 5 && out_ch == 2 && !(matrix_cmp[0][1]|matrix_cmp[2][0]|matrix_cmp[3][1]|matrix_cmp[4][0]|(matrix_cmp[1][0]^matrix_cmp[1][1])|(matrix_cmp[0][0]^matrix_cmp[2][1]))) {
2044 } else if(in_ch == 5 && out_ch == 1 && matrix_cmp[0][0]==matrix_cmp[2][0] && matrix_cmp[3][0]==matrix_cmp[4][0]) {
2047 DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
2048 j = 2*in_ch*sizeof(float);
2052 "movss (%2,%0), %%xmm4 \n"
2053 "movss 4(%2,%0), %%xmm5 \n"
2054 "shufps $0, %%xmm4, %%xmm4 \n"
2055 "shufps $0, %%xmm5, %%xmm5 \n"
2056 "movaps %%xmm4, (%1,%0,4) \n"
2057 "movaps %%xmm5, 16(%1,%0,4) \n"
2060 :"r"(matrix_simd), "r"(matrix)
/* dst[i] = src0[i] * src1[i], 3DNow! version, 4 floats (two movq pairs)
 * per iteration; 'i' starts at the last group and the loop presumably walks
 * backwards to 0 (loop control lines not visible here).  Assumes len is a
 * multiple of 4 — TODO confirm against callers. */
2071 static void vector_fmul_3dnow(float *dst, const float *src0, const float *src1, int len){
2072 x86_reg i = (len-4)*4;
2075 "movq (%2,%0), %%mm0 \n\t"
2076 "movq 8(%2,%0), %%mm1 \n\t"
2077 "pfmul (%3,%0), %%mm0 \n\t"
2078 "pfmul 8(%3,%0), %%mm1 \n\t"
2079 "movq %%mm0, (%1,%0) \n\t"
2080 "movq %%mm1, 8(%1,%0) \n\t"
2085 :"r"(dst), "r"(src0), "r"(src1)
/* dst[i] = src0[i] * src1[i], SSE version, 8 floats per iteration.
 * movaps requires all three pointers 16-byte aligned; assumes len is a
 * multiple of 8 — TODO confirm against callers. */
2089 static void vector_fmul_sse(float *dst, const float *src0, const float *src1, int len){
2090 x86_reg i = (len-8)*4;
2093 "movaps (%2,%0), %%xmm0 \n\t"
2094 "movaps 16(%2,%0), %%xmm1 \n\t"
2095 "mulps (%3,%0), %%xmm0 \n\t"
2096 "mulps 16(%3,%0), %%xmm1 \n\t"
2097 "movaps %%xmm0, (%1,%0) \n\t"
2098 "movaps %%xmm1, 16(%1,%0) \n\t"
2102 :"r"(dst), "r"(src0), "r"(src1)
/* dst[i] = src0[i] * src1[len-1-i], 3DNow!ext version: src1 is read through
 * a separately advancing pointer ("+r"(src1)) and each pair is reversed with
 * pswapd while src0 is indexed forward via 'i'.  Ends with femms to exit
 * MMX state. */
2107 static void vector_fmul_reverse_3dnow2(float *dst, const float *src0, const float *src1, int len){
2108 x86_reg i = len*4-16;
2111 "pswapd 8(%1), %%mm0 \n\t"
2112 "pswapd (%1), %%mm1 \n\t"
2113 "pfmul (%3,%0), %%mm0 \n\t"
2114 "pfmul 8(%3,%0), %%mm1 \n\t"
2115 "movq %%mm0, (%2,%0) \n\t"
2116 "movq %%mm1, 8(%2,%0) \n\t"
2120 :"+r"(i), "+r"(src1)
2121 :"r"(dst), "r"(src0)
2123 __asm__ volatile("femms");
/* dst[i] = src0[i] * src1[len-1-i], SSE version: two movaps loads from the
 * walking src1 pointer, each reversed with shufps $0x1b (element order
 * 3,2,1,0), multiplied against forward-indexed src0 and stored to dst.
 * Aligned pointers required (movaps). */
2125 static void vector_fmul_reverse_sse(float *dst, const float *src0, const float *src1, int len){
2126 x86_reg i = len*4-32;
2129 "movaps 16(%1), %%xmm0 \n\t"
2130 "movaps (%1), %%xmm1 \n\t"
2131 "shufps $0x1b, %%xmm0, %%xmm0 \n\t"
2132 "shufps $0x1b, %%xmm1, %%xmm1 \n\t"
2133 "mulps (%3,%0), %%xmm0 \n\t"
2134 "mulps 16(%3,%0), %%xmm1 \n\t"
2135 "movaps %%xmm0, (%2,%0) \n\t"
2136 "movaps %%xmm1, 16(%2,%0) \n\t"
2140 :"+r"(i), "+r"(src1)
2141 :"r"(dst), "r"(src0)
/* dst[i] = src0[i]*src1[i] + src2[i], 3DNow! version, 4 floats per
 * iteration (pfmul then pfadd); femms restores FPU state on exit. */
2145 static void vector_fmul_add_3dnow(float *dst, const float *src0, const float *src1,
2146 const float *src2, int len){
2147 x86_reg i = (len-4)*4;
2150 "movq (%2,%0), %%mm0 \n\t"
2151 "movq 8(%2,%0), %%mm1 \n\t"
2152 "pfmul (%3,%0), %%mm0 \n\t"
2153 "pfmul 8(%3,%0), %%mm1 \n\t"
2154 "pfadd (%4,%0), %%mm0 \n\t"
2155 "pfadd 8(%4,%0), %%mm1 \n\t"
2156 "movq %%mm0, (%1,%0) \n\t"
2157 "movq %%mm1, 8(%1,%0) \n\t"
2161 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
2164 __asm__ volatile("femms");
/* dst[i] = src0[i]*src1[i] + src2[i], SSE version, 8 floats per iteration;
 * all four pointers must be 16-byte aligned (movaps/mulps/addps memory ops). */
2166 static void vector_fmul_add_sse(float *dst, const float *src0, const float *src1,
2167 const float *src2, int len){
2168 x86_reg i = (len-8)*4;
2171 "movaps (%2,%0), %%xmm0 \n\t"
2172 "movaps 16(%2,%0), %%xmm1 \n\t"
2173 "mulps (%3,%0), %%xmm0 \n\t"
2174 "mulps 16(%3,%0), %%xmm1 \n\t"
2175 "addps (%4,%0), %%xmm0 \n\t"
2176 "addps 16(%4,%0), %%xmm1 \n\t"
2177 "movaps %%xmm0, (%1,%0) \n\t"
2178 "movaps %%xmm1, 16(%1,%0) \n\t"
2182 :"r"(dst), "r"(src0), "r"(src1), "r"(src2)
/* Windowed overlap-add (MDCT windowing), 3DNow!ext version.  Two indices walk
 * toward each other ('i' negative from -len*4, 'j' from the top); pswapd
 * reverses the high-side pairs so each step computes both the low half
 * (src0*win - reversed product, via pfsub) and the reversed high half
 * (sum via pfadd, re-reversed with pswapd before the store) — see the
 * per-line asm comments for the exact operand pairing.  Pointers are passed
 * pre-biased by len (dst+len, src0+len, win+len). */
2188 static void vector_fmul_window_3dnow2(float *dst, const float *src0, const float *src1,
2189 const float *win, int len){
2191 x86_reg j = len*4-8;
2194 "pswapd (%5,%1), %%mm1 \n"
2195 "movq (%5,%0), %%mm0 \n"
2196 "pswapd (%4,%1), %%mm5 \n"
2197 "movq (%3,%0), %%mm4 \n"
2198 "movq %%mm0, %%mm2 \n"
2199 "movq %%mm1, %%mm3 \n"
2200 "pfmul %%mm4, %%mm2 \n" // src0[len+i]*win[len+i]
2201 "pfmul %%mm5, %%mm3 \n" // src1[ j]*win[len+j]
2202 "pfmul %%mm4, %%mm1 \n" // src0[len+i]*win[len+j]
2203 "pfmul %%mm5, %%mm0 \n" // src1[ j]*win[len+i]
2204 "pfadd %%mm3, %%mm2 \n"
2205 "pfsub %%mm0, %%mm1 \n"
2206 "pswapd %%mm2, %%mm2 \n"
2207 "movq %%mm1, (%2,%0) \n"
2208 "movq %%mm2, (%2,%1) \n"
2214 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
/* Windowed overlap-add, SSE version: identical structure to the 3DNow!ext
 * variant but 4 floats per side per iteration, with shufps $0x1b doing the
 * element reversal that pswapd does in the MMX path.  Aligned, len-biased
 * pointers as above; per-line asm comments give the operand pairing. */
2218 static void vector_fmul_window_sse(float *dst, const float *src0, const float *src1,
2219 const float *win, int len){
2221 x86_reg j = len*4-16;
2224 "movaps (%5,%1), %%xmm1 \n"
2225 "movaps (%5,%0), %%xmm0 \n"
2226 "movaps (%4,%1), %%xmm5 \n"
2227 "movaps (%3,%0), %%xmm4 \n"
2228 "shufps $0x1b, %%xmm1, %%xmm1 \n"
2229 "shufps $0x1b, %%xmm5, %%xmm5 \n"
2230 "movaps %%xmm0, %%xmm2 \n"
2231 "movaps %%xmm1, %%xmm3 \n"
2232 "mulps %%xmm4, %%xmm2 \n" // src0[len+i]*win[len+i]
2233 "mulps %%xmm5, %%xmm3 \n" // src1[ j]*win[len+j]
2234 "mulps %%xmm4, %%xmm1 \n" // src0[len+i]*win[len+j]
2235 "mulps %%xmm5, %%xmm0 \n" // src1[ j]*win[len+i]
2236 "addps %%xmm3, %%xmm2 \n"
2237 "subps %%xmm0, %%xmm1 \n"
2238 "shufps $0x1b, %%xmm2, %%xmm2 \n"
2239 "movaps %%xmm1, (%2,%0) \n"
2240 "movaps %%xmm2, (%2,%1) \n"
2245 :"r"(dst+len), "r"(src0+len), "r"(src1), "r"(win+len)
2248 #endif /* HAVE_6REGS */
/* Clamp src[i] into [min,max] and store to dst, SSE version: min/max are
 * broadcast once into xmm4/xmm5 (shufps $0), then 16 floats are processed
 * per iteration with maxps (lower bound) followed by minps (upper bound).
 * Aligned src/dst required; assumes len is a multiple of 16 — TODO confirm
 * against callers. */
2250 static void vector_clipf_sse(float *dst, const float *src, float min, float max,
2253 x86_reg i = (len-16)*4;
2255 "movss %3, %%xmm4 \n"
2256 "movss %4, %%xmm5 \n"
2257 "shufps $0, %%xmm4, %%xmm4 \n"
2258 "shufps $0, %%xmm5, %%xmm5 \n"
2260 "movaps (%2,%0), %%xmm0 \n\t" // 3/1 on intel
2261 "movaps 16(%2,%0), %%xmm1 \n\t"
2262 "movaps 32(%2,%0), %%xmm2 \n\t"
2263 "movaps 48(%2,%0), %%xmm3 \n\t"
2264 "maxps %%xmm4, %%xmm0 \n\t"
2265 "maxps %%xmm4, %%xmm1 \n\t"
2266 "maxps %%xmm4, %%xmm2 \n\t"
2267 "maxps %%xmm4, %%xmm3 \n\t"
2268 "minps %%xmm5, %%xmm0 \n\t"
2269 "minps %%xmm5, %%xmm1 \n\t"
2270 "minps %%xmm5, %%xmm2 \n\t"
2271 "minps %%xmm5, %%xmm3 \n\t"
2272 "movaps %%xmm0, (%1,%0) \n\t"
2273 "movaps %%xmm1, 16(%1,%0) \n\t"
2274 "movaps %%xmm2, 32(%1,%0) \n\t"
2275 "movaps %%xmm3, 48(%1,%0) \n\t"
2279 :"r"(dst), "r"(src), "m"(min), "m"(max)
2284 void ff_vp3_idct_mmx(int16_t *input_data);
2285 void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2286 void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, DCTELEM *block);
2288 void ff_vp3_idct_dc_add_mmx2(uint8_t *dest, int line_size, const DCTELEM *block);
2290 void ff_vp3_v_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2291 void ff_vp3_h_loop_filter_mmx2(uint8_t *src, int stride, int *bounding_values);
2293 void ff_vp3_idct_sse2(int16_t *input_data);
2294 void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2295 void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, DCTELEM *block);
2297 int32_t ff_scalarproduct_int16_mmx2(const int16_t *v1, const int16_t *v2, int order, int shift);
2298 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, int order, int shift);
2299 int32_t ff_scalarproduct_and_madd_int16_mmx2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2300 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2301 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, const int16_t *v3, int order, int mul);
2303 void ff_apply_window_int16_mmxext (int16_t *output, const int16_t *input,
2304 const int16_t *window, unsigned int len);
2305 void ff_apply_window_int16_mmxext_ba (int16_t *output, const int16_t *input,
2306 const int16_t *window, unsigned int len);
2307 void ff_apply_window_int16_sse2 (int16_t *output, const int16_t *input,
2308 const int16_t *window, unsigned int len);
2309 void ff_apply_window_int16_sse2_ba (int16_t *output, const int16_t *input,
2310 const int16_t *window, unsigned int len);
2311 void ff_apply_window_int16_ssse3 (int16_t *output, const int16_t *input,
2312 const int16_t *window, unsigned int len);
2313 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
2314 const int16_t *window, unsigned int len);
2316 void ff_add_hfyu_median_prediction_mmx2(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top);
2317 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, int w, int left);
2318 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, int w, int left);
2320 float ff_scalarproduct_float_sse(const float *v1, const float *v2, int order);
2322 void dsputil_init_mmx(DSPContext* c, AVCodecContext *avctx)
2324 int mm_flags = av_get_cpu_flags();
2325 const int h264_high_depth = avctx->codec_id == CODEC_ID_H264 && avctx->bits_per_raw_sample > 8;
2327 if (avctx->dsp_mask) {
2328 if (avctx->dsp_mask & AV_CPU_FLAG_FORCE)
2329 mm_flags |= (avctx->dsp_mask & 0xffff);
2331 mm_flags &= ~(avctx->dsp_mask & 0xffff);
2335 av_log(avctx, AV_LOG_INFO, "libavcodec: CPU flags:");
2336 if (mm_flags & AV_CPU_FLAG_MMX)
2337 av_log(avctx, AV_LOG_INFO, " mmx");
2338 if (mm_flags & AV_CPU_FLAG_MMX2)
2339 av_log(avctx, AV_LOG_INFO, " mmx2");
2340 if (mm_flags & AV_CPU_FLAG_3DNOW)
2341 av_log(avctx, AV_LOG_INFO, " 3dnow");
2342 if (mm_flags & AV_CPU_FLAG_SSE)
2343 av_log(avctx, AV_LOG_INFO, " sse");
2344 if (mm_flags & AV_CPU_FLAG_SSE2)
2345 av_log(avctx, AV_LOG_INFO, " sse2");
2346 av_log(avctx, AV_LOG_INFO, "\n");
2349 if (mm_flags & AV_CPU_FLAG_MMX) {
2350 const int idct_algo= avctx->idct_algo;
2352 if(avctx->lowres==0){
2353 if(idct_algo==FF_IDCT_AUTO || idct_algo==FF_IDCT_SIMPLEMMX){
2354 c->idct_put= ff_simple_idct_put_mmx;
2355 c->idct_add= ff_simple_idct_add_mmx;
2356 c->idct = ff_simple_idct_mmx;
2357 c->idct_permutation_type= FF_SIMPLE_IDCT_PERM;
2359 }else if(idct_algo==FF_IDCT_LIBMPEG2MMX){
2360 if(mm_flags & AV_CPU_FLAG_MMX2){
2361 c->idct_put= ff_libmpeg2mmx2_idct_put;
2362 c->idct_add= ff_libmpeg2mmx2_idct_add;
2363 c->idct = ff_mmxext_idct;
2365 c->idct_put= ff_libmpeg2mmx_idct_put;
2366 c->idct_add= ff_libmpeg2mmx_idct_add;
2367 c->idct = ff_mmx_idct;
2369 c->idct_permutation_type= FF_LIBMPEG2_IDCT_PERM;
2371 }else if((CONFIG_VP3_DECODER || CONFIG_VP5_DECODER || CONFIG_VP6_DECODER) &&
2372 idct_algo==FF_IDCT_VP3 && HAVE_YASM){
2373 if(mm_flags & AV_CPU_FLAG_SSE2){
2374 c->idct_put= ff_vp3_idct_put_sse2;
2375 c->idct_add= ff_vp3_idct_add_sse2;
2376 c->idct = ff_vp3_idct_sse2;
2377 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2379 c->idct_put= ff_vp3_idct_put_mmx;
2380 c->idct_add= ff_vp3_idct_add_mmx;
2381 c->idct = ff_vp3_idct_mmx;
2382 c->idct_permutation_type= FF_PARTTRANS_IDCT_PERM;
2384 }else if(idct_algo==FF_IDCT_CAVS){
2385 c->idct_permutation_type= FF_TRANSPOSE_IDCT_PERM;
2386 }else if(idct_algo==FF_IDCT_XVIDMMX){
2387 if(mm_flags & AV_CPU_FLAG_SSE2){
2388 c->idct_put= ff_idct_xvid_sse2_put;
2389 c->idct_add= ff_idct_xvid_sse2_add;
2390 c->idct = ff_idct_xvid_sse2;
2391 c->idct_permutation_type= FF_SSE2_IDCT_PERM;
2392 }else if(mm_flags & AV_CPU_FLAG_MMX2){
2393 c->idct_put= ff_idct_xvid_mmx2_put;
2394 c->idct_add= ff_idct_xvid_mmx2_add;
2395 c->idct = ff_idct_xvid_mmx2;
2397 c->idct_put= ff_idct_xvid_mmx_put;
2398 c->idct_add= ff_idct_xvid_mmx_add;
2399 c->idct = ff_idct_xvid_mmx;
2404 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
2405 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
2406 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
2407 if (!h264_high_depth) {
2408 c->clear_block = clear_block_mmx;
2409 c->clear_blocks = clear_blocks_mmx;
2410 if ((mm_flags & AV_CPU_FLAG_SSE) &&
2411 !(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)){
2412 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
2413 c->clear_block = clear_block_sse;
2414 c->clear_blocks = clear_blocks_sse;
2418 #define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2419 c->PFX ## _pixels_tab[IDX][0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
2420 c->PFX ## _pixels_tab[IDX][1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
2421 c->PFX ## _pixels_tab[IDX][2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
2422 c->PFX ## _pixels_tab[IDX][3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU
2424 if (!h264_high_depth) {
2425 SET_HPEL_FUNCS(put, 0, 16, mmx);
2426 SET_HPEL_FUNCS(put_no_rnd, 0, 16, mmx);
2427 SET_HPEL_FUNCS(avg, 0, 16, mmx);
2428 SET_HPEL_FUNCS(avg_no_rnd, 0, 16, mmx);
2429 SET_HPEL_FUNCS(put, 1, 8, mmx);
2430 SET_HPEL_FUNCS(put_no_rnd, 1, 8, mmx);
2431 SET_HPEL_FUNCS(avg, 1, 8, mmx);
2432 SET_HPEL_FUNCS(avg_no_rnd, 1, 8, mmx);
2435 #if ARCH_X86_32 || !HAVE_YASM
2438 #if ARCH_X86_32 && HAVE_YASM
2439 if (!h264_high_depth)
2440 c->emulated_edge_mc = emulated_edge_mc_mmx;
2443 c->add_bytes= add_bytes_mmx;
2445 if (!h264_high_depth)
2446 c->draw_edges = draw_edges_mmx;
2448 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
2449 c->h263_v_loop_filter= h263_v_loop_filter_mmx;
2450 c->h263_h_loop_filter= h263_h_loop_filter_mmx;
2454 if (!h264_high_depth) {
2455 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_mmx_rnd;
2456 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_mmx;
2459 c->put_rv40_chroma_pixels_tab[0]= ff_put_rv40_chroma_mc8_mmx;
2460 c->put_rv40_chroma_pixels_tab[1]= ff_put_rv40_chroma_mc4_mmx;
2463 if (mm_flags & AV_CPU_FLAG_MMX2) {
2464 c->prefetch = prefetch_mmx2;
2466 if (!h264_high_depth) {
2467 c->put_pixels_tab[0][1] = put_pixels16_x2_mmx2;
2468 c->put_pixels_tab[0][2] = put_pixels16_y2_mmx2;
2470 c->avg_pixels_tab[0][0] = avg_pixels16_mmx2;
2471 c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmx2;
2472 c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmx2;
2474 c->put_pixels_tab[1][1] = put_pixels8_x2_mmx2;
2475 c->put_pixels_tab[1][2] = put_pixels8_y2_mmx2;
2477 c->avg_pixels_tab[1][0] = avg_pixels8_mmx2;
2478 c->avg_pixels_tab[1][1] = avg_pixels8_x2_mmx2;
2479 c->avg_pixels_tab[1][2] = avg_pixels8_y2_mmx2;
2482 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2483 if (!h264_high_depth) {
2484 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmx2;
2485 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmx2;
2486 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_mmx2;
2487 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_mmx2;
2488 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmx2;
2489 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_mmx2;
2492 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2493 c->vp3_v_loop_filter= ff_vp3_v_loop_filter_mmx2;
2494 c->vp3_h_loop_filter= ff_vp3_h_loop_filter_mmx2;
2497 if (CONFIG_VP3_DECODER && HAVE_YASM) {
2498 c->vp3_idct_dc_add = ff_vp3_idct_dc_add_mmx2;
2501 if (CONFIG_VP3_DECODER
2502 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2503 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_mmx2;
2504 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_mmx2;
2507 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU) \
2508 c->PFX ## _pixels_tab[IDX][ 0] = PFX ## SIZE ## _mc00_ ## CPU; \
2509 c->PFX ## _pixels_tab[IDX][ 1] = PFX ## SIZE ## _mc10_ ## CPU; \
2510 c->PFX ## _pixels_tab[IDX][ 2] = PFX ## SIZE ## _mc20_ ## CPU; \
2511 c->PFX ## _pixels_tab[IDX][ 3] = PFX ## SIZE ## _mc30_ ## CPU; \
2512 c->PFX ## _pixels_tab[IDX][ 4] = PFX ## SIZE ## _mc01_ ## CPU; \
2513 c->PFX ## _pixels_tab[IDX][ 5] = PFX ## SIZE ## _mc11_ ## CPU; \
2514 c->PFX ## _pixels_tab[IDX][ 6] = PFX ## SIZE ## _mc21_ ## CPU; \
2515 c->PFX ## _pixels_tab[IDX][ 7] = PFX ## SIZE ## _mc31_ ## CPU; \
2516 c->PFX ## _pixels_tab[IDX][ 8] = PFX ## SIZE ## _mc02_ ## CPU; \
2517 c->PFX ## _pixels_tab[IDX][ 9] = PFX ## SIZE ## _mc12_ ## CPU; \
2518 c->PFX ## _pixels_tab[IDX][10] = PFX ## SIZE ## _mc22_ ## CPU; \
2519 c->PFX ## _pixels_tab[IDX][11] = PFX ## SIZE ## _mc32_ ## CPU; \
2520 c->PFX ## _pixels_tab[IDX][12] = PFX ## SIZE ## _mc03_ ## CPU; \
2521 c->PFX ## _pixels_tab[IDX][13] = PFX ## SIZE ## _mc13_ ## CPU; \
2522 c->PFX ## _pixels_tab[IDX][14] = PFX ## SIZE ## _mc23_ ## CPU; \
2523 c->PFX ## _pixels_tab[IDX][15] = PFX ## SIZE ## _mc33_ ## CPU
2525 SET_QPEL_FUNCS(put_qpel, 0, 16, mmx2);
2526 SET_QPEL_FUNCS(put_qpel, 1, 8, mmx2);
2527 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmx2);
2528 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmx2);
2529 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmx2);
2530 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmx2);
2532 if (!h264_high_depth) {
2533 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmx2);
2534 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmx2);
2535 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmx2);
2536 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmx2);
2537 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmx2);
2538 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmx2);
2541 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, mmx2);
2542 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, mmx2);
2543 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, mmx2);
2544 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, mmx2);
2547 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_mmx2;
2548 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_mmx2;
2550 if (!h264_high_depth) {
2551 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_mmx2_rnd;
2552 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_mmx2;
2553 c->avg_h264_chroma_pixels_tab[2]= ff_avg_h264_chroma_mc2_mmx2;
2554 c->put_h264_chroma_pixels_tab[2]= ff_put_h264_chroma_mc2_mmx2;
2557 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmx2;
2559 #if HAVE_7REGS && HAVE_TEN_OPERANDS
2560 if( mm_flags&AV_CPU_FLAG_3DNOW )
2561 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
2564 } else if (mm_flags & AV_CPU_FLAG_3DNOW) {
2565 c->prefetch = prefetch_3dnow;
2567 if (!h264_high_depth) {
2568 c->put_pixels_tab[0][1] = put_pixels16_x2_3dnow;
2569 c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
2571 c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
2572 c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
2573 c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
2575 c->put_pixels_tab[1][1] = put_pixels8_x2_3dnow;
2576 c->put_pixels_tab[1][2] = put_pixels8_y2_3dnow;
2578 c->avg_pixels_tab[1][0] = avg_pixels8_3dnow;
2579 c->avg_pixels_tab[1][1] = avg_pixels8_x2_3dnow;
2580 c->avg_pixels_tab[1][2] = avg_pixels8_y2_3dnow;
2582 if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
2583 c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
2584 c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
2585 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_3dnow;
2586 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_3dnow;
2587 c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
2588 c->avg_pixels_tab[1][3] = avg_pixels8_xy2_3dnow;
2592 if (CONFIG_VP3_DECODER
2593 && (avctx->codec_id == CODEC_ID_VP3 || avctx->codec_id == CODEC_ID_THEORA)) {
2594 c->put_no_rnd_pixels_tab[1][1] = put_no_rnd_pixels8_x2_exact_3dnow;
2595 c->put_no_rnd_pixels_tab[1][2] = put_no_rnd_pixels8_y2_exact_3dnow;
2598 SET_QPEL_FUNCS(put_qpel, 0, 16, 3dnow);
2599 SET_QPEL_FUNCS(put_qpel, 1, 8, 3dnow);
2600 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, 3dnow);
2601 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, 3dnow);
2602 SET_QPEL_FUNCS(avg_qpel, 0, 16, 3dnow);
2603 SET_QPEL_FUNCS(avg_qpel, 1, 8, 3dnow);
2605 if (!h264_high_depth) {
2606 SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 3dnow);
2607 SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 3dnow);
2608 SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 3dnow);
2609 SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 3dnow);
2610 SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 3dnow);
2611 SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 3dnow);
2614 SET_QPEL_FUNCS(put_2tap_qpel, 0, 16, 3dnow);
2615 SET_QPEL_FUNCS(put_2tap_qpel, 1, 8, 3dnow);
2616 SET_QPEL_FUNCS(avg_2tap_qpel, 0, 16, 3dnow);
2617 SET_QPEL_FUNCS(avg_2tap_qpel, 1, 8, 3dnow);
2620 if (!h264_high_depth) {
2621 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_3dnow_rnd;
2622 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_3dnow;
2625 c->avg_rv40_chroma_pixels_tab[0]= ff_avg_rv40_chroma_mc8_3dnow;
2626 c->avg_rv40_chroma_pixels_tab[1]= ff_avg_rv40_chroma_mc4_3dnow;
2631 #define H264_QPEL_FUNCS(x, y, CPU)\
2632 c->put_h264_qpel_pixels_tab[0][x+y*4] = put_h264_qpel16_mc##x##y##_##CPU;\
2633 c->put_h264_qpel_pixels_tab[1][x+y*4] = put_h264_qpel8_mc##x##y##_##CPU;\
2634 c->avg_h264_qpel_pixels_tab[0][x+y*4] = avg_h264_qpel16_mc##x##y##_##CPU;\
2635 c->avg_h264_qpel_pixels_tab[1][x+y*4] = avg_h264_qpel8_mc##x##y##_##CPU;
2636 if((mm_flags & AV_CPU_FLAG_SSE2) && !(mm_flags & AV_CPU_FLAG_3DNOW)){
2637 // these functions are slower than mmx on AMD, but faster on Intel
2638 if (!h264_high_depth) {
2639 c->put_pixels_tab[0][0] = put_pixels16_sse2;
2640 c->put_no_rnd_pixels_tab[0][0] = put_pixels16_sse2;
2641 c->avg_pixels_tab[0][0] = avg_pixels16_sse2;
2642 H264_QPEL_FUNCS(0, 0, sse2);
2645 if(mm_flags & AV_CPU_FLAG_SSE2){
2646 if (!h264_high_depth) {
2647 H264_QPEL_FUNCS(0, 1, sse2);
2648 H264_QPEL_FUNCS(0, 2, sse2);
2649 H264_QPEL_FUNCS(0, 3, sse2);
2650 H264_QPEL_FUNCS(1, 1, sse2);
2651 H264_QPEL_FUNCS(1, 2, sse2);
2652 H264_QPEL_FUNCS(1, 3, sse2);
2653 H264_QPEL_FUNCS(2, 1, sse2);
2654 H264_QPEL_FUNCS(2, 2, sse2);
2655 H264_QPEL_FUNCS(2, 3, sse2);
2656 H264_QPEL_FUNCS(3, 1, sse2);
2657 H264_QPEL_FUNCS(3, 2, sse2);
2658 H264_QPEL_FUNCS(3, 3, sse2);
2662 if(mm_flags & AV_CPU_FLAG_SSSE3){
2663 if (!h264_high_depth) {
2664 H264_QPEL_FUNCS(1, 0, ssse3);
2665 H264_QPEL_FUNCS(1, 1, ssse3);
2666 H264_QPEL_FUNCS(1, 2, ssse3);
2667 H264_QPEL_FUNCS(1, 3, ssse3);
2668 H264_QPEL_FUNCS(2, 0, ssse3);
2669 H264_QPEL_FUNCS(2, 1, ssse3);
2670 H264_QPEL_FUNCS(2, 2, ssse3);
2671 H264_QPEL_FUNCS(2, 3, ssse3);
2672 H264_QPEL_FUNCS(3, 0, ssse3);
2673 H264_QPEL_FUNCS(3, 1, ssse3);
2674 H264_QPEL_FUNCS(3, 2, ssse3);
2675 H264_QPEL_FUNCS(3, 3, ssse3);
2678 if (!h264_high_depth) {
2679 c->put_h264_chroma_pixels_tab[0]= ff_put_h264_chroma_mc8_ssse3_rnd;
2680 c->avg_h264_chroma_pixels_tab[0]= ff_avg_h264_chroma_mc8_ssse3_rnd;
2681 c->put_h264_chroma_pixels_tab[1]= ff_put_h264_chroma_mc4_ssse3;
2682 c->avg_h264_chroma_pixels_tab[1]= ff_avg_h264_chroma_mc4_ssse3;
2684 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
2685 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
2686 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
2691 if(mm_flags & AV_CPU_FLAG_3DNOW){
2692 c->vorbis_inverse_coupling = vorbis_inverse_coupling_3dnow;
2693 c->vector_fmul = vector_fmul_3dnow;
2695 if(mm_flags & AV_CPU_FLAG_3DNOWEXT){
2696 c->vector_fmul_reverse = vector_fmul_reverse_3dnow2;
2698 c->vector_fmul_window = vector_fmul_window_3dnow2;
2701 if(mm_flags & AV_CPU_FLAG_MMX2){
2703 c->scalarproduct_int16 = ff_scalarproduct_int16_mmx2;
2704 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmx2;
2705 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2706 c->apply_window_int16 = ff_apply_window_int16_mmxext_ba;
2708 c->apply_window_int16 = ff_apply_window_int16_mmxext;
2712 if(mm_flags & AV_CPU_FLAG_SSE){
2713 c->vorbis_inverse_coupling = vorbis_inverse_coupling_sse;
2714 c->ac3_downmix = ac3_downmix_sse;
2715 c->vector_fmul = vector_fmul_sse;
2716 c->vector_fmul_reverse = vector_fmul_reverse_sse;
2717 c->vector_fmul_add = vector_fmul_add_sse;
2719 c->vector_fmul_window = vector_fmul_window_sse;
2721 c->vector_clipf = vector_clipf_sse;
2723 c->scalarproduct_float = ff_scalarproduct_float_sse;
2726 if(mm_flags & AV_CPU_FLAG_3DNOW)
2727 c->vector_fmul_add = vector_fmul_add_3dnow; // faster than sse
2728 if(mm_flags & AV_CPU_FLAG_SSE2){
2730 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
2731 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
2732 if (avctx->flags & CODEC_FLAG_BITEXACT) {
2733 c->apply_window_int16 = ff_apply_window_int16_sse2_ba;
2735 if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
2736 c->apply_window_int16 = ff_apply_window_int16_sse2;
2740 if (!h264_high_depth)
2741 c->emulated_edge_mc = emulated_edge_mc_sse;
2745 if (mm_flags & AV_CPU_FLAG_SSSE3) {
2747 if (mm_flags & AV_CPU_FLAG_ATOM) {
2748 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
2750 c->apply_window_int16 = ff_apply_window_int16_ssse3;
2752 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) { // cachesplit
2753 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
2759 if (CONFIG_ENCODERS)
2760 dsputilenc_init_mmx(c, avctx);
2763 // for speed testing
2764 get_pixels = just_return;
2765 put_pixels_clamped = just_return;
2766 add_pixels_clamped = just_return;
2768 pix_abs16x16 = just_return;
2769 pix_abs16x16_x2 = just_return;
2770 pix_abs16x16_y2 = just_return;
2771 pix_abs16x16_xy2 = just_return;
2773 put_pixels_tab[0] = just_return;
2774 put_pixels_tab[1] = just_return;
2775 put_pixels_tab[2] = just_return;
2776 put_pixels_tab[3] = just_return;
2778 put_no_rnd_pixels_tab[0] = just_return;
2779 put_no_rnd_pixels_tab[1] = just_return;
2780 put_no_rnd_pixels_tab[2] = just_return;
2781 put_no_rnd_pixels_tab[3] = just_return;
2783 avg_pixels_tab[0] = just_return;
2784 avg_pixels_tab[1] = just_return;
2785 avg_pixels_tab[2] = just_return;
2786 avg_pixels_tab[3] = just_return;
2788 avg_no_rnd_pixels_tab[0] = just_return;
2789 avg_no_rnd_pixels_tab[1] = just_return;
2790 avg_no_rnd_pixels_tab[2] = just_return;
2791 avg_no_rnd_pixels_tab[3] = just_return;
2793 //av_fdct = just_return;
2794 //ff_idct = just_return;