/*
 * MMX optimized DSP utils
 * Copyright (c) 2000, 2001 Fabrice Bellard
 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
 *
 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
 *
 * This file is part of Libav.
 *
 * Libav is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * Libav is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with Libav; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
38 /* pixel operations */
/* Packed rounding/bias constants used by the MMX/SSE2 routines below:
 * ff_pw_* are packed 16-bit words, ff_pb_* packed bytes, ff_pd_* doubles.
 * NOTE(review): every line in this chunk carries a stray leading number
 * (the original file's line numbering fused into the text by a bad
 * extraction) and many intermediate lines are missing; the chunk cannot
 * compile as-is — restore from upstream before building. */
39 DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
40 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
41 DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
42 DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
43 DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
44 DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
45 DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
46 DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
47 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
48 DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
50 DECLARE_ALIGNED(8, const uint64_t, ff_pb_3F) = 0x3F3F3F3F3F3F3F3FULL;
51 DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
53 DECLARE_ALIGNED(16, const double, ff_pd_1)[2] = { 1.0, 1.0 };
54 DECLARE_ALIGNED(16, const double, ff_pd_2)[2] = { 2.0, 2.0 };
/* Prototypes for MMXEXT routines implemented in external assembly:
 * two-source pixel put/avg (with and without rounding) and the MPEG-4
 * qpel horizontal/vertical lowpass filters used by QPEL_OP below.
 * NOTE(review): the two no_rnd_*_h_lowpass prototypes below appear to
 * have lost their trailing "int h);" line to extraction damage. */
57 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
58 int dstStride, int src1Stride, int h);
59 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
60 uint8_t *src2, int dstStride,
61 int src1Stride, int h);
62 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
63 int dstStride, int src1Stride, int h);
64 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
65 int dstStride, int src1Stride, int h);
66 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
67 int dstStride, int src1Stride, int h);
68 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
69 int dstStride, int src1Stride, int h);
70 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
71 int dstStride, int srcStride, int h);
72 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
73 int dstStride, int srcStride, int h);
74 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
75 int dstStride, int srcStride,
77 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
78 int dstStride, int srcStride, int h);
79 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
80 int dstStride, int srcStride, int h);
81 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
82 int dstStride, int srcStride,
84 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
85 int dstStride, int srcStride);
86 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
87 int dstStride, int srcStride);
88 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
89 int dstStride, int srcStride);
90 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
91 int dstStride, int srcStride);
92 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
93 int dstStride, int srcStride);
94 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
95 int dstStride, int srcStride);
/* Full-pel "no rounding" copies are identical to the rounded copies,
 * so alias them rather than duplicating the assembly. */
96 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
97 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
102 /***********************************/
/* Instantiate the rounding pixel-average templates from rnd_template.c:
 * DEF() names the generated *_mmx functions, SET_RND selects two-word
 * rounding, and PAVGBP/PAVGB map to the MMX averaging primitives. */
105 #define DEF(x, y) x ## _ ## y ## _mmx
106 #define SET_RND MOVQ_WTWO
107 #define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
108 #define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
110 #include "rnd_template.c"
117 /***********************************/
/* Store a 64-entry int16 coefficient block as an 8x8 pixel block,
 * clamping each value to [0,255] via packuswb. Two unrolled halves of
 * four rows each; the second half is kept as a separate asm statement
 * (see the comment below about compiler codegen).
 * NOTE(review): the declarations, loop scaffolding, clobber lists and
 * closing braces between the two asm statements are missing from this
 * chunk (extraction damage) — compare against upstream. */
120 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
126 /* read the pixels */
131 "movq (%3), %%mm0 \n\t"
132 "movq 8(%3), %%mm1 \n\t"
133 "movq 16(%3), %%mm2 \n\t"
134 "movq 24(%3), %%mm3 \n\t"
135 "movq 32(%3), %%mm4 \n\t"
136 "movq 40(%3), %%mm5 \n\t"
137 "movq 48(%3), %%mm6 \n\t"
138 "movq 56(%3), %%mm7 \n\t"
139 "packuswb %%mm1, %%mm0 \n\t"
140 "packuswb %%mm3, %%mm2 \n\t"
141 "packuswb %%mm5, %%mm4 \n\t"
142 "packuswb %%mm7, %%mm6 \n\t"
143 "movq %%mm0, (%0) \n\t"
144 "movq %%mm2, (%0, %1) \n\t"
145 "movq %%mm4, (%0, %1, 2) \n\t"
146 "movq %%mm6, (%0, %2) \n\t"
147 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
150 pix += line_size * 4;
153 // if here would be an exact copy of the code above
154 // compiler would generate some very strange code
157 "movq (%3), %%mm0 \n\t"
158 "movq 8(%3), %%mm1 \n\t"
159 "movq 16(%3), %%mm2 \n\t"
160 "movq 24(%3), %%mm3 \n\t"
161 "movq 32(%3), %%mm4 \n\t"
162 "movq 40(%3), %%mm5 \n\t"
163 "movq 48(%3), %%mm6 \n\t"
164 "movq 56(%3), %%mm7 \n\t"
165 "packuswb %%mm1, %%mm0 \n\t"
166 "packuswb %%mm3, %%mm2 \n\t"
167 "packuswb %%mm5, %%mm4 \n\t"
168 "packuswb %%mm7, %%mm6 \n\t"
169 "movq %%mm0, (%0) \n\t"
170 "movq %%mm2, (%0, %1) \n\t"
171 "movq %%mm4, (%0, %1, 2) \n\t"
172 "movq %%mm6, (%0, %2) \n\t"
173 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/* Emit asm for one half (four rows) of ff_put_signed_pixels_clamped_mmx:
 * pack signed int16 coefficients at byte offset 'off' to signed bytes,
 * add the 0x80 bias held in mm0, and store four 8-byte rows.
 * Operands (bound by the caller): %0 = pixels, %1 = 3*line_skip,
 * %2 = block, %3 = line_skip. */
177 #define put_signed_pixels_clamped_mmx_half(off) \
178 "movq "#off"(%2), %%mm1 \n\t" \
179 "movq 16 + "#off"(%2), %%mm2 \n\t" \
180 "movq 32 + "#off"(%2), %%mm3 \n\t" \
181 "movq 48 + "#off"(%2), %%mm4 \n\t" \
182 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
183 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
184 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
185 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
186 "paddb %%mm0, %%mm1 \n\t" \
187 "paddb %%mm0, %%mm2 \n\t" \
188 "paddb %%mm0, %%mm3 \n\t" \
189 "paddb %%mm0, %%mm4 \n\t" \
190 "movq %%mm1, (%0) \n\t" \
191 "movq %%mm2, (%0, %3) \n\t" \
192 "movq %%mm3, (%0, %3, 2) \n\t" \
193 "movq %%mm4, (%0, %1) \n\t"
/* Store a 64-entry int16 block as 8x8 pixels, clamping the signed values
 * and re-biasing by +128 (ff_pb_80). Uses the half-block macro above.
 * NOTE(review): the opening brace, the line_skip3 declaration and the
 * closing of the asm statement are missing from this chunk. */
195 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
198 x86_reg line_skip = line_size;
202 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
203 "lea (%3, %3, 2), %1 \n\t"
204 put_signed_pixels_clamped_mmx_half(0)
205 "lea (%0, %3, 4), %0 \n\t"
206 put_signed_pixels_clamped_mmx_half(64)
207 : "+&r"(pixels), "=&r"(line_skip3)
208 : "r"(block), "r"(line_skip)
/* Add a 64-entry int16 coefficient block to the 8x8 pixel block at
 * 'pixels' with unsigned-byte saturation; two rows per asm iteration
 * (mm7 is expected to be zero for the byte->word unpacks).
 * NOTE(review): the function scaffolding (braces, loop, mm7 clear) is
 * missing from this chunk (extraction damage). */
212 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
219 /* read the pixels */
226 "movq (%2), %%mm0 \n\t"
227 "movq 8(%2), %%mm1 \n\t"
228 "movq 16(%2), %%mm2 \n\t"
229 "movq 24(%2), %%mm3 \n\t"
230 "movq %0, %%mm4 \n\t"
231 "movq %1, %%mm6 \n\t"
232 "movq %%mm4, %%mm5 \n\t"
233 "punpcklbw %%mm7, %%mm4 \n\t"
234 "punpckhbw %%mm7, %%mm5 \n\t"
235 "paddsw %%mm4, %%mm0 \n\t"
236 "paddsw %%mm5, %%mm1 \n\t"
237 "movq %%mm6, %%mm5 \n\t"
238 "punpcklbw %%mm7, %%mm6 \n\t"
239 "punpckhbw %%mm7, %%mm5 \n\t"
240 "paddsw %%mm6, %%mm2 \n\t"
241 "paddsw %%mm5, %%mm3 \n\t"
242 "packuswb %%mm1, %%mm0 \n\t"
243 "packuswb %%mm3, %%mm2 \n\t"
244 "movq %%mm0, %0 \n\t"
245 "movq %%mm2, %1 \n\t"
246 : "+m"(*pix), "+m"(*(pix + line_size))
249 pix += line_size * 2;
/* Generate a function that zeroes n consecutive 64-entry int16 blocks
 * (128 bytes each) with MMX stores, 32 bytes per loop iteration.
 * Instantiated below for 6 blocks (clear_blocks) and 1 (clear_block).
 * NOTE(review): the loop label/branch and clobber lines of the macro
 * body are missing from this chunk. */
254 #define CLEAR_BLOCKS(name, n) \
255 static void name(int16_t *blocks) \
258 "pxor %%mm7, %%mm7 \n\t" \
259 "mov %1, %%"REG_a" \n\t" \
261 "movq %%mm7, (%0, %%"REG_a") \n\t" \
262 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
263 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
264 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
265 "add $32, %%"REG_a" \n\t" \
267 :: "r"(((uint8_t *)blocks) + 128 * n), \
272 CLEAR_BLOCKS(clear_blocks_mmx, 6)
273 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 64-entry int16 block (128 bytes) with eight aligned SSE
 * stores; requires 16-byte alignment of 'block'. */
275 static void clear_block_sse(int16_t *block)
278 "xorps %%xmm0, %%xmm0 \n"
279 "movaps %%xmm0, (%0) \n"
280 "movaps %%xmm0, 16(%0) \n"
281 "movaps %%xmm0, 32(%0) \n"
282 "movaps %%xmm0, 48(%0) \n"
283 "movaps %%xmm0, 64(%0) \n"
284 "movaps %%xmm0, 80(%0) \n"
285 "movaps %%xmm0, 96(%0) \n"
286 "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive 128-byte int16 blocks (768 bytes total) with
 * aligned SSE stores, 128 bytes per loop iteration.
 * NOTE(review): the loop label/branch and constraint tail are missing
 * from this chunk (extraction damage). */
292 static void clear_blocks_sse(int16_t *blocks)
295 "xorps %%xmm0, %%xmm0 \n"
296 "mov %1, %%"REG_a" \n"
298 "movaps %%xmm0, (%0, %%"REG_a") \n"
299 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
300 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
301 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
302 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
303 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
304 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
305 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
306 "add $128, %%"REG_a" \n"
308 :: "r"(((uint8_t *)blocks) + 128 * 6),
/* dst[i] += src[i] for w bytes: 16 bytes per MMX iteration, with a
 * scalar tail loop for the remainder (the "w - 15" bound leaves the
 * last partial 16-byte chunk to the scalar code).
 * NOTE(review): the loop structure and scalar tail are only partially
 * present in this chunk (extraction damage). */
314 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
320 "movq (%1, %0), %%mm0 \n\t"
321 "movq (%2, %0), %%mm1 \n\t"
322 "paddb %%mm0, %%mm1 \n\t"
323 "movq %%mm1, (%2, %0) \n\t"
324 "movq 8(%1, %0), %%mm0 \n\t"
325 "movq 8(%2, %0), %%mm1 \n\t"
326 "paddb %%mm0, %%mm1 \n\t"
327 "movq %%mm1, 8(%2, %0) \n\t"
333 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
336 dst[i + 0] += src[i + 0];
/* HuffYUV median prediction using cmov: for each byte, add the median
 * of (left, top, left+top-topleft) predictor to diff[] and store into
 * dst[]. left/left_top carry prediction state across calls (low byte).
 * NOTE(review): the core cmov loop body is almost entirely missing from
 * this chunk — only the load/store bookends survive. */
340 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
341 const uint8_t *diff, int w,
342 int *left, int *left_top)
346 int l = *left & 0xff;
347 int tl = *left_top & 0xff;
352 "movzbl (%3, %4), %2 \n"
365 "add (%6, %4), %b0 \n"
366 "mov %b0, (%5, %4) \n"
369 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
370 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
377 /* Draw the edges of width 'w' of an image of size width, height
378 * this MMX version can only handle w == 8 || w == 16. */
/* 'sides' selects which borders to replicate (EDGE_TOP / EDGE_BOTTOM);
 * left/right edges replicate the first/last pixel of each row, top and
 * bottom replicate whole rows, four rows per asm iteration.
 * NOTE(review): the per-case branches, loop labels and constraint
 * heads are missing from this chunk (extraction damage). */
379 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
380 int w, int h, int sides)
382 uint8_t *ptr, *last_line;
385 last_line = buf + (height - 1) * wrap;
391 "movd (%0), %%mm0 \n\t"
392 "punpcklbw %%mm0, %%mm0 \n\t"
393 "punpcklwd %%mm0, %%mm0 \n\t"
394 "punpckldq %%mm0, %%mm0 \n\t"
395 "movq %%mm0, -8(%0) \n\t"
396 "movq -8(%0, %2), %%mm1 \n\t"
397 "punpckhbw %%mm1, %%mm1 \n\t"
398 "punpckhwd %%mm1, %%mm1 \n\t"
399 "punpckhdq %%mm1, %%mm1 \n\t"
400 "movq %%mm1, (%0, %2) \n\t"
405 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
410 "movd (%0), %%mm0 \n\t"
411 "punpcklbw %%mm0, %%mm0 \n\t"
412 "punpcklwd %%mm0, %%mm0 \n\t"
413 "punpckldq %%mm0, %%mm0 \n\t"
414 "movq %%mm0, -8(%0) \n\t"
415 "movq %%mm0, -16(%0) \n\t"
416 "movq -8(%0, %2), %%mm1 \n\t"
417 "punpckhbw %%mm1, %%mm1 \n\t"
418 "punpckhwd %%mm1, %%mm1 \n\t"
419 "punpckhdq %%mm1, %%mm1 \n\t"
420 "movq %%mm1, (%0, %2) \n\t"
421 "movq %%mm1, 8(%0, %2) \n\t"
426 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
430 /* top and bottom (and hopefully also the corners) */
431 if (sides & EDGE_TOP) {
432 for (i = 0; i < h; i += 4) {
433 ptr = buf - (i + 1) * wrap - w;
436 "movq (%1, %0), %%mm0 \n\t"
437 "movq %%mm0, (%0) \n\t"
438 "movq %%mm0, (%0, %2) \n\t"
439 "movq %%mm0, (%0, %2, 2) \n\t"
440 "movq %%mm0, (%0, %3) \n\t"
445 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
446 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
451 if (sides & EDGE_BOTTOM) {
452 for (i = 0; i < h; i += 4) {
453 ptr = last_line + (i + 1) * wrap - w;
456 "movq (%1, %0), %%mm0 \n\t"
457 "movq %%mm0, (%0) \n\t"
458 "movq %%mm0, (%0, %2) \n\t"
459 "movq %%mm0, (%0, %2, 2) \n\t"
460 "movq %%mm0, (%0, %3) \n\t"
465 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
466 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
467 "r"(ptr + width + 2 * w)
472 #endif /* HAVE_INLINE_ASM */
/* 16-wide average copy built from two 8-wide MMXEXT calls.
 * NOTE(review): signature uses 'int line_size' while the put variant
 * below takes 'ptrdiff_t' — likely a missed stride-type conversion;
 * confirm against the ff_avg_pixels8_mmxext prototype upstream.
 * Braces appear lost to extraction damage. */
476 static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
477 int line_size, int h)
479 ff_avg_pixels8_mmxext(block, pixels, line_size, h);
480 ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
/* 16-wide put copy built from two 8-wide MMXEXT calls (left half,
 * then right half at +8). Braces appear lost to extraction damage. */
483 static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
484 ptrdiff_t line_size, int h)
486 ff_put_pixels8_mmxext(block, pixels, line_size, h);
487 ff_put_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
/* Generate the full set of 16 quarter-pel motion-compensation functions
 * (mcXY, X/Y in 0..3) for both 8x8 and 16x16 blocks. OPNAME selects
 * put/avg, RND selects rounded vs no-rounding intermediates, MMX is the
 * cpu suffix. Each mcXY combines the external h/v lowpass filters and
 * l2 averaging into the requested quarter-pel position, using stack
 * temporaries for the intermediate half-pel planes.
 * NOTE(review): many macro-body lines (temp declarations, argument
 * tails, closing braces/continuations) are missing from this chunk —
 * the macro cannot be compiled or safely edited as shown; restore from
 * upstream before modifying. */
490 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
491 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
494 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
497 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
501 uint8_t * const half = (uint8_t*)temp; \
502 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
504 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
505 stride, stride, 8); \
508 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
511 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
515 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
519 uint8_t * const half = (uint8_t*)temp; \
520 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
522 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
526 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
530 uint8_t * const half = (uint8_t*)temp; \
531 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
533 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
534 stride, stride, 8); \
537 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
540 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
544 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
548 uint8_t * const half = (uint8_t*)temp; \
549 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
551 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
555 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
558 uint64_t half[8 + 9]; \
559 uint8_t * const halfH = ((uint8_t*)half) + 64; \
560 uint8_t * const halfHV = ((uint8_t*)half); \
561 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
563 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
565 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
566 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
570 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
573 uint64_t half[8 + 9]; \
574 uint8_t * const halfH = ((uint8_t*)half) + 64; \
575 uint8_t * const halfHV = ((uint8_t*)half); \
576 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
578 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
580 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
581 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
585 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
588 uint64_t half[8 + 9]; \
589 uint8_t * const halfH = ((uint8_t*)half) + 64; \
590 uint8_t * const halfHV = ((uint8_t*)half); \
591 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
593 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
595 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
596 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
600 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
603 uint64_t half[8 + 9]; \
604 uint8_t * const halfH = ((uint8_t*)half) + 64; \
605 uint8_t * const halfHV = ((uint8_t*)half); \
606 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
608 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
610 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
611 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
615 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
618 uint64_t half[8 + 9]; \
619 uint8_t * const halfH = ((uint8_t*)half) + 64; \
620 uint8_t * const halfHV = ((uint8_t*)half); \
621 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
623 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
624 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
628 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
631 uint64_t half[8 + 9]; \
632 uint8_t * const halfH = ((uint8_t*)half) + 64; \
633 uint8_t * const halfHV = ((uint8_t*)half); \
634 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
636 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
637 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
641 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
644 uint64_t half[8 + 9]; \
645 uint8_t * const halfH = ((uint8_t*)half); \
646 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
648 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
650 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
654 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
657 uint64_t half[8 + 9]; \
658 uint8_t * const halfH = ((uint8_t*)half); \
659 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
661 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
663 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
667 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
671 uint8_t * const halfH = ((uint8_t*)half); \
672 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
674 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
678 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
681 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
684 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
688 uint8_t * const half = (uint8_t*)temp; \
689 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
691 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
695 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
698 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
699 stride, stride, 16);\
702 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
706 uint8_t * const half = (uint8_t*)temp; \
707 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
709 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
710 stride, stride, 16); \
713 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
717 uint8_t * const half = (uint8_t*)temp; \
718 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
720 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
724 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
727 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
731 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
735 uint8_t * const half = (uint8_t*)temp; \
736 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
738 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
739 stride, stride, 16); \
742 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
745 uint64_t half[16 * 2 + 17 * 2]; \
746 uint8_t * const halfH = ((uint8_t*)half) + 256; \
747 uint8_t * const halfHV = ((uint8_t*)half); \
748 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
750 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
752 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
754 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
758 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
761 uint64_t half[16 * 2 + 17 * 2]; \
762 uint8_t * const halfH = ((uint8_t*)half) + 256; \
763 uint8_t * const halfHV = ((uint8_t*)half); \
764 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
766 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
768 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
770 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
774 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
777 uint64_t half[16 * 2 + 17 * 2]; \
778 uint8_t * const halfH = ((uint8_t*)half) + 256; \
779 uint8_t * const halfHV = ((uint8_t*)half); \
780 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
782 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
784 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
786 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
790 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
793 uint64_t half[16 * 2 + 17 * 2]; \
794 uint8_t * const halfH = ((uint8_t*)half) + 256; \
795 uint8_t * const halfHV = ((uint8_t*)half); \
796 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
798 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
800 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
802 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
806 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
809 uint64_t half[16 * 2 + 17 * 2]; \
810 uint8_t * const halfH = ((uint8_t*)half) + 256; \
811 uint8_t * const halfHV = ((uint8_t*)half); \
812 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
814 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
816 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
820 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
823 uint64_t half[16 * 2 + 17 * 2]; \
824 uint8_t * const halfH = ((uint8_t*)half) + 256; \
825 uint8_t * const halfHV = ((uint8_t*)half); \
826 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
828 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
830 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
834 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
837 uint64_t half[17 * 2]; \
838 uint8_t * const halfH = ((uint8_t*)half); \
839 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
841 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
843 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
847 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
850 uint64_t half[17 * 2]; \
851 uint8_t * const halfH = ((uint8_t*)half); \
852 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
854 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
856 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
860 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
863 uint64_t half[17 * 2]; \
864 uint8_t * const halfH = ((uint8_t*)half); \
865 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
867 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
871 QPEL_OP(put_, ff_pw_16, _, mmxext)
872 QPEL_OP(avg_, ff_pw_16, _, mmxext)
873 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
874 #endif /* HAVE_YASM */
/* RV40 (3,3) quarter-pel positions map directly onto the xy2 half-pel
 * averaging copies, so these are thin wrappers around the rnd_template
 * generated *_pixels*_xy2_mmx functions.
 * NOTE(review): opening/closing braces lost to extraction damage. */
878 void ff_put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
880 put_pixels8_xy2_mmx(dst, src, stride, 8);
882 void ff_put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
884 put_pixels16_xy2_mmx(dst, src, stride, 16);
886 void ff_avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
888 avg_pixels8_xy2_mmx(dst, src, stride, 8);
890 void ff_avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, ptrdiff_t stride)
892 avg_pixels16_xy2_mmx(dst, src, stride, 16);
/* MPEG-4 global motion compensation: affine warp with bilinear
 * interpolation, processed in 4-pixel columns. Falls back to the C
 * implementation (ff_gmc_c) for non-constant fullpel offsets, >16-bit
 * subpel vectors, or blocks reaching outside the picture.
 * NOTE(review): loop scaffolding, several asm statement heads and the
 * closing braces are missing from this chunk (extraction damage). */
895 static void gmc_mmx(uint8_t *dst, uint8_t *src,
896 int stride, int h, int ox, int oy,
897 int dxx, int dxy, int dyx, int dyy,
898 int shift, int r, int width, int height)
901 const int ix = ox >> (16 + shift);
902 const int iy = oy >> (16 + shift);
903 const int oxs = ox >> 4;
904 const int oys = oy >> 4;
905 const int dxxs = dxx >> 4;
906 const int dxys = dxy >> 4;
907 const int dyxs = dyx >> 4;
908 const int dyys = dyy >> 4;
909 const uint16_t r4[4] = { r, r, r, r };
910 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
911 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
912 const uint64_t shift2 = 2 * shift;
915 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
916 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
917 const int dxh = dxy * (h - 1);
918 const int dyw = dyx * (w - 1);
919 if ( // non-constant fullpel offset (3% of blocks)
920 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
921 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
922 // uses more than 16 bits of subpel mv (only at huge resolution)
923 || (dxx | dxy | dyx | dyy) & 15 ||
924 (unsigned)ix >= width - w ||
925 (unsigned)iy >= height - h) {
926 // FIXME could still use mmx for some of the rows
927 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
928 shift, r, width, height);
932 src += ix + iy * stride;
935 "movd %0, %%mm6 \n\t"
936 "pxor %%mm7, %%mm7 \n\t"
937 "punpcklwd %%mm6, %%mm6 \n\t"
938 "punpcklwd %%mm6, %%mm6 \n\t"
942 for (x = 0; x < w; x += 4) {
943 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
944 oxs - dxys + dxxs * (x + 1),
945 oxs - dxys + dxxs * (x + 2),
946 oxs - dxys + dxxs * (x + 3) };
947 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
948 oys - dyys + dyxs * (x + 1),
949 oys - dyys + dyxs * (x + 2),
950 oys - dyys + dyxs * (x + 3) };
952 for (y = 0; y < h; y++) {
954 "movq %0, %%mm4 \n\t"
955 "movq %1, %%mm5 \n\t"
956 "paddw %2, %%mm4 \n\t"
957 "paddw %3, %%mm5 \n\t"
958 "movq %%mm4, %0 \n\t"
959 "movq %%mm5, %1 \n\t"
960 "psrlw $12, %%mm4 \n\t"
961 "psrlw $12, %%mm5 \n\t"
962 : "+m"(*dx4), "+m"(*dy4)
963 : "m"(*dxy4), "m"(*dyy4)
967 "movq %%mm6, %%mm2 \n\t"
968 "movq %%mm6, %%mm1 \n\t"
969 "psubw %%mm4, %%mm2 \n\t"
970 "psubw %%mm5, %%mm1 \n\t"
971 "movq %%mm2, %%mm0 \n\t"
972 "movq %%mm4, %%mm3 \n\t"
973 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
974 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
975 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
976 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
978 "movd %4, %%mm5 \n\t"
979 "movd %3, %%mm4 \n\t"
980 "punpcklbw %%mm7, %%mm5 \n\t"
981 "punpcklbw %%mm7, %%mm4 \n\t"
982 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
983 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
985 "movd %2, %%mm5 \n\t"
986 "movd %1, %%mm4 \n\t"
987 "punpcklbw %%mm7, %%mm5 \n\t"
988 "punpcklbw %%mm7, %%mm4 \n\t"
989 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
990 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
991 "paddw %5, %%mm1 \n\t"
992 "paddw %%mm3, %%mm2 \n\t"
993 "paddw %%mm1, %%mm0 \n\t"
994 "paddw %%mm2, %%mm0 \n\t"
996 "psrlw %6, %%mm0 \n\t"
997 "packuswb %%mm0, %%mm0 \n\t"
998 "movd %%mm0, %0 \n\t"
1000 : "=m"(dst[x + y * stride])
1001 : "m"(src[0]), "m"(src[1]),
1002 "m"(src[stride]), "m"(src[stride + 1]),
1003 "m"(*r4), "m"(shift2)
1007 src += 4 - h * stride;
/* Clip len floats from src into [min, max] and store to dst, 16 floats
 * (64 bytes) per SSE iteration via maxps/minps; src and dst must be
 * 16-byte aligned. min/max are broadcast into xmm4/xmm5 with shufps.
 * NOTE(review): the loop label/branch and the tail of the constraint
 * list are missing from this chunk (extraction damage). */
1011 static void vector_clipf_sse(float *dst, const float *src,
1012 float min, float max, int len)
1014 x86_reg i = (len - 16) * 4;
1016 "movss %3, %%xmm4 \n\t"
1017 "movss %4, %%xmm5 \n\t"
1018 "shufps $0, %%xmm4, %%xmm4 \n\t"
1019 "shufps $0, %%xmm5, %%xmm5 \n\t"
1021 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
1022 "movaps 16(%2, %0), %%xmm1 \n\t"
1023 "movaps 32(%2, %0), %%xmm2 \n\t"
1024 "movaps 48(%2, %0), %%xmm3 \n\t"
1025 "maxps %%xmm4, %%xmm0 \n\t"
1026 "maxps %%xmm4, %%xmm1 \n\t"
1027 "maxps %%xmm4, %%xmm2 \n\t"
1028 "maxps %%xmm4, %%xmm3 \n\t"
1029 "minps %%xmm5, %%xmm0 \n\t"
1030 "minps %%xmm5, %%xmm1 \n\t"
1031 "minps %%xmm5, %%xmm2 \n\t"
1032 "minps %%xmm5, %%xmm3 \n\t"
1033 "movaps %%xmm0, (%1, %0) \n\t"
1034 "movaps %%xmm1, 16(%1, %0) \n\t"
1035 "movaps %%xmm2, 32(%1, %0) \n\t"
1036 "movaps %%xmm3, 48(%1, %0) \n\t"
1040 : "r"(dst), "r"(src), "m"(min), "m"(max)
1045 #endif /* HAVE_INLINE_ASM */
/* Prototypes for further external assembly routines wired up in the
 * dsputil_init_* functions below: H.263 loop filters, int16 scalar
 * products, windowing, 32-bit byte swapping, HuffYUV predictors, and
 * int32 clipping — one variant per CPU feature level.
 * NOTE(review): several prototypes lost intermediate lines (e.g. the
 * "int order" parameters) to extraction damage. */
1047 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
1048 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
1050 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1052 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1054 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1056 int order, int mul);
1057 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1059 int order, int mul);
1060 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1062 int order, int mul);
1064 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1065 const int16_t *window, unsigned int len);
1066 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1067 const int16_t *window, unsigned int len);
1068 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1069 const int16_t *window, unsigned int len);
1070 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1071 const int16_t *window, unsigned int len);
1072 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1073 const int16_t *window, unsigned int len);
1074 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1075 const int16_t *window, unsigned int len);
1077 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1078 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1080 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1081 const uint8_t *diff, int w,
1082 int *left, int *left_top);
1083 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1085 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1088 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1089 int32_t min, int32_t max, unsigned int len);
1090 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1091 int32_t min, int32_t max, unsigned int len);
1092 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1093 int32_t min, int32_t max, unsigned int len);
1094 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1095 int32_t min, int32_t max, unsigned int len);
/* Fill all 16 entries of one qpel function table slot in the DSPContext:
 * PFX = table prefix (put_qpel, avg_qpel, ...), IDX = 0 for 16x16 /
 * 1 for 8x8, SIZE = block size, CPU = function suffix, PREFIX = extra
 * name prefix (often empty). Entry order follows the mcXY layout. */
1097 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1099 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1100 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1101 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1102 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1103 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1104 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1105 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1106 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1107 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1108 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1109 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1110 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1111 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1112 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1113 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1114 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Install the plain-MMX function pointers into the DSPContext:
 * pixel clamp/store helpers, block clearing, edge drawing, the chosen
 * IDCT (simple-MMX or Xvid-MMX per avctx->idct_algo), byte adds, and
 * the H.263 loop filters / int32 clip when built with external asm.
 * NOTE(review): the #if guards, switch braces and break statements are
 * partially missing from this chunk (extraction damage). */
1117 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1121 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1123 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1124 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1125 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
1127 if (!high_bit_depth) {
1128 c->clear_block = clear_block_mmx;
1129 c->clear_blocks = clear_blocks_mmx;
1130 c->draw_edges = draw_edges_mmx;
1132 switch (avctx->idct_algo) {
1134 case FF_IDCT_SIMPLEMMX:
1135 c->idct_put = ff_simple_idct_put_mmx;
1136 c->idct_add = ff_simple_idct_add_mmx;
1137 c->idct = ff_simple_idct_mmx;
1138 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1140 case FF_IDCT_XVIDMMX:
1141 c->idct_put = ff_idct_xvid_mmx_put;
1142 c->idct_add = ff_idct_xvid_mmx_add;
1143 c->idct = ff_idct_xvid_mmx;
1150 c->add_bytes = add_bytes_mmx;
1151 #endif /* HAVE_MMX_INLINE */
1153 #if HAVE_MMX_EXTERNAL
1154 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1155 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1156 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1159 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
1160 #endif /* HAVE_MMX_EXTERNAL */
1163 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1166 #if HAVE_MMXEXT_INLINE
1167 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1169 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1170 c->idct_put = ff_idct_xvid_mmxext_put;
1171 c->idct_add = ff_idct_xvid_mmxext_add;
1172 c->idct = ff_idct_xvid_mmxext;
1174 #endif /* HAVE_MMXEXT_INLINE */
1176 #if HAVE_MMXEXT_EXTERNAL
1177 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1178 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1180 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1181 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1182 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1183 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1185 /* slower than cmov version on AMD */
1186 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1187 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1189 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1190 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
1192 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1193 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1195 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1197 #endif /* HAVE_MMXEXT_EXTERNAL */
1200 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1204 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1206 if (!high_bit_depth) {
1207 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1208 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1209 c->clear_block = clear_block_sse;
1210 c->clear_blocks = clear_blocks_sse;
1214 c->vector_clipf = vector_clipf_sse;
1215 #endif /* HAVE_SSE_INLINE */
1218 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1221 #if HAVE_SSE2_INLINE
1222 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1224 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1225 c->idct_put = ff_idct_xvid_sse2_put;
1226 c->idct_add = ff_idct_xvid_sse2_add;
1227 c->idct = ff_idct_xvid_sse2;
1228 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1230 #endif /* HAVE_SSE2_INLINE */
1232 #if HAVE_SSE2_EXTERNAL
1233 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1234 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
1235 if (mm_flags & AV_CPU_FLAG_ATOM) {
1236 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1238 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
1240 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1241 c->apply_window_int16 = ff_apply_window_int16_sse2;
1242 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1243 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1245 c->bswap_buf = ff_bswap32_buf_sse2;
1246 #endif /* HAVE_SSE2_EXTERNAL */
1249 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1252 #if HAVE_SSSE3_EXTERNAL
1253 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1254 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1255 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1257 if (mm_flags & AV_CPU_FLAG_ATOM)
1258 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1260 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1261 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1262 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1263 c->bswap_buf = ff_bswap32_buf_ssse3;
1264 #endif /* HAVE_SSSE3_EXTERNAL */
1267 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1270 #if HAVE_SSE4_EXTERNAL
1271 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1272 #endif /* HAVE_SSE4_EXTERNAL */
1275 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1277 int mm_flags = av_get_cpu_flags();
1279 #if HAVE_7REGS && HAVE_INLINE_ASM
1280 if (mm_flags & AV_CPU_FLAG_CMOV)
1281 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1284 if (mm_flags & AV_CPU_FLAG_MMX)
1285 dsputil_init_mmx(c, avctx, mm_flags);
1287 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1288 dsputil_init_mmxext(c, avctx, mm_flags);
1290 if (mm_flags & AV_CPU_FLAG_SSE)
1291 dsputil_init_sse(c, avctx, mm_flags);
1293 if (mm_flags & AV_CPU_FLAG_SSE2)
1294 dsputil_init_sse2(c, avctx, mm_flags);
1296 if (mm_flags & AV_CPU_FLAG_SSSE3)
1297 dsputil_init_ssse3(c, avctx, mm_flags);
1299 if (mm_flags & AV_CPU_FLAG_SSE4)
1300 dsputil_init_sse4(c, avctx, mm_flags);
1302 if (CONFIG_ENCODERS)
1303 ff_dsputilenc_init_mmx(c, avctx);