2 * MMX optimized DSP utils
3 * Copyright (c) 2000, 2001 Fabrice Bellard
4 * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
6 * MMX optimization by Nick Kurshev <nickols_k@mail.ru>
8 * This file is part of Libav.
10 * Libav is free software; you can redistribute it and/or
11 * modify it under the terms of the GNU Lesser General Public
12 * License as published by the Free Software Foundation; either
13 * version 2.1 of the License, or (at your option) any later version.
15 * Libav is distributed in the hope that it will be useful,
16 * but WITHOUT ANY WARRANTY; without even the implied warranty of
17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
18 * Lesser General Public License for more details.
20 * You should have received a copy of the GNU Lesser General Public
21 * License along with Libav; if not, write to the Free Software
22 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
25 #include "libavutil/attributes.h"
26 #include "libavutil/cpu.h"
27 #include "libavutil/x86/asm.h"
28 #include "libavcodec/dsputil.h"
29 #include "libavcodec/h264dsp.h"
30 #include "libavcodec/mpegvideo.h"
31 #include "libavcodec/simple_idct.h"
32 #include "dsputil_mmx.h"
33 #include "idct_xvid.h"
/* Prototypes for external (assembly) MMXEXT primitives:
 *  - *_pixels{8,16}_l2_*: store/average the pairwise average of two source
 *    rows into dst (the "l2" variants take two sources with separate strides);
 *  - *_mpeg4_qpel{8,16}_{h,v}_lowpass_*: MPEG-4 quarter-pel 6-tap lowpass
 *    filters, horizontal and vertical, in put/avg and rounding/no-rounding
 *    flavours.  The implementations live in yasm files elsewhere. */
38 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
39 int dstStride, int src1Stride, int h);
40 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
41 uint8_t *src2, int dstStride,
42 int src1Stride, int h);
43 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
44 int dstStride, int src1Stride, int h);
45 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
46 int dstStride, int src1Stride, int h);
47 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
48 int dstStride, int src1Stride, int h);
49 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
50 int dstStride, int src1Stride, int h);
51 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
52 int dstStride, int srcStride, int h);
53 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
54 int dstStride, int srcStride, int h);
55 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
56 int dstStride, int srcStride,
58 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
59 int dstStride, int srcStride, int h);
60 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
61 int dstStride, int srcStride, int h);
62 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
63 int dstStride, int srcStride,
65 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
66 int dstStride, int srcStride);
67 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
68 int dstStride, int srcStride);
69 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
70 int dstStride, int srcStride);
71 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
72 int dstStride, int srcStride);
73 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
74 int dstStride, int srcStride);
75 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
76 int dstStride, int srcStride);
/* Plain pixel copies already do no rounding, so the no_rnd copy slots can
 * simply alias them. */
77 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
78 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
83 /***********************************/
/**
 * Store a block of 16-bit coefficients as 8-bit pixels, clamping each value
 * to 0..255 via packuswb (unsigned saturation).  Each asm statement below
 * converts four rows (4 * 8 int16 coefficients) in one go; the address in
 * %3 points at the current 64-byte chunk of the coefficient block.
 * NOTE(review): this extraction is missing interior lines (asm begin/end,
 * the loop structure and part of the constraint lists) -- verify against
 * the original file before relying on the exact control flow.
 */
86 void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
97 "movq (%3), %%mm0 \n\t"
98 "movq 8(%3), %%mm1 \n\t"
99 "movq 16(%3), %%mm2 \n\t"
100 "movq 24(%3), %%mm3 \n\t"
101 "movq 32(%3), %%mm4 \n\t"
102 "movq 40(%3), %%mm5 \n\t"
103 "movq 48(%3), %%mm6 \n\t"
104 "movq 56(%3), %%mm7 \n\t"
/* Pack each pair of coefficient rows down to bytes with unsigned
 * saturation, then store one 8-pixel row per movq. */
105 "packuswb %%mm1, %%mm0 \n\t"
106 "packuswb %%mm3, %%mm2 \n\t"
107 "packuswb %%mm5, %%mm4 \n\t"
108 "packuswb %%mm7, %%mm6 \n\t"
109 "movq %%mm0, (%0) \n\t"
110 "movq %%mm2, (%0, %1) \n\t"
111 "movq %%mm4, (%0, %1, 2) \n\t"
112 "movq %%mm6, (%0, %2) \n\t"
113 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
116 pix += line_size * 4;
119 // if here would be an exact copy of the code above
120 // compiler would generate some very strange code
123 "movq (%3), %%mm0 \n\t"
124 "movq 8(%3), %%mm1 \n\t"
125 "movq 16(%3), %%mm2 \n\t"
126 "movq 24(%3), %%mm3 \n\t"
127 "movq 32(%3), %%mm4 \n\t"
128 "movq 40(%3), %%mm5 \n\t"
129 "movq 48(%3), %%mm6 \n\t"
130 "movq 56(%3), %%mm7 \n\t"
131 "packuswb %%mm1, %%mm0 \n\t"
132 "packuswb %%mm3, %%mm2 \n\t"
133 "packuswb %%mm5, %%mm4 \n\t"
134 "packuswb %%mm7, %%mm6 \n\t"
135 "movq %%mm0, (%0) \n\t"
136 "movq %%mm2, (%0, %1) \n\t"
137 "movq %%mm4, (%0, %1, 2) \n\t"
138 "movq %%mm6, (%0, %2) \n\t"
139 :: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
/*
 * Helper for ff_put_signed_pixels_clamped_mmx(): packs 64 bytes of signed
 * coefficients starting at byte offset "off" with signed saturation
 * (packsswb), re-biases them by +128 (ff_pb_80 preloaded in %%mm0) and
 * stores four 8-byte rows.  Operands: %0 = dst pixels, %2 = coefficient
 * block, %3 = line skip, %1 = 3 * line skip.
 */
143 #define put_signed_pixels_clamped_mmx_half(off) \
144 "movq "#off"(%2), %%mm1 \n\t" \
145 "movq 16 + "#off"(%2), %%mm2 \n\t" \
146 "movq 32 + "#off"(%2), %%mm3 \n\t" \
147 "movq 48 + "#off"(%2), %%mm4 \n\t" \
148 "packsswb 8 + "#off"(%2), %%mm1 \n\t" \
149 "packsswb 24 + "#off"(%2), %%mm2 \n\t" \
150 "packsswb 40 + "#off"(%2), %%mm3 \n\t" \
151 "packsswb 56 + "#off"(%2), %%mm4 \n\t" \
152 "paddb %%mm0, %%mm1 \n\t" \
153 "paddb %%mm0, %%mm2 \n\t" \
154 "paddb %%mm0, %%mm3 \n\t" \
155 "paddb %%mm0, %%mm4 \n\t" \
156 "movq %%mm1, (%0) \n\t" \
157 "movq %%mm2, (%0, %3) \n\t" \
158 "movq %%mm3, (%0, %3, 2) \n\t" \
159 "movq %%mm4, (%0, %1) \n\t"
/**
 * Like ff_put_pixels_clamped_mmx() but for signed coefficients: values are
 * clamped to -128..127 and re-biased to 0..255 on store (the +128 bias
 * constant ff_pb_80 is loaded into %%mm0 first).  Processes the upper and
 * lower half of the 64-coefficient block via the macro above.
 * NOTE(review): the declaration of line_skip3 and the asm begin/end lines
 * are missing from this extraction.
 */
161 void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
164 x86_reg line_skip = line_size;
168 "movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
169 "lea (%3, %3, 2), %1 \n\t"
170 put_signed_pixels_clamped_mmx_half(0)
/* advance dst by four lines, then emit the second half of the block */
171 "lea (%0, %3, 4), %0 \n\t"
172 put_signed_pixels_clamped_mmx_half(64)
173 : "+&r"(pixels), "=&r"(line_skip3)
174 : "r"(block), "r"(line_skip)
/**
 * Add a block of 16-bit coefficients to existing 8-bit pixels with unsigned
 * saturation on the result.  Two pixel rows (16 pixels) are processed per
 * asm statement: the pixels are widened to 16 bits with punpck{l,h}bw
 * against %%mm7 (presumably zeroed in a setup line missing from this
 * extraction -- verify), summed with paddsw and re-packed with packuswb.
 */
178 void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
185 /* read the pixels */
192 "movq (%2), %%mm0 \n\t"
193 "movq 8(%2), %%mm1 \n\t"
194 "movq 16(%2), %%mm2 \n\t"
195 "movq 24(%2), %%mm3 \n\t"
196 "movq %0, %%mm4 \n\t"
197 "movq %1, %%mm6 \n\t"
/* widen row 0 pixels to words and add the coefficients */
198 "movq %%mm4, %%mm5 \n\t"
199 "punpcklbw %%mm7, %%mm4 \n\t"
200 "punpckhbw %%mm7, %%mm5 \n\t"
201 "paddsw %%mm4, %%mm0 \n\t"
202 "paddsw %%mm5, %%mm1 \n\t"
/* same for row 1 */
203 "movq %%mm6, %%mm5 \n\t"
204 "punpcklbw %%mm7, %%mm6 \n\t"
205 "punpckhbw %%mm7, %%mm5 \n\t"
206 "paddsw %%mm6, %%mm2 \n\t"
207 "paddsw %%mm5, %%mm3 \n\t"
/* pack back to bytes with unsigned saturation and store both rows */
208 "packuswb %%mm1, %%mm0 \n\t"
209 "packuswb %%mm3, %%mm2 \n\t"
210 "movq %%mm0, %0 \n\t"
211 "movq %%mm2, %1 \n\t"
212 : "+m"(*pix), "+m"(*(pix + line_size))
215 pix += line_size * 2;
/*
 * Generates a function that zeroes n consecutive 128-byte (64 x int16)
 * DCT blocks, 32 bytes per loop iteration via four MMX stores.  The loop
 * counter starts at -128*n and counts up to 0 so a single register serves
 * as both index and termination test.
 * NOTE(review): the loop label and branch lines are missing from this
 * extraction.
 */
220 #define CLEAR_BLOCKS(name, n) \
221 static void name(int16_t *blocks) \
224 "pxor %%mm7, %%mm7 \n\t" \
225 "mov %1, %%"REG_a" \n\t" \
227 "movq %%mm7, (%0, %%"REG_a") \n\t" \
228 "movq %%mm7, 8(%0, %%"REG_a") \n\t" \
229 "movq %%mm7, 16(%0, %%"REG_a") \n\t" \
230 "movq %%mm7, 24(%0, %%"REG_a") \n\t" \
231 "add $32, %%"REG_a" \n\t" \
233 :: "r"(((uint8_t *)blocks) + 128 * n), \
/* clear_blocks_mmx clears the usual six blocks, clear_block_mmx one. */
238 CLEAR_BLOCKS(clear_blocks_mmx, 6)
239 CLEAR_BLOCKS(clear_block_mmx, 1)
/* Zero one 64-coefficient (128-byte) DCT block with eight 16-byte SSE
 * stores.  movaps requires the block to be 16-byte aligned. */
241 static void clear_block_sse(int16_t *block)
244 "xorps %%xmm0, %%xmm0 \n"
245 "movaps %%xmm0, (%0) \n"
246 "movaps %%xmm0, 16(%0) \n"
247 "movaps %%xmm0, 32(%0) \n"
248 "movaps %%xmm0, 48(%0) \n"
249 "movaps %%xmm0, 64(%0) \n"
250 "movaps %%xmm0, 80(%0) \n"
251 "movaps %%xmm0, 96(%0) \n"
252 "movaps %%xmm0, 112(%0) \n"
/* Zero six consecutive 128-byte DCT blocks, one whole block (128 bytes)
 * per loop iteration.  Same negative-index loop idiom as CLEAR_BLOCKS;
 * blocks must be 16-byte aligned (movaps).
 * NOTE(review): the loop label and branch lines are missing from this
 * extraction. */
258 static void clear_blocks_sse(int16_t *blocks)
261 "xorps %%xmm0, %%xmm0 \n"
262 "mov %1, %%"REG_a" \n"
264 "movaps %%xmm0, (%0, %%"REG_a") \n"
265 "movaps %%xmm0, 16(%0, %%"REG_a") \n"
266 "movaps %%xmm0, 32(%0, %%"REG_a") \n"
267 "movaps %%xmm0, 48(%0, %%"REG_a") \n"
268 "movaps %%xmm0, 64(%0, %%"REG_a") \n"
269 "movaps %%xmm0, 80(%0, %%"REG_a") \n"
270 "movaps %%xmm0, 96(%0, %%"REG_a") \n"
271 "movaps %%xmm0, 112(%0, %%"REG_a") \n"
272 "add $128, %%"REG_a" \n"
274 :: "r"(((uint8_t *)blocks) + 128 * 6),
/*
 * dst[i] += src[i] for w bytes (wrap-around byte add, paddb): the MMX loop
 * handles 16 bytes per iteration, a scalar loop finishes the remainder.
 * NOTE(review): the asm begin, loop label/branch and the scalar loop
 * header are missing from this extraction.
 */
280 static void add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
286 "movq (%1, %0), %%mm0 \n\t"
287 "movq (%2, %0), %%mm1 \n\t"
288 "paddb %%mm0, %%mm1 \n\t"
289 "movq %%mm1, (%2, %0) \n\t"
290 "movq 8(%1, %0), %%mm0 \n\t"
291 "movq 8(%2, %0), %%mm1 \n\t"
292 "paddb %%mm0, %%mm1 \n\t"
293 "movq %%mm1, 8(%2, %0) \n\t"
299 : "r"(src), "r"(dst), "r"((x86_reg)w - 15)
/* scalar tail for the last w % 16 bytes */
302 dst[i + 0] += src[i + 0];
/*
 * HuffYUV median prediction (add variant) implemented with cmov; faster
 * than the MMXEXT version on CPUs with fast cmov.  *left and *left_top
 * carry the running left / top-left predictor values across calls (only
 * the low byte is meaningful, hence the & 0xff).
 * NOTE(review): most of the cmov-based median computation is missing from
 * this extraction; only the final add/store and the constraints remain.
 */
306 static void add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
307 const uint8_t *diff, int w,
308 int *left, int *left_top)
312 int l = *left & 0xff;
313 int tl = *left_top & 0xff;
318 "movzbl (%3, %4), %2 \n"
331 "add (%6, %4), %b0 \n"
332 "mov %b0, (%5, %4) \n"
335 : "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
336 : "r"(dst + w), "r"(diff + w), "rm"(top + w)
343 /* Draw the edges of width 'w' of an image of size width, height
344 * this MMX version can only handle w == 8 || w == 16.
*
* Structure: first the left/right borders are replicated per row (the
* leftmost/rightmost pixel is broadcast to bytes with the punpck chain),
* then, if requested via 'sides', whole rows are copied outward for the
* top (EDGE_TOP) and bottom (EDGE_BOTTOM) borders, four rows per
* iteration.  NOTE(review): asm begin/end, loop labels and parts of the
* constraint lists are missing from this extraction. */
345 static void draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
346 int w, int h, int sides)
348 uint8_t *ptr, *last_line;
351 last_line = buf + (height - 1) * wrap;
357 "movd (%0), %%mm0 \n\t"
358 "punpcklbw %%mm0, %%mm0 \n\t"
359 "punpcklwd %%mm0, %%mm0 \n\t"
360 "punpckldq %%mm0, %%mm0 \n\t"
361 "movq %%mm0, -8(%0) \n\t"
362 "movq -8(%0, %2), %%mm1 \n\t"
363 "punpckhbw %%mm1, %%mm1 \n\t"
364 "punpckhwd %%mm1, %%mm1 \n\t"
365 "punpckhdq %%mm1, %%mm1 \n\t"
366 "movq %%mm1, (%0, %2) \n\t"
371 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
376 "movd (%0), %%mm0 \n\t"
377 "punpcklbw %%mm0, %%mm0 \n\t"
378 "punpcklwd %%mm0, %%mm0 \n\t"
379 "punpckldq %%mm0, %%mm0 \n\t"
380 "movq %%mm0, -8(%0) \n\t"
381 "movq %%mm0, -16(%0) \n\t"
382 "movq -8(%0, %2), %%mm1 \n\t"
383 "punpckhbw %%mm1, %%mm1 \n\t"
384 "punpckhwd %%mm1, %%mm1 \n\t"
385 "punpckhdq %%mm1, %%mm1 \n\t"
386 "movq %%mm1, (%0, %2) \n\t"
387 "movq %%mm1, 8(%0, %2) \n\t"
392 : "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
396 /* top and bottom (and hopefully also the corners) */
397 if (sides & EDGE_TOP) {
398 for (i = 0; i < h; i += 4) {
399 ptr = buf - (i + 1) * wrap - w;
402 "movq (%1, %0), %%mm0 \n\t"
403 "movq %%mm0, (%0) \n\t"
404 "movq %%mm0, (%0, %2) \n\t"
405 "movq %%mm0, (%0, %2, 2) \n\t"
406 "movq %%mm0, (%0, %3) \n\t"
411 : "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
412 "r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
417 if (sides & EDGE_BOTTOM) {
418 for (i = 0; i < h; i += 4) {
419 ptr = last_line + (i + 1) * wrap - w;
422 "movq (%1, %0), %%mm0 \n\t"
423 "movq %%mm0, (%0) \n\t"
424 "movq %%mm0, (%0, %2) \n\t"
425 "movq %%mm0, (%0, %2, 2) \n\t"
426 "movq %%mm0, (%0, %3) \n\t"
431 : "r"((x86_reg)last_line - (x86_reg)ptr - w),
432 "r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
433 "r"(ptr + width + 2 * w)
438 #endif /* HAVE_INLINE_ASM */
/**
 * Average 16 pixels per row over h rows into block, implemented as two
 * independent 8-pixel MMXEXT averages (left and right half of each row).
 *
 * The stride is ptrdiff_t for consistency with ff_put_pixels16_mmxext()
 * below (the original declared it as plain int, which is also unsafe for
 * pointer arithmetic on 64-bit targets); callers passing int strides are
 * unaffected by the widening.
 */
static void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    ff_avg_pixels8_mmxext(block,     pixels,     line_size, h);
    ff_avg_pixels8_mmxext(block + 8, pixels + 8, line_size, h);
}
/**
 * Copy 16 pixels per row over h rows into block.  A 16-wide copy is
 * simply two 8-wide MMXEXT copies over the disjoint column halves.
 */
static void ff_put_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    uint8_t       *dst_hi = block  + 8;
    const uint8_t *src_hi = pixels + 8;

    ff_put_pixels8_mmxext(block, pixels, line_size, h);
    ff_put_pixels8_mmxext(dst_hi, src_hi, line_size, h);
}
/*
 * Generate the full set of MPEG-4 quarter-pel motion-compensation
 * functions (8x8 and 16x16, all 16 _mcXY_ subpel positions) for one
 * operation (OPNAME = put_/avg_/put_no_rnd_) and CPU flavour.
 *
 * Naming: _mcXY_ is the subpel position, X horizontal and Y vertical in
 * quarter-pel units.  Positions needing both filters first compute an
 * intermediate halfH (horizontally filtered) and/or halfHV (H then V
 * filtered) plane on the stack, then blend with pixels*_l2.
 *
 * NOTE(review): several continuation lines of this macro (ROUNDER uses,
 * local array declarations, closing braces) are missing from this
 * extraction; the visible lines are kept verbatim.
 */
456 #define QPEL_OP(OPNAME, ROUNDER, RND, MMX) \
457 static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
460 ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
463 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
467 uint8_t * const half = (uint8_t*)temp; \
468 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
470 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
471 stride, stride, 8); \
474 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
477 ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
481 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
485 uint8_t * const half = (uint8_t*)temp; \
486 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
488 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
492 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
496 uint8_t * const half = (uint8_t*)temp; \
497 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
499 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
500 stride, stride, 8); \
503 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
506 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
510 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
514 uint8_t * const half = (uint8_t*)temp; \
515 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
517 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
521 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
524 uint64_t half[8 + 9]; \
525 uint8_t * const halfH = ((uint8_t*)half) + 64; \
526 uint8_t * const halfHV = ((uint8_t*)half); \
527 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
529 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
531 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
532 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
536 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
539 uint64_t half[8 + 9]; \
540 uint8_t * const halfH = ((uint8_t*)half) + 64; \
541 uint8_t * const halfHV = ((uint8_t*)half); \
542 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
544 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
546 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
547 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
551 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
554 uint64_t half[8 + 9]; \
555 uint8_t * const halfH = ((uint8_t*)half) + 64; \
556 uint8_t * const halfHV = ((uint8_t*)half); \
557 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
559 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
561 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
562 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
566 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
569 uint64_t half[8 + 9]; \
570 uint8_t * const halfH = ((uint8_t*)half) + 64; \
571 uint8_t * const halfHV = ((uint8_t*)half); \
572 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
574 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
576 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
577 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
581 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
584 uint64_t half[8 + 9]; \
585 uint8_t * const halfH = ((uint8_t*)half) + 64; \
586 uint8_t * const halfHV = ((uint8_t*)half); \
587 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
589 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
590 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
594 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
597 uint64_t half[8 + 9]; \
598 uint8_t * const halfH = ((uint8_t*)half) + 64; \
599 uint8_t * const halfHV = ((uint8_t*)half); \
600 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
602 ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
603 ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
607 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
610 uint64_t half[8 + 9]; \
611 uint8_t * const halfH = ((uint8_t*)half); \
612 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
614 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
616 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
620 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
623 uint64_t half[8 + 9]; \
624 uint8_t * const halfH = ((uint8_t*)half); \
625 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
627 ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
629 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
633 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
637 uint8_t * const halfH = ((uint8_t*)half); \
638 ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
640 ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
644 static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
647 ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
650 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
654 uint8_t * const half = (uint8_t*)temp; \
655 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
657 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
661 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
664 ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
665 stride, stride, 16);\
668 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
672 uint8_t * const half = (uint8_t*)temp; \
673 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
675 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
676 stride, stride, 16); \
679 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
683 uint8_t * const half = (uint8_t*)temp; \
684 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
686 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
690 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
693 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
697 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
701 uint8_t * const half = (uint8_t*)temp; \
702 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
704 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
705 stride, stride, 16); \
708 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
711 uint64_t half[16 * 2 + 17 * 2]; \
712 uint8_t * const halfH = ((uint8_t*)half) + 256; \
713 uint8_t * const halfHV = ((uint8_t*)half); \
714 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
716 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
718 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
720 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
724 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
727 uint64_t half[16 * 2 + 17 * 2]; \
728 uint8_t * const halfH = ((uint8_t*)half) + 256; \
729 uint8_t * const halfHV = ((uint8_t*)half); \
730 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
732 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
734 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
736 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
740 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
743 uint64_t half[16 * 2 + 17 * 2]; \
744 uint8_t * const halfH = ((uint8_t*)half) + 256; \
745 uint8_t * const halfHV = ((uint8_t*)half); \
746 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
748 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
750 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
752 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
756 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
759 uint64_t half[16 * 2 + 17 * 2]; \
760 uint8_t * const halfH = ((uint8_t*)half) + 256; \
761 uint8_t * const halfHV = ((uint8_t*)half); \
762 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
764 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
766 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
768 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
772 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
775 uint64_t half[16 * 2 + 17 * 2]; \
776 uint8_t * const halfH = ((uint8_t*)half) + 256; \
777 uint8_t * const halfHV = ((uint8_t*)half); \
778 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
780 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
782 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
786 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
789 uint64_t half[16 * 2 + 17 * 2]; \
790 uint8_t * const halfH = ((uint8_t*)half) + 256; \
791 uint8_t * const halfHV = ((uint8_t*)half); \
792 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
794 ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
796 ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
800 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
803 uint64_t half[17 * 2]; \
804 uint8_t * const halfH = ((uint8_t*)half); \
805 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
807 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
809 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
813 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
816 uint64_t half[17 * 2]; \
817 uint8_t * const halfH = ((uint8_t*)half); \
818 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
820 ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
822 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
826 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
829 uint64_t half[17 * 2]; \
830 uint8_t * const halfH = ((uint8_t*)half); \
831 ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
833 ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
/* Instantiate the three MMXEXT flavours: rounding put, rounding avg,
 * and the no-rounding put used by MPEG-4 no_rnd prediction. */
837 QPEL_OP(put_, ff_pw_16, _, mmxext)
838 QPEL_OP(avg_, ff_pw_16, _, mmxext)
839 QPEL_OP(put_no_rnd_, ff_pw_15, _no_rnd_, mmxext)
840 #endif /* HAVE_YASM */
/*
 * Global motion compensation with bilinear interpolation (MMX).
 * (ox, oy) is the 16.16+shift fixed-point start offset; (dxx, dxy) and
 * (dyx, dyy) are the per-pixel horizontal/vertical increments.  The fast
 * path requires a constant fullpel offset over the block, at most 16 bits
 * of subpel precision, and the block fully inside the picture; anything
 * else falls back to the C version ff_gmc_c().
 * NOTE(review): asm begin/end lines, some declarations (w, x, y) and the
 * per-column loop tails are missing from this extraction.
 */
844 static void gmc_mmx(uint8_t *dst, uint8_t *src,
845 int stride, int h, int ox, int oy,
846 int dxx, int dxy, int dyx, int dyy,
847 int shift, int r, int width, int height)
850 const int ix = ox >> (16 + shift);
851 const int iy = oy >> (16 + shift);
/* drop to 12-bit subpel precision for the 16-bit MMX arithmetic */
852 const int oxs = ox >> 4;
853 const int oys = oy >> 4;
854 const int dxxs = dxx >> 4;
855 const int dxys = dxy >> 4;
856 const int dyxs = dyx >> 4;
857 const int dyys = dyy >> 4;
/* broadcast vectors: rounding constant and the per-row increments */
858 const uint16_t r4[4] = { r, r, r, r };
859 const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
860 const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
861 const uint64_t shift2 = 2 * shift;
864 const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
865 const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
866 const int dxh = dxy * (h - 1);
867 const int dyw = dyx * (w - 1);
868 if ( // non-constant fullpel offset (3% of blocks)
869 ((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
870 (oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
871 // uses more than 16 bits of subpel mv (only at huge resolution)
872 || (dxx | dxy | dyx | dyy) & 15 ||
873 (unsigned)ix >= width - w ||
874 (unsigned)iy >= height - h) {
875 // FIXME could still use mmx for some of the rows
876 ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
877 shift, r, width, height);
881 src += ix + iy * stride;
/* %%mm6 = broadcast interpolation scale, %%mm7 = zero for unpacking */
884 "movd %0, %%mm6 \n\t"
885 "pxor %%mm7, %%mm7 \n\t"
886 "punpcklwd %%mm6, %%mm6 \n\t"
887 "punpcklwd %%mm6, %%mm6 \n\t"
891 for (x = 0; x < w; x += 4) {
/* subpel x/y positions of the four pixels in this column group */
892 uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
893 oxs - dxys + dxxs * (x + 1),
894 oxs - dxys + dxxs * (x + 2),
895 oxs - dxys + dxxs * (x + 3) };
896 uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
897 oys - dyys + dyxs * (x + 1),
898 oys - dyys + dyxs * (x + 2),
899 oys - dyys + dyxs * (x + 3) };
901 for (y = 0; y < h; y++) {
/* advance the fractional positions one row and extract the 4-bit
 * interpolation weights (dx in %%mm4, dy in %%mm5) */
903 "movq %0, %%mm4 \n\t"
904 "movq %1, %%mm5 \n\t"
905 "paddw %2, %%mm4 \n\t"
906 "paddw %3, %%mm5 \n\t"
907 "movq %%mm4, %0 \n\t"
908 "movq %%mm5, %1 \n\t"
909 "psrlw $12, %%mm4 \n\t"
910 "psrlw $12, %%mm5 \n\t"
911 : "+m"(*dx4), "+m"(*dy4)
912 : "m"(*dxy4), "m"(*dyy4)
/* bilinear weights: (s-dx)(s-dy), dx*dy, (s-dx)*dy, dx*(s-dy) */
916 "movq %%mm6, %%mm2 \n\t"
917 "movq %%mm6, %%mm1 \n\t"
918 "psubw %%mm4, %%mm2 \n\t"
919 "psubw %%mm5, %%mm1 \n\t"
920 "movq %%mm2, %%mm0 \n\t"
921 "movq %%mm4, %%mm3 \n\t"
922 "pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
923 "pmullw %%mm5, %%mm3 \n\t" // dx * dy
924 "pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
925 "pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
927 "movd %4, %%mm5 \n\t"
928 "movd %3, %%mm4 \n\t"
929 "punpcklbw %%mm7, %%mm5 \n\t"
930 "punpcklbw %%mm7, %%mm4 \n\t"
931 "pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
932 "pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
934 "movd %2, %%mm5 \n\t"
935 "movd %1, %%mm4 \n\t"
936 "punpcklbw %%mm7, %%mm5 \n\t"
937 "punpcklbw %%mm7, %%mm4 \n\t"
938 "pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
939 "pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
940 "paddw %5, %%mm1 \n\t"
941 "paddw %%mm3, %%mm2 \n\t"
942 "paddw %%mm1, %%mm0 \n\t"
943 "paddw %%mm2, %%mm0 \n\t"
/* round (r4 added above), rescale and store four pixels */
945 "psrlw %6, %%mm0 \n\t"
946 "packuswb %%mm0, %%mm0 \n\t"
947 "movd %%mm0, %0 \n\t"
949 : "=m"(dst[x + y * stride])
950 : "m"(src[0]), "m"(src[1]),
951 "m"(src[stride]), "m"(src[stride + 1]),
952 "m"(*r4), "m"(shift2)
956 src += 4 - h * stride;
/*
 * Clamp len floats of src into [min, max] and store to dst, 16 floats per
 * iteration: four aligned 16-byte loads, maxps against the broadcast min,
 * minps against the broadcast max.  The index starts at (len - 16) * 4
 * bytes, so the array is apparently walked back-to-front (the loop label
 * and decrement/branch lines are missing from this extraction -- verify).
 * movaps implies src/dst are 16-byte aligned and len a multiple of 16 --
 * presumably guaranteed by the callers; confirm against the C fallback.
 */
960 static void vector_clipf_sse(float *dst, const float *src,
961 float min, float max, int len)
963 x86_reg i = (len - 16) * 4;
964 "movss %3, %%xmm4 \n\t"
966 "movss %4, %%xmm5 \n\t"
967 "shufps $0, %%xmm4, %%xmm4 \n\t"
968 "shufps $0, %%xmm5, %%xmm5 \n\t"
970 "movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
971 "movaps 16(%2, %0), %%xmm1 \n\t"
972 "movaps 32(%2, %0), %%xmm2 \n\t"
973 "movaps 48(%2, %0), %%xmm3 \n\t"
974 "maxps %%xmm4, %%xmm0 \n\t"
975 "maxps %%xmm4, %%xmm1 \n\t"
976 "maxps %%xmm4, %%xmm2 \n\t"
977 "maxps %%xmm4, %%xmm3 \n\t"
978 "minps %%xmm5, %%xmm0 \n\t"
979 "minps %%xmm5, %%xmm1 \n\t"
980 "minps %%xmm5, %%xmm2 \n\t"
981 "minps %%xmm5, %%xmm3 \n\t"
982 "movaps %%xmm0, (%1, %0) \n\t"
983 "movaps %%xmm1, 16(%1, %0) \n\t"
984 "movaps %%xmm2, 32(%1, %0) \n\t"
985 "movaps %%xmm3, 48(%1, %0) \n\t"
989 : "r"(dst), "r"(src), "m"(min), "m"(max)
994 #endif /* HAVE_INLINE_ASM */
/* Prototypes for the remaining external (assembly) routines wired up by
 * the init functions below: H.263 loop filters, int16 scalar products,
 * windowing, byte-swap, HuffYUV predictors and int32 clipping. */
996 void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
997 void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
999 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
1001 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
1003 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
1005 int order, int mul);
1006 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
1008 int order, int mul);
1009 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
1011 int order, int mul);
1013 void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
1014 const int16_t *window, unsigned int len);
1015 void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
1016 const int16_t *window, unsigned int len);
1017 void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
1018 const int16_t *window, unsigned int len);
1019 void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
1020 const int16_t *window, unsigned int len);
1021 void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
1022 const int16_t *window, unsigned int len);
1023 void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
1024 const int16_t *window, unsigned int len);
1026 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
1027 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
1029 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
1030 const uint8_t *diff, int w,
1031 int *left, int *left_top);
1032 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
1034 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
1037 void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
1038 int32_t min, int32_t max, unsigned int len);
1039 void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
1040 int32_t min, int32_t max, unsigned int len);
1041 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
1042 int32_t min, int32_t max, unsigned int len);
1043 void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
1044 int32_t min, int32_t max, unsigned int len);
/*
 * Fill all 16 quarter-pel slots of c->PFX_pixels_tab[IDX] with the
 * CPU-specific _mcXY_ functions generated by QPEL_OP.  The table index
 * encodes the subpel position as (y << 2) | x, hence slot 1 = mc10,
 * slot 4 = mc01, etc.
 */
1046 #define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
1048 c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
1049 c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
1050 c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
1051 c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
1052 c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
1053 c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
1054 c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
1055 c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
1056 c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
1057 c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
1058 c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
1059 c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
1060 c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
1061 c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
1062 c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
1063 c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
/* Wire up the plain-MMX DSP function pointers.  The inline-asm parts are
 * presumably guarded by HAVE_MMX_INLINE (the matching #if line is not
 * visible in this extraction, only the #endif). */
1066 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
1070 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1072 c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
1073 c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
1074 c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
/* the clear/draw helpers only exist for 8-bit content */
1076 if (!high_bit_depth) {
1077 c->clear_block = clear_block_mmx;
1078 c->clear_blocks = clear_blocks_mmx;
1079 c->draw_edges = draw_edges_mmx;
/* NOTE(review): the break statements between the cases below are missing
 * from this extraction -- verify against the original switch. */
1081 switch (avctx->idct_algo) {
1083 case FF_IDCT_SIMPLEMMX:
1084 c->idct_put = ff_simple_idct_put_mmx;
1085 c->idct_add = ff_simple_idct_add_mmx;
1086 c->idct = ff_simple_idct_mmx;
1087 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
1089 case FF_IDCT_XVIDMMX:
1090 c->idct_put = ff_idct_xvid_mmx_put;
1091 c->idct_add = ff_idct_xvid_mmx_add;
1092 c->idct = ff_idct_xvid_mmx;
1099 c->add_bytes = add_bytes_mmx;
1100 #endif /* HAVE_MMX_INLINE */
1102 #if HAVE_MMX_EXTERNAL
1103 if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
1104 c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
1105 c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
1108 c->vector_clip_int32 = ff_vector_clip_int32_mmx;
1109 #endif /* HAVE_MMX_EXTERNAL */
/* Wire up MMXEXT versions: Xvid IDCT (8-bit only), the qpel tables,
 * HuffYUV median prediction, scalar products and int16 windowing. */
1112 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
1115 #if HAVE_MMXEXT_INLINE
1116 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1118 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1119 c->idct_put = ff_idct_xvid_mmxext_put;
1120 c->idct_add = ff_idct_xvid_mmxext_add;
1121 c->idct = ff_idct_xvid_mmxext;
1123 #endif /* HAVE_MMXEXT_INLINE */
1125 #if HAVE_MMXEXT_EXTERNAL
1126 SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
1127 SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
1129 SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
1130 SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
1131 SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
1132 SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
1134 /* slower than cmov version on AMD */
1135 if (!(mm_flags & AV_CPU_FLAG_3DNOW))
1136 c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
1138 c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
1139 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
/* bit-exact mode must avoid the rounding variant */
1141 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1142 c->apply_window_int16 = ff_apply_window_int16_mmxext;
1144 c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
1146 #endif /* HAVE_MMXEXT_EXTERNAL */
/* Wire up SSE versions: 16-byte-aligned block clearing (8-bit only,
 * skipped for XvMC whose blocks may be unaligned) and float clipping. */
1149 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
1153 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1155 if (!high_bit_depth) {
1156 if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
1157 /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
1158 c->clear_block = clear_block_sse;
1159 c->clear_blocks = clear_blocks_sse;
1163 c->vector_clipf = vector_clipf_sse;
1164 #endif /* HAVE_SSE_INLINE */
/* Wire up SSE2 versions: Xvid IDCT (8-bit only), scalar products, int32
 * clipping (Atom gets the int variant), windowing and byte-swap. */
1167 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
1170 #if HAVE_SSE2_INLINE
1171 const int high_bit_depth = avctx->bits_per_raw_sample > 8;
1173 if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX) {
1174 c->idct_put = ff_idct_xvid_sse2_put;
1175 c->idct_add = ff_idct_xvid_sse2_add;
1176 c->idct = ff_idct_xvid_sse2;
1177 c->idct_permutation_type = FF_SSE2_IDCT_PERM;
1179 #endif /* HAVE_SSE2_INLINE */
1181 #if HAVE_SSE2_EXTERNAL
1182 c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
1183 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
1184 if (mm_flags & AV_CPU_FLAG_ATOM) {
1185 c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
1187 c->vector_clip_int32 = ff_vector_clip_int32_sse2;
/* bit-exact mode must avoid the rounding variant; SSE2SLOW CPUs keep
 * whatever an earlier init installed */
1189 if (avctx->flags & CODEC_FLAG_BITEXACT) {
1190 c->apply_window_int16 = ff_apply_window_int16_sse2;
1191 } else if (!(mm_flags & AV_CPU_FLAG_SSE2SLOW)) {
1192 c->apply_window_int16 = ff_apply_window_int16_round_sse2;
1194 c->bswap_buf = ff_bswap32_buf_sse2;
1195 #endif /* HAVE_SSE2_EXTERNAL */
/* Wire up SSSE3 versions: HuffYUV left prediction (SSE4.1 variant where
 * available), windowing (Atom-tuned variant on Atom), scalar product and
 * byte-swap. */
1198 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
1201 #if HAVE_SSSE3_EXTERNAL
1202 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
1203 if (mm_flags & AV_CPU_FLAG_SSE4) // not really sse4, just slow on Conroe
1204 c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
1206 if (mm_flags & AV_CPU_FLAG_ATOM)
1207 c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
1209 c->apply_window_int16 = ff_apply_window_int16_ssse3;
1210 if (!(mm_flags & (AV_CPU_FLAG_SSE42|AV_CPU_FLAG_3DNOW))) // cachesplit
1211 c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
1212 c->bswap_buf = ff_bswap32_buf_ssse3;
1213 #endif /* HAVE_SSSE3_EXTERNAL */
/* Wire up SSE4 versions (currently only the int32 clip). */
1216 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
1219 #if HAVE_SSE4_EXTERNAL
1220 c->vector_clip_int32 = ff_vector_clip_int32_sse4;
1221 #endif /* HAVE_SSE4_EXTERNAL */
/**
 * Public entry point: query the CPU feature flags once and call the
 * per-ISA init helpers in increasing capability order, so each later
 * level can override function pointers installed by an earlier one.
 * Encoder-specific pointers are installed last by ff_dsputilenc_init_mmx.
 */
1224 av_cold void ff_dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx)
1226 int mm_flags = av_get_cpu_flags();
/* cmov median prediction needs 7 GP registers; may be overridden by the
 * MMXEXT version below on non-3DNow CPUs */
1228 #if HAVE_7REGS && HAVE_INLINE_ASM
1229 if (mm_flags & AV_CPU_FLAG_CMOV)
1230 c->add_hfyu_median_prediction = add_hfyu_median_prediction_cmov;
1233 if (mm_flags & AV_CPU_FLAG_MMX)
1234 dsputil_init_mmx(c, avctx, mm_flags);
1236 if (mm_flags & AV_CPU_FLAG_MMXEXT)
1237 dsputil_init_mmxext(c, avctx, mm_flags);
1239 if (mm_flags & AV_CPU_FLAG_SSE)
1240 dsputil_init_sse(c, avctx, mm_flags);
1242 if (mm_flags & AV_CPU_FLAG_SSE2)
1243 dsputil_init_sse2(c, avctx, mm_flags);
1245 if (mm_flags & AV_CPU_FLAG_SSSE3)
1246 dsputil_init_ssse3(c, avctx, mm_flags);
1248 if (mm_flags & AV_CPU_FLAG_SSE4)
1249 dsputil_init_sse4(c, avctx, mm_flags);
1251 if (CONFIG_ENCODERS)
1252 ff_dsputilenc_init_mmx(c, avctx);