git.sesse.net Git - ffmpeg/blob - libavcodec/x86/dsputil_init.c
Fix the mime types for MPEG and MPEG-TS formats
[ffmpeg] / libavcodec / x86 / dsputil_init.c
1 /*
2  * Copyright (c) 2000, 2001 Fabrice Bellard
3  * Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
4  *
5  * This file is part of FFmpeg.
6  *
7  * FFmpeg is free software; you can redistribute it and/or
8  * modify it under the terms of the GNU Lesser General Public
9  * License as published by the Free Software Foundation; either
10  * version 2.1 of the License, or (at your option) any later version.
11  *
12  * FFmpeg is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15  * Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public
18  * License along with FFmpeg; if not, write to the Free Software
19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20  */
21
22 #include "config.h"
23 #include "libavutil/attributes.h"
24 #include "libavutil/cpu.h"
25 #include "libavutil/internal.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/avcodec.h"
29 #include "libavcodec/dsputil.h"
30 #include "libavcodec/pixels.h"
31 #include "libavcodec/simple_idct.h"
32 #include "libavcodec/version.h"
33 #include "dsputil_x86.h"
34 #include "fpel.h"
35 #include "idct_xvid.h"
36
/*
 * Prototypes for the mmxext-suffixed two-source pixel helpers (*_l2_*) and
 * the MPEG-4 qpel low-pass filters. Their names match the patterns that the
 * QPEL_OP() macro below builds by token pasting. Only declarations appear
 * here; the definitions are not visible in this file (presumably external
 * assembly -- confirm against the x86 asm sources).
 *
 * Signature split worth noting: the *_l2_* and *_h_lowpass_* routines take a
 * row count h, while the *_v_lowpass_* routines take only the two strides.
 */
37 void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
38                               int dstStride, int src1Stride, int h);
39 void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
40                                      uint8_t *src2, int dstStride,
41                                      int src1Stride, int h);
42 void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
43                               int dstStride, int src1Stride, int h);
44 void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
45                                int dstStride, int src1Stride, int h);
46 void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
47                                int dstStride, int src1Stride, int h);
48 void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
49                                       int dstStride, int src1Stride, int h);
50 void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
51                                           int dstStride, int srcStride, int h);
52 void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
53                                           int dstStride, int srcStride, int h);
54 void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
55                                                  int dstStride, int srcStride,
56                                                  int h);
57 void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
58                                          int dstStride, int srcStride, int h);
59 void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
60                                          int dstStride, int srcStride, int h);
61 void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
62                                                 int dstStride, int srcStride,
63                                                 int h);
64 void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
65                                           int dstStride, int srcStride);
66 void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
67                                           int dstStride, int srcStride);
68 void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
69                                                  int dstStride, int srcStride);
70 void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
71                                          int dstStride, int srcStride);
72 void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
73                                          int dstStride, int srcStride);
74 void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
75                                                 int dstStride, int srcStride);
/*
 * The no-rounding full-pel "put" names are aliased to the plain MMX put
 * routines, so a token-pasted ff_put_no_rnd_pixels{8,16}_mmxext resolves
 * without a separate implementation.
 */
76 #define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmx
77 #define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmx
78
/*
 * Prototypes for the int16 scalar-product routines, one per instruction-set
 * suffix (mmxext / sse2 / ssse3). All return int32_t.
 * NOTE(review): the *_and_madd_* variants take a non-const v1, so they
 * presumably also update v1 in place -- confirm against the implementation.
 */
79 int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
80                                       int order);
81 int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
82                                     int order);
83 int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
84                                                const int16_t *v3,
85                                                int order, int mul);
86 int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
87                                              const int16_t *v3,
88                                              int order, int mul);
89 int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
90                                               const int16_t *v3,
91                                               int order, int mul);
92
/*
 * Prototypes for the 32-bit buffer byte-swap routines (ssse3 and sse2
 * variants, same signature).
 * NOTE(review): w is presumably the number of 32-bit elements -- confirm.
 */
93 void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
94 void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
95
/*
 * Prototypes for the add_hfyu prediction helpers. The median variant returns
 * void and receives left / left_top by pointer; the left-prediction variants
 * receive left by value and return int.
 * NOTE(review): the int return is presumably the updated left value --
 * confirm against the implementation.
 */
96 void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
97                                           const uint8_t *diff, int w,
98                                           int *left, int *left_top);
99 int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
100                                       int w, int left);
101 int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
102                                      int w, int left);
103
/*
 * Prototypes for the int32 vector clipping routines. All four share one
 * signature (dst, src, min, max, len) and differ only in the name suffix
 * (mmx, sse2, int_sse2, sse4).
 */
104 void ff_vector_clip_int32_mmx(int32_t *dst, const int32_t *src,
105                               int32_t min, int32_t max, unsigned int len);
106 void ff_vector_clip_int32_sse2(int32_t *dst, const int32_t *src,
107                                int32_t min, int32_t max, unsigned int len);
108 void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
109                                    int32_t min, int32_t max, unsigned int len);
110 void ff_vector_clip_int32_sse4(int32_t *dst, const int32_t *src,
111                                int32_t min, int32_t max, unsigned int len);
112
113 #if HAVE_YASM
114
/*
 * Inside the HAVE_YASM section: the mmxext full-pel "put" names are aliased
 * to the MMX routines, so only avg_pixels16 gets its own mmxext prototype
 * here. Unlike the int-stride prototypes earlier in the file, this one takes
 * a const source pointer and a ptrdiff_t line size.
 */
115 #define ff_put_pixels16_mmxext ff_put_pixels16_mmx
116 #define ff_put_pixels8_mmxext  ff_put_pixels8_mmx
117
118 void ff_avg_pixels16_mmxext(uint8_t *block, const uint8_t *pixels,
119                             ptrdiff_t line_size, int h);
120
121 #define QPEL_OP(OPNAME, RND, MMX)                                       \
122 static void OPNAME ## qpel8_mc00_ ## MMX(uint8_t *dst, uint8_t *src,    \
123                                          ptrdiff_t stride)              \
124 {                                                                       \
125     ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8);              \
126 }                                                                       \
127                                                                         \
128 static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src,    \
129                                          ptrdiff_t stride)              \
130 {                                                                       \
131     uint64_t temp[8];                                                   \
132     uint8_t *const half = (uint8_t *) temp;                             \
133     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
134                                                    stride, 8);          \
135     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
136                                         stride, stride, 8);             \
137 }                                                                       \
138                                                                         \
139 static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src,    \
140                                          ptrdiff_t stride)              \
141 {                                                                       \
142     ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride,    \
143                                                    stride, 8);          \
144 }                                                                       \
145                                                                         \
146 static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src,    \
147                                          ptrdiff_t stride)              \
148 {                                                                       \
149     uint64_t temp[8];                                                   \
150     uint8_t *const half = (uint8_t *) temp;                             \
151     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8,        \
152                                                    stride, 8);          \
153     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride,     \
154                                         stride, 8);                     \
155 }                                                                       \
156                                                                         \
157 static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src,    \
158                                          ptrdiff_t stride)              \
159 {                                                                       \
160     uint64_t temp[8];                                                   \
161     uint8_t *const half = (uint8_t *) temp;                             \
162     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
163                                                    8, stride);          \
164     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half,                 \
165                                         stride, stride, 8);             \
166 }                                                                       \
167                                                                         \
168 static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src,    \
169                                          ptrdiff_t stride)              \
170 {                                                                       \
171     ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src,            \
172                                                    stride, stride);     \
173 }                                                                       \
174                                                                         \
175 static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src,    \
176                                          ptrdiff_t stride)              \
177 {                                                                       \
178     uint64_t temp[8];                                                   \
179     uint8_t *const half = (uint8_t *) temp;                             \
180     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src,           \
181                                                    8, stride);          \
182     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
183                                         stride, 8);                     \
184 }                                                                       \
185                                                                         \
186 static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src,    \
187                                          ptrdiff_t stride)              \
188 {                                                                       \
189     uint64_t half[8 + 9];                                               \
190     uint8_t *const halfH  = (uint8_t *) half + 64;                      \
191     uint8_t *const halfHV = (uint8_t *) half;                           \
192     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
193                                                    stride, 9);          \
194     ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
195                                         stride, 9);                     \
196     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
197     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
198                                         stride, 8, 8);                  \
199 }                                                                       \
200                                                                         \
201 static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src,    \
202                                          ptrdiff_t stride)              \
203 {                                                                       \
204     uint64_t half[8 + 9];                                               \
205     uint8_t *const halfH  = (uint8_t *) half + 64;                      \
206     uint8_t *const halfHV = (uint8_t *) half;                           \
207     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
208                                                    stride, 9);          \
209     ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
210                                         stride, 9);                     \
211     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
212     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
213                                         stride, 8, 8);                  \
214 }                                                                       \
215                                                                         \
216 static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src,    \
217                                          ptrdiff_t stride)              \
218 {                                                                       \
219     uint64_t half[8 + 9];                                               \
220     uint8_t *const halfH  = (uint8_t *) half + 64;                      \
221     uint8_t *const halfHV = (uint8_t *) half;                           \
222     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
223                                                    stride, 9);          \
224     ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8,           \
225                                         stride, 9);                     \
226     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
227     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
228                                         stride, 8, 8);                  \
229 }                                                                       \
230                                                                         \
231 static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src,    \
232                                          ptrdiff_t stride)              \
233 {                                                                       \
234     uint64_t half[8 + 9];                                               \
235     uint8_t *const halfH  = (uint8_t *) half + 64;                      \
236     uint8_t *const halfHV = (uint8_t *) half;                           \
237     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
238                                                    stride, 9);          \
239     ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
240                                         stride, 9);                     \
241     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
242     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
243                                         stride, 8, 8);                  \
244 }                                                                       \
245                                                                         \
246 static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src,    \
247                                          ptrdiff_t stride)              \
248 {                                                                       \
249     uint64_t half[8 + 9];                                               \
250     uint8_t *const halfH  = (uint8_t *) half + 64;                      \
251     uint8_t *const halfHV = (uint8_t *) half;                           \
252     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
253                                                    stride, 9);          \
254     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
255     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV,             \
256                                         stride, 8, 8);                  \
257 }                                                                       \
258                                                                         \
259 static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src,    \
260                                          ptrdiff_t stride)              \
261 {                                                                       \
262     uint64_t half[8 + 9];                                               \
263     uint8_t *const halfH  = (uint8_t *) half + 64;                      \
264     uint8_t *const halfHV = (uint8_t *) half;                           \
265     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
266                                                    stride, 9);          \
267     ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
268     ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV,         \
269                                         stride, 8, 8);                  \
270 }                                                                       \
271                                                                         \
272 static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src,    \
273                                          ptrdiff_t stride)              \
274 {                                                                       \
275     uint64_t half[8 + 9];                                               \
276     uint8_t *const halfH = (uint8_t *) half;                            \
277     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
278                                                    stride, 9);          \
279     ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH,              \
280                                         8, stride, 9);                  \
281     ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
282                                                    stride, 8);          \
283 }                                                                       \
284                                                                         \
285 static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src,    \
286                                          ptrdiff_t stride)              \
287 {                                                                       \
288     uint64_t half[8 + 9];                                               \
289     uint8_t *const halfH = (uint8_t *) half;                            \
290     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
291                                                    stride, 9);          \
292     ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8,       \
293                                         stride, 9);                     \
294     ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
295                                                    stride, 8);          \
296 }                                                                       \
297                                                                         \
298 static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src,    \
299                                          ptrdiff_t stride)              \
300 {                                                                       \
301     uint64_t half[9];                                                   \
302     uint8_t *const halfH = (uint8_t *) half;                            \
303     ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8,       \
304                                                    stride, 9);          \
305     ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH,          \
306                                                    stride, 8);          \
307 }                                                                       \
308                                                                         \
309 static void OPNAME ## qpel16_mc00_ ## MMX(uint8_t *dst, uint8_t *src,   \
310                                           ptrdiff_t stride)             \
311 {                                                                       \
312     ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16);            \
313 }                                                                       \
314                                                                         \
315 static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src,   \
316                                           ptrdiff_t stride)             \
317 {                                                                       \
318     uint64_t temp[32];                                                  \
319     uint8_t *const half = (uint8_t *) temp;                             \
320     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
321                                                     stride, 16);        \
322     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
323                                          stride, 16);                   \
324 }                                                                       \
325                                                                         \
326 static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src,   \
327                                           ptrdiff_t stride)             \
328 {                                                                       \
329     ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src,           \
330                                                     stride, stride, 16);\
331 }                                                                       \
332                                                                         \
333 static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src,   \
334                                           ptrdiff_t stride)             \
335 {                                                                       \
336     uint64_t temp[32];                                                  \
337     uint8_t *const half = (uint8_t*) temp;                              \
338     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16,      \
339                                                     stride, 16);        \
340     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half,            \
341                                          stride, stride, 16);           \
342 }                                                                       \
343                                                                         \
344 static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src,   \
345                                           ptrdiff_t stride)             \
346 {                                                                       \
347     uint64_t temp[32];                                                  \
348     uint8_t *const half = (uint8_t *) temp;                             \
349     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
350                                                     stride);            \
351     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride,        \
352                                          stride, 16);                   \
353 }                                                                       \
354                                                                         \
355 static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src,   \
356                                           ptrdiff_t stride)             \
357 {                                                                       \
358     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src,           \
359                                                     stride, stride);    \
360 }                                                                       \
361                                                                         \
362 static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src,   \
363                                           ptrdiff_t stride)             \
364 {                                                                       \
365     uint64_t temp[32];                                                  \
366     uint8_t *const half = (uint8_t *) temp;                             \
367     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16,      \
368                                                     stride);            \
369     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half,         \
370                                          stride, stride, 16);           \
371 }                                                                       \
372                                                                         \
373 static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src,   \
374                                           ptrdiff_t stride)             \
375 {                                                                       \
376     uint64_t half[16 * 2 + 17 * 2];                                     \
377     uint8_t *const halfH  = (uint8_t *) half + 256;                     \
378     uint8_t *const halfHV = (uint8_t *) half;                           \
379     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
380                                                     stride, 17);        \
381     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
382                                          stride, 17);                   \
383     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
384                                                     16, 16);            \
385     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
386                                          stride, 16, 16);               \
387 }                                                                       \
388                                                                         \
389 static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src,   \
390                                           ptrdiff_t stride)             \
391 {                                                                       \
392     uint64_t half[16 * 2 + 17 * 2];                                     \
393     uint8_t *const halfH  = (uint8_t *) half + 256;                     \
394     uint8_t *const halfHV = (uint8_t *) half;                           \
395     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
396                                                     stride, 17);        \
397     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
398                                          stride, 17);                   \
399     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
400                                                     16, 16);            \
401     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
402                                          stride, 16, 16);               \
403 }                                                                       \
404                                                                         \
405 static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src,   \
406                                           ptrdiff_t stride)             \
407 {                                                                       \
408     uint64_t half[16 * 2 + 17 * 2];                                     \
409     uint8_t *const halfH  = (uint8_t *) half + 256;                     \
410     uint8_t *const halfHV = (uint8_t *) half;                           \
411     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
412                                                     stride, 17);        \
413     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
414                                          stride, 17);                   \
415     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
416                                                     16, 16);            \
417     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
418                                          stride, 16, 16);               \
419 }                                                                       \
420                                                                         \
421 static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src,   \
422                                           ptrdiff_t stride)             \
423 {                                                                       \
424     uint64_t half[16 * 2 + 17 * 2];                                     \
425     uint8_t *const halfH  = (uint8_t *) half + 256;                     \
426     uint8_t *const halfHV = (uint8_t *) half;                           \
427     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
428                                                     stride, 17);        \
429     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
430                                          stride, 17);                   \
431     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
432                                                     16, 16);            \
433     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
434                                          stride, 16, 16);               \
435 }                                                                       \
436                                                                         \
437 static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src,   \
438                                           ptrdiff_t stride)             \
439 {                                                                       \
440     uint64_t half[16 * 2 + 17 * 2];                                     \
441     uint8_t *const halfH  = (uint8_t *) half + 256;                     \
442     uint8_t *const halfHV = (uint8_t *) half;                           \
443     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
444                                                     stride, 17);        \
445     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
446                                                     16, 16);            \
447     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV,            \
448                                          stride, 16, 16);               \
449 }                                                                       \
450                                                                         \
451 static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src,   \
452                                           ptrdiff_t stride)             \
453 {                                                                       \
454     uint64_t half[16 * 2 + 17 * 2];                                     \
455     uint8_t *const halfH  = (uint8_t *) half + 256;                     \
456     uint8_t *const halfHV = (uint8_t *) half;                           \
457     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
458                                                     stride, 17);        \
459     ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH,      \
460                                                     16, 16);            \
461     ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV,       \
462                                          stride, 16, 16);               \
463 }                                                                       \
464                                                                         \
465 static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src,   \
466                                           ptrdiff_t stride)             \
467 {                                                                       \
468     uint64_t half[17 * 2];                                              \
469     uint8_t *const halfH = (uint8_t *) half;                            \
470     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
471                                                     stride, 17);        \
472     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16,         \
473                                          stride, 17);                   \
474     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
475                                                     stride, 16);        \
476 }                                                                       \
477                                                                         \
478 static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src,   \
479                                           ptrdiff_t stride)             \
480 {                                                                       \
481     uint64_t half[17 * 2];                                              \
482     uint8_t *const halfH = (uint8_t *) half;                            \
483     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
484                                                     stride, 17);        \
485     ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16,     \
486                                          stride, 17);                   \
487     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
488                                                     stride, 16);        \
489 }                                                                       \
490                                                                         \
491 static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src,   \
492                                           ptrdiff_t stride)             \
493 {                                                                       \
494     uint64_t half[17 * 2];                                              \
495     uint8_t *const halfH = (uint8_t *) half;                            \
496     ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16,     \
497                                                     stride, 17);        \
498     ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH,         \
499                                                     stride, 16);        \
500 }
501
502 QPEL_OP(put_,        _,        mmxext)
503 QPEL_OP(avg_,        _,        mmxext)
504 QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
505
506 #endif /* HAVE_YASM */
507
/**
 * Fill one 16-entry row of a quarter-pel function table.
 *
 * The macro writes through a pointer named `c` that must be in scope where
 * the macro is expanded. Slot N of c-><TABLE>_pixels_tab[ROW] is assigned
 * the identifier <NS><TABLE><BLOCK>_mcXY_<ISA>, with N = X + 4 * Y.
 *
 * @param TABLE table name prefix; also pasted into the function name
 * @param ROW   first index into the table
 * @param BLOCK block-size token pasted into the function name
 * @param ISA   instruction-set suffix pasted at the end of the function name
 * @param NS    token pasted in front of the function name; may be empty
 */
#define SET_QPEL_FUNCS(TABLE, ROW, BLOCK, ISA, NS)                            \
do {                                                                          \
    c->TABLE ## _pixels_tab[ROW][ 0] = NS ## TABLE ## BLOCK ## _mc00_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 1] = NS ## TABLE ## BLOCK ## _mc10_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 2] = NS ## TABLE ## BLOCK ## _mc20_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 3] = NS ## TABLE ## BLOCK ## _mc30_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 4] = NS ## TABLE ## BLOCK ## _mc01_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 5] = NS ## TABLE ## BLOCK ## _mc11_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 6] = NS ## TABLE ## BLOCK ## _mc21_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 7] = NS ## TABLE ## BLOCK ## _mc31_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 8] = NS ## TABLE ## BLOCK ## _mc02_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][ 9] = NS ## TABLE ## BLOCK ## _mc12_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][10] = NS ## TABLE ## BLOCK ## _mc22_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][11] = NS ## TABLE ## BLOCK ## _mc32_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][12] = NS ## TABLE ## BLOCK ## _mc03_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][13] = NS ## TABLE ## BLOCK ## _mc13_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][14] = NS ## TABLE ## BLOCK ## _mc23_ ## ISA; \
    c->TABLE ## _pixels_tab[ROW][15] = NS ## TABLE ## BLOCK ## _mc33_ ## ISA; \
} while (0)
527
528 static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
529                                      int cpu_flags, unsigned high_bit_depth)
530 {
531 #if HAVE_MMX_INLINE
532     c->put_pixels_clamped        = ff_put_pixels_clamped_mmx;
533     c->add_pixels_clamped        = ff_add_pixels_clamped_mmx;
534
535     if (!high_bit_depth) {
536         c->draw_edges   = ff_draw_edges_mmx;
537     }
538
539 #if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
540     c->gmc = ff_gmc_mmx;
541 #endif
542
543     c->add_bytes = ff_add_bytes_mmx;
544 #endif /* HAVE_MMX_INLINE */
545
546 #if HAVE_MMX_EXTERNAL
547     if (!high_bit_depth) {
548         c->clear_block  = ff_clear_block_mmx;
549         c->clear_blocks = ff_clear_blocks_mmx;
550     }
551     c->vector_clip_int32 = ff_vector_clip_int32_mmx;
552     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
553 #endif /* HAVE_MMX_EXTERNAL */
554 }
555
556 static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
557                                         int cpu_flags, unsigned high_bit_depth)
558 {
559 #if HAVE_MMXEXT_INLINE
560     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
561         c->idct_put = ff_idct_xvid_mmxext_put;
562         c->idct_add = ff_idct_xvid_mmxext_add;
563         c->idct     = ff_idct_xvid_mmxext;
564     }
565 #endif /* HAVE_MMXEXT_INLINE */
566
567 #if HAVE_MMXEXT_EXTERNAL
568     SET_QPEL_FUNCS(avg_qpel,        0, 16, mmxext, );
569     SET_QPEL_FUNCS(avg_qpel,        1,  8, mmxext, );
570
571     SET_QPEL_FUNCS(put_qpel,        0, 16, mmxext, );
572     SET_QPEL_FUNCS(put_qpel,        1,  8, mmxext, );
573     SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
574     SET_QPEL_FUNCS(put_no_rnd_qpel, 1,  8, mmxext, );
575
576     /* slower than cmov version on AMD */
577     if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
578         c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
579
580     c->scalarproduct_int16          = ff_scalarproduct_int16_mmxext;
581     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
582 #endif /* HAVE_MMXEXT_EXTERNAL */
583 }
584
585 static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
586                                      int cpu_flags, unsigned high_bit_depth)
587 {
588 #if HAVE_SSE_INLINE
589     c->vector_clipf = ff_vector_clipf_sse;
590 #endif /* HAVE_SSE_INLINE */
591
592 #if HAVE_YASM
593 #if HAVE_SSE_EXTERNAL
594     /* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
595     if (CONFIG_XVMC && avctx->hwaccel && avctx->hwaccel->decode_mb)
596         return;
597
598     if (!high_bit_depth) {
599         c->clear_block  = ff_clear_block_sse;
600         c->clear_blocks = ff_clear_blocks_sse;
601     }
602 #endif
603 #if HAVE_INLINE_ASM && CONFIG_VIDEODSP
604     c->gmc = ff_gmc_sse;
605 #endif
606 #endif /* HAVE_YASM */
607 }
608
609 static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
610                                       int cpu_flags, unsigned high_bit_depth)
611 {
612 #if HAVE_SSE2_INLINE
613     if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
614         c->idct_put              = ff_idct_xvid_sse2_put;
615         c->idct_add              = ff_idct_xvid_sse2_add;
616         c->idct                  = ff_idct_xvid_sse2;
617         c->idct_permutation_type = FF_SSE2_IDCT_PERM;
618     }
619 #endif /* HAVE_SSE2_INLINE */
620
621 #if HAVE_SSE2_EXTERNAL
622     c->scalarproduct_int16          = ff_scalarproduct_int16_sse2;
623     c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
624     if (cpu_flags & AV_CPU_FLAG_ATOM) {
625         c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
626     } else {
627         c->vector_clip_int32 = ff_vector_clip_int32_sse2;
628     }
629     c->bswap_buf = ff_bswap32_buf_sse2;
630     c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_sse2;
631 #endif /* HAVE_SSE2_EXTERNAL */
632 }
633
634 static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
635                                        int cpu_flags, unsigned high_bit_depth)
636 {
637 #if HAVE_SSSE3_EXTERNAL
638     c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
639     if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
640         c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
641
642     if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
643         c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
644     c->bswap_buf = ff_bswap32_buf_ssse3;
645 #endif /* HAVE_SSSE3_EXTERNAL */
646 }
647
648 static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
649                                       int cpu_flags, unsigned high_bit_depth)
650 {
651 #if HAVE_SSE4_EXTERNAL
652     c->vector_clip_int32 = ff_vector_clip_int32_sse4;
653 #endif /* HAVE_SSE4_EXTERNAL */
654 }
655
656 av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx,
657                                  unsigned high_bit_depth)
658 {
659     int cpu_flags = av_get_cpu_flags();
660
661 #if HAVE_7REGS && HAVE_INLINE_ASM
662     if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_CMOV)
663         c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov;
664 #endif
665
666     if (X86_MMX(cpu_flags)) {
667 #if HAVE_INLINE_ASM
668         const int idct_algo = avctx->idct_algo;
669
670         if (avctx->lowres == 0 && !high_bit_depth) {
671             if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
672                 c->idct_put              = ff_simple_idct_put_mmx;
673                 c->idct_add              = ff_simple_idct_add_mmx;
674                 c->idct                  = ff_simple_idct_mmx;
675                 c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
676             } else if (idct_algo == FF_IDCT_XVIDMMX) {
677                 c->idct_put              = ff_idct_xvid_mmx_put;
678                 c->idct_add              = ff_idct_xvid_mmx_add;
679                 c->idct                  = ff_idct_xvid_mmx;
680             }
681         }
682 #endif /* HAVE_INLINE_ASM */
683
684         dsputil_init_mmx(c, avctx, cpu_flags, high_bit_depth);
685     }
686
687     if (X86_MMXEXT(cpu_flags))
688         dsputil_init_mmxext(c, avctx, cpu_flags, high_bit_depth);
689
690     if (X86_SSE(cpu_flags))
691         dsputil_init_sse(c, avctx, cpu_flags, high_bit_depth);
692
693     if (X86_SSE2(cpu_flags))
694         dsputil_init_sse2(c, avctx, cpu_flags, high_bit_depth);
695
696     if (EXTERNAL_SSSE3(cpu_flags))
697         dsputil_init_ssse3(c, avctx, cpu_flags, high_bit_depth);
698
699     if (EXTERNAL_SSE4(cpu_flags))
700         dsputil_init_sse4(c, avctx, cpu_flags, high_bit_depth);
701
702     if (CONFIG_ENCODERS)
703         ff_dsputilenc_init_mmx(c, avctx, high_bit_depth);
704 }