/*
 * VP8 DSP functions x86-optimized
 * Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
 * Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23 #include "libavutil/cpu.h"
24 #include "libavutil/x86_cpu.h"
25 #include "libavcodec/vp8dsp.h"
/*
 * Prototypes for the motion-compensation primitives that this file either
 * installs into the DSP context directly or wraps in wider / two-pass
 * variants. None of them is defined in this file. They all share a single
 * signature, which is spelled out once in VP8_MC_PROTO.
 */
#define VP8_MC_PROTO(NAME)                                   \
    extern void NAME(uint8_t *dst, ptrdiff_t dststride,      \
                     uint8_t *src, ptrdiff_t srcstride,      \
                     int height, int mx, int my)

/* epel, 4 pixels wide, mmx2 */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_mmx2);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_mmx2);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_mmx2);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_mmx2);

/* epel, 8 pixels wide, sse2 */
VP8_MC_PROTO(ff_put_vp8_epel8_h4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_sse2);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_sse2);

/* epel, 4 and 8 pixels wide, ssse3 */
VP8_MC_PROTO(ff_put_vp8_epel4_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel4_v6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_h6_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v4_ssse3);
VP8_MC_PROTO(ff_put_vp8_epel8_v6_ssse3);

/* bilinear, horizontal */
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_mmx2);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_h_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_h_ssse3);

/* bilinear, vertical */
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_mmx2);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_sse2);
VP8_MC_PROTO(ff_put_vp8_bilinear4_v_ssse3);
VP8_MC_PROTO(ff_put_vp8_bilinear8_v_ssse3);

/* plain pixel routines, installed in the [0][0] table slots */
VP8_MC_PROTO(ff_put_vp8_pixels8_mmx);
VP8_MC_PROTO(ff_put_vp8_pixels16_mmx);
VP8_MC_PROTO(ff_put_vp8_pixels16_sse);

#undef VP8_MC_PROTO
/*
 * TAP_W16(OPT, FILTERTYPE, TAPTYPE)
 *
 * Expands to a static 16-pixel-wide function
 *     ff_put_vp8_<FILTERTYPE>16_<TAPTYPE>_<OPT>()
 * that calls the matching 8-pixel-wide function twice: once at dst / src and
 * once at dst + 8 / src + 8. Strides, height, mx and my are passed through
 * unchanged. The 8-wide function must already be declared where the macro is
 * instantiated.
 *
 * The last line of the definition deliberately has no continuation
 * backslash, so the macro ends with the closing brace.
 */
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
        dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
/*
 * TAP_W8(OPT, FILTERTYPE, TAPTYPE)
 *
 * Expands to a static 8-pixel-wide function
 *     ff_put_vp8_<FILTERTYPE>8_<TAPTYPE>_<OPT>()
 * that calls the matching 4-pixel-wide function twice: once at dst / src and
 * once at dst + 4 / src + 4. Strides, height, mx and my are passed through
 * unchanged. The 4-wide function must already be declared where the macro is
 * instantiated.
 *
 * The last line of the definition deliberately has no continuation
 * backslash, so the macro ends with the closing brace.
 */
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst,     dststride, src,     srcstride, height, mx, my); \
    ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
        dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
142 TAP_W8 (mmx2, epel, h4)
143 TAP_W8 (mmx2, epel, h6)
144 TAP_W16(mmx2, epel, h6)
145 TAP_W8 (mmx2, epel, v4)
146 TAP_W8 (mmx2, epel, v6)
147 TAP_W16(mmx2, epel, v6)
148 TAP_W8 (mmx2, bilinear, h)
149 TAP_W16(mmx2, bilinear, h)
150 TAP_W8 (mmx2, bilinear, v)
151 TAP_W16(mmx2, bilinear, v)
154 TAP_W16(sse2, epel, h6)
155 TAP_W16(sse2, epel, v6)
156 TAP_W16(sse2, bilinear, h)
157 TAP_W16(sse2, bilinear, v)
159 TAP_W16(ssse3, epel, h6)
160 TAP_W16(ssse3, epel, v6)
161 TAP_W16(ssse3, bilinear, h)
162 TAP_W16(ssse3, bilinear, v)
/*
 * HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT)
 *
 * Expands to a static two-pass function
 *     ff_put_vp8_epel<SIZE>_h<TAPNUMX>v<TAPNUMY>_<OPT>()
 * built from the one-dimensional functions of the same OPT and SIZE:
 *
 *   1. src is moved up by (TAPNUMY / 2 - 1) rows and the h<TAPNUMX> function
 *      is run for height + TAPNUMY - 1 rows into the local buffer tmp, which
 *      uses SIZE as its row stride.
 *   2. The v<TAPNUMY> function is run for height rows from tmp into dst,
 *      starting (TAPNUMY / 2 - 1) rows into tmp so that it lines up with the
 *      row the caller originally passed as src.
 *
 * tmp is declared through DECLARE_ALIGNED(ALIGN, ...) and holds
 * MAXHEIGHT + TAPNUMY - 1 rows, so height is expected not to exceed
 * MAXHEIGHT.
 *
 * The last line of the definition deliberately has no continuation
 * backslash, so the macro ends with the closing brace.
 */
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
    uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
    src -= srcstride * (TAPNUMY / 2 - 1); \
    ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
        tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
    ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
        dst, dststride, tmpptr, SIZE, height, mx, my); \
}
179 #define HVTAPMMX(x, y) \
180 HVTAP(mmx2, 8, x, y, 4, 8) \
181 HVTAP(mmx2, 8, x, y, 8, 16)
183 HVTAP(mmx2, 8, 6, 6, 16, 16)
185 #define HVTAPMMX(x, y) \
186 HVTAP(mmx2, 8, x, y, 4, 8)
194 #define HVTAPSSE2(x, y, w) \
195 HVTAP(sse2, 16, x, y, w, 16) \
196 HVTAP(ssse3, 16, x, y, w, 16)
204 HVTAP(ssse3, 16, 4, 4, 4, 8)
205 HVTAP(ssse3, 16, 4, 6, 4, 8)
206 HVTAP(ssse3, 16, 6, 4, 4, 8)
207 HVTAP(ssse3, 16, 6, 6, 4, 8)
/*
 * HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT)
 *
 * Expands to a static two-pass function
 *     ff_put_vp8_bilinear<SIZE>_hv_<OPT>()
 * built from the one-dimensional bilinear functions of the same OPT and
 * SIZE:
 *
 *   1. the _h_ function is run for height + 1 rows from src into the local
 *      buffer tmp, which uses SIZE as its row stride;
 *   2. the _v_ function is run for height rows from tmp into dst.
 *
 * tmp is declared through DECLARE_ALIGNED(ALIGN, ...) and holds
 * MAXHEIGHT + 2 rows, so height is expected not to exceed MAXHEIGHT.
 *
 * The last line of the definition deliberately has no continuation
 * backslash, so the macro ends with the closing brace.
 */
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
    uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
    ptrdiff_t srcstride, int height, int mx, int my) \
{ \
    DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
    ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
        tmp, SIZE, src, srcstride, height + 1, mx, my); \
    ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
        dst, dststride, tmp, SIZE, height, mx, my); \
}
221 HVBILIN(mmx2, 8, 4, 8)
223 HVBILIN(mmx2, 8, 8, 16)
224 HVBILIN(mmx2, 8, 16, 16)
226 HVBILIN(sse2, 8, 8, 16)
227 HVBILIN(sse2, 8, 16, 16)
228 HVBILIN(ssse3, 8, 4, 8)
229 HVBILIN(ssse3, 8, 8, 16)
230 HVBILIN(ssse3, 8, 16, 16)
/*
 * Prototypes for the inverse-transform routines that the init function
 * installs into the DSP context: DC-only add (single block, 4 luma blocks,
 * chroma blocks), luma DC WHT, and IDCT-and-add. None is defined in this
 * file.
 *
 * NOTE(review): every prototype below that ends in a comma is incomplete as
 * shown -- its final parameter line is absent from this text. Restore the
 * missing parameter(s) from the function-pointer types of VP8DSPContext
 * rather than guessing.
 */
232 extern void ff_vp8_idct_dc_add_mmx(uint8_t *dst, DCTELEM block[16],
234 extern void ff_vp8_idct_dc_add_sse4(uint8_t *dst, DCTELEM block[16],
236 extern void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, DCTELEM block[4][16],
238 extern void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, DCTELEM block[4][16],
240 extern void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, DCTELEM block[2][16],
242 extern void ff_vp8_luma_dc_wht_mmx(DCTELEM block[4][4][16], DCTELEM dc[16]);
243 extern void ff_vp8_luma_dc_wht_sse(DCTELEM block[4][4][16], DCTELEM dc[16]);
244 extern void ff_vp8_idct_add_mmx(uint8_t *dst, DCTELEM block[16],
246 extern void ff_vp8_idct_add_sse(uint8_t *dst, DCTELEM block[16],
/*
 * DECLARE_LOOP_FILTER(NAME) declares, for one instruction-set suffix NAME,
 * the ten loop-filter entry points used by the init function:
 *   - v/h simple
 *   - v/h 16y inner,  v/h 8uv inner
 *   - v/h 16y mbedge, v/h 8uv mbedge
 * The 8uv variants take a dstU pointer as their first parameter; the others
 * take dst.
 *
 * NOTE(review): the parameter lists are incomplete as shown. The two
 * "simple" prototypes stop after the first parameter, and the others jump
 * from the first parameter straight to "int e, int i, int hvt". The
 * continuation lines in between are absent from this text and must be
 * restored before the macro can expand to valid declarations.
 */
249 #define DECLARE_LOOP_FILTER(NAME)\
250 extern void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
253 extern void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
256 extern void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
258 int e, int i, int hvt);\
259 extern void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
261 int e, int i, int hvt);\
262 extern void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
265 int e, int i, int hvt);\
266 extern void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
269 int e, int i, int hvt);\
270 extern void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
272 int e, int i, int hvt);\
273 extern void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
275 int e, int i, int hvt);\
276 extern void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
279 int e, int i, int hvt);\
280 extern void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
283 int e, int i, int hvt);
/* One set of loop-filter declarations per instruction-set suffix. */
285 DECLARE_LOOP_FILTER(mmx)
286 DECLARE_LOOP_FILTER(mmx2)
287 DECLARE_LOOP_FILTER(sse2)
288 DECLARE_LOOP_FILTER(ssse3)
289 DECLARE_LOOP_FILTER(sse4)
/*
 * VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills the three six-tap slots of c->put_vp8_epel_pixels_tab[IDX] with the
 * SIZE-pixel-wide OPT functions:
 *     [0][2] <- h6,  [2][0] <- v6,  [2][2] <- h6v6
 *
 * Expects a variable named c to be in scope. The body is wrapped in
 * do { } while (0) so the macro behaves as a single statement, for example
 * under an unbraced if. Invoke it with a trailing semicolon.
 */
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT; \
    } while (0)
/*
 * VP8_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills the slots of c->put_vp8_epel_pixels_tab[IDX] that involve a four-tap
 * filter with the SIZE-pixel-wide OPT functions:
 *     [0][1] <- h4,   [1][0] <- v4,   [1][1] <- h4v4,
 *     [1][2] <- h6v4, [2][1] <- h4v6
 * and then delegates the six-tap slots to VP8_LUMA_MC_FUNC.
 *
 * Expects a variable named c to be in scope. The body is wrapped in
 * do { } while (0) so the macro behaves as a single statement, for example
 * under an unbraced if. Invoke it with a trailing semicolon.
 */
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
        c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
        VP8_LUMA_MC_FUNC(IDX, SIZE, OPT); \
    } while (0)
/*
 * VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT)
 *
 * Fills every slot of c->put_vp8_bilinear_pixels_tab[IDX] except [0][0]
 * with the SIZE-pixel-wide OPT bilinear functions:
 *     [0][1], [0][2]                 <- h
 *     [1][0], [2][0]                 <- v
 *     [1][1], [1][2], [2][1], [2][2] <- hv
 *
 * Expects a variable named c to be in scope. The body is wrapped in
 * do { } while (0) so the macro behaves as a single statement, for example
 * under an unbraced if. Invoke it with a trailing semicolon.
 */
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
    do { \
        c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
        c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
    } while (0)
317 av_cold void ff_vp8dsp_init_x86(VP8DSPContext* c)
320 int mm_flags = av_get_cpu_flags();
322 if (mm_flags & AV_CPU_FLAG_MMX) {
323 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
324 c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
326 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
327 c->vp8_idct_add = ff_vp8_idct_add_mmx;
328 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
329 c->put_vp8_epel_pixels_tab[0][0][0] =
330 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
332 c->put_vp8_epel_pixels_tab[1][0][0] =
333 c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
336 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
337 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
339 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
340 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
341 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
342 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
344 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
345 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
346 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
347 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
351 /* note that 4-tap width=16 functions are missing because w=16
352 * is only used for luma, and luma is always a copy or sixtap. */
353 if (mm_flags & AV_CPU_FLAG_MMX2) {
354 VP8_MC_FUNC(2, 4, mmx2);
355 VP8_BILINEAR_MC_FUNC(2, 4, mmx2);
357 VP8_LUMA_MC_FUNC(0, 16, mmx2);
358 VP8_MC_FUNC(1, 8, mmx2);
359 VP8_BILINEAR_MC_FUNC(0, 16, mmx2);
360 VP8_BILINEAR_MC_FUNC(1, 8, mmx2);
362 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx2;
363 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx2;
365 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx2;
366 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx2;
367 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx2;
368 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx2;
370 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx2;
371 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx2;
372 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx2;
373 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx2;
377 if (mm_flags & AV_CPU_FLAG_SSE) {
378 c->vp8_idct_add = ff_vp8_idct_add_sse;
379 c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
380 c->put_vp8_epel_pixels_tab[0][0][0] =
381 c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
384 if (mm_flags & (AV_CPU_FLAG_SSE2|AV_CPU_FLAG_SSE2SLOW)) {
385 VP8_LUMA_MC_FUNC(0, 16, sse2);
386 VP8_MC_FUNC(1, 8, sse2);
387 VP8_BILINEAR_MC_FUNC(0, 16, sse2);
388 VP8_BILINEAR_MC_FUNC(1, 8, sse2);
390 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
392 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
393 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
395 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
396 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
399 if (mm_flags & AV_CPU_FLAG_SSE2) {
400 c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
402 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
404 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
405 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
407 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
408 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
411 if (mm_flags & AV_CPU_FLAG_SSSE3) {
412 VP8_LUMA_MC_FUNC(0, 16, ssse3);
413 VP8_MC_FUNC(1, 8, ssse3);
414 VP8_MC_FUNC(2, 4, ssse3);
415 VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
416 VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
417 VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
419 c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
420 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
422 c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
423 c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
424 c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
425 c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
427 c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
428 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
429 c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
430 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
433 if (mm_flags & AV_CPU_FLAG_SSE4) {
434 c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
436 c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
437 c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
438 c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;