/*
 * VP9 SIMD optimizations
 *
 * Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
23 #include "libavutil/attributes.h"
24 #include "libavutil/cpu.h"
25 #include "libavutil/mem.h"
26 #include "libavutil/x86/asm.h"
27 #include "libavutil/x86/cpu.h"
28 #include "libavcodec/vp9dsp.h"
/*
 * Declare the prototype of one full-pel block routine.
 *
 * The generated name is ff_<avg><sz>_<opt>; despite its name, the `avg`
 * argument carries the operation and is instantiated with both `put` and
 * `avg` below.  Only prototypes are produced here: the definitions are not
 * in this file (presumably they live in the assembly sources -- confirm).
 *
 * All routines share one signature so they can be stored in the same
 * function-pointer table as the sub-pel filters.
 */
#define fpel_func(avg, sz, opt) \
void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                          const uint8_t *src, ptrdiff_t src_stride, \
                          int h, int mx, int my)
fpel_func(put,  4, mmx);
fpel_func(put,  8, mmx);
fpel_func(put, 16, sse);
fpel_func(put, 32, sse);
fpel_func(put, 64, sse);
fpel_func(avg,  4, sse);
fpel_func(avg,  8, sse);
fpel_func(avg, 16, sse2);
fpel_func(avg, 32, sse2);
fpel_func(avg, 64, sse2);
/*
 * Declare the prototype of one 1-D 8-tap filter routine named
 * ff_<avg>_8tap_1d_<dir>_<sz>_<opt>.
 *
 * `filter` points at rows of 16 int8_t coefficients; callers in this file
 * pass one entry of the ff_filters_ssse3 table.
 */
#define mc_func(avg, sz, dir, opt) \
void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                             const uint8_t *src, ptrdiff_t src_stride, \
                                             int h, const int8_t (*filter)[16])

/*
 * Declare the four SSSE3 variants for one block width: put/avg in the
 * horizontal (h) and vertical (v) direction.  The last declaration has no
 * trailing semicolon; the invocation supplies it.
 */
#define mc_funcs(sz) \
mc_func(put, sz, h, ssse3); \
mc_func(avg, sz, h, ssse3); \
mc_func(put, sz, v, ssse3); \
mc_func(avg, sz, v, ssse3)
/*
 * Define a 1-D 8-tap filter routine of width `sz` in terms of the routine
 * of width `hsz`: the narrower routine is called twice, first on the
 * columns starting at dst/src and then on the columns starting `hsz`
 * bytes further right.  Strides, height and filter are passed through
 * unchanged.
 *
 * The generated function is static and always inlined, and has the same
 * signature as the routines it wraps.
 */
#define mc_rep_func(avg, sz, hsz, dir, opt) \
static av_always_inline void \
ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
                                        const uint8_t *src, ptrdiff_t src_stride, \
                                        int h, const int8_t (*filter)[16]) \
{ \
    ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst,       dst_stride, src, \
                                             src_stride, h, filter); \
    ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \
                                             src_stride, h, filter); \
}

/*
 * Generate the four SSSE3 wrappers (put/avg x h/v) for one width `sz`
 * built from the half width `hsz`.
 */
#define mc_rep_funcs(sz, hsz) \
mc_rep_func(put, sz, hsz, h, ssse3); \
mc_rep_func(avg, sz, hsz, h, ssse3); \
mc_rep_func(put, sz, hsz, v, ssse3); \
mc_rep_func(avg, sz, hsz, v, ssse3)
/*
 * Coefficient table for the SSSE3 8-tap filters, defined elsewhere.
 * It is indexed here as [filter type][mx - 1 or my - 1]; each such entry
 * is an array of 4 rows of 16 int8_t values.
 */
extern const int8_t ff_filters_ssse3[3][15][4][16];

/*
 * Define the 2-D (horizontal + vertical) 8-tap routine
 * <op>_8tap_<fname>_<sz>hv_ssse3 as two 1-D passes through a scratch
 * buffer:
 *
 *   1. the horizontal `put` filter reads from 3 rows above `src` and
 *      writes h + 7 rows into `temp`, using a fixed stride of 64 and the
 *      coefficients selected by `mx`;
 *   2. the vertical `op` filter reads from `temp` starting at row 3 and
 *      writes h rows into `dst`, using the coefficients selected by `my`.
 *
 * `temp` holds 71 rows of 64 bytes and is 16-byte aligned.  This assumes
 * h <= 64 and that mx and my are both >= 1 -- confirm against the callers.
 */
#define filter_8tap_2d_fn(op, sz, f, fname) \
static void op##_8tap_##fname##_##sz##hv_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
                                               const uint8_t *src, ptrdiff_t src_stride, \
                                               int h, int mx, int my) \
{ \
    LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \
    ff_put_8tap_1d_h_##sz##_ssse3(temp, 64, src - 3 * src_stride, src_stride, \
                                  h + 7, ff_filters_ssse3[f][mx - 1]); \
    ff_##op##_8tap_1d_v_##sz##_ssse3(dst, dst_stride, temp + 3 * 64, 64, \
                                     h, ff_filters_ssse3[f][my - 1]); \
}

/* Generate the 2-D routine for each of the three 8-tap filter types. */
#define filters_8tap_2d_fn(op, sz) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP,   sharp) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH,  smooth)

/* Generate the 2-D routines for every block width, widest first. */
#define filters_8tap_2d_fn2(op) \
filters_8tap_2d_fn(op, 64) \
filters_8tap_2d_fn(op, 32) \
filters_8tap_2d_fn(op, 16) \
filters_8tap_2d_fn(op, 8) \
filters_8tap_2d_fn(op, 4)
120 filters_8tap_2d_fn2(put)
121 filters_8tap_2d_fn2(avg)
123 #undef filters_8tap_2d_fn2
124 #undef filters_8tap_2d_fn
125 #undef filter_8tap_2d_fn
/*
 * Define the 1-D 8-tap routine <op>_8tap_<fname>_<sz><dir>_ssse3.
 *
 * It adapts the (h, mx, my) signature to the underlying 1-D routine
 * ff_<op>_8tap_1d_<dir>_<sz>_ssse3 by looking up the coefficients for
 * filter type `f` at position `dvar - 1`, where `dvar` names whichever of
 * mx / my applies to the direction.  The other one is unused.
 */
#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \
static void op##_8tap_##fname##_##sz##dir##_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
                                                  const uint8_t *src, ptrdiff_t src_stride, \
                                                  int h, int mx, int my) \
{ \
    ff_##op##_8tap_1d_##dir##_##sz##_ssse3(dst, dst_stride, src, src_stride, \
                                           h, ff_filters_ssse3[f][dvar - 1]); \
}

/* Generate the 1-D routine for each of the three 8-tap filter types. */
#define filters_8tap_1d_fn(op, sz, dir, dvar) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP,   sharp,   dir, dvar) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH,  smooth,  dir, dvar)

/* Horizontal routines are driven by mx, vertical routines by my. */
#define filters_8tap_1d_fn2(op, sz) \
filters_8tap_1d_fn(op, sz, h, mx) \
filters_8tap_1d_fn(op, sz, v, my)

/* Generate the 1-D routines for every block width, widest first. */
#define filters_8tap_1d_fn3(op) \
filters_8tap_1d_fn2(op, 64) \
filters_8tap_1d_fn2(op, 32) \
filters_8tap_1d_fn2(op, 16) \
filters_8tap_1d_fn2(op, 8) \
filters_8tap_1d_fn2(op, 4)
152 filters_8tap_1d_fn3(put)
153 filters_8tap_1d_fn3(avg)
155 #undef filters_8tap_1d_fn
156 #undef filters_8tap_1d_fn2
157 #undef filters_8tap_1d_fn3
158 #undef filter_8tap_1d_fn
/*
 * Prototypes of the SSSE3 inverse-transform-and-add routines, one per
 * transform size.  The definitions are not in this file.
 */
void ff_vp9_idct_idct_4x4_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_8x8_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_16x16_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);
void ff_vp9_idct_idct_32x32_add_ssse3(uint8_t *dst, ptrdiff_t stride, int16_t *block, int eob);

/*
 * Prototypes of the SSSE3 16-wide loop filters in the vertical (v) and
 * horizontal (h) direction.  The definitions are not in this file.
 */
void ff_vp9_loop_filter_v_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
void ff_vp9_loop_filter_h_16_16_ssse3(uint8_t *dst, ptrdiff_t stride, int E, int I, int H);
168 #endif /* HAVE_YASM */
170 av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
173 int cpu_flags = av_get_cpu_flags();
175 #define init_fpel(idx1, idx2, sz, type, opt) \
176 dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
177 dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
178 dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
179 dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_##type##sz##_##opt
182 #define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
183 dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
184 dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
185 dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt
187 #define init_subpel2(idx, idxh, idxv, dir, type, opt) \
188 init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
189 init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
190 init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
191 init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \
192 init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt)
194 #define init_subpel3(idx, type, opt) \
195 init_subpel2(idx, 1, 1, hv, type, opt); \
196 init_subpel2(idx, 0, 1, v, type, opt); \
197 init_subpel2(idx, 1, 0, h, type, opt)
199 if (EXTERNAL_MMX(cpu_flags)) {
200 init_fpel(4, 0, 4, put, mmx);
201 init_fpel(3, 0, 8, put, mmx);
204 if (EXTERNAL_SSE(cpu_flags)) {
205 init_fpel(2, 0, 16, put, sse);
206 init_fpel(1, 0, 32, put, sse);
207 init_fpel(0, 0, 64, put, sse);
208 init_fpel(4, 1, 4, avg, sse);
209 init_fpel(3, 1, 8, avg, sse);
212 if (EXTERNAL_SSE2(cpu_flags)) {
213 init_fpel(2, 1, 16, avg, sse2);
214 init_fpel(1, 1, 32, avg, sse2);
215 init_fpel(0, 1, 64, avg, sse2);
218 if (EXTERNAL_SSSE3(cpu_flags)) {
219 init_subpel3(0, put, ssse3);
220 init_subpel3(1, avg, ssse3);
221 dsp->itxfm_add[TX_4X4][DCT_DCT] = ff_vp9_idct_idct_4x4_add_ssse3;
223 dsp->itxfm_add[TX_8X8][DCT_DCT] = ff_vp9_idct_idct_8x8_add_ssse3;
224 dsp->itxfm_add[TX_16X16][DCT_DCT] = ff_vp9_idct_idct_16x16_add_ssse3;
225 dsp->itxfm_add[TX_32X32][ADST_ADST] =
226 dsp->itxfm_add[TX_32X32][ADST_DCT] =
227 dsp->itxfm_add[TX_32X32][DCT_ADST] =
228 dsp->itxfm_add[TX_32X32][DCT_DCT] = ff_vp9_idct_idct_32x32_add_ssse3;
229 dsp->loop_filter_16[0] = ff_vp9_loop_filter_h_16_16_ssse3;
230 dsp->loop_filter_16[1] = ff_vp9_loop_filter_v_16_16_ssse3;
239 #endif /* HAVE_YASM */