2 * Copyright (c) 2015 -2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
/* Byte-shuffle masks used to gather the taps of the H.264 6-tap luma
 * filter (1, -5, 20, 20, -5, 1) from loaded source vectors.
 * Rows 0-2 (offset 0): masks for the 8-pixel-wide path, pairing the
 * outer (+-0/+-5), next (+-1/+-4) and inner (+-2/+-3) taps.
 * Rows 3-5 (offset 48): masks for the 4-pixel-wide path; indices >= 16
 * select bytes from the second input vector of a two-vector shuffle. */
24 static const uint8_t luma_mask_arr[16 * 6] __attribute__((aligned(0x40))) = {
26     0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
27     1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
28     2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
31     0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
32     1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
33     2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
/* 6-tap filter over six byte vectors: interleaves the symmetric tap pairs
 * (vec5/vec0, vec4/vec1, vec3/vec2) and accumulates
 * 1*(p0+p5) - 5*(p1+p4) + 20*(p2+p3) into two signed-halfword outputs
 * (the output parameters are on the macro's continuation lines). */
36 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
39     v16i8 tmp0_m, tmp1_m; \
40     v16i8 minus5b_m = __msa_ldi_b(-5); \
41     v16i8 plus20b_m = __msa_ldi_b(20); \
43     ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
44     HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
45     ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
46     DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
47     ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
48     DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
/* Horizontal 6-tap filter for one row: uses the three shuffle masks to
 * gather the tap pairs from in0/in1, then accumulates
 * 1*outer - 5*next + 20*inner into a signed-halfword vector (out0_m,
 * yielded by the macro's elided tail). */
51 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)        \
55     v16i8 minus5b = __msa_ldi_b(-5);                              \
56     v16i8 plus20b = __msa_ldi_b(20);                              \
58     tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0);               \
59     out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);                      \
61     tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0);               \
62     out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);            \
64     tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0);               \
65     out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m);            \
/* Three-step byte dot-product accumulation: out = in0.coeff0 +
 * in1.coeff1 + in2.coeff2, producing a signed-halfword vector.
 * Used for the vertical 6-tap filter on interleaved row pairs. */
70 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
74     out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
75     out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
76     out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
/* Halfword variant of the three-step dot product, accumulating into
 * signed words, then rounding-shifting by 10 and saturating to the
 * signed 8-bit range (word precision for the double-filtered path). */
81 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)       \
85     out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0);           \
86     out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1);  \
87     out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2);  \
88     out0_m = __msa_srari_w(out0_m, 10);                             \
89     out0_m = __msa_sat_s_w(out0_m, 7);                              \
/* 4x4 luma quarter-pel "hv" put: computes the horizontal half-pel result
 * from src_x and the vertical half-pel result from src_y (both 6-tap,
 * normalized with a rounding shift by 5 and saturated), then averages the
 * two with rounding (srari by 1) and stores 4x4 bytes to dst. */
93 static void avc_luma_hv_qrt_4x4_msa(const uint8_t *src_x, const uint8_t *src_y,
94                                     uint8_t *dst, int32_t stride)
96     const int16_t filt_const0 = 0xfb01;
97     const int16_t filt_const1 = 0x1414;
98     const int16_t filt_const2 = 0x1fb;
100     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
101     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
102     v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
103     v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
104     v8i16 hz_out0, hz_out1, vt_out0, vt_out1, out0, out1;
/* replicate the packed 6-tap coefficient pairs into each halfword lane */
106     filt0 = (v16i8) __msa_fill_h(filt_const0);
107     filt1 = (v16i8) __msa_fill_h(filt_const1);
108     filt2 = (v16i8) __msa_fill_h(filt_const2);
/* 4-wide masks live at offset 48 of luma_mask_arr */
110     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
112     LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
113     src_y += (5 * stride);
/* pack two consecutive 4-pixel rows per vector for the vertical filter */
115     src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
116     src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
117     src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
118     src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
120     XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
/* horizontal half-pel filtering of the 4 rows from src_x */
122     LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
123     XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
124     hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
125     hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
127     SRARI_H2_SH(hz_out0, hz_out1, 5);
128     SAT_SH2_SH(hz_out0, hz_out1, 7);
130     LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
132     src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
133     src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
134     src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
135     src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
/* vertical half-pel filtering on interleaved row pairs */
137     XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
138     ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
139     ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
140     vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
142     vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
144     SRARI_H2_SH(vt_out0, vt_out1, 5);
145     SAT_SH2_SH(vt_out0, vt_out1, 7);
/* rounded average of the horizontal and vertical half-pel results */
147     out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
148     out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
150     SAT_SH2_SH(out0, out1, 7);
151     out = PCKEV_XORI128_UB(out0, out1);
152     ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 8x8 luma quarter-pel "hv" put: same scheme as the 4x4 variant but full
 * 8-wide rows, processed as two 8x4 halves. Each half computes the 6-tap
 * horizontal half-pel (from src_x) and vertical half-pel (from src_y),
 * rounds/saturates both, then stores the rounded average to dst. */
155 static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
156                                     uint8_t *dst, int32_t stride)
158     const int16_t filt_const0 = 0xfb01;
159     const int16_t filt_const1 = 0x1414;
160     const int16_t filt_const2 = 0x1fb;
162     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
163     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
164     v16i8 src_vt7, src_vt8, src_vt9, src_vt10, src_vt11, src_vt12;
165     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
166     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
167     v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
168     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
169     v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
171     filt0 = (v16i8) __msa_fill_h(filt_const0);
172     filt1 = (v16i8) __msa_fill_h(filt_const1);
173     filt2 = (v16i8) __msa_fill_h(filt_const2);
/* 8-wide masks at offset 0; prime the vertical pipeline with 5 rows */
175     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
176     LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
177     src_y += (5 * stride);
179     XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
/* --- first 8x4 half --- */
181     LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
182     XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
183     src_x += (4 * stride);
185     hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
186     hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
187     hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
188     hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
190     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
191     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
193     LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
194     src_y += (4 * stride);
195     XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
/* interleave consecutive rows so AVC_DOT_SH3_SH sees tap pairs */
197     ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
198                src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
199     ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
200                src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
201     vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
203     vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
205     vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
207     vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
209     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
210     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
/* rounded average of horizontal and vertical half-pel results */
212     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
213     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
214     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
215     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
/* preload the next 4 horizontal rows before storing this half */
217     LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
218     XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
220     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
221     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
222     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
223     ST8x4_UB(out0, out1, dst, stride);
/* --- second 8x4 half, reusing the last interleaved vertical rows --- */
226     LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
227     XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
229     hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
230     hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
231     hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
232     hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
234     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
235     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
237     ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
238                src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
240     vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
242     vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
244     vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
246     vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
248     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
249     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
251     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
252     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
253     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
254     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
256     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
257     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
258     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
259     ST8x4_UB(out0, out1, dst, stride);
/* 16x16 luma quarter-pel "hv" put: processed as two 8-wide columns
 * (multiple8_cnt), each column as four 8x4 strips (loop_cnt). Per strip:
 * 6-tap horizontal half-pel from src_x, 6-tap vertical half-pel from
 * src_y, both rounded (>>5) and saturated, then the rounded average of
 * the two is packed and stored to dst. */
263 static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
264                                       const uint8_t *src_y, uint8_t *dst,
267     const int16_t filt_const0 = 0xfb01;
268     const int16_t filt_const1 = 0x1414;
269     const int16_t filt_const2 = 0x1fb;
/* saved base pointers so the second 8-wide column can restart from them */
270     const uint8_t *src_x_tmp = src_x;
271     const uint8_t *src_y_tmp = src_y;
272     uint8_t *dst_tmp = dst;
273     uint32_t multiple8_cnt, loop_cnt;
275     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
276     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
277     v16i8 src_vt7, src_vt8;
278     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
279     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
280     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
281     v8i16 vt_out3, out0, out1, out2, out3;
283     filt0 = (v16i8) __msa_fill_h(filt_const0);
284     filt1 = (v16i8) __msa_fill_h(filt_const1);
285     filt2 = (v16i8) __msa_fill_h(filt_const2);
287     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
289     for (multiple8_cnt = 2; multiple8_cnt--;) {
/* prime the vertical filter pipeline with the first 5 rows */
294         LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
295         src_y += (5 * stride);
297         XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
299         for (loop_cnt = 4; loop_cnt--;) {
300             LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
301             XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
302             src_x += (4 * stride);
304             hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
305             hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
306             hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
307             hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
308             SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
309             SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
311             LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
312             src_y += (4 * stride);
314             XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
315             ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
316                        src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
318             ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
319                        src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
321             vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
323             vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
325             vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
327             vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
329             SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
330             SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
/* rounded average of horizontal and vertical half-pel results */
332             out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
333             out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
334             out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
335             out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
337             SAT_SH4_SH(out0, out1, out2, out3, 7);
338             tmp0 = PCKEV_XORI128_UB(out0, out1);
339             tmp1 = PCKEV_XORI128_UB(out2, out3);
340             ST8x4_UB(tmp0, tmp1, dst, stride);
/* 4x4 luma quarter-pel "hv" avg: like avc_luma_hv_qrt_4x4_msa, but the
 * packed result is additionally averaged with the existing dst pixels
 * (loaded via LW4/INSERT_W4_UB, combined with __msa_aver_u_b) before the
 * final 4x4 store. */
356 static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
357                                                  const uint8_t *src_y,
361     uint32_t tp0, tp1, tp2, tp3;
362     const int16_t filt_const0 = 0xfb01;
363     const int16_t filt_const1 = 0x1414;
364     const int16_t filt_const2 = 0x1fb;
365     v16u8 res, dst0 = { 0 };
366     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt7, src_vt8;
367     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
368     v16i8 src_vt10_r, src_vt32_r, src_vt54_r, src_vt76_r;
369     v16i8 mask0, mask1, mask2, filt0, filt1, filt2;
370     v8i16 hz_out0, hz_out1, vt_out0, vt_out1, res0, res1;
372     filt0 = (v16i8) __msa_fill_h(filt_const0);
373     filt1 = (v16i8) __msa_fill_h(filt_const1);
374     filt2 = (v16i8) __msa_fill_h(filt_const2);
376     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
378     LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
379     src_y += (5 * stride);
/* pack two consecutive 4-pixel rows per vector for the vertical filter */
381     src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
382     src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
383     src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
384     src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
386     XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
/* horizontal half-pel filtering of 4 rows from src_x */
388     LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
389     XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
390     hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz1, mask0, mask1, mask2);
391     hz_out1 = AVC_HORZ_FILTER_SH(src_hz2, src_hz3, mask0, mask1, mask2);
393     SRARI_H2_SH(hz_out0, hz_out1, 5);
394     SAT_SH2_SH(hz_out0, hz_out1, 7);
396     LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
398     src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
399     src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
400     src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
401     src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
403     XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
404     ILVR_B2_SB(src_vt1, src_vt0, src_vt3, src_vt2, src_vt10_r, src_vt32_r);
405     ILVR_B2_SB(src_vt5, src_vt4, src_vt7, src_vt6, src_vt54_r, src_vt76_r);
406     vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
408     vt_out1 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
410     SRARI_H2_SH(vt_out0, vt_out1, 5);
411     SAT_SH2_SH(vt_out0, vt_out1, 7);
/* gather the four existing 4-byte dst rows for the final average */
412     LW4(dst, stride, tp0, tp1, tp2, tp3);
413     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
415     res1 = __msa_srari_h((hz_out1 + vt_out1), 1);
416     res0 = __msa_srari_h((hz_out0 + vt_out0), 1);
418     SAT_SH2_SH(res0, res1, 7);
419     res = PCKEV_XORI128_UB(res0, res1);
/* average the filtered result with the current dst contents */
420     dst0 = __msa_aver_u_b(res, dst0);
422     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* 8x8 luma quarter-pel "hv" avg: like avc_luma_hv_qrt_8x8_msa (two 8x4
 * halves), but each half's packed result is averaged with the existing
 * dst rows (LD4/INSERT_D2_UB + AVER_UB2_UB) before storing. */
425 static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
426                                                  const uint8_t *src_y,
430     const int16_t filt_const0 = 0xfb01;
431     const int16_t filt_const1 = 0x1414;
432     const int16_t filt_const2 = 0x1fb;
433     uint64_t tp0, tp1, tp2, tp3;
434     v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
435     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, src_vt0, src_vt1, src_vt2;
436     v16i8 src_vt3, src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
437     v16i8 src_vt9, src_vt10, src_vt11, src_vt12, mask0, mask1, mask2;
438     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
439     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, src_vt98_r, src_vt109_r;
440     v16i8 src_vt1110_r, src_vt1211_r, filt0, filt1, filt2;
441     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
442     v8i16 vt_out3, tmp0, tmp1, tmp2, tmp3;
444     filt0 = (v16i8) __msa_fill_h(filt_const0);
445     filt1 = (v16i8) __msa_fill_h(filt_const1);
446     filt2 = (v16i8) __msa_fill_h(filt_const2);
448     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
449     LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
450     src_y += (5 * stride);
452     XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
/* --- first 8x4 half --- */
454     LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
455     XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
456     src_x += (4 * stride);
458     hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
459     hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
460     hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
461     hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
463     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
464     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
466     LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
467     src_y += (4 * stride);
468     XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
470     ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2, src_vt4,
471                src_vt3, src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r);
472     ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6, src_vt8,
473                src_vt7, src_vt54_r, src_vt65_r, src_vt76_r, src_vt87_r);
474     vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0, filt1,
476     vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0, filt1,
478     vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0, filt1,
480     vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0, filt1,
482     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
483     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
/* rounded average of horizontal and vertical half-pel results */
485     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
486     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
487     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
488     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
/* preload next horizontal rows, and the current dst rows for averaging */
490     LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
491     XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
493     LD4(dst, stride, tp0, tp1, tp2, tp3);
494     INSERT_D2_UB(tp0, tp1, dst0);
495     INSERT_D2_UB(tp2, tp3, dst1);
497     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
498     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
499     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
500     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
501     ST8x4_UB(dst0, dst1, dst, stride);
/* --- second 8x4 half, reusing the last interleaved vertical rows --- */
504     LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
505     XORI_B4_128_SB(src_vt9, src_vt10, src_vt11, src_vt12);
507     hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
508     hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
509     hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
510     hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
512     SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
513     SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
515     ILVR_B4_SB(src_vt9, src_vt8, src_vt10, src_vt9, src_vt11, src_vt10,
516                src_vt12, src_vt11, src_vt98_r, src_vt109_r, src_vt1110_r,
518     vt_out0 = AVC_DOT_SH3_SH(src_vt54_r, src_vt76_r, src_vt98_r, filt0, filt1,
520     vt_out1 = AVC_DOT_SH3_SH(src_vt65_r, src_vt87_r, src_vt109_r, filt0, filt1,
522     vt_out2 = AVC_DOT_SH3_SH(src_vt76_r, src_vt98_r, src_vt1110_r, filt0, filt1,
524     vt_out3 = AVC_DOT_SH3_SH(src_vt87_r, src_vt109_r, src_vt1211_r, filt0,
526     SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
527     SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
529     tmp0 = __msa_srari_h((hz_out0 + vt_out0), 1);
530     tmp1 = __msa_srari_h((hz_out1 + vt_out1), 1);
531     tmp2 = __msa_srari_h((hz_out2 + vt_out2), 1);
532     tmp3 = __msa_srari_h((hz_out3 + vt_out3), 1);
534     LD4(dst, stride, tp0, tp1, tp2, tp3);
535     INSERT_D2_UB(tp0, tp1, dst0);
536     INSERT_D2_UB(tp2, tp3, dst1);
538     SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
539     out0 = PCKEV_XORI128_UB(tmp0, tmp1);
540     out1 = PCKEV_XORI128_UB(tmp2, tmp3);
541     AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
542     ST8x4_UB(dst0, dst1, dst, stride);
/* 16x16 luma quarter-pel "hv" avg: same two-column / four-strip layout as
 * avc_luma_hv_qrt_16x16_msa, but each packed 8x4 strip is averaged with
 * the existing dst rows (LD4/INSERT_D2_UB + AVER_UB2_UB) before storing. */
546 static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
547                                                    const uint8_t *src_y,
551     const int16_t filt_const0 = 0xfb01;
552     const int16_t filt_const1 = 0x1414;
553     const int16_t filt_const2 = 0x1fb;
/* saved base pointers so the second 8-wide column can restart from them */
554     const uint8_t *src_x_tmp = src_x;
555     const uint8_t *src_y_tmp = src_y;
556     uint8_t *dst_tmp = dst;
557     uint32_t multiple8_cnt, loop_cnt;
558     uint64_t tp0, tp1, tp2, tp3;
559     v16u8 tmp0, tmp1, dst0 = { 0 }, dst1 = { 0 };
560     v16i8 src_hz0, src_hz1, src_hz2, src_hz3, mask0, mask1, mask2;
561     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4, src_vt5, src_vt6;
562     v16i8 src_vt7, src_vt8;
563     v16i8 src_vt10_r, src_vt21_r, src_vt32_r, src_vt43_r, src_vt54_r;
564     v16i8 src_vt65_r, src_vt76_r, src_vt87_r, filt0, filt1, filt2;
565     v8i16 hz_out0, hz_out1, hz_out2, hz_out3, vt_out0, vt_out1, vt_out2;
566     v8i16 vt_out3, out0, out1, out2, out3;
568     filt0 = (v16i8) __msa_fill_h(filt_const0);
569     filt1 = (v16i8) __msa_fill_h(filt_const1);
570     filt2 = (v16i8) __msa_fill_h(filt_const2);
572     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
574     for (multiple8_cnt = 2; multiple8_cnt--;) {
/* prime the vertical filter pipeline with the first 5 rows */
579         LD_SB5(src_y, stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
580         src_y += (5 * stride);
582         XORI_B5_128_SB(src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
584         for (loop_cnt = 4; loop_cnt--;) {
585             LD_SB4(src_x, stride, src_hz0, src_hz1, src_hz2, src_hz3);
586             XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
587             src_x += (4 * stride);
589             hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
590             hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
591             hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
592             hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
593             SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
594             SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
596             LD_SB4(src_y, stride, src_vt5, src_vt6, src_vt7, src_vt8);
597             src_y += (4 * stride);
599             XORI_B4_128_SB(src_vt5, src_vt6, src_vt7, src_vt8);
600             ILVR_B4_SB(src_vt1, src_vt0, src_vt2, src_vt1, src_vt3, src_vt2,
601                        src_vt4, src_vt3, src_vt10_r, src_vt21_r, src_vt32_r,
603             ILVR_B4_SB(src_vt5, src_vt4, src_vt6, src_vt5, src_vt7, src_vt6,
604                        src_vt8, src_vt7, src_vt54_r, src_vt65_r, src_vt76_r,
606             vt_out0 = AVC_DOT_SH3_SH(src_vt10_r, src_vt32_r, src_vt54_r, filt0,
608             vt_out1 = AVC_DOT_SH3_SH(src_vt21_r, src_vt43_r, src_vt65_r, filt0,
610             vt_out2 = AVC_DOT_SH3_SH(src_vt32_r, src_vt54_r, src_vt76_r, filt0,
612             vt_out3 = AVC_DOT_SH3_SH(src_vt43_r, src_vt65_r, src_vt87_r, filt0,
614             SRARI_H4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 5);
615             SAT_SH4_SH(vt_out0, vt_out1, vt_out2, vt_out3, 7);
/* rounded average of horizontal and vertical half-pel results */
617             out0 = __msa_srari_h((hz_out0 + vt_out0), 1);
618             out1 = __msa_srari_h((hz_out1 + vt_out1), 1);
619             out2 = __msa_srari_h((hz_out2 + vt_out2), 1);
620             out3 = __msa_srari_h((hz_out3 + vt_out3), 1);
/* load current dst rows and blend them in before storing */
622             LD4(dst, stride, tp0, tp1, tp2, tp3);
623             INSERT_D2_UB(tp0, tp1, dst0);
624             INSERT_D2_UB(tp2, tp3, dst1);
626             SAT_SH4_SH(out0, out1, out2, out3, 7);
627             tmp0 = PCKEV_XORI128_UB(out0, out1);
628             tmp1 = PCKEV_XORI128_UB(out2, out3);
629             AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
630             ST8x4_UB(dst0, dst1, dst, stride);
/* qpel16 mc00 (full-pel) put: straight 16x16 copy — loads 16 rows of 16
 * bytes from src and stores them to dst in two batches of 8 rows. */
646 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
649     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
650     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
652     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
654     LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
656     ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
658     ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
/* qpel8 mc00 (full-pel) put: straight 8x8 copy using 64-bit scalar
 * loads/stores, two batches of 4 rows. */
661 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
664     uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
666     LD4(src, stride, src0, src1, src2, src3);
668     LD4(src, stride, src4, src5, src6, src7);
669     SD4(src0, src1, src2, src3, dst, stride);
671     SD4(src4, src5, src6, src7, dst, stride);
/* qpel16 mc00 (full-pel) avg: loads 16x16 from src and dst, averages the
 * two (unsigned byte rounding average via AVER_UB4_UB) and stores back to
 * dst, processed as two batches of 8 rows. */
674 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
677     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
678     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
680     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
682     LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
684     AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
686     AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
688     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
/* second batch of 8 rows */
691     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
692     LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
694     AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
696     AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
698     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
/* qpel8 mc00 (full-pel) avg: gathers 8x8 from src and dst via 64-bit
 * scalar loads into vectors, averages them and stores 8x8 back to dst. */
701 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
704     uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
705     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
706     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
/* pack the 8 source rows, two per vector */
708     LD4(src, stride, tp0, tp1, tp2, tp3);
710     LD4(src, stride, tp4, tp5, tp6, tp7);
711     INSERT_D2_UB(tp0, tp1, src0);
712     INSERT_D2_UB(tp2, tp3, src1);
713     INSERT_D2_UB(tp4, tp5, src2);
714     INSERT_D2_UB(tp6, tp7, src3);
/* pack the 8 destination rows the same way */
716     LD4(dst, stride, tp0, tp1, tp2, tp3);
717     LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
718     INSERT_D2_UB(tp0, tp1, dst0);
719     INSERT_D2_UB(tp2, tp3, dst1);
720     INSERT_D2_UB(tp4, tp5, dst2);
721     INSERT_D2_UB(tp6, tp7, dst3);
723     AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
726     ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* qpel4 mc00 (full-pel) avg: gathers 4x4 from src and dst via 32-bit
 * scalar loads into one vector each, averages them, stores 4x4 to dst. */
729 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
732     uint32_t tp0, tp1, tp2, tp3;
733     v16u8 src0 = { 0 }, dst0 = { 0 };
735     LW4(src, stride, tp0, tp1, tp2, tp3);
736     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
737     LW4(dst, stride, tp0, tp1, tp2, tp3);
738     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
740     dst0 = __msa_aver_u_b(src0, dst0);
742     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* qpel16 mc10 put: horizontal quarter-pel. Runs the 6-tap horizontal
 * half-pel filter over 16x16 (4 rows per iteration, two 16-byte vectors
 * per row), then averages the filtered rows with the source shifted by
 * 2 bytes (SLDI ... 2 — the left-of-center full-pel sample for mc10). */
745 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
749     v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
750     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
751     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
752     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
753     v16i8 minus5b = __msa_ldi_b(-5);
754     v16i8 plus20b = __msa_ldi_b(20);
756     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
762     for (loop_cnt = 4; loop_cnt--;) {
/* two 16-byte vectors per row cover the 16+5 tap footprint */
763         LD_SB2(src, 16, src0, src1);
765         LD_SB2(src, 16, src2, src3);
767         LD_SB2(src, 16, src4, src5);
769         LD_SB2(src, 16, src6, src7);
/* 6-tap horizontal filter: +1 outer, -5 next, +20 inner taps */
772         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
773         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
774         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
775         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
776         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
777         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
778         VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
779         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
780         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
781                      minus5b, res0, res1, res2, res3);
782         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
783                      plus20b, res0, res1, res2, res3);
784         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
785         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
786         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
787         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
788         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
789         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
790         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
791         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
792                      minus5b, res4, res5, res6, res7);
793         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
794                      plus20b, res4, res5, res6, res7);
/* shift source by 2 to align the full-pel sample used for the average */
795         SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
796         SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
797         SRARI_H4_SH(res0, res1, res2, res3, 5);
798         SRARI_H4_SH(res4, res5, res6, res7, 5);
799         SAT_SH4_SH(res0, res1, res2, res3, 7);
800         SAT_SH4_SH(res4, res5, res6, res7, 7);
801         PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
802         PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
803         dst0 = __msa_aver_s_b(dst0, src0);
804         dst1 = __msa_aver_s_b(dst1, src2);
805         dst2 = __msa_aver_s_b(dst2, src4);
806         dst3 = __msa_aver_s_b(dst3, src6);
807         XORI_B4_128_SB(dst0, dst1, dst2, dst3);
808         ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
/* qpel16 mc30 put: horizontal quarter-pel, mirror position of mc10.
 * Identical 6-tap horizontal half-pel filter; the only difference is the
 * source shift of 3 bytes (SLDI ... 3) so the average uses the full-pel
 * sample on the other side of the half-pel position. */
813 void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
817     v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
818     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
819     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
820     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
821     v16i8 minus5b = __msa_ldi_b(-5);
822     v16i8 plus20b = __msa_ldi_b(20);
824     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
830     for (loop_cnt = 4; loop_cnt--;) {
831         LD_SB2(src, 16, src0, src1);
833         LD_SB2(src, 16, src2, src3);
835         LD_SB2(src, 16, src4, src5);
837         LD_SB2(src, 16, src6, src7);
/* 6-tap horizontal filter: +1 outer, -5 next, +20 inner taps */
840         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
841         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
842         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
843         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
844         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
845         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
846         VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
847         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
848         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
849                      minus5b, res0, res1, res2, res3);
850         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
851                      plus20b, res0, res1, res2, res3);
852         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
853         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
854         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
855         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
856         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
857         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
858         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
859         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
860                      minus5b, res4, res5, res6, res7);
861         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
862                      plus20b, res4, res5, res6, res7);
/* shift source by 3 (vs 2 for mc10) to pick the other full-pel sample */
863         SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
864         SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
865         SRARI_H4_SH(res0, res1, res2, res3, 5);
866         SRARI_H4_SH(res4, res5, res6, res7, 5);
867         SAT_SH4_SH(res0, res1, res2, res3, 7);
868         SAT_SH4_SH(res4, res5, res6, res7, 7);
869         PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
870         PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
871         dst0 = __msa_aver_s_b(dst0, src0);
872         dst1 = __msa_aver_s_b(dst1, src2);
873         dst2 = __msa_aver_s_b(dst2, src4);
874         dst3 = __msa_aver_s_b(dst3, src6);
875         XORI_B4_128_SB(dst0, dst1, dst2, dst3);
876         ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
/* H.264 luma quarter-pel MC, position (1/4, 0), 8x8 block.
 * Applies the 6-tap horizontal filter (taps 1,-5,20,20,-5,1: unit taps via
 * HADD, -5 via minus5b, 20 via plus20b), rounds with (sum + 16) >> 5
 * (SRARI ..., 5), saturates, then averages the filtered bytes with the
 * unfiltered source shifted by 2 bytes (the integer-pel sample for the
 * left quarter position).  Source bytes are XORed with 128 up front so
 * signed-byte dot products can be used; the bias is removed before store. */
881 void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
884 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
885 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
886 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
887 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
888 v16i8 minus5b = __msa_ldi_b(-5);
889 v16i8 plus20b = __msa_ldi_b(20);
/* mask0/1/2 pair up taps (0,5), (1,4), (2,3) of the 6-tap window. */
891 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
892 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
893 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0-3: gather tap pairs and accumulate (t0+t5) - 5*(t1+t4) + 20*(t2+t3). */
894 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
895 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
896 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
897 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
898 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
899 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
900 res0, res1, res2, res3);
901 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
902 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
903 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
904 res0, res1, res2, res3);
/* Rows 4-7: same filtering into res4..res7. */
905 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
906 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
907 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
908 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
909 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
910 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
911 res4, res5, res6, res7);
912 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
913 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
914 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
915 res4, res5, res6, res7);
/* Shift each source row left by 2 bytes so the integer-pel sample used for
 * the quarter-pel average lines up; pack two 8-byte rows per register. */
916 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
917 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
918 SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
919 SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
920 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
921 PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
/* Round by 5 bits, saturate to 8 significant bits, pack, average, un-bias. */
922 SRARI_H4_SH(res0, res1, res2, res3, 5);
923 SRARI_H4_SH(res4, res5, res6, res7, 5);
924 SAT_SH4_SH(res0, res1, res2, res3, 7);
925 SAT_SH4_SH(res4, res5, res6, res7, 7);
926 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
927 PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
928 tmp0 = __msa_aver_s_b(tmp0, src0);
929 tmp1 = __msa_aver_s_b(tmp1, src1);
930 tmp2 = __msa_aver_s_b(tmp2, src4);
931 tmp3 = __msa_aver_s_b(tmp3, src5);
932 XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
933 ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
/* H.264 luma quarter-pel MC, position (3/4, 0), 8x8 block.
 * Identical to the mc10 path except the final average uses the unfiltered
 * source shifted by 3 bytes (SLDI ..., 3): the integer-pel sample on the
 * right-hand side of the half-pel position. */
936 void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
939 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
940 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
941 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
942 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
943 v16i8 minus5b = __msa_ldi_b(-5);
944 v16i8 plus20b = __msa_ldi_b(20);
946 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
947 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
948 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* 6-tap horizontal filter, rows 0-3 into res0..res3. */
949 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
950 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
951 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
952 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
953 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
954 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
955 res0, res1, res2, res3);
956 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
957 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
958 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
959 res0, res1, res2, res3);
/* Rows 4-7 into res4..res7. */
960 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
961 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
962 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
963 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
964 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
965 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
966 res4, res5, res6, res7);
967 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
968 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
969 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
970 res4, res5, res6, res7);
/* Byte offset 3 selects the right-hand integer-pel sample (mc30). */
971 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
972 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
973 SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
974 SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
975 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
976 PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
977 SRARI_H4_SH(res0, res1, res2, res3, 5);
978 SRARI_H4_SH(res4, res5, res6, res7, 5);
979 SAT_SH4_SH(res0, res1, res2, res3, 7);
980 SAT_SH4_SH(res4, res5, res6, res7, 7);
981 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
982 PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
983 tmp0 = __msa_aver_s_b(tmp0, src0);
984 tmp1 = __msa_aver_s_b(tmp1, src1);
985 tmp2 = __msa_aver_s_b(tmp2, src4);
986 tmp3 = __msa_aver_s_b(tmp3, src5);
987 XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
988 ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
/* H.264 luma quarter-pel MC, position (1/4, 0), 4x4 block.
 * Uses the two-rows-per-vector masks at luma_mask_arr[48] so two 4-pixel
 * rows are filtered per register.  Filter/round/saturate as in the 8x8
 * path, then average with the 2-byte-shifted (integer-pel) source. */
991 void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
994 v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
995 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
997 v16i8 minus5b = __msa_ldi_b(-5);
998 v16i8 plus20b = __msa_ldi_b(20);
1000 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1001 LD_SB4(src - 2, stride, src0, src1, src2, src3);
1002 XORI_B4_128_SB(src0, src1, src2, src3);
1003 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1004 HADD_SB2_SH(vec0, vec1, res0, res1);
1005 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1006 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1007 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1008 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1009 SRARI_H2_SH(res0, res1, 5);
1010 SAT_SH2_SH(res0, res1, 7);
1011 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
/* Collect the four 4-byte integer-pel rows (offset +2) into one vector. */
1012 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
1013 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
1014 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1015 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1016 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1017 res = __msa_aver_s_b(res, src0);
1018 res = (v16i8) __msa_xori_b((v16u8) res, 128);
1019 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* H.264 luma quarter-pel MC, position (3/4, 0), 4x4 block.
 * Same as mc10 for 4x4 except the average uses the source shifted by
 * 3 bytes (the right-hand integer-pel sample). */
1022 void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
1025 v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
1026 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1028 v16i8 minus5b = __msa_ldi_b(-5);
1029 v16i8 plus20b = __msa_ldi_b(20);
1031 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1032 LD_SB4(src - 2, stride, src0, src1, src2, src3);
1033 XORI_B4_128_SB(src0, src1, src2, src3);
1034 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1035 HADD_SB2_SH(vec0, vec1, res0, res1);
1036 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1037 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1038 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1039 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1040 SRARI_H2_SH(res0, res1, 5);
1041 SAT_SH2_SH(res0, res1, 7);
1042 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
/* Byte offset 3 selects the right-hand integer-pel sample (mc30). */
1043 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
1044 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
1045 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1046 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1047 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
1048 res = __msa_aver_s_b(res, src0);
1049 res = (v16i8) __msa_xori_b((v16u8) res, 128);
1050 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* H.264 luma half-pel MC, position (1/2, 0), 16x16 block.
 * Pure horizontal 6-tap filter with rounding (sum + 16) >> 5 and
 * saturation; no quarter-pel averaging.  Processes 4 rows of 16 pixels
 * per loop iteration (two 8-pixel halves per row, loaded 8 bytes apart). */
1053 void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
1057 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1058 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1060 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1061 v16i8 minus5b = __msa_ldi_b(-5);
1062 v16i8 plus20b = __msa_ldi_b(20);
1064 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* 4 iterations x 4 rows = 16 output rows. */
1067 for (loop_cnt = 4; loop_cnt--;) {
1068 LD_SB2(src, 8, src0, src1);
1070 LD_SB2(src, 8, src2, src3);
1072 LD_SB2(src, 8, src4, src5);
1074 LD_SB2(src, 8, src6, src7);
1077 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* First two rows (left/right halves) into res0..res3. */
1078 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1079 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1080 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1081 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1082 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1083 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1084 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1085 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1086 minus5b, res0, res1, res2, res3);
1087 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1088 plus20b, res0, res1, res2, res3);
/* Last two rows into res4..res7. */
1089 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1090 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1091 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1092 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1093 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1094 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1095 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1096 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1097 minus5b, res4, res5, res6, res7);
1098 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1099 plus20b, res4, res5, res6, res7);
1100 SRARI_H4_SH(res0, res1, res2, res3, 5);
1101 SRARI_H4_SH(res4, res5, res6, res7, 5);
1102 SAT_SH4_SH(res0, res1, res2, res3, 7);
1103 SAT_SH4_SH(res4, res5, res6, res7, 7);
1104 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
1106 XORI_B4_128_SB(vec0, vec1, vec2, vec3);
1107 ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
1108 dst += (4 * stride);
/* H.264 luma half-pel MC, position (1/2, 0), 8x8 block.
 * Horizontal 6-tap filter only; PCKEV_XORI128_UB both packs the rounded
 * 16-bit results to bytes and removes the XOR-128 bias in one step. */
1112 void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
1115 v16u8 out0, out1, out2, out3;
1116 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1117 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1119 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1120 v16i8 minus5b = __msa_ldi_b(-5);
1121 v16i8 plus20b = __msa_ldi_b(20);
1123 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1124 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1125 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0-3. */
1126 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1127 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1128 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1129 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1130 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1131 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1132 res0, res1, res2, res3);
1133 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1134 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1135 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1136 plus20b, res0, res1, res2, res3);
/* Rows 4-7. */
1137 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1138 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1139 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1140 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1141 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1142 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1143 res4, res5, res6, res7);
1144 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1145 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1146 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1147 plus20b, res4, res5, res6, res7);
1148 SRARI_H4_SH(res0, res1, res2, res3, 5);
1149 SRARI_H4_SH(res4, res5, res6, res7, 5);
1150 SAT_SH4_SH(res0, res1, res2, res3, 7);
1151 SAT_SH4_SH(res4, res5, res6, res7, 7);
1152 out0 = PCKEV_XORI128_UB(res0, res1);
1153 out1 = PCKEV_XORI128_UB(res2, res3);
1154 out2 = PCKEV_XORI128_UB(res4, res5);
1155 out3 = PCKEV_XORI128_UB(res6, res7);
1156 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* H.264 luma half-pel MC, position (1/2, 0), 4x4 block.
 * Horizontal 6-tap filter only, two rows per vector via the masks at
 * luma_mask_arr[48]; no quarter-pel averaging. */
1159 void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
1163 v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
1164 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1166 v16i8 minus5b = __msa_ldi_b(-5);
1167 v16i8 plus20b = __msa_ldi_b(20);
1169 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1170 LD_SB4(src - 2, stride, src0, src1, src2, src3);
1171 XORI_B4_128_SB(src0, src1, src2, src3);
1172 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1173 HADD_SB2_SH(vec0, vec1, res0, res1);
1174 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1175 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1176 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1177 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1178 SRARI_H2_SH(res0, res1, 5);
1179 SAT_SH2_SH(res0, res1, 7);
1180 out = PCKEV_XORI128_UB(res0, res1);
1181 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* H.264 luma quarter-pel MC, position (0, 1/4), 16x16 block.
 * Vertical 6-tap filter; filt_const0/1/2 pack tap pairs (1,-5), (20,20)
 * and (-5,1) as halfwords for dot-product use.  After rounding/saturation
 * the result is averaged with the integer-pel rows src2..src5 (the rows
 * nearer this quarter position) and un-biased before the store. */
1184 void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
1188 int16_t filt_const0 = 0xfb01;
1189 int16_t filt_const1 = 0x1414;
1190 int16_t filt_const2 = 0x1fb;
1191 v16u8 res0, res1, res2, res3;
1192 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1193 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1194 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1195 v16i8 src65_l, src87_l, filt0, filt1, filt2;
1196 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1198 filt0 = (v16i8) __msa_fill_h(filt_const0);
1199 filt1 = (v16i8) __msa_fill_h(filt_const1);
1200 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Back up two rows: the 6-tap window needs rows -2..+3. */
1202 src -= (stride * 2);
1204 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1205 src += (5 * stride);
1207 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Interleave vertically adjacent rows (right/left halves) for dot products. */
1208 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1210 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* 4 iterations x 4 rows = 16 output rows. */
1213 for (loop_cnt = 4; loop_cnt--;) {
1214 LD_SB4(src, stride, src5, src6, src7, src8);
1215 src += (4 * stride);
1217 XORI_B4_128_SB(src5, src6, src7, src8);
1218 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1219 src65_r, src76_r, src87_r);
1220 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1221 src65_l, src76_l, src87_l);
1222 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1223 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1224 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1225 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1226 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1227 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1228 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1229 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1230 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1231 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1232 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1233 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1234 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1235 out3_r, res0, res1, res2, res3);
/* Quarter-pel: average with the upper integer-pel row of each output. */
1236 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
1237 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
1238 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
1239 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
1240 XORI_B4_128_UB(res0, res1, res2, res3);
1241 ST_UB4(res0, res1, res2, res3, dst, stride);
1242 dst += (4 * stride);
/* H.264 luma quarter-pel MC, position (0, 3/4), 16x16 block.
 * Identical to the mc01 path except the final average uses the lower
 * integer-pel rows src3..src6 (nearer this quarter position). */
1258 void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
1262 int16_t filt_const0 = 0xfb01;
1263 int16_t filt_const1 = 0x1414;
1264 int16_t filt_const2 = 0x1fb;
1265 v16u8 res0, res1, res2, res3;
1266 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1267 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1268 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1269 v16i8 src65_l, src87_l, filt0, filt1, filt2;
1270 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1272 filt0 = (v16i8) __msa_fill_h(filt_const0);
1273 filt1 = (v16i8) __msa_fill_h(filt_const1);
1274 filt2 = (v16i8) __msa_fill_h(filt_const2);
1276 src -= (stride * 2);
1278 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1279 src += (5 * stride);
1281 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1282 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1284 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
1287 for (loop_cnt = 4; loop_cnt--;) {
1288 LD_SB4(src, stride, src5, src6, src7, src8);
1289 src += (4 * stride);
1291 XORI_B4_128_SB(src5, src6, src7, src8);
1292 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
1293 src65_r, src76_r, src87_r);
1294 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
1295 src65_l, src76_l, src87_l);
1296 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1297 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1298 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1299 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1300 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1301 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1302 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1303 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1304 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1305 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1306 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1307 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1308 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1309 out3_r, res0, res1, res2, res3);
/* Quarter-pel: average with the lower integer-pel row of each output. */
1310 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
1311 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
1312 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
1313 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
1314 XORI_B4_128_UB(res0, res1, res2, res3);
1315 ST_UB4(res0, res1, res2, res3, dst, stride);
1316 dst += (4 * stride);
/* H.264 luma quarter-pel MC, position (0, 1/4), 8x8 block.
 * Fully unrolled vertical 6-tap filter over rows -2..+9; the averaging
 * operands tmp0..tmp3 pack integer-pel row pairs (2,3), (4,5), (6,7),
 * (8,9) - the rows nearer this quarter position. */
1334 const int16_t filt_const0 = 0xfb01;
1335 const int16_t filt_const1 = 0x1414;
1336 const int16_t filt_const2 = 0x1fb;
1337 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1338 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1339 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1340 v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
1341 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1343 filt0 = (v16i8) __msa_fill_h(filt_const0);
1344 filt1 = (v16i8) __msa_fill_h(filt_const1);
1345 filt2 = (v16i8) __msa_fill_h(filt_const2);
1347 src -= (stride * 2);
1349 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1350 src += (5 * stride);
1351 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1352 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1353 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1354 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1356 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1358 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1359 src109_r, src1110_r, src1211_r);
1360 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1361 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1362 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1363 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1364 out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1365 out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1366 out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1367 out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
/* Pack integer-pel rows (2,3) (4,5) (6,7) (8,9) for the mc01 average. */
1368 PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
1369 PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
1370 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1371 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1372 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1373 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1374 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1375 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1376 out0 = __msa_aver_s_b(out0, tmp0);
1377 out1 = __msa_aver_s_b(out1, tmp1);
1378 out2 = __msa_aver_s_b(out2, tmp2);
1379 out3 = __msa_aver_s_b(out3, tmp3);
1380 XORI_B4_128_SB(out0, out1, out2, out3);
1381 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* H.264 luma quarter-pel MC, position (0, 3/4), 8x8 block.
 * Same as the 8x8 mc01 path, but the average uses integer-pel row pairs
 * (3,4), (5,6), (7,8), (9,10) - one row lower, matching this quarter. */
1384 void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
1387 const int16_t filt_const0 = 0xfb01;
1388 const int16_t filt_const1 = 0x1414;
1389 const int16_t filt_const2 = 0x1fb;
1390 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1391 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
1392 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
1393 v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
1394 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
1396 filt0 = (v16i8) __msa_fill_h(filt_const0);
1397 filt1 = (v16i8) __msa_fill_h(filt_const1);
1398 filt2 = (v16i8) __msa_fill_h(filt_const2);
1400 src -= (stride * 2);
1402 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1403 src += (5 * stride);
1404 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
1405 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1406 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
1407 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1409 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1411 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
1412 src109_r, src1110_r, src1211_r);
1413 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1414 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1415 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1416 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1417 out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
1418 out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
1419 out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
1420 out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
/* Pack integer-pel rows (3,4) (5,6) (7,8) (9,10) for the mc03 average. */
1421 PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
1422 PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
1423 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1424 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
1425 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1426 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
1427 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
1428 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
1429 out0 = __msa_aver_s_b(out0, tmp0);
1430 out1 = __msa_aver_s_b(out1, tmp1);
1431 out2 = __msa_aver_s_b(out2, tmp2);
1432 out3 = __msa_aver_s_b(out3, tmp3);
1433 XORI_B4_128_SB(out0, out1, out2, out3);
1434 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* H.264 luma quarter-pel MC, position (0, 1/4), 4x4 block.
 * Vertical 6-tap filter with two 4-pixel rows folded into each vector
 * (ILVR_D2 of row-pair interleaves); result averaged with integer-pel
 * rows src2..src5 assembled via insve, using unsigned average since the
 * reference rows were never XOR-biased here. */
1437 void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
1440 int16_t filt_const0 = 0xfb01;
1441 int16_t filt_const1 = 0x1414;
1442 int16_t filt_const2 = 0x1fb;
1444 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1445 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1446 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1449 filt0 = (v16i8) __msa_fill_h(filt_const0);
1450 filt1 = (v16i8) __msa_fill_h(filt_const1);
1451 filt2 = (v16i8) __msa_fill_h(filt_const2);
1453 src -= (stride * 2);
1455 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1456 src += (5 * stride);
1457 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1459 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1460 XORI_B2_128_SB(src2110, src4332);
1461 LD_SB4(src, stride, src5, src6, src7, src8);
1462 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1464 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1465 XORI_B2_128_SB(src6554, src8776);
1466 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1467 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1468 SRARI_H2_SH(out10, out32, 5);
1469 SAT_SH2_SH(out10, out32, 7);
1470 out = PCKEV_XORI128_UB(out10, out32);
/* Gather unbiased integer-pel rows 2..5 into one vector for the average. */
1471 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1472 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1473 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1474 out = __msa_aver_u_b(out, (v16u8) src32_r);
1475 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* H.264 luma quarter-pel MC, position (0, 3/4), 4x4 block.
 * Same as the 4x4 mc01 path, but averaged with integer-pel rows
 * src3..src6 (one row lower, matching this quarter position). */
1478 void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
1481 int16_t filt_const0 = 0xfb01;
1482 int16_t filt_const1 = 0x1414;
1483 int16_t filt_const2 = 0x1fb;
1485 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1486 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1487 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
1490 filt0 = (v16i8) __msa_fill_h(filt_const0);
1491 filt1 = (v16i8) __msa_fill_h(filt_const1);
1492 filt2 = (v16i8) __msa_fill_h(filt_const2);
1494 src -= (stride * 2);
1496 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1497 src += (5 * stride);
1498 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
1500 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1501 XORI_B2_128_SB(src2110, src4332);
1502 LD_SB4(src, stride, src5, src6, src7, src8);
1503 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
1505 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1506 XORI_B2_128_SB(src6554, src8776);
1507 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1508 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1509 SRARI_H2_SH(out10, out32, 5);
1510 SAT_SH2_SH(out10, out32, 7);
1511 out = PCKEV_XORI128_UB(out10, out32);
/* Gather unbiased integer-pel rows 3..6 into one vector for the average. */
1512 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1513 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1514 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1515 out = __msa_aver_u_b(out, (v16u8) src32_r);
1516 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* Diagonal quarter-pel positions (1/4,1/4) (3/4,1/4) (1/4,3/4) (3/4,3/4),
 * 16x16: all delegate to the shared HV quarter-pel helper (defined earlier
 * in this file).  First pointer selects the horizontal filter source row
 * (+stride for the lower quarters), second the vertical filter source
 * column (+1 for the right quarters). */
1519 void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
1522 avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2), dst, stride);
1525 void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
1528 avc_luma_hv_qrt_16x16_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1531 void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
1534 avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2), dst,
1538 void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
1541 avc_luma_hv_qrt_16x16_msa(src + stride - 2, src - (stride * 2) + 1, dst,
/* Diagonal quarter-pel positions for 8x8: same pointer scheme as the
 * 16x16 wrappers above, delegating to the 8x8 HV quarter-pel helper. */
1545 void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
1548 avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2), dst, stride);
1551 void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
1554 avc_luma_hv_qrt_8x8_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1557 void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
1560 avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2), dst, stride);
1563 void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
1566 avc_luma_hv_qrt_8x8_msa(src + stride - 2, src - (stride * 2) + 1, dst,
/* Diagonal quarter-pel positions for 4x4: same pointer scheme as the
 * 16x16 wrappers above, delegating to the 4x4 HV quarter-pel helper. */
1571 void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
1574 avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2), dst, stride);
1577 void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
1580 avc_luma_hv_qrt_4x4_msa(src - 2, src - (stride * 2) + 1, dst, stride);
1583 void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
1586 avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2), dst, stride);
1589 void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
1592 avc_luma_hv_qrt_4x4_msa(src + stride - 2, src - (stride * 2) + 1, dst,
/* H.264 luma quarter-pel MC, position (1/2, 1/4), 16x16 block.
 * Two-stage separable filtering: each source row is first horizontally
 * filtered into 16-bit intermediates (AVC_HORZ_FILTER_SH), then the
 * vertical 6-tap filter runs on those intermediates in 32-bit precision
 * (filt_const0/1/2 pack tap pairs (1,-5), (20,20), (-5,1) as words).
 * The vertically filtered result is averaged (aver_s_h) with the rounded
 * horizontal-only intermediates (srari by 5) to reach the quarter
 * position between the horizontal half-pel row and the center HV sample.
 * The 16-wide block is processed as two 8-wide passes (multiple8_cnt). */
1596 void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
1599 uint8_t *dst_tmp = dst;
1600 const uint8_t *src_tmp = src - (2 * stride) - 2;
1601 uint32_t multiple8_cnt, loop_cnt;
1602 const int32_t filt_const0 = 0xfffb0001;
1603 const int32_t filt_const1 = 0x140014;
1604 const int32_t filt_const2 = 0x1fffb;
1606 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1608 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1609 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1610 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1611 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1612 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1613 v8i16 hz_out87_l, filt0, filt1, filt2;
1616 filt0 = (v8i16) __msa_fill_w(filt_const0);
1617 filt1 = (v8i16) __msa_fill_w(filt_const1);
1618 filt2 = (v8i16) __msa_fill_w(filt_const2);
1620 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* Two 8-pixel-wide column passes cover the 16-pixel width. */
1622 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* Prime the vertical window: horizontal filter of rows -2..+2. */
1626 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1627 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1628 src += (5 * stride);
1630 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1631 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1632 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1633 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1634 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1636 for (loop_cnt = 4; loop_cnt--;) {
1637 LD_SB4(src, stride, src5, src6, src7, src8);
1638 src += (4 * stride);
1640 XORI_B4_128_SB(src5, src6, src7, src8);
1642 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1643 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1644 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1645 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* Interleave vertically adjacent intermediate rows for 32-bit dot products. */
1647 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1648 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1650 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1651 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1653 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1654 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1656 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1657 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Vertical 6-tap filter on the intermediates, one output row per dst even. */
1660 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1662 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1664 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1665 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1667 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1669 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1670 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1672 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1674 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1675 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1677 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1679 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Rounded horizontal-only intermediates for the quarter-pel average. */
1681 dst1 = __msa_srari_h(hz_out2, 5);
1682 dst3 = __msa_srari_h(hz_out3, 5);
1683 dst5 = __msa_srari_h(hz_out4, 5);
1684 dst7 = __msa_srari_h(hz_out5, 5);
1685 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1687 dst0 = __msa_aver_s_h(dst0, dst1);
1688 dst1 = __msa_aver_s_h(dst2, dst3);
1689 dst2 = __msa_aver_s_h(dst4, dst5);
1690 dst3 = __msa_aver_s_h(dst6, dst7);
1692 out0 = PCKEV_XORI128_UB(dst0, dst1);
1693 out1 = PCKEV_XORI128_UB(dst2, dst3);
1694 ST8x4_UB(out0, out1, dst, stride);
1695 dst += (4 * stride);
/* 16x16 H.264 luma "put" for the mc23 quarter-pel position (FFmpeg qpel
 * naming: half-pel horizontal, quarter-pel vertical on the lower side —
 * NOTE(review): confirm against the qpel dispatch table).
 * Pass 1: 6-tap [1 -5 20 20 -5 1] horizontal filter per row
 * (AVC_HORZ_FILTER_SH).  Pass 2: 6-tap vertical filter over the 16-bit
 * intermediates (AVC_DOT_SW3_SW), then average with the rounded (>>5)
 * horizontal half-pel samples of the lower-adjacent row.
 * The 16-wide block is processed as two 8-wide columns (multiple8_cnt). */
1709 void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
1712 uint8_t *dst_tmp = dst;
/* back up 2 rows and 2 columns so the 6-tap window is centred on (0,0) */
1713 const uint8_t *src_tmp = src - (2 * stride) - 2;
1714 uint32_t multiple8_cnt, loop_cnt;
/* vertical taps packed as 32-bit halfword pairs: {1,-5}, {20,20}, {-5,1} */
1715 const int32_t filt_const0 = 0xfffb0001;
1716 const int32_t filt_const1 = 0x140014;
1717 const int32_t filt_const2 = 0x1fffb;
1719 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
1721 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1722 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1723 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1724 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
1725 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
1726 v8i16 hz_out87_l, filt0, filt1, filt2;
1729 filt0 = (v8i16) __msa_fill_w(filt_const0);
1730 filt1 = (v8i16) __msa_fill_w(filt_const1);
1731 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* single-source horizontal shuffle masks (first half of luma_mask_arr) */
1733 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* two vertical strips of 8 columns each */
1735 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* prime the 5-row horizontal half-pel history; XORI flips to signed range */
1739 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1740 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1741 src += (5 * stride);
1743 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1744 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1745 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1746 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1747 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* 4 output rows per iteration, 16 rows total */
1749 for (loop_cnt = 4; loop_cnt--;) {
1750 LD_SB4(src, stride, src5, src6, src7, src8);
1751 src += (4 * stride);
1753 XORI_B4_128_SB(src5, src6, src7, src8);
1755 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1756 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1757 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1758 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave adjacent intermediate rows (right/left halves) so the
 * vertical dot products can consume halfword pairs */
1760 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1761 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
1763 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
1764 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
1766 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1767 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
1769 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
1770 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* vertical 6-tap pass: one dot product per output row, right then left
 * half, packed back to 8 halfwords per row */
1773 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
1775 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
1777 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1778 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
1780 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
1782 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1783 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
1785 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
1787 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1788 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
1790 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
1792 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* rounded horizontal half-pel rows 3..6 (mc23: one row below centre),
 * saturated to 8-bit signed range, then averaged for the qpel position */
1794 dst1 = __msa_srari_h(hz_out3, 5);
1795 dst3 = __msa_srari_h(hz_out4, 5);
1796 dst5 = __msa_srari_h(hz_out5, 5);
1797 dst7 = __msa_srari_h(hz_out6, 5);
1798 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1800 dst0 = __msa_aver_s_h(dst0, dst1);
1801 dst1 = __msa_aver_s_h(dst2, dst3);
1802 dst2 = __msa_aver_s_h(dst4, dst5);
1803 dst3 = __msa_aver_s_h(dst6, dst7);
/* pack to unsigned bytes (undoing the 128 bias) and store 8x4 */
1805 out0 = PCKEV_XORI128_UB(dst0, dst1);
1806 out1 = PCKEV_XORI128_UB(dst2, dst3);
1807 ST8x4_UB(out0, out1, dst, stride);
1808 dst += (4 * stride);
/* 8x8 H.264 luma "put" for the mc21 quarter-pel position (half-pel
 * horizontal, quarter-pel vertical on the upper side — NOTE(review):
 * confirm against the qpel dispatch table).  Fully unrolled: first 4
 * output rows, then 4 more.  Same two-pass structure as the 16x16
 * variants: horizontal 6-tap per row, vertical 6-tap over the 16-bit
 * intermediates, then average with the rounded horizontal half-pel row. */
1822 void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
/* vertical taps packed as 32-bit halfword pairs: {1,-5}, {20,20}, {-5,1} */
1825 const int32_t filt_const0 = 0xfffb0001;
1826 const int32_t filt_const1 = 0x140014;
1827 const int32_t filt_const2 = 0x1fffb;
1829 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1830 v16i8 src11, src12, mask0, mask1, mask2;
1831 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1832 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1833 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1834 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1835 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1836 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1837 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1838 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1841 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1843 filt0 = (v8i16) __msa_fill_w(filt_const0);
1844 filt1 = (v8i16) __msa_fill_w(filt_const1);
1845 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* back up 2 rows and 2 columns to centre the 6-tap window */
1847 src -= ((2 * stride) + 2);
/* horizontal half-pel intermediates for the first 9 source rows */
1849 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1850 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1851 src += (5 * stride);
1853 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1854 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1855 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1856 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1857 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1859 LD_SB4(src, stride, src5, src6, src7, src8);
1860 src += (4 * stride);
1861 XORI_B4_128_SB(src5, src6, src7, src8);
1863 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1864 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
1865 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
1866 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave adjacent rows for the vertical dot products */
1868 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1869 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
1870 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
1871 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
1872 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1873 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
1874 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
1875 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* vertical 6-tap pass for output rows 0..3 */
1877 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
1879 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
1881 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1882 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
1884 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
1886 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1887 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
1889 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
1891 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1892 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
1894 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
1896 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc21: average with rounded half-pel rows 2..5 (centre row itself) */
1898 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
1899 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
1901 dst0 = __msa_aver_s_h(dst0, hz_out2);
1902 dst1 = __msa_aver_s_h(dst1, hz_out3);
1903 dst2 = __msa_aver_s_h(dst2, hz_out4);
1904 dst3 = __msa_aver_s_h(dst3, hz_out5);
1906 out0 = PCKEV_XORI128_UB(dst0, dst1);
1907 out1 = PCKEV_XORI128_UB(dst2, dst3);
1908 ST8x4_UB(out0, out1, dst, stride);
1909 dst += (4 * stride);
/* second half: rows 9..12 feed output rows 4..7 */
1911 LD_SB4(src, stride, src9, src10, src11, src12);
1912 XORI_B4_128_SB(src9, src10, src11, src12);
1913 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
1914 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
1915 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
1916 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
1917 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1918 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
1920 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
1921 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
1923 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
1925 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
1927 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1928 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
1930 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
1932 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1933 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
1935 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
1937 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
1938 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
1940 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
1942 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc21 averaging partners for the second half: rows 6..9 */
1944 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
1945 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
1947 dst0 = __msa_aver_s_h(dst0, hz_out6);
1948 dst1 = __msa_aver_s_h(dst1, hz_out7);
1949 dst2 = __msa_aver_s_h(dst2, hz_out8);
1950 dst3 = __msa_aver_s_h(dst3, hz_out9);
1952 out0 = PCKEV_XORI128_UB(dst0, dst1);
1953 out1 = PCKEV_XORI128_UB(dst2, dst3);
1954 ST8x4_UB(out0, out1, dst, stride);
/* 8x8 H.264 luma "put" for the mc23 quarter-pel position.  Identical
 * structure to ff_put_h264_qpel8_mc21_msa above; the only difference is
 * the half-pel row chosen for the final average (rows 3..6 / 7..10 here
 * instead of 2..5 / 6..9 — one row lower, matching the vertical 3/4
 * offset; NOTE(review): confirm against the qpel dispatch table). */
1957 void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
/* vertical taps packed as 32-bit halfword pairs: {1,-5}, {20,20}, {-5,1} */
1960 const int32_t filt_const0 = 0xfffb0001;
1961 const int32_t filt_const1 = 0x140014;
1962 const int32_t filt_const2 = 0x1fffb;
1964 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1965 v16i8 src11, src12, mask0, mask1, mask2;
1966 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
1967 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
1968 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
1969 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
1970 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
1971 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
1972 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
1973 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
1976 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1978 filt0 = (v8i16) __msa_fill_w(filt_const0);
1979 filt1 = (v8i16) __msa_fill_w(filt_const1);
1980 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* back up 2 rows and 2 columns to centre the 6-tap window */
1982 src -= ((2 * stride) + 2);
/* horizontal half-pel intermediates for the first 9 source rows */
1984 LD_SB5(src, stride, src0, src1, src2, src3, src4);
1985 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1986 src += (5 * stride);
1988 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1989 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1990 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1991 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1992 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1994 LD_SB4(src, stride, src5, src6, src7, src8);
1995 src += (4 * stride);
1996 XORI_B4_128_SB(src5, src6, src7, src8);
1998 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
1999 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
2000 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
2001 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave adjacent rows for the vertical dot products */
2003 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2004 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2005 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2006 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
2007 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2008 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2009 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2010 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* vertical 6-tap pass for output rows 0..3 */
2012 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2014 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
2016 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2017 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2019 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
2021 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2022 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2024 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
2026 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2027 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2029 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
2031 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc23: average with rounded half-pel rows 3..6 (one below centre) */
2033 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
2034 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
2036 dst0 = __msa_aver_s_h(dst0, hz_out3);
2037 dst1 = __msa_aver_s_h(dst1, hz_out4);
2038 dst2 = __msa_aver_s_h(dst2, hz_out5);
2039 dst3 = __msa_aver_s_h(dst3, hz_out6);
2041 out0 = PCKEV_XORI128_UB(dst0, dst1);
2042 out1 = PCKEV_XORI128_UB(dst2, dst3);
2043 ST8x4_UB(out0, out1, dst, stride);
2044 dst += (4 * stride);
/* second half: rows 9..12 feed output rows 4..7 */
2046 LD_SB4(src, stride, src9, src10, src11, src12);
2047 XORI_B4_128_SB(src9, src10, src11, src12);
2048 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
2049 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
2050 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
2051 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
2052 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2053 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
2055 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2056 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
2058 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
2060 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
2062 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2063 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
2065 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
2067 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2068 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
2070 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
2072 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2073 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
2075 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
2077 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc23 averaging partners for the second half: rows 7..10 */
2079 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
2080 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
2082 dst0 = __msa_aver_s_h(dst0, hz_out7);
2083 dst1 = __msa_aver_s_h(dst1, hz_out8);
2084 dst2 = __msa_aver_s_h(dst2, hz_out9);
2085 dst3 = __msa_aver_s_h(dst3, hz_out10);
2087 out0 = PCKEV_XORI128_UB(dst0, dst1);
2088 out1 = PCKEV_XORI128_UB(dst2, dst3);
2089 ST8x4_UB(out0, out1, dst, stride);
/* 4x4 H.264 luma "put" for the mc21 quarter-pel position.  Uses the
 * two-source shuffle masks at luma_mask_arr[48] so one AVC_HORZ_FILTER_SH
 * call filters two 4-wide rows at once (first row in the low half, second
 * in the high half; PCKOD_D2_SH extracts the odd doubleword, i.e. the
 * second row).  Only the right-half interleaves are needed at this width. */
2092 void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
/* vertical taps packed as 32-bit halfword pairs: {1,-5}, {20,20}, {-5,1} */
2095 const int32_t filt_const0 = 0xfffb0001;
2096 const int32_t filt_const1 = 0x140014;
2097 const int32_t filt_const2 = 0x1fffb;
2099 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2100 v16i8 mask0, mask1, mask2;
2101 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2102 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2103 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2104 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* dual-source masks (second half of luma_mask_arr) */
2107 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2109 filt0 = (v8i16) __msa_fill_w(filt_const0);
2110 filt1 = (v8i16) __msa_fill_w(filt_const1);
2111 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* back up 2 rows and 2 columns to centre the 6-tap window */
2113 src -= ((2 * stride) + 2);
2115 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2116 src += (5 * stride);
2117 LD_SB4(src, stride, src5, src6, src7, src8);
2119 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2120 XORI_B4_128_SB(src5, src6, src7, src8);
/* two source rows per horizontal filter call */
2122 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2123 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2124 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2125 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2126 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* split out the odd rows from each packed pair */
2127 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2128 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2130 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2131 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2132 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2133 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* vertical 6-tap pass: two output rows per pckev_h result */
2135 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2137 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2139 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2140 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2142 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2144 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc21: average with rounded half-pel centre rows (hz_out2 holds rows
 * 2|3, hz_out4 holds rows 4|5 from the dual-row filtering above) */
2146 SRARI_H2_SH(hz_out2, hz_out4, 5);
2147 SAT_SH2_SH(hz_out2, hz_out4, 7);
2149 dst0 = __msa_aver_s_h(dst0, hz_out2);
2150 dst1 = __msa_aver_s_h(dst1, hz_out4);
2152 res = PCKEV_XORI128_UB(dst0, dst1);
2153 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 4x4 H.264 luma "put" for the mc23 quarter-pel position.  Same dual-row
 * structure as ff_put_h264_qpel4_mc21_msa above; the difference is the
 * averaging partner: PCKEV_D2_SH re-packs half-pel rows 3..6 (one row
 * below centre, matching the vertical 3/4 offset) before the average. */
2156 void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
/* vertical taps packed as 32-bit halfword pairs: {1,-5}, {20,20}, {-5,1} */
2159 const int32_t filt_const0 = 0xfffb0001;
2160 const int32_t filt_const1 = 0x140014;
2161 const int32_t filt_const2 = 0x1fffb;
2163 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2164 v16i8 mask0, mask1, mask2;
2165 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2166 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
2167 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2168 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* dual-source masks (second half of luma_mask_arr) */
2171 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2173 filt0 = (v8i16) __msa_fill_w(filt_const0);
2174 filt1 = (v8i16) __msa_fill_w(filt_const1);
2175 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* back up 2 rows and 2 columns to centre the 6-tap window */
2177 src -= ((2 * stride) + 2);
2179 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2180 src += (5 * stride);
2181 LD_SB4(src, stride, src5, src6, src7, src8);
2183 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2184 XORI_B4_128_SB(src5, src6, src7, src8);
/* two source rows per horizontal filter call */
2186 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
2187 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
2188 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
2189 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
2190 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* split out the odd rows from each packed pair */
2191 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2192 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
2194 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2195 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2196 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2197 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* vertical 6-tap pass: two output rows per pckev_h result */
2199 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2201 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2203 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2204 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2206 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2208 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc23: pack half-pel rows {3,4} and {5,6}, round, saturate, average */
2210 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
2211 SRARI_H2_SH(hz_out0, hz_out1, 5);
2212 SAT_SH2_SH(hz_out0, hz_out1, 7);
2214 dst0 = __msa_aver_s_h(dst0, hz_out0);
2215 dst1 = __msa_aver_s_h(dst1, hz_out1);
2217 res = PCKEV_XORI128_UB(dst0, dst1);
2218 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 16x16 H.264 luma "put", vertical half-pel only (mc02): a single 6-tap
 * [1 -5 20 20 -5 1] vertical filter, rounded (>>5) and saturated — no
 * horizontal filtering and no quarter-pel average.  Taps are packed as
 * signed byte pairs: 0xfb01 = {1,-5}, 0x1414 = {20,20}, 0x1fb = {-5,1}. */
2221 void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
2225 int16_t filt_const0 = 0xfb01;
2226 int16_t filt_const1 = 0x1414;
2227 int16_t filt_const2 = 0x1fb;
2228 v16u8 res0, res1, res2, res3;
2229 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2230 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2231 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2232 v16i8 src65_l, src87_l, filt0, filt1, filt2;
2233 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2235 filt0 = (v16i8) __msa_fill_h(filt_const0);
2236 filt1 = (v16i8) __msa_fill_h(filt_const1);
2237 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* back up 2 rows to centre the 6-tap vertical window */
2238 src -= (stride * 2);
/* prime the 5-row history and pre-interleave adjacent rows */
2240 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2241 src += (5 * stride);
2243 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2244 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2246 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* 4 output rows per iteration, 16 rows total */
2249 for (loop_cnt = 4; loop_cnt--;) {
2250 LD_SB4(src, stride, src5, src6, src7, src8);
2251 src += (4 * stride);
2253 XORI_B4_128_SB(src5, src6, src7, src8);
2254 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2255 src65_r, src76_r, src87_r);
2256 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2257 src65_l, src76_l, src87_l);
/* one 6-tap dot product per output row, right then left byte halves */
2258 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2259 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2260 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2261 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2262 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2263 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2264 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2265 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* round by 32 (>>5), saturate, pack back to bytes, undo the 128 bias */
2266 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2267 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2268 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2269 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2270 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2271 out3_r, res0, res1, res2, res3);
2272 XORI_B4_128_UB(res0, res1, res2, res3);
2273 ST_UB4(res0, res1, res2, res3, dst, stride);
2274 dst += (4 * stride);
/* 8x8 H.264 luma "put", vertical half-pel only (mc02), fully unrolled:
 * loads all 13 source rows up front and emits all 8 output rows in one
 * pass.  NOTE(review): after the first 8 rows, the srcNN_r names no
 * longer match the row pairs they hold (e.g. src76_r is built from
 * src5/src4, src98_r from src7/src6) — the dot-product operand order
 * below accounts for that, but verify carefully before touching it. */
2288 void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
/* vertical taps packed as signed byte pairs: {1,-5}, {20,20}, {-5,1} */
2291 const int16_t filt_const0 = 0xfb01;
2292 const int16_t filt_const1 = 0x1414;
2293 const int16_t filt_const2 = 0x1fb;
2294 v16u8 out0, out1, out2, out3;
2295 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2296 v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
2297 v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
2298 v16i8 filt0, filt1, filt2;
2299 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
2301 filt0 = (v16i8) __msa_fill_h(filt_const0);
2302 filt1 = (v16i8) __msa_fill_h(filt_const1);
2303 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* back up 2 rows to centre the 6-tap vertical window */
2305 src -= (stride * 2);
2307 LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2308 src += (8 * stride);
2309 LD_SB5(src, stride, src8, src9, src10, src11, src12);
/* interleave adjacent rows; beware the name drift noted above */
2310 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2312 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
2314 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
2315 src910_r, src1110_r, src1211_r);
2316 XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
2317 XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
2318 XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
/* one 6-tap dot product per output row 0..7 */
2319 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2320 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2321 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2322 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
2323 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
2324 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
2325 out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
2326 out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
/* round by 32 (>>5), saturate to 8-bit range, pack and store 8x8 */
2327 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2328 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
2329 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2330 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
2331 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
2332 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
2333 out2 = PCKEV_XORI128_UB(out4_r, out5_r);
2334 out3 = PCKEV_XORI128_UB(out6_r, out7_r);
2335 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 4x4 H.264 luma "put", vertical half-pel only (mc02).  At width 4, two
 * interleaved row pairs fit in one 16-byte vector (ILVR_D4_SB), so the
 * whole block needs only two 6-tap dot products. */
2338 void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
/* vertical taps packed as signed byte pairs: {1,-5}, {20,20}, {-5,1} */
2341 const int16_t filt_const0 = 0xfb01;
2342 const int16_t filt_const1 = 0x1414;
2343 const int16_t filt_const2 = 0x1fb;
2345 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2346 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2347 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
2350 filt0 = (v16i8) __msa_fill_h(filt_const0);
2351 filt1 = (v16i8) __msa_fill_h(filt_const1);
2352 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* back up 2 rows to centre the 6-tap vertical window */
2354 src -= (stride * 2);
2356 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2357 src += (5 * stride);
2358 LD_SB4(src, stride, src5, src6, src7, src8);
/* interleave adjacent rows, then pack two pairs per vector */
2360 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2362 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2364 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
2365 src76_r, src2110, src4332, src6554, src8776);
2366 XORI_B4_128_SB(src2110, src4332, src6554, src8776);
/* two dot products cover all four output rows */
2367 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2368 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* round by 32 (>>5), saturate, pack and store 4x4 */
2369 SRARI_H2_SH(out10, out32, 5);
2370 SAT_SH2_SH(out10, out32, 7);
2371 out = PCKEV_XORI128_UB(out10, out32);
2372 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 16x16 H.264 luma "put" for the mc12 quarter-pel position (quarter-pel
 * horizontal on the left side, half-pel vertical — NOTE(review): confirm
 * against the qpel dispatch table).  Filter order is the reverse of the
 * mc21/mc23 functions: the vertical 6-tap runs first on bytes
 * (AVC_CALC_DPADD_B_6PIX_2COEFF_SH), then the horizontal 6-tap runs on
 * the 16-bit intermediates via halfword shuffles (mask0..2 select the
 * [x-2..x+3] taps; hadd + dpadd with -5/+20 form the dot product).
 * The result is rounded by 1024 (>>10), and averaged with the rounded
 * (>>5) vertical half-pel column; PCKEV_H2_SH selects the even (left)
 * half-pel column for the 1/4 horizontal offset. */
2375 void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
2380 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2382 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2383 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2384 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2385 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* horizontal tap-selection shuffles over the 16-bit vertical results */
2386 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2387 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2388 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2389 v8i16 minus5h = __msa_ldi_h(-5);
2390 v8i16 plus20h = __msa_ldi_h(20);
/* back up 2 rows and 2 columns to centre both 6-tap windows */
2396 src -= ((2 * stride) + 2);
/* prime 5 rows of vertical history for both 8-pixel halves */
2398 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2399 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2400 src += (5 * stride);
2401 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2402 XORI_B5_128_SB(src7, src8, src9, src10, src11);
/* one output row per iteration */
2404 for (row = 16; row--;) {
2405 LD_SB2(src, 8, src5, src6);
2407 XORI_B2_128_SB(src5, src6);
/* vertical 6-tap on bytes -> 16-bit row halves vt_res0..3 */
2409 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2411 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
/* gather horizontal tap operands for pixels 0..3 and 4..7 */
2413 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2414 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2415 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2416 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2417 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2418 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2419 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2420 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* horizontal dot product: (a+f) - 5*(b+e) + 20*(c+d) in 32 bits */
2421 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2422 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2423 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2424 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2425 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2426 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2427 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2428 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* round by 512 (>>10 overall) and saturate */
2429 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2430 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* vertical half-pel columns, rounded >>5, for the quarter-pel average */
2431 dst0 = __msa_srari_h(shf_vec2, 5);
2432 dst1 = __msa_srari_h(shf_vec5, 5);
2433 dst2 = __msa_srari_h(shf_vec8, 5);
2434 dst3 = __msa_srari_h(shf_vec11, 5);
2435 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc12: even-lane pack keeps the left half-pel column */
2436 PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
2437 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2438 dst0 = __msa_aver_s_h(dst2, dst0);
2439 dst1 = __msa_aver_s_h(dst3, dst1);
2440 out = PCKEV_XORI128_UB(dst0, dst1);
/* 16x16 H.264 luma "put" for the mc32 quarter-pel position.  Identical
 * to ff_put_h264_qpel16_mc12_msa except the final half-pel column
 * selection: __msa_pckod_h keeps the odd (right) half-pel column for the
 * 3/4 horizontal offset, where mc12 used the even-lane PCKEV_H2_SH.
 * NOTE(review): confirm the position mapping against the dispatch table. */
2457 void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
2462 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2464 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
2465 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2466 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
2467 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* horizontal tap-selection shuffles over the 16-bit vertical results */
2468 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2469 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2470 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2471 v8i16 minus5h = __msa_ldi_h(-5);
2472 v8i16 plus20h = __msa_ldi_h(20);
/* back up 2 rows and 2 columns to centre both 6-tap windows */
2478 src -= ((2 * stride) + 2);
/* prime 5 rows of vertical history for both 8-pixel halves */
2480 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2481 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
2482 src += (5 * stride);
2483 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2484 XORI_B5_128_SB(src7, src8, src9, src10, src11);
/* one output row per iteration */
2486 for (row = 16; row--;) {
2487 LD_SB2(src, 8, src5, src6);
2489 XORI_B2_128_SB(src5, src6);
/* vertical 6-tap on bytes -> 16-bit row halves vt_res0..3 */
2491 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2493 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
/* gather horizontal tap operands for pixels 0..3 and 4..7 */
2495 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2496 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2497 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2498 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2499 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2500 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2501 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2502 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* horizontal dot product: (a+f) - 5*(b+e) + 20*(c+d) in 32 bits */
2503 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2504 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2505 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2506 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2507 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2508 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2509 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2510 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* round by 512 (>>10 overall) and saturate */
2511 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2512 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* vertical half-pel columns, rounded >>5, for the quarter-pel average */
2513 dst0 = __msa_srari_h(shf_vec2, 5);
2514 dst1 = __msa_srari_h(shf_vec5, 5);
2515 dst2 = __msa_srari_h(shf_vec8, 5);
2516 dst3 = __msa_srari_h(shf_vec11, 5);
2517 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc32: odd-lane pack keeps the right half-pel column */
2518 dst0 = __msa_pckod_h(dst2, dst0);
2519 dst1 = __msa_pckod_h(dst3, dst1);
2520 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
2521 dst0 = __msa_aver_s_h(dst2, dst0);
2522 dst1 = __msa_aver_s_h(dst3, dst1);
2523 out = PCKEV_XORI128_UB(dst0, dst1);
/* 8x8 luma quarter-pel "put": vertical 6-tap filter, then horizontal 6-tap
 * over the 16-bit intermediates; the 2-D half-pel result (rounded by 10) is
 * averaged with the vertical-only half-pel result (rounded by 5) picked from
 * the even columns (pckev) — the mc12 position.
 * NOTE(review): elided view — some original lines between the visible
 * statements (declarations, loop close, function close) are not shown. */
2540 void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
2545 v16i8 src0, src1, src2, src3, src4, src5, src6;
2546 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2547 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2548 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2549 v8i16 mask3, mask4, mask5;
2550 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Symmetric tap-pair shuffle masks for the (1,-5,20,20,-5,1) kernel. */
2551 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2552 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2553 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2554 v8i16 minus5h = __msa_ldi_h(-5);
2555 v8i16 plus20h = __msa_ldi_h(20);
/* Center the 6-tap window: 2 rows up, 2 columns left. */
2561 src -= ((2 * stride) + 2);
2563 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2564 src += (5 * stride);
2565 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Two output rows per iteration, 4 iterations = 8 rows. */
2567 for (row = 4; row--;) {
2568 LD_SB2(src, stride, src5, src6);
2569 src += (2 * stride);
2570 XORI_B2_128_SB(src5, src6);
/* Vertical 6-tap over two consecutive 6-row windows. */
2572 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2574 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2576 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2577 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2578 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2579 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2580 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2581 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2582 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2583 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d). */
2584 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2585 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2586 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2587 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2588 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2589 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2590 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2591 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Round 2-D by 2^10 and clip; round 1-D vertical-only by 2^5 and clip. */
2592 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2593 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2594 dst0 = __msa_srari_h(shf_vec2, 5);
2595 dst1 = __msa_srari_h(shf_vec5, 5);
2596 dst2 = __msa_srari_h(shf_vec8, 5);
2597 dst3 = __msa_srari_h(shf_vec11, 5);
2598 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc12: even columns of the vertical-only result. */
2599 PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
2600 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
/* Average 2-D with 1-D half-pel to get the quarter-pel value. */
2601 dst0 = __msa_aver_s_h(dst2, dst0);
2602 dst1 = __msa_aver_s_h(dst3, dst1);
2603 out = PCKEV_XORI128_UB(dst0, dst1);
2604 ST8x2_UB(out, dst, stride);
2605 dst += (2 * stride);
/* 8x8 luma quarter-pel "put", mc32 position.  Identical structure to the
 * mc12 variant above except that the vertical-only half-pel contribution is
 * taken from the odd columns (pckod instead of pckev).
 * NOTE(review): elided view — some original lines are not shown. */
2615 void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
2620 v16i8 src0, src1, src2, src3, src4, src5, src6;
2621 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
2622 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2623 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
2624 v8i16 mask3, mask4, mask5;
2625 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Symmetric tap-pair shuffle masks for the (1,-5,20,20,-5,1) kernel. */
2626 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2627 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2628 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2629 v8i16 minus5h = __msa_ldi_h(-5);
2630 v8i16 plus20h = __msa_ldi_h(20);
/* Center the 6-tap window: 2 rows up, 2 columns left. */
2636 src -= ((2 * stride) + 2);
2638 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2639 src += (5 * stride);
2640 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Two output rows per iteration, 4 iterations = 8 rows. */
2642 for (row = 4; row--;) {
2643 LD_SB2(src, stride, src5, src6);
2644 src += (2 * stride);
2645 XORI_B2_128_SB(src5, src6);
/* Vertical 6-tap over two consecutive 6-row windows. */
2647 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2649 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2651 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2652 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2653 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2654 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
2655 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
2656 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
2657 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
2658 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d). */
2659 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2660 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2661 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
2662 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
2663 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2664 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2665 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
2666 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Round 2-D by 2^10 and clip; round 1-D vertical-only by 2^5 and clip. */
2667 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
2668 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
2669 dst0 = __msa_srari_h(shf_vec2, 5);
2670 dst1 = __msa_srari_h(shf_vec5, 5);
2671 dst2 = __msa_srari_h(shf_vec8, 5);
2672 dst3 = __msa_srari_h(shf_vec11, 5);
2673 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc32: odd columns of the vertical-only result. */
2674 dst0 = __msa_pckod_h(dst2, dst0);
2675 dst1 = __msa_pckod_h(dst3, dst1);
2676 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
/* Average 2-D with 1-D half-pel to get the quarter-pel value. */
2677 dst0 = __msa_aver_s_h(dst2, dst0);
2678 dst1 = __msa_aver_s_h(dst3, dst1);
2679 out = PCKEV_XORI128_UB(dst0, dst1);
2680 ST8x2_UB(out, dst, stride);
2681 dst += (2 * stride);
/* 4x4 luma quarter-pel "put", mc12 position.  Fully unrolled (no loop):
 * vertical 6-tap via interleave + 3-coefficient dot product, then a
 * horizontal 6-tap pass over the 16-bit results; the 2-D value (rounded by
 * 10) is averaged with the even-column vertical-only half-pel value
 * (rounded by 5) in 32-bit lanes.
 * NOTE(review): elided view — some original lines are not shown. */
2691 void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
/* Packed 16-bit coefficient pairs for the (1,-5,20,20,-5,1) vertical
 * kernel: 0xfb01 = (-5,1), 0x1414 = (20,20), 0x1fb = (1,-5). */
2694 const int16_t filt_const0 = 0xfb01;
2695 const int16_t filt_const1 = 0x1414;
2696 const int16_t filt_const2 = 0x1fb;
2698 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2699 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2700 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2701 v16i8 src76_l, src87_l, filt0, filt1, filt2;
2702 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2703 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2704 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Symmetric tap-pair shuffle masks for the horizontal pass. */
2705 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2706 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2707 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2708 v8i16 minus5h = __msa_ldi_h(-5);
2709 v8i16 plus20h = __msa_ldi_h(20);
2710 v8i16 zeros = { 0 };
2712 filt0 = (v16i8) __msa_fill_h(filt_const0);
2713 filt1 = (v16i8) __msa_fill_h(filt_const1);
2714 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Center the 6-tap window: 2 rows up, 2 columns left. */
2716 src -= ((2 * stride) + 2);
/* Load all 9 rows needed for 4 output rows of vertical 6-tap. */
2718 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2719 src += (5 * stride);
2720 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2721 LD_SB4(src, stride, src5, src6, src7, src8);
2722 XORI_B4_128_SB(src5, src6, src7, src8);
/* Interleave adjacent rows (right/left halves) for the dot products. */
2724 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2726 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2728 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2730 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* Vertical 6-tap for output rows 0 and 1. */
2732 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2733 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2734 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2735 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2736 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2737 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2738 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2739 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* Horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d). */
2740 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2741 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2742 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2743 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Same two passes for output rows 2 and 3. */
2745 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2746 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2747 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2748 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2749 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2750 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2751 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2752 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2753 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2754 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2755 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2756 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Round 2-D results by 2^10 and clip to 8-bit signed range. */
2758 SRARI_W2_SW(hz_res0, hz_res1, 10);
2759 SAT_SW2_SW(hz_res0, hz_res1, 7);
2760 SRARI_W2_SW(hz_res2, hz_res3, 10);
2761 SAT_SW2_SW(hz_res2, hz_res3, 7);
/* Vertical-only half-pel (rounded by 2^5). */
2763 dst0 = __msa_srari_h(shf_vec2, 5);
2764 dst1 = __msa_srari_h(shf_vec5, 5);
2765 dst2 = __msa_srari_h(shf_vec6, 5);
2766 dst3 = __msa_srari_h(shf_vec7, 5);
2768 SAT_SH2_SH(dst0, dst1, 7);
2769 SAT_SH2_SH(dst2, dst3, 7);
/* mc12: widen the even 16-bit lanes to 32-bit for the average. */
2770 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
2771 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
/* Average 2-D with 1-D half-pel to get the quarter-pel value. */
2773 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2774 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2775 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2776 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Pack back to bytes and store the 4x4 block. */
2778 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2779 out = PCKEV_XORI128_UB(dst0, dst2);
2780 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 4x4 luma quarter-pel "put", mc32 position.  Same structure as the mc12
 * variant above; the only difference is that the vertical-only half-pel
 * contribution is taken from the odd 16-bit lanes (ilvod with zeros)
 * instead of the even ones.
 * NOTE(review): elided view — some original lines are not shown. */
2783 void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
/* Packed 16-bit coefficient pairs for the (1,-5,20,20,-5,1) vertical
 * kernel: 0xfb01 = (-5,1), 0x1414 = (20,20), 0x1fb = (1,-5). */
2786 const int16_t filt_const0 = 0xfb01;
2787 const int16_t filt_const1 = 0x1414;
2788 const int16_t filt_const2 = 0x1fb;
2790 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2791 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
2792 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
2793 v16i8 src76_l, src87_l, filt0, filt1, filt2;
2794 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
2795 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
2796 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Symmetric tap-pair shuffle masks for the horizontal pass. */
2797 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2798 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2799 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2800 v8i16 minus5h = __msa_ldi_h(-5);
2801 v8i16 plus20h = __msa_ldi_h(20);
2802 v8i16 zeros = { 0 };
2804 filt0 = (v16i8) __msa_fill_h(filt_const0);
2805 filt1 = (v16i8) __msa_fill_h(filt_const1);
2806 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Center the 6-tap window: 2 rows up, 2 columns left. */
2808 src -= ((2 * stride) + 2);
/* Load all 9 rows needed for 4 output rows of vertical 6-tap. */
2810 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2811 src += (5 * stride);
2812 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2813 LD_SB4(src, stride, src5, src6, src7, src8);
2814 XORI_B4_128_SB(src5, src6, src7, src8);
/* Interleave adjacent rows (right/left halves) for the dot products. */
2816 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2818 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2820 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
2822 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* Vertical 6-tap for output rows 0 and 1. */
2825 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2826 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2827 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2828 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2829 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2830 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2831 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2832 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* Horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d). */
2833 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2834 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2835 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2836 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Same two passes for output rows 2 and 3. */
2838 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2839 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2840 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2841 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2842 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
2843 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
2844 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
2845 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
2846 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2847 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
2848 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2849 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Round 2-D results by 2^10 and clip to 8-bit signed range. */
2851 SRARI_W2_SW(hz_res0, hz_res1, 10);
2852 SAT_SW2_SW(hz_res0, hz_res1, 7);
2853 SRARI_W2_SW(hz_res2, hz_res3, 10);
2854 SAT_SW2_SW(hz_res2, hz_res3, 7);
/* Vertical-only half-pel (rounded by 2^5). */
2856 dst0 = __msa_srari_h(shf_vec2, 5);
2857 dst1 = __msa_srari_h(shf_vec5, 5);
2858 dst2 = __msa_srari_h(shf_vec6, 5);
2859 dst3 = __msa_srari_h(shf_vec7, 5);
2861 SAT_SH2_SH(dst0, dst1, 7);
2862 SAT_SH2_SH(dst2, dst3, 7);
/* mc32: widen the odd 16-bit lanes to 32-bit for the average. */
2864 dst0 = __msa_ilvod_h(zeros, dst0);
2865 dst1 = __msa_ilvod_h(zeros, dst1);
2866 dst2 = __msa_ilvod_h(zeros, dst2);
2867 dst3 = __msa_ilvod_h(zeros, dst3);
/* Average 2-D with 1-D half-pel to get the quarter-pel value. */
2869 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
2870 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
2871 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
2872 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Pack back to bytes and store the 4x4 block. */
2874 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
2875 out = PCKEV_XORI128_UB(dst0, dst2);
2876 ST4x4_UB(out, res, 0, 1, 2, 3, dst, stride);
/* 16x16 luma center half-pel (mc22) "put": horizontal 6-tap first
 * (AVC_HORZ_FILTER_SH), then a vertical 6-tap over the 16-bit horizontal
 * results using packed 32-bit coefficient words.  Processed as two 8-wide
 * columns (multiple8_cnt), 4 rows per inner iteration.
 * NOTE(review): elided view — some original lines are not shown. */
2879 void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
/* Packed coefficient pairs for the vertical pass on 16-bit data:
 * 0xfffb0001 = (-5,1), 0x140014 = (20,20), 0x1fffb = (1,-5). */
2882 const int32_t filt_const0 = 0xfffb0001;
2883 const int32_t filt_const1 = 0x140014;
2884 const int32_t filt_const2 = 0x1fffb;
/* Start 2 rows up and 2 columns left so the 6-tap window is centered. */
2885 const uint8_t *src_tmp = src - (2 * stride) - 2;
2886 uint8_t *dst_tmp = dst;
2887 uint32_t multiple8_cnt, loop_cnt;
2889 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2890 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2891 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
2892 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2893 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2894 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2895 v8i16 hz_out87_l, filt0, filt1, filt2;
2898 filt0 = (v8i16) __msa_fill_w(filt_const0);
2899 filt1 = (v8i16) __msa_fill_w(filt_const1);
2900 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Horizontal shuffle masks from the shared table. */
2902 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* Two passes of 8 columns each to cover the 16-wide block. */
2904 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* Prime the vertical filter with 5 horizontally filtered rows. */
2908 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2909 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2910 src += (5 * stride);
2912 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2913 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2914 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2915 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2916 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* 4 output rows per iteration, 4 iterations = 16 rows. */
2918 for (loop_cnt = 4; loop_cnt--;) {
2919 LD_SB4(src, stride, src0, src1, src2, src3);
2920 XORI_B4_128_SB(src0, src1, src2, src3);
2921 src += (4 * stride);
2923 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2924 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2925 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2926 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
/* Interleave consecutive filtered rows for the vertical dot product. */
2928 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2929 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2931 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2932 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2934 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2935 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2937 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2938 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Vertical 6-tap per output row, right then left half, then pack. */
2941 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
2943 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
2945 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2946 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
2948 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
2950 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2951 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
2953 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
2955 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2956 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
2958 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
2960 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Pack to bytes (with the 128 bias restored) and store 8x4. */
2962 out0 = PCKEV_XORI128_UB(dst0, dst1);
2963 out1 = PCKEV_XORI128_UB(dst2, dst3);
2964 ST8x4_UB(out0, out1, dst, stride);
2965 dst += (4 * stride);
/* 8x8 luma center half-pel (mc22) "put": horizontal 6-tap per row, then a
 * vertical 6-tap over the 16-bit horizontal results.  Fully unrolled: the
 * first 4 output rows are produced from hz_out0..8, the last 4 from
 * hz_out5..12.
 * NOTE(review): elided view — some original lines are not shown. */
2979 void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
/* Packed coefficient pairs for the vertical pass on 16-bit data:
 * 0xfffb0001 = (-5,1), 0x140014 = (20,20), 0x1fffb = (1,-5). */
2982 const int32_t filt_const0 = 0xfffb0001;
2983 const int32_t filt_const1 = 0x140014;
2984 const int32_t filt_const2 = 0x1fffb;
2986 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
2987 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2988 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2989 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2990 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2991 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2992 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2993 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2994 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
2997 filt0 = (v8i16) __msa_fill_w(filt_const0);
2998 filt1 = (v8i16) __msa_fill_w(filt_const1);
2999 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Horizontal shuffle masks from the shared table. */
3001 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* Center the 6-tap window: 2 rows up, 2 columns left. */
3003 src -= ((2 * stride) + 2);
/* Prime the vertical filter with 5 horizontally filtered rows. */
3004 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3005 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3006 src += (5 * stride);
3008 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3009 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3010 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3011 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3012 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* Rows 5-8 of the horizontal pass. */
3014 LD_SB4(src, stride, src0, src1, src2, src3);
3015 XORI_B4_128_SB(src0, src1, src2, src3);
3016 src += (4 * stride);
3017 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3018 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3019 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3020 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
/* Interleave consecutive filtered rows for the vertical dot product. */
3021 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3022 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3023 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3024 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3025 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3026 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3027 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3028 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* Vertical 6-tap for output rows 0-3 (right then left half). */
3030 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3032 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3034 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3035 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3037 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3039 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3040 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3042 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3044 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3045 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3047 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3049 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3050 out0 = PCKEV_XORI128_UB(dst0, dst1);
3051 out1 = PCKEV_XORI128_UB(dst2, dst3);
3052 ST8x4_UB(out0, out1, dst, stride);
3053 dst += (4 * stride);
/* Rows 9-12 of the horizontal pass, then output rows 4-7. */
3055 LD_SB4(src, stride, src0, src1, src2, src3);
3056 XORI_B4_128_SB(src0, src1, src2, src3);
3057 hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3058 hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3059 hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3060 hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3061 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3062 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3064 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3065 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3067 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3069 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3071 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3072 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3074 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3076 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3077 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3079 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3081 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3082 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3084 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3086 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3087 out0 = PCKEV_XORI128_UB(dst0, dst1);
3088 out1 = PCKEV_XORI128_UB(dst2, dst3);
3089 ST8x4_UB(out0, out1, dst, stride);
/* 4x4 luma center half-pel (mc22) "put", fully unrolled.  Uses the 4-wide
 * masks at luma_mask_arr[48] so two source rows are filtered per
 * AVC_HORZ_FILTER_SH call; the second row is recovered with pckod.  The
 * vertical 6-tap then runs over the 16-bit horizontal results.
 * NOTE(review): elided view — some original lines are not shown. */
3092 void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
/* Packed coefficient pairs for the vertical pass on 16-bit data:
 * 0xfffb0001 = (-5,1), 0x140014 = (20,20), 0x1fffb = (1,-5). */
3095 const int32_t filt_const0 = 0xfffb0001;
3096 const int32_t filt_const1 = 0x140014;
3097 const int32_t filt_const2 = 0x1fffb;
3099 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3100 v16i8 mask0, mask1, mask2;
3101 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3102 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3103 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3104 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* 4-wide two-rows-per-vector shuffle masks (second table in the array). */
3107 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3109 filt0 = (v8i16) __msa_fill_w(filt_const0);
3110 filt1 = (v8i16) __msa_fill_w(filt_const1);
3111 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Center the 6-tap window: 2 rows up, 2 columns left. */
3113 src -= ((2 * stride) + 2);
/* Load all 9 rows needed for 4 output rows. */
3115 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3116 src += (5 * stride);
3117 LD_SB4(src, stride, src5, src6, src7, src8);
3119 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3120 XORI_B4_128_SB(src5, src6, src7, src8);
/* Horizontal 6-tap, two rows per call; odd row unpacked via pckod. */
3121 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3122 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3123 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3124 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3125 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
3126 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3127 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* Interleave consecutive filtered rows for the vertical dot product. */
3128 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3129 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3130 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3131 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* Vertical 6-tap: one output row per dot product, packed in pairs. */
3133 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3135 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3137 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3138 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3140 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3142 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Pack to bytes (restoring the 128 bias) and store the 4x4 block. */
3143 res = PCKEV_XORI128_UB(dst0, dst1);
3144 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 16x16 luma quarter-pel "avg", mc10 position: horizontal 6-tap half-pel
 * (rounded by 5), averaged with the integer-position source pixel shifted
 * in by 2 bytes (SLDI ... 2), then averaged again with the existing dst
 * contents (the "avg" behavior).  4 rows per loop iteration.
 * NOTE(review): elided view — some original lines (src advance, loop/
 * function close) are not shown. */
3147 void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
3151 v16u8 dst0, dst1, dst2, dst3;
3152 v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3153 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3154 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3155 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* 6-tap (1,-5,20,20,-5,1) byte coefficients. */
3156 v16i8 minus5b = __msa_ldi_b(-5);
3157 v16i8 plus20b = __msa_ldi_b(20);
/* Horizontal shuffle masks from the shared table. */
3159 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3165 for (loop_cnt = 4; loop_cnt--;) {
/* Load 4 rows, two 16-byte vectors each (left + overlap for taps). */
3166 LD_SB2(src, 16, src0, src1);
3168 LD_SB2(src, 16, src2, src3);
3170 LD_SB2(src, 16, src4, src5);
3172 LD_SB2(src, 16, src6, src7);
/* Existing destination pixels for the final averaging. */
3175 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3176 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* Horizontal 6-tap: hadd + (-5)*pairs + 20*pairs per row. */
3177 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3178 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3179 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3180 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3181 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3182 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3183 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3184 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3185 minus5b, res0, res1, res2, res3);
3186 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3187 plus20b, res0, res1, res2, res3);
3188 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3189 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3190 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3191 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3192 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3193 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3194 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3195 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3196 minus5b, res4, res5, res6, res7);
3197 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3198 plus20b, res4, res5, res6, res7);
/* mc10: integer-position pixels start at byte offset 2. */
3199 SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
3200 SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
/* Round half-pel by 2^5, clip, pack to bytes. */
3201 SRARI_H4_SH(res0, res1, res2, res3, 5);
3202 SRARI_H4_SH(res4, res5, res6, res7, 5);
3203 SAT_SH4_SH(res0, res1, res2, res3, 7);
3204 SAT_SH4_SH(res4, res5, res6, res7, 7);
3205 PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3206 PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
/* Quarter-pel: average half-pel with the nearer integer pixel. */
3207 out0 = __msa_aver_s_b(out0, src0);
3208 out1 = __msa_aver_s_b(out1, src2);
3209 out2 = __msa_aver_s_b(out2, src4);
3210 out3 = __msa_aver_s_b(out3, src6);
3211 XORI_B4_128_SB(out0, out1, out2, out3);
/* "avg" flavor: average the result with the current dst pixels. */
3212 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3213 AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3214 ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3215 dst += (4 * stride);
/* 16x16 luma quarter-pel "avg", mc30 position.  Identical structure to the
 * mc10 variant above; the only difference is the integer-position pixel is
 * shifted in by 3 bytes (SLDI ... 3) instead of 2, selecting the pixel on
 * the other side of the half-pel position.
 * NOTE(review): elided view — some original lines are not shown. */
3219 void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
3223 v16u8 dst0, dst1, dst2, dst3;
3224 v16i8 out0, out1, out2, out3, src0, src1, src2, src3, src4, src5, src6;
3225 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3226 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3227 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* 6-tap (1,-5,20,20,-5,1) byte coefficients. */
3228 v16i8 minus5b = __msa_ldi_b(-5);
3229 v16i8 plus20b = __msa_ldi_b(20);
/* Horizontal shuffle masks from the shared table. */
3231 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3237 for (loop_cnt = 4; loop_cnt--;) {
/* Load 4 rows, two 16-byte vectors each. */
3238 LD_SB2(src, 16, src0, src1);
3240 LD_SB2(src, 16, src2, src3);
3242 LD_SB2(src, 16, src4, src5);
3244 LD_SB2(src, 16, src6, src7);
/* Existing destination pixels for the final averaging. */
3247 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
3248 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* Horizontal 6-tap: hadd + (-5)*pairs + 20*pairs per row. */
3249 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3250 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3251 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3252 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3253 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3254 VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
3255 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3256 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3257 minus5b, res0, res1, res2, res3);
3258 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3259 plus20b, res0, res1, res2, res3);
3260 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3261 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3262 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3263 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3264 VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3265 VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3266 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3267 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3268 minus5b, res4, res5, res6, res7);
3269 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3270 plus20b, res4, res5, res6, res7);
/* mc30: integer-position pixels start at byte offset 3. */
3271 SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
3272 SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
/* Round half-pel by 2^5, clip, pack to bytes. */
3273 SRARI_H4_SH(res0, res1, res2, res3, 5);
3274 SRARI_H4_SH(res4, res5, res6, res7, 5);
3275 SAT_SH4_SH(res0, res1, res2, res3, 7);
3276 SAT_SH4_SH(res4, res5, res6, res7, 7);
3277 PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
3278 PCKEV_B2_SB(res5, res4, res7, res6, out2, out3);
/* Quarter-pel: average half-pel with the nearer integer pixel. */
3279 out0 = __msa_aver_s_b(out0, src0);
3280 out1 = __msa_aver_s_b(out1, src2);
3281 out2 = __msa_aver_s_b(out2, src4);
3282 out3 = __msa_aver_s_b(out3, src6);
3283 XORI_B4_128_SB(out0, out1, out2, out3);
/* "avg" flavor: average the result with the current dst pixels. */
3284 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
3285 AVER_UB2_UB(out2, dst2, out3, dst3, dst2, dst3);
3286 ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
3287 dst += (4 * stride);
/* H.264 luma quarter-pel (1/4, 0) MC with destination averaging, 8x8 block.
 * Horizontal 6-tap filter: unit taps via HADD, -5 and 20 taps via dot
 * products (minus5b / plus20b), round with >>5, saturate to the 8-bit
 * signed range, average with the integer-pel column (SLDI by 2 from the
 * src-2 load, i.e. the sample at src itself) and finally average with
 * the pixels already in dst.
 * NOTE(review): continuation of the signature / braces were elided in this
 * extraction; the visible body is kept byte-identical. */
void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    /* shuffle masks pair the tap inputs: (0,5), (1,4), (2,3) */
    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); /* to signed domain */
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    /* slide by 2 bytes: integer-pel column at src (left sample of pos 1/4) */
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5); /* rounding: (x + 16) >> 5 */
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7); /* clamp to signed 8-bit range */
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    /* quarter-pel: average half-pel result with the integer pixels */
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); /* back to unsigned domain */
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    /* "avg" step: average with existing destination pixels */
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* H.264 luma quarter-pel (3/4, 0) MC with destination averaging, 8x8 block.
 * Same horizontal 6-tap filtering as the mc10 case, but the quarter-pel
 * average uses the column obtained with SLDI by 3 from the src-2 load,
 * i.e. the integer sample at src + 1 (right neighbour), then averages
 * with the pixels already in dst.
 * NOTE(review): signature continuation / braces elided by extraction;
 * visible body kept byte-identical. */
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    /* shuffle masks pair the tap inputs: (0,5), (1,4), (2,3) */
    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); /* to signed domain */
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    /* slide by 3 bytes: integer-pel column at src + 1 (right sample of pos 3/4) */
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5); /* rounding: (x + 16) >> 5 */
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7); /* clamp to signed 8-bit range */
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    /* quarter-pel: average half-pel result with the integer pixels */
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3); /* back to unsigned domain */
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    /* "avg" step: average with existing destination pixels */
    AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
    AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* H.264 luma quarter-pel (1/4, 0) MC with destination averaging, 4x4 block.
 * Uses the two-rows-per-vector masks at luma_mask_arr[48]; horizontal
 * 6-tap filter, round >>5, saturate, average with the SLDI-by-2 column
 * (the integer sample at src) and then with dst.
 * NOTE(review): some declarations and braces were elided by extraction
 * (e.g. out0/out1, dst0); visible body kept byte-identical. */
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    /* masks at offset 48 interleave two rows into one 16-byte vector */
    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3); /* to signed domain */
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    SRARI_H2_SH(out0, out1, 5); /* rounding: (x + 16) >> 5 */
    SAT_SH2_SH(out0, out1, 7);
    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
    /* slide by 2: integer-pel column at src for the quarter-pel average */
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128); /* back to unsigned domain */
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0); /* average with dst */
    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* H.264 luma quarter-pel (3/4, 0) MC with destination averaging, 4x4 block.
 * Same horizontal 6-tap filtering as the mc10 case, but the quarter-pel
 * average uses the SLDI-by-3 column, i.e. the integer sample at src + 1,
 * then averages with dst.
 * NOTE(review): some declarations and braces were elided by extraction;
 * visible body kept byte-identical. */
void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
    uint32_t tp0, tp1, tp2, tp3;
    v16i8 src0, src1, src2, src3, res, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    /* masks at offset 48 interleave two rows into one 16-byte vector */
    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3); /* to signed domain */
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    SRARI_H2_SH(out0, out1, 5); /* rounding: (x + 16) >> 5 */
    SAT_SH2_SH(out0, out1, 7);
    res = __msa_pckev_b((v16i8) out1, (v16i8) out0);
    /* slide by 3: integer-pel column at src + 1 for the quarter-pel average */
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128); /* back to unsigned domain */
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    dst0 = __msa_aver_u_b((v16u8) res, dst0); /* average with dst */
    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* H.264 luma half-pel (1/2, 0) MC with destination averaging, 16x16 block.
 * Processes 4 rows per loop iteration (two 16-byte loads per row since
 * 16 output pixels need 21 input samples). Horizontal 6-tap filter,
 * round >>5, saturate, then average directly with dst (no quarter-pel
 * source average at the half-pel position).
 * NOTE(review): several lines (src pointer advances inside the loop, a
 * vec11 declaration, closing braces) were elided by extraction; visible
 * body kept byte-identical. */
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    /* shuffle masks pair the tap inputs: (0,5), (1,4), (2,3) */
    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    for (loop_cnt = 4; loop_cnt--;) {
        /* two 16-byte loads per row cover the 21-sample filter footprint */
        LD_SB2(src, 8, src0, src1);
        LD_SB2(src, 8, src2, src3);
        LD_SB2(src, 8, src4, src5);
        LD_SB2(src, 8, src6, src7);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SRARI_H4_SH(res0, res1, res2, res3, 5); /* rounding: (x + 16) >> 5 */
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
        XORI_B4_128_SB(vec0, vec1, vec2, vec3); /* back to unsigned domain */
        /* average half-pel result with existing destination pixels */
        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
        AVER_UB2_UB(vec2, dst2, vec3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
/* H.264 luma half-pel (1/2, 0) MC with destination averaging, 8x8 block.
 * Horizontal 6-tap filter over 8 rows at once, round >>5, saturate,
 * pack + un-bias via PCKEV_XORI128_UB, then average with the 8 existing
 * dst rows (loaded as 64-bit words).
 * NOTE(review): a vec11 declaration and braces were elided by extraction;
 * visible body kept byte-identical. */
void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2 = { 0 }, out3 = { 0 };
    v16u8 out4, out5, out6 = { 0 }, out7 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    /* shuffle masks pair the tap inputs: (0,5), (1,4), (2,3) */
    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7); /* to signed domain */
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SRARI_H4_SH(res0, res1, res2, res3, 5); /* rounding: (x + 16) >> 5 */
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    /* pack to bytes and flip back to unsigned in one step */
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    out4 = PCKEV_XORI128_UB(res4, res5);
    out5 = PCKEV_XORI128_UB(res6, res7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out2);
    INSERT_D2_UB(tp2, tp3, out3);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, out6);
    INSERT_D2_UB(tp2, tp3, out7);
    /* average with existing destination pixels */
    AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
    AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
    ST8x8_UB(out0, out1, out4, out5, dst, stride);
/* H.264 luma half-pel (1/2, 0) MC with destination averaging, 4x4 block.
 * Two rows per vector via the luma_mask_arr[48] masks; horizontal 6-tap
 * filter, round >>5, saturate, pack/un-bias, then average with dst.
 * NOTE(review): res0/res1 declarations and braces elided by extraction;
 * visible body kept byte-identical. */
void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 mask0, mask1, mask2;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    /* masks at offset 48 interleave two rows into one 16-byte vector */
    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3); /* to signed domain */
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5); /* rounding: (x + 16) >> 5 */
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1); /* pack to bytes, back to unsigned */
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = __msa_aver_u_b(res, dst0); /* average with dst */
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* H.264 luma quarter-pel (0, 1/4) MC with destination averaging, 16x16.
 * Vertical 6-tap filter: the packed 16-bit constants hold byte tap pairs
 * (0xfb01 = {1, -5}, 0x1414 = {20, 20}, 0x1fb = {-5, 1}) applied via
 * AVC_DOT_SH3_SH on interleaved row pairs. After round >>5 and saturate,
 * the result is averaged with the integer row at the output position
 * (src2..src5, i.e. rounding toward the upper sample), then with dst.
 * NOTE(review): ILVR/ILVL continuation lines and braces were elided by
 * extraction; visible body kept byte-identical. */
void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2); /* filter needs two rows above the block */
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4); /* to signed domain */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);
        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); /* (x + 16) >> 5 */
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        /* quarter-pel: average with the integer row at the output position */
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_UB(res0, res1, res2, res3); /* back to unsigned domain */
        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
/* H.264 luma quarter-pel (0, 3/4) MC with destination averaging, 16x16.
 * Identical vertical 6-tap filtering to the mc01 case (tap pairs packed
 * as 0xfb01 / 0x1414 / 0x1fb), but the quarter-pel average uses the
 * integer row one below the output position (src3..src6), then the
 * result is averaged with dst.
 * NOTE(review): ILVR/ILVL continuation lines and braces were elided by
 * extraction; visible body kept byte-identical. */
void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2); /* filter needs two rows above the block */
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4); /* to signed domain */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);
        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); /* (x + 16) >> 5 */
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        /* quarter-pel: average with the integer row below the output row */
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
        LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
        XORI_B4_128_UB(res0, res1, res2, res3); /* back to unsigned domain */
        AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
        AVER_UB2_UB(res2, dst2, res3, dst3, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
/* H.264 luma quarter-pel (0, 1/4) MC with destination averaging, 8x8.
 * Unrolled vertical 6-tap filter over all 8 output rows (tap byte pairs
 * packed in 0xfb01 / 0x1414 / 0x1fb), round >>5, saturate, average with
 * the integer rows packed into tmp0..tmp3 (rounding toward the upper
 * sample) and then with the existing dst rows.
 * NOTE(review): continuation lines (ILVR args, AVER_UB4_UB tail) and
 * braces were elided by extraction; visible body kept byte-identical. */
void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
    uint64_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2); /* filter needs two rows above the block */
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4); /* to signed domain */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    /* integer-pel rows for the quarter-pel average, two rows per vector */
    PCKEV_D2_SB(src3, src2, src7, src4, tmp0, tmp1);
    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
               src21_r, src32_r, src43_r);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src9, src8, src11, src10, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); /* (x + 16) >> 5 */
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    /* quarter-pel: average with the packed integer rows */
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3); /* back to unsigned domain */
    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* H.264 luma quarter-pel (0, 3/4) MC with destination averaging, 8x8.
 * Same unrolled vertical 6-tap filter as the mc01 case; the quarter-pel
 * average packs the integer rows one below the output position
 * (PCKEV of src3/src4, src7/src8, src9/src10, src11/src12), then the
 * result is averaged with the existing dst rows.
 * NOTE(review): continuation lines (ILVR args, AVER_UB4_UB tail) and
 * braces were elided by extraction; visible body kept byte-identical. */
void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
    uint64_t tp0, tp1, tp2, tp3;
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src11, src12;
    v16i8 src13, src14, tmp0, tmp1, tmp2, tmp3, src109_r;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2); /* filter needs two rows above the block */
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4); /* to signed domain */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    LD_SB8(src, stride, src7, src8, src9, src10, src11, src12, src13, src14);
    XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
    ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
               src87_r, src98_r, src109_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    /* integer-pel rows one below the output position, two per vector */
    PCKEV_D2_SB(src4, src3, src8, src7, tmp0, tmp1);
    ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13, src10_r,
               src21_r, src32_r, src43_r);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src10, src9, src12, src11, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5); /* (x + 16) >> 5 */
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    LD4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
    INSERT_D2_UB(tp0, tp1, dst2);
    INSERT_D2_UB(tp2, tp3, dst3);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    /* quarter-pel: average with the packed integer rows */
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3); /* back to unsigned domain */
    AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* H.264 luma quarter-pel (0, 1/4) MC with destination averaging, 4x4.
 * Vertical 6-tap filter with four rows packed per vector (ILVR_D2 of the
 * interleaved row pairs); tap byte pairs packed in 0xfb01/0x1414/0x1fb.
 * The quarter-pel average uses the integer rows src2..src5 (built with
 * insve), i.e. rounds toward the upper sample, then averages with dst.
 * NOTE(review): out10/out32 declarations, continuation lines and braces
 * were elided by extraction; visible body kept byte-identical. */
void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
    uint32_t tp0, tp1, tp2, tp3;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2); /* filter needs two rows above the block */
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332); /* to signed domain */
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    /* gather the four integer-pel rows src2..src5 into one vector */
    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5); /* (x + 16) >> 5 */
    SAT_SH2_SH(out10, out32, 7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = PCKEV_XORI128_UB(out10, out32); /* pack to bytes, back to unsigned */
    res = __msa_aver_u_b(res, (v16u8) src32_r); /* quarter-pel average */
    dst0 = __msa_aver_u_b(res, dst0); /* average with dst */
    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* H.264 luma quarter-pel (0, 3/4) MC with destination averaging, 4x4.
 * Same vertical 6-tap filtering as the mc01 case, but the quarter-pel
 * average uses the integer rows src3..src6 (one below the output
 * position), then the result is averaged with dst.
 * NOTE(review): out10/out32 declarations, continuation lines and braces
 * were elided by extraction; visible body kept byte-identical. */
void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
    uint32_t tp0, tp1, tp2, tp3;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res, dst0 = { 0 };
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2); /* filter needs two rows above the block */
    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332); /* to signed domain */
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5); /* (x + 16) >> 5 */
    SAT_SH2_SH(out10, out32, 7);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
    res = PCKEV_XORI128_UB(out10, out32); /* pack to bytes, back to unsigned */
    /* gather the four integer-pel rows src3..src6 into one vector */
    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    res = __msa_aver_u_b(res, (v16u8) src32_r); /* quarter-pel average */
    dst0 = __msa_aver_u_b(res, dst0); /* average with dst */
    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* Diagonal quarter-pel MC + dst averaging, 16x16: delegates to the shared
 * horizontal/vertical quarter-pel helper (per its name, it also averages
 * into dst). Horizontal operand starts at src - 2.
 * NOTE(review): remaining call arguments / braces elided by extraction. */
void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
/* Diagonal quarter-pel MC + dst averaging, 16x16: horizontal operand at
 * src - 2, vertical operand shifted one sample right of src - 2*stride.
 * NOTE(review): remaining call arguments / braces elided by extraction. */
void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
                                           src - (stride * 2) +
/* Diagonal quarter-pel MC + dst averaging, 16x16: horizontal operand
 * shifted one row down (src + stride - 2).
 * NOTE(review): remaining call arguments / braces elided by extraction. */
void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
/* Diagonal quarter-pel MC + dst averaging, 16x16: horizontal operand one
 * row down, vertical operand one sample right.
 * NOTE(review): remaining call arguments / braces elided by extraction. */
void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
                                           src - (stride * 2) +
/* 8x8 luma MC at quarter-pel position (1,1), averaged with dst.
 * 8x8 counterpart of ff_avg_h264_qpel16_mc11_msa.
 * NOTE(review): trailing helper arguments are cut off in this excerpt. */
4053 void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
4056 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
/* 8x8 luma MC at quarter-pel position (3,1), averaged with dst.
 * Vertical source advanced by sizeof(uint8_t) for the right-hand
 * horizontal quarter position. */
4061 void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
4064 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
4065 src - (stride * 2) +
4066 sizeof(uint8_t), dst, stride);
/* 8x8 luma MC at quarter-pel position (1,3), averaged with dst.
 * Horizontal source starts one row down (src + stride).
 * NOTE(review): trailing helper arguments are cut off in this excerpt. */
4069 void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
4072 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
/* 8x8 luma MC at quarter-pel position (3,3), averaged with dst.
 * Row-down horizontal source plus byte-right vertical source. */
4077 void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
4080 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4081 src - (stride * 2) +
4082 sizeof(uint8_t), dst, stride);
/* 4x4 luma MC at quarter-pel position (1,1), averaged with dst.
 * 4x4 counterpart of ff_avg_h264_qpel16_mc11_msa.
 * NOTE(review): trailing helper arguments are cut off in this excerpt. */
4086 void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
4089 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
/* 4x4 luma MC at quarter-pel position (3,1), averaged with dst.
 * Vertical source advanced one byte for the right-hand quarter. */
4094 void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
4097 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
4098 src - (stride * 2) +
4099 sizeof(uint8_t), dst, stride);
/* 4x4 luma MC at quarter-pel position (1,3), averaged with dst.
 * Horizontal source starts one row down (src + stride).
 * NOTE(review): trailing helper arguments are cut off in this excerpt. */
4102 void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
4105 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
/* 4x4 luma MC at quarter-pel position (3,3), averaged with dst.
 * Row-down horizontal source plus byte-right vertical source. */
4110 void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
4113 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4114 src - (stride * 2) +
4115 sizeof(uint8_t), dst, stride);
/* 16x16 luma MC at quarter-pel position (2,1), averaged with dst.
 * Pipeline: 6-tap horizontal half-pel filter per row (AVC_HORZ_FILTER_SH,
 * 16-bit intermediates), then the 6-tap vertical filter over those
 * intermediates (32-bit accumulation via AVC_DOT_SW3_SW), then the 2D
 * result is averaged with the rounded horizontal half-pel rows
 * (__msa_aver_s_h) to land on the quarter position, and finally averaged
 * with the existing dst pixels.
 * The 16-wide block is processed as two 8-wide column passes
 * (multiple8_cnt), each covering 16 rows in four loop iterations of
 * 2+2 rows. */
4118 void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
4121 uint64_t tp0, tp1, tp2, tp3;
4122 uint8_t *dst_tmp = dst;
/* top-left corner of the 6-tap support window: 2 rows up, 2 cols left */
4123 const uint8_t *src_tmp = src - (2 * stride) - 2;
4124 uint32_t multiple8_cnt, loop_cnt;
/* 6-tap H.264 luma filter (1, -5, 20, 20, -5, 1) packed into 32-bit
 * words: 0xfffb0001 = {-5, 1}, 0x140014 = {20, 20}, 0x1fffb = {1, -5} */
4125 const int32_t filt_const0 = 0xfffb0001;
4126 const int32_t filt_const1 = 0x140014;
4127 const int32_t filt_const2 = 0x1fffb;
4128 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4129 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4131 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4132 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4133 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4134 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4135 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4136 v8i16 hz_out87_l, filt0, filt1, filt2;
4137 v4i32 tmp0_w, tmp1_w;
4139 filt0 = (v8i16) __msa_fill_w(filt_const0);
4140 filt1 = (v8i16) __msa_fill_w(filt_const1);
4141 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* byte-shuffle masks for the horizontal 6-tap filter */
4143 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* two 8-pixel-wide column passes cover the 16-wide block.
 * NOTE(review): the per-pass src/dst reseeding from src_tmp/dst_tmp is
 * not visible in this excerpt — confirm against the full file. */
4145 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* prime the vertical filter with the first 5 horizontally filtered rows */
4149 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4150 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4151 src += (5 * stride);
4153 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4154 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4155 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4156 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4157 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4159 for (loop_cnt = 4; loop_cnt--;) {
4160 LD_SB2(src, stride, src5, src6);
4161 src += (2 * stride);
4163 XORI_B2_128_SB(src5, src6);
4164 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4165 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
/* interleave consecutive rows into right/left halves for the vertical
 * 32-bit dot products */
4166 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4167 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4169 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4170 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4172 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r,
4174 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l,
4176 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4178 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4180 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4181 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4183 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4185 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* rounded+saturated horizontal half-pel rows 2/3; averaging them with
 * the 2D result selects the upper vertical quarter position (mc21) */
4187 tmp1 = __msa_srari_h(hz_out2, 5);
4188 tmp3 = __msa_srari_h(hz_out3, 5);
4189 SAT_SH2_SH(tmp1, tmp3, 7);
4191 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4192 tmp1 = __msa_aver_s_h(tmp2, tmp3);
/* average with the existing dst pixels (avg-MC semantics) */
4194 LD2(dst, stride, tp0, tp1);
4195 INSERT_D2_UB(tp0, tp1, dst0);
4197 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4198 dst0 = __msa_aver_u_b(out0, dst0);
4199 ST8x2_UB(dst0, dst, stride);
4200 dst += (2 * stride);
/* second pair of output rows for this iteration */
4202 LD_SB2(src, stride, src7, src8);
4203 src += (2 * stride);
4205 XORI_B2_128_SB(src7, src8);
4206 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4207 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4208 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4210 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4212 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4214 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4216 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4217 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4219 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4221 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4223 tmp5 = __msa_srari_h(hz_out4, 5);
4224 tmp7 = __msa_srari_h(hz_out5, 5);
4225 SAT_SH2_SH(tmp5, tmp7, 7);
4227 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4228 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4230 LD2(dst, stride, tp2, tp3);
4231 INSERT_D2_UB(tp2, tp3, dst1);
4233 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4234 dst1 = __msa_aver_u_b(out1, dst1);
4235 ST8x2_UB(dst1, dst, stride);
4236 dst += (2 * stride);
/* 16x16 luma MC at quarter-pel position (2,3), averaged with dst.
 * Same structure as ff_avg_h264_qpel16_mc21_msa (horizontal half-pel
 * filter, then vertical filter, then quarter-pel average, then dst
 * average). The only difference visible here is that the horizontal
 * half-pel rows used in the quarter-pel average are shifted down by one
 * (hz_out3/4 and hz_out5/6 instead of hz_out2/3 and hz_out4/5), which
 * selects the lower vertical quarter position. */
4250 void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
4253 uint64_t tp0, tp1, tp2, tp3;
4254 uint8_t *dst_tmp = dst;
/* 6-tap support window starts 2 rows up and 2 columns left */
4255 const uint8_t *src_tmp = src - (2 * stride) - 2;
4256 uint32_t multiple8_cnt, loop_cnt;
/* packed 6-tap coefficients (1, -5, 20, 20, -5, 1) */
4257 const int32_t filt_const0 = 0xfffb0001;
4258 const int32_t filt_const1 = 0x140014;
4259 const int32_t filt_const2 = 0x1fffb;
4260 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
4261 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
4263 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4264 v8i16 hz_out7, hz_out8, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4265 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4266 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
4267 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
4268 v8i16 hz_out87_l, filt0, filt1, filt2;
4269 v4i32 tmp0_w, tmp1_w;
4271 filt0 = (v8i16) __msa_fill_w(filt_const0);
4272 filt1 = (v8i16) __msa_fill_w(filt_const1);
4273 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* shuffle masks for the horizontal 6-tap filter */
4275 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* two 8-wide column passes; per-pass src/dst reseeding from
 * src_tmp/dst_tmp is not visible in this excerpt — confirm. */
4277 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* prime the vertical filter with 5 horizontally filtered rows */
4281 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4282 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4283 src += (5 * stride);
4285 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4286 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4287 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4288 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4289 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4291 for (loop_cnt = 4; loop_cnt--;) {
4292 LD_SB2(src, stride, src5, src6);
4293 src += (2 * stride);
4295 XORI_B2_128_SB(src5, src6);
4296 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4297 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4298 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4299 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
4301 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
4302 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
4304 ILVR_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_r, hz_out65_r);
4305 ILVL_H2_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out54_l, hz_out65_l);
4307 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
4309 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
4311 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4312 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
4314 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
4316 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* mc23: average against hz_out3/hz_out4 (one row lower than mc21) */
4318 tmp1 = __msa_srari_h(hz_out3, 5);
4319 tmp3 = __msa_srari_h(hz_out4, 5);
4320 SAT_SH2_SH(tmp1, tmp3, 7);
4322 tmp0 = __msa_aver_s_h(tmp0, tmp1);
4323 tmp1 = __msa_aver_s_h(tmp2, tmp3);
/* average with existing dst pixels */
4325 LD2(dst, stride, tp0, tp1);
4326 INSERT_D2_UB(tp0, tp1, dst0);
4327 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4328 dst0 = __msa_aver_u_b(out0, dst0);
4329 ST8x2_UB(dst0, dst, stride);
4330 dst += (2 * stride);
/* second pair of output rows for this iteration */
4332 LD_SB2(src, stride, src7, src8);
4333 src += (2 * stride);
4335 XORI_B2_128_SB(src7, src8);
4336 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4337 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4338 ILVR_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_r,
4340 ILVL_H2_SH(hz_out7, hz_out6, hz_out8, hz_out7, hz_out76_l,
4342 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
4344 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
4346 tmp4 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4347 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
4349 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
4351 tmp6 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4353 tmp5 = __msa_srari_h(hz_out5, 5);
4354 tmp7 = __msa_srari_h(hz_out6, 5);
4355 SAT_SH2_SH(tmp5, tmp7, 7);
4357 tmp2 = __msa_aver_s_h(tmp4, tmp5);
4358 tmp3 = __msa_aver_s_h(tmp6, tmp7);
4360 LD2(dst, stride, tp2, tp3);
4361 INSERT_D2_UB(tp2, tp3, dst1);
4362 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4363 dst1 = __msa_aver_u_b(out1, dst1);
4364 ST8x2_UB(dst1, dst, stride);
4365 dst += (2 * stride);
/* 8x8 luma MC at quarter-pel position (2,1), averaged with dst.
 * Fully unrolled 8x8 variant of the mc21 pipeline: horizontal 6-tap
 * half-pel filter per row, vertical 6-tap filter over the 16-bit
 * intermediates with 32-bit accumulation, average with the rounded
 * horizontal half-pel rows (upper quarter position), then average with
 * dst. Rows 0-3 are produced first, then rows 4-7. */
4379 void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
/* packed 6-tap coefficients (1, -5, 20, 20, -5, 1) */
4382 const int32_t filt_const0 = 0xfffb0001;
4383 const int32_t filt_const1 = 0x140014;
4384 const int32_t filt_const2 = 0x1fffb;
4385 uint64_t tp0, tp1, tp2, tp3;
4386 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4387 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4388 v16i8 src11, src12, mask0, mask1, mask2;
4389 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4390 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4391 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4392 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4393 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4394 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4395 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4396 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4397 v4i32 tmp0_w, tmp1_w;
/* shuffle masks for the horizontal filter */
4399 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4401 filt0 = (v8i16) __msa_fill_w(filt_const0);
4402 filt1 = (v8i16) __msa_fill_w(filt_const1);
4403 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* move to the top-left of the 6-tap support window */
4405 src -= ((2 * stride) + 2);
/* horizontally filter the first 9 rows (5 + 4) */
4407 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4408 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4409 src += (5 * stride);
4411 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4412 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4413 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4414 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4415 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4417 LD_SB4(src, stride, src5, src6, src7, src8);
4418 src += (4 * stride);
4419 XORI_B4_128_SB(src5, src6, src7, src8);
4421 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4422 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4423 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4424 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave row pairs (right/left halves) for the vertical filter */
4426 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4427 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4428 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4429 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4430 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4431 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4432 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4433 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* vertical 6-tap filter producing output rows 0-3 */
4435 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4437 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4439 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4440 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4442 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4444 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4445 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4447 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4449 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4450 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4452 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4454 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* mc21: quarter-pel average against rounded half-pel rows 2-5 */
4456 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
4457 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
4459 LD4(dst, stride, tp0, tp1, tp2, tp3);
4460 INSERT_D2_UB(tp0, tp1, dst0);
4461 INSERT_D2_UB(tp2, tp3, dst1);
4463 tmp0 = __msa_aver_s_h(tmp0, hz_out2);
4464 tmp1 = __msa_aver_s_h(tmp1, hz_out3);
4465 tmp2 = __msa_aver_s_h(tmp2, hz_out4);
4466 tmp3 = __msa_aver_s_h(tmp3, hz_out5);
4468 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4469 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4470 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4471 ST8x4_UB(dst0, dst1, dst, stride);
4472 dst += (4 * stride);
/* second half: 4 more source rows produce output rows 4-7 */
4474 LD_SB4(src, stride, src9, src10, src11, src12);
4475 XORI_B4_128_SB(src9, src10, src11, src12);
4476 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4477 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4478 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4479 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
4480 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4481 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4483 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4484 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4486 tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4488 tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4490 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4491 tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4493 tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4495 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4496 tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4498 tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4500 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4501 tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4503 tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4505 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4507 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
4508 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
4510 LD4(dst, stride, tp0, tp1, tp2, tp3);
4511 INSERT_D2_UB(tp0, tp1, dst0);
4512 INSERT_D2_UB(tp2, tp3, dst1);
4514 tmp0 = __msa_aver_s_h(tmp0, hz_out6);
4515 tmp1 = __msa_aver_s_h(tmp1, hz_out7);
4516 tmp2 = __msa_aver_s_h(tmp2, hz_out8);
4517 tmp3 = __msa_aver_s_h(tmp3, hz_out9);
4519 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4520 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4521 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4522 ST8x4_UB(dst0, dst1, dst, stride);
/* 8x8 luma MC at quarter-pel position (2,3), averaged with dst.
 * Identical structure to ff_avg_h264_qpel8_mc21_msa except the
 * quarter-pel average uses half-pel rows shifted down by one
 * (hz_out3-6 / hz_out7-10 instead of hz_out2-5 / hz_out6-9), selecting
 * the lower vertical quarter position. */
4525 void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
/* packed 6-tap coefficients (1, -5, 20, 20, -5, 1) */
4528 const int32_t filt_const0 = 0xfffb0001;
4529 const int32_t filt_const1 = 0x140014;
4530 const int32_t filt_const2 = 0x1fffb;
4531 uint64_t tp0, tp1, tp2, tp3;
4532 v16u8 dst0 = { 0 }, dst1 = { 0 }, out0, out1;
4533 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4534 v16i8 src11, src12, mask0, mask1, mask2;
4535 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4536 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4537 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4538 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4539 v8i16 hz_out1110_r, hz_out1211_r, tmp0, tmp1, tmp2, tmp3;
4540 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4541 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4542 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4543 v4i32 tmp0_w, tmp1_w;
4545 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
4547 filt0 = (v8i16) __msa_fill_w(filt_const0);
4548 filt1 = (v8i16) __msa_fill_w(filt_const1);
4549 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* move to the top-left of the 6-tap support window */
4551 src -= ((2 * stride) + 2);
/* horizontally filter the first 9 rows (5 + 4) */
4553 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4554 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4555 src += (5 * stride);
4557 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4558 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4559 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4560 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4561 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
4563 LD_SB4(src, stride, src5, src6, src7, src8);
4564 src += (4 * stride);
4565 XORI_B4_128_SB(src5, src6, src7, src8);
4567 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
4568 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
4569 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
4570 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave row pairs for the vertical 32-bit dot products */
4572 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4573 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4574 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4575 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4576 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4577 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4578 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4579 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* vertical 6-tap filter producing output rows 0-3 */
4581 tmp0_w = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4583 tmp1_w = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4585 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4586 tmp0_w = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4588 tmp1_w = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4590 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4591 tmp0_w = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4593 tmp1_w = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4595 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4596 tmp0_w = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4598 tmp1_w = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4600 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
/* mc23: quarter-pel average against rounded half-pel rows 3-6 */
4602 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
4603 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
4605 LD4(dst, stride, tp0, tp1, tp2, tp3);
4606 INSERT_D2_UB(tp0, tp1, dst0);
4607 INSERT_D2_UB(tp2, tp3, dst1);
4609 tmp0 = __msa_aver_s_h(tmp0, hz_out3);
4610 tmp1 = __msa_aver_s_h(tmp1, hz_out4);
4611 tmp2 = __msa_aver_s_h(tmp2, hz_out5);
4612 tmp3 = __msa_aver_s_h(tmp3, hz_out6);
4614 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4615 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4616 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4617 ST8x4_UB(dst0, dst1, dst, stride);
4618 dst += (4 * stride);
/* second half: 4 more source rows produce output rows 4-7 */
4620 LD_SB4(src, stride, src9, src10, src11, src12);
4621 XORI_B4_128_SB(src9, src10, src11, src12);
4622 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
4623 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
4624 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
4625 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
4626 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4627 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4629 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4630 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
4632 tmp0_w = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4634 tmp1_w = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4636 tmp0 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4637 tmp0_w = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4639 tmp1_w = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4641 tmp1 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4642 tmp0_w = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4644 tmp1_w = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4646 tmp2 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4647 tmp0_w = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4649 tmp1_w = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4651 tmp3 = __msa_pckev_h((v8i16) tmp1_w, (v8i16) tmp0_w);
4653 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
4654 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
4656 LD4(dst, stride, tp0, tp1, tp2, tp3);
4657 INSERT_D2_UB(tp0, tp1, dst0);
4658 INSERT_D2_UB(tp2, tp3, dst1);
4660 tmp0 = __msa_aver_s_h(tmp0, hz_out7);
4661 tmp1 = __msa_aver_s_h(tmp1, hz_out8);
4662 tmp2 = __msa_aver_s_h(tmp2, hz_out9);
4663 tmp3 = __msa_aver_s_h(tmp3, hz_out10);
4665 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
4666 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
4667 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
4668 ST8x4_UB(dst0, dst1, dst, stride);
/* 4x4 luma MC at quarter-pel position (2,1), averaged with dst.
 * Uses the dual-row shuffle masks at luma_mask_arr[48], so each
 * AVC_HORZ_FILTER_SH call filters two 4-pixel rows at once; PCKOD then
 * separates the interleaved rows. Vertical 6-tap filter follows, the
 * result is averaged with the rounded horizontal half-pel rows
 * (hz_out2/hz_out4), and finally averaged with the existing dst. */
4671 void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
4674 uint32_t tp0, tp1, tp2, tp3;
/* packed 6-tap coefficients (1, -5, 20, 20, -5, 1) */
4675 const int32_t filt_const0 = 0xfffb0001;
4676 const int32_t filt_const1 = 0x140014;
4677 const int32_t filt_const2 = 0x1fffb;
4678 v16u8 res, out = { 0 };
4679 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4680 v16i8 mask0, mask1, mask2;
4681 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4682 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4683 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4684 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* two-rows-per-vector masks (second bank of luma_mask_arr) */
4687 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4689 filt0 = (v8i16) __msa_fill_w(filt_const0);
4690 filt1 = (v8i16) __msa_fill_w(filt_const1);
4691 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* top-left of the 6-tap support window */
4693 src -= ((2 * stride) + 2);
4695 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4696 src += (5 * stride);
4697 LD_SB4(src, stride, src5, src6, src7, src8);
4699 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4700 XORI_B4_128_SB(src5, src6, src7, src8);
/* each call filters a pair of rows; PCKOD extracts the odd row */
4702 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4703 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4704 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4705 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4706 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4707 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4708 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4710 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4711 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4712 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4713 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* vertical 6-tap filter over the horizontal intermediates */
4715 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4717 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4719 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4720 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4722 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4724 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc21: quarter-pel average against rounded half-pel rows */
4726 SRARI_H2_SH(hz_out2, hz_out4, 5);
4727 SAT_SH2_SH(hz_out2, hz_out4, 7);
4729 dst0 = __msa_aver_s_h(dst0, hz_out2);
4730 dst1 = __msa_aver_s_h(dst1, hz_out4);
/* average with existing dst pixels and store 4x4 */
4731 LW4(dst, stride, tp0, tp1, tp2, tp3);
4732 INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4733 res = PCKEV_XORI128_UB(dst0, dst1);
4734 res = __msa_aver_u_b(res, out);
4735 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 4x4 luma MC at quarter-pel position (2,3), averaged with dst.
 * Identical structure to ff_avg_h264_qpel4_mc21_msa except the rows
 * used for the quarter-pel average are one lower: PCKEV_D2_SH gathers
 * half-pel rows 3-6 before the rounding/average step. */
4738 void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
/* packed 6-tap coefficients (1, -5, 20, 20, -5, 1) */
4741 const int32_t filt_const0 = 0xfffb0001;
4742 const int32_t filt_const1 = 0x140014;
4743 const int32_t filt_const2 = 0x1fffb;
4744 uint32_t tp0, tp1, tp2, tp3;
4745 v16u8 res, out = { 0 };
4746 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4747 v16i8 mask0, mask1, mask2;
4748 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4749 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4750 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4751 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* dual-row masks (two 4-pixel rows filtered per vector) */
4754 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4756 filt0 = (v8i16) __msa_fill_w(filt_const0);
4757 filt1 = (v8i16) __msa_fill_w(filt_const1);
4758 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* top-left of the 6-tap support window */
4760 src -= ((2 * stride) + 2);
4762 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4763 src += (5 * stride);
4764 LD_SB4(src, stride, src5, src6, src7, src8);
4766 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4767 XORI_B4_128_SB(src5, src6, src7, src8);
4769 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4770 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4771 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4772 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4773 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
4774 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4775 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
4777 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4778 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4779 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4780 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* vertical 6-tap filter */
4782 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4784 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4786 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4787 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4789 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4791 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc23: gather half-pel rows 3-6 for the quarter-pel average */
4793 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
4794 SRARI_H2_SH(hz_out0, hz_out1, 5);
4795 SAT_SH2_SH(hz_out0, hz_out1, 7);
4797 dst0 = __msa_aver_s_h(dst0, hz_out0);
4798 dst1 = __msa_aver_s_h(dst1, hz_out1);
/* average with existing dst pixels and store 4x4 */
4799 LW4(dst, stride, tp0, tp1, tp2, tp3);
4800 INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
4801 res = PCKEV_XORI128_UB(dst0, dst1);
4802 res = __msa_aver_u_b(res, out);
4803 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 16x16 luma MC at half-pel position (0,2): pure vertical 6-tap
 * interpolation, averaged with the existing dst pixels. Bytes are
 * processed directly (no horizontal stage): consecutive rows are
 * interleaved and fed to AVC_DOT_SH3_SH with byte-packed coefficients.
 * Four output rows per loop iteration, four iterations. */
4806 void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
/* 6-tap coefficients packed as byte pairs: 0xfb01 = {-5, 1},
 * 0x1414 = {20, 20}, 0x1fb = {1, -5} */
4810 int16_t filt_const0 = 0xfb01;
4811 int16_t filt_const1 = 0x1414;
4812 int16_t filt_const2 = 0x1fb;
4813 v16u8 res0, res1, res2, res3, dst0, dst1, dst2, dst3;
4814 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4815 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4816 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
4817 v16i8 src65_l, src87_l, filt0, filt1, filt2;
4818 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
4820 filt0 = (v16i8) __msa_fill_h(filt_const0);
4821 filt1 = (v16i8) __msa_fill_h(filt_const1);
4822 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* start 2 rows above for the 6-tap support */
4823 src -= (stride * 2);
/* prime the filter with the first 5 rows */
4825 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4826 src += (5 * stride);
4828 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4829 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4831 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
4834 for (loop_cnt = 4; loop_cnt--;) {
4835 LD_SB4(src, stride, src5, src6, src7, src8);
4836 src += (4 * stride);
4838 XORI_B4_128_SB(src5, src6, src7, src8);
4839 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
4840 src65_r, src76_r, src87_r);
4841 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
4842 src65_l, src76_l, src87_l);
/* vertical 6-tap dot products, right and left byte halves */
4843 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
4844 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
4845 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
4846 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
4847 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
4848 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
4849 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
4850 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* round (>> 5 with rounding), saturate, pack, average with dst */
4851 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4852 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4853 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
4854 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
4855 LD_UB4(dst, stride, dst0, dst1, dst2, dst3);
4856 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
4857 out3_r, res0, res1, res2, res3);
4858 XORI_B4_128_UB(res0, res1, res2, res3);
4859 AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
4860 AVER_UB2_UB(res2, dst2, res3, dst3, res2, res3);
4861 ST_UB4(res0, res1, res2, res3, dst, stride);
4862 dst += (4 * stride);
/* 8x8 luma MC at half-pel position (0,2): pure vertical 6-tap
 * interpolation, averaged with dst. Fully unrolled: the first 4 output
 * rows come from rows 0-8, the last 4 from rows 5-12 (loaded as
 * src0..src3 reusing the register names). */
4876 void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
4879 uint64_t tp0, tp1, tp2, tp3;
/* 6-tap coefficients packed as byte pairs: {-5,1}, {20,20}, {1,-5} */
4880 const int16_t filt_const0 = 0xfb01;
4881 const int16_t filt_const1 = 0x1414;
4882 const int16_t filt_const2 = 0x1fb;
4883 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
4884 v16u8 out0, out1, out2, out3;
4885 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10, src109_r;
4886 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
4887 v16i8 filt0, filt1, filt2;
4888 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
4890 filt0 = (v16i8) __msa_fill_h(filt_const0);
4891 filt1 = (v16i8) __msa_fill_h(filt_const1);
4892 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* start 2 rows above for the 6-tap support */
4894 src -= (stride * 2);
/* prime with rows 0-4 */
4896 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4897 src += (5 * stride);
4899 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4900 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
/* rows 5-8 -> output rows 0-3 */
4903 LD_SB4(src, stride, src7, src8, src9, src10);
4904 src += (4 * stride);
4905 XORI_B4_128_SB(src7, src8, src9, src10);
4906 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9, src76_r,
4907 src87_r, src98_r, src109_r);
4908 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
4909 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
4910 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
4911 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
/* rows 9-12 (reusing src0-src3) -> output rows 4-7 */
4913 LD_SB4(src, stride, src0, src1, src2, src3);
4914 XORI_B4_128_SB(src0, src1, src2, src3);
4915 ILVR_B4_SB(src0, src10, src1, src0, src2, src1, src3, src2, src10_r,
4916 src21_r, src32_r, src43_r);
4917 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src10_r, filt0, filt1, filt2);
4918 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src21_r, filt0, filt1, filt2);
4919 out6_r = AVC_DOT_SH3_SH(src98_r, src10_r, src32_r, filt0, filt1, filt2);
4920 out7_r = AVC_DOT_SH3_SH(src109_r, src21_r, src43_r, filt0, filt1, filt2);
/* load all 8 dst rows for the final average */
4922 LD4(dst, stride, tp0, tp1, tp2, tp3);
4923 INSERT_D2_UB(tp0, tp1, dst0);
4924 INSERT_D2_UB(tp2, tp3, dst1);
4925 LD4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
4926 INSERT_D2_UB(tp0, tp1, dst2);
4927 INSERT_D2_UB(tp2, tp3, dst3);
/* round, saturate, pack to bytes, average with dst, store */
4929 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
4930 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
4931 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
4932 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
4933 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
4934 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
4935 out2 = PCKEV_XORI128_UB(out4_r, out5_r);
4936 out3 = PCKEV_XORI128_UB(out6_r, out7_r);
4937 AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
4939 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* 4x4 luma block: 6-tap vertical filter, (x+16)>>5 rounding, 8-bit
 * saturation, then a rounding average with the current dst contents.
 * Two 4-pixel rows are packed per vector (the *2110/*4332 names).
 * NOTE(review): mc02 presumably is the vertical half-sample position --
 * confirm against the dispatch table. */
4942 void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
4945 uint32_t tp0, tp1, tp2, tp3;
/* Packed 6-tap pairs: (-5,1), (20,20), (1,-5). */
4946 int16_t filt_const0 = 0xfb01;
4947 int16_t filt_const1 = 0x1414;
4948 int16_t filt_const2 = 0x1fb;
4949 v16u8 res, dst0 = { 0 };
4950 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4951 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
4952 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
4955 filt0 = (v16i8) __msa_fill_h(filt_const0);
4956 filt1 = (v16i8) __msa_fill_h(filt_const1);
4957 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Start 2 rows above the block for the 6-tap window. */
4959 src -= (stride * 2);
4960 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4961 src += (5 * stride);
/* Interleave adjacent rows, then fold two row-pairs into one vector. */
4963 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
4965 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
4966 XORI_B2_128_SB(src2110, src4332);
4967 LD_SB4(src, stride, src5, src6, src7, src8);
4968 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
4970 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
4971 XORI_B2_128_SB(src6554, src8776);
/* Two output vectors cover all four 4-pixel rows. */
4972 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
4973 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* Round (x+16)>>5 and clamp before repacking to bytes. */
4974 SRARI_H2_SH(out10, out32, 5);
4975 SAT_SH2_SH(out10, out32, 7);
/* Gather the four existing 32-bit dst rows and average against them. */
4976 LW4(dst, stride, tp0, tp1, tp2, tp3);
4977 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
4978 res = PCKEV_XORI128_UB(out10, out32);
4979 dst0 = __msa_aver_u_b(res, dst0);
4980 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* 16-wide luma block: vertical 6-tap pass (byte-domain macro) followed by a
 * horizontal 6-tap pass done with 16-bit shuffles, rounded by (x+512)>>10;
 * the final pixel is the average of that 2-D result with the vertically
 * rounded half-pel column (srari 5), then averaged once more with dst.
 * NOTE(review): this excerpt elides lines -- mask3/mask4/mask5 and dst0 are
 * used below but their initialization is not visible here; likewise the
 * per-row store/pointer advance at the loop tail.  Confirm in full source. */
4983 void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
4988 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4990 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
4991 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
4992 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
4993 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Shuffle masks pairing 16-bit elements for the horizontal 6-tap:
 * (a0,a5),(a1,a4),(a2,a3) groupings per output pixel. */
4994 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
4995 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
4996 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
4997 v8i16 minus5h = __msa_ldi_h(-5);
4998 v8i16 plus20h = __msa_ldi_h(20);
/* 2 rows above and 2 columns left of the block for both 6-tap windows. */
5004 src -= ((2 * stride) + 2);
5006 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5007 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5008 src += (5 * stride);
5009 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5010 XORI_B5_128_SB(src7, src8, src9, src10, src11);
/* One destination row per iteration; left/right 8-pixel halves in parallel. */
5012 for (row = 16; row--;) {
5013 LD_SB2(src, 8, src5, src6);
5015 XORI_B2_128_SB(src5, src6);
/* Vertical 6-tap on the two halves -> 16-bit intermediates. */
5018 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5020 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5022 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5023 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5024 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5025 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5026 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5027 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5028 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5029 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Horizontal 6-tap in 32-bit: (a0+a5) - 5*(a1+a4) + 20*(a2+a3). */
5030 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5031 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5032 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5033 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5034 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5035 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5036 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5037 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* 2-D result rounds by (x+512)>>10; half-pel column rounds by (x+16)>>5. */
5038 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5039 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
5040 tmp0 = __msa_srari_h(shf_vec2, 5);
5041 tmp1 = __msa_srari_h(shf_vec5, 5);
5042 tmp2 = __msa_srari_h(shf_vec8, 5);
5043 tmp3 = __msa_srari_h(shf_vec11, 5);
5044 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
/* Average 2-D result with the half-pel column, then with existing dst. */
5045 PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
5046 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5047 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5048 tmp1 = __msa_aver_s_h(tmp3, tmp1);
5049 out = PCKEV_XORI128_UB(tmp0, tmp1);
5050 out = __msa_aver_u_b(out, dst0);
/* 16-wide luma block, mirror of the mc12 variant: the same vertical-then-
 * horizontal 6-tap pipeline, but the half-pel column used for the final
 * average is taken with pckod (odd elements) instead of pckev -- i.e. the
 * other neighbouring half-sample position.
 * NOTE(review): excerpt elides lines -- mask3/mask4/mask5 and dst0 are used
 * below without visible initialization; the loop-tail store/advance is also
 * not visible.  Confirm in full source. */
5067 void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
5072 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5074 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3, mask3;
5075 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5076 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
5077 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Element-pairing masks for the horizontal 6-tap on 16-bit data. */
5078 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5079 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5080 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5081 v8i16 minus5h = __msa_ldi_h(-5);
5082 v8i16 plus20h = __msa_ldi_h(20);
/* Back up 2 rows and 2 columns for the two 6-tap windows. */
5088 src -= ((2 * stride) + 2);
5090 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5091 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
5092 src += (5 * stride);
5093 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5094 XORI_B5_128_SB(src7, src8, src9, src10, src11);
/* One destination row per iteration; two 8-pixel halves per row. */
5096 for (row = 16; row--;) {
5097 LD_SB2(src, 8, src5, src6);
5099 XORI_B2_128_SB(src5, src6);
/* Vertical 6-tap on both halves -> 16-bit intermediates. */
5102 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5104 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
5106 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5107 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5108 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5109 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5110 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5111 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5112 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5113 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Horizontal 6-tap accumulated in 32 bits: sums +/- 5x and 20x pairs. */
5114 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5115 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5116 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5117 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5118 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5119 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5120 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5121 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Round: (x+512)>>10 for the 2-D path, (x+16)>>5 for the half-pel column. */
5122 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5123 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
5124 tmp0 = __msa_srari_h(shf_vec2, 5);
5125 tmp1 = __msa_srari_h(shf_vec5, 5);
5126 tmp2 = __msa_srari_h(shf_vec8, 5);
5127 tmp3 = __msa_srari_h(shf_vec11, 5);
5128 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
/* pckod (vs pckev in mc12) picks the other half-pel column neighbour. */
5129 tmp0 = __msa_pckod_h(tmp2, tmp0);
5130 tmp1 = __msa_pckod_h(tmp3, tmp1);
5131 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5132 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5133 tmp1 = __msa_aver_s_h(tmp3, tmp1);
5134 out = PCKEV_XORI128_UB(tmp0, tmp1);
5135 out = __msa_aver_u_b(out, dst0);
/* 8-wide luma block: vertical 6-tap then horizontal 6-tap (16-bit shuffle
 * form), (x+512)>>10 rounding for the 2-D path; result is averaged with the
 * half-pel column (srari 5, pckev) and then with the current dst rows.
 * Two destination rows are produced per loop iteration.
 * NOTE(review): excerpt elides lines -- mask3/mask4/mask5 are used below but
 * their initialization is not visible here.  Confirm in full source. */
5152 void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
5157 v16u8 out, dst0 = { 0 };
5158 v16i8 src0, src1, src2, src3, src4, src5, src6;
5159 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5160 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5161 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5162 v8i16 mask3, mask4, mask5;
5163 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Element-pairing masks for the horizontal 6-tap on 16-bit data. */
5164 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5165 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5166 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5167 v8i16 minus5h = __msa_ldi_h(-5);
5168 v8i16 plus20h = __msa_ldi_h(20);
/* Back up 2 rows and 2 columns for the two 6-tap windows. */
5174 src -= ((2 * stride) + 2);
5176 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5177 src += (5 * stride);
5178 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* 4 iterations x 2 rows = 8 output rows. */
5180 for (row = 4; row--;) {
5181 LD_SB2(src, stride, src5, src6);
5182 src += (2 * stride);
5183 XORI_B2_128_SB(src5, src6);
/* Vertical 6-tap for the two successive output rows. */
5185 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5187 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5189 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5190 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5191 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5192 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5193 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5194 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5195 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5196 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Horizontal 6-tap in 32 bits: pair sums, -5x and +20x contributions. */
5197 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5198 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5199 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5200 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5201 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5202 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5203 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5204 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Round: (x+512)>>10 (2-D path) and (x+16)>>5 (half-pel column). */
5205 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5206 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
5207 tmp0 = __msa_srari_h(shf_vec2, 5);
5208 tmp1 = __msa_srari_h(shf_vec5, 5);
5209 tmp2 = __msa_srari_h(shf_vec8, 5);
5210 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Two 64-bit dst rows loaded for the final average. */
5211 LD2(dst, stride, tp0, tp1);
5212 INSERT_D2_UB(tp0, tp1, dst0);
5213 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
5214 PCKEV_H2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
5215 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
/* Average 2-D result with half-pel column, then with dst, and store. */
5216 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5217 tmp1 = __msa_aver_s_h(tmp3, tmp1);
5218 out = PCKEV_XORI128_UB(tmp0, tmp1);
5219 out = __msa_aver_u_b(out, dst0);
5220 ST8x2_UB(out, dst, stride);
5221 dst += (2 * stride);
/* 8-wide luma block, mirror of the 8-wide mc12 variant: identical vertical
 * then horizontal 6-tap pipeline, but the half-pel column is extracted with
 * pckod (odd elements) rather than pckev -- the other neighbouring
 * half-sample position -- before averaging with the 2-D result and dst.
 * NOTE(review): excerpt elides lines -- mask3/mask4/mask5 are used below but
 * their initialization is not visible here.  Confirm in full source. */
5231 void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
5236 v16u8 out, dst0 = { 0 };
5237 v16i8 src0, src1, src2, src3, src4, src5, src6;
5238 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, tmp0, tmp1, tmp2, tmp3;
5239 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5240 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
5241 v8i16 mask3, mask4, mask5;
5242 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Element-pairing masks for the horizontal 6-tap on 16-bit data. */
5243 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5244 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5245 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5246 v8i16 minus5h = __msa_ldi_h(-5);
5247 v8i16 plus20h = __msa_ldi_h(20);
/* Back up 2 rows and 2 columns for the two 6-tap windows. */
5253 src -= ((2 * stride) + 2);
5255 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5256 src += (5 * stride);
5257 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* 4 iterations x 2 rows = 8 output rows. */
5259 for (row = 4; row--;) {
5260 LD_SB2(src, stride, src5, src6);
5261 src += (2 * stride);
5262 XORI_B2_128_SB(src5, src6);
/* Vertical 6-tap for the two successive output rows. */
5264 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
5266 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
5268 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5269 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5270 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5271 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
5272 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
5273 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
5274 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
5275 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* Horizontal 6-tap in 32 bits: pair sums, -5x and +20x contributions. */
5276 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5277 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5278 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
5279 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
5280 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5281 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
5282 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
5283 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* Round: (x+512)>>10 (2-D path) and (x+16)>>5 (half-pel column). */
5284 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
5285 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
5286 tmp0 = __msa_srari_h(shf_vec2, 5);
5287 tmp1 = __msa_srari_h(shf_vec5, 5);
5288 tmp2 = __msa_srari_h(shf_vec8, 5);
5289 tmp3 = __msa_srari_h(shf_vec11, 5);
/* Two 64-bit dst rows loaded for the final average. */
5290 LD2(dst, stride, tp0, tp1);
5291 INSERT_D2_UB(tp0, tp1, dst0);
5292 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
/* pckod (vs pckev in mc12) selects the other half-pel column neighbour. */
5293 tmp0 = __msa_pckod_h(tmp2, tmp0);
5294 tmp1 = __msa_pckod_h(tmp3, tmp1);
5295 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, tmp2, tmp3);
5296 tmp0 = __msa_aver_s_h(tmp2, tmp0);
5297 tmp1 = __msa_aver_s_h(tmp3, tmp1);
5298 out = PCKEV_XORI128_UB(tmp0, tmp1);
5299 out = __msa_aver_u_b(out, dst0);
5300 ST8x2_UB(out, dst, stride);
5301 dst += (2 * stride);
/* 4x4 luma block: unrolled (no row loop) vertical 6-tap done with byte
 * interleaves + AVC_DOT_SH3_SH, then horizontal 6-tap via 16-bit shuffles
 * accumulated in 32 bits, rounded by (x+512)>>10; the 2-D result is averaged
 * with the (x+16)>>5-rounded half-pel column (even-lane ILVEV expansion) and
 * finally with the current dst pixels. */
5311 void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
5314 uint32_t tp0, tp1, tp2, tp3;
/* Vertical 6-tap (1,-5,20,20,-5,1) packed as 16-bit coefficient pairs. */
5315 const int16_t filt_const0 = 0xfb01;
5316 const int16_t filt_const1 = 0x1414;
5317 const int16_t filt_const2 = 0x1fb;
5318 v16u8 out, dstv = { 0 };
5319 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5320 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5321 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5322 v16i8 src76_l, src87_l, filt0, filt1, filt2;
5323 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5324 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5325 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Element-pairing masks for the horizontal 6-tap on 16-bit data. */
5326 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5327 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5328 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5329 v8i16 minus5h = __msa_ldi_h(-5);
5330 v8i16 plus20h = __msa_ldi_h(20);
5331 v8i16 zeros = { 0 };
5333 filt0 = (v16i8) __msa_fill_h(filt_const0);
5334 filt1 = (v16i8) __msa_fill_h(filt_const1);
5335 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Back up 2 rows and 2 columns for the two 6-tap windows. */
5337 src -= ((2 * stride) + 2);
5339 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5340 src += (5 * stride);
5341 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5342 LD_SB4(src, stride, src5, src6, src7, src8);
5343 XORI_B4_128_SB(src5, src6, src7, src8);
/* Interleave adjacent rows (right and left halves) for the dot products. */
5345 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5347 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5349 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5351 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* Vertical 6-tap for output rows 0 and 1. */
5353 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5354 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5355 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5356 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
5357 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5358 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5359 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5360 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* Horizontal 6-tap: pair sum, then -5x and +20x pair contributions. */
5361 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5362 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5363 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5364 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Vertical 6-tap for output rows 2 and 3. */
5366 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5367 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5368 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5369 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5370 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5371 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5372 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5373 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5374 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5375 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5376 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5377 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Round 2-D path by (x+512)>>10 and clamp. */
5379 SRARI_W2_SW(hz_res0, hz_res1, 10);
5380 SAT_SW2_SW(hz_res0, hz_res1, 7);
5381 SRARI_W2_SW(hz_res2, hz_res3, 10);
5382 SAT_SW2_SW(hz_res2, hz_res3, 7);
/* Half-pel column rounded by (x+16)>>5. */
5384 dst0 = __msa_srari_h(shf_vec2, 5);
5385 dst1 = __msa_srari_h(shf_vec5, 5);
5386 dst2 = __msa_srari_h(shf_vec6, 5);
5387 dst3 = __msa_srari_h(shf_vec7, 5);
5389 SAT_SH2_SH(dst0, dst1, 7);
5390 SAT_SH2_SH(dst2, dst3, 7);
/* Widen even 16-bit lanes to 32 bits so aver_s_w can pair them with hz_res. */
5391 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
5392 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
5394 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5395 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5396 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5397 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Average with the four existing 32-bit dst rows and store. */
5399 LW4(dst, stride, tp0, tp1, tp2, tp3);
5400 INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5401 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5402 out = PCKEV_XORI128_UB(dst0, dst2);
5403 out = __msa_aver_u_b(out, dstv);
5404 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 4x4 luma block, mirror of the 4x4 mc12 variant: same unrolled vertical
 * then horizontal 6-tap pipeline, but the half-pel column is taken from the
 * odd 16-bit lanes (ilvod with zeros) rather than the even lanes -- the
 * other neighbouring half-sample position -- before averaging with the 2-D
 * result and the current dst pixels. */
5407 void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
5410 uint32_t tp0, tp1, tp2, tp3;
/* Vertical 6-tap (1,-5,20,20,-5,1) packed as 16-bit coefficient pairs. */
5411 const int16_t filt_const0 = 0xfb01;
5412 const int16_t filt_const1 = 0x1414;
5413 const int16_t filt_const2 = 0x1fb;
5414 v16u8 out, dstv = { 0 };
5415 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5416 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
5417 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
5418 v16i8 src76_l, src87_l, filt0, filt1, filt2;
5419 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
5420 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
5421 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* Element-pairing masks for the horizontal 6-tap on 16-bit data. */
5422 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
5423 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
5424 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
5425 v8i16 minus5h = __msa_ldi_h(-5);
5426 v8i16 plus20h = __msa_ldi_h(20);
5427 v8i16 zeros = { 0 };
5429 filt0 = (v16i8) __msa_fill_h(filt_const0);
5430 filt1 = (v16i8) __msa_fill_h(filt_const1);
5431 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Back up 2 rows and 2 columns for the two 6-tap windows. */
5433 src -= ((2 * stride) + 2);
5435 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5436 src += (5 * stride);
5437 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5438 LD_SB4(src, stride, src5, src6, src7, src8);
5439 XORI_B4_128_SB(src5, src6, src7, src8);
/* Interleave adjacent rows (right and left halves) for the dot products. */
5441 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
5443 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
5445 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
5447 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* Vertical 6-tap for output rows 0 and 1. */
5449 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
5450 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
5451 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
5452 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
5453 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5454 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
5455 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5456 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* Horizontal 6-tap: pair sum, then -5x and +20x pair contributions. */
5457 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5458 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
5459 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5460 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* Vertical 6-tap for output rows 2 and 3. */
5462 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
5463 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
5464 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
5465 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
5466 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
5467 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
5468 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
5469 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
5470 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
5471 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
5472 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
5473 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* Round 2-D path by (x+512)>>10 and clamp. */
5475 SRARI_W2_SW(hz_res0, hz_res1, 10);
5476 SAT_SW2_SW(hz_res0, hz_res1, 7);
5477 SRARI_W2_SW(hz_res2, hz_res3, 10);
5478 SAT_SW2_SW(hz_res2, hz_res3, 7);
/* Half-pel column rounded by (x+16)>>5. */
5480 dst0 = __msa_srari_h(shf_vec2, 5);
5481 dst1 = __msa_srari_h(shf_vec5, 5);
5482 dst2 = __msa_srari_h(shf_vec6, 5);
5483 dst3 = __msa_srari_h(shf_vec7, 5);
5485 SAT_SH2_SH(dst0, dst1, 7);
5486 SAT_SH2_SH(dst2, dst3, 7);
/* ilvod (vs ilvev in mc12) widens the odd 16-bit lanes -- the other
 * half-pel column neighbour -- into 32-bit lanes for aver_s_w. */
5488 dst0 = __msa_ilvod_h(zeros, dst0);
5489 dst1 = __msa_ilvod_h(zeros, dst1);
5490 dst2 = __msa_ilvod_h(zeros, dst2);
5491 dst3 = __msa_ilvod_h(zeros, dst3);
5493 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
5494 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
5495 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
5496 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* Average with the four existing 32-bit dst rows and store. */
5498 LW4(dst, stride, tp0, tp1, tp2, tp3);
5499 INSERT_W4_UB(tp0, tp1, tp2, tp3, dstv);
5500 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
5501 out = PCKEV_XORI128_UB(dst0, dst2);
5502 out = __msa_aver_u_b(out, dstv);
5503 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 16x16 luma block, centre (2,2) position: horizontal 6-tap per row
 * (AVC_HORZ_FILTER_SH) producing 16-bit intermediates, then a vertical
 * 6-tap across those intermediates accumulated in 32 bits
 * (AVC_DOT_SW3_SW with word-packed taps), averaged with dst and stored.
 * Processed as two 8-wide halves (multiple8_cnt), 4 rows per inner pass.
 * NOTE(review): excerpt elides lines -- tmp0/tmp1 declarations, the
 * src/dst rebinding from src_tmp/dst_tmp at the top of the outer loop, and
 * the loop tails are not visible here.  Confirm in full source. */
5506 void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
/* Vertical taps as packed 32-bit pairs: (1,-5), (20,20), (-5,1). */
5509 const int32_t filt_const0 = 0xfffb0001;
5510 const int32_t filt_const1 = 0x140014;
5511 const int32_t filt_const2 = 0x1fffb;
/* Window starts 2 rows above and 2 columns left of the block. */
5512 const uint8_t *src_tmp = src - (2 * stride) - 2;
5513 uint8_t *dst_tmp = dst;
5514 uint64_t tp0, tp1, tp2, tp3;
5515 uint32_t multiple8_cnt, loop_cnt;
5516 v16u8 dst0, dst1, out0, out1;
5517 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5518 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5519 v8i16 hz_out7, hz_out8, res0, res1, res2, res3;
5520 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5521 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
5522 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
5523 v8i16 hz_out87_l, filt0, filt1, filt2;
5526 filt0 = (v8i16) __msa_fill_w(filt_const0);
5527 filt1 = (v8i16) __msa_fill_w(filt_const1);
5528 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Horizontal shuffle masks from the 8-wide table at luma_mask_arr[0]. */
5530 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* Two passes: left 8 columns, then right 8 columns. */
5532 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* Prime the first 5 horizontally-filtered rows for the vertical 6-tap. */
5536 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5537 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5538 src += (5 * stride);
5540 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5541 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5542 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5543 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5544 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* 4 iterations x 4 rows = 16 output rows per 8-wide half. */
5546 for (loop_cnt = 4; loop_cnt--;) {
5547 LD_SB4(src, stride, src0, src1, src2, src3);
5548 XORI_B4_128_SB(src0, src1, src2, src3);
5549 src += (4 * stride);
5551 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5552 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5553 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5554 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
/* Interleave successive intermediate rows for the 32-bit dot products. */
5555 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5556 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
5558 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
5559 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
5561 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5562 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
5564 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
5565 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* Vertical 6-tap in 32 bits, one output row per res0..res3. */
5568 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
5570 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
5572 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5573 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
5575 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
5577 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5578 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
5580 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
5582 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5583 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
5585 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
5587 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Load 4 existing dst rows, average, store 4 rows of 8 pixels. */
5589 LD4(dst, stride, tp0, tp1, tp2, tp3);
5590 INSERT_D2_UB(tp0, tp1, dst0);
5591 INSERT_D2_UB(tp2, tp3, dst1);
5592 out0 = PCKEV_XORI128_UB(res0, res1);
5593 out1 = PCKEV_XORI128_UB(res2, res3);
5594 AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
5595 ST8x4_UB(out0, out1, dst, stride);
5596 dst += (4 * stride);
/* 8x8 luma block, centre (2,2) position: horizontal 6-tap per row into
 * 16-bit intermediates (hz_out0..hz_out12), then a vertical 6-tap across
 * them in 32 bits (AVC_DOT_SW3_SW), fully unrolled as two 4-row halves,
 * each averaged with the current dst rows before being stored.
 * NOTE(review): excerpt elides lines -- tmp0/tmp1 declarations and some
 * ILVR/ILVL continuation lines are not visible here.  Confirm in full
 * source. */
5610 void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
/* Vertical taps as packed 32-bit pairs: (1,-5), (20,20), (-5,1). */
5613 const int32_t filt_const0 = 0xfffb0001;
5614 const int32_t filt_const1 = 0x140014;
5615 const int32_t filt_const2 = 0x1fffb;
5616 uint64_t tp0, tp1, tp2, tp3;
5617 v16u8 out0, out1, dst0 = { 0 }, dst1 = { 0 };
5618 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
5619 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5620 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
5621 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5622 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
5623 v8i16 hz_out1110_r, hz_out1211_r, res0, res1, res2, res3;
5624 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
5625 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
5626 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
5629 filt0 = (v8i16) __msa_fill_w(filt_const0);
5630 filt1 = (v8i16) __msa_fill_w(filt_const1);
5631 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* Horizontal shuffle masks from the 8-wide table at luma_mask_arr[0]. */
5633 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* Window starts 2 rows above and 2 columns left of the block. */
5635 src -= ((2 * stride) + 2);
5636 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5637 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5638 src += (5 * stride);
/* Prime the first 5 horizontally-filtered rows for the vertical 6-tap. */
5640 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5641 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5642 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5643 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5644 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* Rows 5..8 of the horizontally-filtered intermediates. */
5646 LD_SB4(src, stride, src0, src1, src2, src3);
5647 XORI_B4_128_SB(src0, src1, src2, src3);
5648 src += (4 * stride);
5649 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5650 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5651 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5652 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
/* Interleave consecutive intermediate rows for the 32-bit dot products. */
5653 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5654 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5655 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5656 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
5657 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5658 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5659 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5660 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* Vertical 6-tap for output rows 0..3. */
5662 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5664 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
5666 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5667 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5669 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
5671 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5672 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5674 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
5676 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5677 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5679 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
5681 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Average rows 0..3 with dst and store them. */
5682 LD4(dst, stride, tp0, tp1, tp2, tp3);
5683 INSERT_D2_UB(tp0, tp1, dst0);
5684 INSERT_D2_UB(tp2, tp3, dst1);
5685 out0 = PCKEV_XORI128_UB(res0, res1);
5686 out1 = PCKEV_XORI128_UB(res2, res3);
5687 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5688 ST8x4_UB(dst0, dst1, dst, stride);
5689 dst += (4 * stride);
/* Rows 9..12 of the intermediates feed output rows 4..7. */
5691 LD_SB4(src, stride, src0, src1, src2, src3);
5692 XORI_B4_128_SB(src0, src1, src2, src3);
5693 hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
5694 hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
5695 hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
5696 hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
5697 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5698 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
5700 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
5701 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
/* Vertical 6-tap for output rows 4..7. */
5703 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
5705 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
5707 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5708 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
5710 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
5712 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5713 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
5715 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
5717 res2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5718 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
5720 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
5722 res3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* Average rows 4..7 with dst and store them. */
5723 LD4(dst, stride, tp0, tp1, tp2, tp3);
5724 INSERT_D2_UB(tp0, tp1, dst0);
5725 INSERT_D2_UB(tp2, tp3, dst1);
5726 out0 = PCKEV_XORI128_UB(res0, res1);
5727 out1 = PCKEV_XORI128_UB(res2, res3);
5728 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
5729 ST8x4_UB(dst0, dst1, dst, stride);
5732 void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
5735 const int32_t filt_const0 = 0xfffb0001;
5736 const int32_t filt_const1 = 0x140014;
5737 const int32_t filt_const2 = 0x1fffb;
5738 uint32_t tp0, tp1, tp2, tp3;
5739 v16u8 res, dst0 = { 0 };
5740 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5741 v16i8 mask0, mask1, mask2;
5742 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
5743 v8i16 hz_out7, hz_out8, res0, res1, filt0, filt1, filt2;
5744 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
5745 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
5748 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
5750 filt0 = (v8i16) __msa_fill_w(filt_const0);
5751 filt1 = (v8i16) __msa_fill_w(filt_const1);
5752 filt2 = (v8i16) __msa_fill_w(filt_const2);
5754 src -= ((2 * stride) + 2);
5756 LD_SB5(src, stride, src0, src1, src2, src3, src4);
5757 src += (5 * stride);
5758 LD_SB4(src, stride, src5, src6, src7, src8);
5760 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5761 XORI_B4_128_SB(src5, src6, src7, src8);
5762 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
5763 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
5764 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
5765 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
5766 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
5767 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
5768 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
5769 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
5770 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
5771 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
5772 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
5774 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
5776 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
5778 res0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5779 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
5781 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
5783 res1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
5784 LW4(dst, stride, tp0, tp1, tp2, tp3);
5785 INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
5786 res = PCKEV_XORI128_UB(res0, res1);
5787 res = __msa_aver_u_b(res, dst0);
5788 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);