2 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
/* Second-stage (16-bit) H.264 6-tap luma filter over six v8i16 half-pel
 * rows in0..in5: outer pairs (in0,in5) weigh 1, (in1,in4) weigh -5,
 * (in2,in3) weigh 20; the 32-bit sums are rounded with >>10 (SRARI),
 * saturated to 7 bits and packed back to one v8i16.
 * NOTE(review): this listing carries embedded line numbers and has
 * dropped lines (e.g. the "( {" / "} )" statement-expression delimiters
 * and the final "out0_m;" result line) — tokens kept verbatim. */
24 #define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5) \
26 v4i32 tmp0_m, tmp1_m; \
27 v8i16 out0_m, out1_m, out2_m, out3_m; \
28 v8i16 minus5h_m = __msa_ldi_h(-5); \
29 v8i16 plus20h_m = __msa_ldi_h(20); \
31 ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m); \
33 tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m); \
34 tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
36 ILVRL_H2_SH(in1, in4, out0_m, out1_m); \
37 DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m); \
38 ILVRL_H2_SH(in2, in3, out2_m, out3_m); \
39 DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m); \
41 SRARI_W2_SW(tmp0_m, tmp1_m, 10); \
42 SAT_SW2_SW(tmp0_m, tmp1_m, 7); \
43 out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
/* Shuffle-control tables for __msa_vshf_b used by the horizontal 6-tap
 * filters. First three rows: masks pairing taps (p0,p5), (p1,p4),
 * (p2,p3) within a single 16-byte source (8-wide path); next three
 * rows: same pairing but drawing the second 4-wide row from the second
 * source operand (indices >= 16); last rows: byte-slide masks.
 * NOTE(review): declared as [16 * 8] but only 8 of the rows survive in
 * this listing — some rows were dropped; verify against the original. */
48 static const uint8_t luma_mask_arr[16 * 8] = {
50 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
51 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
52 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
55 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
56 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
57 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
59 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
60 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
/* Byte-domain 6-tap luma filter over six v16i8 rows vec0..vec5,
 * accumulating into two v8i16 outputs (right/left interleaved halves,
 * named out1/out2 via macro parameters): (vec0,vec5)*1 + (vec1,vec4)*-5
 * + (vec2,vec3)*20. No rounding/saturation here — caller applies it.
 * NOTE(review): the parameter continuation line naming out1/out2 and
 * the surrounding braces were dropped from this listing. */
63 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
66 v16i8 tmp0_m, tmp1_m; \
67 v16i8 minus5b_m = __msa_ldi_b(-5); \
68 v16i8 plus20b_m = __msa_ldi_b(20); \
70 ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
71 HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
72 ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
73 DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
74 ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
75 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
/* Right-half-only variant of the byte-domain 6-tap filter: interleaves
 * only the low halves (ILVR) of the six rows and accumulates the
 * 1/-5/20 taps into a single v8i16 (tmp1_m), which is the statement-
 * expression result. NOTE(review): the v8i16 declaration of tmp1_m,
 * the "( {" / "} )" delimiters and the trailing "tmp1_m;" line were
 * dropped from this listing. */
78 #define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
81 v16i8 tmp0_m, tmp2_m; \
82 v16i8 minus5b_m = __msa_ldi_b(-5); \
83 v16i8 plus20b_m = __msa_ldi_b(20); \
85 tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0); \
86 tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m); \
88 ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m); \
89 DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m); \
/* Right-half 16-bit variant of the second-stage 6-tap filter: ILVR of
 * the six v8i16 rows, 1/-5/20 taps accumulated in 32-bit, then >>10
 * round (SRARI), 7-bit saturation, and pckev back to v8i16 (tmp2_m).
 * NOTE(review): the v4i32 tmp1_m declaration, statement-expression
 * delimiters and the final "tmp2_m;" result line are missing from this
 * listing. */
94 #define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
97 v8i16 tmp2_m, tmp3_m; \
98 v8i16 minus5h_m = __msa_ldi_h(-5); \
99 v8i16 plus20h_m = __msa_ldi_h(20); \
101 tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0); \
102 tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
104 ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m); \
105 DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m); \
107 tmp1_m = __msa_srari_w(tmp1_m, 10); \
108 tmp1_m = __msa_sat_s_w(tmp1_m, 7); \
110 tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m); \
/* Horizontal 6-tap filter on two (sign-flipped) byte sources using the
 * vshf masks from luma_mask_arr: mask0 gathers the (p0,p5) tap pairs,
 * mask1/mask2 gather the -5 and +20 pairs, accumulated into v8i16
 * hz_out_m. NOTE(review): the hz_out_m declaration, braces and the
 * final "hz_out_m;" result line were dropped from this listing. */
115 #define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, \
116 mask0, mask1, mask2) \
119 v16i8 vec0_m, vec1_m, vec2_m; \
120 v16i8 minus5b_m = __msa_ldi_b(-5); \
121 v16i8 plus20b_m = __msa_ldi_b(20); \
123 vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0); \
124 hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m); \
126 VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m); \
127 DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m); \
/* Single-output horizontal 6-tap filter: three vshf gathers (mask0 ->
 * +1 taps via hadd, mask1 -> -5 taps, mask2 -> +20 taps) folded into
 * v8i16 out0_m by dot-product-accumulate. NOTE(review): the tmp0_m /
 * out0_m declarations, braces and the trailing "out0_m;" result line
 * are missing from this listing. */
132 #define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2) \
136 v16i8 minus5b = __msa_ldi_b(-5); \
137 v16i8 plus20b = __msa_ldi_b(20); \
139 tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0); \
140 out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
142 tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0); \
143 out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
145 tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0); \
146 out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m); \
/* Three-term byte dot-product accumulate: out = in0·coeff0 + in1·coeff1
 * + in2·coeff2 in v8i16, no rounding. NOTE(review): out0_m declaration,
 * braces and result line dropped from this listing. */
151 #define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2) \
155 out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0); \
156 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1); \
157 out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2); \
/* 32-bit counterpart of AVC_DOT_SH3_SH: three halfword dot-products
 * accumulated into v4i32, then >>10 round (SRARI) and 7-bit saturate.
 * NOTE(review): out0_m declaration, braces and result line dropped
 * from this listing. */
162 #define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2) \
166 out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0); \
167 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1); \
168 out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2); \
169 out0_m = __msa_srari_w(out0_m, 10); \
170 out0_m = __msa_sat_s_w(out0_m, 7); \
/* 4-column luma quarter-pel (diagonal) interpolation: runs the
 * horizontal 6-tap filter on src_x and the vertical 6-tap filter on
 * src_y (two pixel rows packed per vector via insve_w), rounds each
 * by >>5, then averages the two results with srari(...,1) and stores
 * 4x4 bytes per iteration. height is assumed to be a multiple of 4
 * (loop runs height >> 2 times).
 * NOTE(review): several lines (locals such as loop_cnt/out0/out1/out,
 * braces, and the continuation args of the filter macro calls) are
 * missing from this listing; tokens kept verbatim. */
174 static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
175 int32_t src_stride, uint8_t *dst,
176 int32_t dst_stride, int32_t height)
179 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
180 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
181 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
182 v16i8 mask0, mask1, mask2;
183 v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
187 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
189 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
190 src_y += (5 * src_stride);
/* Pack vertically adjacent 4-pixel rows into one vector each. */
192 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
193 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
194 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
195 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
197 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
199 for (loop_cnt = (height >> 2); loop_cnt--;) {
200 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
201 src_x += (4 * src_stride);
203 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
205 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
208 hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
212 SRARI_H2_SH(hz_out0, hz_out1, 5);
213 SAT_SH2_SH(hz_out0, hz_out1, 7);
215 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
216 src_y += (4 * src_stride);
218 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
219 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
220 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
221 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
223 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
226 vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
229 vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
233 SRARI_H2_SH(vert_out0, vert_out1, 5);
234 SAT_SH2_SH(vert_out0, vert_out1, 7);
/* Quarter-pel diagonal = rounded average of the two half-pel results. */
236 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
237 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
239 SAT_SH2_SH(out0, out1, 7);
240 out = PCKEV_XORI128_UB(out0, out1);
241 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
242 dst += (4 * dst_stride);
/* 8-column luma quarter-pel (diagonal) interpolation: same scheme as
 * the 4-wide path but packs two 8-pixel rows per vector with insve_d,
 * uses the 8-wide vshf masks (luma_mask_arr[0]) and the full-width
 * AVC_CALC_DPADD_B_6PIX_2COEFF_SH vertical filter; stores 8x4 bytes
 * per iteration (height >> 2 iterations).
 * NOTE(review): locals (loop_cnt, tmp0/tmp1), braces, src_vt rotation
 * lines and some macro continuations are missing from this listing. */
252 static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y,
253 int32_t src_stride, uint8_t *dst,
254 int32_t dst_stride, int32_t height)
257 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
258 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
259 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
260 v16i8 mask0, mask1, mask2;
261 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
262 v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
263 v8i16 out0, out1, out2, out3;
266 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
267 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
268 src_y += (5 * src_stride);
/* Pack vertically adjacent 8-pixel rows into one vector each. */
270 src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
271 src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
272 src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
273 src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
275 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
277 for (loop_cnt = (height >> 2); loop_cnt--;) {
278 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
279 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
280 src_x += (4 * src_stride);
282 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
283 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
284 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
285 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
287 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
288 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
290 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
291 src_y += (4 * src_stride);
293 src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
294 src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
295 src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
296 src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
298 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
301 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
302 src_vt4, src_vt5, vert_out0, vert_out1);
303 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
304 src_vt6, src_vt7, vert_out2, vert_out3);
306 SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
307 SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
/* Quarter-pel diagonal = rounded average of the two half-pel results. */
309 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
310 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
311 out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
312 out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
314 SAT_SH4_SH(out0, out1, out2, out3, 7);
315 tmp0 = PCKEV_XORI128_UB(out0, out1);
316 tmp1 = PCKEV_XORI128_UB(out2, out3);
317 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
319 dst += (4 * dst_stride);
/* 16-column luma quarter-pel (diagonal) interpolation, implemented as
 * two side-by-side 8-wide passes. NOTE(review): the loop body's
 * trailing argument (height), the pointer advances between the two
 * passes and the closing braces were dropped from this listing. */
329 static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y,
330 int32_t src_stride, uint8_t *dst,
331 int32_t dst_stride, int32_t height)
333 uint32_t multiple8_cnt;
335 for (multiple8_cnt = 2; multiple8_cnt--;) {
336 avc_luma_hv_qrt_8w_msa(src_x, src_y, src_stride, dst, dst_stride,
/* 4x4 horizontal half-pel interpolation with destination averaging:
 * 6-tap horizontal filter (vshf masks at luma_mask_arr[48], two rows
 * per vector), >>5 round, 7-bit saturate, pack, then aver_u_b with the
 * existing dst pixels and store 4x4.
 * NOTE(review): the src_stride parameter line, the v8i16 res0/res1
 * declarations and the closing brace are missing from this listing. */
345 static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src,
347 uint8_t *dst, int32_t dst_stride)
349 v16i8 src0, src1, src2, src3;
350 v16u8 dst0, dst1, dst2, dst3, res;
352 v16i8 mask0, mask1, mask2;
353 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
354 v16i8 minus5b = __msa_ldi_b(-5);
355 v16i8 plus20b = __msa_ldi_b(20);
357 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
358 LD_SB4(src, src_stride, src0, src1, src2, src3);
360 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
361 XORI_B4_128_SB(src0, src1, src2, src3);
362 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
363 HADD_SB2_SH(vec0, vec1, res0, res1);
364 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
365 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
366 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
367 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
368 SRARI_H2_SH(res0, res1, 5);
369 SAT_SH2_SH(res0, res1, 7);
370 res = PCKEV_XORI128_UB(res0, res1);
/* Collapse the four 4-byte dst rows into one vector for the average. */
371 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
373 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
374 res = __msa_aver_u_b(res, dst0);
376 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 horizontal half-pel interpolation with destination averaging:
 * two passes of 4 rows each; 6-tap filter via the 8-wide vshf masks,
 * >>5 round, saturate, then CONVERT_UB_AVG_ST8x4_UB packs, averages
 * with dst and stores.
 * NOTE(review): the src_stride parameter line, loop_cnt declaration,
 * a CONVERT_UB_AVG_ST8x4_UB continuation line and closing braces are
 * missing from this listing. */
379 static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src,
381 uint8_t *dst, int32_t dst_stride)
384 v16i8 src0, src1, src2, src3;
385 v16u8 dst0, dst1, dst2, dst3;
386 v8i16 res0, res1, res2, res3;
387 v16i8 mask0, mask1, mask2;
388 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
389 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
390 v16i8 minus5b = __msa_ldi_b(-5);
391 v16i8 plus20b = __msa_ldi_b(20);
393 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
395 for (loop_cnt = 2; loop_cnt--;) {
396 LD_SB4(src, src_stride, src0, src1, src2, src3);
397 src += (4 * src_stride);
399 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
401 XORI_B4_128_SB(src0, src1, src2, src3);
402 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
403 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
404 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
405 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
406 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
407 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
408 res0, res1, res2, res3);
409 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
410 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
411 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
412 plus20b, res0, res1, res2, res3);
413 SRARI_H4_SH(res0, res1, res2, res3, 5);
414 SAT_SH4_SH(res0, res1, res2, res3, 7);
415 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
416 CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
419 dst += (4 * dst_stride);
/* 16x16 horizontal half-pel interpolation with destination averaging:
 * each iteration filters 4 full-width rows (two 16-byte loads per row,
 * LD_SB2 with offset 8), packs, xors back to unsigned, averages with
 * dst and stores 16x4; 4 iterations total.
 * NOTE(review): the src_stride parameter line, loop_cnt declaration,
 * the "src += src_stride;" lines between the paired LD_SB2 loads and
 * the closing braces are missing from this listing. */
423 static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src,
425 uint8_t *dst, int32_t dst_stride)
428 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
429 v16u8 dst0, dst1, dst2, dst3;
430 v16i8 mask0, mask1, mask2;
431 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
432 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
433 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
434 v16i8 minus5b = __msa_ldi_b(-5);
435 v16i8 plus20b = __msa_ldi_b(20);
437 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
439 for (loop_cnt = 4; loop_cnt--;) {
440 LD_SB2(src, 8, src0, src1);
442 LD_SB2(src, 8, src2, src3);
445 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
447 XORI_B4_128_SB(src0, src1, src2, src3);
448 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
449 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
450 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
451 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
452 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
453 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
454 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
455 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
456 minus5b, res0, res1, res2, res3);
457 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
458 plus20b, res0, res1, res2, res3);
459 LD_SB2(src, 8, src4, src5);
461 LD_SB2(src, 8, src6, src7);
463 XORI_B4_128_SB(src4, src5, src6, src7);
464 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
465 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
466 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
467 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
468 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
469 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
470 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
471 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
472 minus5b, res4, res5, res6, res7);
473 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
474 plus20b, res4, res5, res6, res7);
475 SRARI_H4_SH(res0, res1, res2, res3, 5);
476 SRARI_H4_SH(res4, res5, res6, res7, 5);
477 SAT_SH4_SH(res0, res1, res2, res3, 7);
478 SAT_SH4_SH(res4, res5, res6, res7, 7);
479 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
480 vec0, vec1, vec2, vec3);
481 XORI_B4_128_SB(vec0, vec1, vec2, vec3);
482 AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
483 dst0, dst1, dst2, dst3);
484 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
485 dst += (4 * dst_stride);
/* 4x4 horizontal quarter-pel interpolation with destination averaging:
 * half-pel 6-tap result is first averaged (aver_s_b, still in the
 * xor-128 signed domain) with the source shifted by `slide` bytes
 * (selects the nearer integer pixel for the quarter position), then
 * xor'd back to unsigned and averaged with dst.
 * NOTE(review): the src_stride/hor_offset parameters, the slide
 * computation, the res0/res1/out0/out1 declarations and the closing
 * brace are missing from this listing. */
489 static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
496 v16i8 src0, src1, src2, src3;
497 v16u8 dst0, dst1, dst2, dst3;
498 v16i8 mask0, mask1, mask2;
499 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
501 v16i8 minus5b = __msa_ldi_b(-5);
502 v16i8 plus20b = __msa_ldi_b(20);
505 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
513 LD_SB4(src, src_stride, src0, src1, src2, src3);
514 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
516 XORI_B4_128_SB(src0, src1, src2, src3);
517 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
518 HADD_SB2_SH(vec0, vec1, out0, out1);
519 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
520 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
521 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
522 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
523 SRARI_H2_SH(out0, out1, 5);
524 SAT_SH2_SH(out0, out1, 7);
526 PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);
528 src0 = __msa_sld_b(src0, src0, slide);
529 src1 = __msa_sld_b(src1, src1, slide);
530 src2 = __msa_sld_b(src2, src2, slide);
531 src3 = __msa_sld_b(src3, src3, slide);
532 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
533 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
534 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
535 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);
537 XORI_B2_128_UB(res0, res1);
539 dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
540 dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
542 AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
544 ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
/* 8x8 horizontal quarter-pel interpolation with destination averaging:
 * two 4-row passes; the half-pel filter output is averaged (signed
 * domain) with the `slide`-shifted source, xor'd back and then
 * averaged with dst via AVER_ST8x4_UB.
 * NOTE(review): the src_stride/hor_offset parameters, loop_cnt and
 * slide setup, an AVER_ST8x4_UB continuation line and the closing
 * braces are missing from this listing. */
547 static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
555 v16i8 src0, src1, src2, src3;
556 v16i8 mask0, mask1, mask2;
557 v16u8 dst0, dst1, dst2, dst3;
558 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
559 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
560 v8i16 out0, out1, out2, out3;
561 v16i8 minus5b = __msa_ldi_b(-5);
562 v16i8 plus20b = __msa_ldi_b(20);
563 v16i8 res0, res1, res2, res3;
565 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
573 for (loop_cnt = 2; loop_cnt--;) {
574 LD_SB4(src, src_stride, src0, src1, src2, src3);
575 src += (4 * src_stride);
577 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
579 XORI_B4_128_SB(src0, src1, src2, src3);
580 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
581 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
582 HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
583 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
584 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
585 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
586 out0, out1, out2, out3);
587 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
588 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
589 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
590 plus20b, out0, out1, out2, out3);
592 src0 = __msa_sld_b(src0, src0, slide);
593 src1 = __msa_sld_b(src1, src1, slide);
594 src2 = __msa_sld_b(src2, src2, slide);
595 src3 = __msa_sld_b(src3, src3, slide);
597 SRARI_H4_SH(out0, out1, out2, out3, 5);
598 SAT_SH4_SH(out0, out1, out2, out3, 7);
600 PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
601 res0, res1, res2, res3);
/* Quarter-pel: average half-pel result with the shifted source. */
603 res0 = __msa_aver_s_b(res0, src0);
604 res1 = __msa_aver_s_b(res1, src1);
605 res2 = __msa_aver_s_b(res2, src2);
606 res3 = __msa_aver_s_b(res3, src3);
608 XORI_B4_128_SB(res0, res1, res2, res3);
609 AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
612 dst += (4 * dst_stride);
/* 16x16 horizontal quarter-pel interpolation with destination
 * averaging: per iteration filters 2 full-width rows, selects the
 * quarter-position source pixels with a vshf mask (luma_mask_arr[96]
 * or [16 + 96] depending on hor_offset), averages in the signed
 * domain, then averages with dst; 8 iterations.
 * NOTE(review): the src_stride/hor_offset parameters, loop_cnt /
 * dst0/dst1 / out0/out1 declarations, the if/else around the two vshf
 * table loads, the "src += src_stride;" lines and the closing braces
 * are missing from this listing. */
616 static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
624 v16i8 src0, src1, src2, src3;
625 v16i8 mask0, mask1, mask2, vshf;
627 v8i16 res0, res1, res2, res3;
628 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
629 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
630 v16i8 minus5b = __msa_ldi_b(-5);
631 v16i8 plus20b = __msa_ldi_b(20);
633 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
636 vshf = LD_SB(&luma_mask_arr[16 + 96]);
638 vshf = LD_SB(&luma_mask_arr[96]);
641 for (loop_cnt = 8; loop_cnt--;) {
642 LD_SB2(src, 8, src0, src1);
644 LD_SB2(src, 8, src2, src3);
647 LD_UB2(dst, dst_stride, dst0, dst1);
649 XORI_B4_128_SB(src0, src1, src2, src3);
650 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
651 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
652 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
653 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
654 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
655 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
656 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
657 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
658 minus5b, res0, res1, res2, res3);
659 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
660 plus20b, res0, res1, res2, res3);
661 VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
662 SRARI_H4_SH(res0, res1, res2, res3, 5);
663 SAT_SH4_SH(res0, res1, res2, res3, 7);
664 PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
666 out0 = __msa_aver_s_b(out0, src0);
667 out1 = __msa_aver_s_b(out1, src2);
669 XORI_B2_128_SB(out0, out1);
670 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
671 ST_UB2(dst0, dst1, dst, dst_stride);
672 dst += (2 * dst_stride);
/* 4x4 vertical half-pel interpolation with destination averaging:
 * the 6-tap column filter is expressed as three halfword dot products
 * with packed coefficient pairs (0xfb01 = {-5,1}, 0x1414 = {20,20},
 * 0x1fb = {1,-5}) over byte-interleaved row pairs; >>5 round, 7-bit
 * saturate, pack, then aver_u_b with dst.
 * NOTE(review): the src_stride parameter, the res/out10/out32
 * declarations and the closing brace are missing from this listing. */
676 static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
678 uint8_t *dst, int32_t dst_stride)
680 int16_t filt_const0 = 0xfb01;
681 int16_t filt_const1 = 0x1414;
682 int16_t filt_const2 = 0x1fb;
683 v16u8 dst0, dst1, dst2, dst3;
684 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
685 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
686 v16i8 src87_r, src2110, src4332, src6554, src8776;
688 v16i8 filt0, filt1, filt2;
691 filt0 = (v16i8) __msa_fill_h(filt_const0);
692 filt1 = (v16i8) __msa_fill_h(filt_const1);
693 filt2 = (v16i8) __msa_fill_h(filt_const2);
695 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
696 src += (5 * src_stride);
698 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
699 src10_r, src21_r, src32_r, src43_r);
700 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
701 XORI_B2_128_SB(src2110, src4332);
702 LD_SB4(src, src_stride, src5, src6, src7, src8);
703 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
704 src54_r, src65_r, src76_r, src87_r);
705 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
706 XORI_B2_128_SB(src6554, src8776);
707 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
708 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
709 SRARI_H2_SH(out10, out32, 5);
710 SAT_SH2_SH(out10, out32, 7);
711 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
712 res = PCKEV_XORI128_UB(out10, out32);
714 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
716 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
717 dst0 = __msa_aver_u_b(res, dst0);
719 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 vertical half-pel interpolation with destination averaging:
 * keeps a sliding window of interleaved row pairs and, per 4-row
 * iteration, computes four 6-tap columns as three packed-coefficient
 * dot products, rounds >>5, saturates, and averages with dst via
 * CONVERT_UB_AVG_ST8x4_UB.
 * NOTE(review): the src_stride parameter, loop_cnt declaration, a
 * CONVERT_UB_AVG_ST8x4_UB continuation line, the window-rotation
 * assignments at loop end and closing braces are missing from this
 * listing. */
722 static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
724 uint8_t *dst, int32_t dst_stride)
727 int16_t filt_const0 = 0xfb01;
728 int16_t filt_const1 = 0x1414;
729 int16_t filt_const2 = 0x1fb;
730 v16u8 dst0, dst1, dst2, dst3;
731 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
732 v16i8 src10_r, src32_r, src76_r, src98_r;
733 v16i8 src21_r, src43_r, src87_r, src109_r;
734 v8i16 out0, out1, out2, out3;
735 v16i8 filt0, filt1, filt2;
737 filt0 = (v16i8) __msa_fill_h(filt_const0);
738 filt1 = (v16i8) __msa_fill_h(filt_const1);
739 filt2 = (v16i8) __msa_fill_h(filt_const2);
741 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
742 src += (5 * src_stride);
744 XORI_B5_128_SB(src0, src1, src2, src3, src4);
745 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
746 src10_r, src21_r, src32_r, src43_r);
748 for (loop_cnt = 2; loop_cnt--;) {
749 LD_SB4(src, src_stride, src7, src8, src9, src10);
750 src += (4 * src_stride);
752 XORI_B4_128_SB(src7, src8, src9, src10);
753 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
754 src76_r, src87_r, src98_r, src109_r);
755 out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
756 out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
757 out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
758 out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
759 SRARI_H4_SH(out0, out1, out2, out3, 5);
760 SAT_SH4_SH(out0, out1, out2, out3, 7);
761 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
762 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
763 CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
765 dst += (4 * dst_stride);
/* 16x16 vertical half-pel interpolation with destination averaging:
 * like the 8-wide path but keeps both right (ILVR) and left (ILVL)
 * interleaved halves of each row pair to cover the full 16 columns;
 * 4 iterations of 4 rows each.
 * NOTE(review): the src_stride parameter, loop_cnt declaration, the
 * sliding-window rotation assignments at loop end and the closing
 * braces are missing from this listing. */
775 static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
777 uint8_t *dst, int32_t dst_stride)
780 int16_t filt_const0 = 0xfb01;
781 int16_t filt_const1 = 0x1414;
782 int16_t filt_const2 = 0x1fb;
783 v16u8 dst0, dst1, dst2, dst3;
784 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
785 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
786 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
787 v16i8 src65_l, src87_l;
788 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
789 v16i8 filt0, filt1, filt2;
790 v16u8 res0, res1, res2, res3;
792 filt0 = (v16i8) __msa_fill_h(filt_const0);
793 filt1 = (v16i8) __msa_fill_h(filt_const1);
794 filt2 = (v16i8) __msa_fill_h(filt_const2);
796 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
797 src += (5 * src_stride);
799 XORI_B5_128_SB(src0, src1, src2, src3, src4);
800 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
801 src10_r, src21_r, src32_r, src43_r);
802 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
803 src10_l, src21_l, src32_l, src43_l);
805 for (loop_cnt = 4; loop_cnt--;) {
806 LD_SB4(src, src_stride, src5, src6, src7, src8);
807 src += (4 * src_stride);
809 XORI_B4_128_SB(src5, src6, src7, src8);
810 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
811 src54_r, src65_r, src76_r, src87_r);
812 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
813 src54_l, src65_l, src76_l, src87_l);
814 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
815 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
816 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
817 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
818 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
819 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
820 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
821 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
822 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
823 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
824 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
825 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
826 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
827 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
828 out3_r, res0, res1, res2, res3);
829 XORI_B4_128_UB(res0, res1, res2, res3);
830 AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
831 res0, res1, res2, res3);
832 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
833 dst += (4 * dst_stride);
/* 4x4 vertical quarter-pel interpolation with destination averaging:
 * computes the vertical half-pel result exactly as the non-qrt 4x4
 * path, then averages it with the nearer integer-pixel rows (rows
 * 2..5 or 3..6 of the packed source, selected by the vertical offset)
 * before the final average with dst.
 * NOTE(review): the src_stride/ver_offset parameters, the res/out10/
 * out32 declarations, the if/else around the two insve_w branches and
 * the closing braces are missing from this listing. */
847 static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
853 int16_t filt_const0 = 0xfb01;
854 int16_t filt_const1 = 0x1414;
855 int16_t filt_const2 = 0x1fb;
856 v16u8 dst0, dst1, dst2, dst3;
857 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
858 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
859 v16i8 src87_r, src2110, src4332, src6554, src8776;
861 v16i8 filt0, filt1, filt2;
864 filt0 = (v16i8) __msa_fill_h(filt_const0);
865 filt1 = (v16i8) __msa_fill_h(filt_const1);
866 filt2 = (v16i8) __msa_fill_h(filt_const2);
868 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
869 src += (5 * src_stride);
871 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
872 src10_r, src21_r, src32_r, src43_r);
873 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
874 XORI_B2_128_SB(src2110, src4332);
875 LD_SB4(src, src_stride, src5, src6, src7, src8);
876 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
877 src54_r, src65_r, src76_r, src87_r);
878 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
879 XORI_B2_128_SB(src6554, src8776);
880 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
881 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
882 SRARI_H2_SH(out10, out32, 5);
883 SAT_SH2_SH(out10, out32, 7);
884 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
885 res = PCKEV_XORI128_UB(out10, out32);
888 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
889 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
891 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
892 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
895 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
896 res = __msa_aver_u_b(res, (v16u8) src32_r);
898 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
900 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
901 dst0 = __msa_aver_u_b(res, dst0);
903 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 vertical quarter-pel interpolation with destination averaging:
 * vertical half-pel result is averaged (signed domain) with the
 * nearer integer-pixel rows, packed per offset branch with PCKEV_D2,
 * then xor'd back and averaged with dst.
 * NOTE(review): the src_stride/ver_offset parameters, loop_cnt and
 * res0/res1/vec0/vec1 declarations, the if/else around the two
 * PCKEV_D2_SB branches, the window-rotation lines and closing braces
 * are missing from this listing. */
906 static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
913 int16_t filt_const0 = 0xfb01;
914 int16_t filt_const1 = 0x1414;
915 int16_t filt_const2 = 0x1fb;
916 v16u8 dst0, dst1, dst2, dst3;
917 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
918 v16i8 src10_r, src32_r, src76_r, src98_r;
919 v16i8 src21_r, src43_r, src87_r, src109_r;
920 v8i16 out0_r, out1_r, out2_r, out3_r;
923 v16i8 filt0, filt1, filt2;
925 filt0 = (v16i8) __msa_fill_h(filt_const0);
926 filt1 = (v16i8) __msa_fill_h(filt_const1);
927 filt2 = (v16i8) __msa_fill_h(filt_const2);
929 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
930 src += (5 * src_stride);
932 XORI_B5_128_SB(src0, src1, src2, src3, src4);
933 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
934 src10_r, src21_r, src32_r, src43_r);
936 for (loop_cnt = 2; loop_cnt--;) {
937 LD_SB4(src, src_stride, src7, src8, src9, src10);
938 src += (4 * src_stride);
940 XORI_B4_128_SB(src7, src8, src9, src10);
941 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
942 src76_r, src87_r, src98_r, src109_r);
943 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
944 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
945 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
946 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
947 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
948 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
949 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
952 PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
954 PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
957 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
958 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
/* Quarter-pel: average half-pel result with the selected source rows. */
960 vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
961 vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);
963 XORI_B2_128_UB(vec0, vec1);
964 AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1);
965 ST8x4_UB(vec0, vec1, dst, dst_stride);
966 dst += (4 * dst_stride);
/* 16x16 vertical quarter-pel luma MC, result averaged with dst (avg flavour).
 * A 6-tap vertical filter is applied, its output is averaged with one of the
 * integer-pel source rows (quarter-pel rounding), then averaged with dst. */
978 static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
/* Halfword constants packing byte pairs of the H.264 6-tap coefficients
 * (1, -5, 20, 20, -5, 1) for the byte dot-product macros below. */
985     int16_t filt_const0 = 0xfb01;
986     int16_t filt_const1 = 0x1414;
987     int16_t filt_const2 = 0x1fb;
988     v16u8 dst0, dst1, dst2, dst3;
989     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
990     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
991     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
992     v16i8 src65_l, src87_l;
993     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
994     v16i8 out0, out1, out2, out3;
995     v16i8 filt0, filt1, filt2;
996     v16u8 res0, res1, res2, res3;
998     filt0 = (v16i8) __msa_fill_h(filt_const0);
999     filt1 = (v16i8) __msa_fill_h(filt_const1);
1000     filt2 = (v16i8) __msa_fill_h(filt_const2);
/* Prologue: load the first 5 rows, bias to signed (xor 128), and build the
 * right/left interleaved row pairs the dot products consume. */
1002     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1003     src += (5 * src_stride);
1005     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1006     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1007                src10_r, src21_r, src32_r, src43_r);
1008     ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1009                src10_l, src21_l, src32_l, src43_l);
/* 4 iterations of 4 rows each -> 16 output rows. */
1011     for (loop_cnt = 4; loop_cnt--;) {
1012         LD_SB4(src, src_stride, src5, src6, src7, src8);
1013         src += (4 * src_stride);
1015         XORI_B4_128_SB(src5, src6, src7, src8);
1016         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1017                    src54_r, src65_r, src76_r, src87_r);
1018         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1019                    src54_l, src65_l, src76_l, src87_l);
/* 6-tap vertical filter for 4 rows, right and left halves of the 16-wide row. */
1020         out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1021         out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1022         out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1023         out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1024         out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1025         out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1026         out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1027         out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* Round (>>5 with rounding), clip to 8-bit signed range, pack to bytes. */
1028         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1029         SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1030         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1031         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1032         PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1033                     out3_r, out0, out1, out2, out3);
1034         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* NOTE(review): the two aver_s_b groups below appear to be mutually exclusive
 * branches (quarter-pel offset selects rows src3..src6 vs src2..src5); the
 * selecting if/else lines are not visible in this excerpt — confirm. */
1037         res0 = (v16u8) __msa_aver_s_b(out0, src3);
1038         res1 = (v16u8) __msa_aver_s_b(out1, src4);
1039         res2 = (v16u8) __msa_aver_s_b(out2, src5);
1040         res3 = (v16u8) __msa_aver_s_b(out3, src6);
1042         res0 = (v16u8) __msa_aver_s_b(out0, src2);
1043         res1 = (v16u8) __msa_aver_s_b(out1, src3);
1044         res2 = (v16u8) __msa_aver_s_b(out2, src4);
1045         res3 = (v16u8) __msa_aver_s_b(out3, src5);
/* Back to unsigned domain, average with dst, store 4 full rows. */
1048         XORI_B4_128_UB(res0, res1, res2, res3);
1049         AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1050                     dst0, dst1, dst2, dst3);
1051         ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1052         dst += (4 * dst_stride);
/* 4x4 centre (half-pel horizontally AND vertically) luma MC, averaged with
 * dst: 6-tap horizontal filter first into 16-bit intermediates (hz_out*),
 * then a 6-tap vertical filter over those intermediates. */
1068 static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
1070                                               uint8_t *dst, int32_t dst_stride)
1072     v16i8 src0, src1, src2, src3, src4;
1073     v16i8 mask0, mask1, mask2;
1074     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1075     v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1076     v8i16 res0, res1, res2, res3;
1077     v16u8 dst0, dst1, dst2, dst3;
1078     v16u8 tmp0, tmp1, tmp2, tmp3;
/* luma_mask_arr[48..] holds the 4-wide two-rows-at-a-time shuffle masks. */
1080     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1081     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1082     src += (5 * src_stride);
1084     XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Horizontal filter two rows per call; the odd-doubleword pick (PCKOD)
 * extracts the second row of each pair into its own vector. */
1086     hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1087                                                           mask0, mask1, mask2);
1088     hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1089                                                           mask0, mask1, mask2);
1091     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
1093     hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1095     LD_SB4(src, src_stride, src0, src1, src2, src3);
1096     XORI_B4_128_SB(src0, src1, src2, src3);
1098     hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1099                                                           mask0, mask1, mask2);
1100     hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1101                                                           mask0, mask1, mask2);
1103     PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
/* Vertical 6-tap on the 16-bit intermediates (rounds >> 10 inside macro). */
1105     res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
1106                                              hz_out3, hz_out4, hz_out5);
1107     res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
1108                                              hz_out4, hz_out5, hz_out6);
1109     res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
1110                                              hz_out5, hz_out6, hz_out7);
1111     res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
1112                                              hz_out6, hz_out7, hz_out8);
/* Pack to unsigned bytes, average with the four 4-pixel dst rows, store. */
1113     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1114     tmp0 = PCKEV_XORI128_UB(res0, res1);
1115     tmp1 = PCKEV_XORI128_UB(res2, res3);
1116     PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
1117     AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
1119     ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
/* 8-wide centre (half-pel h+v) luma MC, averaged with dst; `height` rows are
 * produced 4 at a time.  Horizontal 6-tap first, then vertical 6-tap on the
 * 16-bit intermediates. */
1122 static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
1124                                              uint8_t *dst, int32_t dst_stride,
1128     v16i8 src0, src1, src2, src3, src4;
1129     v16i8 mask0, mask1, mask2;
1130     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1131     v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1132     v16u8 dst0, dst1, dst2, dst3;
1133     v8i16 res0, res1, res2, res3;
/* luma_mask_arr[0..] holds the 8-wide one-row shuffle masks. */
1135     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* Prologue: horizontally filter the first 5 rows. */
1137     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1138     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1139     src += (5 * src_stride);
1141     hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1142     hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1143     hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1144     hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1145     hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1147     for (loop_cnt = (height >> 2); loop_cnt--;) {
1148         LD_SB4(src, src_stride, src0, src1, src2, src3);
1149         XORI_B4_128_SB(src0, src1, src2, src3);
1150         src += (4 * src_stride);
1152         hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1153         hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1154         hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1155         hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
/* Vertical 6-tap across a sliding 6-row window of intermediates. */
1157         res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1158                                                hz_out3, hz_out4, hz_out5);
1159         res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1160                                                hz_out4, hz_out5, hz_out6);
1161         res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1162                                                hz_out5, hz_out6, hz_out7);
1163         res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1164                                                hz_out6, hz_out7, hz_out8);
/* Average with 4 dst rows (packed two 8-byte rows per vector) and store. */
1165         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1166         ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1167         CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
1170         dst += (4 * dst_stride);
/* 16x16 centre MC with dst averaging: handled as two 8-wide columns
 * (left half, then right half at src+8/dst+8), 16 rows each. */
1180 static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
1185     avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16);
1186     avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
/* 4-wide "mid horizontal quarter-pel" MC with dst averaging: vertical 6-tap
 * first (byte domain), then horizontal 6-tap on the 16-bit column vectors via
 * halfword shuffles; the centre result is averaged with a half-pel column
 * selected by horiz_offset. */
1190 static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
1195                                                   uint8_t horiz_offset)
1198     v16i8 src0, src1, src2, src3, src4, src5, src6;
1199     v16u8 dst0, dst1, res;
1200     v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
1201     v4i32 hz_res0, hz_res1;
1203     v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
/* Halfword shuffle masks selecting the 6-tap input pairs per output pixel. */
1204     v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
1205     v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
1206     v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
1207     v8i16 minus5h = __msa_ldi_h(-5);
1208     v8i16 plus20h = __msa_ldi_h(20);
1209     v8i16 zeros = { 0 };
1211     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1212     src += (5 * src_stride);
1214     XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Two output rows per iteration. */
1216     for (row = (height >> 1); row--;) {
1217         LD_SB2(src, src_stride, src5, src6);
1218         src += (2 * src_stride);
1220         XORI_B2_128_SB(src5, src6);
1221         LD_UB2(dst, dst_stride, dst0, dst1);
/* Pack both 4-pixel dst rows into one vector for a single average. */
1223         dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);
1225         AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
1227         AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
1229         VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
1230                    mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
1231         VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
1232                    mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* Horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d). */
1234         hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
1235         DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
1237         hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
1238         DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
1240         SRARI_W2_SW(hz_res0, hz_res1, 10);
1241         SAT_SW2_SW(hz_res0, hz_res1, 7);
/* Half-pel vertical-only columns used for the quarter-pel average. */
1243         res0 = __msa_srari_h(shf_vec2, 5);
1244         res1 = __msa_srari_h(shf_vec5, 5);
1246         SAT_SH2_SH(res0, res1, 7);
/* NOTE(review): the ilvod pair and the ILVEV line look like alternate
 * branches choosing which half-pel column (left/right neighbour) to pick,
 * driven by horiz_offset; the selecting if/else is elided here — confirm. */
1249         res0 = __msa_ilvod_h(zeros, res0);
1250         res1 = __msa_ilvod_h(zeros, res1);
1252         ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
1254         hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
1255         hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
1256         res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
1258         res = PCKEV_XORI128_UB(res0, res0);
/* Final average with dst and 4x2 store. */
1260         dst0 = __msa_aver_u_b(res, dst0);
1262         ST4x2_UB(dst0, dst, dst_stride);
1263         dst += (2 * dst_stride);
/* 8-wide wrapper: two 4-wide columns.  The per-iteration src/dst advance
 * (presumably += 4) is on lines elided from this excerpt. */
1273 static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src,
1278                                                   uint8_t horiz_offset)
1280     uint32_t multiple8_cnt;
1282     for (multiple8_cnt = 2; multiple8_cnt--;) {
1283         avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
1284                                               height, horiz_offset);
/* 16-wide wrapper: four 4-wide columns.  The per-iteration src/dst advance
 * is on lines elided from this excerpt. */
1291 static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src,
1296                                                    uint8_t horiz_offset)
1298     uint32_t multiple8_cnt;
1300     for (multiple8_cnt = 4; multiple8_cnt--;) {
1301         avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
1302                                               height, horiz_offset);
/* 4-wide "mid vertical quarter-pel" MC with dst averaging: horizontal 6-tap
 * into 16-bit rows, vertical 6-tap over those, then quarter-pel average with
 * a horizontally-filtered half-pel row chosen by the vertical offset. */
1309 static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src,
1318     v16i8 src0, src1, src2, src3, src4;
1320     v16i8 mask0, mask1, mask2;
1321     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1322     v8i16 hz_out4, hz_out5, hz_out6;
1323     v8i16 res0, res1, res2, res3;
/* 4-wide two-rows-per-call shuffle masks. */
1326     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1327     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1328     src += (5 * src_stride);
1330     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1332     hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1333                                                           mask0, mask1, mask2);
1334     hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1335                                                           mask0, mask1, mask2);
1337     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
1339     hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* Two output rows per iteration. */
1341     for (loop_cnt = (height >> 1); loop_cnt--;) {
1342         LD_SB2(src, src_stride, src0, src1);
1343         src += (2 * src_stride);
1345         XORI_B2_128_SB(src0, src1);
1346         LD_UB2(dst, dst_stride, dst0, dst1);
1347         hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1350         hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
1351         res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
1352                                                  hz_out3, hz_out4, hz_out5);
1353         res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
1354                                                  hz_out4, hz_out5, hz_out6);
/* NOTE(review): the two srari pairs below appear to be if/else branches on
 * the vertical quarter-pel offset (hz_out3/4 vs hz_out2/3); the selecting
 * condition lines are elided from this excerpt — confirm. */
1357         res1 = __msa_srari_h(hz_out3, 5);
1358         res3 = __msa_srari_h(hz_out4, 5);
1360         res1 = __msa_srari_h(hz_out2, 5);
1361         res3 = __msa_srari_h(hz_out3, 5);
1364         SAT_SH2_SH(res1, res3, 7);
/* Quarter-pel: average centre result with the half-pel row. */
1366         res0 = __msa_aver_s_h(res0, res1);
1367         res1 = __msa_aver_s_h(res2, res3);
1369         vec0 = PCKEV_XORI128_UB(res0, res0);
1370         vec1 = PCKEV_XORI128_UB(res1, res1);
1372         AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
1374         out0 = __msa_copy_u_w((v4i32) dst0, 0);
1375         out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* 8-wide "mid vertical quarter-pel" MC with dst averaging, 4 rows per
 * iteration: horizontal 6-tap into 16-bit rows, vertical 6-tap over a sliding
 * 6-row window, quarter-pel average with a half-pel row per vert_offset. */
1389 static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
1394                                                   uint8_t vert_offset)
1397     v16i8 src0, src1, src2, src3, src4;
1398     v16u8 dst0, dst1, dst2, dst3;
1399     v16i8 mask0, mask1, mask2;
1400     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1401     v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1402     v8i16 res0, res1, res2, res3;
1403     v8i16 res4, res5, res6, res7;
1405     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* Prologue: horizontally filter the first 5 rows. */
1407     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1408     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1409     src += (5 * src_stride);
1411     hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1412     hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1413     hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1414     hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1415     hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
1417     for (loop_cnt = (height >> 2); loop_cnt--;) {
1418         LD_SB4(src, src_stride, src0, src1, src2, src3);
1419         XORI_B4_128_SB(src0, src1, src2, src3);
1420         src += (4 * src_stride);
1422         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1424         hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
1425         hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
1426         hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
1427         hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
1429         res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1430                                                hz_out3, hz_out4, hz_out5);
1431         res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1432                                                hz_out4, hz_out5, hz_out6);
1433         res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1434                                                hz_out5, hz_out6, hz_out7);
1435         res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1436                                                hz_out6, hz_out7, hz_out8);
/* NOTE(review): the two srari groups below appear to be if/else branches on
 * vert_offset (hz_out3..6 vs hz_out2..5); the selecting condition lines are
 * elided from this excerpt — confirm. */
1439         res1 = __msa_srari_h(hz_out3, 5);
1440         res3 = __msa_srari_h(hz_out4, 5);
1441         res5 = __msa_srari_h(hz_out5, 5);
1442         res7 = __msa_srari_h(hz_out6, 5);
1444         res1 = __msa_srari_h(hz_out2, 5);
1445         res3 = __msa_srari_h(hz_out3, 5);
1446         res5 = __msa_srari_h(hz_out4, 5);
1447         res7 = __msa_srari_h(hz_out5, 5);
1450         SAT_SH4_SH(res1, res3, res5, res7, 7);
/* Quarter-pel average, then average with dst rows and store 8x4. */
1452         res0 = __msa_aver_s_h(res0, res1);
1453         res1 = __msa_aver_s_h(res2, res3);
1454         res2 = __msa_aver_s_h(res4, res5);
1455         res3 = __msa_aver_s_h(res6, res7);
1456         ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1457         CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
1459         dst += (4 * dst_stride);
/* 16-wide wrapper: two 8-wide columns.  The per-iteration src/dst advance
 * (presumably += 8) is on lines elided from this excerpt. */
1469 static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,
1474                                                    uint8_t vert_offset)
1476     int32_t multiple8_cnt;
1478     for (multiple8_cnt = 2; multiple8_cnt--;) {
1479         avc_luma_midv_qrt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
1480                                               height, vert_offset);
/* 4x4 diagonal quarter-pel MC with dst averaging: horizontal half-pel from
 * src_x and vertical half-pel from src_y are each 6-tap filtered, then the
 * two are averaged (srari by 1) and averaged again with dst. */
1487 static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
1488                                                  const uint8_t *src_y,
1493     v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1494     v16u8 dst0, dst1, dst2, dst3;
1495     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1496     v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1497     v16i8 mask0, mask1, mask2;
1498     v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
1502     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1503     LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1504     src_y += (5 * src_stride);
/* Pack consecutive 4-pixel rows pairwise (row n | row n+1) so the vertical
 * filter processes two rows per vector. */
1506     src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
1507     src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
1508     src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
1509     src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
1511     XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
/* Horizontal half-pel branch from src_x. */
1512     LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1513     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1514     XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
1515     hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1,
1516                                                           mask0, mask1, mask2);
1517     hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3,
1518                                                           mask0, mask1, mask2);
1519     SRARI_H2_SH(hz_out0, hz_out1, 5);
1520     SAT_SH2_SH(hz_out0, hz_out1, 7);
1521     LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1523     src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
1524     src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
1525     src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
1526     src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
1528     XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
/* Vertical half-pel branch from src_y. */
1531     vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2,
1532                                                   src_vt3, src_vt4, src_vt5);
1533     vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4,
1534                                                   src_vt5, src_vt6, src_vt7);
1535     SRARI_H2_SH(vert_out0, vert_out1, 5);
1536     SAT_SH2_SH(vert_out0, vert_out1, 7);
/* Quarter-pel: rounded average of the two half-pel results. */
1538     res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1539     res0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1541     SAT_SH2_SH(res0, res1, 7);
1542     res = PCKEV_XORI128_UB(res0, res1);
/* Gather the four 4-pixel dst rows into one vector, average, store 4x4. */
1544     dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
1545     dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
1546     dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
1547     dst0 = __msa_aver_u_b(res, dst0);
1549     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 diagonal quarter-pel MC with dst averaging: horizontal half-pel from
 * src_x, vertical half-pel from src_y, averaged together (srari by 1) and
 * with dst; 4 rows per iteration, 2 iterations. */
1552 static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
1553                                                  const uint8_t *src_y,
1559     v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1560     v16u8 dst0, dst1, dst2, dst3;
1561     v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
1562     v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
1563     v16i8 mask0, mask1, mask2;
1564     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1565     v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
1566     v8i16 out0, out1, out2, out3;
1568     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1570     LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1571     src_y += (5 * src_stride);
/* Pack consecutive 8-pixel rows pairwise (row n | row n+1) so the vertical
 * filter handles two rows per vector. */
1573     src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
1574     src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
1575     src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
1576     src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
1578     XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
1580     for (loop_cnt = 2; loop_cnt--;) {
/* Horizontal half-pel branch. */
1581         LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1582         XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
1583         src_x += (4 * src_stride);
1585         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1586         hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
1587         hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
1588         hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
1589         hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
1590         SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
1591         SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
/* Vertical half-pel branch. */
1592         LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1593         src_y += (4 * src_stride);
1595         src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
1596         src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
1597         src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
1598         src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
1600         XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
1601         AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
1602                                         src_vt4, src_vt5, vert_out0, vert_out1);
1603         AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
1604                                         src_vt6, src_vt7, vert_out2, vert_out3);
1605         SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
1606         SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
/* Quarter-pel: rounded average of the two half-pel results. */
1608         out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1609         out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1610         out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
1611         out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
1613         SAT_SH4_SH(out0, out1, out2, out3, 7);
1614         ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1615         CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1617         dst += (4 * dst_stride);
/* 16x16 diagonal quarter-pel MC with dst averaging, done as four 8x8
 * quadrants.  NOTE(review): the -16 in the pointer adjustments presumably
 * undoes a per-iteration += 8 advance on elided lines — confirm. */
1627 static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
1628                                                    const uint8_t *src_y,
1633     uint32_t multiple8_cnt;
/* Top two 8x8 quadrants. */
1635     for (multiple8_cnt = 2; multiple8_cnt--;) {
1636         avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
/* Move down 8 rows and back to the left edge. */
1644     src_x += (8 * src_stride) - 16;
1645     src_y += (8 * src_stride) - 16;
1646     dst += (8 * dst_stride) - 16;
/* Bottom two 8x8 quadrants. */
1648     for (multiple8_cnt = 2; multiple8_cnt--;) {
1649         avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
/* qpel16 mc00 (integer-pel "put"): plain 16x16 block copy, 8 rows at a time. */
1658 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
1661     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1662     v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
1664     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1665     src += (8 * stride);
1666     LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
1668     ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
1669     dst += (8 * stride);
1670     ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
/* qpel8 mc00: plain 8x8 copy via 64-bit scalar loads/stores, 4 rows per
 * LD4/SD4 pair (the src/dst advance between the pairs is on elided lines). */
1673 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
1676     uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
1678     LD4(src, stride, src0, src1, src2, src3);
1680     LD4(src, stride, src4, src5, src6, src7);
1681     SD4(src0, src1, src2, src3, dst, stride);
1683     SD4(src4, src5, src6, src7, dst, stride);
/* qpel16 mc00 "avg": dst = rounded average of src and dst, 16x16, processed
 * as two batches of 8 full-width rows. */
1686 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
1689     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1690     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* First 8 rows. */
1692     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1693     src += (8 * stride);
1694     LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1696     AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1698     AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
1700     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
1701     dst += (8 * stride);
/* Second 8 rows. */
1703     LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1704     LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1706     AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1708     AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
1710     ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
/* qpel8 mc00 "avg": load 8 src and 8 dst rows as 64-bit halves packed two
 * rows per vector, average, store 8x8. */
1713 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
1716     uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
1717     v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
1718     v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
/* Gather 8 source rows (two per vector via INSERT_D2). */
1720     LD4(src, stride, tp0, tp1, tp2, tp3);
1722     LD4(src, stride, tp4, tp5, tp6, tp7);
1723     INSERT_D2_UB(tp0, tp1, src0);
1724     INSERT_D2_UB(tp2, tp3, src1);
1725     INSERT_D2_UB(tp4, tp5, src2);
1726     INSERT_D2_UB(tp6, tp7, src3);
/* Gather 8 destination rows the same way. */
1728     LD4(dst, stride, tp0, tp1, tp2, tp3);
1729     LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
1730     INSERT_D2_UB(tp0, tp1, dst0);
1731     INSERT_D2_UB(tp2, tp3, dst1);
1732     INSERT_D2_UB(tp4, tp5, dst2);
1733     INSERT_D2_UB(tp6, tp7, dst3);
/* Rounded average and 8x8 store. */
1735     AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
1738     ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* qpel4 mc00 "avg": pack the four 4-byte src rows and four dst rows into one
 * vector each, average once, store 4x4. */
1741 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
1744     uint32_t tp0, tp1, tp2, tp3;
1745     v16u8 src0 = { 0 }, dst0 = { 0 };
1747     LW4(src, stride, tp0, tp1, tp2, tp3);
1748     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
1749     LW4(dst, stride, tp0, tp1, tp2, tp3);
1750     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
1752     dst0 = __msa_aver_u_b(src0, dst0);
1754     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* qpel16 mc10: horizontal quarter-pel (left).  6-tap horizontal filter, then
 * the filtered result is averaged with the source shifted by 2 bytes (SLDI),
 * i.e. the integer pixel nearer the quarter position. */
1757 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
1761     v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
1762     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
1763     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1764     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
/* 6-tap filter coefficients as byte splats for the dot products. */
1765     v16i8 minus5b = __msa_ldi_b(-5);
1766     v16i8 plus20b = __msa_ldi_b(20);
1768     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* 4 rows per iteration; each row is two 16-byte loads (the filter needs
 * pixels beyond the 16-wide block). */
1774     for (loop_cnt = 4; loop_cnt--;) {
1775         LD_SB2(src, 16, src0, src1);
1777         LD_SB2(src, 16, src2, src3);
1779         LD_SB2(src, 16, src4, src5);
1781         LD_SB2(src, 16, src6, src7);
1784         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1785         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
1786         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
1787         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
1788         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
1789         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
1790         VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
/* (a+f) - 5*(b+e) + 20*(c+d) for rows 0/1. */
1791         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1792         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1793                      minus5b, res0, res1, res2, res3);
1794         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1795                      plus20b, res0, res1, res2, res3);
/* Same for rows 2/3. */
1796         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
1797         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
1798         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
1799         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
1800         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
1801         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
1802         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1803         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1804                      minus5b, res4, res5, res6, res7);
1805         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1806                      plus20b, res4, res5, res6, res7);
/* Shift source left by 2 bytes to align the integer pixel for mc10. */
1807         SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
1808         SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
1809         SRARI_H4_SH(res0, res1, res2, res3, 5);
1810         SRARI_H4_SH(res4, res5, res6, res7, 5);
1811         SAT_SH4_SH(res0, res1, res2, res3, 7);
1812         SAT_SH4_SH(res4, res5, res6, res7, 7);
1813         PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
1814         PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
/* Quarter-pel average with integer pixels, unbias, store 4 rows. */
1815         dst0 = __msa_aver_s_b(dst0, src0);
1816         dst1 = __msa_aver_s_b(dst1, src2);
1817         dst2 = __msa_aver_s_b(dst2, src4);
1818         dst3 = __msa_aver_s_b(dst3, src6);
1819         XORI_B4_128_SB(dst0, dst1, dst2, dst3);
1820         ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1821         dst += (4 * stride);
/* qpel16 mc30: horizontal quarter-pel (right).  Identical to mc10 except the
 * source is shifted by 3 bytes (SLDI) before the quarter-pel average. */
1825 void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
1829     v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
1830     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
1831     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1832     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1833     v16i8 minus5b = __msa_ldi_b(-5);
1834     v16i8 plus20b = __msa_ldi_b(20);
1836     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1842     for (loop_cnt = 4; loop_cnt--;) {
1843         LD_SB2(src, 16, src0, src1);
1845         LD_SB2(src, 16, src2, src3);
1847         LD_SB2(src, 16, src4, src5);
1849         LD_SB2(src, 16, src6, src7);
1852         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1853         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
1854         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
1855         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
1856         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
1857         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
1858         VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
/* 6-tap horizontal filter: (a+f) - 5*(b+e) + 20*(c+d). */
1859         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1860         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1861                      minus5b, res0, res1, res2, res3);
1862         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1863                      plus20b, res0, res1, res2, res3);
1864         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
1865         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
1866         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
1867         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
1868         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
1869         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
1870         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1871         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1872                      minus5b, res4, res5, res6, res7);
1873         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1874                      plus20b, res4, res5, res6, res7);
/* Shift by 3 (vs 2 in mc10) selects the right-hand integer pixel. */
1875         SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
1876         SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
1877         SRARI_H4_SH(res0, res1, res2, res3, 5);
1878         SRARI_H4_SH(res4, res5, res6, res7, 5);
1879         SAT_SH4_SH(res0, res1, res2, res3, 7);
1880         SAT_SH4_SH(res4, res5, res6, res7, 7);
1881         PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
1882         PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
1883         dst0 = __msa_aver_s_b(dst0, src0);
1884         dst1 = __msa_aver_s_b(dst1, src2);
1885         dst2 = __msa_aver_s_b(dst2, src4);
1886         dst3 = __msa_aver_s_b(dst3, src6);
1887         XORI_B4_128_SB(dst0, dst1, dst2, dst3);
1888         ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
1889         dst += (4 * stride);
/* qpel8 mc10: horizontal quarter-pel (left) for an 8x8 block, fully
 * unrolled: 6-tap horizontal filter on 8 rows, then average with the source
 * shifted by 2 bytes. */
1893 void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
1896     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1897     v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
1898     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1899     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1900     v16i8 minus5b = __msa_ldi_b(-5);
1901     v16i8 plus20b = __msa_ldi_b(20);
1903     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* src - 2: the 6-tap filter needs two pixels left of the block. */
1904     LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1905     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* 6-tap horizontal filter, rows 0-3: (a+f) - 5*(b+e) + 20*(c+d). */
1906     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1907     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1908     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1909     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1910     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1911     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1912                  res0, res1, res2, res3);
1913     VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1914     VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1915     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
1916                  res0, res1, res2, res3);
/* Rows 4-7. */
1917     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1918     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1919     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1920     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1921     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1922     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1923                  res4, res5, res6, res7);
1924     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1925     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1926     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
1927                  res4, res5, res6, res7);
/* Shift sources by 2 to align the integer pixel, pack two 8-pixel rows per
 * vector for the quarter-pel average. */
1928     SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
1929     SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
1930     SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
1931     SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
1932     PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
1933     PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
1934     SRARI_H4_SH(res0, res1, res2, res3, 5);
1935     SRARI_H4_SH(res4, res5, res6, res7, 5);
1936     SAT_SH4_SH(res0, res1, res2, res3, 7);
1937     SAT_SH4_SH(res4, res5, res6, res7, 7);
1938     PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
1939     PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
1940     tmp0 = __msa_aver_s_b(tmp0, src0);
1941     tmp1 = __msa_aver_s_b(tmp1, src1);
1942     tmp2 = __msa_aver_s_b(tmp2, src4);
1943     tmp3 = __msa_aver_s_b(tmp3, src5);
/* Unbias back to unsigned and store the 8x8 block. */
1944     XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
1945     ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
/* 8x8 luma quarter-pel, horizontal-only (mc30) case.
 * Pipeline: load 8 rows from src - 2, bias them by 128 (XORI) so the
 * signed-byte dot-product intrinsics can be used, then apply the 6-tap
 * horizontal filter via three shuffle patterns from luma_mask_arr:
 * the +/-1 outer taps (HADD), the -5 taps (DPADD with minus5b) and the
 * +20 taps (DPADD with plus20b).  Results are rounded (SRARI by 5),
 * saturated to 7 bits, packed back to bytes, averaged with the
 * integer-pel column obtained by shifting each source row left by 3
 * bytes (SLDI by 3 of the src-2 based rows), un-biased and stored. */
1948 void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
1951 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
1952 v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
1953 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
1954 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1955 v16i8 minus5b = __msa_ldi_b(-5);
1956 v16i8 plus20b = __msa_ldi_b(20);
/* three shuffle masks selecting the filter-tap byte pairs */
1958 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1959 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
1960 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* horizontal filter for rows 0..3 */
1961 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1962 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1963 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1964 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1965 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1966 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1967 res0, res1, res2, res3);
1968 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1969 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1970 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
1971 res0, res1, res2, res3);
/* horizontal filter for rows 4..7 */
1972 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
1973 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
1974 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
1975 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
1976 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
1977 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1978 res4, res5, res6, res7);
1979 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
1980 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
1981 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
1982 res4, res5, res6, res7);
/* build the integer-pel column to average with: shift each row by 3
 * (relative to src - 2) and pack two 8-byte rows per vector */
1983 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
1984 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
1985 SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
1986 SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
1987 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
1988 PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
/* round, saturate and pack the filtered rows back to bytes */
1989 SRARI_H4_SH(res0, res1, res2, res3, 5);
1990 SRARI_H4_SH(res4, res5, res6, res7, 5);
1991 SAT_SH4_SH(res0, res1, res2, res3, 7);
1992 SAT_SH4_SH(res4, res5, res6, res7, 7);
1993 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
1994 PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
1995 tmp0 = __msa_aver_s_b(tmp0, src0);
1996 tmp1 = __msa_aver_s_b(tmp1, src1);
1997 tmp2 = __msa_aver_s_b(tmp2, src4);
1998 tmp3 = __msa_aver_s_b(tmp3, src5);
/* remove the +128 bias before storing unsigned pixels */
1999 XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
2000 ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
/* 4x4 luma quarter-pel, horizontal-only (mc10) case.
 * Applies the 6-tap horizontal filter (HADD for the outer taps, DPADD
 * with -5 and +20 for the inner ones) over two row-pairs using the
 * 4-wide shuffle masks at luma_mask_arr[48], rounds (SRARI by 5) and
 * saturates, then averages with the integer-pel column obtained by
 * shifting the src - 2 rows left by 2 bytes before the final store. */
2003 void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
2006 v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
2007 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2009 v16i8 minus5b = __msa_ldi_b(-5);
2010 v16i8 plus20b = __msa_ldi_b(20);
2012 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2013 LD_SB4(src - 2, stride, src0, src1, src2, src3);
/* bias to signed domain for the signed dot products */
2014 XORI_B4_128_SB(src0, src1, src2, src3);
2015 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2016 HADD_SB2_SH(vec0, vec1, res0, res1);
2017 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2018 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
2019 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2020 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
2021 SRARI_H2_SH(res0, res1, 5);
2022 SAT_SH2_SH(res0, res1, 7);
2023 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
/* gather the four 4-byte integer-pel rows (offset +2 within the
 * src - 2 loads) into one vector and average */
2024 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
2025 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
2026 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
2027 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
2028 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
2029 res = __msa_aver_s_b(res, src0);
/* un-bias back to unsigned pixels */
2030 res = (v16i8) __msa_xori_b((v16u8) res, 128);
2031 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 4x4 luma quarter-pel, horizontal-only (mc30) case.
 * Identical pipeline to the mc10 variant above except that the averaged
 * integer-pel column comes from shifting the src - 2 rows by 3 bytes
 * (i.e. the column one pixel to the right of the mc10 one). */
2034 void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
2037 v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
2038 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2040 v16i8 minus5b = __msa_ldi_b(-5);
2041 v16i8 plus20b = __msa_ldi_b(20);
2043 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2044 LD_SB4(src - 2, stride, src0, src1, src2, src3);
/* bias to signed domain for the signed dot products */
2045 XORI_B4_128_SB(src0, src1, src2, src3);
2046 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2047 HADD_SB2_SH(vec0, vec1, res0, res1);
2048 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2049 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
2050 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2051 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
2052 SRARI_H2_SH(res0, res1, 5);
2053 SAT_SH2_SH(res0, res1, 7);
2054 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
/* shift by 3: select the right-adjacent integer-pel column */
2055 SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
2056 SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
2057 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
2058 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
2059 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
2060 res = __msa_aver_s_b(res, src0);
2061 res = (v16i8) __msa_xori_b((v16u8) res, 128);
2062 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 16x16 luma half-pel horizontal (mc20) case.
 * Each loop iteration processes 4 full 16-pixel rows: two 16-byte loads
 * per row (8-byte offset apart) cover the 16 output pixels plus the
 * 5-tap border, then the 6-tap horizontal filter is applied through the
 * three shuffle masks (outer +/-1 taps via HADD, -5 and +20 taps via
 * DPADD), rounded (SRARI by 5), saturated, packed and un-biased before
 * four 16-byte row stores.  No integer-pel averaging: mc20 is the pure
 * half-pel position. */
2065 void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
2069 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
2070 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2072 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2073 v16i8 minus5b = __msa_ldi_b(-5);
2074 v16i8 plus20b = __msa_ldi_b(20);
2076 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* 4 iterations x 4 rows = 16 output rows */
2079 for (loop_cnt = 4; loop_cnt--;) {
2080 LD_SB2(src, 8, src0, src1);
2082 LD_SB2(src, 8, src2, src3);
2084 LD_SB2(src, 8, src4, src5);
2086 LD_SB2(src, 8, src6, src7);
/* bias to signed domain for the signed dot products */
2089 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2090 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
2091 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
2092 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
2093 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
2094 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
2095 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
2096 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
2097 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2098 minus5b, res0, res1, res2, res3);
2099 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2100 plus20b, res0, res1, res2, res3);
2101 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
2102 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
2103 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
2104 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
2105 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
2106 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
2107 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
2108 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
2109 minus5b, res4, res5, res6, res7);
2110 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
2111 plus20b, res4, res5, res6, res7);
/* round, saturate, pack to bytes and un-bias for the store */
2112 SRARI_H4_SH(res0, res1, res2, res3, 5);
2113 SRARI_H4_SH(res4, res5, res6, res7, 5);
2114 SAT_SH4_SH(res0, res1, res2, res3, 7);
2115 SAT_SH4_SH(res4, res5, res6, res7, 7);
2116 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
2118 XORI_B4_128_SB(vec0, vec1, vec2, vec3);
2119 ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
2120 dst += (4 * stride);
/* 8x8 luma half-pel horizontal (mc20) case.
 * Loads all 8 rows from src - 2 up front, applies the 6-tap horizontal
 * filter per row (HADD outer taps, DPADD -5/+20 inner taps), rounds
 * (SRARI by 5), saturates, then packs and un-biases in one step
 * (PCKEV_XORI128_UB) before the 8x8 store.  No integer-pel averaging:
 * mc20 is the pure half-pel position. */
2124 void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
2127 v16u8 out0, out1, out2, out3;
2128 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
2129 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
2131 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
2132 v16i8 minus5b = __msa_ldi_b(-5);
2133 v16i8 plus20b = __msa_ldi_b(20);
2135 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2136 LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
/* bias to signed domain for the signed dot products */
2137 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* filter rows 0..3 */
2138 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2139 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2140 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
2141 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
2142 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
2143 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2144 res0, res1, res2, res3);
2145 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
2146 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
2147 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
2148 plus20b, res0, res1, res2, res3);
/* filter rows 4..7 */
2149 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2150 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
2151 HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
2152 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
2153 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
2154 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
2155 res4, res5, res6, res7);
2156 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
2157 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
2158 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
2159 plus20b, res4, res5, res6, res7);
2160 SRARI_H4_SH(res0, res1, res2, res3, 5);
2161 SRARI_H4_SH(res4, res5, res6, res7, 5);
2162 SAT_SH4_SH(res0, res1, res2, res3, 7);
2163 SAT_SH4_SH(res4, res5, res6, res7, 7);
/* pack halfword pairs to bytes and flip the 128 bias in one macro */
2164 out0 = PCKEV_XORI128_UB(res0, res1);
2165 out1 = PCKEV_XORI128_UB(res2, res3);
2166 out2 = PCKEV_XORI128_UB(res4, res5);
2167 out3 = PCKEV_XORI128_UB(res6, res7);
2168 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 4x4 luma half-pel horizontal (mc20) case.
 * Same 6-tap horizontal filter as the wider mc20 variants, using the
 * 4-wide shuffle masks at luma_mask_arr[48]; two vectors cover all four
 * rows.  Rounds (SRARI by 5), saturates, packs + un-biases via
 * PCKEV_XORI128_UB and stores 4x4 bytes.  No integer-pel averaging. */
2171 void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
2175 v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
2176 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2178 v16i8 minus5b = __msa_ldi_b(-5);
2179 v16i8 plus20b = __msa_ldi_b(20);
2181 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2182 LD_SB4(src - 2, stride, src0, src1, src2, src3);
/* bias to signed domain for the signed dot products */
2183 XORI_B4_128_SB(src0, src1, src2, src3);
2184 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2185 HADD_SB2_SH(vec0, vec1, res0, res1);
2186 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2187 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
2188 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2189 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
2190 SRARI_H2_SH(res0, res1, 5);
2191 SAT_SH2_SH(res0, res1, 7);
2192 out = PCKEV_XORI128_UB(res0, res1);
2193 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 16x16 luma quarter-pel, vertical-only (mc01) case.
 * The 6-tap vertical filter is expressed as three halfword-pair
 * constants replicated across a vector: 0xfb01 = (+1, -5), 0x1414 =
 * (+20, +20), 0x01fb = (-5, +1), applied with AVC_DOT_SH3_SH over
 * byte-interleaved row pairs (ILVR/ILVL split each 16-wide row into
 * right/left 8-lane halves).  Each loop iteration consumes 4 new rows,
 * rounds (SRARI by 5), saturates, packs, then averages the filtered
 * rows with the vertically nearest integer-pel rows (src2..src5 of the
 * sliding window) before un-biasing and storing 4 rows.
 * NOTE(review): filter constants made const to match the qpel8
 * mc01/mc03 variants in this file; no behavior change. */
2196 void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
2200 const int16_t filt_const0 = 0xfb01;
2201 const int16_t filt_const1 = 0x1414;
2202 const int16_t filt_const2 = 0x1fb;
2203 v16u8 res0, res1, res2, res3;
2204 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2205 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2206 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2207 v16i8 src65_l, src87_l, filt0, filt1, filt2;
2208 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* replicate the tap pairs across all halfword lanes */
2210 filt0 = (v16i8) __msa_fill_h(filt_const0);
2211 filt1 = (v16i8) __msa_fill_h(filt_const1);
2212 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* start two rows above the block for the filter border */
2214 src -= (stride * 2);
2216 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2217 src += (5 * stride);
2219 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2220 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2222 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* 4 iterations x 4 rows = 16 output rows */
2225 for (loop_cnt = 4; loop_cnt--;) {
2226 LD_SB4(src, stride, src5, src6, src7, src8);
2227 src += (4 * stride);
2229 XORI_B4_128_SB(src5, src6, src7, src8);
2230 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2231 src65_r, src76_r, src87_r);
2232 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2233 src65_l, src76_l, src87_l);
/* vertical 6-tap dot products, right then left halves */
2234 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2235 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2236 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2237 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2238 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2239 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2240 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2241 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2242 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2243 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2244 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2245 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2246 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2247 out3_r, res0, res1, res2, res3);
/* quarter-pel: average with the upper neighbouring integer rows */
2248 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
2249 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
2250 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
2251 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
2252 XORI_B4_128_UB(res0, res1, res2, res3);
2253 ST_UB4(res0, res1, res2, res3, dst, stride);
2254 dst += (4 * stride);
/* 16x16 luma quarter-pel, vertical-only (mc03) case.
 * Identical filtering pipeline to the mc01 variant above; the only
 * difference is the integer-pel rows used for the final average
 * (src3..src6 instead of src2..src5), i.e. the lower neighbour of each
 * filtered row rather than the upper one.
 * NOTE(review): filter constants made const to match the qpel8
 * mc01/mc03 variants in this file; no behavior change. */
2270 void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
2274 const int16_t filt_const0 = 0xfb01;
2275 const int16_t filt_const1 = 0x1414;
2276 const int16_t filt_const2 = 0x1fb;
2277 v16u8 res0, res1, res2, res3;
2278 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2279 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2280 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2281 v16i8 src65_l, src87_l, filt0, filt1, filt2;
2282 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* replicate the 6-tap pairs (+1,-5), (+20,+20), (-5,+1) per lane */
2284 filt0 = (v16i8) __msa_fill_h(filt_const0);
2285 filt1 = (v16i8) __msa_fill_h(filt_const1);
2286 filt2 = (v16i8) __msa_fill_h(filt_const2);
2288 src -= (stride * 2);
2290 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2291 src += (5 * stride);
2293 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2294 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2296 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
/* 4 iterations x 4 rows = 16 output rows */
2299 for (loop_cnt = 4; loop_cnt--;) {
2300 LD_SB4(src, stride, src5, src6, src7, src8);
2301 src += (4 * stride);
2303 XORI_B4_128_SB(src5, src6, src7, src8);
2304 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
2305 src65_r, src76_r, src87_r);
2306 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
2307 src65_l, src76_l, src87_l);
2308 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2309 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2310 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2311 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2312 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2313 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2314 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2315 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2316 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2317 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2318 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2319 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2320 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2321 out3_r, res0, res1, res2, res3);
/* quarter-pel: average with the lower neighbouring integer rows */
2322 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
2323 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
2324 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
2325 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
2326 XORI_B4_128_UB(res0, res1, res2, res3);
2327 ST_UB4(res0, res1, res2, res3, dst, stride);
2328 dst += (4 * stride);
/* 8x8 luma quarter-pel, vertical-only (mc01) case, fully unrolled.
 * Loads the whole 13-row window (8 output rows + 5-row filter border)
 * up front, byte-interleaves consecutive rows (right halves only, since
 * rows are 8 wide), runs the 6-tap vertical dot products, rounds
 * (SRARI by 5), saturates and packs, then averages with the upper
 * neighbouring integer-pel rows (src2..src9 packed in pairs) before
 * un-biasing and storing 8x8 bytes. */
2343 void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
2346 const int16_t filt_const0 = 0xfb01;
2347 const int16_t filt_const1 = 0x1414;
2348 const int16_t filt_const2 = 0x1fb;
2349 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2350 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
2351 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
2352 v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
2353 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* replicate the 6-tap pairs (+1,-5), (+20,+20), (-5,+1) per lane */
2355 filt0 = (v16i8) __msa_fill_h(filt_const0);
2356 filt1 = (v16i8) __msa_fill_h(filt_const1);
2357 filt2 = (v16i8) __msa_fill_h(filt_const2);
2359 src -= (stride * 2);
2361 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2362 src += (5 * stride);
2363 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
2364 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
2365 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2366 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2368 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2370 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
2371 src109_r, src1110_r, src1211_r);
2372 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2373 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2374 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2375 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2376 out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
2377 out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
2378 out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
2379 out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
/* pack the upper-neighbour integer rows two-per-vector for averaging */
2380 PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
2381 PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
2382 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2383 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
2384 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2385 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
2386 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
2387 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
2388 out0 = __msa_aver_s_b(out0, tmp0);
2389 out1 = __msa_aver_s_b(out1, tmp1);
2390 out2 = __msa_aver_s_b(out2, tmp2);
2391 out3 = __msa_aver_s_b(out3, tmp3);
/* remove the +128 bias before storing unsigned pixels */
2392 XORI_B4_128_SB(out0, out1, out2, out3);
2393 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 8x8 luma quarter-pel, vertical-only (mc03) case, fully unrolled.
 * Same pipeline as the mc01 variant above; the only difference is the
 * integer-pel rows used for the final average (src3..src10 packed in
 * pairs instead of src2..src9), i.e. the lower neighbour of each
 * filtered row. */
2396 void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
2399 const int16_t filt_const0 = 0xfb01;
2400 const int16_t filt_const1 = 0x1414;
2401 const int16_t filt_const2 = 0x1fb;
2402 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2403 v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
2404 v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
2405 v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
2406 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
/* replicate the 6-tap pairs (+1,-5), (+20,+20), (-5,+1) per lane */
2408 filt0 = (v16i8) __msa_fill_h(filt_const0);
2409 filt1 = (v16i8) __msa_fill_h(filt_const1);
2410 filt2 = (v16i8) __msa_fill_h(filt_const2);
2412 src -= (stride * 2);
2414 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2415 src += (5 * stride);
2416 LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
2417 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2418 XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
2419 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2421 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2423 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
2424 src109_r, src1110_r, src1211_r);
2425 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2426 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2427 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2428 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2429 out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
2430 out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
2431 out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
2432 out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
/* pack the lower-neighbour integer rows two-per-vector for averaging */
2433 PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
2434 PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
2435 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2436 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
2437 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2438 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
2439 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
2440 PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
2441 out0 = __msa_aver_s_b(out0, tmp0);
2442 out1 = __msa_aver_s_b(out1, tmp1);
2443 out2 = __msa_aver_s_b(out2, tmp2);
2444 out3 = __msa_aver_s_b(out3, tmp3);
/* remove the +128 bias before storing unsigned pixels */
2445 XORI_B4_128_SB(out0, out1, out2, out3);
2446 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 4x4 luma quarter-pel, vertical-only (mc01) case.
 * Four 4-wide rows are processed at once by interleaving row pairs and
 * then doubling them up with ILVR_D2_SB, so a single pair of
 * AVC_DOT_SH3_SH 6-tap dot products covers all four output rows.
 * After rounding (SRARI by 5), saturation and pack+un-bias
 * (PCKEV_XORI128_UB), the result is averaged with the upper
 * neighbouring integer rows (src2..src5 gathered into one vector)
 * using an unsigned average, then stored as 4x4 bytes.
 * NOTE(review): filter constants made const to match the qpel8
 * mc01/mc03 variants in this file; no behavior change. */
2449 void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
2452 const int16_t filt_const0 = 0xfb01;
2453 const int16_t filt_const1 = 0x1414;
2454 const int16_t filt_const2 = 0x1fb;
2456 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2457 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2458 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
/* replicate the 6-tap pairs (+1,-5), (+20,+20), (-5,+1) per lane */
2461 filt0 = (v16i8) __msa_fill_h(filt_const0);
2462 filt1 = (v16i8) __msa_fill_h(filt_const1);
2463 filt2 = (v16i8) __msa_fill_h(filt_const2);
2465 src -= (stride * 2);
2467 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2468 src += (5 * stride);
2469 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2471 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2472 XORI_B2_128_SB(src2110, src4332);
2473 LD_SB4(src, stride, src5, src6, src7, src8);
2474 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2476 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
2477 XORI_B2_128_SB(src6554, src8776);
2478 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2479 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
2480 SRARI_H2_SH(out10, out32, 5);
2481 SAT_SH2_SH(out10, out32, 7);
2482 out = PCKEV_XORI128_UB(out10, out32);
/* gather src2..src5 (upper integer-pel neighbours) into one vector */
2483 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
2484 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
2485 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
2486 out = __msa_aver_u_b(out, (v16u8) src32_r);
2487 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 4x4 luma quarter-pel, vertical-only (mc03) case.
 * Identical filtering pipeline to the mc01 variant above; the only
 * difference is the integer-pel rows averaged into the result
 * (src3..src6 instead of src2..src5), i.e. the lower neighbour of each
 * filtered row.
 * NOTE(review): filter constants made const to match the qpel8
 * mc01/mc03 variants in this file; no behavior change. */
2490 void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
2493 const int16_t filt_const0 = 0xfb01;
2494 const int16_t filt_const1 = 0x1414;
2495 const int16_t filt_const2 = 0x1fb;
2497 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2498 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2499 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
/* replicate the 6-tap pairs (+1,-5), (+20,+20), (-5,+1) per lane */
2502 filt0 = (v16i8) __msa_fill_h(filt_const0);
2503 filt1 = (v16i8) __msa_fill_h(filt_const1);
2504 filt2 = (v16i8) __msa_fill_h(filt_const2);
2506 src -= (stride * 2);
2508 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2509 src += (5 * stride);
2510 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
2512 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2513 XORI_B2_128_SB(src2110, src4332);
2514 LD_SB4(src, stride, src5, src6, src7, src8);
2515 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
2517 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
2518 XORI_B2_128_SB(src6554, src8776);
2519 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
2520 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
2521 SRARI_H2_SH(out10, out32, 5);
2522 SAT_SH2_SH(out10, out32, 7);
2523 out = PCKEV_XORI128_UB(out10, out32);
/* gather src3..src6 (lower integer-pel neighbours) into one vector */
2524 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
2525 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
2526 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
2527 out = __msa_aver_u_b(out, (v16u8) src32_r);
2528 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 16x16 luma at quarter-pel position (1,1) per the mcXY naming:
 * delegates to the shared horizontal+vertical quarter-pel helper with
 * the horizontal source at src - 2 and the vertical source starting
 * two rows above the block. */
2531 void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
2534 avc_luma_hv_qrt_16w_msa(src - 2,
2535 src - (stride * 2), stride, dst, stride, 16);
/* 16x16 luma at quarter-pel position (3,1): same helper as mc11 but
 * the vertical source is shifted one pixel to the right
 * (+ sizeof(uint8_t)). */
2538 void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
2541 avc_luma_hv_qrt_16w_msa(src - 2,
2542 src - (stride * 2) +
2543 sizeof(uint8_t), stride, dst, stride, 16);
/* 16x16 luma at quarter-pel position (1,3): same helper as mc11 but
 * the horizontal source is shifted one row down (src + stride - 2). */
2546 void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
2549 avc_luma_hv_qrt_16w_msa(src + stride - 2,
2550 src - (stride * 2), stride, dst, stride, 16);
/* 16x16 luma at quarter-pel position (3,3): horizontal source one row
 * down, vertical source one pixel to the right. */
2553 void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
2556 avc_luma_hv_qrt_16w_msa(src + stride - 2,
2557 src - (stride * 2) +
2558 sizeof(uint8_t), stride, dst, stride, 16);
/* 8x8 luma at quarter-pel position (1,1): 8-wide variant of the hv
 * quarter-pel helper; horizontal source at src - 2, vertical source
 * two rows above the block. */
2561 void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
2564 avc_luma_hv_qrt_8w_msa(src - 2, src - (stride * 2), stride, dst, stride, 8);
/* 8x8 luma at quarter-pel position (3,1): vertical source shifted one
 * pixel to the right relative to mc11. */
2567 void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
2570 avc_luma_hv_qrt_8w_msa(src - 2,
2571 src - (stride * 2) +
2572 sizeof(uint8_t), stride, dst, stride, 8);
/* 8x8 luma at quarter-pel position (1,3): horizontal source shifted
 * one row down relative to mc11. */
2575 void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
2578 avc_luma_hv_qrt_8w_msa(src + stride - 2,
2579 src - (stride * 2), stride, dst, stride, 8);
/* 8x8 luma at quarter-pel position (3,3): horizontal source one row
 * down, vertical source one pixel to the right. */
2582 void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
2585 avc_luma_hv_qrt_8w_msa(src + stride - 2,
2586 src - (stride * 2) +
2587 sizeof(uint8_t), stride, dst, stride, 8);
/* 4x4 luma at quarter-pel position (1,1): 4-wide variant of the hv
 * quarter-pel helper; horizontal source at src - 2, vertical source
 * two rows above the block. */
2591 void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
2594 avc_luma_hv_qrt_4w_msa(src - 2, src - (stride * 2), stride, dst, stride, 4);
/* 4x4 luma at quarter-pel position (3,1): vertical source shifted one
 * pixel to the right relative to mc11. */
2597 void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
2600 avc_luma_hv_qrt_4w_msa(src - 2,
2601 src - (stride * 2) +
2602 sizeof(uint8_t), stride, dst, stride, 4);
/* 4x4 luma at quarter-pel position (1,3): horizontal source shifted
 * one row down relative to mc11. */
2605 void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
2608 avc_luma_hv_qrt_4w_msa(src + stride - 2,
2609 src - (stride * 2), stride, dst, stride, 4);
/* 4x4 luma at quarter-pel position (3,3): horizontal source one row
 * down, vertical source one pixel to the right. */
2612 void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
2615 avc_luma_hv_qrt_4w_msa(src + stride - 2,
2616 src - (stride * 2) +
2617 sizeof(uint8_t), stride, dst, stride, 4);
/* 16x16 luma quarter-pel (2,1) case: center half-pel filtering in both
 * directions, then averaged with the horizontal half-pel row above it.
 * Stage 1: each row is horizontally filtered with the 6-tap kernel
 * (AVC_HORZ_FILTER_SH) into 16-bit intermediates hz_out*.
 * Stage 2: a vertical 6-tap filter runs over those intermediates in
 * 32-bit precision via AVC_DOT_SW3_SW; the word-pair constants encode
 * the taps: 0xfffb0001 = (+1, -5), 0x00140014 = (+20, +20),
 * 0x0001fffb = (-5, +1).
 * Stage 3: the hh result is averaged (aver_s_h) with the rounded
 * horizontal-only intermediates hz_out2..hz_out5 (SRARI by 5 + SAT),
 * which correspond to the half-pel rows aligned with the outputs, then
 * packed/un-biased and stored 8 columns at a time (multiple8_cnt walks
 * the two 8-wide column halves, loop_cnt the 4-row groups). */
2620 void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
2623 uint8_t *dst_tmp = dst;
2624 const uint8_t *src_tmp = src - (2 * stride) - 2;
2625 uint32_t multiple8_cnt, loop_cnt;
2626 const int32_t filt_const0 = 0xfffb0001;
2627 const int32_t filt_const1 = 0x140014;
2628 const int32_t filt_const2 = 0x1fffb;
2630 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
2632 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2633 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2634 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2635 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2636 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2637 v8i16 hz_out87_l, filt0, filt1, filt2;
/* replicate the vertical tap pairs across word lanes */
2640 filt0 = (v8i16) __msa_fill_w(filt_const0);
2641 filt1 = (v8i16) __msa_fill_w(filt_const1);
2642 filt2 = (v8i16) __msa_fill_w(filt_const2);
2644 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* two passes: left 8 columns, then right 8 columns */
2646 for (multiple8_cnt = 2; multiple8_cnt--;) {
2650 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2651 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2652 src += (5 * stride);
/* prime the 5-row horizontal-filter window */
2654 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2655 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2656 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2657 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2658 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
2660 for (loop_cnt = 4; loop_cnt--;) {
2661 LD_SB4(src, stride, src5, src6, src7, src8);
2662 src += (4 * stride);
2664 XORI_B4_128_SB(src5, src6, src7, src8);
2666 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
2667 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
2668 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
2669 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave halfword rows for the 32-bit vertical dot products */
2671 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2672 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2674 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2675 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2677 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2678 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2680 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2681 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* vertical filter in 32-bit precision, one output row at a time */
2684 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
2686 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
2688 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2689 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
2691 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
2693 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2694 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
2696 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
2698 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2699 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
2701 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
2703 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* round the horizontal-only half-pel rows used for the average */
2705 dst1 = __msa_srari_h(hz_out2, 5);
2706 dst3 = __msa_srari_h(hz_out3, 5);
2707 dst5 = __msa_srari_h(hz_out4, 5);
2708 dst7 = __msa_srari_h(hz_out5, 5);
2709 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
/* mc21: average hh result with the horizontal half-pel row */
2711 dst0 = __msa_aver_s_h(dst0, dst1);
2712 dst1 = __msa_aver_s_h(dst2, dst3);
2713 dst2 = __msa_aver_s_h(dst4, dst5);
2714 dst3 = __msa_aver_s_h(dst6, dst7);
2716 out0 = PCKEV_XORI128_UB(dst0, dst1);
2717 out1 = PCKEV_XORI128_UB(dst2, dst3);
2718 ST8x4_UB(out0, out1, dst, stride);
2719 dst += (4 * stride);
/* 16x16 luma quarter-pel MC, position (2,3): horizontal 6-tap half-pel rows
 * are vertically 6-tap filtered (2D half-pel), then averaged with the
 * rounded (>>5), saturated horizontal half-pel rows 3..6 of each 4-row group.
 * Processed as two 8-wide column passes (multiple8_cnt) of 4x4-row loops. */
2733 void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
2736 uint8_t *dst_tmp = dst;
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
2737 const uint8_t *src_tmp = src - (2 * stride) - 2;
2738 uint32_t multiple8_cnt, loop_cnt;
/* halfword-pair packed 6-tap coefficients {1, -5, 20, 20, -5, 1} */
2739 const int32_t filt_const0 = 0xfffb0001;
2740 const int32_t filt_const1 = 0x140014;
2741 const int32_t filt_const2 = 0x1fffb;
2743 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
2745 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2746 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2747 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2748 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
2749 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
2750 v8i16 hz_out87_l, filt0, filt1, filt2;
2753 filt0 = (v8i16) __msa_fill_w(filt_const0);
2754 filt1 = (v8i16) __msa_fill_w(filt_const1);
2755 filt2 = (v8i16) __msa_fill_w(filt_const2);
2757 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2759 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* prologue: first 5 rows needed before vertical filtering can start */
2763 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2764 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2765 src += (5 * stride);
2767 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2768 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2769 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2770 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2771 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
2773 for (loop_cnt = 4; loop_cnt--;) {
2774 LD_SB4(src, stride, src5, src6, src7, src8);
2775 src += (4 * stride);
2777 XORI_B4_128_SB(src5, src6, src7, src8);
2779 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
2780 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
2781 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
2782 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave consecutive horizontal rows to form vertical-filter columns */
2784 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2785 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
2787 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
2788 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
2790 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2791 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
2793 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
2794 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* vertical 6-tap on the interleaved columns -> 2D half-pel rows */
2797 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
2799 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
2801 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2802 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
2804 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
2806 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2807 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
2809 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
2811 dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2812 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
2814 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
2816 dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc23: round/saturate horizontal half-pel rows 3..6, then average with
 * the corresponding 2D half-pel rows */
2818 dst1 = __msa_srari_h(hz_out3, 5);
2819 dst3 = __msa_srari_h(hz_out4, 5);
2820 dst5 = __msa_srari_h(hz_out5, 5);
2821 dst7 = __msa_srari_h(hz_out6, 5);
2822 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
2824 dst0 = __msa_aver_s_h(dst0, dst1);
2825 dst1 = __msa_aver_s_h(dst2, dst3);
2826 dst2 = __msa_aver_s_h(dst4, dst5);
2827 dst3 = __msa_aver_s_h(dst6, dst7);
2829 out0 = PCKEV_XORI128_UB(dst0, dst1);
2830 out1 = PCKEV_XORI128_UB(dst2, dst3);
2831 ST8x4_UB(out0, out1, dst, stride);
2832 dst += (4 * stride);
/* 8x8 luma quarter-pel MC, position (2,1): 2D (horizontal then vertical)
 * 6-tap half-pel result averaged with the rounded (>>5), saturated
 * horizontal half-pel rows 2..5 (first half) / 6..9 (second half).
 * Fully unrolled: two 4-row passes, no loop. */
2846 void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
/* halfword-pair packed 6-tap coefficients {1, -5, 20, 20, -5, 1} */
2849 const int32_t filt_const0 = 0xfffb0001;
2850 const int32_t filt_const1 = 0x140014;
2851 const int32_t filt_const2 = 0x1fffb;
2853 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2854 v16i8 src11, src12, mask0, mask1, mask2;
2855 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2856 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2857 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2858 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2859 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2860 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2861 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2862 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
2865 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2867 filt0 = (v8i16) __msa_fill_w(filt_const0);
2868 filt1 = (v8i16) __msa_fill_w(filt_const1);
2869 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
2871 src -= ((2 * stride) + 2);
2873 LD_SB5(src, stride, src0, src1, src2, src3, src4);
2874 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2875 src += (5 * stride);
2877 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
2878 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
2879 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
2880 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
2881 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
2883 LD_SB4(src, stride, src5, src6, src7, src8);
2884 src += (4 * stride);
2885 XORI_B4_128_SB(src5, src6, src7, src8);
2887 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
2888 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
2889 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
2890 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave consecutive horizontal rows into vertical-filter columns */
2892 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2893 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
2894 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
2895 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
2896 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2897 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
2898 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
2899 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* vertical 6-tap -> 2D half-pel output rows 0..3 */
2901 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
2903 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
2905 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2906 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
2908 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
2910 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2911 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
2913 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
2915 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2916 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
2918 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
2920 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc21: average with rounded/saturated horizontal half-pel rows 2..5 */
2922 SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
2923 SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);
2925 dst0 = __msa_aver_s_h(dst0, hz_out2);
2926 dst1 = __msa_aver_s_h(dst1, hz_out3);
2927 dst2 = __msa_aver_s_h(dst2, hz_out4);
2928 dst3 = __msa_aver_s_h(dst3, hz_out5);
2930 out0 = PCKEV_XORI128_UB(dst0, dst1);
2931 out1 = PCKEV_XORI128_UB(dst2, dst3);
2932 ST8x4_UB(out0, out1, dst, stride);
2933 dst += (4 * stride);
/* second 4-row pass: rows 9..12 extend the sliding window */
2935 LD_SB4(src, stride, src9, src10, src11, src12);
2936 XORI_B4_128_SB(src9, src10, src11, src12);
2937 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
2938 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
2939 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
2940 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
2941 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2942 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
2944 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
2945 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
2947 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
2949 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
2951 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2952 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
2954 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
2956 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2957 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
2959 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
2961 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2962 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
2964 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
2966 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
2968 SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
2969 SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);
2971 dst0 = __msa_aver_s_h(dst0, hz_out6);
2972 dst1 = __msa_aver_s_h(dst1, hz_out7);
2973 dst2 = __msa_aver_s_h(dst2, hz_out8);
2974 dst3 = __msa_aver_s_h(dst3, hz_out9);
2976 out0 = PCKEV_XORI128_UB(dst0, dst1);
2977 out1 = PCKEV_XORI128_UB(dst2, dst3);
2978 ST8x4_UB(out0, out1, dst, stride);
/* 8x8 luma quarter-pel MC, position (2,3): identical structure to
 * ff_put_h264_qpel8_mc21_msa, but the 2D half-pel result is averaged with
 * horizontal half-pel rows shifted one down (rows 3..6 / 7..10). */
2981 void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
/* halfword-pair packed 6-tap coefficients {1, -5, 20, 20, -5, 1} */
2984 const int32_t filt_const0 = 0xfffb0001;
2985 const int32_t filt_const1 = 0x140014;
2986 const int32_t filt_const2 = 0x1fffb;
2988 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2989 v16i8 src11, src12, mask0, mask1, mask2;
2990 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
2991 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
2992 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
2993 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
2994 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
2995 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
2996 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
2997 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
3000 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3002 filt0 = (v8i16) __msa_fill_w(filt_const0);
3003 filt1 = (v8i16) __msa_fill_w(filt_const1);
3004 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
3006 src -= ((2 * stride) + 2);
3008 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3009 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3010 src += (5 * stride);
3012 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3013 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3014 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3015 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3016 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
3018 LD_SB4(src, stride, src5, src6, src7, src8);
3019 src += (4 * stride);
3020 XORI_B4_128_SB(src5, src6, src7, src8);
3022 hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
3023 hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
3024 hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
3025 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* interleave consecutive horizontal rows into vertical-filter columns */
3027 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3028 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3029 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3030 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
3031 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3032 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
3033 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3034 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* vertical 6-tap -> 2D half-pel output rows 0..3 */
3036 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3038 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
3040 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3041 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3043 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
3045 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3046 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3048 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
3050 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3051 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3053 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
3055 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc23: average with rounded/saturated horizontal half-pel rows 3..6
 * (one row below mc21's choice) */
3057 SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
3058 SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);
3060 dst0 = __msa_aver_s_h(dst0, hz_out3);
3061 dst1 = __msa_aver_s_h(dst1, hz_out4);
3062 dst2 = __msa_aver_s_h(dst2, hz_out5);
3063 dst3 = __msa_aver_s_h(dst3, hz_out6);
3065 out0 = PCKEV_XORI128_UB(dst0, dst1);
3066 out1 = PCKEV_XORI128_UB(dst2, dst3);
3067 ST8x4_UB(out0, out1, dst, stride);
3068 dst += (4 * stride);
/* second 4-row pass: rows 9..12 extend the sliding window */
3070 LD_SB4(src, stride, src9, src10, src11, src12);
3071 XORI_B4_128_SB(src9, src10, src11, src12);
3072 hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
3073 hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
3074 hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
3075 hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
3076 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3077 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
3079 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
3080 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
3082 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
3084 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
3086 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3087 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
3089 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
3091 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3092 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
3094 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
3096 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3097 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
3099 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
3101 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3103 SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
3104 SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);
3106 dst0 = __msa_aver_s_h(dst0, hz_out7);
3107 dst1 = __msa_aver_s_h(dst1, hz_out8);
3108 dst2 = __msa_aver_s_h(dst2, hz_out9);
3109 dst3 = __msa_aver_s_h(dst3, hz_out10);
3111 out0 = PCKEV_XORI128_UB(dst0, dst1);
3112 out1 = PCKEV_XORI128_UB(dst2, dst3);
3113 ST8x4_UB(out0, out1, dst, stride);
/* 4x4 luma quarter-pel MC, position (2,1): 2D 6-tap half-pel averaged with
 * the rounded horizontal half-pel rows 2..5.  Two 4-wide rows are packed
 * per vector (luma_mask_arr[48] masks), so the whole block fits in two
 * output vectors (dst0/dst1). */
3116 void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
/* halfword-pair packed 6-tap coefficients {1, -5, 20, 20, -5, 1} */
3119 const int32_t filt_const0 = 0xfffb0001;
3120 const int32_t filt_const1 = 0x140014;
3121 const int32_t filt_const2 = 0x1fffb;
3123 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3124 v16i8 mask0, mask1, mask2;
3125 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3126 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3127 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3128 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* 4-wide variant uses the two-source shuffle masks at offset 48 */
3131 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3133 filt0 = (v8i16) __msa_fill_w(filt_const0);
3134 filt1 = (v8i16) __msa_fill_w(filt_const1);
3135 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
3137 src -= ((2 * stride) + 2);
3139 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3140 src += (5 * stride);
3141 LD_SB4(src, stride, src5, src6, src7, src8);
3143 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3144 XORI_B4_128_SB(src5, src6, src7, src8);
/* horizontal half-pel; each call filters two input rows at once */
3146 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3147 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3148 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3149 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3150 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* split the packed row pairs into the odd rows */
3151 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3152 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
3154 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3155 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3156 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3157 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* vertical 6-tap -> 2D half-pel rows (two rows packed per vector) */
3159 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3161 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3163 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3164 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3166 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3168 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc21: average with rounded/saturated horizontal half-pel rows */
3170 SRARI_H2_SH(hz_out2, hz_out4, 5);
3171 SAT_SH2_SH(hz_out2, hz_out4, 7);
3173 dst0 = __msa_aver_s_h(dst0, hz_out2);
3174 dst1 = __msa_aver_s_h(dst1, hz_out4);
3176 res = PCKEV_XORI128_UB(dst0, dst1);
3177 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 4x4 luma quarter-pel MC, position (2,3): same pipeline as
 * ff_put_h264_qpel4_mc21_msa but averages with horizontal half-pel rows
 * shifted one down (packed via PCKEV_D2_SH from rows 3..6). */
3180 void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
/* halfword-pair packed 6-tap coefficients {1, -5, 20, 20, -5, 1} */
3183 const int32_t filt_const0 = 0xfffb0001;
3184 const int32_t filt_const1 = 0x140014;
3185 const int32_t filt_const2 = 0x1fffb;
3187 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3188 v16i8 mask0, mask1, mask2;
3189 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3190 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
3191 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3192 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* 4-wide variant uses the two-source shuffle masks at offset 48 */
3195 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3197 filt0 = (v8i16) __msa_fill_w(filt_const0);
3198 filt1 = (v8i16) __msa_fill_w(filt_const1);
3199 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
3201 src -= ((2 * stride) + 2);
3203 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3204 src += (5 * stride);
3205 LD_SB4(src, stride, src5, src6, src7, src8);
3207 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3208 XORI_B4_128_SB(src5, src6, src7, src8);
/* horizontal half-pel; each call filters two input rows at once */
3210 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
3211 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
3212 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
3213 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
3214 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* split the packed row pairs into the odd rows */
3215 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
3216 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
3218 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
3219 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
3220 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
3221 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* vertical 6-tap -> 2D half-pel rows (two rows packed per vector) */
3223 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
3225 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
3227 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3228 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
3230 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
3232 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* mc23: repack the one-row-lower horizontal half-pel rows, then round,
 * saturate and average */
3234 PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
3235 SRARI_H2_SH(hz_out0, hz_out1, 5);
3236 SAT_SH2_SH(hz_out0, hz_out1, 7);
3238 dst0 = __msa_aver_s_h(dst0, hz_out0);
3239 dst1 = __msa_aver_s_h(dst1, hz_out1);
3241 res = PCKEV_XORI128_UB(dst0, dst1);
3242 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 16x16 luma half-pel MC, position (0,2): pure vertical 6-tap filter.
 * Bytes are sign-biased with xor 128 so the filter runs in signed dot
 * products; each iteration produces 4 full-width output rows using a
 * sliding 9-row window. */
3245 void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
/* byte-pair packed 6-tap coefficients {1, -5, 20, 20, -5, 1} */
3249 int16_t filt_const0 = 0xfb01;
3250 int16_t filt_const1 = 0x1414;
3251 int16_t filt_const2 = 0x1fb;
3252 v16u8 res0, res1, res2, res3;
3253 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3254 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3255 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3256 v16i8 src65_l, src87_l, filt0, filt1, filt2;
3257 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3259 filt0 = (v16i8) __msa_fill_h(filt_const0);
3260 filt1 = (v16i8) __msa_fill_h(filt_const1);
3261 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* vertical filter needs 2 rows above the block */
3262 src -= (stride * 2);
3264 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3265 src += (5 * stride);
3267 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* interleave adjacent rows (right/left halves) for the byte dot products */
3268 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3270 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3273 for (loop_cnt = 4; loop_cnt--;) {
3274 LD_SB4(src, stride, src5, src6, src7, src8);
3275 src += (4 * stride);
3277 XORI_B4_128_SB(src5, src6, src7, src8);
3278 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3279 src65_r, src76_r, src87_r);
3280 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3281 src65_l, src76_l, src87_l);
/* 6-tap vertical filter for 4 output rows, right and left 8 columns */
3282 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3283 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3284 out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3285 out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3286 out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3287 out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3288 out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3289 out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* round >>5, clamp, repack to bytes and undo the 128 bias */
3290 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3291 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3292 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3293 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3294 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3295 out3_r, res0, res1, res2, res3);
3296 XORI_B4_128_UB(res0, res1, res2, res3);
3297 ST_UB4(res0, res1, res2, res3, dst, stride);
3298 dst += (4 * stride);
3312 void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
3315 const int16_t filt_const0 = 0xfb01;
3316 const int16_t filt_const1 = 0x1414;
3317 const int16_t filt_const2 = 0x1fb;
3318 v16u8 out0, out1, out2, out3;
3319 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3320 v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
3321 v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
3322 v16i8 filt0, filt1, filt2;
3323 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3325 filt0 = (v16i8) __msa_fill_h(filt_const0);
3326 filt1 = (v16i8) __msa_fill_h(filt_const1);
3327 filt2 = (v16i8) __msa_fill_h(filt_const2);
3329 src -= (stride * 2);
3331 LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
3332 src += (8 * stride);
3333 LD_SB5(src, stride, src8, src9, src10, src11, src12);
3334 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3336 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
3338 ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
3339 src910_r, src1110_r, src1211_r);
3340 XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
3341 XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
3342 XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
3343 out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
3344 out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
3345 out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
3346 out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
3347 out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
3348 out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
3349 out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
3350 out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
3351 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3352 SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3353 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3354 SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3355 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3356 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3357 out2 = PCKEV_XORI128_UB(out4_r, out5_r);
3358 out3 = PCKEV_XORI128_UB(out6_r, out7_r);
3359 ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 4x4 luma half-pel MC, position (0,2): pure vertical 6-tap filter.
 * Two 4-wide row pairs are doubleword-packed per vector so the whole
 * block is computed in two dot-product calls. */
3362 void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
/* byte-pair packed 6-tap coefficients {1, -5, 20, 20, -5, 1} */
3365 const int16_t filt_const0 = 0xfb01;
3366 const int16_t filt_const1 = 0x1414;
3367 const int16_t filt_const2 = 0x1fb;
3369 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3370 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3371 v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3374 filt0 = (v16i8) __msa_fill_h(filt_const0);
3375 filt1 = (v16i8) __msa_fill_h(filt_const1);
3376 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* vertical filter needs 2 rows above the block */
3378 src -= (stride * 2);
3380 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3381 src += (5 * stride);
3382 LD_SB4(src, stride, src5, src6, src7, src8);
/* interleave adjacent rows, then pack two row pairs per vector */
3384 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3386 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3388 ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
3389 src76_r, src2110, src4332, src6554, src8776);
3390 XORI_B4_128_SB(src2110, src4332, src6554, src8776);
3391 out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3392 out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
/* round >>5, clamp, pack to bytes and store the 4 rows */
3393 SRARI_H2_SH(out10, out32, 5);
3394 SAT_SH2_SH(out10, out32, 7);
3395 out = PCKEV_XORI128_UB(out10, out32);
3396 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 16x16 luma quarter-pel MC, position (1,2): vertical 6-tap filter first
 * (AVC_CALC_DPADD_B_6PIX), then a horizontal 6-tap on the vertical
 * results via halfword shuffles (mask0..mask5), rounded >>10; the result
 * is averaged with a rounded (>>5), saturated half-pel intermediate taken
 * from the vertical-filter output (shf_vec2/5/8/11).
 * Processes one full 16-wide row per iteration (two 8-wide halves).
 * NOTE(review): mask3..mask5 initialization is outside this extract. */
3399 void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
3404 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3406 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
3407 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
3408 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
3409 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* halfword shuffle patterns pairing filter-tap columns for the
 * horizontal 6-tap ((a+f), -5*(b+e), 20*(c+d)) */
3410 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
3411 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
3412 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
3413 v8i16 minus5h = __msa_ldi_h(-5);
3414 v8i16 plus20h = __msa_ldi_h(20);
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
3420 src -= ((2 * stride) + 2);
3422 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3423 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
3424 src += (5 * stride);
3425 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3426 XORI_B5_128_SB(src7, src8, src9, src10, src11);
3428 for (row = 16; row--;) {
3429 LD_SB2(src, 8, src5, src6);
3431 XORI_B2_128_SB(src5, src6);
/* vertical 6-tap for the left and right 8-pixel halves of this row */
3433 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
3435 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
3437 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3438 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
3439 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3440 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
3441 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
3442 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
3443 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
3444 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* horizontal 6-tap: (a+f) + -5*(b+e) + 20*(c+d) in 32-bit accumulators */
3445 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3446 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3447 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
3448 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
3449 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
3450 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
3451 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
3452 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* 2D result rounds by >>10; intermediate by >>5; both clamped */
3453 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
3454 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
3455 dst0 = __msa_srari_h(shf_vec2, 5);
3456 dst1 = __msa_srari_h(shf_vec5, 5);
3457 dst2 = __msa_srari_h(shf_vec8, 5);
3458 dst3 = __msa_srari_h(shf_vec11, 5);
3459 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc12: even lanes of the intermediate are averaged with the 2D result */
3460 PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
3461 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
3462 dst0 = __msa_aver_s_h(dst2, dst0);
3463 dst1 = __msa_aver_s_h(dst3, dst1);
3464 out = PCKEV_XORI128_UB(dst0, dst1);
/* 16x16 luma quarter-pel MC, position (3,2): same vertical-then-horizontal
 * pipeline as ff_put_h264_qpel16_mc12_msa; the only difference is the
 * intermediate selection — odd lanes (pckod) instead of even (pckev),
 * i.e. the half-pel column one to the right.
 * NOTE(review): mask3..mask5 initialization is outside this extract. */
3481 void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
3486 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3488 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, mask3;
3489 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
3490 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11, mask4, mask5;
3491 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* halfword shuffle patterns pairing filter-tap columns */
3492 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
3493 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
3494 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
3495 v8i16 minus5h = __msa_ldi_h(-5);
3496 v8i16 plus20h = __msa_ldi_h(20);
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
3502 src -= ((2 * stride) + 2);
3504 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3505 LD_SB5(src + 8, stride, src7, src8, src9, src10, src11);
3506 src += (5 * stride);
3507 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3508 XORI_B5_128_SB(src7, src8, src9, src10, src11);
3510 for (row = 16; row--;) {
3511 LD_SB2(src, 8, src5, src6);
3513 XORI_B2_128_SB(src5, src6);
/* vertical 6-tap for the left and right 8-pixel halves of this row */
3515 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
3517 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src7, src8, src9, src10, src11, src6,
3519 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3520 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
3521 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3522 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
3523 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
3524 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
3525 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
3526 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* horizontal 6-tap: (a+f) + -5*(b+e) + 20*(c+d) in 32-bit accumulators */
3527 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3528 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3529 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
3530 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
3531 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
3532 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
3533 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
3534 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* 2D result rounds by >>10; intermediate by >>5; both clamped */
3535 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
3536 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
3537 dst0 = __msa_srari_h(shf_vec2, 5);
3538 dst1 = __msa_srari_h(shf_vec5, 5);
3539 dst2 = __msa_srari_h(shf_vec8, 5);
3540 dst3 = __msa_srari_h(shf_vec11, 5);
3541 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc32: odd lanes of the intermediate (vs. even for mc12) */
3542 dst0 = __msa_pckod_h(dst2, dst0);
3543 dst1 = __msa_pckod_h(dst3, dst1);
3544 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
3545 dst0 = __msa_aver_s_h(dst2, dst0);
3546 dst1 = __msa_aver_s_h(dst3, dst1);
3547 out = PCKEV_XORI128_UB(dst0, dst1);
/* 8x8 luma quarter-pel MC, position (1,2): vertical 6-tap filter first,
 * then horizontal 6-tap on the vertical results via halfword shuffles,
 * rounded >>10 and averaged with the rounded (>>5) half-pel intermediate
 * (even lanes, pckev).  Two output rows per loop iteration.
 * NOTE(review): mask3..mask5 initialization is outside this extract. */
3564 void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
3569 v16i8 src0, src1, src2, src3, src4, src5, src6;
3570 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
3571 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
3572 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
3573 v8i16 mask3, mask4, mask5;
3574 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* halfword shuffle patterns pairing filter-tap columns */
3575 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
3576 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
3577 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
3578 v8i16 minus5h = __msa_ldi_h(-5);
3579 v8i16 plus20h = __msa_ldi_h(20);
/* 6-tap filter needs 2 rows above and 2 columns left of the block */
3585 src -= ((2 * stride) + 2);
3587 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3588 src += (5 * stride);
3589 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3591 for (row = 4; row--;) {
3592 LD_SB2(src, stride, src5, src6);
3593 src += (2 * stride);
3594 XORI_B2_128_SB(src5, src6);
/* vertical 6-tap for two consecutive output rows */
3596 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
3598 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
3600 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3601 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
3602 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3603 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
3604 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
3605 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
3606 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
3607 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* horizontal 6-tap: (a+f) + -5*(b+e) + 20*(c+d) in 32-bit accumulators */
3608 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3609 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3610 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
3611 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
3612 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
3613 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
3614 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
3615 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* 2D result rounds by >>10; intermediate by >>5; both clamped */
3616 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
3617 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
3618 dst0 = __msa_srari_h(shf_vec2, 5);
3619 dst1 = __msa_srari_h(shf_vec5, 5);
3620 dst2 = __msa_srari_h(shf_vec8, 5);
3621 dst3 = __msa_srari_h(shf_vec11, 5);
3622 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc12: even lanes of the intermediate averaged with the 2D result */
3623 PCKEV_H2_SH(dst2, dst0, dst3, dst1, dst0, dst1);
3624 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
3625 dst0 = __msa_aver_s_h(dst2, dst0);
3626 dst1 = __msa_aver_s_h(dst3, dst1);
3627 out = PCKEV_XORI128_UB(dst0, dst1);
3628 ST8x2_UB(out, dst, stride);
3629 dst += (2 * stride);
/* 8-wide luma interpolation at quarter-pel position (x=3, y=2): identical
 * pipeline to mc12 above (vertical 6-tap, then horizontal 6-tap on the
 * 16-bit intermediates, average with the vertical half-pel column) except
 * that the odd-positioned half-pel columns are selected (pckod vs pckev).
 * NOTE(review): some original lines (signature tail, `row`/`out`
 * declarations, mask3..mask5 setup, macro output arguments, loop-bottom
 * source rotation) are elided from this extract. */
3639 void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
3644 v16i8 src0, src1, src2, src3, src4, src5, src6;
3645 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3;
3646 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
3647 v8i16 shf_vec7, shf_vec8, shf_vec9, shf_vec10, shf_vec11;
3648 v8i16 mask3, mask4, mask5;
3649 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* symmetric-tap shuffle masks for the 6-tap (1,-5,20,20,-5,1) kernel */
3650 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
3651 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
3652 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
3653 v8i16 minus5h = __msa_ldi_h(-5);
3654 v8i16 plus20h = __msa_ldi_h(20);
/* position the 6-tap window: 2 rows up, 2 columns left */
3660 src -= ((2 * stride) + 2);
3662 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3663 src += (5 * stride);
/* bias by 128 (xor 0x80) for signed-byte arithmetic */
3664 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* 4 iterations x 2 rows = 8 output rows */
3666 for (row = 4; row--;) {
3667 LD_SB2(src, stride, src5, src6);
3668 src += (2 * stride);
3669 XORI_B2_128_SB(src5, src6);
/* vertical 6-tap over the sliding six-row window */
3671 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
3673 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
/* gather horizontal tap pairs from the 16-bit vertical results */
3675 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3676 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
3677 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3678 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
3679 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask3,
3680 mask4, mask5, shf_vec6, shf_vec7, shf_vec8);
3681 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask3,
3682 mask4, mask5, shf_vec9, shf_vec10, shf_vec11);
/* horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d) */
3683 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3684 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3685 hz_res2 = __msa_hadd_s_w(shf_vec6, shf_vec6);
3686 hz_res3 = __msa_hadd_s_w(shf_vec9, shf_vec9);
3687 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
3688 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
3689 DPADD_SH2_SW(shf_vec7, shf_vec8, minus5h, plus20h, hz_res2, hz_res2);
3690 DPADD_SH2_SW(shf_vec10, shf_vec11, minus5h, plus20h, hz_res3, hz_res3);
/* 2-D half-pel result: round by 2^10 and clip */
3691 SRARI_W4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 10);
3692 SAT_SW4_SW(hz_res0, hz_res1, hz_res2, hz_res3, 7);
/* vertical-only half-pel columns, rounded by 2^5 */
3693 dst0 = __msa_srari_h(shf_vec2, 5);
3694 dst1 = __msa_srari_h(shf_vec5, 5);
3695 dst2 = __msa_srari_h(shf_vec8, 5);
3696 dst3 = __msa_srari_h(shf_vec11, 5);
3697 SAT_SH4_SH(dst0, dst1, dst2, dst3, 7);
/* mc32: pick the odd-positioned half-pel columns (pckod) and average
 * them with the 2-D result — the only difference from mc12 */
3698 dst0 = __msa_pckod_h(dst2, dst0);
3699 dst1 = __msa_pckod_h(dst3, dst1);
3700 PCKEV_H2_SH(hz_res2, hz_res0, hz_res3, hz_res1, dst2, dst3);
3701 dst0 = __msa_aver_s_h(dst2, dst0);
3702 dst1 = __msa_aver_s_h(dst3, dst1);
/* pack to bytes, undo the 128 bias, store two 8-pixel rows */
3703 out = PCKEV_XORI128_UB(dst0, dst1);
3704 ST8x2_UB(out, dst, stride);
3705 dst += (2 * stride);
/* 4x4 luma interpolation at quarter-pel position (x=1, y=2).  Fully
 * unrolled: vertical 6-tap via byte dot-products, then horizontal 6-tap
 * on the 16-bit intermediates; the 32-bit 2-D result (rounded by 2^10)
 * is averaged with the widened vertical half-pel column (rounded by 2^5).
 * NOTE(review): signature tail, some declarations (`out`, `tmp`?) and the
 * continuation lines of the ILV macros are elided from this extract. */
3715 void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
/* packed byte-pair coefficients of the 6-tap (1,-5,20,20,-5,1) filter,
 * replicated per halfword for the dpadd byte dot-products */
3718 const int16_t filt_const0 = 0xfb01;
3719 const int16_t filt_const1 = 0x1414;
3720 const int16_t filt_const2 = 0x1fb;
3722 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3723 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
3724 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
3725 v16i8 src76_l, src87_l, filt0, filt1, filt2;
3726 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
3727 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
3728 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* symmetric-tap shuffle masks: (a,f), (b,e), (c,d) pairings */
3729 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
3730 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
3731 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
3732 v8i16 minus5h = __msa_ldi_h(-5);
3733 v8i16 plus20h = __msa_ldi_h(20);
3734 v8i16 zeros = { 0 };
3736 filt0 = (v16i8) __msa_fill_h(filt_const0);
3737 filt1 = (v16i8) __msa_fill_h(filt_const1);
3738 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* position the 6-tap window: 2 rows up, 2 columns left */
3740 src -= ((2 * stride) + 2);
/* all nine source rows needed for 4 output rows of a 6-tap filter */
3742 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3743 src += (5 * stride);
3744 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3745 LD_SB4(src, stride, src5, src6, src7, src8);
3746 XORI_B4_128_SB(src5, src6, src7, src8);
/* interleave adjacent rows (right/left halves) for the byte dot-products */
3748 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3750 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3752 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3754 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* vertical 6-tap for output rows 0 and 1 */
3756 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3757 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3758 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3759 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3760 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3761 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
3762 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3763 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d) */
3764 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3765 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
3766 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3767 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* vertical 6-tap for output rows 2 and 3 (window shifted two rows down) */
3769 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3770 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3771 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3772 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3773 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3774 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
3775 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3776 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
3777 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3778 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
3779 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3780 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* 2-D half-pel result: round by 2^10 and clip */
3782 SRARI_W2_SW(hz_res0, hz_res1, 10);
3783 SAT_SW2_SW(hz_res0, hz_res1, 7);
3784 SRARI_W2_SW(hz_res2, hz_res3, 10);
3785 SAT_SW2_SW(hz_res2, hz_res3, 7);
/* vertical-only half-pel columns, rounded by 2^5 */
3787 dst0 = __msa_srari_h(shf_vec2, 5);
3788 dst1 = __msa_srari_h(shf_vec5, 5);
3789 dst2 = __msa_srari_h(shf_vec6, 5);
3790 dst3 = __msa_srari_h(shf_vec7, 5);
3792 SAT_SH2_SH(dst0, dst1, 7);
3793 SAT_SH2_SH(dst2, dst3, 7);
/* mc12: widen the even-positioned (left) half-pel columns to 32-bit by
 * interleaving with zero, then average with the 2-D result in 32-bit */
3794 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
3795 ILVEV_H2_SH(dst2, zeros, dst3, zeros, dst2, dst3);
3797 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
3798 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
3799 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
3800 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* narrow 32->16->8 bits, undo the 128 bias, store 4x4 pixels */
3802 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
3803 out = PCKEV_XORI128_UB(dst0, dst2);
3804 ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 4x4 luma interpolation at quarter-pel position (x=3, y=2).  Same
 * pipeline as mc12 above; the only difference is that the odd-positioned
 * (right) half-pel columns are widened for the average (ilvod vs ilvev).
 * NOTE(review): signature tail, some declarations and ILV macro
 * continuation lines are elided from this extract. */
3807 void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
/* packed byte-pair coefficients of the 6-tap (1,-5,20,20,-5,1) filter */
3810 const int16_t filt_const0 = 0xfb01;
3811 const int16_t filt_const1 = 0x1414;
3812 const int16_t filt_const2 = 0x1fb;
3814 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3815 v16i8 src10_r, src21_r, src32_r, src43_r, src54_r, src65_r, src76_r;
3816 v16i8 src87_r, src10_l, src21_l, src32_l, src43_l, src54_l, src65_l;
3817 v16i8 src76_l, src87_l, filt0, filt1, filt2;
3818 v8i16 vt_res0, vt_res1, vt_res2, vt_res3, dst0, dst1, dst2, dst3, shf_vec7;
3819 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5, shf_vec6;
3820 v4i32 hz_res0, hz_res1, hz_res2, hz_res3;
/* symmetric-tap shuffle masks for the 6-tap kernel */
3821 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
3822 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
3823 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
3824 v8i16 minus5h = __msa_ldi_h(-5);
3825 v8i16 plus20h = __msa_ldi_h(20);
3826 v8i16 zeros = { 0 };
3828 filt0 = (v16i8) __msa_fill_h(filt_const0);
3829 filt1 = (v16i8) __msa_fill_h(filt_const1);
3830 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* position the 6-tap window: 2 rows up, 2 columns left */
3832 src -= ((2 * stride) + 2);
/* nine source rows feed 4 output rows of the 6-tap vertical filter */
3834 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3835 src += (5 * stride);
3836 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3837 LD_SB4(src, stride, src5, src6, src7, src8);
3838 XORI_B4_128_SB(src5, src6, src7, src8);
/* interleave adjacent rows (right/left halves) for byte dot-products */
3840 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3842 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3844 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3846 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l, src65_l,
/* vertical 6-tap for output rows 0 and 1 */
3849 vt_res0 = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3850 vt_res1 = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3851 vt_res2 = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3852 vt_res3 = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3853 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3854 mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
3855 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3856 mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* horizontal 6-tap in 32-bit: (a+f) - 5*(b+e) + 20*(c+d) */
3857 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3858 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
3859 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3860 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* vertical 6-tap for output rows 2 and 3 */
3862 vt_res0 = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3863 vt_res1 = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3864 vt_res2 = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3865 vt_res3 = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3866 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1, mask0,
3867 mask1, mask2, shf_vec0, shf_vec1, shf_vec6);
3868 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3, mask0,
3869 mask1, mask2, shf_vec3, shf_vec4, shf_vec7);
3870 hz_res2 = __msa_hadd_s_w(shf_vec0, shf_vec0);
3871 DPADD_SH2_SW(shf_vec1, shf_vec6, minus5h, plus20h, hz_res2, hz_res2);
3872 hz_res3 = __msa_hadd_s_w(shf_vec3, shf_vec3);
3873 DPADD_SH2_SW(shf_vec4, shf_vec7, minus5h, plus20h, hz_res3, hz_res3);
/* 2-D half-pel result: round by 2^10 and clip */
3875 SRARI_W2_SW(hz_res0, hz_res1, 10);
3876 SAT_SW2_SW(hz_res0, hz_res1, 7);
3877 SRARI_W2_SW(hz_res2, hz_res3, 10);
3878 SAT_SW2_SW(hz_res2, hz_res3, 7);
/* vertical-only half-pel columns, rounded by 2^5 */
3880 dst0 = __msa_srari_h(shf_vec2, 5);
3881 dst1 = __msa_srari_h(shf_vec5, 5);
3882 dst2 = __msa_srari_h(shf_vec6, 5);
3883 dst3 = __msa_srari_h(shf_vec7, 5);
3885 SAT_SH2_SH(dst0, dst1, 7);
3886 SAT_SH2_SH(dst2, dst3, 7);
/* mc32: widen the odd-positioned (right) half-pel columns to 32-bit
 * (ilvod) — the only difference from mc12 — then average in 32-bit */
3888 dst0 = __msa_ilvod_h(zeros, dst0);
3889 dst1 = __msa_ilvod_h(zeros, dst1);
3890 dst2 = __msa_ilvod_h(zeros, dst2);
3891 dst3 = __msa_ilvod_h(zeros, dst3);
3893 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
3894 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
3895 hz_res2 = __msa_aver_s_w(hz_res2, (v4i32) dst2);
3896 hz_res3 = __msa_aver_s_w(hz_res3, (v4i32) dst3);
/* narrow 32->16->8 bits, undo the 128 bias, store 4x4 pixels */
3898 PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
3899 out = PCKEV_XORI128_UB(dst0, dst2);
3900 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 16x16 luma interpolation at the centre half-pel position (x=2, y=2):
 * horizontal 6-tap first (AVC_HORZ_FILTER_SH -> 16-bit hz_out rows), then
 * vertical 6-tap on those intermediates via 32-bit dot-products.
 * Processed as two 8-wide column passes (multiple8_cnt) of 4x4 rows each
 * (loop_cnt) = 16 rows.  NOTE(review): this extract elides the signature
 * tail, `tmp0`/`tmp1`/`out0`/`out1` declarations, the per-pass src/dst
 * reset from src_tmp/dst_tmp, the hz_out window rotation at the loop
 * bottom and the src_tmp/dst_tmp advance — confirm against the full file. */
3903 void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
/* packed halfword-pair coefficients of the 6-tap (1,-5,20,20,-5,1)
 * filter for the 32-bit dot-products: (1,-5), (20,20), (-5,1) */
3906 const int32_t filt_const0 = 0xfffb0001;
3907 const int32_t filt_const1 = 0x140014;
3908 const int32_t filt_const2 = 0x1fffb;
/* start of the 6-tap window: 2 rows up, 2 columns left */
3909 const uint8_t *src_tmp = src - (2 * stride) - 2;
3910 uint8_t *dst_tmp = dst;
3911 uint32_t multiple8_cnt, loop_cnt;
3913 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
3914 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
3915 v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3;
3916 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
3917 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
3918 v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
3919 v8i16 hz_out87_l, filt0, filt1, filt2;
3922 filt0 = (v8i16) __msa_fill_w(filt_const0);
3923 filt1 = (v8i16) __msa_fill_w(filt_const1);
3924 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* 8-wide horizontal shuffle masks (first three rows of luma_mask_arr) */
3926 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* two passes: left then right 8-pixel half of the 16-wide block */
3928 for (multiple8_cnt = 2; multiple8_cnt--;) {
/* prime the vertical filter with the first five horizontal results */
3932 LD_SB5(src, stride, src0, src1, src2, src3, src4);
3933 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3934 src += (5 * stride);
3936 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3937 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3938 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3939 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
3940 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* 4 iterations x 4 rows = 16 output rows per column pass */
3942 for (loop_cnt = 4; loop_cnt--;) {
3943 LD_SB4(src, stride, src0, src1, src2, src3);
3944 XORI_B4_128_SB(src0, src1, src2, src3);
3945 src += (4 * stride);
3947 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
3948 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
3949 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
3950 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
/* interleave consecutive hz_out rows (right/left halves) so the
 * vertical taps line up for the halfword dot-products */
3952 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
3953 hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
3955 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
3956 hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
3958 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
3959 hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
3961 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
3962 hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
/* vertical 6-tap (32-bit accumulate) for each of the 4 output rows */
3965 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
3967 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
3969 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3970 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
3972 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
3974 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3975 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
3977 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
3979 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
3980 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
3982 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
3984 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* pack to bytes, undo the 128 bias, store 4 rows of 8 pixels */
3986 out0 = PCKEV_XORI128_UB(dst0, dst1);
3987 out1 = PCKEV_XORI128_UB(dst2, dst3);
3988 ST8x4_UB(out0, out1, dst, stride);
3989 dst += (4 * stride);
/* 8x8 luma interpolation at the centre half-pel position (x=2, y=2):
 * horizontal 6-tap to 16-bit hz_out rows, then vertical 6-tap on those
 * via 32-bit dot-products.  Fully unrolled: first four output rows, then
 * four more after loading four additional source rows.
 * NOTE(review): the signature tail, `tmp0`/`tmp1`/`out0`/`out1`
 * declarations and the continuation lines of several macro calls are
 * elided from this extract. */
4003 void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
/* packed halfword-pair 6-tap coefficients: (1,-5), (20,20), (-5,1) */
4006 const int32_t filt_const0 = 0xfffb0001;
4007 const int32_t filt_const1 = 0x140014;
4008 const int32_t filt_const2 = 0x1fffb;
4010 v16i8 src0, src1, src2, src3, src4, mask0, mask1, mask2;
4011 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4012 v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
4013 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4014 v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
4015 v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
4016 v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
4017 v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
4018 v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
4021 filt0 = (v8i16) __msa_fill_w(filt_const0);
4022 filt1 = (v8i16) __msa_fill_w(filt_const1);
4023 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* 8-wide horizontal shuffle masks (first three rows of luma_mask_arr) */
4025 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* position the 6-tap window: 2 rows up, 2 columns left */
4027 src -= ((2 * stride) + 2);
/* prime the vertical filter with five horizontally-filtered rows */
4028 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4029 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4030 src += (5 * stride);
4032 hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4033 hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4034 hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4035 hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4036 hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);
/* rows 5..8 -> first four output rows */
4038 LD_SB4(src, stride, src0, src1, src2, src3);
4039 XORI_B4_128_SB(src0, src1, src2, src3);
4040 src += (4 * stride);
4041 hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4042 hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4043 hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4044 hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
/* interleave consecutive hz_out rows (right/left halves) so the
 * vertical taps line up for the halfword dot-products */
4045 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4046 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4047 ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4048 hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
4049 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4050 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
4051 ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4052 hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);
/* vertical 6-tap (32-bit accumulate) for output rows 0..3 */
4054 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4056 tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
4058 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4059 tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4061 tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
4063 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4064 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4066 tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
4068 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4069 tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4071 tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
4073 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* pack to bytes, undo the 128 bias, store output rows 0..3 */
4074 out0 = PCKEV_XORI128_UB(dst0, dst1);
4075 out1 = PCKEV_XORI128_UB(dst2, dst3);
4076 ST8x4_UB(out0, out1, dst, stride);
4077 dst += (4 * stride);
/* rows 9..12 -> output rows 4..7 */
4079 LD_SB4(src, stride, src0, src1, src2, src3);
4080 XORI_B4_128_SB(src0, src1, src2, src3);
4081 hz_out9 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
4082 hz_out10 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
4083 hz_out11 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
4084 hz_out12 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
4085 ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4086 hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
4088 ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
4089 hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
/* vertical 6-tap for output rows 4..7 */
4091 tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
4093 tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
4095 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4096 tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
4098 tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
4100 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4101 tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
4103 tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
4105 dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4106 tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
4108 tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
4110 dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* pack to bytes, undo the 128 bias, store output rows 4..7 */
4111 out0 = PCKEV_XORI128_UB(dst0, dst1);
4112 out1 = PCKEV_XORI128_UB(dst2, dst3);
4113 ST8x4_UB(out0, out1, dst, stride);
/* 4x4 luma interpolation at the centre half-pel position (x=2, y=2).
 * Uses the 4-wide shuffle masks at luma_mask_arr[48] so one horizontal
 * pass filters two source rows at once (second row extracted with
 * PCKOD_D2_SH), then applies the vertical 6-tap via 32-bit dot-products.
 * NOTE(review): the signature tail and `tmp0`/`tmp1`/`res` declarations
 * are elided from this extract. */
4116 void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
/* packed halfword-pair 6-tap coefficients: (1,-5), (20,20), (-5,1) */
4119 const int32_t filt_const0 = 0xfffb0001;
4120 const int32_t filt_const1 = 0x140014;
4121 const int32_t filt_const2 = 0x1fffb;
4123 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4124 v16i8 mask0, mask1, mask2;
4125 v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
4126 v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
4127 v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
4128 v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
/* two-row 4-wide horizontal masks (rows 4..6 of luma_mask_arr) */
4131 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
4133 filt0 = (v8i16) __msa_fill_w(filt_const0);
4134 filt1 = (v8i16) __msa_fill_w(filt_const1);
4135 filt2 = (v8i16) __msa_fill_w(filt_const2);
/* position the 6-tap window: 2 rows up, 2 columns left */
4137 src -= ((2 * stride) + 2);
/* nine source rows feed 4 output rows of the 6-tap vertical filter */
4139 LD_SB5(src, stride, src0, src1, src2, src3, src4);
4140 src += (5 * stride);
4141 LD_SB4(src, stride, src5, src6, src7, src8);
/* bias by 128 (xor 0x80) for signed-byte arithmetic */
4143 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4144 XORI_B4_128_SB(src5, src6, src7, src8);
/* horizontal 6-tap, two rows per call; odd rows recovered below */
4145 hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
4146 hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
4147 hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
4148 hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
4149 hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
/* split the doubled results: odd double-words hold rows 1,3,5,7 */
4150 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
4151 PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);
/* interleave consecutive hz_out rows for the vertical dot-products */
4152 ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
4153 hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
4154 ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
4155 hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
/* vertical 6-tap (32-bit accumulate) for the 4 output rows */
4157 tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
4159 tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
4161 dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
4162 tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
4164 tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
4166 dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
/* pack to bytes, undo the 128 bias, store 4x4 pixels */
4167 res = PCKEV_XORI128_UB(dst0, dst1);
4168 ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* Entry points for the "avg" (average-with-existing-dst) H.264 luma
 * quarter-pel functions.  The mcXY suffix encodes the (x,y) quarter-sample
 * offset; each wrapper pre-adjusts src so the shared helper sees the start
 * of the 6-tap window: -2 columns for horizontal filtering, -(2*stride)
 * rows for vertical filtering.  For diagonal cases the helper takes two
 * source pointers (horizontal-phase and vertical-phase); +stride and
 * +sizeof(uint8_t) select the nearer half-pel neighbour.  The trailing
 * 0/1 flag picks which quarter-sample side to average toward.
 * NOTE(review): each wrapper's signature tail and braces are elided from
 * this extract; code left byte-identical. */
/* horizontal quarter-pel: mc10 (flag 0) / mc30 (flag 1) */
4171 void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
4174 avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0);
4177 void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
4180 avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1);
4183 void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
4186 avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0);
4189 void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
4192 avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1);
4195 void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
4198 avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0);
4201 void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
4204 avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1);
/* horizontal half-pel: mc20 */
4207 void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
4210 avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride);
4213 void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
4216 avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride);
4219 void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
4222 avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride);
/* vertical quarter-pel: mc01 (flag 0) / mc03 (flag 1) */
4225 void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
4228 avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
4229 stride, dst, stride, 0);
4232 void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
4235 avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
4236 stride, dst, stride, 1);
4239 void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
4242 avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
4243 stride, dst, stride, 0);
4246 void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
4249 avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
4250 stride, dst, stride, 1);
4253 void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
4256 avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
4257 stride, dst, stride, 0);
4260 void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
4263 avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
4264 stride, dst, stride, 1);
/* diagonal quarter-pel: mc11/mc31/mc13/mc33 — two source pointers,
 * offset by +stride (lower row) and/or +1 column (right half-pel) */
4267 void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
4270 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
4272 stride, dst, stride);
4275 void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
4278 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
4279 src - (stride * 2) +
4280 sizeof(uint8_t), stride,
4284 void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
4287 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4289 stride, dst, stride);
4292 void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
4295 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4296 src - (stride * 2) +
4297 sizeof(uint8_t), stride,
4301 void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
4304 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
4306 stride, dst, stride);
4309 void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
4312 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
4313 src - (stride * 2) +
4314 sizeof(uint8_t), stride, dst, stride);
4317 void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
4320 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4322 stride, dst, stride);
4325 void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
4328 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4329 src - (stride * 2) +
4330 sizeof(uint8_t), stride, dst, stride);
4334 void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
4337 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
4339 stride, dst, stride);
4342 void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
4345 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
4346 src - (stride * 2) +
4347 sizeof(uint8_t), stride, dst, stride);
4350 void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
4353 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4355 stride, dst, stride);
4358 void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
4361 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4362 src - (stride * 2) +
4363 sizeof(uint8_t), stride, dst, stride);
/* centre-column quarter-pel: mc21 (flag 0) / mc23 (flag 1) */
4366 void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
4369 avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4370 stride, dst, stride, 16, 0);
4373 void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
4376 avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4377 stride, dst, stride, 16, 1);
4380 void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
4383 avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4384 stride, dst, stride, 8, 0);
4387 void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
4390 avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4391 stride, dst, stride, 8, 1);
4394 void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
4397 avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4398 stride, dst, stride, 4, 0);
4401 void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
4404 avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4405 stride, dst, stride, 4, 1);
/* vertical half-pel: mc02 */
4408 void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
4411 avc_luma_vt_and_aver_dst_16x16_msa(src - (stride * 2), stride, dst, stride);
4414 void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
4417 avc_luma_vt_and_aver_dst_8x8_msa(src - (stride * 2), stride, dst, stride);
4420 void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
4423 avc_luma_vt_and_aver_dst_4x4_msa(src - (stride * 2), stride, dst, stride);
/* centre-row quarter-pel: mc12 (flag 0) / mc32 (flag 1) */
4426 void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
4429 avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4430 stride, dst, stride, 16, 0);
4433 void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
4436 avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4437 stride, dst, stride, 16, 1);
4440 void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
4443 avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4444 stride, dst, stride, 8, 0);
4447 void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
4450 avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4451 stride, dst, stride, 8, 1);
4454 void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
4457 avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4458 stride, dst, stride, 4, 0);
4461 void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
4464 avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4465 stride, dst, stride, 4, 1);
/* centre half-pel: mc22 */
4468 void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
4471 avc_luma_mid_and_aver_dst_16x16_msa(src - (2 * stride) - 2,
4472 stride, dst, stride);
4475 void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
4478 avc_luma_mid_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4479 stride, dst, stride, 8);
4482 void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
4485 avc_luma_mid_and_aver_dst_4x4_msa(src - (2 * stride) - 2,
4486 stride, dst, stride);