/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"
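/*
 * H.264 quarter-pel luma interpolation, MIPS MSA version.
 *
 * Half-sample positions use the 6-tap filter (1, -5, 20, 20, -5, 1).
 * For reference, the scalar equivalent of one horizontal half-pel sample
 * is roughly:
 *
 *     b = p[-2] - 5 * p[-1] + 20 * p[0] + 20 * p[1] - 5 * p[2] + p[3];
 *     out = av_clip_uint8((b + 16) >> 5);
 *
 * The centre position ('mid') filters 6-tap intermediates a second time
 * and rounds with (val + 512) >> 10; quarter-sample positions average two
 * neighbouring full/half-pel values.  Function name suffixes follow that
 * scheme: hz/vt/mid give the filter direction, qrt marks quarter-pel
 * offsets, and *_and_aver_dst_* variants back the ff_avg_* entry points.
 */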
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5)       \
( {                                                                         \
    v4i32 tmp0_m, tmp1_m;                                                   \
    v8i16 out0_m, out1_m, out2_m, out3_m;                                   \
    v8i16 minus5h_m = __msa_ldi_h(-5);                                      \
    v8i16 plus20h_m = __msa_ldi_h(20);                                      \
                                                                            \
    ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m);                                  \
                                                                            \
    tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m);                \
    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);                \
                                                                            \
    ILVRL_H2_SH(in1, in4, out0_m, out1_m);                                  \
    DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m);     \
    ILVRL_H2_SH(in2, in3, out2_m, out3_m);                                  \
    DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m);     \
                                                                            \
    SRARI_W2_SW(tmp0_m, tmp1_m, 10);                                        \
    SAT_SW2_SW(tmp0_m, tmp1_m, 7);                                          \
    out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m);                 \
                                                                            \
    out0_m;                                                                 \
} )
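/*
 * Shuffle masks for vshf.b, laid out so one hadd plus two dot-product
 * accumulates evaluate the 6-tap filter: mask0 pairs taps (0, 5), mask1
 * pairs (1, 4) and mask2 pairs (2, 3).  The first three rows serve the
 * 8-pixel-wide cases; the next three (indices >= 16 select from the
 * second source vector) serve the 4-pixel-wide cases; the last two rows
 * appear to gather the pixel column that the 16-wide horizontal
 * quarter-pel functions average with, across the 16-byte vector boundary.
 */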
static const uint8_t luma_mask_arr[16 * 8] = {
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,

    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,

    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
};
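/* AVC_CALC_DPADD_B_6PIX_2COEFF_SH applies the 6-tap filter to six byte
 * vectors (two results, from the interleaved right/left halves); the
 * _R_SH variants compute only the right half.  Inputs are pixels XORed
 * with 128, so the unsigned 0..255 range maps onto signed bytes and the
 * -5/+20 taps can use signed dot products without overflow. */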
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
                                        out1, out2)                          \
{                                                                            \
    v16i8 tmp0_m, tmp1_m;                                                    \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
                                                                             \
    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \
    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);                                 \
    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \
    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);          \
}
#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
( {                                                                            \
    v8i16 tmp1_m;                                                              \
    v16i8 tmp0_m, tmp2_m;                                                      \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                         \
    v16i8 plus20b_m = __msa_ldi_b(20);                                         \
                                                                               \
    tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0);                 \
    tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m);                   \
                                                                               \
    ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m);                        \
    DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m);        \
                                                                               \
    tmp1_m;                                                                    \
} )
#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
( {                                                                            \
    v4i32 tmp1_m;                                                              \
    v8i16 tmp2_m, tmp3_m;                                                      \
    v8i16 minus5h_m = __msa_ldi_h(-5);                                         \
    v8i16 plus20h_m = __msa_ldi_h(20);                                         \
                                                                               \
    tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0);                 \
    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);                   \
                                                                               \
    ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m);                        \
    DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m);        \
                                                                               \
    tmp1_m = __msa_srari_w(tmp1_m, 10);                                        \
    tmp1_m = __msa_sat_s_w(tmp1_m, 7);                                         \
                                                                               \
    tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m);                    \
                                                                               \
    tmp2_m;                                                                    \
} )
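/* Gather the six taps from a pair of source vectors with vshf.b using the
 * masks above, then evaluate (p0 + p5) - 5 * (p1 + p4) + 20 * (p2 + p3)
 * as one hadd plus two dot-product accumulates.  The result is the raw
 * 16-bit filter sum; rounding and clipping are left to the caller. */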
#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,             \
                                                    mask0, mask1, mask2)    \
( {                                                                         \
    v8i16 hz_out_m;                                                         \
    v16i8 vec0_m, vec1_m, vec2_m;                                           \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                      \
    v16i8 plus20b_m = __msa_ldi_b(20);                                      \
                                                                            \
    vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0);       \
    hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m);                              \
                                                                            \
    VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m);       \
    DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m); \
                                                                            \
    hz_out_m;                                                               \
} )
#define AVC_HORZ_FILTER_SH(in0, in1, mask0, mask1, mask2)  \
( {                                                        \
    v8i16 out0_m;                                          \
    v16i8 tmp0_m;                                          \
    v16i8 minus5b = __msa_ldi_b(-5);                       \
    v16i8 plus20b = __msa_ldi_b(20);                       \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask0, in1, in0);        \
    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);               \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask1, in1, in0);        \
    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);     \
                                                           \
    tmp0_m = __msa_vshf_b((v16i8) mask2, in1, in0);        \
    out0_m = __msa_dpadd_s_h(out0_m, plus20b, tmp0_m);     \
                                                           \
    out0_m;                                                \
} )
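/* Three-term multiply-accumulate helpers: a dot product of the first
 * operand pair plus two accumulating dot products.  The _SW3_SW form also
 * performs the second-stage (val + 512) >> 10 rounding and saturation
 * used by the centre half-pel position. */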
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
( {                                                                 \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
                                                                    \
    out0_m;                                                         \
} )
#define AVC_DOT_SW3_SW(in0, in1, in2, coeff0, coeff1, coeff2)       \
( {                                                                 \
    v4i32 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_w((v8i16) in0, (v8i16) coeff0);           \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in1, (v8i16) coeff1);  \
    out0_m = __msa_dpadd_s_w(out0_m, (v8i16) in2, (v8i16) coeff2);  \
    out0_m = __msa_srari_w(out0_m, 10);                             \
    out0_m = __msa_sat_s_w(out0_m, 7);                              \
                                                                    \
    out0_m;                                                         \
} )
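/*
 * 'mid' (centre half-pel, position j) functions: the horizontal 6-tap is
 * applied first and kept as 16-bit intermediates, then the vertical 6-tap
 * runs on those intermediates with (val + 512) >> 10 rounding.  In scalar
 * terms this is roughly:
 *
 *     j = (cc - 5 * dd + 20 * h1 + 20 * m1 - 5 * ee + ff + 512) >> 10
 *
 * where cc..ff are horizontally filtered rows (naming as in the H.264
 * spec, section 8.4.2.2.1).
 */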
static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 dst0, dst1, dst2, dst3;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
                                                          mask0, mask1, mask2);
    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);

    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);

        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
                                                              mask0, mask1,
                                                              mask2);
        hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
                                                              mask0, mask1,
                                                              mask2);

        PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);

        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
                                                 hz_out3, hz_out4, hz_out5);
        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
                                                 hz_out4, hz_out5, hz_out6);
        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
                                                 hz_out5, hz_out6, hz_out7);
        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
                                                 hz_out6, hz_out7, hz_out8);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
        XORI_B2_128_SB(src0, src1);

        ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);

        dst += (4 * dst_stride);

        hz_out0 = hz_out4;
        hz_out1 = hz_out5;
        hz_out2 = hz_out6;
        hz_out3 = hz_out7;
        hz_out4 = hz_out8;
    }
}
static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 dst0, dst1, dst2, dst3;
    v16u8 out0, out1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
                                               hz_out3, hz_out4, hz_out5);
        dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
                                               hz_out4, hz_out5, hz_out6);
        dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
                                               hz_out5, hz_out6, hz_out7);
        dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
                                               hz_out6, hz_out7, hz_out8);
        out0 = PCKEV_XORI128_UB(dst0, dst1);
        out1 = PCKEV_XORI128_UB(dst2, dst3);
        ST8x4_UB(out0, out1, dst, dst_stride);

        dst += (4 * dst_stride);

        hz_out0 = hz_out4;
        hz_out1 = hz_out5;
        hz_out2 = hz_out6;
        hz_out3 = hz_out7;
        hz_out4 = hz_out8;
    }
}
static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int32_t height)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_mid_8w_msa(src, src_stride, dst, dst_stride, height);

        src += 8;
        dst += 8;
    }
}
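/* 'midh_qrt': quarter-pel positions horizontally adjacent to j.  The
 * vertical 6-tap runs first, a horizontal 6-tap over halfword shuffles
 * produces the j column, and the result is averaged with the neighbouring
 * vertical half-pel column selected by horiz_offset. */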
static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int32_t height, uint8_t horiz_offset)
{
    uint32_t row;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
    v4i32 hz_res0, hz_res1;
    v8i16 dst0, dst1;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };
    v16u8 out;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = (height >> 1); row--;) {
        LD_SB2(src, src_stride, src5, src6);
        src += (2 * src_stride);

        XORI_B2_128_SB(src5, src6);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

        SRARI_W2_SW(hz_res0, hz_res1, 10);
        SAT_SW2_SW(hz_res0, hz_res1, 7);

        dst0 = __msa_srari_h(shf_vec2, 5);
        dst1 = __msa_srari_h(shf_vec5, 5);

        SAT_SH2_SH(dst0, dst1, 7);

        if (horiz_offset) {
            dst0 = __msa_ilvod_h(zeros, dst0);
            dst1 = __msa_ilvod_h(zeros, dst1);
        } else {
            ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
        }

        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
        dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);

        out = PCKEV_XORI128_UB(dst0, dst0);
        ST4x2_UB(out, dst, dst_stride);

        dst += (2 * dst_stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}
static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     int32_t height, uint8_t horiz_offset)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
                                 horiz_offset);

        src += 4;
        dst += 4;
    }
}
static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      int32_t height, uint8_t horiz_offset)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
        avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
                                 horiz_offset);

        src += 4;
        dst += 4;
    }
}
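/* 'hv_qrt': diagonal quarter-pel positions.  The horizontal half-pel and
 * vertical half-pel values are each rounded with (val + 16) >> 5, then
 * combined with a rounding average, matching the spec's (b + h + 1) >> 1. */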
static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
                                   int32_t src_stride, uint8_t *dst,
                                   int32_t dst_stride, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
    v8i16 out0, out1;
    v16u8 out;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
        src_x += (4 * src_stride);

        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);

        hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
                                                              src_hz1, mask0,
                                                              mask1, mask2);
        hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
                                                              src_hz3, mask0,
                                                              mask1, mask2);

        SRARI_H2_SH(hz_out0, hz_out1, 5);
        SAT_SH2_SH(hz_out0, hz_out1, 7);

        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += (4 * src_stride);

        src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
        src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
        src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
        src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);

        /* filter calc */
        vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
                                                      src_vt2, src_vt3,
                                                      src_vt4, src_vt5);
        vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
                                                      src_vt4, src_vt5,
                                                      src_vt6, src_vt7);

        SRARI_H2_SH(vert_out0, vert_out1, 5);
        SAT_SH2_SH(vert_out0, vert_out1, 7);

        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);

        SAT_SH2_SH(out0, out1, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        src_vt0 = src_vt4;
        src_vt1 = src_vt5;
        src_vt2 = src_vt6;
        src_vt3 = src_vt7;
        src_vt4 = src_vt8;
    }
}
static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y,
                                   int32_t src_stride, uint8_t *dst,
                                   int32_t dst_stride, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
    v8i16 out0, out1, out2, out3;
    v16u8 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
        src_x += (4 * src_stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);

        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);

        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += (4 * src_stride);

        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);

        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);

        /* filter calc */
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
                                        src_vt4, src_vt5, vert_out0, vert_out1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
                                        src_vt6, src_vt7, vert_out2, vert_out3);

        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);

        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);

        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);

        dst += (4 * dst_stride);

        src_vt0 = src_vt4;
        src_vt1 = src_vt5;
        src_vt2 = src_vt6;
        src_vt3 = src_vt7;
        src_vt4 = src_vt8;
    }
}
static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y,
                                    int32_t src_stride, uint8_t *dst,
                                    int32_t dst_stride, int32_t height)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_hv_qrt_8w_msa(src_x, src_y, src_stride, dst, dst_stride,
                               height);

        src_x += 8;
        src_y += 8;
        dst += 8;
    }
}
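/* The *_and_aver_dst_* functions below back the ff_avg_h264_qpel* entry
 * points: they compute the same interpolation as above and then take a
 * rounding average with the existing destination block. */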
static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3, res;
    v8i16 res0, res1;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src, src_stride, src0, src1, src2, src3);

    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);
    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
    res = __msa_aver_u_b(res, dst0);

    ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
}
static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v8i16 res0, res1, res2, res3;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                     res0, res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
                                dst, dst_stride);

        dst += (4 * dst_stride);
    }
}
static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask0, mask1, mask2;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        src += src_stride;
        LD_SB2(src, 8, src2, src3);
        src += src_stride;

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        LD_SB2(src, 8, src4, src5);
        src += src_stride;
        LD_SB2(src, 8, src6, src7);
        src += src_stride;

        XORI_B4_128_SB(src4, src5, src6, src7);
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
                    vec0, vec1, vec2, vec3);
        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
        AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
                    dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
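/* Horizontal quarter-pel with averaging: the half-pel result is first
 * averaged with the integer pixel column selected by hor_offset (the
 * source is slid by 2 or 3 bytes because the filter window starts two
 * pixels left of the output position), then averaged with dst. */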
static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 uint8_t *dst,
                                                 int32_t dst_stride,
                                                 uint8_t hor_offset)
{
    uint8_t slide;
    v16i8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16u8 res0, res1;
    v8i16 out0, out1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    if (hor_offset) {
        slide = 3;
    } else {
        slide = 2;
    }

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
    SRARI_H2_SH(out0, out1, 5);
    SAT_SH2_SH(out0, out1, 7);

    PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);

    src0 = __msa_sld_b(src0, src0, slide);
    src1 = __msa_sld_b(src1, src1, slide);
    src2 = __msa_sld_b(src2, src2, slide);
    src3 = __msa_sld_b(src3, src3, slide);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
    res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);

    XORI_B2_128_UB(res0, res1);

    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);

    AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);

    ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 uint8_t *dst,
                                                 int32_t dst_stride,
                                                 uint8_t hor_offset)
{
    uint8_t slide;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0, mask1, mask2;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v8i16 out0, out1, out2, out3;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);
    v16i8 res0, res1, res2, res3;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    if (hor_offset) {
        slide = 3;
    } else {
        slide = 2;
    }

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                     plus20b, out0, out1, out2, out3);

        src0 = __msa_sld_b(src0, src0, slide);
        src1 = __msa_sld_b(src1, src1, slide);
        src2 = __msa_sld_b(src2, src2, slide);
        src3 = __msa_sld_b(src3, src3, slide);

        SRARI_H4_SH(out0, out1, out2, out3, 5);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
                    res0, res1, res2, res3);

        res0 = __msa_aver_s_b(res0, src0);
        res1 = __msa_aver_s_b(res1, src1);
        res2 = __msa_aver_s_b(res2, src2);
        res3 = __msa_aver_s_b(res3, src3);

        XORI_B4_128_SB(res0, res1, res2, res3);
        AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
                      dst, dst_stride);

        dst += (4 * dst_stride);
    }
}
static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   uint8_t hor_offset)
{
    uint32_t loop_cnt;
    v16i8 out0, out1;
    v16i8 src0, src1, src2, src3;
    v16i8 mask0, mask1, mask2, vshf;
    v16u8 dst0, dst1;
    v8i16 res0, res1, res2, res3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    if (hor_offset) {
        vshf = LD_SB(&luma_mask_arr[16 + 96]);
    } else {
        vshf = LD_SB(&luma_mask_arr[96]);
    }

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        src += src_stride;
        LD_SB2(src, 8, src2, src3);
        src += src_stride;

        LD_UB2(dst, dst_stride, dst0, dst1);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);

        out0 = __msa_aver_s_b(out0, src0);
        out1 = __msa_aver_s_b(out1, src2);

        XORI_B2_128_SB(out0, out1);
        AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
        ST_UB2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
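/* Vertical filters interleave adjacent rows and use byte-pair dot
 * products.  The fill constants pack two taps per halfword: 0xfb01 is
 * (+1, -5), 0x1414 is (+20, +20) and 0x01fb is (-5, +1), so three
 * dot-product steps over row pairs (0,1), (2,3) and (4,5) sum to the
 * full 6-tap. */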
static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776;
    v8i16 out10, out32;
    v16i8 filt0, filt1, filt2;
    v16u8 res;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
               src54_r, src65_r, src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    res = PCKEV_XORI128_UB(out10, out32);

    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
    dst0 = __msa_aver_u_b(res, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride)
{
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0, out1, out2, out3;
    v16i8 filt0, filt1, filt2;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
        out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
        out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
        out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
        SRARI_H4_SH(out0, out1, out2, out3, 5);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
                                dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src76_r;
        src32_r = src98_r;
        src21_r = src87_r;
        src43_r = src109_r;
        src4 = src10;
    }
}
static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
                                               int32_t src_stride,
                                               uint8_t *dst, int32_t dst_stride)
{
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 filt0, filt1, filt2;
    v16u8 res0, res1, res2, res3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_l, src21_l, src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        src += (4 * src_stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_r, src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_l, src65_l, src76_l, src87_l);
        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
                    res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }
}
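/* Vertical quarter-pel with averaging: as above, but the half-pel result
 * is additionally averaged with the integer source row selected by
 * ver_offset (the row above the half-pel position for offset 0, the row
 * below for offset 1) before the final average with dst. */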
static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 uint8_t *dst,
                                                 int32_t dst_stride,
                                                 uint8_t ver_offset)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776;
    v8i16 out10, out32;
    v16i8 filt0, filt1, filt2;
    v16u8 res;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, src_stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
               src54_r, src65_r, src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    res = PCKEV_XORI128_UB(out10, out32);

    if (ver_offset) {
        src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
        src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    } else {
        src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
        src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
    }

    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    res = __msa_aver_u_b(res, (v16u8) src32_r);

    ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

    dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
    dst0 = __msa_aver_u_b(res, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
                                                 int32_t src_stride,
                                                 uint8_t *dst,
                                                 int32_t dst_stride,
                                                 uint8_t ver_offset)
{
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r;
    v16i8 res0, res1;
    v16u8 vec0, vec1;
    v16i8 filt0, filt1, filt2;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
                   src76_r, src87_r, src98_r, src109_r);
        out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
        out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
        out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
        out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);

        if (ver_offset) {
            PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
        } else {
            PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
        }

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);

        vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
        vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);

        XORI_B2_128_UB(vec0, vec1);
        AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1);
        ST8x4_UB(vec0, vec1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src76_r;
        src32_r = src98_r;
        src21_r = src87_r;
        src43_r = src109_r;

        src2 = src8;
        src3 = src9;
        src4 = src10;
    }
}
static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   uint8_t ver_offset)
{
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 out0, out1, out2, out3;
    v16i8 filt0, filt1, filt2;
    v16u8 res0, res1, res2, res3;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_r, src21_r, src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
               src10_l, src21_l, src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src5, src6, src7, src8);
        src += (4 * src_stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_r, src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
                   src54_l, src65_l, src76_l, src87_l);
        out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, out0, out1, out2, out3);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        if (ver_offset) {
            res0 = (v16u8) __msa_aver_s_b(out0, src3);
            res1 = (v16u8) __msa_aver_s_b(out1, src4);
            res2 = (v16u8) __msa_aver_s_b(out2, src5);
            res3 = (v16u8) __msa_aver_s_b(out3, src6);
        } else {
            res0 = (v16u8) __msa_aver_s_b(out0, src2);
            res1 = (v16u8) __msa_aver_s_b(out1, src3);
            res2 = (v16u8) __msa_aver_s_b(out2, src4);
            res3 = (v16u8) __msa_aver_s_b(out3, src5);
        }

        XORI_B4_128_UB(res0, res1, res2, res3);
        AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
                    dst0, dst1, dst2, dst3);
        ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;

        src2 = src6;
        src3 = src7;
        src4 = src8;
    }
}
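/* Centre half-pel (j) with averaging against dst: identical filtering to
 * avc_luma_mid_*_msa above, with the extra aver_u_b stage at the end. */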
static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
                                              int32_t src_stride,
                                              uint8_t *dst, int32_t dst_stride)
{
    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 res0, res1, res2, res3;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 tmp0, tmp1, tmp2, tmp3;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
                                                          mask0, mask1, mask2);
    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);

    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);

    hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
                                                          mask0, mask1, mask2);
    hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);

    res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
                                             hz_out3, hz_out4, hz_out5);
    res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
                                             hz_out4, hz_out5, hz_out6);
    res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
                                             hz_out5, hz_out6, hz_out7);
    res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
                                             hz_out6, hz_out7, hz_out8);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    tmp0 = PCKEV_XORI128_UB(res0, res1);
    tmp1 = PCKEV_XORI128_UB(res2, res3);
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
    AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);

    ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
}
static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
                                             int32_t src_stride,
                                             uint8_t *dst, int32_t dst_stride,
                                             int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v16u8 dst0, dst1, dst2, dst3;
    v8i16 res0, res1, res2, res3;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);

        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
                                               hz_out3, hz_out4, hz_out5);
        res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
                                               hz_out4, hz_out5, hz_out6);
        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
                                               hz_out5, hz_out6, hz_out7);
        res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
                                               hz_out6, hz_out7, hz_out8);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
                                dst, dst_stride);

        dst += (4 * dst_stride);

        hz_out0 = hz_out4;
        hz_out1 = hz_out5;
        hz_out2 = hz_out6;
        hz_out3 = hz_out7;
        hz_out4 = hz_out8;
    }
}
static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
                                                int32_t src_stride,
                                                uint8_t *dst,
                                                int32_t dst_stride)
{
    avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16);
    avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
                                     16);
}
static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int32_t height,
                                                  uint8_t horiz_offset)
{
    uint32_t row;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16u8 dst0, dst1, res;
    v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
    v4i32 hz_res0, hz_res1;
    v8i16 res0, res1;
    v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
    v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
    v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
    v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
    v8i16 minus5h = __msa_ldi_h(-5);
    v8i16 plus20h = __msa_ldi_h(20);
    v8i16 zeros = { 0 };

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    for (row = (height >> 1); row--;) {
        LD_SB2(src, src_stride, src5, src6);
        src += (2 * src_stride);

        XORI_B2_128_SB(src5, src6);
        LD_UB2(dst, dst_stride, dst0, dst1);

        dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);

        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
                                        vt_res0, vt_res1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
                                        vt_res2, vt_res3);
        VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
                   mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
        VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
                   mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);

        hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
        DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);

        hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
        DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);

        SRARI_W2_SW(hz_res0, hz_res1, 10);
        SAT_SW2_SW(hz_res0, hz_res1, 7);

        res0 = __msa_srari_h(shf_vec2, 5);
        res1 = __msa_srari_h(shf_vec5, 5);

        SAT_SH2_SH(res0, res1, 7);

        if (horiz_offset) {
            res0 = __msa_ilvod_h(zeros, res0);
            res1 = __msa_ilvod_h(zeros, res1);
        } else {
            ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
        }

        hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
        hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
        res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);

        res = PCKEV_XORI128_UB(res0, res0);

        dst0 = __msa_aver_u_b(res, dst0);

        ST4x2_UB(dst0, dst, dst_stride);
        dst += (2 * dst_stride);

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
    }
}
static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int32_t height,
                                                  uint8_t horiz_offset)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
                                              height, horiz_offset);

        src += 4;
        dst += 4;
    }
}
static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int32_t height,
                                                   uint8_t horiz_offset)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 4; multiple8_cnt--;) {
        avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
                                              height, horiz_offset);

        src += 4;
        dst += 4;
    }
}
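/* 'midv_qrt': quarter-pel positions vertically adjacent to j, with
 * averaging.  The j column and the horizontal half-pel row selected by
 * vert_offset are rounded separately, then combined with a signed
 * rounding average before the final average with dst. */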
static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int32_t height,
                                                  uint8_t vert_offset)
{
    uint32_t loop_cnt;
    uint32_t out0, out1;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 dst0, dst1;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6;
    v8i16 res0, res1, res2, res3;
    v16u8 vec0, vec1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);

    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
                                                          mask0, mask1, mask2);
    hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
                                                          mask0, mask1, mask2);

    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);

    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        XORI_B2_128_SB(src0, src1);
        LD_UB2(dst, dst_stride, dst0, dst1);
        hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
                                                              mask0, mask1,
                                                              mask2);
        hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
                                                 hz_out3, hz_out4, hz_out5);
        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
                                                 hz_out4, hz_out5, hz_out6);

        if (vert_offset) {
            res1 = __msa_srari_h(hz_out3, 5);
            res3 = __msa_srari_h(hz_out4, 5);
        } else {
            res1 = __msa_srari_h(hz_out2, 5);
            res3 = __msa_srari_h(hz_out3, 5);
        }

        SAT_SH2_SH(res1, res3, 7);

        res0 = __msa_aver_s_h(res0, res1);
        res1 = __msa_aver_s_h(res2, res3);

        vec0 = PCKEV_XORI128_UB(res0, res0);
        vec1 = PCKEV_XORI128_UB(res1, res1);

        AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);

        out0 = __msa_copy_u_w((v4i32) dst0, 0);
        out1 = __msa_copy_u_w((v4i32) dst1, 0);
        SW(out0, dst);
        dst += dst_stride;
        SW(out1, dst);
        dst += dst_stride;

        hz_out0 = hz_out2;
        hz_out1 = hz_out3;
        hz_out2 = hz_out4;
        hz_out3 = hz_out5;
        hz_out4 = hz_out6;
    }
}
static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
                                                  int32_t src_stride,
                                                  uint8_t *dst,
                                                  int32_t dst_stride,
                                                  int32_t height,
                                                  uint8_t vert_offset)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
    v8i16 res0, res1, res2, res3;
    v8i16 res4, res5, res6, res7;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * src_stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);

        hz_out5 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out6 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out7 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out8 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);

        res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
                                               hz_out3, hz_out4, hz_out5);
        res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
                                               hz_out4, hz_out5, hz_out6);
        res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
                                               hz_out5, hz_out6, hz_out7);
        res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
                                               hz_out6, hz_out7, hz_out8);

        if (vert_offset) {
            res1 = __msa_srari_h(hz_out3, 5);
            res3 = __msa_srari_h(hz_out4, 5);
            res5 = __msa_srari_h(hz_out5, 5);
            res7 = __msa_srari_h(hz_out6, 5);
        } else {
            res1 = __msa_srari_h(hz_out2, 5);
            res3 = __msa_srari_h(hz_out3, 5);
            res5 = __msa_srari_h(hz_out4, 5);
            res7 = __msa_srari_h(hz_out5, 5);
        }

        SAT_SH4_SH(res1, res3, res5, res7, 7);

        res0 = __msa_aver_s_h(res0, res1);
        res1 = __msa_aver_s_h(res2, res3);
        res2 = __msa_aver_s_h(res4, res5);
        res3 = __msa_aver_s_h(res6, res7);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
                                dst, dst_stride);

        dst += (4 * dst_stride);

        hz_out0 = hz_out4;
        hz_out1 = hz_out5;
        hz_out2 = hz_out6;
        hz_out3 = hz_out7;
        hz_out4 = hz_out8;
    }
}
static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride,
                                                   int32_t height,
                                                   uint8_t vert_offset)
{
    int32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_midv_qrt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
                                              height, vert_offset);

        src += 8;
        dst += 8;
    }
}
static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
                                                 const uint8_t *src_y,
                                                 int32_t src_stride,
                                                 uint8_t *dst,
                                                 int32_t dst_stride)
{
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
    v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
    v8i16 res0, res1;
    v16u8 res;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
    src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
    src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
    src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
    LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
    LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
    XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
    hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1,
                                                          mask0, mask1, mask2);
    hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3,
                                                          mask0, mask1, mask2);
    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);
    LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);

    src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
    src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
    src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
    src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);

    XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);

    /* filter calc */
    vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2,
                                                  src_vt3, src_vt4, src_vt5);
    vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4,
                                                  src_vt5, src_vt6, src_vt7);
    SRARI_H2_SH(vert_out0, vert_out1, 5);
    SAT_SH2_SH(vert_out0, vert_out1, 7);

    res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
    res0 = __msa_srari_h((hz_out0 + vert_out0), 1);

    SAT_SH2_SH(res0, res1, 7);
    res = PCKEV_XORI128_UB(res0, res1);

    dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
    dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
    dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
    dst0 = __msa_aver_u_b(res, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
                                                 const uint8_t *src_y,
                                                 int32_t src_stride,
                                                 uint8_t *dst,
                                                 int32_t dst_stride)
{
    uint32_t loop_cnt;
    v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
    v16u8 dst0, dst1, dst2, dst3;
    v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
    v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
    v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
    v8i16 out0, out1, out2, out3;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
    src_y += (5 * src_stride);

    src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
    src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
    src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
    src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);

    XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
        XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
        src_x += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, src_hz0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, src_hz1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, src_hz2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, src_hz3, mask0, mask1, mask2);
        SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
        SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
        LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
        src_y += (4 * src_stride);

        src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
        src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
        src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
        src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);

        XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
                                        src_vt4, src_vt5, vert_out0, vert_out1);
        AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
                                        src_vt6, src_vt7, vert_out2, vert_out3);
        SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
        SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);

        out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
        out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
        out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
        out3 = __msa_srari_h((hz_out3 + vert_out3), 1);

        SAT_SH4_SH(out0, out1, out2, out3, 7);
        ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
        CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
                                dst, dst_stride);
        dst += (4 * dst_stride);

        src_vt0 = src_vt4;
        src_vt1 = src_vt5;
        src_vt2 = src_vt6;
        src_vt3 = src_vt7;
        src_vt4 = src_vt8;
    }
}

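/* The 16x16 variant is tiled as four 8x8 calls: two blocks side by side
 * across the top half, then the pointers are rewound by 16 columns and
 * advanced by eight rows for the bottom half. */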
static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
                                                   const uint8_t *src_y,
                                                   int32_t src_stride,
                                                   uint8_t *dst,
                                                   int32_t dst_stride)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
                                             dst, dst_stride);
        src_x += 8;
        src_y += 8;
        dst += 8;
    }

    src_x += (8 * src_stride) - 16;
    src_y += (8 * src_stride) - 16;
    dst += (8 * dst_stride) - 16;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
                                             dst, dst_stride);
        src_x += 8;
        src_y += 8;
        dst += 8;
    }
}

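/* The mc00 cases carry no fractional offset, so they reduce to plain block
 * copies (put_) or to a per-byte rounding average with the destination
 * (avg_), without any filtering. */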
void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);

    ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
    dst += (8 * stride);
    ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
}

void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t src0, src1, src2, src3, src4, src5, src6, src7;

    LD4(src, stride, src0, src1, src2, src3);
    src += 4 * stride;
    LD4(src, stride, src4, src5, src6, src7);
    SD4(src0, src1, src2, src3, dst, stride);
    dst += 4 * stride;
    SD4(src4, src5, src6, src7, dst, stride);
}

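/* The avg_ mc00 variants blend the source block into dst with
 * __msa_aver_u_b, i.e. (src + dst + 1) >> 1 per byte; the 8- and 4-wide
 * versions first gather rows into vectors with LD4/LW4 plus
 * INSERT_D2_UB/INSERT_W4_UB. */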
void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
    dst += (8 * stride);

    LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);
    AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
                dst6, dst7);
    ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
}

void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
    v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };

    LD4(src, stride, tp0, tp1, tp2, tp3);
    src += 4 * stride;
    LD4(src, stride, tp4, tp5, tp6, tp7);
    INSERT_D2_UB(tp0, tp1, src0);
    INSERT_D2_UB(tp2, tp3, src1);
    INSERT_D2_UB(tp4, tp5, src2);
    INSERT_D2_UB(tp6, tp7, src3);

    LD4(dst, stride, tp0, tp1, tp2, tp3);
    LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
    INSERT_D2_UB(tp0, tp1, dst0);
    INSERT_D2_UB(tp2, tp3, dst1);
    INSERT_D2_UB(tp4, tp5, dst2);
    INSERT_D2_UB(tp6, tp7, dst3);

    AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
                dst2, dst3);

    ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
}

void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    uint32_t tp0, tp1, tp2, tp3;
    v16u8 src0 = { 0 }, dst0 = { 0 };

    LW4(src, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
    LW4(dst, stride, tp0, tp1, tp2, tp3);
    INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);

    dst0 = __msa_aver_u_b(src0, dst0);

    ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
}

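/* mc10/mc30 are the horizontal quarter-pel cases: the 6-tap half-pel
 * filter is applied and rounded with (val + 16) >> 5, then the result is
 * averaged with the nearest integer pixel, selected by sliding the source
 * window (which starts at src - 2) by 2 bytes for mc10 or 3 bytes for
 * mc30. Input bytes are XORed with 128 so unsigned pixels can be handled
 * with signed saturating arithmetic and __msa_aver_s_b, and XORed back
 * before the store. */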
void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    mask3 = mask0 + 8;
    mask4 = mask1 + 8;
    mask5 = mask2 + 8;
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 16, src0, src1);
        src += stride;
        LD_SB2(src, 16, src2, src3);
        src += stride;
        LD_SB2(src, 16, src4, src5);
        src += stride;
        LD_SB2(src, 16, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
        VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
        SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
        PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
        dst0 = __msa_aver_s_b(dst0, src0);
        dst1 = __msa_aver_s_b(dst1, src2);
        dst2 = __msa_aver_s_b(dst2, src4);
        dst3 = __msa_aver_s_b(dst3, src6);
        XORI_B4_128_SB(dst0, dst1, dst2, dst3);
        ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
}

void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
                 res4, res5, res6, res7);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
    SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
    SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
    PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
    PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
    PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
    tmp0 = __msa_aver_s_b(tmp0, src0);
    tmp1 = __msa_aver_s_b(tmp1, src1);
    tmp2 = __msa_aver_s_b(tmp2, src4);
    tmp3 = __msa_aver_s_b(tmp3, src5);
    XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
    ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
}

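/* The 4-wide quarter-pel cases pack rows two per register: the shuffle
 * masks from luma_mask_arr[48] pull taps from a pair of rows at once, and
 * the shifted integer pixels for the final signed average are gathered
 * with insve_w/insve_d. */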
void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
}

void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
    SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
    src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
    src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    res = __msa_aver_s_b(res, src0);
    res = (v16i8) __msa_xori_b((v16u8) res, 128);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
}

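/* mc20 is the pure horizontal half-pel case: 6-tap filtering with
 * (val + 16) >> 5 rounding and saturation to 8 bits, with no final
 * quarter-pel average. */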
void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    src -= 2;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB2(src, 8, src0, src1);
        src += stride;
        LD_SB2(src, 8, src2, src3);
        src += stride;
        LD_SB2(src, 8, src4, src5);
        src += stride;
        LD_SB2(src, 8, src6, src7);
        src += stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res0, res1, res2, res3);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res0, res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
        HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
        DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
                     minus5b, res4, res5, res6, res7);
        DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
                     plus20b, res4, res5, res6, res7);
        SRARI_H4_SH(res0, res1, res2, res3, 5);
        SRARI_H4_SH(res4, res5, res6, res7, 5);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        SAT_SH4_SH(res4, res5, res6, res7, 7);
        PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
                    vec2, vec3);
        XORI_B4_128_SB(vec0, vec1, vec2, vec3);
        ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
        dst += (4 * stride);
    }
}

void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res0, res1, res2, res3);
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                 plus20b, res0, res1, res2, res3);
    VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
    VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
    HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
    VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
                 res4, res5, res6, res7);
    VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
    VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
                 plus20b, res4, res5, res6, res7);
    SRARI_H4_SH(res0, res1, res2, res3, 5);
    SRARI_H4_SH(res4, res5, res6, res7, 5);
    SAT_SH4_SH(res0, res1, res2, res3, 7);
    SAT_SH4_SH(res4, res5, res6, res7, 7);
    out0 = PCKEV_XORI128_UB(res0, res1);
    out1 = PCKEV_XORI128_UB(res2, res3);
    out2 = PCKEV_XORI128_UB(res4, res5);
    out3 = PCKEV_XORI128_UB(res6, res7);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}

void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    v16u8 out;
    v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 res0, res1;
    v16i8 minus5b = __msa_ldi_b(-5);
    v16i8 plus20b = __msa_ldi_b(20);

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    LD_SB4(src - 2, stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
    HADD_SB2_SH(vec0, vec1, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
    DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
    DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
    SRARI_H2_SH(res0, res1, 5);
    SAT_SH2_SH(res0, res1, 7);
    out = PCKEV_XORI128_UB(res0, res1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}

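/* Vertical cases interleave adjacent rows byte-wise (ILVR/ILVL) so the
 * 6-tap column filter becomes three dot products on byte pairs; the
 * coefficient pairs {1, -5}, {20, 20} and {-5, 1} are replicated from
 * 0xfb01, 0x1414 and 0x1fb. mc01 then averages the half-pel result with
 * the integer row on its upper side, mc03 with the row below it. */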
void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src2 = src6;
        src3 = src7;
        src4 = src8;
    }
}

void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
        res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
        res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
        res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src3 = src7;
        src4 = src8;
    }
}

void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
    PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}

void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
    v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
               src109_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
    PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
    PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
    out0 = __msa_aver_s_b(out0, tmp0);
    out1 = __msa_aver_s_b(out1, tmp1);
    out2 = __msa_aver_s_b(out2, tmp2);
    out3 = __msa_aver_s_b(out3, tmp3);
    XORI_B4_128_SB(out0, out1, out2, out3);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}

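/* For 4-wide columns, two interleaved row pairs are further packed into a
 * single register (src2110, src4332, ...) so that one dot product covers
 * four output rows at once. */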
void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
    src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}

void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}

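/* The diagonal quarter-pel cases (xy = 11, 31, 13, 33) average a
 * horizontal and a vertical half-pel plane. The helpers take separate
 * src_x/src_y base pointers: both are offset by -2 columns resp. -2 rows
 * for the filter window, and by one extra pixel (mc31/mc33) or one extra
 * row (mc13/mc33) to select the right quadrant. */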
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src - 2,
                            src - (stride * 2), stride, dst, stride, 16);
}

void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src - 2,
                            src - (stride * 2) +
                            sizeof(uint8_t), stride, dst, stride, 16);
}

void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src + stride - 2,
                            src - (stride * 2), stride, dst, stride, 16);
}

void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src + stride - 2,
                            src - (stride * 2) +
                            sizeof(uint8_t), stride, dst, stride, 16);
}

void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src - 2, src - (stride * 2), stride, dst, stride, 8);
}

void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 8);
}

void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src + stride - 2,
                           src - (stride * 2), stride, dst, stride, 8);
}

void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src + stride - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 8);
}

void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src - 2, src - (stride * 2), stride, dst, stride, 4);
}

void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 4);
}

void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src + stride - 2,
                           src - (stride * 2), stride, dst, stride, 4);
}

void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src + stride - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 4);
}

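/* mc21/mc23: the horizontal half-pel plane is kept at 16-bit precision
 * (hz_out0..8); the vertical 6-tap filter then runs on those rows with
 * 32-bit accumulators and is rounded with (val + 512) >> 10 inside
 * AVC_DOT_SW3_SW. The quarter-pel output averages that center result with
 * the rounded horizontal half-pel value of the same row (mc21) or of the
 * next row (mc23). */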
void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src = src_tmp;
        dst = dst_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);
            src += (4 * stride);

            XORI_B4_128_SB(src5, src6, src7, src8);

            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out2, 5);
            dst3 = __msa_srari_h(hz_out3, 5);
            dst5 = __msa_srari_h(hz_out4, 5);
            dst7 = __msa_srari_h(hz_out5, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST8x4_UB(out0, out1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    uint8_t *dst_tmp = dst;
    const uint8_t *src_tmp = src - (2 * stride) - 2;
    uint32_t multiple8_cnt, loop_cnt;
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, mask0, mask1;
    v16i8 mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out10_l, hz_out21_l;
    v8i16 hz_out32_l, hz_out43_l, hz_out54_l, hz_out65_l, hz_out76_l;
    v8i16 hz_out87_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        src = src_tmp;
        dst = dst_tmp;

        LD_SB5(src, stride, src0, src1, src2, src3, src4);
        XORI_B5_128_SB(src0, src1, src2, src3, src4);
        src += (5 * stride);

        hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
        hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
        hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
        hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
        hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

        for (loop_cnt = 4; loop_cnt--;) {
            LD_SB4(src, stride, src5, src6, src7, src8);
            src += (4 * stride);

            XORI_B4_128_SB(src5, src6, src7, src8);

            hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
            hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
            hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
            hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

            ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_r, hz_out21_r, hz_out32_r,
                       hz_out43_r);
            ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2,
                       hz_out4, hz_out3, hz_out10_l, hz_out21_l, hz_out32_l,
                       hz_out43_l);
            ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_r, hz_out65_r, hz_out76_r,
                       hz_out87_r);
            ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6,
                       hz_out8, hz_out7, hz_out54_l, hz_out65_l, hz_out76_l,
                       hz_out87_l);

            tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0,
                                  filt1, filt2);
            dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0,
                                  filt1, filt2);
            dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0,
                                  filt1, filt2);
            dst4 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
            tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0,
                                  filt1, filt2);
            tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0,
                                  filt1, filt2);
            dst6 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

            dst1 = __msa_srari_h(hz_out3, 5);
            dst3 = __msa_srari_h(hz_out4, 5);
            dst5 = __msa_srari_h(hz_out5, 5);
            dst7 = __msa_srari_h(hz_out6, 5);
            SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);

            dst0 = __msa_aver_s_h(dst0, dst1);
            dst1 = __msa_aver_s_h(dst2, dst3);
            dst2 = __msa_aver_s_h(dst4, dst5);
            dst3 = __msa_aver_s_h(dst6, dst7);

            out0 = PCKEV_XORI128_UB(dst0, dst1);
            out1 = PCKEV_XORI128_UB(dst2, dst3);
            ST8x4_UB(out0, out1, dst, stride);
            dst += (4 * stride);

            hz_out0 = hz_out4;
            hz_out1 = hz_out5;
            hz_out2 = hz_out6;
            hz_out3 = hz_out7;
            hz_out4 = hz_out8;
        }

        src_tmp += 8;
        dst_tmp += 8;
    }
}

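/* The 8-wide mc21/mc23 are fully unrolled: the first four output rows are
 * produced straight-line from hz_out0..8, then four more horizontal rows
 * (hz_out9..12) are filtered for the remaining output rows, reusing the
 * interleavings already computed for the first half. */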
void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 5);
    SAT_SH4_SH(hz_out2, hz_out3, hz_out4, hz_out5, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out3);
    dst2 = __msa_aver_s_h(dst2, hz_out4);
    dst3 = __msa_aver_s_h(dst3, hz_out5);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 5);
    SAT_SH4_SH(hz_out6, hz_out7, hz_out8, hz_out9, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out6);
    dst1 = __msa_aver_s_h(dst1, hz_out7);
    dst2 = __msa_aver_s_h(dst2, hz_out8);
    dst3 = __msa_aver_s_h(dst3, hz_out9);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
}

void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, hz_out9, hz_out10, hz_out11, hz_out12;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r, hz_out89_r, hz_out910_r;
    v8i16 hz_out1110_r, hz_out1211_r, dst0, dst1, dst2, dst3;
    v8i16 hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l, hz_out54_l;
    v8i16 hz_out65_l, hz_out76_l, hz_out87_l, hz_out89_l, hz_out910_l;
    v8i16 hz_out1110_l, hz_out1211_l, filt0, filt1, filt2;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);

    /* Each 32-bit lane of filt0/1/2 packs two 16-bit taps for the dot
     * products on interleaved halfword pairs: (1, -5), (20, 20), (-5, 1). */
    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    src += (5 * stride);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src0, mask0, mask1, mask2);
    hz_out1 = AVC_HORZ_FILTER_SH(src1, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src2, mask0, mask1, mask2);
    hz_out3 = AVC_HORZ_FILTER_SH(src3, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src4, mask0, mask1, mask2);

    LD_SB4(src, stride, src5, src6, src7, src8);
    src += (4 * stride);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out5 = AVC_HORZ_FILTER_SH(src5, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src6, mask0, mask1, mask2);
    hz_out7 = AVC_HORZ_FILTER_SH(src7, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVL_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_l, hz_out21_l, hz_out32_l, hz_out43_l);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);
    ILVL_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_l, hz_out65_l, hz_out76_l, hz_out87_l);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out10_l, hz_out32_l, hz_out54_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_l, hz_out43_l, hz_out65_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out32_l, hz_out54_l, hz_out76_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_l, hz_out65_l, hz_out87_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 5);
    SAT_SH4_SH(hz_out3, hz_out4, hz_out5, hz_out6, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out3);
    dst1 = __msa_aver_s_h(dst1, hz_out4);
    dst2 = __msa_aver_s_h(dst2, hz_out5);
    dst3 = __msa_aver_s_h(dst3, hz_out6);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
    dst += (4 * stride);

    LD_SB4(src, stride, src9, src10, src11, src12);
    XORI_B4_128_SB(src9, src10, src11, src12);
    hz_out9 = AVC_HORZ_FILTER_SH(src9, src9, mask0, mask1, mask2);
    hz_out10 = AVC_HORZ_FILTER_SH(src10, src10, mask0, mask1, mask2);
    hz_out11 = AVC_HORZ_FILTER_SH(src11, src11, mask0, mask1, mask2);
    hz_out12 = AVC_HORZ_FILTER_SH(src12, src12, mask0, mask1, mask2);
    ILVR_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_r, hz_out910_r, hz_out1110_r,
               hz_out1211_r);
    ILVL_H4_SH(hz_out9, hz_out8, hz_out10, hz_out9, hz_out11, hz_out10,
               hz_out12, hz_out11, hz_out89_l, hz_out910_l, hz_out1110_l,
               hz_out1211_l);
    tmp0 = AVC_DOT_SW3_SW(hz_out54_r, hz_out76_r, hz_out89_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out54_l, hz_out76_l, hz_out89_l, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out65_r, hz_out87_r, hz_out910_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out65_l, hz_out87_l, hz_out910_l, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out76_r, hz_out89_r, hz_out1110_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out76_l, hz_out89_l, hz_out1110_l, filt0, filt1,
                          filt2);
    dst2 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out87_r, hz_out910_r, hz_out1211_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out87_l, hz_out910_l, hz_out1211_l, filt0, filt1,
                          filt2);
    dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    SRARI_H4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 5);
    SAT_SH4_SH(hz_out7, hz_out8, hz_out9, hz_out10, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out7);
    dst1 = __msa_aver_s_h(dst1, hz_out8);
    dst2 = __msa_aver_s_h(dst2, hz_out9);
    dst3 = __msa_aver_s_h(dst3, hz_out10);

    out0 = PCKEV_XORI128_UB(dst0, dst1);
    out1 = PCKEV_XORI128_UB(dst2, dst3);
    ST8x4_UB(out0, out1, dst, stride);
}
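
/* ff_put_h264_qpel4_mc21_msa/mc23 below share one trick: with the
 * two-source masks at &luma_mask_arr[48], AVC_HORZ_FILTER_SH filters two
 * 4-pixel rows per call, so hz_out0/2/4/6 carry an even source row in the
 * low half and the following odd row in the high half; PCKOD_D2_SH then
 * peels the odd rows out into hz_out1/3/5/7.
 *
 * For reference, a scalar model of one output pixel (an illustrative
 * sketch, not part of the build; s[][] is the source, av_clip_uint8() the
 * usual 0..255 clamp, and r/c index the destination block):
 *
 *     // 6-tap (1, -5, 20, 20, -5, 1) horizontal half-pel, kept at full
 *     // precision for rows r-2 .. r+3:
 *     int hz[6], k;
 *     for (k = 0; k < 6; k++)
 *         hz[k] = s[r - 2 + k][c - 2] - 5 * s[r - 2 + k][c - 1]
 *               + 20 * s[r - 2 + k][c] + 20 * s[r - 2 + k][c + 1]
 *               - 5 * s[r - 2 + k][c + 2] + s[r - 2 + k][c + 3];
 *     // vertical 6-tap over the intermediates -> centre half-pel "j":
 *     int j = av_clip_uint8((hz[0] - 5 * hz[1] + 20 * hz[2] + 20 * hz[3]
 *                            - 5 * hz[4] + hz[5] + 512) >> 10);
 *     int b = av_clip_uint8((hz[2] + 16) >> 5);  // horizontal half-pel
 *     dst[r][c] = (j + b + 1) >> 1;  // mc21; mc23 averages j with the
 *                                    // half-pel of the next row instead
 */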
void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    /* hz_out2/hz_out4 still hold two consecutive half-pel rows apiece
     * (even row low, odd row high), matching the two rows packed in each
     * of dst0/dst1, so they can be averaged in directly. */
    SRARI_H2_SH(hz_out2, hz_out4, 5);
    SAT_SH2_SH(hz_out2, hz_out4, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out2);
    dst1 = __msa_aver_s_h(dst1, hz_out4);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
}
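
/* The (2, 3) position differs from mc21 only in which horizontal half-pel
 * rows are averaged in: PCKEV_D2_SH below packs rows 3/4 and 5/6 of the
 * hz_out array into the two aver operands instead of using the row pairs
 * already sitting in hz_out2 and hz_out4. */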
void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int32_t filt_const0 = 0xfffb0001;
    const int32_t filt_const1 = 0x140014;
    const int32_t filt_const2 = 0x1fffb;
    v16u8 res;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 mask0, mask1, mask2;
    v8i16 hz_out0, hz_out1, hz_out2, hz_out3, hz_out4, hz_out5, hz_out6;
    v8i16 hz_out7, hz_out8, dst0, dst1, filt0, filt1, filt2;
    v8i16 hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r, hz_out54_r;
    v8i16 hz_out65_r, hz_out76_r, hz_out87_r;
    v4i32 tmp0, tmp1;

    LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);

    filt0 = (v8i16) __msa_fill_w(filt_const0);
    filt1 = (v8i16) __msa_fill_w(filt_const1);
    filt2 = (v8i16) __msa_fill_w(filt_const2);

    src -= ((2 * stride) + 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    hz_out0 = AVC_HORZ_FILTER_SH(src0, src1, mask0, mask1, mask2);
    hz_out2 = AVC_HORZ_FILTER_SH(src2, src3, mask0, mask1, mask2);
    hz_out4 = AVC_HORZ_FILTER_SH(src4, src5, mask0, mask1, mask2);
    hz_out6 = AVC_HORZ_FILTER_SH(src6, src7, mask0, mask1, mask2);
    hz_out8 = AVC_HORZ_FILTER_SH(src8, src8, mask0, mask1, mask2);
    PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
    PCKOD_D2_SH(hz_out4, hz_out4, hz_out6, hz_out6, hz_out5, hz_out7);

    ILVR_H4_SH(hz_out1, hz_out0, hz_out2, hz_out1, hz_out3, hz_out2, hz_out4,
               hz_out3, hz_out10_r, hz_out21_r, hz_out32_r, hz_out43_r);
    ILVR_H4_SH(hz_out5, hz_out4, hz_out6, hz_out5, hz_out7, hz_out6, hz_out8,
               hz_out7, hz_out54_r, hz_out65_r, hz_out76_r, hz_out87_r);

    tmp0 = AVC_DOT_SW3_SW(hz_out10_r, hz_out32_r, hz_out54_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out21_r, hz_out43_r, hz_out65_r, filt0, filt1,
                          filt2);
    dst0 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
    tmp0 = AVC_DOT_SW3_SW(hz_out32_r, hz_out54_r, hz_out76_r, filt0, filt1,
                          filt2);
    tmp1 = AVC_DOT_SW3_SW(hz_out43_r, hz_out65_r, hz_out87_r, filt0, filt1,
                          filt2);
    dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);

    PCKEV_D2_SH(hz_out4, hz_out3, hz_out6, hz_out5, hz_out0, hz_out1);
    SRARI_H2_SH(hz_out0, hz_out1, 5);
    SAT_SH2_SH(hz_out0, hz_out1, 7);

    dst0 = __msa_aver_s_h(dst0, hz_out0);
    dst1 = __msa_aver_s_h(dst1, hz_out1);

    res = PCKEV_XORI128_UB(dst0, dst1);
    ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
}
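
/* The three mc02 functions below implement the pure vertical half-pel
 * ("h") position: no horizontal pass, just one column-wise 6-tap filter
 * with (x + 16) >> 5 rounding.  Scalar model (an illustrative sketch, not
 * part of the build):
 *
 *     int sum = s[r - 2][c] - 5 * s[r - 1][c] + 20 * s[r][c]
 *             + 20 * s[r + 1][c] - 5 * s[r + 2][c] + s[r + 3][c];
 *     dst[r][c] = av_clip_uint8((sum + 16) >> 5);
 *
 * In the vector code the rounding is SRARI_H* by 5, and the clamp to
 * 0..255 comes from the 7-bit saturate followed by the XORI-128 unbias
 * of the signed-byte dot products. */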
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    int32_t loop_cnt;
    int16_t filt_const0 = 0xfb01;
    int16_t filt_const1 = 0x1414;
    int16_t filt_const2 = 0x1fb;
    v16u8 res0, res1, res2, res3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
    v16i8 src65_l, src87_l, filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    /* Each 16-bit lane of filt0/1/2 packs two signed-byte taps for the
     * dot products on interleaved row pairs: (1, -5), (20, 20), (-5, 1). */
    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);
    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
               src32_l, src43_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, stride, src5, src6, src7, src8);
        src += (4 * stride);

        XORI_B4_128_SB(src5, src6, src7, src8);
        ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
                   src65_r, src76_r, src87_r);
        ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
                   src65_l, src76_l, src87_l);
        out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
        out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
        out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
        out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
        out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
        out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
        out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
        out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, res0, res1, res2, res3);
        XORI_B4_128_UB(res0, res1, res2, res3);
        ST_UB4(res0, res1, res2, res3, dst, stride);
        dst += (4 * stride);

        /* Slide the 6-tap window down four rows for the next iteration. */
        src10_r = src54_r;
        src32_r = src76_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src4 = src8;
    }
}
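
/* The 8x8 vertical half-pel below is fully unrolled: all 13 needed source
 * rows are loaded and interleaved up front, and the eight output rows are
 * produced without a loop.  Beware the recycled register names after the
 * ILVR_B4_SB calls: src76_r/src87_r/src98_r/src109_r actually hold the
 * 54/65/76/87 row pairs, which is why the dot-product operand lists look
 * shifted even though the filtered rows come out right. */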
void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src10_r, src21_r, src32_r, src43_r, src76_r, src87_r;
    v16i8 src98_r, src109_r, src89_r, src910_r, src1110_r, src1211_r;
    v16i8 filt0, filt1, filt2;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * stride);
    LD_SB5(src, stride, src8, src9, src10, src11, src12);
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src76_r, src87_r,
               src98_r, src109_r);
    ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src89_r,
               src910_r, src1110_r, src1211_r);
    XORI_B4_128_SB(src10_r, src21_r, src32_r, src43_r);
    XORI_B4_128_SB(src76_r, src87_r, src98_r, src109_r);
    XORI_B4_128_SB(src89_r, src910_r, src1110_r, src1211_r);
    out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
    out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
    out2_r = AVC_DOT_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
    out3_r = AVC_DOT_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
    out4_r = AVC_DOT_SH3_SH(src76_r, src98_r, src89_r, filt0, filt1, filt2);
    out5_r = AVC_DOT_SH3_SH(src87_r, src109_r, src910_r, filt0, filt1, filt2);
    out6_r = AVC_DOT_SH3_SH(src98_r, src89_r, src1110_r, filt0, filt1, filt2);
    out7_r = AVC_DOT_SH3_SH(src109_r, src910_r, src1211_r, filt0, filt1, filt2);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
    SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);
    out3 = PCKEV_XORI128_UB(out6_r, out7_r);
    ST8x8_UB(out0, out1, out2, out3, dst, stride);
}
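
/* For 4-wide columns two interleaved row pairs are packed per vector
 * (ILVR_D4_SB), so each AVC_DOT_SH3_SH call filters two output rows and
 * the whole 4x4 block needs only two dot-product sequences. */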
void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    const int16_t filt_const0 = 0xfb01;
    const int16_t filt_const1 = 0x1414;
    const int16_t filt_const2 = 0x1fb;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    LD_SB4(src, stride, src5, src6, src7, src8);

    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D4_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src87_r,
               src76_r, src2110, src4332, src6554, src8776);
    XORI_B4_128_SB(src2110, src4332, src6554, src8776);
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);
    SAT_SH2_SH(out10, out32, 7);
    out = PCKEV_XORI128_UB(out10, out32);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
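
/* The remaining qpel positions are thin wrappers around shared helpers.
 * The source adjustments centre the 6-tap windows: "- 2" covers the two
 * extra columns to the left of the horizontal filter, "- (2 * stride)"
 * the two extra rows above the vertical one, while "+ sizeof(uint8_t)"
 * and "+ stride" pick the right-hand or lower neighbour for the
 * quarter-pel averages. */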
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
                              stride, dst, stride, 16, 0);
}

void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
                              stride, dst, stride, 16, 1);
}

void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
}

void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
}

void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
}

void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
}

void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_mid_16w_msa(src - (2 * stride) - 2, stride, dst, stride, 16);
}

void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8);
}

void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4);
}
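
/* ff_avg_*: identical interpolation to the ff_put_* variants above, but
 * the helpers round-average the result with the pixels already in dst. */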
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0);
}

void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1);
}

void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0);
}

void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1);
}

void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0);
}

void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1);
}

void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride);
}

void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride);
}

void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride);
}

void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
                                           stride, dst, stride, 0);
}

void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
                                           stride, dst, stride, 1);
}

void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
                                         stride, dst, stride, 0);
}

void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
                                         stride, dst, stride, 1);
}

void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
                                         stride, dst, stride, 0);
}

void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
                                         stride, dst, stride, 1);
}

void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
                                           src - (stride * 2),
                                           stride, dst, stride);
}

void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
                                           src - (stride * 2) +
                                           sizeof(uint8_t), stride,
                                           dst, stride);
}

void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
                                           src - (stride * 2),
                                           stride, dst, stride);
}

void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
                                           src - (stride * 2) +
                                           sizeof(uint8_t), stride,
                                           dst, stride);
}

void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}

void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}

void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}

void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}

void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 0);
}

void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 1);
}

void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 0);
}

void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 1);
}

void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 0);
}

void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 1);
}

void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_vt_and_aver_dst_16x16_msa(src - (stride * 2), stride, dst, stride);
}

void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_and_aver_dst_8x8_msa(src - (stride * 2), stride, dst, stride);
}

void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_and_aver_dst_4x4_msa(src - (stride * 2), stride, dst, stride);
}

void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 0);
}

void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 1);
}

void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 0);
}

void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 1);
}

void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 0);
}

void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 1);
}

void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_mid_and_aver_dst_16x16_msa(src - (2 * stride) - 2,
                                        stride, dst, stride);
}

void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                     stride, dst, stride, 8);
}

void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_and_aver_dst_4x4_msa(src - (2 * stride) - 2,
                                      stride, dst, stride);
}