/*
 * Copyright (c) 2015 - 2017 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "h264dsp_mips.h"
/* 6-tap vertical filter on six halfword rows (1, -5, 20, 20, -5, 1),
 * full 8-lane width: accumulates in 32 bits, rounds by (1 << 9) >> 10,
 * saturates to 8-bit signed range and packs back to one v8i16 result.
 * GCC/Clang statement expression: evaluates to out0_m. */
#define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5)    \
( {                                                                      \
    v4i32 tmp0_m, tmp1_m;                                                \
    v8i16 out0_m, out1_m, out2_m, out3_m;                                \
    v8i16 minus5h_m = __msa_ldi_h(-5);                                   \
    v8i16 plus20h_m = __msa_ldi_h(20);                                   \
                                                                         \
    ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m);                               \
                                                                         \
    tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m);             \
    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);             \
                                                                         \
    ILVRL_H2_SH(in1, in4, out0_m, out1_m);                               \
    DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m);  \
    ILVRL_H2_SH(in2, in3, out2_m, out3_m);                               \
    DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m);  \
                                                                         \
    SRARI_W2_SW(tmp0_m, tmp1_m, 10);                                     \
    SAT_SW2_SW(tmp0_m, tmp1_m, 7);                                       \
    out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m);              \
                                                                         \
    out0_m;                                                              \
} )
/* 6-tap horizontal luma filter on one byte vector: the three shuffle
 * masks pair pixels with coefficients +1, -5 and +20 respectively.
 * Statement expression: evaluates to the v8i16 filtered row out1_m
 * (unrounded, 16-bit intermediate). */
#define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2)     \
( {                                                     \
    v8i16 out0_m, out1_m;                               \
    v16i8 tmp0_m, tmp1_m;                               \
    v16i8 minus5b = __msa_ldi_b(-5);                    \
    v16i8 plus20b = __msa_ldi_b(20);                    \
                                                        \
    tmp0_m = __msa_vshf_b((v16i8) mask0, in, in);       \
    out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m);            \
                                                        \
    tmp0_m = __msa_vshf_b((v16i8) mask1, in, in);       \
    out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m);  \
                                                        \
    tmp1_m = __msa_vshf_b((v16i8) (mask2), in, in);     \
    out1_m = __msa_dpadd_s_h(out0_m, plus20b, tmp1_m);  \
                                                        \
    out1_m;                                             \
} )
/* Shuffle masks for the 6-tap horizontal filter.
 * Rows 0-2: 8-pixel-wide case (single source vector).
 * Rows 3-5: 4-pixel-wide case (indices >= 16 select from the second
 *           source vector of a two-vector shuffle).
 * Rows 6-7: slide masks used by the quarter-pel averaging paths. */
static const uint8_t luma_mask_arr[16 * 8] = {
    /* 8 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
    1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
    2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,

    /* 4 width cases */
    0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
    1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
    2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,

    2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
    3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
};
/* 6-tap vertical filter on six byte rows; writes the two 16-bit
 * accumulator halves (right/left interleave) into out1 and out2.
 * No rounding/saturation here — callers post-process. */
#define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5,  \
                                        out1, out2)                          \
{                                                                            \
    v16i8 tmp0_m, tmp1_m;                                                    \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
                                                                             \
    ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m);                                 \
    HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2);                                 \
    ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2);          \
    ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m);                                 \
    DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2);          \
}
/* Right-half-only variant of the 6-tap vertical byte filter:
 * interleaves only the low 8 bytes of each row pair.
 * Statement expression: evaluates to the v8i16 accumulator tmp1_m. */
#define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
( {                                                                            \
    v8i16 tmp1_m;                                                              \
    v16i8 tmp0_m, tmp2_m;                                                      \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                         \
    v16i8 plus20b_m = __msa_ldi_b(20);                                         \
                                                                               \
    tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0);                 \
    tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m);                   \
                                                                               \
    ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m);                        \
    DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m);        \
                                                                               \
    tmp1_m;                                                                    \
} )
/* Right-half 6-tap vertical filter on halfword rows with final
 * round-by-10 shift, saturation to 7 bits and even-pack.
 * Statement expression: evaluates to the packed v8i16 tmp2_m. */
#define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5)  \
( {                                                                            \
    v4i32 tmp1_m;                                                              \
    v8i16 tmp2_m, tmp3_m;                                                      \
    v8i16 minus5h_m = __msa_ldi_h(-5);                                         \
    v8i16 plus20h_m = __msa_ldi_h(20);                                         \
                                                                               \
    tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0);                 \
    tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m);                   \
                                                                               \
    ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m);                        \
    DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m);        \
                                                                               \
    tmp1_m = __msa_srari_w(tmp1_m, 10);                                        \
    tmp1_m = __msa_sat_s_w(tmp1_m, 7);                                         \
                                                                               \
    tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m);                    \
                                                                               \
    tmp2_m;                                                                    \
} )
/* 6-tap horizontal filter across a two-vector source pair (src0:src1)
 * using the three coefficient masks. Statement expression: evaluates
 * to the v8i16 filtered result hz_out_m (unrounded). */
#define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,              \
                                                    mask0, mask1, mask2)     \
( {                                                                          \
    v8i16 hz_out_m;                                                          \
    v16i8 vec0_m, vec1_m, vec2_m;                                            \
    v16i8 minus5b_m = __msa_ldi_b(-5);                                       \
    v16i8 plus20b_m = __msa_ldi_b(20);                                       \
                                                                             \
    vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0);        \
    hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m);                               \
                                                                             \
    VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m);        \
    DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m);  \
                                                                             \
    hz_out_m;                                                                \
} )
/* Three-term dot-product accumulate: out = in0*coeff0 + in1*coeff1 +
 * in2*coeff2 over byte lanes into halfword accumulators.
 * Statement expression: evaluates to out0_m. */
#define AVC_DOT_SH3_SH(in0, in1, in2, coeff0, coeff1, coeff2)       \
( {                                                                 \
    v8i16 out0_m;                                                   \
                                                                    \
    out0_m = __msa_dotp_s_h((v16i8) in0, (v16i8) coeff0);           \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in1, (v16i8) coeff1);  \
    out0_m = __msa_dpadd_s_h(out0_m, (v16i8) in2, (v16i8) coeff2);  \
                                                                    \
    out0_m;                                                         \
} )
162 static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride,
163 uint8_t *dst, int32_t dst_stride,
167 v16i8 src0, src1, src2, src3;
170 v16i8 mask0, mask1, mask2;
171 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
172 v16i8 minus5b = __msa_ldi_b(-5);
173 v16i8 plus20b = __msa_ldi_b(20);
175 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
176 for (loop_cnt = (height >> 2); loop_cnt--;) {
177 LD_SB4(src, src_stride, src0, src1, src2, src3);
178 src += (4 * src_stride);
180 XORI_B4_128_SB(src0, src1, src2, src3);
181 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
182 HADD_SB2_SH(vec0, vec1, res0, res1);
183 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
184 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
185 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
186 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
187 SRARI_H2_SH(res0, res1, 5);
188 SAT_SH2_SH(res0, res1, 7);
189 out = PCKEV_XORI128_UB(res0, res1);
190 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
191 dst += (4 * dst_stride);
195 static void avc_luma_hz_8w_msa(const uint8_t *src, int32_t src_stride,
196 uint8_t *dst, int32_t dst_stride,
200 v16i8 src0, src1, src2, src3;
201 v8i16 res0, res1, res2, res3;
202 v16i8 mask0, mask1, mask2;
203 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
204 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
205 v16i8 minus5b = __msa_ldi_b(-5);
206 v16i8 plus20b = __msa_ldi_b(20);
209 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
211 for (loop_cnt = (height >> 2); loop_cnt--;) {
212 LD_SB4(src, src_stride, src0, src1, src2, src3);
213 src += (4 * src_stride);
215 XORI_B4_128_SB(src0, src1, src2, src3);
216 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
217 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
218 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
219 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
220 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
221 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
222 res0, res1, res2, res3);
223 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
224 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
225 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
226 plus20b, res0, res1, res2, res3);
227 SRARI_H4_SH(res0, res1, res2, res3, 5);
228 SAT_SH4_SH(res0, res1, res2, res3, 7);
229 out0 = PCKEV_XORI128_UB(res0, res1);
230 out1 = PCKEV_XORI128_UB(res2, res3);
231 ST8x4_UB(out0, out1, dst, dst_stride);
232 dst += (4 * dst_stride);
236 static void avc_luma_hz_16w_msa(const uint8_t *src, int32_t src_stride,
237 uint8_t *dst, int32_t dst_stride,
241 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
242 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
243 v16i8 mask0, mask1, mask2;
244 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
245 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
246 v16i8 minus5b = __msa_ldi_b(-5);
247 v16i8 plus20b = __msa_ldi_b(20);
249 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
251 for (loop_cnt = (height >> 2); loop_cnt--;) {
252 LD_SB2(src, 8, src0, src1);
254 LD_SB2(src, 8, src2, src3);
257 XORI_B4_128_SB(src0, src1, src2, src3);
258 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
259 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
260 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
261 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
262 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
263 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
264 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
265 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
266 minus5b, res0, res1, res2, res3);
267 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
268 plus20b, res0, res1, res2, res3);
270 LD_SB2(src, 8, src4, src5);
272 LD_SB2(src, 8, src6, src7);
275 XORI_B4_128_SB(src4, src5, src6, src7);
276 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
277 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
278 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
279 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
280 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
281 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
282 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
283 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
284 minus5b, res4, res5, res6, res7);
285 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
286 plus20b, res4, res5, res6, res7);
287 SRARI_H4_SH(res0, res1, res2, res3, 5);
288 SRARI_H4_SH(res4, res5, res6, res7, 5);
289 SAT_SH4_SH(res0, res1, res2, res3, 7);
290 SAT_SH4_SH(res4, res5, res6, res7, 7);
291 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
292 vec0, vec1, vec2, vec3);
293 XORI_B4_128_SB(vec0, vec1, vec2, vec3);
295 ST_SB4(vec0, vec1, vec2, vec3, dst, dst_stride);
296 dst += (4 * dst_stride);
300 static void avc_luma_hz_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
301 uint8_t *dst, int32_t dst_stride,
302 int32_t height, uint8_t hor_offset)
306 v16i8 src0, src1, src2, src3;
308 v16i8 res, mask0, mask1, mask2;
309 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
310 v16i8 minus5b = __msa_ldi_b(-5);
311 v16i8 plus20b = __msa_ldi_b(20);
313 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
314 slide = 2 + hor_offset;
316 for (loop_cnt = (height >> 2); loop_cnt--;) {
317 LD_SB4(src, src_stride, src0, src1, src2, src3);
318 src += (4 * src_stride);
320 XORI_B4_128_SB(src0, src1, src2, src3);
321 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
322 HADD_SB2_SH(vec0, vec1, res0, res1);
323 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
324 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
325 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
326 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
327 SRARI_H2_SH(res0, res1, 5);
328 SAT_SH2_SH(res0, res1, 7);
330 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
331 src0 = __msa_sld_b(src0, src0, slide);
332 src1 = __msa_sld_b(src1, src1, slide);
333 src2 = __msa_sld_b(src2, src2, slide);
334 src3 = __msa_sld_b(src3, src3, slide);
335 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
336 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
337 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
338 res = __msa_aver_s_b(res, src0);
339 res = (v16i8) __msa_xori_b((v16u8) res, 128);
341 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
342 dst += (4 * dst_stride);
346 static void avc_luma_hz_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
347 uint8_t *dst, int32_t dst_stride,
348 int32_t height, uint8_t hor_offset)
352 v16i8 src0, src1, src2, src3;
354 v8i16 res0, res1, res2, res3;
355 v16i8 mask0, mask1, mask2;
356 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
357 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
358 v16i8 minus5b = __msa_ldi_b(-5);
359 v16i8 plus20b = __msa_ldi_b(20);
361 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
362 slide = 2 + hor_offset;
364 for (loop_cnt = height >> 2; loop_cnt--;) {
365 LD_SB4(src, src_stride, src0, src1, src2, src3);
366 src += (4 * src_stride);
368 XORI_B4_128_SB(src0, src1, src2, src3);
369 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
370 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
371 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
372 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
373 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
374 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
375 res0, res1, res2, res3);
376 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
377 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
378 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
379 plus20b, res0, res1, res2, res3);
381 src0 = __msa_sld_b(src0, src0, slide);
382 src1 = __msa_sld_b(src1, src1, slide);
383 src2 = __msa_sld_b(src2, src2, slide);
384 src3 = __msa_sld_b(src3, src3, slide);
386 SRARI_H4_SH(res0, res1, res2, res3, 5);
387 SAT_SH4_SH(res0, res1, res2, res3, 7);
388 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
389 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
391 tmp0 = __msa_aver_s_b(tmp0, src0);
392 tmp1 = __msa_aver_s_b(tmp1, src1);
394 XORI_B2_128_SB(tmp0, tmp1);
395 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
397 dst += (4 * dst_stride);
401 static void avc_luma_hz_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
402 uint8_t *dst, int32_t dst_stride,
403 int32_t height, uint8_t hor_offset)
407 v16i8 src0, src1, src2, src3;
408 v16i8 mask0, mask1, mask2, vshf;
409 v8i16 res0, res1, res2, res3;
410 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
411 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
412 v16i8 minus5b = __msa_ldi_b(-5);
413 v16i8 plus20b = __msa_ldi_b(20);
415 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
418 vshf = LD_SB(&luma_mask_arr[16 + 96]);
420 vshf = LD_SB(&luma_mask_arr[96]);
423 for (loop_cnt = height >> 1; loop_cnt--;) {
424 LD_SB2(src, 8, src0, src1);
426 LD_SB2(src, 8, src2, src3);
429 XORI_B4_128_SB(src0, src1, src2, src3);
430 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
431 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
432 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
433 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
434 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
435 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
436 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
437 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
438 minus5b, res0, res1, res2, res3);
439 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
440 plus20b, res0, res1, res2, res3);
441 VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
442 SRARI_H4_SH(res0, res1, res2, res3, 5);
443 SAT_SH4_SH(res0, res1, res2, res3, 7);
444 PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
446 dst0 = __msa_aver_s_b(dst0, src0);
447 dst1 = __msa_aver_s_b(dst1, src2);
449 XORI_B2_128_SB(dst0, dst1);
451 ST_SB2(dst0, dst1, dst, dst_stride);
452 dst += (2 * dst_stride);
456 static void avc_luma_vt_4w_msa(const uint8_t *src, int32_t src_stride,
457 uint8_t *dst, int32_t dst_stride,
461 int16_t filt_const0 = 0xfb01;
462 int16_t filt_const1 = 0x1414;
463 int16_t filt_const2 = 0x1fb;
464 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
465 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
466 v16i8 src87_r, src2110, src4332, src6554, src8776;
467 v16i8 filt0, filt1, filt2;
471 filt0 = (v16i8) __msa_fill_h(filt_const0);
472 filt1 = (v16i8) __msa_fill_h(filt_const1);
473 filt2 = (v16i8) __msa_fill_h(filt_const2);
475 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
476 src += (5 * src_stride);
478 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
479 src10_r, src21_r, src32_r, src43_r);
480 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
481 XORI_B2_128_SB(src2110, src4332);
483 for (loop_cnt = (height >> 2); loop_cnt--;) {
484 LD_SB4(src, src_stride, src5, src6, src7, src8);
485 src += (4 * src_stride);
487 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
488 src54_r, src65_r, src76_r, src87_r);
489 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
490 XORI_B2_128_SB(src6554, src8776);
491 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
492 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
493 SRARI_H2_SH(out10, out32, 5);
494 SAT_SH2_SH(out10, out32, 7);
495 out = PCKEV_XORI128_UB(out10, out32);
496 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
498 dst += (4 * dst_stride);
505 static void avc_luma_vt_8w_msa(const uint8_t *src, int32_t src_stride,
506 uint8_t *dst, int32_t dst_stride,
510 int16_t filt_const0 = 0xfb01;
511 int16_t filt_const1 = 0x1414;
512 int16_t filt_const2 = 0x1fb;
513 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
514 v16i8 src10_r, src32_r, src76_r, src98_r;
515 v16i8 src21_r, src43_r, src87_r, src109_r;
516 v8i16 out0_r, out1_r, out2_r, out3_r;
517 v16i8 filt0, filt1, filt2;
520 filt0 = (v16i8) __msa_fill_h(filt_const0);
521 filt1 = (v16i8) __msa_fill_h(filt_const1);
522 filt2 = (v16i8) __msa_fill_h(filt_const2);
524 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
525 src += (5 * src_stride);
527 XORI_B5_128_SB(src0, src1, src2, src3, src4);
528 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
529 src10_r, src21_r, src32_r, src43_r);
531 for (loop_cnt = (height >> 2); loop_cnt--;) {
532 LD_SB4(src, src_stride, src7, src8, src9, src10);
533 src += (4 * src_stride);
535 XORI_B4_128_SB(src7, src8, src9, src10);
536 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
537 src76_r, src87_r, src98_r, src109_r);
538 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
539 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
540 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
541 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
542 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
543 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
544 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
545 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
546 ST8x4_UB(out0, out1, dst, dst_stride);
547 dst += (4 * dst_stride);
557 static void avc_luma_vt_16w_msa(const uint8_t *src, int32_t src_stride,
558 uint8_t *dst, int32_t dst_stride,
562 int16_t filt_const0 = 0xfb01;
563 int16_t filt_const1 = 0x1414;
564 int16_t filt_const2 = 0x1fb;
565 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
566 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
567 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
568 v16i8 src65_l, src87_l;
569 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
570 v16u8 res0, res1, res2, res3;
571 v16i8 filt0, filt1, filt2;
573 filt0 = (v16i8) __msa_fill_h(filt_const0);
574 filt1 = (v16i8) __msa_fill_h(filt_const1);
575 filt2 = (v16i8) __msa_fill_h(filt_const2);
577 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
578 src += (5 * src_stride);
580 XORI_B5_128_SB(src0, src1, src2, src3, src4);
581 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
582 src10_r, src21_r, src32_r, src43_r);
583 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
584 src10_l, src21_l, src32_l, src43_l);
586 for (loop_cnt = (height >> 2); loop_cnt--;) {
587 LD_SB4(src, src_stride, src5, src6, src7, src8);
588 src += (4 * src_stride);
590 XORI_B4_128_SB(src5, src6, src7, src8);
591 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
592 src54_r, src65_r, src76_r, src87_r);
593 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
594 src54_l, src65_l, src76_l, src87_l);
595 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
596 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
597 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
598 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
599 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
600 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
601 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
602 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
603 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
604 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
605 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
606 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
607 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
608 out3_r, res0, res1, res2, res3);
609 XORI_B4_128_UB(res0, res1, res2, res3);
611 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
612 dst += (4 * dst_stride);
626 static void avc_luma_vt_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
627 uint8_t *dst, int32_t dst_stride,
628 int32_t height, uint8_t ver_offset)
631 int16_t filt_const0 = 0xfb01;
632 int16_t filt_const1 = 0x1414;
633 int16_t filt_const2 = 0x1fb;
634 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
635 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
636 v16i8 src87_r, src2110, src4332, src6554, src8776;
638 v16i8 filt0, filt1, filt2;
641 filt0 = (v16i8) __msa_fill_h(filt_const0);
642 filt1 = (v16i8) __msa_fill_h(filt_const1);
643 filt2 = (v16i8) __msa_fill_h(filt_const2);
645 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
646 src += (5 * src_stride);
648 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
649 src10_r, src21_r, src32_r, src43_r);
650 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
651 XORI_B2_128_SB(src2110, src4332);
653 for (loop_cnt = (height >> 2); loop_cnt--;) {
654 LD_SB4(src, src_stride, src5, src6, src7, src8);
655 src += (4 * src_stride);
657 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
658 src54_r, src65_r, src76_r, src87_r);
659 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
660 XORI_B2_128_SB(src6554, src8776);
661 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
662 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
663 SRARI_H2_SH(out10, out32, 5);
664 SAT_SH2_SH(out10, out32, 7);
666 out = PCKEV_XORI128_UB(out10, out32);
669 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
670 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
672 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
673 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
676 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
677 out = __msa_aver_u_b(out, (v16u8) src32_r);
679 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
680 dst += (4 * dst_stride);
689 static void avc_luma_vt_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
690 uint8_t *dst, int32_t dst_stride,
691 int32_t height, uint8_t ver_offset)
694 int16_t filt_const0 = 0xfb01;
695 int16_t filt_const1 = 0x1414;
696 int16_t filt_const2 = 0x1fb;
697 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
698 v16i8 src10_r, src32_r, src76_r, src98_r;
699 v16i8 src21_r, src43_r, src87_r, src109_r;
700 v8i16 out0_r, out1_r, out2_r, out3_r;
702 v16i8 filt0, filt1, filt2;
704 filt0 = (v16i8) __msa_fill_h(filt_const0);
705 filt1 = (v16i8) __msa_fill_h(filt_const1);
706 filt2 = (v16i8) __msa_fill_h(filt_const2);
708 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
709 src += (5 * src_stride);
711 XORI_B5_128_SB(src0, src1, src2, src3, src4);
712 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
713 src10_r, src21_r, src32_r, src43_r);
715 for (loop_cnt = (height >> 2); loop_cnt--;) {
716 LD_SB4(src, src_stride, src7, src8, src9, src10);
717 src += (4 * src_stride);
719 XORI_B4_128_SB(src7, src8, src9, src10);
720 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
721 src76_r, src87_r, src98_r, src109_r);
722 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
723 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
724 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
725 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
726 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
727 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
728 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
731 PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
733 PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
736 res0 = __msa_aver_s_b(res0, (v16i8) src10_r);
737 res1 = __msa_aver_s_b(res1, (v16i8) src32_r);
739 XORI_B2_128_SB(res0, res1);
740 ST8x4_UB(res0, res1, dst, dst_stride);
742 dst += (4 * dst_stride);
753 static void avc_luma_vt_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
754 uint8_t *dst, int32_t dst_stride,
755 int32_t height, uint8_t ver_offset)
758 int16_t filt_const0 = 0xfb01;
759 int16_t filt_const1 = 0x1414;
760 int16_t filt_const2 = 0x1fb;
761 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
762 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
763 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
764 v16i8 src65_l, src87_l;
765 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
766 v16u8 res0, res1, res2, res3;
767 v16i8 filt0, filt1, filt2;
769 filt0 = (v16i8) __msa_fill_h(filt_const0);
770 filt1 = (v16i8) __msa_fill_h(filt_const1);
771 filt2 = (v16i8) __msa_fill_h(filt_const2);
773 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
774 src += (5 * src_stride);
776 XORI_B5_128_SB(src0, src1, src2, src3, src4);
777 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
778 src10_r, src21_r, src32_r, src43_r);
779 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
780 src10_l, src21_l, src32_l, src43_l);
782 for (loop_cnt = (height >> 2); loop_cnt--;) {
783 LD_SB4(src, src_stride, src5, src6, src7, src8);
784 src += (4 * src_stride);
786 XORI_B4_128_SB(src5, src6, src7, src8);
787 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
788 src54_r, src65_r, src76_r, src87_r);
789 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
790 src54_l, src65_l, src76_l, src87_l);
791 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
792 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
793 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
794 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
795 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
796 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
797 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
798 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
799 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
800 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
801 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
802 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
803 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
804 out3_r, res0, res1, res2, res3);
807 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
808 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
809 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
810 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
812 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
813 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
814 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
815 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
818 XORI_B4_128_UB(res0, res1, res2, res3);
819 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
821 dst += (4 * dst_stride);
837 static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride,
838 uint8_t *dst, int32_t dst_stride,
842 v16i8 src0, src1, src2, src3, src4;
843 v16i8 mask0, mask1, mask2;
844 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
845 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
846 v8i16 dst0, dst1, dst2, dst3;
848 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
849 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
850 src += (5 * src_stride);
852 XORI_B5_128_SB(src0, src1, src2, src3, src4);
854 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
855 mask0, mask1, mask2);
856 hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
857 mask0, mask1, mask2);
859 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
861 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
863 for (loop_cnt = (height >> 2); loop_cnt--;) {
864 LD_SB4(src, src_stride, src0, src1, src2, src3);
865 src += (4 * src_stride);
867 XORI_B4_128_SB(src0, src1, src2, src3);
869 hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
872 hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
876 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
878 dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
879 hz_out3, hz_out4, hz_out5);
880 dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
881 hz_out4, hz_out5, hz_out6);
882 dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
883 hz_out5, hz_out6, hz_out7);
884 dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
885 hz_out6, hz_out7, hz_out8);
887 PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
888 XORI_B2_128_SB(src0, src1);
890 ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
892 dst += (4 * dst_stride);
902 static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride,
903 uint8_t *dst, int32_t dst_stride,
907 v16i8 src0, src1, src2, src3, src4;
908 v16i8 mask0, mask1, mask2;
909 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
910 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
911 v8i16 dst0, dst1, dst2, dst3;
914 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
916 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
917 XORI_B5_128_SB(src0, src1, src2, src3, src4);
918 src += (5 * src_stride);
920 hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
921 hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
922 hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
923 hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
924 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
926 for (loop_cnt = (height >> 2); loop_cnt--;) {
927 LD_SB4(src, src_stride, src0, src1, src2, src3);
928 XORI_B4_128_SB(src0, src1, src2, src3);
929 src += (4 * src_stride);
931 hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
932 hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
933 hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
934 hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
935 dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
936 hz_out3, hz_out4, hz_out5);
937 dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
938 hz_out4, hz_out5, hz_out6);
939 dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
940 hz_out5, hz_out6, hz_out7);
941 dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
942 hz_out6, hz_out7, hz_out8);
943 out0 = PCKEV_XORI128_UB(dst0, dst1);
944 out1 = PCKEV_XORI128_UB(dst2, dst3);
945 ST8x4_UB(out0, out1, dst, dst_stride);
947 dst += (4 * dst_stride);
/* H.264 center (hv) qpel luma filter, 16-column case: two side-by-side
 * 8-column passes. */
static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 int32_t height)
{
    uint32_t multiple8_cnt;

    for (multiple8_cnt = 2; multiple8_cnt--;) {
        avc_luma_mid_8w_msa(src, src_stride, dst, dst_stride, height);

        src += 8;
        dst += 8;
    }
}
/* 4-column horizontal quarter-pel position inside the "mid" (2-D filtered)
 * case: the 6-tap filter (taps 1,-5,20,20,-5,1, seen here as the -5/+20
 * constants with implicit +1 via hadd) is applied vertically on bytes, then
 * horizontally on the 16-bit intermediates, and the 10-bit-rounded 2-D
 * result is averaged with a 5-bit-rounded 1-D value to form the quarter-pel
 * sample.  horiz_offset selects which half-pel neighbour is averaged in.
 * NOTE(review): declarations of row/out/dst0/dst1/zeros, macro-argument
 * continuation lines, the if (horiz_offset)/else wrapping the lane
 * selection below, and the end-of-loop source sliding window are elided
 * from this excerpt -- confirm against the complete file. */
970 static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
971 uint8_t *dst, int32_t dst_stride,
972 int32_t height, uint8_t horiz_offset)
975 v16i8 src0, src1, src2, src3, src4, src5, src6;
976 v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
977 v4i32 hz_res0, hz_res1;
979 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
/* halfword shuffle masks pairing the 6-tap operands for horizontal filtering */
980 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
981 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
982 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
983 v8i16 minus5h = __msa_ldi_h(-5);
984 v8i16 plus20h = __msa_ldi_h(20);
/* prime the 5-row vertical filter history */
988 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
989 src += (5 * src_stride);
/* bias bytes by -128 so signed dot products can be used */
990 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* two output rows per iteration */
992 for (row = (height >> 1); row--;) {
993 LD_SB2(src, src_stride, src5, src6);
994 src += (2 * src_stride);
996 XORI_B2_128_SB(src5, src6);
/* vertical 6-tap over the sliding 6-row window -> 16-bit vt_res */
997 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
999 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
/* regroup intermediates into horizontal-filter operand pairs */
1001 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
1002 mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
1003 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
1004 mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* horizontal 6-tap: +1 taps via hadd, then -5 and +20 dot products */
1005 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
1006 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
1007 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
1008 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* 2-D result: round by 1 << 10 and saturate */
1010 SRARI_W2_SW(hz_res0, hz_res1, 10);
1011 SAT_SW2_SW(hz_res0, hz_res1, 7);
/* 1-D (vertical-only) half-pel value: round by 1 << 5 and saturate */
1013 dst0 = __msa_srari_h(shf_vec2, 5);
1014 dst1 = __msa_srari_h(shf_vec5, 5);
1016 SAT_SH2_SH(dst0, dst1, 7);
/* NOTE(review): the next two statements and the ILVEV below are the two
 * branches of an elided if (horiz_offset)/else lane selection */
1019 dst0 = __msa_ilvod_h(zeros, dst0);
1020 dst1 = __msa_ilvod_h(zeros, dst1);
1022 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
/* quarter-pel = average of the 2-D and 1-D filtered values */
1025 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
1026 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
1027 dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
/* pack to bytes, undo the -128 bias, store two 4-byte rows */
1029 out = PCKEV_XORI128_UB(dst0, dst0);
1030 ST4x2_UB(out, dst, dst_stride);
1032 dst += (2 * dst_stride);
/* 8-column horizontal quarter-pel "mid" case: two consecutive 4-column
 * passes through avc_luma_midh_qrt_4w_msa.
 * NOTE(review): the trailing horiz_offset argument line, the +4-column
 * src/dst advance between passes, and the closing braces are elided from
 * this excerpt -- confirm against the complete file. */
1042 static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
1043 uint8_t *dst, int32_t dst_stride,
1044 int32_t height, uint8_t horiz_offset)
1046 uint32_t multiple8_cnt;
1048 for (multiple8_cnt = 2; multiple8_cnt--;) {
1049 avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
/* 16-column horizontal quarter-pel "mid" case: four consecutive 4-column
 * passes through avc_luma_midh_qrt_4w_msa.
 * NOTE(review): the trailing horiz_offset argument line, the +4-column
 * src/dst advance between passes, and the closing braces are elided from
 * this excerpt -- confirm against the complete file. */
1057 static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
1058 uint8_t *dst, int32_t dst_stride,
1059 int32_t height, uint8_t horiz_offset)
1061 uint32_t multiple8_cnt;
1063 for (multiple8_cnt = 4; multiple8_cnt--;) {
1064 avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
/* 4-column vertical quarter-pel position inside the "mid" case: rows are
 * horizontally 6-tap filtered into 16-bit hz_out values, the 2-D result is
 * formed by the vertical 6-tap on those intermediates, and it is averaged
 * with the 5-bit-rounded horizontal-only value selected by ver_offset.
 * NOTE(review): the loop-count declaration, macro-argument continuation
 * lines, the if (ver_offset)/else wrapping the two srari groups below, and
 * the end-of-loop hz_out sliding window are elided from this excerpt --
 * confirm against the complete file. */
1072 static void avc_luma_midv_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
1073 uint8_t *dst, int32_t dst_stride,
1074 int32_t height, uint8_t ver_offset)
1077 v16i8 src0, src1, src2, src3, src4;
1078 v16i8 mask0, mask1, mask2;
1079 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1080 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1081 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* 4-wide variant uses the second set of shuffle masks */
1083 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
/* prime the 5-row history for the vertical filter */
1084 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1085 src += (5 * src_stride);
1087 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* horizontal filter two rows per call (results packed low/high) */
1089 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1090 mask0, mask1, mask2);
1091 hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1092 mask0, mask1, mask2);
/* split the packed pairs into per-row intermediates */
1094 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
1096 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
/* four output rows per iteration */
1098 for (loop_cnt = (height >> 2); loop_cnt--;) {
1099 LD_SB4(src, src_stride, src0, src1, src2, src3);
1100 src += (4 * src_stride);
1101 XORI_B4_128_SB(src0, src1, src2, src3);
1103 hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1106 hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1110 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
/* vertical 6-tap on the 16-bit intermediates -> 2-D half-pel rows */
1112 dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1113 hz_out3, hz_out4, hz_out5);
1114 dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1115 hz_out4, hz_out5, hz_out6);
1116 dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1117 hz_out5, hz_out6, hz_out7);
1118 dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1119 hz_out6, hz_out7, hz_out8);
/* NOTE(review): the next two groups are the if (ver_offset)/else branches
 * (elided) picking the lower or upper half-pel neighbour rows */
1122 dst1 = __msa_srari_h(hz_out3, 5);
1123 dst3 = __msa_srari_h(hz_out4, 5);
1124 dst5 = __msa_srari_h(hz_out5, 5);
1125 dst7 = __msa_srari_h(hz_out6, 5);
1127 dst1 = __msa_srari_h(hz_out2, 5);
1128 dst3 = __msa_srari_h(hz_out3, 5);
1129 dst5 = __msa_srari_h(hz_out4, 5);
1130 dst7 = __msa_srari_h(hz_out5, 5);
1133 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
/* quarter-pel rows = average of 2-D and 1-D values */
1135 dst0 = __msa_aver_s_h(dst0, dst1);
1136 dst1 = __msa_aver_s_h(dst2, dst3);
1137 dst2 = __msa_aver_s_h(dst4, dst5);
1138 dst3 = __msa_aver_s_h(dst6, dst7);
/* pack to bytes, undo the -128 bias, store four 4-byte rows */
1140 PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
1141 XORI_B2_128_SB(src0, src1);
1143 ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
1145 dst += (4 * dst_stride);
/* 8-column vertical quarter-pel position inside the "mid" case: same
 * scheme as the 4-column variant but with one full-width horizontal filter
 * call per row and per-row 8-byte stores.
 * NOTE(review): the loop-count and "out" declarations, the
 * if (ver_offset)/else wrapping the two srari groups below, the four
 * ST8x1_UB/dst-advance lines after the packs at the end, and the
 * end-of-loop hz_out sliding window are elided from this excerpt --
 * confirm against the complete file. */
1154 static void avc_luma_midv_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
1155 uint8_t *dst, int32_t dst_stride,
1156 int32_t height, uint8_t ver_offset)
1159 v16i8 src0, src1, src2, src3, src4;
1160 v16i8 mask0, mask1, mask2;
1161 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1162 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1163 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* 8-wide variant uses the first set of shuffle masks */
1166 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* prime the 5-row history for the vertical filter */
1168 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1169 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1170 src += (5 * src_stride);
1172 hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
1173 hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
1174 hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
1175 hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
1176 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
/* four output rows per iteration */
1178 for (loop_cnt = (height >> 2); loop_cnt--;) {
1179 LD_SB4(src, src_stride, src0, src1, src2, src3);
1180 XORI_B4_128_SB(src0, src1, src2, src3);
1181 src += (4 * src_stride);
1183 hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
1184 hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
1185 hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
1186 hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
/* vertical 6-tap on the 16-bit intermediates -> 2-D half-pel rows */
1188 dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1189 hz_out3, hz_out4, hz_out5);
1190 dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1191 hz_out4, hz_out5, hz_out6);
1192 dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1193 hz_out5, hz_out6, hz_out7);
1194 dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1195 hz_out6, hz_out7, hz_out8);
/* NOTE(review): the next two groups are the if (ver_offset)/else branches
 * (elided) picking the lower or upper half-pel neighbour rows */
1198 dst1 = __msa_srari_h(hz_out3, 5);
1199 dst3 = __msa_srari_h(hz_out4, 5);
1200 dst5 = __msa_srari_h(hz_out5, 5);
1201 dst7 = __msa_srari_h(hz_out6, 5);
1203 dst1 = __msa_srari_h(hz_out2, 5);
1204 dst3 = __msa_srari_h(hz_out3, 5);
1205 dst5 = __msa_srari_h(hz_out4, 5);
1206 dst7 = __msa_srari_h(hz_out5, 5);
1209 SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
/* quarter-pel rows = average of 2-D and 1-D values */
1211 dst0 = __msa_aver_s_h(dst0, dst1);
1212 dst1 = __msa_aver_s_h(dst2, dst3);
1213 dst2 = __msa_aver_s_h(dst4, dst5);
1214 dst3 = __msa_aver_s_h(dst6, dst7);
/* pack/unbias each row; the per-row stores are elided from this excerpt */
1216 out = PCKEV_XORI128_UB(dst0, dst0);
1219 out = PCKEV_XORI128_UB(dst1, dst1);
1222 out = PCKEV_XORI128_UB(dst2, dst2);
1225 out = PCKEV_XORI128_UB(dst3, dst3);
/* 16-column vertical quarter-pel "mid" case: two consecutive 8-column
 * passes through avc_luma_midv_qrt_8w_msa.
 * NOTE(review): the trailing vert_offset argument line, the +8-column
 * src/dst advance between passes, and the closing braces are elided from
 * this excerpt -- confirm against the complete file. */
1237 static void avc_luma_midv_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
1238 uint8_t *dst, int32_t dst_stride,
1239 int32_t height, uint8_t vert_offset)
1241 uint32_t multiple8_cnt;
1243 for (multiple8_cnt = 2; multiple8_cnt--;) {
1244 avc_luma_midv_qrt_8w_msa(src, src_stride, dst, dst_stride, height,
/* 4-column diagonal quarter-pel case: filters one horizontal half-pel
 * stream (from src_x) and one vertical half-pel stream (from src_y)
 * independently with the 6-tap filter, then averages the two rounded
 * results ((a + b + 1) >> 1 via srari by 1).
 * NOTE(review): the loop-count/out0/out1/out declarations, macro-argument
 * continuation lines, and the end-of-loop src_vt0..3 sliding window are
 * elided from this excerpt -- confirm against the complete file. */
1252 static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
1253 int32_t src_stride, uint8_t *dst,
1254 int32_t dst_stride, int32_t height)
1257 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1258 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1259 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1260 v16i8 mask0, mask1, mask2;
1261 v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
/* 4-wide variant uses the second set of shuffle masks */
1265 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
/* prime the 5-row vertical history */
1267 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1268 src_y += (5 * src_stride);
/* pair consecutive 4-byte rows into single vectors (word lanes 0/1) */
1270 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
1271 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
1272 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
1273 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
1275 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
/* four output rows per iteration */
1277 for (loop_cnt = (height >> 2); loop_cnt--;) {
1278 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1279 src_x += (4 * src_stride);
1281 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
/* horizontal half-pel path, two rows per call */
1283 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
1286 hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
1290 SRARI_H2_SH(hz_out0, hz_out1, 5);
1291 SAT_SH2_SH(hz_out0, hz_out1, 7);
/* vertical half-pel path: extend the row history */
1293 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1294 src_y += (4 * src_stride);
1296 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
1297 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
1298 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
1299 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
1301 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
1304 vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
1307 vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
1311 SRARI_H2_SH(vert_out0, vert_out1, 5);
1312 SAT_SH2_SH(vert_out0, vert_out1, 7);
/* diagonal quarter-pel = rounded mean of the two half-pel paths */
1314 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1315 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1317 SAT_SH2_SH(out0, out1, 7);
1318 out = PCKEV_XORI128_UB(out0, out1);
1319 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1320 dst += (4 * dst_stride);
/* 8-column diagonal quarter-pel case: as the 4-column variant, but rows
 * are paired into doubleword lanes and four full 8-wide rows are produced
 * per iteration.
 * NOTE(review): the loop-count/tmp0/tmp1 declarations, macro-argument
 * continuation lines, and the end-of-loop src_vt0..3 sliding window are
 * elided from this excerpt -- confirm against the complete file. */
1330 static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y,
1331 int32_t src_stride, uint8_t *dst,
1332 int32_t dst_stride, int32_t height)
1335 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1336 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1337 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1338 v16i8 mask0, mask1, mask2;
1339 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1340 v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
1341 v8i16 out0, out1, out2, out3;
/* 8-wide variant uses the first set of shuffle masks */
1344 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1345 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1346 src_y += (5 * src_stride);
/* pair consecutive 8-byte rows into single vectors (dword lanes 0/1) */
1348 src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
1349 src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
1350 src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
1351 src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
1353 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
/* four output rows per iteration */
1355 for (loop_cnt = (height >> 2); loop_cnt--;) {
1356 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1357 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
1358 src_x += (4 * src_stride);
/* horizontal half-pel path */
1360 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
1361 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
1362 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
1363 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
1365 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
1366 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
/* vertical half-pel path: extend the row history */
1368 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1369 src_y += (4 * src_stride);
1371 src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
1372 src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
1373 src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
1374 src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
1376 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
1379 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
1380 src_vt4, src_vt5, vert_out0, vert_out1);
1381 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
1382 src_vt6, src_vt7, vert_out2, vert_out3);
1384 SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
1385 SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
/* diagonal quarter-pel = rounded mean of the two half-pel paths */
1387 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1388 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1389 out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
1390 out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
1392 SAT_SH4_SH(out0, out1, out2, out3, 7);
1393 tmp0 = PCKEV_XORI128_UB(out0, out1);
1394 tmp1 = PCKEV_XORI128_UB(out2, out3);
1395 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1397 dst += (4 * dst_stride);
/* 16-column diagonal quarter-pel case: two consecutive 8-column passes
 * through avc_luma_hv_qrt_8w_msa.
 * NOTE(review): the trailing height argument line, the +8-column advance
 * of src_x/src_y/dst between passes, and the closing braces are elided
 * from this excerpt -- confirm against the complete file. */
1407 static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y,
1408 int32_t src_stride, uint8_t *dst,
1409 int32_t dst_stride, int32_t height)
1411 uint32_t multiple8_cnt;
1413 for (multiple8_cnt = 2; multiple8_cnt--;) {
1414 avc_luma_hv_qrt_8w_msa(src_x, src_y, src_stride, dst, dst_stride,
/* 4x4 horizontal half-pel filter with averaging against the existing
 * destination (the "avg" MC variant): 6-tap horizontal filter on four
 * rows, then aver_u_b with the current dst pixels before storing.
 * NOTE(review): the src_stride parameter line and the res0/res1
 * declarations are elided from this excerpt -- confirm against the
 * complete file. */
1423 static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src,
1425 uint8_t *dst, int32_t dst_stride)
1427 v16i8 src0, src1, src2, src3;
1428 v16u8 dst0, dst1, dst2, dst3, res;
1430 v16i8 mask0, mask1, mask2;
1431 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1432 v16i8 minus5b = __msa_ldi_b(-5);
1433 v16i8 plus20b = __msa_ldi_b(20);
1435 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1436 LD_SB4(src, src_stride, src0, src1, src2, src3);
1438 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* -128 bias so signed byte dot products can be used */
1439 XORI_B4_128_SB(src0, src1, src2, src3);
/* 6-tap filter: +1 taps via hadd, -5 and +20 taps via dpadd */
1440 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1441 HADD_SB2_SH(vec0, vec1, res0, res1);
1442 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1443 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1444 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1445 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
/* round by 1 << 5, saturate, pack back to unsigned bytes */
1446 SRARI_H2_SH(res0, res1, 5);
1447 SAT_SH2_SH(res0, res1, 7);
1448 res = PCKEV_XORI128_UB(res0, res1);
/* gather the four 4-byte dst rows into one vector for averaging */
1449 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1451 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1452 res = __msa_aver_u_b(res, dst0);
1454 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 horizontal half-pel filter with averaging against the existing
 * destination; two loop iterations of four rows each.
 * NOTE(review): the src_stride parameter line, the loop_cnt declaration
 * and the "dst, dst_stride);" continuation of CONVERT_UB_AVG_ST8x4_UB are
 * elided from this excerpt -- confirm against the complete file. */
1457 static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src,
1459 uint8_t *dst, int32_t dst_stride)
1462 v16i8 src0, src1, src2, src3;
1463 v16u8 dst0, dst1, dst2, dst3;
1464 v8i16 res0, res1, res2, res3;
1465 v16i8 mask0, mask1, mask2;
1466 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1467 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1468 v16i8 minus5b = __msa_ldi_b(-5);
1469 v16i8 plus20b = __msa_ldi_b(20);
1471 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1473 for (loop_cnt = 2; loop_cnt--;) {
1474 LD_SB4(src, src_stride, src0, src1, src2, src3);
1475 src += (4 * src_stride);
1477 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* 6-tap filter per row: +1 taps via hadd, -5 and +20 via dpadd */
1479 XORI_B4_128_SB(src0, src1, src2, src3);
1480 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1481 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1482 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1483 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1484 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1485 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1486 res0, res1, res2, res3);
1487 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1488 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1489 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1490 plus20b, res0, res1, res2, res3);
/* round by 1 << 5, saturate, then average with dst and store 8x4 */
1491 SRARI_H4_SH(res0, res1, res2, res3, 5);
1492 SAT_SH4_SH(res0, res1, res2, res3, 7);
1493 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1494 CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
1497 dst += (4 * dst_stride);
/* 16x16 horizontal half-pel filter with averaging against the existing
 * destination; each of the four loop iterations filters four 16-wide rows
 * (loaded as two 8-byte halves per row) and averages with four dst rows.
 * NOTE(review): the src_stride parameter line, the loop_cnt declaration,
 * and the "src += src_stride;" advances between the paired LD_SB2 loads
 * are elided from this excerpt -- confirm against the complete file. */
1501 static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src,
1503 uint8_t *dst, int32_t dst_stride)
1506 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1507 v16u8 dst0, dst1, dst2, dst3;
1508 v16i8 mask0, mask1, mask2;
1509 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1510 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1511 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1512 v16i8 minus5b = __msa_ldi_b(-5);
1513 v16i8 plus20b = __msa_ldi_b(20);
1515 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1517 for (loop_cnt = 4; loop_cnt--;) {
/* first two rows, each as two 16-byte loads 8 bytes apart */
1518 LD_SB2(src, 8, src0, src1);
1520 LD_SB2(src, 8, src2, src3);
1523 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* 6-tap filter for rows 0-1 */
1525 XORI_B4_128_SB(src0, src1, src2, src3);
1526 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1527 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1528 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1529 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1530 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1531 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1532 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1533 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1534 minus5b, res0, res1, res2, res3);
1535 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1536 plus20b, res0, res1, res2, res3);
/* rows 2-3 */
1537 LD_SB2(src, 8, src4, src5);
1539 LD_SB2(src, 8, src6, src7);
1541 XORI_B4_128_SB(src4, src5, src6, src7);
1542 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1543 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1544 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1545 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1546 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1547 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1548 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1549 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1550 minus5b, res4, res5, res6, res7);
1551 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1552 plus20b, res4, res5, res6, res7);
/* round, saturate, pack, un-bias, average with dst, store 16x4 */
1553 SRARI_H4_SH(res0, res1, res2, res3, 5);
1554 SRARI_H4_SH(res4, res5, res6, res7, 5);
1555 SAT_SH4_SH(res0, res1, res2, res3, 7);
1556 SAT_SH4_SH(res4, res5, res6, res7, 7);
1557 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
1558 vec0, vec1, vec2, vec3);
1559 XORI_B4_128_SB(vec0, vec1, vec2, vec3);
1560 AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
1561 dst0, dst1, dst2, dst3);
1562 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1563 dst += (4 * dst_stride);
/* 4x4 horizontal quarter-pel filter with averaging against the existing
 * destination: 6-tap half-pel result is first averaged with the integer
 * sample slid to the requested quarter position, then averaged with dst.
 * NOTE(review): the remaining parameter lines, the out0/out1/res0/res1
 * declarations, and the setup of "slide" from the horizontal offset
 * argument are elided from this excerpt -- confirm against the complete
 * file. */
1567 static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
1574 v16i8 src0, src1, src2, src3;
1575 v16u8 dst0, dst1, dst2, dst3;
1576 v16i8 mask0, mask1, mask2;
1577 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1579 v16i8 minus5b = __msa_ldi_b(-5);
1580 v16i8 plus20b = __msa_ldi_b(20);
1583 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1591 LD_SB4(src, src_stride, src0, src1, src2, src3);
1592 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* 6-tap half-pel filter on the four rows */
1594 XORI_B4_128_SB(src0, src1, src2, src3);
1595 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1596 HADD_SB2_SH(vec0, vec1, out0, out1);
1597 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1598 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
1599 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1600 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
1601 SRARI_H2_SH(out0, out1, 5);
1602 SAT_SH2_SH(out0, out1, 7);
1604 PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);
/* slide each source row to the quarter-pel integer neighbour and average */
1606 src0 = __msa_sld_b(src0, src0, slide);
1607 src1 = __msa_sld_b(src1, src1, slide);
1608 src2 = __msa_sld_b(src2, src2, slide);
1609 src3 = __msa_sld_b(src3, src3, slide);
1610 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1611 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
/* signed average is valid here because both operands carry the -128 bias */
1612 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
1613 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);
/* remove the bias, then average with the existing dst pixels */
1615 XORI_B2_128_UB(res0, res1);
1617 dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
1618 dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
1620 AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
1622 ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
/* 8x8 horizontal quarter-pel filter with averaging against the existing
 * destination; two loop iterations of four rows.
 * NOTE(review): the remaining parameter lines, the loop_cnt declaration,
 * the setup of "slide" from the horizontal offset argument, and the
 * "dst, dst_stride);" continuation of AVER_ST8x4_UB are elided from this
 * excerpt -- confirm against the complete file. */
1625 static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
1633 v16i8 src0, src1, src2, src3;
1634 v16i8 mask0, mask1, mask2;
1635 v16u8 dst0, dst1, dst2, dst3;
1636 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1637 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1638 v8i16 out0, out1, out2, out3;
1639 v16i8 minus5b = __msa_ldi_b(-5);
1640 v16i8 plus20b = __msa_ldi_b(20);
1641 v16i8 res0, res1, res2, res3;
1643 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1651 for (loop_cnt = 2; loop_cnt--;) {
1652 LD_SB4(src, src_stride, src0, src1, src2, src3);
1653 src += (4 * src_stride);
1655 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* 6-tap half-pel filter on the four rows */
1657 XORI_B4_128_SB(src0, src1, src2, src3);
1658 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1659 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1660 HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
1661 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1662 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1663 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1664 out0, out1, out2, out3);
1665 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1666 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1667 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1668 plus20b, out0, out1, out2, out3);
/* slide each row to the quarter-pel integer neighbour */
1670 src0 = __msa_sld_b(src0, src0, slide);
1671 src1 = __msa_sld_b(src1, src1, slide);
1672 src2 = __msa_sld_b(src2, src2, slide);
1673 src3 = __msa_sld_b(src3, src3, slide);
1675 SRARI_H4_SH(out0, out1, out2, out3, 5);
1676 SAT_SH4_SH(out0, out1, out2, out3, 7);
1678 PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
1679 res0, res1, res2, res3);
/* signed average: both operands still carry the -128 bias */
1681 res0 = __msa_aver_s_b(res0, src0);
1682 res1 = __msa_aver_s_b(res1, src1);
1683 res2 = __msa_aver_s_b(res2, src2);
1684 res3 = __msa_aver_s_b(res3, src3);
/* remove bias, then average with dst and store 8x4 */
1686 XORI_B4_128_SB(res0, res1, res2, res3);
1687 AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1690 dst += (4 * dst_stride);
/* 16x16 horizontal quarter-pel filter with averaging against the existing
 * destination; eight loop iterations of two 16-wide rows.
 * NOTE(review): the remaining parameter lines, the loop_cnt/dst0/dst1/
 * out0/out1 declarations, the if/else wrapping the two vshf loads below,
 * and the "src += src_stride;" advances between the LD_SB2 pairs are
 * elided from this excerpt -- confirm against the complete file. */
1694 static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
1702 v16i8 src0, src1, src2, src3;
1703 v16i8 mask0, mask1, mask2, vshf;
1705 v8i16 res0, res1, res2, res3;
1706 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1707 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1708 v16i8 minus5b = __msa_ldi_b(-5);
1709 v16i8 plus20b = __msa_ldi_b(20);
1711 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* NOTE(review): the next two loads are the branches of an elided
 * if/else selecting the shuffle for the quarter-pel offset */
1714 vshf = LD_SB(&luma_mask_arr[16 + 96]);
1716 vshf = LD_SB(&luma_mask_arr[96]);
1719 for (loop_cnt = 8; loop_cnt--;) {
1720 LD_SB2(src, 8, src0, src1);
1722 LD_SB2(src, 8, src2, src3);
1725 LD_UB2(dst, dst_stride, dst0, dst1);
/* 6-tap half-pel filter on the two 16-wide rows */
1727 XORI_B4_128_SB(src0, src1, src2, src3);
1728 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1729 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1730 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1731 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1732 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1733 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1734 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1735 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1736 minus5b, res0, res1, res2, res3);
1737 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1738 plus20b, res0, res1, res2, res3);
/* gather the quarter-position integer samples via the selected shuffle */
1739 VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
1740 SRARI_H4_SH(res0, res1, res2, res3, 5);
1741 SAT_SH4_SH(res0, res1, res2, res3, 7);
1742 PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
/* signed average: both operands still carry the -128 bias */
1744 out0 = __msa_aver_s_b(out0, src0);
1745 out1 = __msa_aver_s_b(out1, src2);
/* remove bias, average with dst, store two 16-byte rows */
1747 XORI_B2_128_SB(out0, out1);
1748 AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
1749 ST_UB2(dst0, dst1, dst, dst_stride);
1750 dst += (2 * dst_stride);
/* 4x4 vertical half-pel filter with averaging against the existing
 * destination.  The 6-tap coefficients are packed two-per-halfword:
 * 0xfb01 = (-5, 1), 0x1414 = (20, 20), 0x1fb = (1, -5), applied with
 * three paired dot products over interleaved row differences.
 * NOTE(review): the src_stride parameter line and the out10/out32/res
 * declarations are elided from this excerpt -- confirm against the
 * complete file. */
1754 static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
1756 uint8_t *dst, int32_t dst_stride)
1758 int16_t filt_const0 = 0xfb01;
1759 int16_t filt_const1 = 0x1414;
1760 int16_t filt_const2 = 0x1fb;
1761 v16u8 dst0, dst1, dst2, dst3;
1762 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1763 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1764 v16i8 src87_r, src2110, src4332, src6554, src8776;
1766 v16i8 filt0, filt1, filt2;
/* broadcast the packed coefficient pairs to all halfword lanes */
1769 filt0 = (v16i8) __msa_fill_h(filt_const0);
1770 filt1 = (v16i8) __msa_fill_h(filt_const1);
1771 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* prime the 5-row history, then build interleaved row pairs */
1773 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1774 src += (5 * src_stride);
1776 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1777 src10_r, src21_r, src32_r, src43_r);
1778 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1779 XORI_B2_128_SB(src2110, src4332);
1780 LD_SB4(src, src_stride, src5, src6, src7, src8);
1781 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1782 src54_r, src65_r, src76_r, src87_r);
1783 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1784 XORI_B2_128_SB(src6554, src8776);
/* vertical 6-tap for rows 0-1 and 2-3 */
1785 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1786 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1787 SRARI_H2_SH(out10, out32, 5);
1788 SAT_SH2_SH(out10, out32, 7);
1789 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1790 res = PCKEV_XORI128_UB(out10, out32);
/* gather the four dst rows and average */
1792 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1794 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1795 dst0 = __msa_aver_u_b(res, dst0);
1797 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 vertical half-pel filter with averaging against the existing
 * destination; two loop iterations of four rows using packed-pair 6-tap
 * coefficients (see the 4x4 variant for the encoding).
 * NOTE(review): the src_stride parameter line, the loop_cnt declaration,
 * the "dst, dst_stride);" continuation of CONVERT_UB_AVG_ST8x4_UB, and
 * the end-of-loop interleave sliding window are elided from this excerpt
 * -- confirm against the complete file. */
1800 static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
1802 uint8_t *dst, int32_t dst_stride)
1805 int16_t filt_const0 = 0xfb01;
1806 int16_t filt_const1 = 0x1414;
1807 int16_t filt_const2 = 0x1fb;
1808 v16u8 dst0, dst1, dst2, dst3;
1809 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
1810 v16i8 src10_r, src32_r, src76_r, src98_r;
1811 v16i8 src21_r, src43_r, src87_r, src109_r;
1812 v8i16 out0, out1, out2, out3;
1813 v16i8 filt0, filt1, filt2;
1815 filt0 = (v16i8) __msa_fill_h(filt_const0);
1816 filt1 = (v16i8) __msa_fill_h(filt_const1);
1817 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* prime the 5-row history */
1819 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1820 src += (5 * src_stride);
1822 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1823 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1824 src10_r, src21_r, src32_r, src43_r);
1826 for (loop_cnt = 2; loop_cnt--;) {
1827 LD_SB4(src, src_stride, src7, src8, src9, src10);
1828 src += (4 * src_stride);
1830 XORI_B4_128_SB(src7, src8, src9, src10);
1831 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
1832 src76_r, src87_r, src98_r, src109_r);
/* vertical 6-tap per output row over the interleaved pairs */
1833 out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
1834 out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
1835 out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
1836 out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
1837 SRARI_H4_SH(out0, out1, out2, out3, 5);
1838 SAT_SH4_SH(out0, out1, out2, out3, 7);
/* average with the four dst rows and store 8x4 */
1839 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1840 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1841 CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
1843 dst += (4 * dst_stride);
/* 16x16 vertical half-pel filter with averaging against the existing
 * destination; four loop iterations of four full-width rows, processing
 * the low (_r) and high (_l) byte halves of each row separately.
 * NOTE(review): the src_stride parameter line, the loop_cnt declaration,
 * and the end-of-loop interleave sliding window are elided from this
 * excerpt -- confirm against the complete file. */
1853 static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
1855 uint8_t *dst, int32_t dst_stride)
1858 int16_t filt_const0 = 0xfb01;
1859 int16_t filt_const1 = 0x1414;
1860 int16_t filt_const2 = 0x1fb;
1861 v16u8 dst0, dst1, dst2, dst3;
1862 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1863 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1864 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1865 v16i8 src65_l, src87_l;
1866 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1867 v16i8 filt0, filt1, filt2;
1868 v16u8 res0, res1, res2, res3;
1870 filt0 = (v16i8) __msa_fill_h(filt_const0);
1871 filt1 = (v16i8) __msa_fill_h(filt_const1);
1872 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* prime the 5-row history; keep both byte halves interleaved */
1874 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1875 src += (5 * src_stride);
1877 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1878 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1879 src10_r, src21_r, src32_r, src43_r);
1880 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1881 src10_l, src21_l, src32_l, src43_l);
1883 for (loop_cnt = 4; loop_cnt--;) {
1884 LD_SB4(src, src_stride, src5, src6, src7, src8);
1885 src += (4 * src_stride);
1887 XORI_B4_128_SB(src5, src6, src7, src8);
1888 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1889 src54_r, src65_r, src76_r, src87_r);
1890 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1891 src54_l, src65_l, src76_l, src87_l);
/* vertical 6-tap on both halves of each of the four rows */
1892 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1893 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1894 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1895 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1896 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1897 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1898 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1899 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* round, saturate, pack halves, un-bias, average with dst, store */
1900 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1901 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1902 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1903 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1904 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1905 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1906 out3_r, res0, res1, res2, res3);
1907 XORI_B4_128_UB(res0, res1, res2, res3);
1908 AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1909 res0, res1, res2, res3);
1910 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
1911 dst += (4 * dst_stride);
/* 4x4 vertical quarter-pel filter with averaging against the existing
 * destination: vertical half-pel result is first averaged with the
 * integer rows at the requested quarter position (selected by the elided
 * vertical-offset conditional), then averaged with dst.
 * NOTE(review): the remaining parameter lines, the out10/out32/res
 * declarations, the if/else wrapping the two insve_w groups below, and
 * any bias handling between them are elided from this excerpt -- confirm
 * against the complete file. */
1925 static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
1931 int16_t filt_const0 = 0xfb01;
1932 int16_t filt_const1 = 0x1414;
1933 int16_t filt_const2 = 0x1fb;
1934 v16u8 dst0, dst1, dst2, dst3;
1935 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1936 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1937 v16i8 src87_r, src2110, src4332, src6554, src8776;
1939 v16i8 filt0, filt1, filt2;
/* packed two-per-halfword 6-tap coefficients, as in the half-pel variant */
1942 filt0 = (v16i8) __msa_fill_h(filt_const0);
1943 filt1 = (v16i8) __msa_fill_h(filt_const1);
1944 filt2 = (v16i8) __msa_fill_h(filt_const2);
1946 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1947 src += (5 * src_stride);
1949 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1950 src10_r, src21_r, src32_r, src43_r);
1951 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1952 XORI_B2_128_SB(src2110, src4332);
1953 LD_SB4(src, src_stride, src5, src6, src7, src8);
1954 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1955 src54_r, src65_r, src76_r, src87_r);
1956 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1957 XORI_B2_128_SB(src6554, src8776);
/* vertical 6-tap for rows 0-1 and 2-3, rounded and packed */
1958 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1959 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1960 SRARI_H2_SH(out10, out32, 5);
1961 SAT_SH2_SH(out10, out32, 7);
1962 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1963 res = PCKEV_XORI128_UB(out10, out32);
/* NOTE(review): the next two groups are the branches of an elided
 * if/else picking the lower or upper integer rows for the quarter avg */
1966 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1967 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1969 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1970 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
/* merge the four integer rows and fold them into the filtered result */
1973 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1974 res = __msa_aver_u_b(res, (v16u8) src32_r);
/* gather dst rows, average, store 4x4 */
1976 ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1978 dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1979 dst0 = __msa_aver_u_b(res, dst0);
1981 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 luma vertical quarter-pel interpolation, averaged with dst.
 * Two loop iterations, 4 rows each: 6-tap vertical half-pel filter,
 * average with nearest full-pel rows, then average with dst.
 * NOTE(review): interior lines are elided in this view (parameters,
 * loop_cnt/res0/res1/vec0/vec1 declarations, the ver_offset branch
 * around the PCKEV_D2_SB pair, closing braces) - confirm in full file. */
1984 static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
/* 6-tap coeffs {1,-5,20,20,-5,1} packed two-per-halfword */
1991 int16_t filt_const0 = 0xfb01;
1992 int16_t filt_const1 = 0x1414;
1993 int16_t filt_const2 = 0x1fb;
1994 v16u8 dst0, dst1, dst2, dst3;
1995 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
1996 v16i8 src10_r, src32_r, src76_r, src98_r;
1997 v16i8 src21_r, src43_r, src87_r, src109_r;
1998 v8i16 out0_r, out1_r, out2_r, out3_r;
2001 v16i8 filt0, filt1, filt2;
2003 filt0 = (v16i8) __msa_fill_h(filt_const0);
2004 filt1 = (v16i8) __msa_fill_h(filt_const1);
2005 filt2 = (v16i8) __msa_fill_h(filt_const2);
2007 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2008 src += (5 * src_stride);
2010 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2011 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2012 src10_r, src21_r, src32_r, src43_r);
2014 for (loop_cnt = 2; loop_cnt--;) {
2015 LD_SB4(src, src_stride, src7, src8, src9, src10);
2016 src += (4 * src_stride);
2018 XORI_B4_128_SB(src7, src8, src9, src10);
2019 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
2020 src76_r, src87_r, src98_r, src109_r);
/* half-pel rows via 6-tap dot-product accumulate */
2021 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2022 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2023 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2024 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
2025 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2026 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2027 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
/* NOTE(review): ver_offset branch selecting the full-pel row set
 * for the quarter-pel average is elided in this view */
2030 PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
2032 PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
2035 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2036 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
/* signed average valid because operands are still in xor-128 domain */
2038 vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
2039 vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);
2041 XORI_B2_128_UB(vec0, vec1);
2042 AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1);
2043 ST8x4_UB(vec0, vec1, dst, dst_stride);
2044 dst += (4 * dst_stride);
/* 16x16 luma vertical quarter-pel interpolation, averaged with dst.
 * Processes full 16-byte rows: both right (ILVR) and left (ILVL) halves
 * are filtered; 4 iterations of 4 rows each.
 * NOTE(review): interior lines are elided in this view (parameters,
 * loop_cnt declaration, the ver_offset branch around the aver_s_b
 * pairs, closing braces) - confirm against the full file. */
2056 static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
/* 6-tap coeffs {1,-5,20,20,-5,1} packed two-per-halfword */
2063 int16_t filt_const0 = 0xfb01;
2064 int16_t filt_const1 = 0x1414;
2065 int16_t filt_const2 = 0x1fb;
2066 v16u8 dst0, dst1, dst2, dst3;
2067 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2068 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2069 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2070 v16i8 src65_l, src87_l;
2071 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2072 v16i8 out0, out1, out2, out3;
2073 v16i8 filt0, filt1, filt2;
2074 v16u8 res0, res1, res2, res3;
2076 filt0 = (v16i8) __msa_fill_h(filt_const0);
2077 filt1 = (v16i8) __msa_fill_h(filt_const1);
2078 filt2 = (v16i8) __msa_fill_h(filt_const2);
2080 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2081 src += (5 * src_stride);
2083 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2084 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2085 src10_r, src21_r, src32_r, src43_r);
2086 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2087 src10_l, src21_l, src32_l, src43_l);
2089 for (loop_cnt = 4; loop_cnt--;) {
2090 LD_SB4(src, src_stride, src5, src6, src7, src8);
2091 src += (4 * src_stride);
2093 XORI_B4_128_SB(src5, src6, src7, src8);
2094 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
2095 src54_r, src65_r, src76_r, src87_r);
2096 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
2097 src54_l, src65_l, src76_l, src87_l);
/* 6-tap vertical filter, right then left byte lanes */
2098 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2099 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2100 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2101 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2102 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2103 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2104 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2105 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
2106 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2107 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2108 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2109 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2110 PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2111 out3_r, out0, out1, out2, out3);
2112 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* NOTE(review): ver_offset branch choosing between these two
 * full-pel row sets is elided in this view */
2115 res0 = (v16u8) __msa_aver_s_b(out0, src3);
2116 res1 = (v16u8) __msa_aver_s_b(out1, src4);
2117 res2 = (v16u8) __msa_aver_s_b(out2, src5);
2118 res3 = (v16u8) __msa_aver_s_b(out3, src6);
2120 res0 = (v16u8) __msa_aver_s_b(out0, src2);
2121 res1 = (v16u8) __msa_aver_s_b(out1, src3);
2122 res2 = (v16u8) __msa_aver_s_b(out2, src4);
2123 res3 = (v16u8) __msa_aver_s_b(out3, src5);
2126 XORI_B4_128_UB(res0, res1, res2, res3);
2127 AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
2128 dst0, dst1, dst2, dst3);
2129 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
2130 dst += (4 * dst_stride);
/* 4x4 luma centre (half-pel H + half-pel V) interpolation, averaged with dst.
 * Horizontal 6-tap filtering produces intermediate rows hz_out0..8, then a
 * vertical 6-tap pass with rounding by 10 bits; result averaged with dst.
 * NOTE(review): interior lines are elided in this view (remaining
 * parameters, closing brace) - confirm against the full file. */
2146 static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
2148 uint8_t *dst, int32_t dst_stride)
2150 v16i8 src0, src1, src2, src3, src4;
2151 v16i8 mask0, mask1, mask2;
2152 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2153 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2154 v8i16 res0, res1, res2, res3;
2155 v16u8 dst0, dst1, dst2, dst3;
2156 v16u8 tmp0, tmp1, tmp2, tmp3;
/* 4-wide shuffle masks live at offset 48 of luma_mask_arr */
2158 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2159 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2160 src += (5 * src_stride);
2162 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* horizontal 6-tap pass: two input rows per call (4-wide outputs) */
2164 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2165 mask0, mask1, mask2);
2166 hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
2167 mask0, mask1, mask2);
/* odd-numbered intermediate rows sit in the high doubleword */
2169 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2171 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
2173 LD_SB4(src, src_stride, src0, src1, src2, src3);
2174 XORI_B4_128_SB(src0, src1, src2, src3);
2176 hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2177 mask0, mask1, mask2);
2178 hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
2179 mask0, mask1, mask2);
2181 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
/* vertical 6-tap pass over the intermediate rows, one output row each */
2183 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
2184 hz_out3, hz_out4, hz_out5);
2185 res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
2186 hz_out4, hz_out5, hz_out6);
2187 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
2188 hz_out5, hz_out6, hz_out7);
2189 res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
2190 hz_out6, hz_out7, hz_out8);
2191 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2192 tmp0 = PCKEV_XORI128_UB(res0, res1);
2193 tmp1 = PCKEV_XORI128_UB(res2, res3);
2194 PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
/* rounding average with destination */
2195 AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
2197 ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
/* 8-wide luma centre (half-pel H + half-pel V) interpolation, averaged with
 * dst; processes `height` rows, 4 per iteration, reusing the 5 leading
 * intermediate rows across iterations.
 * NOTE(review): interior lines are elided in this view (remaining
 * parameters, loop_cnt declaration, CONVERT macro tail args, closing
 * braces) - confirm against the full file. */
2200 static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
2202 uint8_t *dst, int32_t dst_stride,
2206 v16i8 src0, src1, src2, src3, src4;
2207 v16i8 mask0, mask1, mask2;
2208 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2209 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2210 v16u8 dst0, dst1, dst2, dst3;
2211 v8i16 res0, res1, res2, res3;
/* 8-wide shuffle masks at the start of luma_mask_arr */
2213 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2215 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2216 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2217 src += (5 * src_stride);
/* horizontal 6-tap pass for the 5 leading rows */
2219 hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2220 hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2221 hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2222 hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
2223 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
2225 for (loop_cnt = (height >> 2); loop_cnt--;) {
2226 LD_SB4(src, src_stride, src0, src1, src2, src3);
2227 XORI_B4_128_SB(src0, src1, src2, src3);
2228 src += (4 * src_stride);
2230 hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2231 hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2232 hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2233 hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
/* vertical 6-tap pass over sliding windows of intermediate rows */
2235 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
2236 hz_out3, hz_out4, hz_out5);
2237 res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
2238 hz_out4, hz_out5, hz_out6);
2239 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
2240 hz_out5, hz_out6, hz_out7);
2241 res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
2242 hz_out6, hz_out7, hz_out8);
2243 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2244 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
2245 CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
2248 dst += (4 * dst_stride);
/* 16x16 centre interpolation with dst-average: delegates to the 8-wide
 * worker for the left and right halves.
 * NOTE(review): remaining parameters and the second call's final argument
 * are elided in this view. */
2258 static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
2263 avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16);
2264 avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
/* 4-wide luma mid-horizontal quarter-pel interpolation with dst average.
 * Vertical 6-tap first (byte domain), then horizontal 6-tap on the 16-bit
 * intermediates via halfword shuffles; the centre half-pel result is
 * averaged with the horizontally-nearest half-pel column (quarter
 * position), then with dst. Two output rows per iteration.
 * NOTE(review): interior lines are elided in this view (remaining
 * parameters, row/res0/res1 declarations, the horiz_offset branch around
 * the ilvod/ilvev pair, closing braces) - confirm against the full file. */
2268 static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
2273 uint8_t horiz_offset)
2276 v16i8 src0, src1, src2, src3, src4, src5, src6;
2277 v16u8 dst0, dst1, res;
2278 v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
2279 v4i32 hz_res0, hz_res1;
2281 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
/* halfword shuffle masks implementing the 6-tap horizontal windows */
2282 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2283 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2284 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2285 v8i16 minus5h = __msa_ldi_h(-5);
2286 v8i16 plus20h = __msa_ldi_h(20);
2287 v8i16 zeros = { 0 };
2289 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2290 src += (5 * src_stride);
2292 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2294 for (row = (height >> 1); row--;) {
2295 LD_SB2(src, src_stride, src5, src6);
2296 src += (2 * src_stride);
2298 XORI_B2_128_SB(src5, src6);
2299 LD_UB2(dst, dst_stride, dst0, dst1);
/* pack the two 4-pixel dst rows into one vector */
2301 dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);
/* vertical 6-tap pass for the two output rows */
2303 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2305 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
2307 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
2308 mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2309 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
2310 mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* horizontal 6-tap: (+1,+1) + (-5)*mid + (+20)*centre, 32-bit accum */
2312 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2313 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2315 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2316 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
2318 SRARI_W2_SW(hz_res0, hz_res1, 10);
2319 SAT_SW2_SW(hz_res0, hz_res1, 7);
/* half-pel columns (vertical-only result, rounded by 5) for quarter avg */
2321 res0 = __msa_srari_h(shf_vec2, 5);
2322 res1 = __msa_srari_h(shf_vec5, 5);
2324 SAT_SH2_SH(res0, res1, 7);
/* NOTE(review): horiz_offset branch selecting odd vs even columns
 * is elided in this view */
2327 res0 = __msa_ilvod_h(zeros, res0);
2328 res1 = __msa_ilvod_h(zeros, res1);
2330 ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
2332 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
2333 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
2334 res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
2336 res = PCKEV_XORI128_UB(res0, res0);
/* final rounding average with destination */
2338 dst0 = __msa_aver_u_b(res, dst0);
2340 ST4x2_UB(dst0, dst, dst_stride);
2341 dst += (2 * dst_stride);
/* 8-wide mid-horizontal quarter-pel with dst average: two 4-wide column
 * passes via the 4w worker.
 * NOTE(review): remaining parameters and the per-iteration src/dst
 * advance (+= 4) are elided in this view. */
2351 static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src,
2356 uint8_t horiz_offset)
2358 uint32_t multiple8_cnt;
2360 for (multiple8_cnt = 2; multiple8_cnt--;) {
2361 avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
2362 height, horiz_offset);
/* 16-wide mid-horizontal quarter-pel with dst average: four 4-wide column
 * passes via the 4w worker.
 * NOTE(review): remaining parameters and the per-iteration src/dst
 * advance are elided in this view. */
2369 static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src,
2374 uint8_t horiz_offset)
2376 uint32_t multiple8_cnt;
2378 for (multiple8_cnt = 4; multiple8_cnt--;) {
2379 avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
2380 height, horiz_offset);
/* 4-wide luma mid-vertical quarter-pel interpolation with dst average.
 * Horizontal 6-tap first (hz_out*), vertical 6-tap on the intermediates,
 * then average with the vertically-nearest half-pel row (quarter
 * position) and with dst. Two output rows per iteration.
 * NOTE(review): interior lines are elided in this view (remaining
 * parameters, loop_cnt/out0/out1/dst0/dst1/vec0/vec1 declarations, the
 * ver_offset branch, the SW stores after the copy_u_w pair, closing
 * braces) - confirm against the full file. */
2387 static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src,
2396 v16i8 src0, src1, src2, src3, src4;
2398 v16i8 mask0, mask1, mask2;
2399 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2400 v8i16 hz_out4, hz_out5, hz_out6;
2401 v8i16 res0, res1, res2, res3;
/* 4-wide shuffle masks at offset 48 of luma_mask_arr */
2404 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2405 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2406 src += (5 * src_stride);
2408 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2410 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2411 mask0, mask1, mask2);
2412 hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
2413 mask0, mask1, mask2);
2415 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2417 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
2419 for (loop_cnt = (height >> 1); loop_cnt--;) {
2420 LD_SB2(src, src_stride, src0, src1);
2421 src += (2 * src_stride);
2423 XORI_B2_128_SB(src0, src1);
2424 LD_UB2(dst, dst_stride, dst0, dst1);
2425 hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2428 hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
/* vertical 6-tap over sliding windows of intermediate rows */
2429 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
2430 hz_out3, hz_out4, hz_out5);
2431 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
2432 hz_out4, hz_out5, hz_out6);
/* NOTE(review): ver_offset branch selecting the intermediate row for
 * the quarter-pel average is elided in this view */
2435 res1 = __msa_srari_h(hz_out3, 5);
2436 res3 = __msa_srari_h(hz_out4, 5);
2438 res1 = __msa_srari_h(hz_out2, 5);
2439 res3 = __msa_srari_h(hz_out3, 5);
2442 SAT_SH2_SH(res1, res3, 7);
/* quarter-pel: centre result averaged with half-pel row */
2444 res0 = __msa_aver_s_h(res0, res1);
2445 res1 = __msa_aver_s_h(res2, res3);
2447 vec0 = PCKEV_XORI128_UB(res0, res0);
2448 vec1 = PCKEV_XORI128_UB(res1, res1);
2450 AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
2452 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2453 out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* 8-wide luma mid-vertical quarter-pel interpolation with dst average.
 * Same scheme as the 4-wide variant but 4 output rows per iteration.
 * NOTE(review): interior lines are elided in this view (remaining
 * parameters, loop_cnt declaration, the vert_offset branch around the
 * srari pairs, CONVERT macro tail args, closing braces). */
2467 static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
2472 uint8_t vert_offset)
2475 v16i8 src0, src1, src2, src3, src4;
2476 v16u8 dst0, dst1, dst2, dst3;
2477 v16i8 mask0, mask1, mask2;
2478 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2479 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2480 v8i16 res0, res1, res2, res3;
2481 v8i16 res4, res5, res6, res7;
2483 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2485 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2486 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2487 src += (5 * src_stride);
/* horizontal 6-tap pass for the 5 leading rows */
2489 hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2490 hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2491 hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2492 hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
2493 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
2495 for (loop_cnt = (height >> 2); loop_cnt--;) {
2496 LD_SB4(src, src_stride, src0, src1, src2, src3);
2497 XORI_B4_128_SB(src0, src1, src2, src3);
2498 src += (4 * src_stride);
2500 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2502 hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2503 hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2504 hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2505 hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
/* vertical 6-tap pass: even-indexed res hold the centre result */
2507 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
2508 hz_out3, hz_out4, hz_out5);
2509 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
2510 hz_out4, hz_out5, hz_out6);
2511 res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
2512 hz_out5, hz_out6, hz_out7);
2513 res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
2514 hz_out6, hz_out7, hz_out8);
/* NOTE(review): vert_offset branch choosing between these two
 * half-pel row sets is elided in this view */
2517 res1 = __msa_srari_h(hz_out3, 5);
2518 res3 = __msa_srari_h(hz_out4, 5);
2519 res5 = __msa_srari_h(hz_out5, 5);
2520 res7 = __msa_srari_h(hz_out6, 5);
2522 res1 = __msa_srari_h(hz_out2, 5);
2523 res3 = __msa_srari_h(hz_out3, 5);
2524 res5 = __msa_srari_h(hz_out4, 5);
2525 res7 = __msa_srari_h(hz_out5, 5);
2528 SAT_SH4_SH(res1, res3, res5, res7, 7);
/* quarter-pel: average centre result with half-pel rows */
2530 res0 = __msa_aver_s_h(res0, res1);
2531 res1 = __msa_aver_s_h(res2, res3);
2532 res2 = __msa_aver_s_h(res4, res5);
2533 res3 = __msa_aver_s_h(res6, res7);
2534 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
2535 CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1,
2537 dst += (4 * dst_stride);
/* 16-wide mid-vertical quarter-pel with dst average: two 8-wide passes
 * via the 8w worker.
 * NOTE(review): remaining parameters and the per-iteration src/dst
 * advance (+= 8) are elided in this view. */
2547 static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,
2552 uint8_t vert_offset)
2554 int32_t multiple8_cnt;
2556 for (multiple8_cnt = 2; multiple8_cnt--;) {
2557 avc_luma_midv_qrt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
2558 height, vert_offset);
/* 4x4 luma H+V quarter-pel: averages independent horizontal (src_x) and
 * vertical (src_y) half-pel interpolations, then averages with dst.
 * NOTE(review): interior lines are elided in this view (remaining
 * parameters, res/res0/res1 declarations, closing brace). */
2565 static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
2566 const uint8_t *src_y,
2571 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
2572 v16u8 dst0, dst1, dst2, dst3;
2573 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
2574 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
2575 v16i8 mask0, mask1, mask2;
2576 v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
2580 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2581 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
2582 src_y += (5 * src_stride);
/* pack consecutive 4-pixel vertical rows pairwise into one vector */
2584 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
2585 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
2586 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
2587 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
2589 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
2590 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
2591 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2592 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
/* horizontal 6-tap half-pel pass */
2593 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1,
2594 mask0, mask1, mask2);
2595 hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3,
2596 mask0, mask1, mask2);
2597 SRARI_H2_SH(hz_out0, hz_out1, 5);
2598 SAT_SH2_SH(hz_out0, hz_out1, 7);
2599 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
2601 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
2602 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
2603 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
2604 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
2606 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
/* vertical 6-tap half-pel pass */
2609 vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2,
2610 src_vt3, src_vt4, src_vt5);
2611 vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4,
2612 src_vt5, src_vt6, src_vt7);
2613 SRARI_H2_SH(vert_out0, vert_out1, 5);
2614 SAT_SH2_SH(vert_out0, vert_out1, 7);
/* quarter-pel value = rounded average of H and V half-pel results */
2616 res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
2617 res0 = __msa_srari_h((hz_out0 + vert_out0), 1);
2619 SAT_SH2_SH(res0, res1, 7);
2620 res = PCKEV_XORI128_UB(res0, res1);
2622 dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
2623 dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
2624 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2625 dst0 = __msa_aver_u_b(res, dst0);
2627 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 luma H+V quarter-pel with dst average: two iterations of 4 rows,
 * averaging horizontal and vertical half-pel results, then dst.
 * NOTE(review): interior lines are elided in this view (remaining
 * parameters, loop_cnt declaration, CONVERT macro tail args, the tail
 * that slides src_vt0..3 for the next iteration, closing braces). */
2630 static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
2631 const uint8_t *src_y,
2637 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
2638 v16u8 dst0, dst1, dst2, dst3;
2639 v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
2640 v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
2641 v16i8 mask0, mask1, mask2;
2642 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2643 v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
2644 v8i16 out0, out1, out2, out3;
2646 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2648 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
2649 src_y += (5 * src_stride);
/* pack consecutive 8-pixel vertical rows pairwise into one vector */
2651 src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
2652 src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
2653 src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
2654 src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
2656 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
2658 for (loop_cnt = 2; loop_cnt--;) {
2659 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
2660 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
2661 src_x += (4 * src_stride);
2663 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* horizontal 6-tap half-pel pass */
2664 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
2665 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
2666 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
2667 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
2668 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
2669 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
2670 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
2671 src_y += (4 * src_stride);
2673 src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
2674 src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
2675 src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
2676 src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
2678 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
/* vertical 6-tap half-pel pass (two rows per macro call) */
2679 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
2680 src_vt4, src_vt5, vert_out0, vert_out1);
2681 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
2682 src_vt6, src_vt7, vert_out2, vert_out3);
2683 SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
2684 SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
/* quarter-pel value = rounded average of H and V half-pel results */
2686 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
2687 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
2688 out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
2689 out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
2691 SAT_SH4_SH(out0, out1, out2, out3, 7);
2692 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
2693 CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1,
2695 dst += (4 * dst_stride);
/* 16x16 H+V quarter-pel with dst average: four 8x8 tiles via the 8x8
 * worker - top row of tiles, reposition pointers, then bottom row.
 * NOTE(review): remaining parameters, the worker-call tail arguments
 * and the per-iteration += 8 advances are elided in this view. */
2705 static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
2706 const uint8_t *src_y,
2711 uint32_t multiple8_cnt;
2713 for (multiple8_cnt = 2; multiple8_cnt--;) {
2714 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
/* drop to the second tile row: 8 rows down, back 16 columns */
2722 src_x += (8 * src_stride) - 16;
2723 src_y += (8 * src_stride) - 16;
2724 dst += (8 * dst_stride) - 16;
2726 for (multiple8_cnt = 2; multiple8_cnt--;) {
2727 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
/* Plain 8-pixel-wide block copy: loads rows as vectors, stores the low
 * doubleword of each. Branches on height % 12 / 8 / 4 / 2 to unroll.
 * NOTE(review): interior lines are elided in this view (cnt declaration,
 * some loop braces, the final SD stores of the height%2 branch, closing
 * braces). */
2736 static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
2737 uint8_t *dst, int32_t dst_stride,
2741 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2742 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2744 if (0 == height % 12) {
2745 for (cnt = (height / 12); cnt--;) {
/* 8 rows per chunk ... */
2746 LD_UB8(src, src_stride,
2747 src0, src1, src2, src3, src4, src5, src6, src7);
2748 src += (8 * src_stride);
/* only the low 8 bytes of each row are meaningful */
2750 out0 = __msa_copy_u_d((v2i64) src0, 0);
2751 out1 = __msa_copy_u_d((v2i64) src1, 0);
2752 out2 = __msa_copy_u_d((v2i64) src2, 0);
2753 out3 = __msa_copy_u_d((v2i64) src3, 0);
2754 out4 = __msa_copy_u_d((v2i64) src4, 0);
2755 out5 = __msa_copy_u_d((v2i64) src5, 0);
2756 out6 = __msa_copy_u_d((v2i64) src6, 0);
2757 out7 = __msa_copy_u_d((v2i64) src7, 0);
2759 SD4(out0, out1, out2, out3, dst, dst_stride);
2760 dst += (4 * dst_stride);
2761 SD4(out4, out5, out6, out7, dst, dst_stride);
2762 dst += (4 * dst_stride);
/* ... plus the remaining 4 rows of each 12-row chunk */
2764 LD_UB4(src, src_stride, src0, src1, src2, src3);
2765 src += (4 * src_stride);
2767 out0 = __msa_copy_u_d((v2i64) src0, 0);
2768 out1 = __msa_copy_u_d((v2i64) src1, 0);
2769 out2 = __msa_copy_u_d((v2i64) src2, 0);
2770 out3 = __msa_copy_u_d((v2i64) src3, 0);
2772 SD4(out0, out1, out2, out3, dst, dst_stride);
2773 dst += (4 * dst_stride);
2775 } else if (0 == height % 8) {
2776 for (cnt = height >> 3; cnt--;) {
2777 LD_UB8(src, src_stride,
2778 src0, src1, src2, src3, src4, src5, src6, src7);
2779 src += (8 * src_stride);
2781 out0 = __msa_copy_u_d((v2i64) src0, 0);
2782 out1 = __msa_copy_u_d((v2i64) src1, 0);
2783 out2 = __msa_copy_u_d((v2i64) src2, 0);
2784 out3 = __msa_copy_u_d((v2i64) src3, 0);
2785 out4 = __msa_copy_u_d((v2i64) src4, 0);
2786 out5 = __msa_copy_u_d((v2i64) src5, 0);
2787 out6 = __msa_copy_u_d((v2i64) src6, 0);
2788 out7 = __msa_copy_u_d((v2i64) src7, 0);
2790 SD4(out0, out1, out2, out3, dst, dst_stride);
2791 dst += (4 * dst_stride);
2792 SD4(out4, out5, out6, out7, dst, dst_stride);
2793 dst += (4 * dst_stride);
2795 } else if (0 == height % 4) {
2796 for (cnt = (height / 4); cnt--;) {
2797 LD_UB4(src, src_stride, src0, src1, src2, src3);
2798 src += (4 * src_stride);
2799 out0 = __msa_copy_u_d((v2i64) src0, 0);
2800 out1 = __msa_copy_u_d((v2i64) src1, 0);
2801 out2 = __msa_copy_u_d((v2i64) src2, 0);
2802 out3 = __msa_copy_u_d((v2i64) src3, 0);
2804 SD4(out0, out1, out2, out3, dst, dst_stride);
2805 dst += (4 * dst_stride);
2807 } else if (0 == height % 2) {
2808 for (cnt = (height / 2); cnt--;) {
2809 LD_UB2(src, src_stride, src0, src1);
2810 src += (2 * src_stride);
2811 out0 = __msa_copy_u_d((v2i64) src0, 0);
2812 out1 = __msa_copy_u_d((v2i64) src1, 0);
/* Copies a (width x height) region in 16-column strips, 8 rows per
 * inner iteration, with full-vector loads/stores.
 * NOTE(review): interior lines are elided in this view (dst_tmp
 * declaration, the src_tmp/dst_tmp initialisation and the src/dst += 16
 * column advance, closing braces). */
2822 static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
2823 uint8_t *dst, int32_t dst_stride,
2824 int32_t height, int32_t width)
2826 int32_t cnt, loop_cnt;
2827 const uint8_t *src_tmp;
2829 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2831 for (cnt = (width >> 4); cnt--;) {
2835 for (loop_cnt = (height >> 3); loop_cnt--;) {
2836 LD_UB8(src_tmp, src_stride,
2837 src0, src1, src2, src3, src4, src5, src6, src7);
2838 src_tmp += (8 * src_stride);
2840 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2841 dst_tmp, dst_stride);
2842 dst_tmp += (8 * dst_stride);
/* 16-pixel-wide block copy, unrolled by height % 12 / 8 / 4; height % 8
 * delegates to the generic 16-column strip copier.
 * NOTE(review): cnt declaration, some ST_UB8 tail arguments and closing
 * braces are elided in this view. */
2850 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
2851 uint8_t *dst, int32_t dst_stride,
2855 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2857 if (0 == height % 12) {
2858 for (cnt = (height / 12); cnt--;) {
/* 8 rows, then the remaining 4, per 12-row chunk */
2859 LD_UB8(src, src_stride,
2860 src0, src1, src2, src3, src4, src5, src6, src7);
2861 src += (8 * src_stride);
2862 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2864 dst += (8 * dst_stride);
2866 LD_UB4(src, src_stride, src0, src1, src2, src3);
2867 src += (4 * src_stride);
2868 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2869 dst += (4 * dst_stride);
2871 } else if (0 == height % 8) {
2872 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2873 } else if (0 == height % 4) {
2874 for (cnt = (height >> 2); cnt--;) {
2875 LD_UB4(src, src_stride, src0, src1, src2, src3);
2876 src += (4 * src_stride);
2878 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2879 dst += (4 * dst_stride);
/* 4-pixel-wide rounding average of src into dst (dst = avg(src, dst)),
 * 4 rows or 2 rows per iteration depending on height divisibility.
 * NOTE(review): cnt declaration, the SW stores of the height%2 branch
 * and closing braces are elided in this view. */
2884 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
2885 uint8_t *dst, int32_t dst_stride,
2889 uint32_t out0, out1, out2, out3;
2890 v16u8 src0, src1, src2, src3;
2891 v16u8 dst0, dst1, dst2, dst3;
2893 if (0 == (height % 4)) {
2894 for (cnt = (height / 4); cnt--;) {
2895 LD_UB4(src, src_stride, src0, src1, src2, src3);
2896 src += (4 * src_stride);
2898 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* unsigned rounding average per byte */
2900 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2901 dst0, dst1, dst2, dst3);
/* only the low word (4 pixels) of each row is stored */
2903 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2904 out1 = __msa_copy_u_w((v4i32) dst1, 0);
2905 out2 = __msa_copy_u_w((v4i32) dst2, 0);
2906 out3 = __msa_copy_u_w((v4i32) dst3, 0);
2907 SW4(out0, out1, out2, out3, dst, dst_stride);
2908 dst += (4 * dst_stride);
2910 } else if (0 == (height % 2)) {
2911 for (cnt = (height / 2); cnt--;) {
2912 LD_UB2(src, src_stride, src0, src1);
2913 src += (2 * src_stride);
2915 LD_UB2(dst, dst_stride, dst0, dst1);
2917 AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
2919 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2920 out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* 8-pixel-wide rounding average of src into dst, 4 rows per iteration.
 * Assumes height is a multiple of 4 (no remainder branches here).
 * NOTE(review): cnt declaration and closing braces are elided in this
 * view. */
2929 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
2930 uint8_t *dst, int32_t dst_stride,
2934 uint64_t out0, out1, out2, out3;
2935 v16u8 src0, src1, src2, src3;
2936 v16u8 dst0, dst1, dst2, dst3;
2938 for (cnt = (height / 4); cnt--;) {
2939 LD_UB4(src, src_stride, src0, src1, src2, src3);
2940 src += (4 * src_stride);
2941 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2943 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2944 dst0, dst1, dst2, dst3);
/* store only the low doubleword (8 pixels) of each averaged row */
2946 out0 = __msa_copy_u_d((v2i64) dst0, 0);
2947 out1 = __msa_copy_u_d((v2i64) dst1, 0);
2948 out2 = __msa_copy_u_d((v2i64) dst2, 0);
2949 out3 = __msa_copy_u_d((v2i64) dst3, 0);
2950 SD4(out0, out1, out2, out3, dst, dst_stride);
2951 dst += (4 * dst_stride);
/* 16-pixel-wide rounding average of src into dst, 8 rows per iteration.
 * Assumes height is a multiple of 8.
 * NOTE(review): cnt declaration and closing braces are elided in this
 * view. */
2955 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
2956 uint8_t *dst, int32_t dst_stride,
2960 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2961 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2963 for (cnt = (height / 8); cnt--;) {
2964 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2965 src += (8 * src_stride);
2966 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
2968 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2969 dst0, dst1, dst2, dst3);
2970 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
2971 dst4, dst5, dst6, dst7);
2972 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
2973 dst += (8 * dst_stride);
/* H.264 qpel put, 16x16, full-pel position (mc00): straight 16-row copy
 * using two 8-row vector batches.
 * NOTE(review): remaining parameters and closing brace are elided in
 * this view. */
2977 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
2980 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2981 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
2983 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
2984 src += (8 * stride);
2985 LD_UB8(src, stride, src8, src9, src10, src11, src12, src13, src14, src15);
2987 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, stride);
2988 dst += (8 * stride);
2989 ST_UB8(src8, src9, src10, src11, src12, src13, src14, src15, dst, stride);
/* H.264 qpel put, 8x8, full-pel position (mc00): 64-bit scalar row copy.
 * NOTE(review): remaining parameters and the src/dst += (4 * stride)
 * advances between the 4-row batches are elided in this view. */
2992 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
2995 uint64_t src0, src1, src2, src3, src4, src5, src6, src7;
2997 LD4(src, stride, src0, src1, src2, src3);
2999 LD4(src, stride, src4, src5, src6, src7);
3000 SD4(src0, src1, src2, src3, dst, stride);
3002 SD4(src4, src5, src6, src7, dst, stride);
/* H.264 qpel avg, 16x16, full-pel (mc00): dst = rounding avg(src, dst),
 * two 8-row batches.
 * NOTE(review): remaining parameters, the AVER macro tail args, the
 * second batch's src advance and closing brace are elided in this view. */
3005 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
3008 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
3009 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3011 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
3012 src += (8 * stride);
3013 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3015 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
3017 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
3019 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
3020 dst += (8 * stride);
/* second 8-row batch */
3022 LD_UB8(src, stride, src0, src1, src2, src3, src4, src5, src6, src7);
3023 LD_UB8(dst, stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
3025 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
3027 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7, dst4, dst5,
3029 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, stride);
/* H.264 qpel avg, 8x8, full-pel (mc00): loads 8 rows of src and dst as
 * doubleword pairs, rounding-averages them, stores 8x8.
 * NOTE(review): remaining parameters, the src advance between LD4 calls,
 * the second AVER macro tail and closing brace are elided in this view. */
3032 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
3035 uint64_t tp0, tp1, tp2, tp3, tp4, tp5, tp6, tp7;
3036 v16u8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
3037 v16u8 dst0 = { 0 }, dst1 = { 0 }, dst2 = { 0 }, dst3 = { 0 };
3039 LD4(src, stride, tp0, tp1, tp2, tp3);
3041 LD4(src, stride, tp4, tp5, tp6, tp7);
/* two 8-byte rows packed per vector */
3042 INSERT_D2_UB(tp0, tp1, src0);
3043 INSERT_D2_UB(tp2, tp3, src1);
3044 INSERT_D2_UB(tp4, tp5, src2);
3045 INSERT_D2_UB(tp6, tp7, src3);
3047 LD4(dst, stride, tp0, tp1, tp2, tp3);
3048 LD4(dst + 4 * stride, stride, tp4, tp5, tp6, tp7);
3049 INSERT_D2_UB(tp0, tp1, dst0);
3050 INSERT_D2_UB(tp2, tp3, dst1);
3051 INSERT_D2_UB(tp4, tp5, dst2);
3052 INSERT_D2_UB(tp6, tp7, dst3);
3054 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
3057 ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
/* 4x4 full-pel average: four 32-bit rows of src and dst are gathered into
 * one vector each, byte-averaged, and written back 4 bytes per row. */
3060 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
3063     uint32_t tp0, tp1, tp2, tp3;
3064     v16u8 src0 = { 0 }, dst0 = { 0 };
3066     LW4(src, stride, tp0, tp1, tp2, tp3);
3067     INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
3068     LW4(dst, stride, tp0, tp1, tp2, tp3);
3069     INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
3071     dst0 = __msa_aver_u_b(src0, dst0);
3073     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
/* 16x16 horizontal quarter-pel (mc10): 6-tap half-pel filter
 * (taps 1,-5,20,20,-5,1 built from HADD + dotp with -5 and +20),
 * rounded-shift by 5 and saturated, then averaged with the nearer
 * integer-pel column — selected by the 2-byte SLDI below — in the
 * signed (xor-128 biased) domain. Processes 4 rows per iteration.
 * NOTE(review): mask3..mask5 setup and the src advances between LD_SB2
 * pairs appear to be on lines missing from this listing. */
3076 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
3080     v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
3081     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3082     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3083     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3084     v16i8 minus5b = __msa_ldi_b(-5);
3085     v16i8 plus20b = __msa_ldi_b(20);
    /* shuffle masks pairing the 6 filter input bytes per output pixel */
3087     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3093     for (loop_cnt = 4; loop_cnt--;) {
3094         LD_SB2(src, 16, src0, src1);
3096         LD_SB2(src, 16, src2, src3);
3098         LD_SB2(src, 16, src4, src5);
3100         LD_SB2(src, 16, src6, src7);
        /* bias to signed so dotp/aver work in the signed domain */
3103         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3104         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3105         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3106         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3107         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3108         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3109         VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        /* res = (a+f) - 5*(b+e) + 20*(c+d) */
3110         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3111         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3112                      minus5b, res0, res1, res2, res3);
3113         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3114                      plus20b, res0, res1, res2, res3);
3115         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3116         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3117         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3118         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3119         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3120         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3121         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3122         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3123                      minus5b, res4, res5, res6, res7);
3124         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3125                      plus20b, res4, res5, res6, res7);
        /* shift by 2 bytes: the integer-pel column used for the
         * quarter-pel average at position d=1/4 */
3126         SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 2);
3127         SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 2);
        /* (res + 16) >> 5, clamp to 8 signed bits, pack to bytes */
3128         SRARI_H4_SH(res0, res1, res2, res3, 5);
3129         SRARI_H4_SH(res4, res5, res6, res7, 5);
3130         SAT_SH4_SH(res0, res1, res2, res3, 7);
3131         SAT_SH4_SH(res4, res5, res6, res7, 7);
3132         PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
3133         PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
3134         dst0 = __msa_aver_s_b(dst0, src0);
3135         dst1 = __msa_aver_s_b(dst1, src2);
3136         dst2 = __msa_aver_s_b(dst2, src4);
3137         dst3 = __msa_aver_s_b(dst3, src6);
        /* undo the signed bias before storing */
3138         XORI_B4_128_SB(dst0, dst1, dst2, dst3);
3139         ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
3140         dst += (4 * stride);
/* 16x16 horizontal quarter-pel (mc30): identical 6-tap half-pel filter
 * pipeline as mc10; the only difference is the 3-byte SLDI, which picks
 * the integer-pel column on the other side for the d=3/4 average. */
3144 void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
3148     v16i8 dst0, dst1, dst2, dst3, src0, src1, src2, src3, src4, src5, src6;
3149     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, src7, vec11;
3150     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3151     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3152     v16i8 minus5b = __msa_ldi_b(-5);
3153     v16i8 plus20b = __msa_ldi_b(20);
3155     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3161     for (loop_cnt = 4; loop_cnt--;) {
3162         LD_SB2(src, 16, src0, src1);
3164         LD_SB2(src, 16, src2, src3);
3166         LD_SB2(src, 16, src4, src5);
3168         LD_SB2(src, 16, src6, src7);
        /* bias to signed domain */
3171         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3172         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask3, vec0, vec3);
3173         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask3, vec6, vec9);
3174         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask4, vec1, vec4);
3175         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask4, vec7, vec10);
3176         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask5, vec2, vec5);
3177         VSHF_B2_SB(src2, src2, src2, src3, mask2, mask5, vec8, vec11);
        /* 6-tap accumulate: (a+f) - 5*(b+e) + 20*(c+d) */
3178         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3179         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3180                      minus5b, res0, res1, res2, res3);
3181         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3182                      plus20b, res0, res1, res2, res3);
3183         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask3, vec0, vec3);
3184         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask3, vec6, vec9);
3185         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask4, vec1, vec4);
3186         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask4, vec7, vec10);
3187         VSHF_B2_SB(src4, src4, src4, src5, mask2, mask5, vec2, vec5);
3188         VSHF_B2_SB(src6, src6, src6, src7, mask2, mask5, vec8, vec11);
3189         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3190         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3191                      minus5b, res4, res5, res6, res7);
3192         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3193                      plus20b, res4, res5, res6, res7);
        /* 3-byte shift: integer-pel column for d=3/4 (vs 2 in mc10) */
3194         SLDI_B2_SB(src1, src3, src0, src2, src0, src2, 3);
3195         SLDI_B2_SB(src5, src7, src4, src6, src4, src6, 3);
3196         SRARI_H4_SH(res0, res1, res2, res3, 5);
3197         SRARI_H4_SH(res4, res5, res6, res7, 5);
3198         SAT_SH4_SH(res0, res1, res2, res3, 7);
3199         SAT_SH4_SH(res4, res5, res6, res7, 7);
3200         PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
3201         PCKEV_B2_SB(res5, res4, res7, res6, dst2, dst3);
3202         dst0 = __msa_aver_s_b(dst0, src0);
3203         dst1 = __msa_aver_s_b(dst1, src2);
3204         dst2 = __msa_aver_s_b(dst2, src4);
3205         dst3 = __msa_aver_s_b(dst3, src6);
        /* back to unsigned pixel range */
3206         XORI_B4_128_SB(dst0, dst1, dst2, dst3);
3207         ST_SB4(dst0, dst1, dst2, dst3, dst, stride);
3208         dst += (4 * stride);
/* 8x8 horizontal quarter-pel (mc10): all 8 rows loaded at once from
 * src-2, 6-tap filtered, rounded/saturated, then averaged with the
 * 2-byte-shifted source (nearest integer-pel column). Two filtered rows
 * are packed per vector via PCKEV_D before the average. */
3212 void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
3215     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3216     v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3217     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3218     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3219     v16i8 minus5b = __msa_ldi_b(-5);
3220     v16i8 plus20b = __msa_ldi_b(20);
3222     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    /* src-2: the 6-tap window needs two columns left of the output */
3223     LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
3224     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3225     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3226     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3227     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3228     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3229     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3230     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3231                  res0, res1, res2, res3);
3232     VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3233     VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3234     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3235                  res0, res1, res2, res3);
3236     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3237     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3238     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3239     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3240     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3241     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3242                  res4, res5, res6, res7);
3243     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3244     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3245     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3246                  res4, res5, res6, res7);
    /* align the integer-pel column (offset +2 from src-2) for the average */
3247     SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
3248     SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
3249     SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 2);
3250     SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 2);
    /* pack two 8-pixel rows per 128-bit vector */
3251     PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3252     PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
3253     SRARI_H4_SH(res0, res1, res2, res3, 5);
3254     SRARI_H4_SH(res4, res5, res6, res7, 5);
3255     SAT_SH4_SH(res0, res1, res2, res3, 7);
3256     SAT_SH4_SH(res4, res5, res6, res7, 7);
3257     PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3258     PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
3259     tmp0 = __msa_aver_s_b(tmp0, src0);
3260     tmp1 = __msa_aver_s_b(tmp1, src1);
3261     tmp2 = __msa_aver_s_b(tmp2, src4);
3262     tmp3 = __msa_aver_s_b(tmp3, src5);
    /* undo signed bias, store 8 bytes per row */
3263     XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
3264     ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
/* 8x8 horizontal quarter-pel (mc30): same pipeline as qpel8_mc10, except
 * the SLDI offset is 3, averaging with the integer-pel column on the
 * right side (d=3/4). */
3267 void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
3270     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3271     v16i8 tmp0, tmp1, tmp2, tmp3, vec11;
3272     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3273     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3274     v16i8 minus5b = __msa_ldi_b(-5);
3275     v16i8 plus20b = __msa_ldi_b(20);
3277     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3278     LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
3279     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3280     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3281     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3282     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3283     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3284     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3285     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3286                  res0, res1, res2, res3);
3287     VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3288     VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3289     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3290                  res0, res1, res2, res3);
3291     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3292     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3293     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3294     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3295     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3296     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3297                  res4, res5, res6, res7);
3298     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3299     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3300     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b, plus20b,
3301                  res4, res5, res6, res7);
    /* offset 3 (not 2): pick the right-hand integer-pel neighbour */
3302     SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
3303     SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
3304     SLDI_B2_SB(src4, src5, src4, src5, src4, src5, 3);
3305     SLDI_B2_SB(src6, src7, src6, src7, src6, src7, 3);
3306     PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
3307     PCKEV_D2_SB(src5, src4, src7, src6, src4, src5);
3308     SRARI_H4_SH(res0, res1, res2, res3, 5);
3309     SRARI_H4_SH(res4, res5, res6, res7, 5);
3310     SAT_SH4_SH(res0, res1, res2, res3, 7);
3311     SAT_SH4_SH(res4, res5, res6, res7, 7);
3312     PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
3313     PCKEV_B2_SB(res5, res4, res7, res6, tmp2, tmp3);
3314     tmp0 = __msa_aver_s_b(tmp0, src0);
3315     tmp1 = __msa_aver_s_b(tmp1, src1);
3316     tmp2 = __msa_aver_s_b(tmp2, src4);
3317     tmp3 = __msa_aver_s_b(tmp3, src5);
3318     XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
3319     ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
/* 4x4 horizontal quarter-pel (mc10): four rows filtered with the 6-tap
 * kernel via masks from luma_mask_arr[48] (4-pixel-row variants), then
 * averaged with the 2-byte-shifted source gathered into one vector. */
3322 void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
3325     v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
3326     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3328     v16i8 minus5b = __msa_ldi_b(-5);
3329     v16i8 plus20b = __msa_ldi_b(20);
3331     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3332     LD_SB4(src - 2, stride, src0, src1, src2, src3);
3333     XORI_B4_128_SB(src0, src1, src2, src3);
3334     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3335     HADD_SB2_SH(vec0, vec1, res0, res1);
3336     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3337     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3338     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3339     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
3340     SRARI_H2_SH(res0, res1, 5);
3341     SAT_SH2_SH(res0, res1, 7);
3342     res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    /* gather the four 2-byte-shifted rows into src0 (one word per row) */
3343     SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 2);
3344     SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 2);
3345     src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3346     src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3347     src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
3348     res = __msa_aver_s_b(res, src0);
3349     res = (v16i8) __msa_xori_b((v16u8) res, 128);
3350     ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 4x4 horizontal quarter-pel (mc30): identical to qpel4_mc10 except the
 * SLDI offset is 3, averaging with the right-hand integer-pel sample. */
3353 void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
3356     v16i8 src0, src1, src2, src3, res, mask0, mask1, mask2;
3357     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3359     v16i8 minus5b = __msa_ldi_b(-5);
3360     v16i8 plus20b = __msa_ldi_b(20);
3362     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3363     LD_SB4(src - 2, stride, src0, src1, src2, src3);
3364     XORI_B4_128_SB(src0, src1, src2, src3);
3365     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3366     HADD_SB2_SH(vec0, vec1, res0, res1);
3367     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3368     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3369     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3370     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
3371     SRARI_H2_SH(res0, res1, 5);
3372     SAT_SH2_SH(res0, res1, 7);
3373     res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    /* offset 3 (vs 2 in mc10) */
3374     SLDI_B2_SB(src0, src1, src0, src1, src0, src1, 3);
3375     SLDI_B2_SB(src2, src3, src2, src3, src2, src3, 3);
3376     src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
3377     src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3378     src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
3379     res = __msa_aver_s_b(res, src0);
3380     res = (v16i8) __msa_xori_b((v16u8) res, 128);
3381     ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
/* 16x16 horizontal half-pel (mc20): pure 6-tap filter output — unlike
 * mc10/mc30, no averaging with an integer-pel column. Each row is loaded
 * as two 8-byte halves (LD_SB2 with offset 8) and 4 rows are produced per
 * loop iteration.
 * NOTE(review): src advances between the LD_SB2s appear to be on lines
 * missing from this listing. */
3384 void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
3388     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3389     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3391     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3392     v16i8 minus5b = __msa_ldi_b(-5);
3393     v16i8 plus20b = __msa_ldi_b(20);
3395     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3398     for (loop_cnt = 4; loop_cnt--;) {
3399         LD_SB2(src, 8, src0, src1);
3401         LD_SB2(src, 8, src2, src3);
3403         LD_SB2(src, 8, src4, src5);
3405         LD_SB2(src, 8, src6, src7);
3408         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3409         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
3410         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
3411         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
3412         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
3413         VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
3414         VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
        /* res = (a+f) - 5*(b+e) + 20*(c+d) */
3415         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
3416         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3417                      minus5b, res0, res1, res2, res3);
3418         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3419                      plus20b, res0, res1, res2, res3);
3420         VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
3421         VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
3422         VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
3423         VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
3424         VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
3425         VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
3426         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
3427         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
3428                      minus5b, res4, res5, res6, res7);
3429         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
3430                      plus20b, res4, res5, res6, res7);
        /* (res + 16) >> 5, clamp, pack and unbias */
3431         SRARI_H4_SH(res0, res1, res2, res3, 5);
3432         SRARI_H4_SH(res4, res5, res6, res7, 5);
3433         SAT_SH4_SH(res0, res1, res2, res3, 7);
3434         SAT_SH4_SH(res4, res5, res6, res7, 7);
3435         PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6, vec0, vec1,
3437         XORI_B4_128_SB(vec0, vec1, vec2, vec3);
3438         ST_SB4(vec0, vec1, vec2, vec3, dst, stride);
3439         dst += (4 * stride);
/* 8x8 horizontal half-pel (mc20): loads all 8 rows from src-2, applies
 * the 6-tap filter, rounds (>>5 with rounding), saturates, and packs with
 * PCKEV_XORI128_UB (pack + unbias in one step). No integer-pel average. */
3443 void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
3446     v16u8 out0, out1, out2, out3;
3447     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, mask0, mask1, mask2;
3448     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
3450     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
3451     v16i8 minus5b = __msa_ldi_b(-5);
3452     v16i8 plus20b = __msa_ldi_b(20);
3454     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
3455     LD_SB8(src - 2, stride, src0, src1, src2, src3, src4, src5, src6, src7);
3456     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3457     VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
3458     VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
3459     HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
3460     VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
3461     VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
3462     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3463                  res0, res1, res2, res3);
3464     VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
3465     VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
3466     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
3467                  plus20b, res0, res1, res2, res3);
3468     VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
3469     VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
3470     HADD_SB4_SH(vec0, vec1, vec2, vec3, res4, res5, res6, res7);
3471     VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
3472     VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
3473     DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
3474                  res4, res5, res6, res7);
3475     VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec8, vec9);
3476     VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec10, vec11);
3477     DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
3478                  plus20b, res4, res5, res6, res7);
3479     SRARI_H4_SH(res0, res1, res2, res3, 5);
3480     SRARI_H4_SH(res4, res5, res6, res7, 5);
3481     SAT_SH4_SH(res0, res1, res2, res3, 7);
3482     SAT_SH4_SH(res4, res5, res6, res7, 7);
3483     out0 = PCKEV_XORI128_UB(res0, res1);
3484     out1 = PCKEV_XORI128_UB(res2, res3);
3485     out2 = PCKEV_XORI128_UB(res4, res5);
3486     out3 = PCKEV_XORI128_UB(res6, res7);
3487     ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 4x4 horizontal half-pel (mc20): 6-tap filter on four rows using the
 * 4-pixel-row shuffle masks (luma_mask_arr[48]); result packed + unbiased
 * via PCKEV_XORI128_UB and stored 4 bytes per row. */
3490 void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
3494     v16i8 src0, src1, src2, src3, mask0, mask1, mask2;
3495     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3497     v16i8 minus5b = __msa_ldi_b(-5);
3498     v16i8 plus20b = __msa_ldi_b(20);
3500     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
3501     LD_SB4(src - 2, stride, src0, src1, src2, src3);
3502     XORI_B4_128_SB(src0, src1, src2, src3);
3503     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
3504     HADD_SB2_SH(vec0, vec1, res0, res1);
3505     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
3506     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
3507     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
3508     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
3509     SRARI_H2_SH(res0, res1, 5);
3510     SAT_SH2_SH(res0, res1, 7);
3511     out = PCKEV_XORI128_UB(res0, res1);
3512     ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* 16x16 vertical quarter-pel (mc01): 6-tap vertical filter realized as
 * dot products over byte-interleaved row pairs; filt constants pack tap
 * pairs (0xfb01 = -5,+1; 0x1414 = +20,+20; 0x1fb = +1,-5). The filtered
 * result is averaged with the nearer integer row (src2..src5 in each
 * 4-row iteration) in the signed domain. */
3515 void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
3519     int16_t filt_const0 = 0xfb01;
3520     int16_t filt_const1 = 0x1414;
3521     int16_t filt_const2 = 0x1fb;
3522     v16u8 res0, res1, res2, res3;
3523     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3524     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3525     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3526     v16i8 src65_l, src87_l, filt0, filt1, filt2;
3527     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3529     filt0 = (v16i8) __msa_fill_h(filt_const0);
3530     filt1 = (v16i8) __msa_fill_h(filt_const1);
3531     filt2 = (v16i8) __msa_fill_h(filt_const2);
    /* vertical window needs two rows above the first output row */
3533     src -= (stride * 2);
3535     LD_SB5(src, stride, src0, src1, src2, src3, src4);
3536     src += (5 * stride);
3538     XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* interleave consecutive rows, right and left halves separately */
3539     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3541     ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3544     for (loop_cnt = 4; loop_cnt--;) {
3545         LD_SB4(src, stride, src5, src6, src7, src8);
3546         src += (4 * stride);
3548         XORI_B4_128_SB(src5, src6, src7, src8);
3549         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3550                    src65_r, src76_r, src87_r);
3551         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3552                    src65_l, src76_l, src87_l);
3553         out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3554         out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3555         out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3556         out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3557         out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3558         out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3559         out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3560         out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3561         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3562         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3563         SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3564         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3565         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3566                     out3_r, res0, res1, res2, res3);
        /* average with the centre integer rows (mc01 -> upper neighbour) */
3567         res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
3568         res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
3569         res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
3570         res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
3571         XORI_B4_128_UB(res0, res1, res2, res3);
3572         ST_UB4(res0, res1, res2, res3, dst, stride);
3573         dst += (4 * stride);
/* 16x16 vertical quarter-pel (mc03): same vertical 6-tap pipeline as
 * mc01; the only difference is the averaging rows — src3..src6 instead
 * of src2..src5, i.e. the integer row below the half-pel position. */
3589 void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
3593     int16_t filt_const0 = 0xfb01;
3594     int16_t filt_const1 = 0x1414;
3595     int16_t filt_const2 = 0x1fb;
3596     v16u8 res0, res1, res2, res3;
3597     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3598     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3599     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
3600     v16i8 src65_l, src87_l, filt0, filt1, filt2;
3601     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3603     filt0 = (v16i8) __msa_fill_h(filt_const0);
3604     filt1 = (v16i8) __msa_fill_h(filt_const1);
3605     filt2 = (v16i8) __msa_fill_h(filt_const2);
3607     src -= (stride * 2);
3609     LD_SB5(src, stride, src0, src1, src2, src3, src4);
3610     src += (5 * stride);
3612     XORI_B5_128_SB(src0, src1, src2, src3, src4);
3613     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3615     ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_l, src21_l,
3618     for (loop_cnt = 4; loop_cnt--;) {
3619         LD_SB4(src, stride, src5, src6, src7, src8);
3620         src += (4 * stride);
3622         XORI_B4_128_SB(src5, src6, src7, src8);
3623         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r,
3624                    src65_r, src76_r, src87_r);
3625         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_l,
3626                    src65_l, src76_l, src87_l);
3627         out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3628         out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3629         out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3630         out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3631         out0_l = AVC_DOT_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
3632         out1_l = AVC_DOT_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
3633         out2_l = AVC_DOT_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
3634         out3_l = AVC_DOT_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
3635         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3636         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3637         SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
3638         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
3639         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
3640                     out3_r, res0, res1, res2, res3);
        /* mc03 -> average with the lower integer-row neighbour */
3641         res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
3642         res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
3643         res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
3644         res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
3645         XORI_B4_128_UB(res0, res1, res2, res3);
3646         ST_UB4(res0, res1, res2, res3, dst, stride);
3647         dst += (4 * stride);
/* 8x8 vertical quarter-pel (mc01): unrolled (no loop) — loads 13 rows
 * (5 context + 8), builds all row-pair interleavings, runs 8 vertical
 * 6-tap dot products, then averages with the nearer integer rows
 * (src2..src9 paired two-per-vector via PCKEV_D). */
3662 void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
3665     const int16_t filt_const0 = 0xfb01;
3666     const int16_t filt_const1 = 0x1414;
3667     const int16_t filt_const2 = 0x1fb;
3668     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3669     v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
3670     v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
3671     v16i8 tmp0, tmp1, tmp2, tmp3, filt0, filt1, filt2, out0, out1, out2, out3;
3672     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3674     filt0 = (v16i8) __msa_fill_h(filt_const0);
3675     filt1 = (v16i8) __msa_fill_h(filt_const1);
3676     filt2 = (v16i8) __msa_fill_h(filt_const2);
3678     src -= (stride * 2);
3680     LD_SB5(src, stride, src0, src1, src2, src3, src4);
3681     src += (5 * stride);
3682     LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
3683     XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
3684     XORI_B5_128_SB(src0, src1, src2, src3, src4);
3685     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3687     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3689     ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
3690                src109_r, src1110_r, src1211_r);
3691     out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3692     out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3693     out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3694     out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3695     out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
3696     out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
3697     out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
3698     out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    /* mc01 -> integer rows src2..src9, two rows packed per vector */
3699     PCKEV_D2_SB(src3, src2, src5, src4, tmp0, tmp1);
3700     PCKEV_D2_SB(src7, src6, src9, src8, tmp2, tmp3);
3701     SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3702     SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3703     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3704     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3705     PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3706     PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
3707     out0 = __msa_aver_s_b(out0, tmp0);
3708     out1 = __msa_aver_s_b(out1, tmp1);
3709     out2 = __msa_aver_s_b(out2, tmp2);
3710     out3 = __msa_aver_s_b(out3, tmp3);
3711     XORI_B4_128_SB(out0, out1, out2, out3);
3712     ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 8x8 vertical quarter-pel (mc03): same unrolled pipeline as qpel8_mc01;
 * differs only in the integer rows used for averaging — src3..src10
 * (lower neighbour) instead of src2..src9. */
3715 void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
3718     const int16_t filt_const0 = 0xfb01;
3719     const int16_t filt_const1 = 0x1414;
3720     const int16_t filt_const2 = 0x1fb;
3721     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3722     v16i8 src11, src12, src10_r, src32_r, src54_r, src65_r, src76_r, src98_r;
3723     v16i8 src21_r, src43_r, src87_r, src109_r, src1211_r, src1110_r;
3724     v16i8 filt0, filt1, filt2, out0, out1, out2, out3, tmp0, tmp1, tmp2, tmp3;
3725     v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r, out6_r, out7_r;
3727     filt0 = (v16i8) __msa_fill_h(filt_const0);
3728     filt1 = (v16i8) __msa_fill_h(filt_const1);
3729     filt2 = (v16i8) __msa_fill_h(filt_const2);
3731     src -= (stride * 2);
3733     LD_SB5(src, stride, src0, src1, src2, src3, src4);
3734     src += (5 * stride);
3735     LD_SB8(src, stride, src5, src6, src7, src8, src9, src10, src11, src12);
3736     XORI_B5_128_SB(src0, src1, src2, src3, src4);
3737     XORI_B8_128_SB(src5, src6, src7, src8, src9, src10, src11, src12);
3738     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3740     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3742     ILVR_B4_SB(src9, src8, src10, src9, src11, src10, src12, src11, src98_r,
3743                src109_r, src1110_r, src1211_r);
3744     out0_r = AVC_DOT_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
3745     out1_r = AVC_DOT_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
3746     out2_r = AVC_DOT_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
3747     out3_r = AVC_DOT_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
3748     out4_r = AVC_DOT_SH3_SH(src54_r, src76_r, src98_r, filt0, filt1, filt2);
3749     out5_r = AVC_DOT_SH3_SH(src65_r, src87_r, src109_r, filt0, filt1, filt2);
3750     out6_r = AVC_DOT_SH3_SH(src76_r, src98_r, src1110_r, filt0, filt1, filt2);
3751     out7_r = AVC_DOT_SH3_SH(src87_r, src109_r, src1211_r, filt0, filt1, filt2);
    /* mc03 -> integer rows src3..src10 (one row lower than mc01) */
3752     PCKEV_D2_SB(src4, src3, src6, src5, tmp0, tmp1);
3753     PCKEV_D2_SB(src8, src7, src10, src9, tmp2, tmp3);
3754     SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
3755     SRARI_H4_SH(out4_r, out5_r, out6_r, out7_r, 5);
3756     SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3757     SAT_SH4_SH(out4_r, out5_r, out6_r, out7_r, 7);
3758     PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, out0, out1);
3759     PCKEV_B2_SB(out5_r, out4_r, out7_r, out6_r, out2, out3);
3760     out0 = __msa_aver_s_b(out0, tmp0);
3761     out1 = __msa_aver_s_b(out1, tmp1);
3762     out2 = __msa_aver_s_b(out2, tmp2);
3763     out3 = __msa_aver_s_b(out3, tmp3);
3764     XORI_B4_128_SB(out0, out1, out2, out3);
3765     ST8x8_UB(out0, out1, out2, out3, dst, stride);
/* 4x4 vertical quarter-pel (mc01): 4-wide rows are paired two-per-vector
 * (ILVR_D2) so one dot product covers two output rows; the half-pel
 * result is averaged (unsigned) with integer rows src2..src5 gathered
 * into one vector. */
3768 void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
3771     int16_t filt_const0 = 0xfb01;
3772     int16_t filt_const1 = 0x1414;
3773     int16_t filt_const2 = 0x1fb;
3775     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3776     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
3777     v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
3780     filt0 = (v16i8) __msa_fill_h(filt_const0);
3781     filt1 = (v16i8) __msa_fill_h(filt_const1);
3782     filt2 = (v16i8) __msa_fill_h(filt_const2);
3784     src -= (stride * 2);
3786     LD_SB5(src, stride, src0, src1, src2, src3, src4);
3787     src += (5 * stride);
3788     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
3790     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
3791     XORI_B2_128_SB(src2110, src4332);
3792     LD_SB4(src, stride, src5, src6, src7, src8);
3793     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
3795     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
3796     XORI_B2_128_SB(src6554, src8776);
3797     out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
3798     out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
3799     SRARI_H2_SH(out10, out32, 5);
3800     SAT_SH2_SH(out10, out32, 7);
    /* PCKEV_XORI128_UB already unbiases, so the average below is unsigned */
3801     out = PCKEV_XORI128_UB(out10, out32);
    /* gather integer rows src2..src5, one word per output row */
3802     src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
3803     src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
3804     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
3805     out = __msa_aver_u_b(out, (v16u8) src32_r);
3806     ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
/* Put 4x4 luma, quarter-pel (0, 3): 6-tap vertical half-pel interpolation
 * averaged with the nearer full-pel rows (src3..src6). */
void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    /* H.264 6-tap filter (1, -5, 20, 20, -5, 1) packed two taps per int16. */
    int16_t filt_const0 = 0xfb01;   /* taps (-5, 1)  */
    int16_t filt_const1 = 0x1414;   /* taps (20, 20) */
    int16_t filt_const2 = 0x1fb;    /* taps (1, -5)  */
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
    v16i8 src87_r, src2110, src4332, src6554, src8776, filt0, filt1, filt2;
    v8i16 out10, out32;

    filt0 = (v16i8) __msa_fill_h(filt_const0);
    filt1 = (v16i8) __msa_fill_h(filt_const1);
    filt2 = (v16i8) __msa_fill_h(filt_const2);

    /* Back up two rows: the 6-tap window needs two rows above the output. */
    src -= (stride * 2);

    LD_SB5(src, stride, src0, src1, src2, src3, src4);
    src += (5 * stride);
    /* Interleave adjacent rows so each dot product sees two taps at once. */
    ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3, src10_r, src21_r,
               src32_r, src43_r);
    ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
    XORI_B2_128_SB(src2110, src4332);   /* bias pixels to signed range */
    LD_SB4(src, stride, src5, src6, src7, src8);
    ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7, src54_r, src65_r,
               src76_r, src87_r);
    ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
    XORI_B2_128_SB(src6554, src8776);
    /* 6-tap vertical filter for output rows 0/1 and 2/3. */
    out10 = AVC_DOT_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
    out32 = AVC_DOT_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
    SRARI_H2_SH(out10, out32, 5);   /* round-shift by filter gain (32) */
    SAT_SH2_SH(out10, out32, 7);    /* clamp to signed byte range */
    out = PCKEV_XORI128_UB(out10, out32);
    /* Gather the nearer full-pel rows (src3..src6, one row below mc01). */
    src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
    src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
    src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
    /* Quarter-pel: average half-pel result with the full-pel rows. */
    out = __msa_aver_u_b(out, (v16u8) src32_r);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
}
3850 void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
3853 avc_luma_hv_qrt_16w_msa(src - 2,
3854 src - (stride * 2), stride, dst, stride, 16);
3857 void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
3860 avc_luma_hv_qrt_16w_msa(src - 2,
3861 src - (stride * 2) +
3862 sizeof(uint8_t), stride, dst, stride, 16);
3865 void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
3868 avc_luma_hv_qrt_16w_msa(src + stride - 2,
3869 src - (stride * 2), stride, dst, stride, 16);
3872 void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
3875 avc_luma_hv_qrt_16w_msa(src + stride - 2,
3876 src - (stride * 2) +
3877 sizeof(uint8_t), stride, dst, stride, 16);
3880 void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
3883 avc_luma_hv_qrt_8w_msa(src - 2, src - (stride * 2), stride, dst, stride, 8);
3886 void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
3889 avc_luma_hv_qrt_8w_msa(src - 2,
3890 src - (stride * 2) +
3891 sizeof(uint8_t), stride, dst, stride, 8);
3894 void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
3897 avc_luma_hv_qrt_8w_msa(src + stride - 2,
3898 src - (stride * 2), stride, dst, stride, 8);
3901 void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
3904 avc_luma_hv_qrt_8w_msa(src + stride - 2,
3905 src - (stride * 2) +
3906 sizeof(uint8_t), stride, dst, stride, 8);
3910 void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
3913 avc_luma_hv_qrt_4w_msa(src - 2, src - (stride * 2), stride, dst, stride, 4);
3916 void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
3919 avc_luma_hv_qrt_4w_msa(src - 2,
3920 src - (stride * 2) +
3921 sizeof(uint8_t), stride, dst, stride, 4);
3924 void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
3927 avc_luma_hv_qrt_4w_msa(src + stride - 2,
3928 src - (stride * 2), stride, dst, stride, 4);
3931 void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
3934 avc_luma_hv_qrt_4w_msa(src + stride - 2,
3935 src - (stride * 2) +
3936 sizeof(uint8_t), stride, dst, stride, 4);
3939 void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
3942 avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2,
3943 stride, dst, stride, 16, 0);
3946 void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
3949 avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2,
3950 stride, dst, stride, 16, 1);
3953 void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
3956 avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
3959 void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
3962 avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
3965 void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
3968 avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
3971 void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
3974 avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
3977 void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
3980 avc_luma_vt_16w_msa(src - (stride * 2), stride, dst, stride, 16);
3983 void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
3986 avc_luma_vt_8w_msa(src - (stride * 2), stride, dst, stride, 8);
3989 void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
3992 avc_luma_vt_4w_msa(src - (stride * 2), stride, dst, stride, 4);
3995 void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
3998 avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
3999 stride, dst, stride, 16, 0);
4002 void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
4005 avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
4006 stride, dst, stride, 16, 1);
4009 void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
4012 avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
4015 void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
4018 avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
4021 void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
4024 avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
4027 void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
4030 avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
4033 void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
4036 avc_luma_mid_16w_msa(src - (2 * stride) - 2, stride, dst, stride, 16);
4039 void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
4042 avc_luma_mid_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8);
4045 void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
4048 avc_luma_mid_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4);
4051 void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
4054 avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0);
4057 void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
4060 avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1);
4063 void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
4066 avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0);
4069 void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
4072 avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1);
4075 void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
4078 avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0);
4081 void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
4084 avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1);
4087 void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
4090 avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride);
4093 void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
4096 avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride);
4099 void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
4102 avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride);
4105 void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
4108 avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
4109 stride, dst, stride, 0);
4112 void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
4115 avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
4116 stride, dst, stride, 1);
4119 void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
4122 avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
4123 stride, dst, stride, 0);
4126 void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
4129 avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
4130 stride, dst, stride, 1);
4133 void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
4136 avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
4137 stride, dst, stride, 0);
4140 void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
4143 avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
4144 stride, dst, stride, 1);
4147 void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
4150 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
4152 stride, dst, stride);
4155 void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
4158 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
4159 src - (stride * 2) +
4160 sizeof(uint8_t), stride,
4164 void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
4167 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4169 stride, dst, stride);
4172 void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
4175 avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
4176 src - (stride * 2) +
4177 sizeof(uint8_t), stride,
4181 void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
4184 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
4186 stride, dst, stride);
4189 void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
4192 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
4193 src - (stride * 2) +
4194 sizeof(uint8_t), stride, dst, stride);
4197 void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
4200 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4202 stride, dst, stride);
4205 void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
4208 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
4209 src - (stride * 2) +
4210 sizeof(uint8_t), stride, dst, stride);
4214 void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
4217 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
4219 stride, dst, stride);
4222 void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
4225 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
4226 src - (stride * 2) +
4227 sizeof(uint8_t), stride, dst, stride);
4230 void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
4233 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4235 stride, dst, stride);
4238 void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
4241 avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
4242 src - (stride * 2) +
4243 sizeof(uint8_t), stride, dst, stride);
4246 void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
4249 avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4250 stride, dst, stride, 16, 0);
4253 void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
4256 avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4257 stride, dst, stride, 16, 1);
4260 void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
4263 avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4264 stride, dst, stride, 8, 0);
4267 void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
4270 avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4271 stride, dst, stride, 8, 1);
4274 void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
4277 avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4278 stride, dst, stride, 4, 0);
4281 void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
4284 avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4285 stride, dst, stride, 4, 1);
4288 void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
4291 avc_luma_vt_and_aver_dst_16x16_msa(src - (stride * 2), stride, dst, stride);
4294 void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
4297 avc_luma_vt_and_aver_dst_8x8_msa(src - (stride * 2), stride, dst, stride);
4300 void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
4303 avc_luma_vt_and_aver_dst_4x4_msa(src - (stride * 2), stride, dst, stride);
4306 void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
4309 avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4310 stride, dst, stride, 16, 0);
4313 void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
4316 avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
4317 stride, dst, stride, 16, 1);
4320 void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
4323 avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4324 stride, dst, stride, 8, 0);
4327 void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
4330 avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4331 stride, dst, stride, 8, 1);
4334 void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
4337 avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4338 stride, dst, stride, 4, 0);
4341 void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
4344 avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
4345 stride, dst, stride, 4, 1);
4348 void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
4351 avc_luma_mid_and_aver_dst_16x16_msa(src - (2 * stride) - 2,
4352 stride, dst, stride);
4355 void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
4358 avc_luma_mid_and_aver_dst_8w_msa(src - (2 * stride) - 2,
4359 stride, dst, stride, 8);
4362 void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
4365 avc_luma_mid_and_aver_dst_4x4_msa(src - (2 * stride) - 2,
4366 stride, dst, stride);