2 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "h264dsp_mips.h"
/* Apply the H.264 6-tap filter (1, -5, 20, 20, -5, 1) to six v8i16 inputs
 * (in0..in5) in 32-bit precision: pair (in0,in5) via hadd, pair (in1,in4)
 * weighted by -5, pair (in2,in3) weighted by +20. Result is rounded with
 * srari by 10 and saturated to 7 bits, then packed back to halfwords.
 * NOTE(review): extraction dropped some lines of this macro (gaps in the
 * embedded numbering); visible code kept byte-identical. */
24 #define AVC_CALC_DPADD_H_6PIX_2COEFF_SH(in0, in1, in2, in3, in4, in5) \
26 v4i32 tmp0_m, tmp1_m; \
27 v8i16 out0_m, out1_m, out2_m, out3_m; \
28 v8i16 minus5h_m = __msa_ldi_h(-5); \
29 v8i16 plus20h_m = __msa_ldi_h(20); \
31 ILVRL_H2_SW(in5, in0, tmp0_m, tmp1_m); \
33 tmp0_m = __msa_hadd_s_w((v8i16) tmp0_m, (v8i16) tmp0_m); \
34 tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
36 ILVRL_H2_SH(in1, in4, out0_m, out1_m); \
37 DPADD_SH2_SW(out0_m, out1_m, minus5h_m, minus5h_m, tmp0_m, tmp1_m); \
38 ILVRL_H2_SH(in2, in3, out2_m, out3_m); \
39 DPADD_SH2_SW(out2_m, out3_m, plus20h_m, plus20h_m, tmp0_m, tmp1_m); \
41 SRARI_W2_SW(tmp0_m, tmp1_m, 10); \
42 SAT_SW2_SW(tmp0_m, tmp1_m, 7); \
43 out0_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp0_m); \
/* Horizontal 6-tap filter over one v16i8 source vector: three vshf masks
 * gather the tap pairs; (mask0 pair) hadd + (mask1 pair) * -5 +
 * (mask2 pair) * +20, accumulated as signed halfwords in out1_m.
 * NOTE(review): some macro lines were elided by extraction; visible code
 * kept byte-identical. */
48 #define AVC_HORZ_FILTER_SH(in, mask0, mask1, mask2) \
50 v8i16 out0_m, out1_m; \
51 v16i8 tmp0_m, tmp1_m; \
52 v16i8 minus5b = __msa_ldi_b(-5); \
53 v16i8 plus20b = __msa_ldi_b(20); \
55 tmp0_m = __msa_vshf_b((v16i8) mask0, in, in); \
56 out0_m = __msa_hadd_s_h(tmp0_m, tmp0_m); \
58 tmp0_m = __msa_vshf_b((v16i8) mask1, in, in); \
59 out0_m = __msa_dpadd_s_h(out0_m, minus5b, tmp0_m); \
61 tmp1_m = __msa_vshf_b((v16i8) (mask2), in, in); \
62 out1_m = __msa_dpadd_s_h(out0_m, plus20b, tmp1_m); \
/* Byte-shuffle mask table for the vshf-based horizontal 6-tap filters.
 * Loaded 16 bytes at a time via LD_SB3(&luma_mask_arr[0], ...) for the
 * 8/16-wide paths and LD_SB3(&luma_mask_arr[48], ...) for the 4-wide
 * path (see callers below); the entries at [96]/[112] are used as the
 * sliding masks in avc_luma_hz_qrt_16w_msa. */
67 static const uint8_t luma_mask_arr[16 * 8] = {
    /* masks gathering tap pairs from a single 16-byte row (8/16-wide) */
69 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12,
70 1, 4, 2, 5, 3, 6, 4, 7, 5, 8, 6, 9, 7, 10, 8, 11,
71 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10,
    /* masks gathering tap pairs from two rows at once (4-wide path);
     * indices >= 16 select from the second source operand of vshf */
74 0, 5, 1, 6, 2, 7, 3, 8, 16, 21, 17, 22, 18, 23, 19, 24,
75 1, 4, 2, 5, 3, 6, 4, 7, 17, 20, 18, 21, 19, 22, 20, 23,
76 2, 3, 3, 4, 4, 5, 5, 6, 18, 19, 19, 20, 20, 21, 21, 22,
    /* sliding masks selecting the nearest integer-pel samples for the
     * quarter-pel averaging in avc_luma_hz_qrt_16w_msa */
78 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25,
79 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 24, 25, 26
/* Byte-domain 6-tap filter over six v16i8 vectors (vec0..vec5),
 * accumulating into two v8i16 outputs: (vec0,vec5) hadd, (vec1,vec4)
 * * -5, (vec2,vec3) * +20. No rounding/saturation here; callers do it.
 * NOTE(review): output parameter names and braces were elided by
 * extraction; visible code kept byte-identical. */
82 #define AVC_CALC_DPADD_B_6PIX_2COEFF_SH(vec0, vec1, vec2, vec3, vec4, vec5, \
85 v16i8 tmp0_m, tmp1_m; \
86 v16i8 minus5b_m = __msa_ldi_b(-5); \
87 v16i8 plus20b_m = __msa_ldi_b(20); \
89 ILVRL_B2_SB(vec5, vec0, tmp0_m, tmp1_m); \
90 HADD_SB2_SH(tmp0_m, tmp1_m, out1, out2); \
91 ILVRL_B2_SB(vec4, vec1, tmp0_m, tmp1_m); \
92 DPADD_SB2_SH(tmp0_m, tmp1_m, minus5b_m, minus5b_m, out1, out2); \
93 ILVRL_B2_SB(vec3, vec2, tmp0_m, tmp1_m); \
94 DPADD_SB2_SH(tmp0_m, tmp1_m, plus20b_m, plus20b_m, out1, out2); \
/* Right-half-only variant of the byte-domain 6-tap filter: interleaves
 * only the low (right) halves of the six inputs and accumulates the
 * -5 / +20 weighted pairs into a single v8i16 (tmp1_m).
 * NOTE(review): the tmp1_m declaration and result line were elided by
 * extraction; visible code kept byte-identical. */
97 #define AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
100 v16i8 tmp0_m, tmp2_m; \
101 v16i8 minus5b_m = __msa_ldi_b(-5); \
102 v16i8 plus20b_m = __msa_ldi_b(20); \
104 tmp1_m = (v8i16) __msa_ilvr_b((v16i8) vec5, (v16i8) vec0); \
105 tmp1_m = __msa_hadd_s_h((v16i8) tmp1_m, (v16i8) tmp1_m); \
107 ILVR_B2_SB(vec4, vec1, vec3, vec2, tmp0_m, tmp2_m); \
108 DPADD_SB2_SH(tmp0_m, tmp2_m, minus5b_m, plus20b_m, tmp1_m, tmp1_m); \
/* Right-half-only halfword 6-tap filter in 32-bit precision: (vec0,vec5)
 * hadd, (vec1,vec4) * -5, (vec2,vec3) * +20, then srari by 10, saturate
 * to 7 bits and pack even halfwords (tmp2_m holds the result).
 * NOTE(review): the tmp1_m declaration and surrounding lines were elided
 * by extraction; visible code kept byte-identical. */
113 #define AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(vec0, vec1, vec2, vec3, vec4, vec5) \
116 v8i16 tmp2_m, tmp3_m; \
117 v8i16 minus5h_m = __msa_ldi_h(-5); \
118 v8i16 plus20h_m = __msa_ldi_h(20); \
120 tmp1_m = (v4i32) __msa_ilvr_h((v8i16) vec5, (v8i16) vec0); \
121 tmp1_m = __msa_hadd_s_w((v8i16) tmp1_m, (v8i16) tmp1_m); \
123 ILVR_H2_SH(vec1, vec4, vec2, vec3, tmp2_m, tmp3_m); \
124 DPADD_SH2_SW(tmp2_m, tmp3_m, minus5h_m, plus20h_m, tmp1_m, tmp1_m); \
126 tmp1_m = __msa_srari_w(tmp1_m, 10); \
127 tmp1_m = __msa_sat_s_w(tmp1_m, 7); \
129 tmp2_m = __msa_pckev_h((v8i16) tmp1_m, (v8i16) tmp1_m); \
/* Horizontal 6-tap filter over a pair of (already sign-flipped) source
 * vectors: mask0 pair hadd, mask1 pair * -5, mask2 pair * +20,
 * accumulated into hz_out_m. Used by the "mid" 4-wide interpolator.
 * NOTE(review): the hz_out_m declaration and macro braces were elided by
 * extraction; visible code kept byte-identical. */
134 #define AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1, \
135 mask0, mask1, mask2) \
138 v16i8 vec0_m, vec1_m, vec2_m; \
139 v16i8 minus5b_m = __msa_ldi_b(-5); \
140 v16i8 plus20b_m = __msa_ldi_b(20); \
142 vec0_m = __msa_vshf_b((v16i8) mask0, (v16i8) src1, (v16i8) src0); \
143 hz_out_m = __msa_hadd_s_h(vec0_m, vec0_m); \
145 VSHF_B2_SB(src0, src1, src0, src1, mask1, mask2, vec1_m, vec2_m); \
146 DPADD_SB2_SH(vec1_m, vec2_m, minus5b_m, plus20b_m, hz_out_m, hz_out_m); \
/* 4-column-wide luma horizontal half-pel interpolation (H.264 6-tap
 * 1,-5,20,20,-5,1), MSA. Four rows per loop iteration; intermediate sums
 * are rounded with srari by 5 and saturated to 7 bits before packing.
 * NOTE(review): several declaration lines and braces were elided by
 * extraction; visible code kept byte-identical. */
151 static void avc_luma_hz_4w_msa(const uint8_t *src, int32_t src_stride,
152 uint8_t *dst, int32_t dst_stride,
156 v16i8 src0, src1, src2, src3;
159 v16i8 mask0, mask1, mask2;
160 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
161 v16i8 minus5b = __msa_ldi_b(-5);
162 v16i8 plus20b = __msa_ldi_b(20);
    /* masks at offset 48 gather tap pairs from two rows at once */
164 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
165 for (loop_cnt = (height >> 2); loop_cnt--;) {
166 LD_SB4(src, src_stride, src0, src1, src2, src3);
167 src += (4 * src_stride);
    /* flip sign bit so unsigned pixels can be used in signed dot products */
169 XORI_B4_128_SB(src0, src1, src2, src3);
170 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
171 HADD_SB2_SH(vec0, vec1, res0, res1);
172 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
173 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
174 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
175 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
176 SRARI_H2_SH(res0, res1, 5);
177 SAT_SH2_SH(res0, res1, 7);
178 out = PCKEV_XORI128_UB(res0, res1);
179 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
180 dst += (4 * dst_stride);
/* 8-column-wide luma horizontal half-pel interpolation (6-tap), MSA.
 * Each row is filtered independently (mask pairs from offset 0);
 * srari-by-5 rounding + 7-bit saturation, four rows per iteration.
 * NOTE(review): loop_cnt/out declarations and braces were elided by
 * extraction; visible code kept byte-identical. */
184 static void avc_luma_hz_8w_msa(const uint8_t *src, int32_t src_stride,
185 uint8_t *dst, int32_t dst_stride,
189 v16i8 src0, src1, src2, src3;
190 v8i16 res0, res1, res2, res3;
191 v16i8 mask0, mask1, mask2;
192 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
193 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
194 v16i8 minus5b = __msa_ldi_b(-5);
195 v16i8 plus20b = __msa_ldi_b(20);
198 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
200 for (loop_cnt = (height >> 2); loop_cnt--;) {
201 LD_SB4(src, src_stride, src0, src1, src2, src3);
202 src += (4 * src_stride);
    /* sign-flip, then three shuffle+dot-product passes: +1, -5, +20 taps */
204 XORI_B4_128_SB(src0, src1, src2, src3);
205 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
206 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
207 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
208 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
209 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
210 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
211 res0, res1, res2, res3);
212 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
213 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
214 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
215 plus20b, res0, res1, res2, res3);
216 SRARI_H4_SH(res0, res1, res2, res3, 5);
217 SAT_SH4_SH(res0, res1, res2, res3, 7);
218 out0 = PCKEV_XORI128_UB(res0, res1);
219 out1 = PCKEV_XORI128_UB(res2, res3);
220 ST8x4_UB(out0, out1, dst, dst_stride);
221 dst += (4 * dst_stride);
/* 16-column-wide luma horizontal half-pel interpolation (6-tap), MSA.
 * Each output row needs two overlapping 16-byte loads (LD_SB2 with
 * offset 8); four rows are produced per loop iteration.
 * NOTE(review): loop_cnt declaration, src-advance lines and braces were
 * elided by extraction; visible code kept byte-identical. */
225 static void avc_luma_hz_16w_msa(const uint8_t *src, int32_t src_stride,
226 uint8_t *dst, int32_t dst_stride,
230 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
231 v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
232 v16i8 mask0, mask1, mask2;
233 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
234 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
235 v16i8 minus5b = __msa_ldi_b(-5);
236 v16i8 plus20b = __msa_ldi_b(20);
238 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
240 for (loop_cnt = (height >> 2); loop_cnt--;) {
    /* first two rows: two overlapping 16-byte loads per row */
241 LD_SB2(src, 8, src0, src1);
243 LD_SB2(src, 8, src2, src3);
246 XORI_B4_128_SB(src0, src1, src2, src3);
247 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
248 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
249 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
250 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
251 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
252 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
253 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
254 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
255 minus5b, res0, res1, res2, res3);
256 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
257 plus20b, res0, res1, res2, res3);
    /* last two rows of the 4-row group */
259 LD_SB2(src, 8, src4, src5);
261 LD_SB2(src, 8, src6, src7);
264 XORI_B4_128_SB(src4, src5, src6, src7);
265 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
266 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
267 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
268 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
269 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
270 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
271 HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
272 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
273 minus5b, res4, res5, res6, res7);
274 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
275 plus20b, res4, res5, res6, res7);
276 SRARI_H4_SH(res0, res1, res2, res3, 5);
277 SRARI_H4_SH(res4, res5, res6, res7, 5);
278 SAT_SH4_SH(res0, res1, res2, res3, 7);
279 SAT_SH4_SH(res4, res5, res6, res7, 7);
280 PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
281 vec0, vec1, vec2, vec3);
    /* undo the sign flip before storing unsigned pixels */
282 XORI_B4_128_SB(vec0, vec1, vec2, vec3);
284 ST_SB4(vec0, vec1, vec2, vec3, dst, dst_stride);
285 dst += (4 * dst_stride);
/* 4-wide luma horizontal quarter-pel: half-pel 6-tap result averaged
 * with the nearest integer sample, selected by sliding the source by
 * (2 + hor_offset) bytes. hor_offset chooses the left/right quarter
 * position.
 * NOTE(review): loop_cnt/slide/res0/res1 declarations and braces were
 * elided by extraction; visible code kept byte-identical. */
289 static void avc_luma_hz_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
290 uint8_t *dst, int32_t dst_stride,
291 int32_t height, uint8_t hor_offset)
295 v16i8 src0, src1, src2, src3;
297 v16i8 res, mask0, mask1, mask2;
298 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
299 v16i8 minus5b = __msa_ldi_b(-5);
300 v16i8 plus20b = __msa_ldi_b(20);
302 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
    /* byte offset of the integer-pel sample to average with */
303 slide = 2 + hor_offset;
305 for (loop_cnt = (height >> 2); loop_cnt--;) {
306 LD_SB4(src, src_stride, src0, src1, src2, src3);
307 src += (4 * src_stride);
309 XORI_B4_128_SB(src0, src1, src2, src3);
310 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
311 HADD_SB2_SH(vec0, vec1, res0, res1);
312 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
313 DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
314 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
315 DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
316 SRARI_H2_SH(res0, res1, 5);
317 SAT_SH2_SH(res0, res1, 7);
319 res = __msa_pckev_b((v16i8) res1, (v16i8) res0);
    /* gather the 4 integer-pel samples of each row into one vector */
320 src0 = __msa_sld_b(src0, src0, slide);
321 src1 = __msa_sld_b(src1, src1, slide);
322 src2 = __msa_sld_b(src2, src2, slide);
323 src3 = __msa_sld_b(src3, src3, slide);
324 src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
325 src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
326 src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
    /* quarter-pel = rounded average of half-pel and integer-pel */
327 res = __msa_aver_s_b(res, src0);
328 res = (v16i8) __msa_xori_b((v16u8) res, 128);
330 ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
331 dst += (4 * dst_stride);
/* 8-wide luma horizontal quarter-pel: 6-tap half-pel result averaged
 * with the integer-pel sample slid by (2 + hor_offset) bytes.
 * NOTE(review): loop_cnt/slide/tmp0/tmp1 declarations and braces were
 * elided by extraction; visible code kept byte-identical. */
335 static void avc_luma_hz_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
336 uint8_t *dst, int32_t dst_stride,
337 int32_t height, uint8_t hor_offset)
341 v16i8 src0, src1, src2, src3;
343 v8i16 res0, res1, res2, res3;
344 v16i8 mask0, mask1, mask2;
345 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
346 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
347 v16i8 minus5b = __msa_ldi_b(-5);
348 v16i8 plus20b = __msa_ldi_b(20);
350 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
351 slide = 2 + hor_offset;
353 for (loop_cnt = height >> 2; loop_cnt--;) {
354 LD_SB4(src, src_stride, src0, src1, src2, src3);
355 src += (4 * src_stride);
357 XORI_B4_128_SB(src0, src1, src2, src3);
358 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
359 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
360 HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
361 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
362 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
363 DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
364 res0, res1, res2, res3);
365 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
366 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
367 DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
368 plus20b, res0, res1, res2, res3);
    /* shift each row to the integer-pel sample used for averaging */
370 src0 = __msa_sld_b(src0, src0, slide);
371 src1 = __msa_sld_b(src1, src1, slide);
372 src2 = __msa_sld_b(src2, src2, slide);
373 src3 = __msa_sld_b(src3, src3, slide);
375 SRARI_H4_SH(res0, res1, res2, res3, 5);
376 SAT_SH4_SH(res0, res1, res2, res3, 7);
377 PCKEV_B2_SB(res1, res0, res3, res2, tmp0, tmp1);
378 PCKEV_D2_SB(src1, src0, src3, src2, src0, src1);
380 tmp0 = __msa_aver_s_b(tmp0, src0);
381 tmp1 = __msa_aver_s_b(tmp1, src1);
    /* undo sign flip before store */
383 XORI_B2_128_SB(tmp0, tmp1);
384 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
386 dst += (4 * dst_stride);
/* 16-wide luma horizontal quarter-pel: half-pel 6-tap result averaged
 * with integer-pel samples selected via the sliding masks at
 * luma_mask_arr[96] / [96+16]; the choice between the two masks depends
 * on the quarter-pel position (selection branch elided by extraction).
 * NOTE(review): loop_cnt/dst0/dst1 declarations, the hor_offset
 * conditional and braces were elided; visible code kept byte-identical. */
390 static void avc_luma_hz_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
391 uint8_t *dst, int32_t dst_stride,
392 int32_t height, uint8_t hor_offset)
396 v16i8 src0, src1, src2, src3;
397 v16i8 mask0, mask1, mask2, vshf;
398 v8i16 res0, res1, res2, res3;
399 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
400 v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
401 v16i8 minus5b = __msa_ldi_b(-5);
402 v16i8 plus20b = __msa_ldi_b(20);
404 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
    /* one of the two sliding masks is picked per hor_offset */
407 vshf = LD_SB(&luma_mask_arr[16 + 96]);
409 vshf = LD_SB(&luma_mask_arr[96]);
412 for (loop_cnt = height >> 1; loop_cnt--;) {
413 LD_SB2(src, 8, src0, src1);
415 LD_SB2(src, 8, src2, src3);
418 XORI_B4_128_SB(src0, src1, src2, src3);
419 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
420 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
421 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
422 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
423 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
424 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
425 HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
426 DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
427 minus5b, res0, res1, res2, res3);
428 DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
429 plus20b, res0, res1, res2, res3);
    /* build the integer-pel rows used for quarter-pel averaging */
430 VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
431 SRARI_H4_SH(res0, res1, res2, res3, 5);
432 SAT_SH4_SH(res0, res1, res2, res3, 7);
433 PCKEV_B2_SB(res1, res0, res3, res2, dst0, dst1);
435 dst0 = __msa_aver_s_b(dst0, src0);
436 dst1 = __msa_aver_s_b(dst1, src2);
438 XORI_B2_128_SB(dst0, dst1);
440 ST_SB2(dst0, dst1, dst, dst_stride);
441 dst += (2 * dst_stride);
/* 4-wide luma vertical half-pel interpolation (6-tap), MSA. The filter
 * taps are packed as byte pairs in halfwords: 0xfb01 = {1, -5},
 * 0x1414 = {20, 20}, 0x01fb = {-5, 1}; pairs of interleaved rows are
 * dot-product-accumulated. 5 leading rows are primed before the loop,
 * then 4 output rows per iteration (rows packed two-per-vector for the
 * 4-wide case via ILVR_D2).
 * NOTE(review): loop_cnt/out10/out32/out declarations, src4 carry update
 * and braces were elided by extraction; visible code kept byte-identical. */
445 static void avc_luma_vt_4w_msa(const uint8_t *src, int32_t src_stride,
446 uint8_t *dst, int32_t dst_stride,
450 int16_t filt_const0 = 0xfb01;
451 int16_t filt_const1 = 0x1414;
452 int16_t filt_const2 = 0x1fb;
453 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
454 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
455 v16i8 src87_r, src2110, src4332, src6554, src8776;
456 v16i8 filt0, filt1, filt2;
460 filt0 = (v16i8) __msa_fill_h(filt_const0);
461 filt1 = (v16i8) __msa_fill_h(filt_const1);
462 filt2 = (v16i8) __msa_fill_h(filt_const2);
    /* prime the first five rows of the vertical filter window */
464 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
465 src += (5 * src_stride);
467 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
468 src10_r, src21_r, src32_r, src43_r);
469 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
470 XORI_B2_128_SB(src2110, src4332);
472 for (loop_cnt = (height >> 2); loop_cnt--;) {
473 LD_SB4(src, src_stride, src5, src6, src7, src8);
474 src += (4 * src_stride);
476 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
477 src54_r, src65_r, src76_r, src87_r);
478 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
479 XORI_B2_128_SB(src6554, src8776);
480 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
481 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
482 SRARI_H2_SH(out10, out32, 5);
483 SAT_SH2_SH(out10, out32, 7);
484 out = PCKEV_XORI128_UB(out10, out32);
485 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
487 dst += (4 * dst_stride);
/* 8-wide luma vertical half-pel interpolation (6-tap), MSA. Same packed
 * byte-pair tap scheme as the 4-wide path ({1,-5}, {20,20}, {-5,1});
 * 5 rows primed, 4 output rows per iteration.
 * NOTE(review): loop_cnt/out0/out1 declarations, sliding-window carry
 * updates and braces were elided by extraction; visible code kept
 * byte-identical. */
494 static void avc_luma_vt_8w_msa(const uint8_t *src, int32_t src_stride,
495 uint8_t *dst, int32_t dst_stride,
499 int16_t filt_const0 = 0xfb01;
500 int16_t filt_const1 = 0x1414;
501 int16_t filt_const2 = 0x1fb;
502 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
503 v16i8 src10_r, src32_r, src76_r, src98_r;
504 v16i8 src21_r, src43_r, src87_r, src109_r;
505 v8i16 out0_r, out1_r, out2_r, out3_r;
506 v16i8 filt0, filt1, filt2;
509 filt0 = (v16i8) __msa_fill_h(filt_const0);
510 filt1 = (v16i8) __msa_fill_h(filt_const1);
511 filt2 = (v16i8) __msa_fill_h(filt_const2);
513 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
514 src += (5 * src_stride);
516 XORI_B5_128_SB(src0, src1, src2, src3, src4);
517 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
518 src10_r, src21_r, src32_r, src43_r);
520 for (loop_cnt = (height >> 2); loop_cnt--;) {
521 LD_SB4(src, src_stride, src7, src8, src9, src10);
522 src += (4 * src_stride);
524 XORI_B4_128_SB(src7, src8, src9, src10);
525 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
526 src76_r, src87_r, src98_r, src109_r);
    /* each output row: dot products over three interleaved row pairs */
527 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
528 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
529 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
530 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
531 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
532 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
533 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
534 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
535 ST8x4_UB(out0, out1, dst, dst_stride);
536 dst += (4 * dst_stride);
/* 16-wide luma vertical half-pel interpolation (6-tap), MSA. Processes
 * both register halves: ILVR_* covers the low 8 columns, ILVL_* the
 * high 8; 5 rows primed, 4 output rows per iteration.
 * NOTE(review): loop_cnt declaration, sliding-window carry updates and
 * braces were elided by extraction; visible code kept byte-identical. */
546 static void avc_luma_vt_16w_msa(const uint8_t *src, int32_t src_stride,
547 uint8_t *dst, int32_t dst_stride,
551 int16_t filt_const0 = 0xfb01;
552 int16_t filt_const1 = 0x1414;
553 int16_t filt_const2 = 0x1fb;
554 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
555 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
556 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
557 v16i8 src65_l, src87_l;
558 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
559 v16u8 res0, res1, res2, res3;
560 v16i8 filt0, filt1, filt2;
562 filt0 = (v16i8) __msa_fill_h(filt_const0);
563 filt1 = (v16i8) __msa_fill_h(filt_const1);
564 filt2 = (v16i8) __msa_fill_h(filt_const2);
566 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
567 src += (5 * src_stride);
569 XORI_B5_128_SB(src0, src1, src2, src3, src4);
570 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
571 src10_r, src21_r, src32_r, src43_r);
572 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
573 src10_l, src21_l, src32_l, src43_l);
575 for (loop_cnt = (height >> 2); loop_cnt--;) {
576 LD_SB4(src, src_stride, src5, src6, src7, src8);
577 src += (4 * src_stride);
579 XORI_B4_128_SB(src5, src6, src7, src8);
580 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
581 src54_r, src65_r, src76_r, src87_r);
582 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
583 src54_l, src65_l, src76_l, src87_l);
    /* low-half and high-half columns filtered independently */
584 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
585 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
586 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
587 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
588 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
589 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
590 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
591 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
592 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
593 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
594 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
595 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
596 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
597 out3_r, res0, res1, res2, res3);
598 XORI_B4_128_UB(res0, res1, res2, res3);
600 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
601 dst += (4 * dst_stride);
/* 4-wide luma vertical quarter-pel: vertical 6-tap half-pel result
 * averaged with the nearest integer-pel row; ver_offset selects the
 * upper/lower quarter position (the selection branch was elided by
 * extraction — both insve_w sequences below belong to its two arms).
 * NOTE(review): loop_cnt/out10/out32/out declarations, the ver_offset
 * conditional, carry updates and braces were elided; visible code kept
 * byte-identical. */
615 static void avc_luma_vt_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
616 uint8_t *dst, int32_t dst_stride,
617 int32_t height, uint8_t ver_offset)
620 int16_t filt_const0 = 0xfb01;
621 int16_t filt_const1 = 0x1414;
622 int16_t filt_const2 = 0x1fb;
623 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
624 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
625 v16i8 src87_r, src2110, src4332, src6554, src8776;
627 v16i8 filt0, filt1, filt2;
630 filt0 = (v16i8) __msa_fill_h(filt_const0);
631 filt1 = (v16i8) __msa_fill_h(filt_const1);
632 filt2 = (v16i8) __msa_fill_h(filt_const2);
634 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
635 src += (5 * src_stride);
637 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
638 src10_r, src21_r, src32_r, src43_r);
639 ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
640 XORI_B2_128_SB(src2110, src4332);
642 for (loop_cnt = (height >> 2); loop_cnt--;) {
643 LD_SB4(src, src_stride, src5, src6, src7, src8);
644 src += (4 * src_stride);
646 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
647 src54_r, src65_r, src76_r, src87_r);
648 ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
649 XORI_B2_128_SB(src6554, src8776);
650 out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
651 out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
652 SRARI_H2_SH(out10, out32, 5);
653 SAT_SH2_SH(out10, out32, 7);
655 out = PCKEV_XORI128_UB(out10, out32);
    /* pack the four integer-pel rows chosen by ver_offset for averaging */
658 src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
659 src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
661 src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
662 src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
665 src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
666 out = __msa_aver_u_b(out, (v16u8) src32_r);
668 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
669 dst += (4 * dst_stride);
/* 8-wide luma vertical quarter-pel: vertical 6-tap half-pel result
 * averaged with integer-pel rows; the two PCKEV_D2_SB variants below
 * belong to the two arms of the elided ver_offset conditional.
 * NOTE(review): loop_cnt/res0/res1 declarations, the ver_offset
 * conditional, carry updates and braces were elided by extraction;
 * visible code kept byte-identical. */
678 static void avc_luma_vt_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
679 uint8_t *dst, int32_t dst_stride,
680 int32_t height, uint8_t ver_offset)
683 int16_t filt_const0 = 0xfb01;
684 int16_t filt_const1 = 0x1414;
685 int16_t filt_const2 = 0x1fb;
686 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
687 v16i8 src10_r, src32_r, src76_r, src98_r;
688 v16i8 src21_r, src43_r, src87_r, src109_r;
689 v8i16 out0_r, out1_r, out2_r, out3_r;
691 v16i8 filt0, filt1, filt2;
693 filt0 = (v16i8) __msa_fill_h(filt_const0);
694 filt1 = (v16i8) __msa_fill_h(filt_const1);
695 filt2 = (v16i8) __msa_fill_h(filt_const2);
697 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
698 src += (5 * src_stride);
700 XORI_B5_128_SB(src0, src1, src2, src3, src4);
701 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
702 src10_r, src21_r, src32_r, src43_r);
704 for (loop_cnt = (height >> 2); loop_cnt--;) {
705 LD_SB4(src, src_stride, src7, src8, src9, src10);
706 src += (4 * src_stride);
708 XORI_B4_128_SB(src7, src8, src9, src10);
709 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
710 src76_r, src87_r, src98_r, src109_r);
711 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
712 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
713 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
714 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
715 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
716 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
717 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
    /* pack integer-pel rows (selection per ver_offset) for averaging */
720 PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
722 PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
725 res0 = __msa_aver_s_b(res0, (v16i8) src10_r);
726 res1 = __msa_aver_s_b(res1, (v16i8) src32_r);
728 XORI_B2_128_SB(res0, res1);
729 ST8x4_UB(res0, res1, dst, dst_stride);
731 dst += (4 * dst_stride);
/* 16-wide luma vertical quarter-pel: full-width vertical 6-tap half-pel
 * result (low and high register halves) averaged with integer-pel rows;
 * the two groups of four aver_s_b calls belong to the two arms of the
 * elided ver_offset conditional.
 * NOTE(review): loop_cnt declaration, the ver_offset conditional, carry
 * updates and braces were elided by extraction; visible code kept
 * byte-identical. */
742 static void avc_luma_vt_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
743 uint8_t *dst, int32_t dst_stride,
744 int32_t height, uint8_t ver_offset)
747 int16_t filt_const0 = 0xfb01;
748 int16_t filt_const1 = 0x1414;
749 int16_t filt_const2 = 0x1fb;
750 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
751 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
752 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
753 v16i8 src65_l, src87_l;
754 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
755 v16u8 res0, res1, res2, res3;
756 v16i8 filt0, filt1, filt2;
758 filt0 = (v16i8) __msa_fill_h(filt_const0);
759 filt1 = (v16i8) __msa_fill_h(filt_const1);
760 filt2 = (v16i8) __msa_fill_h(filt_const2);
762 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
763 src += (5 * src_stride);
765 XORI_B5_128_SB(src0, src1, src2, src3, src4);
766 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
767 src10_r, src21_r, src32_r, src43_r);
768 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
769 src10_l, src21_l, src32_l, src43_l);
771 for (loop_cnt = (height >> 2); loop_cnt--;) {
772 LD_SB4(src, src_stride, src5, src6, src7, src8);
773 src += (4 * src_stride);
775 XORI_B4_128_SB(src5, src6, src7, src8);
776 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
777 src54_r, src65_r, src76_r, src87_r);
778 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
779 src54_l, src65_l, src76_l, src87_l);
780 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
781 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
782 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
783 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
784 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
785 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
786 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
787 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
788 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
789 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
790 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
791 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
792 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
793 out3_r, res0, res1, res2, res3);
    /* average with the integer-pel row picked per ver_offset */
796 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src3);
797 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src4);
798 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src5);
799 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src6);
801 res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src2);
802 res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src3);
803 res2 = (v16u8) __msa_aver_s_b((v16i8) res2, src4);
804 res3 = (v16u8) __msa_aver_s_b((v16i8) res3, src5);
807 XORI_B4_128_UB(res0, res1, res2, res3);
808 ST_UB4(res0, res1, res2, res3, dst, dst_stride);
810 dst += (4 * dst_stride);
/* 4-wide luma center (half-pel in both directions) interpolation:
 * horizontal 6-tap first (hz_out*), then vertical 6-tap over the
 * halfword intermediates with srari-by-10 rounding inside
 * AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH.
 * NOTE(review): loop_cnt declaration, mask arguments of the two inner
 * filter invocations, carry updates and braces were elided by
 * extraction; visible code kept byte-identical. */
826 static void avc_luma_mid_4w_msa(const uint8_t *src, int32_t src_stride,
827 uint8_t *dst, int32_t dst_stride,
831 v16i8 src0, src1, src2, src3, src4;
832 v16i8 mask0, mask1, mask2;
833 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
834 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
835 v8i16 dst0, dst1, dst2, dst3;
837 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
838 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
839 src += (5 * src_stride);
841 XORI_B5_128_SB(src0, src1, src2, src3, src4);
    /* prime 5 horizontally-filtered rows before the vertical pass */
843 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
844 mask0, mask1, mask2);
845 hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
846 mask0, mask1, mask2);
848 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
850 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
852 for (loop_cnt = (height >> 2); loop_cnt--;) {
853 LD_SB4(src, src_stride, src0, src1, src2, src3);
854 src += (4 * src_stride);
856 XORI_B4_128_SB(src0, src1, src2, src3);
858 hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
861 hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
865 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
    /* vertical 6-tap over six consecutive horizontal intermediates */
867 dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
868 hz_out3, hz_out4, hz_out5);
869 dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
870 hz_out4, hz_out5, hz_out6);
871 dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
872 hz_out5, hz_out6, hz_out7);
873 dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
874 hz_out6, hz_out7, hz_out8);
876 PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
877 XORI_B2_128_SB(src0, src1);
879 ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
881 dst += (4 * dst_stride);
/* 8-wide luma center (half-pel both directions) interpolation:
 * per-row horizontal 6-tap via AVC_HORZ_FILTER_SH, then vertical 6-tap
 * with srari-by-10 in AVC_CALC_DPADD_H_6PIX_2COEFF_SH. Also reused by
 * avc_luma_mid_16w_msa for each 8-column half.
 * NOTE(review): loop_cnt/out0/out1 declarations, carry updates and
 * braces were elided by extraction; visible code kept byte-identical. */
891 static void avc_luma_mid_8w_msa(const uint8_t *src, int32_t src_stride,
892 uint8_t *dst, int32_t dst_stride,
896 v16i8 src0, src1, src2, src3, src4;
897 v16i8 mask0, mask1, mask2;
898 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
899 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
900 v8i16 dst0, dst1, dst2, dst3;
903 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
905 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
906 XORI_B5_128_SB(src0, src1, src2, src3, src4);
907 src += (5 * src_stride);
    /* prime 5 horizontally-filtered rows */
909 hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
910 hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
911 hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
912 hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
913 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
915 for (loop_cnt = (height >> 2); loop_cnt--;) {
916 LD_SB4(src, src_stride, src0, src1, src2, src3);
917 XORI_B4_128_SB(src0, src1, src2, src3);
918 src += (4 * src_stride);
920 hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
921 hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
922 hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
923 hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
924 dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
925 hz_out3, hz_out4, hz_out5);
926 dst1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
927 hz_out4, hz_out5, hz_out6);
928 dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
929 hz_out5, hz_out6, hz_out7);
930 dst3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
931 hz_out6, hz_out7, hz_out8);
932 out0 = PCKEV_XORI128_UB(dst0, dst1);
933 out1 = PCKEV_XORI128_UB(dst2, dst3);
934 ST8x4_UB(out0, out1, dst, dst_stride);
936 dst += (4 * dst_stride);
/* 16-wide luma center interpolation: delegates to the 8-wide version
 * twice, once per 8-column half (the src/dst advance between the two
 * calls was elided by extraction). */
946 static void avc_luma_mid_16w_msa(const uint8_t *src, int32_t src_stride,
947 uint8_t *dst, int32_t dst_stride,
950 uint32_t multiple8_cnt;
952 for (multiple8_cnt = 2; multiple8_cnt--;) {
953 avc_luma_mid_8w_msa(src, src_stride, dst, dst_stride, height);
/* 4-wide luma horizontal-quarter of the center position: vertical 6-tap
 * first (byte domain), then horizontal 6-tap over the halfword
 * intermediates in 32-bit precision (srari by 10), averaged with the
 * vertical-only half-pel value (srari by 5 of shf_vec2/shf_vec5);
 * horiz_offset selects which quarter column (branch elided).
 * NOTE(review): row/out/dst0/dst1/zeros declarations, the vt_res output
 * args, the horiz_offset conditional, carry updates and braces were
 * elided by extraction; visible code kept byte-identical. */
959 static void avc_luma_midh_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
960 uint8_t *dst, int32_t dst_stride,
961 int32_t height, uint8_t horiz_offset)
964 v16i8 src0, src1, src2, src3, src4, src5, src6;
965 v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
966 v4i32 hz_res0, hz_res1;
968 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
969 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
970 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
971 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
972 v8i16 minus5h = __msa_ldi_h(-5);
973 v8i16 plus20h = __msa_ldi_h(20);
977 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
978 src += (5 * src_stride);
979 XORI_B5_128_SB(src0, src1, src2, src3, src4);
981 for (row = (height >> 1); row--;) {
982 LD_SB2(src, src_stride, src5, src6);
983 src += (2 * src_stride);
985 XORI_B2_128_SB(src5, src6);
    /* vertical 6-tap over the 6-row window, two output rows per pass */
986 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
988 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
990 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
991 mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
992 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
993 mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
    /* horizontal 6-tap in 32-bit precision over the intermediates */
994 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
995 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
996 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
997 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
999 SRARI_W2_SW(hz_res0, hz_res1, 10);
1000 SAT_SW2_SW(hz_res0, hz_res1, 7);
    /* vertical-only half-pel values for the quarter-pel average */
1002 dst0 = __msa_srari_h(shf_vec2, 5);
1003 dst1 = __msa_srari_h(shf_vec5, 5);
1005 SAT_SH2_SH(dst0, dst1, 7);
    /* column selection per horiz_offset (conditional elided) */
1008 dst0 = __msa_ilvod_h(zeros, dst0);
1009 dst1 = __msa_ilvod_h(zeros, dst1);
1011 ILVEV_H2_SH(dst0, zeros, dst1, zeros, dst0, dst1);
1014 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) dst0);
1015 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) dst1);
1016 dst0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
1018 out = PCKEV_XORI128_UB(dst0, dst0);
1019 ST4x2_UB(out, dst, dst_stride);
1021 dst += (2 * dst_stride);
/* 8-wide horizontal-quarter-pel mid MC: two side-by-side 4-wide passes.
 * NOTE(review): the trailing argument of the inner call (presumably
 * horiz_offset), the per-iteration column advance, and closing braces
 * are missing from this extract -- confirm against the original file. */
1031 static void avc_luma_midh_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
1032                                      uint8_t *dst, int32_t dst_stride,
1033                                      int32_t height, uint8_t horiz_offset)
1035     uint32_t multiple8_cnt;
1037     for (multiple8_cnt = 2; multiple8_cnt--;) {
1038         avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
/* 16-wide horizontal-quarter-pel mid MC: four side-by-side 4-wide
 * passes.
 * NOTE(review): the trailing argument of the inner call, the column
 * advance, and closing braces are missing from this extract -- confirm
 * against the original file. */
1046 static void avc_luma_midh_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
1047                                       uint8_t *dst, int32_t dst_stride,
1048                                       int32_t height, uint8_t horiz_offset)
1050     uint32_t multiple8_cnt;
1052     for (multiple8_cnt = 4; multiple8_cnt--;) {
1053         avc_luma_midh_qrt_4w_msa(src, src_stride, dst, dst_stride, height,
/* 4-wide luma MC at a vertical quarter-pel position between the
 * horizontal-half-pel and the center (hv-mid) sample.
 * Pipeline per 4-row iteration: horizontal 6-tap first (hz_out*), then
 * vertical 6-tap on those intermediates (dst0/2/4/6), then average with
 * the horizontal-only results rounded by 5 bits (dst1/3/5/7).
 * NOTE(review): this extract has lost the "if (ver_offset) ... else ..."
 * wrapper around the two srari groups (L714-717 vs L718-721) -- as shown
 * the second group unconditionally overwrites the first; also the
 * declaration of 'loop_cnt', various blank lines and closing braces are
 * missing. Confirm against the original file. */
1061 static void avc_luma_midv_qrt_4w_msa(const uint8_t *src, int32_t src_stride,
1062                                      uint8_t *dst, int32_t dst_stride,
1063                                      int32_t height, uint8_t ver_offset)
1066     v16i8 src0, src1, src2, src3, src4;
1067     v16i8 mask0, mask1, mask2;
1068     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1069     v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1070     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1072     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
/* prologue: 5 rows of horizontal 6-tap intermediates */
1073     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1074     src += (5 * src_stride);
1076     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1078     hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1079                                                           mask0, mask1, mask2);
1080     hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1081                                                           mask0, mask1, mask2);
/* odd rows live in the high halves of the paired results */
1083     PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
1085     hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
1087     for (loop_cnt = (height >> 2); loop_cnt--;) {
1088         LD_SB4(src, src_stride, src0, src1, src2, src3);
1089         src += (4 * src_stride);
1090         XORI_B4_128_SB(src0, src1, src2, src3);
1092         hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
1095         hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
1099         PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
/* vertical 6-tap over the sliding window of horizontal intermediates */
1101         dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1102                                                hz_out3, hz_out4, hz_out5);
1103         dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1104                                                hz_out4, hz_out5, hz_out6);
1105         dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1106                                                hz_out5, hz_out6, hz_out7);
1107         dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1108                                                hz_out6, hz_out7, hz_out8);
/* horizontal-only half-pel rows, rounded, for the qpel average;
 * ver_offset selects hz_out3..6 vs hz_out2..5 (if/else lost in extract) */
1111         dst1 = __msa_srari_h(hz_out3, 5);
1112         dst3 = __msa_srari_h(hz_out4, 5);
1113         dst5 = __msa_srari_h(hz_out5, 5);
1114         dst7 = __msa_srari_h(hz_out6, 5);
1116         dst1 = __msa_srari_h(hz_out2, 5);
1117         dst3 = __msa_srari_h(hz_out3, 5);
1118         dst5 = __msa_srari_h(hz_out4, 5);
1119         dst7 = __msa_srari_h(hz_out5, 5);
1122         SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1124         dst0 = __msa_aver_s_h(dst0, dst1);
1125         dst1 = __msa_aver_s_h(dst2, dst3);
1126         dst2 = __msa_aver_s_h(dst4, dst5);
1127         dst3 = __msa_aver_s_h(dst6, dst7);
1129         PCKEV_B2_SB(dst1, dst0, dst3, dst2, src0, src1);
1130         XORI_B2_128_SB(src0, src1);
1132         ST4x4_UB(src0, src1, 0, 2, 0, 2, dst, dst_stride);
1134         dst += (4 * dst_stride);
/* 8-wide luma MC at a vertical quarter-pel position between the
 * horizontal-half-pel and the center sample: horizontal 6-tap per row,
 * vertical 6-tap over the intermediates, then average with the rounded
 * horizontal-only results.
 * NOTE(review): as in the 4-wide variant, the "if (ver_offset)/else"
 * wrapper around the two srari groups (L764-767 vs L768-771) is missing
 * from this extract, as are the declarations of 'loop_cnt'/'out', the
 * per-row stores after each PCKEV_XORI128_UB, and closing braces --
 * confirm against the original file. */
1143 static void avc_luma_midv_qrt_8w_msa(const uint8_t *src, int32_t src_stride,
1144                                      uint8_t *dst, int32_t dst_stride,
1145                                      int32_t height, uint8_t ver_offset)
1148     v16i8 src0, src1, src2, src3, src4;
1149     v16i8 mask0, mask1, mask2;
1150     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1151     v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
1152     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1155     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* prologue: horizontal 6-tap for the 5 rows above the first output */
1157     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1158     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1159     src += (5 * src_stride);
1161     hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
1162     hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
1163     hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
1164     hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
1165     hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
1167     for (loop_cnt = (height >> 2); loop_cnt--;) {
1168         LD_SB4(src, src_stride, src0, src1, src2, src3);
1169         XORI_B4_128_SB(src0, src1, src2, src3);
1170         src += (4 * src_stride);
1172         hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
1173         hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
1174         hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
1175         hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
/* vertical 6-tap over the sliding 6-row window of intermediates */
1177         dst0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
1178                                                hz_out3, hz_out4, hz_out5);
1179         dst2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
1180                                                hz_out4, hz_out5, hz_out6);
1181         dst4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
1182                                                hz_out5, hz_out6, hz_out7);
1183         dst6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
1184                                                hz_out6, hz_out7, hz_out8);
/* horizontal-only rows rounded by 5; selection by ver_offset lost */
1187         dst1 = __msa_srari_h(hz_out3, 5);
1188         dst3 = __msa_srari_h(hz_out4, 5);
1189         dst5 = __msa_srari_h(hz_out5, 5);
1190         dst7 = __msa_srari_h(hz_out6, 5);
1192         dst1 = __msa_srari_h(hz_out2, 5);
1193         dst3 = __msa_srari_h(hz_out3, 5);
1194         dst5 = __msa_srari_h(hz_out4, 5);
1195         dst7 = __msa_srari_h(hz_out5, 5);
1198         SAT_SH4_SH(dst1, dst3, dst5, dst7, 7);
1200         dst0 = __msa_aver_s_h(dst0, dst1);
1201         dst1 = __msa_aver_s_h(dst2, dst3);
1202         dst2 = __msa_aver_s_h(dst4, dst5);
1203         dst3 = __msa_aver_s_h(dst6, dst7);
/* pack each row to bytes; per-row ST8x1 stores lost in this extract */
1205         out = PCKEV_XORI128_UB(dst0, dst0);
1208         out = PCKEV_XORI128_UB(dst1, dst1);
1211         out = PCKEV_XORI128_UB(dst2, dst2);
1214         out = PCKEV_XORI128_UB(dst3, dst3);
/* 16-wide vertical-quarter-pel mid MC: two side-by-side 8-wide passes.
 * NOTE(review): the trailing argument of the inner call (presumably
 * vert_offset), the column advance, and closing braces are missing from
 * this extract -- confirm against the original file. */
1226 static void avc_luma_midv_qrt_16w_msa(const uint8_t *src, int32_t src_stride,
1227                                       uint8_t *dst, int32_t dst_stride,
1228                                       int32_t height, uint8_t vert_offset)
1230     uint32_t multiple8_cnt;
1232     for (multiple8_cnt = 2; multiple8_cnt--;) {
1233         avc_luma_midv_qrt_8w_msa(src, src_stride, dst, dst_stride, height,
/* 4-wide diagonal quarter-pel luma MC: the output is the rounded
 * average of a horizontal half-pel filter applied to src_x and a
 * vertical half-pel filter applied to src_y (both 6-tap, rounded by 5
 * bits and clamped before the final average with srari(...,1)).
 * NOTE(review): declarations of 'loop_cnt', 'out0', 'out1', 'out' and
 * some continuation arguments of the filter macros are missing from
 * this extract -- confirm against the original file. */
1241 static void avc_luma_hv_qrt_4w_msa(const uint8_t *src_x, const uint8_t *src_y,
1242                                    int32_t src_stride, uint8_t *dst,
1243                                    int32_t dst_stride, int32_t height)
1246     v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1247     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1248     v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1249     v16i8 mask0, mask1, mask2;
1250     v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
1254     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
/* prologue for the vertical path: 5 rows from src_y, pairs of 4-pel
 * rows packed into one vector via insve_w */
1256     LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1257     src_y += (5 * src_stride);
1259     src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
1260     src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
1261     src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
1262     src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
1264     XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
1266     for (loop_cnt = (height >> 2); loop_cnt--;) {
/* horizontal half-pel path on src_x */
1267         LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1268         src_x += (4 * src_stride);
1270         XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
1272         hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0,
1275         hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2,
1279         SRARI_H2_SH(hz_out0, hz_out1, 5);
1280         SAT_SH2_SH(hz_out0, hz_out1, 7);
/* vertical half-pel path on src_y */
1282         LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1283         src_y += (4 * src_stride);
1285         src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
1286         src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
1287         src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
1288         src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
1290         XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
1293         vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1,
1296         vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3,
1300         SRARI_H2_SH(vert_out0, vert_out1, 5);
1301         SAT_SH2_SH(vert_out0, vert_out1, 7);
/* average the two half-pel results with rounding */
1303         out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1304         out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1306         SAT_SH2_SH(out0, out1, 7);
1307         out = PCKEV_XORI128_UB(out0, out1);
1308         ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1309         dst += (4 * dst_stride);
/* 8-wide diagonal quarter-pel luma MC: rounded average of a horizontal
 * half-pel pass over src_x and a vertical half-pel pass over src_y.
 * Row pairs are packed into single vectors with insve_d (8 pels each).
 * NOTE(review): declarations of 'loop_cnt', 'tmp0', 'tmp1' and the
 * vt4..vt7 / window-shift statements at loop end appear to be missing
 * from this extract -- confirm against the original file. */
1319 static void avc_luma_hv_qrt_8w_msa(const uint8_t *src_x, const uint8_t *src_y,
1320                                    int32_t src_stride, uint8_t *dst,
1321                                    int32_t dst_stride, int32_t height)
1324     v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
1325     v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
1326     v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
1327     v16i8 mask0, mask1, mask2;
1328     v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
1329     v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
1330     v8i16 out0, out1, out2, out3;
1333     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* prologue for the vertical path: 5 rows, pairs packed via insve_d */
1334     LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
1335     src_y += (5 * src_stride);
1337     src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
1338     src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
1339     src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
1340     src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
1342     XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
1344     for (loop_cnt = (height >> 2); loop_cnt--;) {
/* horizontal half-pel on 4 rows of src_x */
1345         LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
1346         XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
1347         src_x += (4 * src_stride);
1349         hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
1350         hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
1351         hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
1352         hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
1354         SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
1355         SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
/* vertical half-pel on the sliding window of src_y rows */
1357         LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
1358         src_y += (4 * src_stride);
1360         src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
1361         src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
1362         src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
1363         src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
1365         XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
1368         AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
1369                                         src_vt4, src_vt5, vert_out0, vert_out1);
1370         AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
1371                                         src_vt6, src_vt7, vert_out2, vert_out3);
1373         SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
1374         SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
/* rounded average of the two half-pel results */
1376         out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
1377         out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
1378         out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
1379         out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
1381         SAT_SH4_SH(out0, out1, out2, out3, 7);
1382         tmp0 = PCKEV_XORI128_UB(out0, out1);
1383         tmp1 = PCKEV_XORI128_UB(out2, out3);
1384         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1386         dst += (4 * dst_stride);
/* 16-wide diagonal quarter-pel MC: two side-by-side 8-wide passes.
 * NOTE(review): the trailing 'height' argument of the inner call, the
 * src_x/src_y/dst column advances, and closing braces are missing from
 * this extract -- confirm against the original file. */
1396 static void avc_luma_hv_qrt_16w_msa(const uint8_t *src_x, const uint8_t *src_y,
1397                                     int32_t src_stride, uint8_t *dst,
1398                                     int32_t dst_stride, int32_t height)
1400     uint32_t multiple8_cnt;
1402     for (multiple8_cnt = 2; multiple8_cnt--;) {
1403         avc_luma_hv_qrt_8w_msa(src_x, src_y, src_stride, dst, dst_stride,
/* 4x4 horizontal half-pel luma MC with destination averaging (the
 * "avg" flavour): 6-tap horizontal filter on 4 rows, round/clamp, then
 * aver_u_b with the existing dst pixels before the 4x4 store.
 * NOTE(review): the 'src_stride' parameter line and the declarations of
 * 'res0'/'res1' are missing from this extract -- confirm against the
 * original file. */
1412 static void avc_luma_hz_and_aver_dst_4x4_msa(const uint8_t *src,
1414                                              uint8_t *dst, int32_t dst_stride)
1416     v16i8 src0, src1, src2, src3;
1417     v16u8 dst0, dst1, dst2, dst3, res;
1419     v16i8 mask0, mask1, mask2;
1420     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1421     v16i8 minus5b = __msa_ldi_b(-5);
1422     v16i8 plus20b = __msa_ldi_b(20);
1424     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1425     LD_SB4(src, src_stride, src0, src1, src2, src3);
1427     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1428     XORI_B4_128_SB(src0, src1, src2, src3);
/* 6-tap: hadd gives the (1,1) outer taps, dot products add -5 and +20 */
1429     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1430     HADD_SB2_SH(vec0, vec1, res0, res1);
1431     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1432     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, res0, res1);
1433     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1434     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, res0, res1);
1435     SRARI_H2_SH(res0, res1, 5);
1436     SAT_SH2_SH(res0, res1, 7);
1437     res = PCKEV_XORI128_UB(res0, res1);
/* gather the four 4-pel dst rows into one vector for a single average */
1438     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1440     dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1441     res = __msa_aver_u_b(res, dst0);
1443     ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 horizontal half-pel luma MC with destination averaging: two
 * iterations of 4 rows each; 6-tap filter, round/clamp, then
 * CONVERT_UB_AVG_ST8x4_UB averages with dst and stores.
 * NOTE(review): the 'src_stride' parameter line, the 'loop_cnt'
 * declaration, the tail arguments of CONVERT_UB_AVG_ST8x4_UB, and
 * closing braces are missing from this extract -- confirm against the
 * original file. */
1446 static void avc_luma_hz_and_aver_dst_8x8_msa(const uint8_t *src,
1448                                              uint8_t *dst, int32_t dst_stride)
1451     v16i8 src0, src1, src2, src3;
1452     v16u8 dst0, dst1, dst2, dst3;
1453     v8i16 res0, res1, res2, res3;
1454     v16i8 mask0, mask1, mask2;
1455     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1456     v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1457     v16i8 minus5b = __msa_ldi_b(-5);
1458     v16i8 plus20b = __msa_ldi_b(20);
1460     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1462     for (loop_cnt = 2; loop_cnt--;) {
1463         LD_SB4(src, src_stride, src0, src1, src2, src3);
1464         src += (4 * src_stride);
1466         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1468         XORI_B4_128_SB(src0, src1, src2, src3);
/* 6-tap horizontal filter on 4 rows at once */
1469         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1470         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1471         HADD_SB4_SH(vec0, vec1, vec2, vec3, res0, res1, res2, res3);
1472         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1473         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1474         DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1475                      res0, res1, res2, res3);
1476         VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1477         VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1478         DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1479                      plus20b, res0, res1, res2, res3);
1480         SRARI_H4_SH(res0, res1, res2, res3, 5);
1481         SAT_SH4_SH(res0, res1, res2, res3, 7);
1482         CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
1485         dst += (4 * dst_stride);
/* 16x16 horizontal half-pel luma MC with destination averaging: four
 * iterations of 4 rows; each row is processed as two 8-pel halves
 * (LD_SB2 with offset 8), filtered with the 6-tap kernel, packed,
 * averaged with dst, and stored as full 16-byte rows.
 * NOTE(review): the 'src_stride' parameter line, the 'loop_cnt'
 * declaration, the 'src += src_stride' advances between the paired
 * loads, and closing braces are missing from this extract -- confirm
 * against the original file. */
1489 static void avc_luma_hz_and_aver_dst_16x16_msa(const uint8_t *src,
1491                                                uint8_t *dst, int32_t dst_stride)
1494     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1495     v16u8 dst0, dst1, dst2, dst3;
1496     v16i8 mask0, mask1, mask2;
1497     v8i16 res0, res1, res2, res3, res4, res5, res6, res7;
1498     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1499     v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1500     v16i8 minus5b = __msa_ldi_b(-5);
1501     v16i8 plus20b = __msa_ldi_b(20);
1503     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1505     for (loop_cnt = 4; loop_cnt--;) {
/* rows 0-1: each row split into two 8-pel halves */
1506         LD_SB2(src, 8, src0, src1);
1508         LD_SB2(src, 8, src2, src3);
1511         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1513         XORI_B4_128_SB(src0, src1, src2, src3);
1514         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1515         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1516         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1517         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1518         VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1519         VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1520         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1521         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1522                      minus5b, res0, res1, res2, res3);
1523         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1524                      plus20b, res0, res1, res2, res3);
/* rows 2-3, same scheme */
1525         LD_SB2(src, 8, src4, src5);
1527         LD_SB2(src, 8, src6, src7);
1529         XORI_B4_128_SB(src4, src5, src6, src7);
1530         VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec3);
1531         VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec6, vec9);
1532         VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec1, vec4);
1533         VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec7, vec10);
1534         VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec2, vec5);
1535         VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec8, vec11);
1536         HADD_SB4_SH(vec0, vec3, vec6, vec9, res4, res5, res6, res7);
1537         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1538                      minus5b, res4, res5, res6, res7);
1539         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1540                      plus20b, res4, res5, res6, res7);
1541         SRARI_H4_SH(res0, res1, res2, res3, 5);
1542         SRARI_H4_SH(res4, res5, res6, res7, 5);
1543         SAT_SH4_SH(res0, res1, res2, res3, 7);
1544         SAT_SH4_SH(res4, res5, res6, res7, 7);
/* pack to bytes, undo the +-128 bias, average with dst, store */
1545         PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
1546                     vec0, vec1, vec2, vec3);
1547         XORI_B4_128_SB(vec0, vec1, vec2, vec3);
1548         AVER_UB4_UB(vec0, dst0, vec1, dst1, vec2, dst2, vec3, dst3,
1549                     dst0, dst1, dst2, dst3);
1550         ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
1551         dst += (4 * dst_stride);
/* 4x4 horizontal quarter-pel luma MC with destination averaging: the
 * half-pel 6-tap result is averaged (aver_s_b) with the nearest
 * integer-position sample (selected by sliding the source with
 * __msa_sld_b), then averaged again with the existing dst pixels.
 * NOTE(review): several parameter lines, the declarations of 'out0',
 * 'out1', 'res0', 'res1', 'slide', and the code deriving 'slide' from
 * the quarter-pel offset are missing from this extract -- confirm
 * against the original file. */
1555 static void avc_luma_hz_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
1562     v16i8 src0, src1, src2, src3;
1563     v16u8 dst0, dst1, dst2, dst3;
1564     v16i8 mask0, mask1, mask2;
1565     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1567     v16i8 minus5b = __msa_ldi_b(-5);
1568     v16i8 plus20b = __msa_ldi_b(20);
1571     LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
1579     LD_SB4(src, src_stride, src0, src1, src2, src3);
1580     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1582     XORI_B4_128_SB(src0, src1, src2, src3);
/* 6-tap horizontal half-pel filter */
1583     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
1584     HADD_SB2_SH(vec0, vec1, out0, out1);
1585     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
1586     DPADD_SB2_SH(vec2, vec3, minus5b, minus5b, out0, out1);
1587     VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
1588     DPADD_SB2_SH(vec4, vec5, plus20b, plus20b, out0, out1);
1589     SRARI_H2_SH(out0, out1, 5);
1590     SAT_SH2_SH(out0, out1, 7);
1592     PCKEV_B2_UB(out0, out0, out1, out1, res0, res1);
/* slide to the integer sample nearest the quarter-pel position */
1594     src0 = __msa_sld_b(src0, src0, slide);
1595     src1 = __msa_sld_b(src1, src1, slide);
1596     src2 = __msa_sld_b(src2, src2, slide);
1597     src3 = __msa_sld_b(src3, src3, slide);
1598     src0 = (v16i8) __msa_insve_w((v4i32) src0, 1, (v4i32) src1);
1599     src1 = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
/* signed average works because both operands are still +-128 biased */
1600     res0 = (v16u8) __msa_aver_s_b((v16i8) res0, src0);
1601     res1 = (v16u8) __msa_aver_s_b((v16i8) res1, src1);
1603     XORI_B2_128_UB(res0, res1);
1605     dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
1606     dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
1608     AVER_UB2_UB(res0, dst0, res1, dst1, dst0, dst1);
1610     ST4x4_UB(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
/* 8x8 horizontal quarter-pel luma MC with destination averaging: two
 * iterations of 4 rows; half-pel 6-tap result averaged with the slid
 * integer sample, then averaged with dst via AVER_ST8x4_UB.
 * NOTE(review): several parameter lines, the 'loop_cnt'/'slide'
 * declarations, the slide setup, the tail arguments of AVER_ST8x4_UB,
 * and closing braces are missing from this extract -- confirm against
 * the original file. */
1613 static void avc_luma_hz_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
1621     v16i8 src0, src1, src2, src3;
1622     v16i8 mask0, mask1, mask2;
1623     v16u8 dst0, dst1, dst2, dst3;
1624     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1625     v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1626     v8i16 out0, out1, out2, out3;
1627     v16i8 minus5b = __msa_ldi_b(-5);
1628     v16i8 plus20b = __msa_ldi_b(20);
1629     v16i8 res0, res1, res2, res3;
1631     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1639     for (loop_cnt = 2; loop_cnt--;) {
1640         LD_SB4(src, src_stride, src0, src1, src2, src3);
1641         src += (4 * src_stride);
1643         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1645         XORI_B4_128_SB(src0, src1, src2, src3);
/* 6-tap horizontal half-pel filter on 4 rows */
1646         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1647         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
1648         HADD_SB4_SH(vec0, vec1, vec2, vec3, out0, out1, out2, out3);
1649         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
1650         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
1651         DPADD_SB4_SH(vec4, vec5, vec6, vec7, minus5b, minus5b, minus5b, minus5b,
1652                      out0, out1, out2, out3);
1653         VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec8, vec9);
1654         VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec10, vec11);
1655         DPADD_SB4_SH(vec8, vec9, vec10, vec11, plus20b, plus20b, plus20b,
1656                      plus20b, out0, out1, out2, out3);
/* slide to the neighbouring integer sample for the qpel average */
1658         src0 = __msa_sld_b(src0, src0, slide);
1659         src1 = __msa_sld_b(src1, src1, slide);
1660         src2 = __msa_sld_b(src2, src2, slide);
1661         src3 = __msa_sld_b(src3, src3, slide);
1663         SRARI_H4_SH(out0, out1, out2, out3, 5);
1664         SAT_SH4_SH(out0, out1, out2, out3, 7);
1666         PCKEV_B4_SB(out0, out0, out1, out1, out2, out2, out3, out3,
1667                     res0, res1, res2, res3);
1669         res0 = __msa_aver_s_b(res0, src0);
1670         res1 = __msa_aver_s_b(res1, src1);
1671         res2 = __msa_aver_s_b(res2, src2);
1672         res3 = __msa_aver_s_b(res3, src3);
1674         XORI_B4_128_SB(res0, res1, res2, res3);
1675         AVER_ST8x4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1678         dst += (4 * dst_stride);
/* 16x16 horizontal quarter-pel luma MC with destination averaging:
 * eight iterations of 2 full rows (each row as two 8-pel halves). The
 * integer-position neighbour used for the qpel average is extracted
 * with a dedicated shuffle vector 'vshf' loaded from luma_mask_arr;
 * the two loads at L1076/L1077 select between offsets 16+96 and 96
 * (the surrounding "if (hor_offset)/else" appears lost in this
 * extract).
 * NOTE(review): several parameter lines, 'loop_cnt'/'dst0'/'dst1'/
 * 'out0'/'out1' declarations, the src row advances between paired
 * loads, and closing braces are missing -- confirm against the
 * original file. */
1682 static void avc_luma_hz_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
1690     v16i8 src0, src1, src2, src3;
1691     v16i8 mask0, mask1, mask2, vshf;
1693     v8i16 res0, res1, res2, res3;
1694     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
1695     v16i8 vec6, vec7, vec8, vec9, vec10, vec11;
1696     v16i8 minus5b = __msa_ldi_b(-5);
1697     v16i8 plus20b = __msa_ldi_b(20);
1699     LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
1702     vshf = LD_SB(&luma_mask_arr[16 + 96]);
1704     vshf = LD_SB(&luma_mask_arr[96]);
1707     for (loop_cnt = 8; loop_cnt--;) {
1708         LD_SB2(src, 8, src0, src1);
1710         LD_SB2(src, 8, src2, src3);
1713         LD_UB2(dst, dst_stride, dst0, dst1);
1715         XORI_B4_128_SB(src0, src1, src2, src3);
/* 6-tap half-pel filter, two rows x two 8-pel halves */
1716         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec3);
1717         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec9);
1718         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec1, vec4);
1719         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec7, vec10);
1720         VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec2, vec5);
1721         VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec8, vec11);
1722         HADD_SB4_SH(vec0, vec3, vec6, vec9, res0, res1, res2, res3);
1723         DPADD_SB4_SH(vec1, vec4, vec7, vec10, minus5b, minus5b, minus5b,
1724                      minus5b, res0, res1, res2, res3);
1725         DPADD_SB4_SH(vec2, vec5, vec8, vec11, plus20b, plus20b, plus20b,
1726                      plus20b, res0, res1, res2, res3);
/* gather the integer-position neighbours for the qpel average */
1727         VSHF_B2_SB(src0, src1, src2, src3, vshf, vshf, src0, src2);
1728         SRARI_H4_SH(res0, res1, res2, res3, 5);
1729         SAT_SH4_SH(res0, res1, res2, res3, 7);
1730         PCKEV_B2_SB(res1, res0, res3, res2, out0, out1);
1732         out0 = __msa_aver_s_b(out0, src0);
1733         out1 = __msa_aver_s_b(out1, src2);
1735         XORI_B2_128_SB(out0, out1);
1736         AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
1737         ST_UB2(dst0, dst1, dst, dst_stride);
1738         dst += (2 * dst_stride);
/* 4x4 vertical half-pel luma MC with destination averaging: 6-tap
 * vertical filter implemented as byte-pair dot products, rounded,
 * clamped, then averaged with the existing dst pixels.
 * NOTE(review): the 'src_stride' parameter line and the declarations of
 * 'out10', 'out32', 'res' are missing from this extract -- confirm
 * against the original file. */
1742 static void avc_luma_vt_and_aver_dst_4x4_msa(const uint8_t *src,
1744                                              uint8_t *dst, int32_t dst_stride)
/* 6-tap coefficients {1,-5,20,20,-5,1} packed as signed byte pairs:
 * 0xfb01 = (-5,1), 0x1414 = (20,20), 0x01fb = (1,-5) */
1746     int16_t filt_const0 = 0xfb01;
1747     int16_t filt_const1 = 0x1414;
1748     int16_t filt_const2 = 0x1fb;
1749     v16u8 dst0, dst1, dst2, dst3;
1750     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1751     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1752     v16i8 src87_r, src2110, src4332, src6554, src8776;
1754     v16i8 filt0, filt1, filt2;
1757     filt0 = (v16i8) __msa_fill_h(filt_const0);
1758     filt1 = (v16i8) __msa_fill_h(filt_const1);
1759     filt2 = (v16i8) __msa_fill_h(filt_const2);
1761     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1762     src += (5 * src_stride);
/* interleave vertically adjacent rows so each dot product sees pairs */
1764     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1765                src10_r, src21_r, src32_r, src43_r);
1766     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1767     XORI_B2_128_SB(src2110, src4332);
1768     LD_SB4(src, src_stride, src5, src6, src7, src8);
1769     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1770                src54_r, src65_r, src76_r, src87_r);
1771     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1772     XORI_B2_128_SB(src6554, src8776);
1773     out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1774     out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1775     SRARI_H2_SH(out10, out32, 5);
1776     SAT_SH2_SH(out10, out32, 7);
1777     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1778     res = PCKEV_XORI128_UB(out10, out32);
/* gather four 4-pel dst rows for one averaging op */
1780     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1782     dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1783     dst0 = __msa_aver_u_b(res, dst0);
1785     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* 8x8 vertical half-pel luma MC with destination averaging: two
 * iterations of 4 output rows over a sliding 6-row window of
 * interleaved source rows.
 * NOTE(review): the 'src_stride' parameter line, the 'loop_cnt'
 * declaration, the window-shift statements at loop end (reassigning
 * src10_r..src43_r and src4), the tail arguments of
 * CONVERT_UB_AVG_ST8x4_UB, and closing braces are missing from this
 * extract -- confirm against the original file. */
1788 static void avc_luma_vt_and_aver_dst_8x8_msa(const uint8_t *src,
1790                                              uint8_t *dst, int32_t dst_stride)
/* packed 6-tap byte pairs: (-5,1), (20,20), (1,-5) */
1793     int16_t filt_const0 = 0xfb01;
1794     int16_t filt_const1 = 0x1414;
1795     int16_t filt_const2 = 0x1fb;
1796     v16u8 dst0, dst1, dst2, dst3;
1797     v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
1798     v16i8 src10_r, src32_r, src76_r, src98_r;
1799     v16i8 src21_r, src43_r, src87_r, src109_r;
1800     v8i16 out0, out1, out2, out3;
1801     v16i8 filt0, filt1, filt2;
1803     filt0 = (v16i8) __msa_fill_h(filt_const0);
1804     filt1 = (v16i8) __msa_fill_h(filt_const1);
1805     filt2 = (v16i8) __msa_fill_h(filt_const2);
1807     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1808     src += (5 * src_stride);
1810     XORI_B5_128_SB(src0, src1, src2, src3, src4);
1811     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1812                src10_r, src21_r, src32_r, src43_r);
1814     for (loop_cnt = 2; loop_cnt--;) {
1815         LD_SB4(src, src_stride, src7, src8, src9, src10);
1816         src += (4 * src_stride);
1818         XORI_B4_128_SB(src7, src8, src9, src10);
1819         ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
1820                    src76_r, src87_r, src98_r, src109_r);
/* 6-tap dot products for four output rows */
1821         out0 = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
1822         out1 = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
1823         out2 = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
1824         out3 = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
1825         SRARI_H4_SH(out0, out1, out2, out3, 5);
1826         SAT_SH4_SH(out0, out1, out2, out3, 7);
1827         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1829         CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
1831         dst += (4 * dst_stride);
/* 16x16 vertical half-pel luma MC with destination averaging: four
 * iterations of 4 rows; both the low (ILVR) and high (ILVL) byte
 * interleaves are filtered so each pass covers a full 16-pel row.
 * NOTE(review): the 'src_stride' parameter line, the 'loop_cnt'
 * declaration, the window-shift statements at loop end, and closing
 * braces are missing from this extract -- confirm against the original
 * file. */
1841 static void avc_luma_vt_and_aver_dst_16x16_msa(const uint8_t *src,
1843                                                uint8_t *dst, int32_t dst_stride)
/* packed 6-tap byte pairs: (-5,1), (20,20), (1,-5) */
1846     int16_t filt_const0 = 0xfb01;
1847     int16_t filt_const1 = 0x1414;
1848     int16_t filt_const2 = 0x1fb;
1849     v16u8 dst0, dst1, dst2, dst3;
1850     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1851     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1852     v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
1853     v16i8 src65_l, src87_l;
1854     v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1855     v16i8 filt0, filt1, filt2;
1856     v16u8 res0, res1, res2, res3;
1858     filt0 = (v16i8) __msa_fill_h(filt_const0);
1859     filt1 = (v16i8) __msa_fill_h(filt_const1);
1860     filt2 = (v16i8) __msa_fill_h(filt_const2);
1862     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1863     src += (5 * src_stride);
1865     XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* interleave adjacent rows, low and high halves separately */
1866     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1867                src10_r, src21_r, src32_r, src43_r);
1868     ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1869                src10_l, src21_l, src32_l, src43_l);
1871     for (loop_cnt = 4; loop_cnt--;) {
1872         LD_SB4(src, src_stride, src5, src6, src7, src8);
1873         src += (4 * src_stride);
1875         XORI_B4_128_SB(src5, src6, src7, src8);
1876         ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1877                    src54_r, src65_r, src76_r, src87_r);
1878         ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1879                    src54_l, src65_l, src76_l, src87_l);
1880         out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
1881         out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
1882         out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
1883         out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
1884         out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
1885         out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
1886         out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
1887         out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
1888         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
1889         SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
1890         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1891         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1892         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* pack halves to full rows, unbias, average with dst, store */
1893         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1894                     out3_r, res0, res1, res2, res3);
1895         XORI_B4_128_UB(res0, res1, res2, res3);
1896         AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
1897                     res0, res1, res2, res3);
1898         ST_UB4(res0, res1, res2, res3, dst, dst_stride);
1899         dst += (4 * dst_stride);
/* 4x4 vertical quarter-pel luma MC with destination averaging: the
 * vertical half-pel 6-tap result is averaged with the nearest
 * integer-position row (selection visible at L1252-1255; the
 * "if (ver_offset)/else" wrapper appears lost in this extract), then
 * averaged with the existing dst pixels.
 * NOTE(review): several parameter lines and the declarations of
 * 'out10', 'out32', 'res' are missing -- confirm against the original
 * file. */
1913 static void avc_luma_vt_qrt_and_aver_dst_4x4_msa(const uint8_t *src,
/* packed 6-tap byte pairs: (-5,1), (20,20), (1,-5) */
1919     int16_t filt_const0 = 0xfb01;
1920     int16_t filt_const1 = 0x1414;
1921     int16_t filt_const2 = 0x1fb;
1922     v16u8 dst0, dst1, dst2, dst3;
1923     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1924     v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
1925     v16i8 src87_r, src2110, src4332, src6554, src8776;
1927     v16i8 filt0, filt1, filt2;
1930     filt0 = (v16i8) __msa_fill_h(filt_const0);
1931     filt1 = (v16i8) __msa_fill_h(filt_const1);
1932     filt2 = (v16i8) __msa_fill_h(filt_const2);
1934     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1935     src += (5 * src_stride);
1937     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
1938                src10_r, src21_r, src32_r, src43_r);
1939     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
1940     XORI_B2_128_SB(src2110, src4332);
1941     LD_SB4(src, src_stride, src5, src6, src7, src8);
1942     ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
1943                src54_r, src65_r, src76_r, src87_r);
1944     ILVR_D2_SB(src65_r, src54_r, src87_r, src76_r, src6554, src8776);
1945     XORI_B2_128_SB(src6554, src8776);
1946     out10 = DPADD_SH3_SH(src2110, src4332, src6554, filt0, filt1, filt2);
1947     out32 = DPADD_SH3_SH(src4332, src6554, src8776, filt0, filt1, filt2);
1948     SRARI_H2_SH(out10, out32, 5);
1949     SAT_SH2_SH(out10, out32, 7);
1950     LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1951     res = PCKEV_XORI128_UB(out10, out32);
/* pick the integer rows nearest the quarter-pel position; ver_offset
 * selects rows 3..6 vs 2..5 (if/else lost in this extract) */
1954     src32_r = (v16i8) __msa_insve_w((v4i32) src3, 1, (v4i32) src4);
1955     src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
1957     src32_r = (v16i8) __msa_insve_w((v4i32) src2, 1, (v4i32) src3);
1958     src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
1961     src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
1962     res = __msa_aver_u_b(res, (v16u8) src32_r);
/* gather four 4-pel dst rows for one averaging op */
1964     ILVR_W2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
1966     dst0 = (v16u8) __msa_pckev_d((v2i64) dst1, (v2i64) dst0);
1967     dst0 = __msa_aver_u_b(res, dst0);
1969     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* H.264 quarter-pel vertical luma MC for an 8x8 block with dst averaging.
 * Same 6-tap (1,-5,20,20,-5,1) scheme as the 4x4 variant, producing 4 rows
 * per loop iteration across two iterations.
 * NOTE(review): remaining parameters, the ver_offset if/else keywords and
 * the loop/function closers are elided in this extract — confirm against
 * the complete file. */
1972 static void avc_luma_vt_qrt_and_aver_dst_8x8_msa(const uint8_t *src,
/* packed halfword tap pairs: (1,-5), (20,20), (-5,1) */
1979 int16_t filt_const0 = 0xfb01;
1980 int16_t filt_const1 = 0x1414;
1981 int16_t filt_const2 = 0x1fb;
1982 v16u8 dst0, dst1, dst2, dst3;
1983 v16i8 src0, src1, src2, src3, src4, src7, src8, src9, src10;
1984 v16i8 src10_r, src32_r, src76_r, src98_r;
1985 v16i8 src21_r, src43_r, src87_r, src109_r;
1986 v8i16 out0_r, out1_r, out2_r, out3_r;
1989 v16i8 filt0, filt1, filt2;
1991 filt0 = (v16i8) __msa_fill_h(filt_const0);
1992 filt1 = (v16i8) __msa_fill_h(filt_const1);
1993 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* prime the pipeline with the first 5 rows of the vertical window */
1995 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
1996 src += (5 * src_stride);
1998 XORI_B5_128_SB(src0, src1, src2, src3, src4);
1999 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2000 src10_r, src21_r, src32_r, src43_r);
/* two iterations x 4 output rows = 8 rows */
2002 for (loop_cnt = 2; loop_cnt--;) {
2003 LD_SB4(src, src_stride, src7, src8, src9, src10);
2004 src += (4 * src_stride);
2006 XORI_B4_128_SB(src7, src8, src9, src10);
2007 ILVR_B4_SB(src7, src4, src8, src7, src9, src8, src10, src9,
2008 src76_r, src87_r, src98_r, src109_r);
/* 6-tap vertical filter for the 4 output rows of this iteration */
2009 out0_r = DPADD_SH3_SH(src10_r, src32_r, src76_r, filt0, filt1, filt2);
2010 out1_r = DPADD_SH3_SH(src21_r, src43_r, src87_r, filt0, filt1, filt2);
2011 out2_r = DPADD_SH3_SH(src32_r, src76_r, src98_r, filt0, filt1, filt2);
2012 out3_r = DPADD_SH3_SH(src43_r, src87_r, src109_r, filt0, filt1, filt2);
/* round by 5 bits (gain 32) and clamp to 8-bit dynamic range */
2013 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2014 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2015 PCKEV_B2_SB(out1_r, out0_r, out3_r, out2_r, res0, res1);
/* quarter-pel: integer-pel row selection; the two variants belong to the
 * (elided) ver_offset if/else branches */
2018 PCKEV_D2_SB(src4, src3, src8, src7, src10_r, src32_r);
2020 PCKEV_D2_SB(src3, src2, src7, src4, src10_r, src32_r);
2023 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2024 ILVR_D2_UB(dst1, dst0, dst3, dst2, dst0, dst1);
/* average filtered output with the integer-pel rows (still signed) */
2026 vec0 = (v16u8) __msa_aver_s_b(res0, src10_r);
2027 vec1 = (v16u8) __msa_aver_s_b(res1, src32_r);
/* back to unsigned range, then average with dst and store 8x4 */
2029 XORI_B2_128_UB(vec0, vec1);
2030 AVER_UB2_UB(vec0, dst0, vec1, dst1, vec0, vec1);
2031 ST8x4_UB(vec0, vec1, dst, dst_stride);
2032 dst += (4 * dst_stride);
/* H.264 quarter-pel vertical luma MC for a 16x16 block with dst averaging.
 * Like the 8x8 variant but filters both the low (_r) and high (_l)
 * interleaved halves to cover all 16 columns; 4 rows per iteration over
 * 4 iterations. NOTE(review): remaining parameters and the ver_offset
 * if/else keywords are elided in this extract. */
2044 static void avc_luma_vt_qrt_and_aver_dst_16x16_msa(const uint8_t *src,
/* packed halfword tap pairs of the 6-tap filter: (1,-5), (20,20), (-5,1) */
2051 int16_t filt_const0 = 0xfb01;
2052 int16_t filt_const1 = 0x1414;
2053 int16_t filt_const2 = 0x1fb;
2054 v16u8 dst0, dst1, dst2, dst3;
2055 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2056 v16i8 src10_r, src32_r, src54_r, src76_r, src21_r, src43_r, src65_r;
2057 v16i8 src87_r, src10_l, src32_l, src54_l, src76_l, src21_l, src43_l;
2058 v16i8 src65_l, src87_l;
2059 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2060 v16i8 out0, out1, out2, out3;
2061 v16i8 filt0, filt1, filt2;
2062 v16u8 res0, res1, res2, res3;
2064 filt0 = (v16i8) __msa_fill_h(filt_const0);
2065 filt1 = (v16i8) __msa_fill_h(filt_const1);
2066 filt2 = (v16i8) __msa_fill_h(filt_const2);
/* prime with the first 5 rows; keep right (low) and left (high) halves */
2068 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2069 src += (5 * src_stride);
2071 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2072 ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2073 src10_r, src21_r, src32_r, src43_r);
2074 ILVL_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2075 src10_l, src21_l, src32_l, src43_l);
/* 4 iterations x 4 rows = 16 rows */
2077 for (loop_cnt = 4; loop_cnt--;) {
2078 LD_SB4(src, src_stride, src5, src6, src7, src8);
2079 src += (4 * src_stride);
2081 XORI_B4_128_SB(src5, src6, src7, src8);
2082 ILVR_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
2083 src54_r, src65_r, src76_r, src87_r);
2084 ILVL_B4_SB(src5, src4, src6, src5, src7, src6, src8, src7,
2085 src54_l, src65_l, src76_l, src87_l);
/* 6-tap vertical filter on both halves of each of the 4 output rows */
2086 out0_r = DPADD_SH3_SH(src10_r, src32_r, src54_r, filt0, filt1, filt2);
2087 out1_r = DPADD_SH3_SH(src21_r, src43_r, src65_r, filt0, filt1, filt2);
2088 out2_r = DPADD_SH3_SH(src32_r, src54_r, src76_r, filt0, filt1, filt2);
2089 out3_r = DPADD_SH3_SH(src43_r, src65_r, src87_r, filt0, filt1, filt2);
2090 out0_l = DPADD_SH3_SH(src10_l, src32_l, src54_l, filt0, filt1, filt2);
2091 out1_l = DPADD_SH3_SH(src21_l, src43_l, src65_l, filt0, filt1, filt2);
2092 out2_l = DPADD_SH3_SH(src32_l, src54_l, src76_l, filt0, filt1, filt2);
2093 out3_l = DPADD_SH3_SH(src43_l, src65_l, src87_l, filt0, filt1, filt2);
/* round by 5 (gain 32), saturate, repack halves to 16-byte rows */
2094 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 5);
2095 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 5);
2096 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2097 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2098 PCKEV_B4_SB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2099 out3_r, out0, out1, out2, out3);
2100 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* quarter-pel integer-row average; the two variants belong to the
 * (elided) ver_offset if/else branches */
2103 res0 = (v16u8) __msa_aver_s_b(out0, src3);
2104 res1 = (v16u8) __msa_aver_s_b(out1, src4);
2105 res2 = (v16u8) __msa_aver_s_b(out2, src5);
2106 res3 = (v16u8) __msa_aver_s_b(out3, src6);
2108 res0 = (v16u8) __msa_aver_s_b(out0, src2);
2109 res1 = (v16u8) __msa_aver_s_b(out1, src3);
2110 res2 = (v16u8) __msa_aver_s_b(out2, src4);
2111 res3 = (v16u8) __msa_aver_s_b(out3, src5);
/* restore unsigned range, average with dst, store 4 full-width rows */
2114 XORI_B4_128_UB(res0, res1, res2, res3);
2115 AVER_UB4_UB(res0, dst0, res1, dst1, res2, dst2, res3, dst3,
2116 dst0, dst1, dst2, dst3);
2117 ST_UB4(dst0, dst1, dst2, dst3, dst, dst_stride);
2118 dst += (4 * dst_stride);
/* H.264 centre half-pel ("mid", mc22-style) luma MC for a 4x4 block with
 * dst averaging: 6-tap horizontal filter per source row into hz_out*, then
 * a 6-tap vertical filter over those intermediates (rounded by 10 bits in
 * AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH), finally averaged with dst.
 * NOTE(review): some signature lines and braces are elided in this
 * extract — confirm against the complete file. */
2134 static void avc_luma_mid_and_aver_dst_4x4_msa(const uint8_t *src,
2136 uint8_t *dst, int32_t dst_stride)
2138 v16i8 src0, src1, src2, src3, src4;
2139 v16i8 mask0, mask1, mask2;
2140 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2141 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2142 v8i16 res0, res1, res2, res3;
2143 v16u8 dst0, dst1, dst2, dst3;
2144 v16u8 tmp0, tmp1, tmp2, tmp3;
/* shuffle masks for the 4-wide horizontal filter variant (offset 48) */
2146 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2147 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2148 src += (5 * src_stride);
2150 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* horizontal 6-tap filter: two source rows per call (packed results) */
2152 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2153 mask0, mask1, mask2);
2154 hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
2155 mask0, mask1, mask2);
/* split the packed pairs into per-row intermediates */
2157 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2159 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
2161 LD_SB4(src, src_stride, src0, src1, src2, src3);
2162 XORI_B4_128_SB(src0, src1, src2, src3);
2164 hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2165 mask0, mask1, mask2);
2166 hz_out7 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
2167 mask0, mask1, mask2);
2169 PCKOD_D2_SH(hz_out5, hz_out5, hz_out7, hz_out7, hz_out6, hz_out8);
/* vertical 6-tap over the 9 horizontal intermediates -> 4 output rows */
2171 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
2172 hz_out3, hz_out4, hz_out5);
2173 res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
2174 hz_out4, hz_out5, hz_out6);
2175 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out2, hz_out3, hz_out4,
2176 hz_out5, hz_out6, hz_out7);
2177 res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out3, hz_out4, hz_out5,
2178 hz_out6, hz_out7, hz_out8);
/* pack to bytes, average with existing dst pixels, store 4x4 */
2179 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2180 tmp0 = PCKEV_XORI128_UB(res0, res1);
2181 tmp1 = PCKEV_XORI128_UB(res2, res3);
2182 PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2, tmp3);
2183 AVER_UB2_UB(tmp0, tmp2, tmp1, tmp3, tmp0, tmp1);
2185 ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
/* H.264 centre half-pel ("mid") luma MC for an 8-wide column of `height`
 * rows with dst averaging: horizontal 6-tap per row, then vertical 6-tap
 * over the intermediates, 4 output rows per iteration.
 * NOTE(review): parts of the signature and the loop/function closers are
 * elided in this extract. */
2188 static void avc_luma_mid_and_aver_dst_8w_msa(const uint8_t *src,
2190 uint8_t *dst, int32_t dst_stride,
2194 v16i8 src0, src1, src2, src3, src4;
2195 v16i8 mask0, mask1, mask2;
2196 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2197 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2198 v16u8 dst0, dst1, dst2, dst3;
2199 v8i16 res0, res1, res2, res3;
/* shuffle masks for the 8-wide horizontal filter variant (offset 0) */
2201 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* prime the vertical window with 5 horizontally filtered rows */
2203 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2204 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2205 src += (5 * src_stride);
2207 hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2208 hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2209 hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2210 hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
2211 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
2213 for (loop_cnt = (height >> 2); loop_cnt--;) {
2214 LD_SB4(src, src_stride, src0, src1, src2, src3);
2215 XORI_B4_128_SB(src0, src1, src2, src3);
2216 src += (4 * src_stride);
2218 hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2219 hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2220 hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2221 hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
/* vertical 6-tap over the sliding 6-row window of intermediates */
2223 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
2224 hz_out3, hz_out4, hz_out5);
2225 res1 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
2226 hz_out4, hz_out5, hz_out6);
2227 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
2228 hz_out5, hz_out6, hz_out7);
2229 res3 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
2230 hz_out6, hz_out7, hz_out8);
/* convert to bytes, average with dst, store 8x4 */
2231 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2232 CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
2235 dst += (4 * dst_stride);
/* 16x16 centre half-pel MC with averaging: processed as two 8-wide
 * columns (left half, then right half at src+8/dst+8).
 * NOTE(review): the tail of the second call is elided in this extract. */
2245 static void avc_luma_mid_and_aver_dst_16x16_msa(const uint8_t *src,
2250 avc_luma_mid_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride, 16);
2251 avc_luma_mid_and_aver_dst_8w_msa(src + 8, src_stride, dst + 8, dst_stride,
/* H.264 mid horizontal-quarter-pel luma MC for a 4-wide column with dst
 * averaging: vertical 6-tap first (byte domain), then a horizontal 6-tap
 * over the vertical results (halfword domain, rounded by 10), finally the
 * 2-D result is averaged with a rounded vertical-only half-pel value and
 * with dst. horiz_offset selects which quarter position is produced.
 * NOTE(review): parts of the signature, the horiz_offset branch keywords
 * and the loop closers are elided in this extract. */
2255 static void avc_luma_midh_qrt_and_aver_dst_4w_msa(const uint8_t *src,
2260 uint8_t horiz_offset)
2263 v16i8 src0, src1, src2, src3, src4, src5, src6;
2264 v16u8 dst0, dst1, res;
2265 v8i16 vt_res0, vt_res1, vt_res2, vt_res3;
2266 v4i32 hz_res0, hz_res1;
2268 v8i16 shf_vec0, shf_vec1, shf_vec2, shf_vec3, shf_vec4, shf_vec5;
/* halfword shuffle masks pairing the 6 taps into 3 dot-products */
2269 v8i16 mask0 = { 0, 5, 1, 6, 2, 7, 3, 8 };
2270 v8i16 mask1 = { 1, 4, 2, 5, 3, 6, 4, 7 };
2271 v8i16 mask2 = { 2, 3, 3, 4, 4, 5, 5, 6 };
2272 v8i16 minus5h = __msa_ldi_h(-5);
2273 v8i16 plus20h = __msa_ldi_h(20);
2274 v8i16 zeros = { 0 };
2276 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2277 src += (5 * src_stride);
2279 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* two output rows per iteration */
2281 for (row = (height >> 1); row--;) {
2282 LD_SB2(src, src_stride, src5, src6);
2283 src += (2 * src_stride);
2285 XORI_B2_128_SB(src5, src6);
2286 LD_UB2(dst, dst_stride, dst0, dst1);
2288 dst0 = (v16u8) __msa_ilvr_w((v4i32) dst1, (v4i32) dst0);
/* vertical 6-tap for the two rows (results still unrounded halfwords) */
2290 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src0, src1, src2, src3, src4, src5,
2292 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src1, src2, src3, src4, src5, src6,
/* rearrange vertical results so the horizontal taps line up */
2294 VSHF_H3_SH(vt_res0, vt_res1, vt_res0, vt_res1, vt_res0, vt_res1,
2295 mask0, mask1, mask2, shf_vec0, shf_vec1, shf_vec2);
2296 VSHF_H3_SH(vt_res2, vt_res3, vt_res2, vt_res3, vt_res2, vt_res3,
2297 mask0, mask1, mask2, shf_vec3, shf_vec4, shf_vec5);
/* horizontal 6-tap: (1,1) + (-5,-5) + (20,20) dot-products */
2299 hz_res0 = __msa_hadd_s_w(shf_vec0, shf_vec0);
2300 DPADD_SH2_SW(shf_vec1, shf_vec2, minus5h, plus20h, hz_res0, hz_res0);
2302 hz_res1 = __msa_hadd_s_w(shf_vec3, shf_vec3);
2303 DPADD_SH2_SW(shf_vec4, shf_vec5, minus5h, plus20h, hz_res1, hz_res1);
/* 2-D result: round by 10 (two passes of gain 32), saturate */
2305 SRARI_W2_SW(hz_res0, hz_res1, 10);
2306 SAT_SW2_SW(hz_res0, hz_res1, 7);
/* vertical-only half-pel value, rounded by 5, for the quarter average */
2308 res0 = __msa_srari_h(shf_vec2, 5);
2309 res1 = __msa_srari_h(shf_vec5, 5);
2311 SAT_SH2_SH(res0, res1, 7);
/* lane selection for the two horiz_offset variants (branch elided) */
2314 res0 = __msa_ilvod_h(zeros, res0);
2315 res1 = __msa_ilvod_h(zeros, res1);
2317 ILVEV_H2_SH(res0, zeros, res1, zeros, res0, res1);
/* quarter-pel: average 2-D result with the half-pel column */
2319 hz_res0 = __msa_aver_s_w(hz_res0, (v4i32) res0);
2320 hz_res1 = __msa_aver_s_w(hz_res1, (v4i32) res1);
2321 res0 = __msa_pckev_h((v8i16) hz_res1, (v8i16) hz_res0);
2323 res = PCKEV_XORI128_UB(res0, res0);
/* average with dst and store two 4-byte rows */
2325 dst0 = __msa_aver_u_b(res, dst0);
2327 ST4x2_UB(dst0, dst, dst_stride);
2328 dst += (2 * dst_stride);
/* 8-wide wrapper: runs the 4-wide mid-horizontal-quarter kernel twice.
 * NOTE(review): the per-iteration src/dst advance lines are elided in
 * this extract — confirm against the complete file. */
2338 static void avc_luma_midh_qrt_and_aver_dst_8w_msa(const uint8_t *src,
2343 uint8_t horiz_offset)
2345 uint32_t multiple8_cnt;
2347 for (multiple8_cnt = 2; multiple8_cnt--;) {
2348 avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
2349 height, horiz_offset);
/* 16-wide wrapper: runs the 4-wide mid-horizontal-quarter kernel 4 times.
 * NOTE(review): the per-iteration src/dst advance lines are elided in
 * this extract — confirm against the complete file. */
2356 static void avc_luma_midh_qrt_and_aver_dst_16w_msa(const uint8_t *src,
2361 uint8_t horiz_offset)
2363 uint32_t multiple8_cnt;
2365 for (multiple8_cnt = 4; multiple8_cnt--;) {
2366 avc_luma_midh_qrt_and_aver_dst_4w_msa(src, src_stride, dst, dst_stride,
2367 height, horiz_offset);
/* H.264 mid vertical-quarter-pel luma MC for a 4-wide column with dst
 * averaging: horizontal 6-tap per row, vertical 6-tap over the
 * intermediates, then the 2-D result is averaged with the rounded
 * horizontal-only half-pel row chosen by the (elided) vert_offset branch,
 * and finally with dst. NOTE(review): parts of the signature, branch
 * keywords and the store/closing lines are elided in this extract. */
2374 static void avc_luma_midv_qrt_and_aver_dst_4w_msa(const uint8_t *src,
2383 v16i8 src0, src1, src2, src3, src4;
2385 v16i8 mask0, mask1, mask2;
2386 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2387 v8i16 hz_out4, hz_out5, hz_out6;
2388 v8i16 res0, res1, res2, res3;
/* shuffle masks for the 4-wide horizontal filter variant (offset 48) */
2391 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2392 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2393 src += (5 * src_stride);
2395 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* horizontal 6-tap: two rows per call, then unpack per-row results */
2397 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2398 mask0, mask1, mask2);
2399 hz_out2 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src2, src3,
2400 mask0, mask1, mask2);
2402 PCKOD_D2_SH(hz_out0, hz_out0, hz_out2, hz_out2, hz_out1, hz_out3);
2404 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
/* two output rows per iteration */
2406 for (loop_cnt = (height >> 1); loop_cnt--;) {
2407 LD_SB2(src, src_stride, src0, src1);
2408 src += (2 * src_stride);
2410 XORI_B2_128_SB(src0, src1);
2411 LD_UB2(dst, dst_stride, dst0, dst1);
2412 hz_out5 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src0, src1,
2415 hz_out6 = (v8i16) __msa_pckod_d((v2i64) hz_out5, (v2i64) hz_out5);
/* vertical 6-tap over the 6-row window of horizontal intermediates */
2416 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out0, hz_out1, hz_out2,
2417 hz_out3, hz_out4, hz_out5);
2418 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_R_SH(hz_out1, hz_out2, hz_out3,
2419 hz_out4, hz_out5, hz_out6);
/* horizontal-only half-pel rows, rounded by 5; the two variants belong
 * to the (elided) vert_offset if/else branches */
2422 res1 = __msa_srari_h(hz_out3, 5);
2423 res3 = __msa_srari_h(hz_out4, 5);
2425 res1 = __msa_srari_h(hz_out2, 5);
2426 res3 = __msa_srari_h(hz_out3, 5);
2429 SAT_SH2_SH(res1, res3, 7);
/* quarter-pel: average 2-D result with the half-pel rows */
2431 res0 = __msa_aver_s_h(res0, res1);
2432 res1 = __msa_aver_s_h(res2, res3);
2434 vec0 = PCKEV_XORI128_UB(res0, res0);
2435 vec1 = PCKEV_XORI128_UB(res1, res1);
/* average with dst; rows are extracted as 32-bit words for storing */
2437 AVER_UB2_UB(vec0, dst0, vec1, dst1, dst0, dst1);
2439 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2440 out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* H.264 mid vertical-quarter-pel luma MC for an 8-wide column with dst
 * averaging: horizontal 6-tap per row, vertical 6-tap over the
 * intermediates, 2-D result averaged with the rounded horizontal-only
 * half-pel row selected by vert_offset, then with dst; 4 rows/iteration.
 * NOTE(review): parts of the signature, the vert_offset branch keywords
 * and the loop/function closers are elided in this extract. */
2454 static void avc_luma_midv_qrt_and_aver_dst_8w_msa(const uint8_t *src,
2459 uint8_t vert_offset)
2462 v16i8 src0, src1, src2, src3, src4;
2463 v16u8 dst0, dst1, dst2, dst3;
2464 v16i8 mask0, mask1, mask2;
2465 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2466 v8i16 hz_out4, hz_out5, hz_out6, hz_out7, hz_out8;
2467 v8i16 res0, res1, res2, res3;
2468 v8i16 res4, res5, res6, res7;
/* shuffle masks for the 8-wide horizontal filter variant (offset 0) */
2470 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
/* prime the vertical window with 5 horizontally filtered rows */
2472 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2473 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2474 src += (5 * src_stride);
2476 hz_out0 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2477 hz_out1 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2478 hz_out2 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2479 hz_out3 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
2480 hz_out4 = AVC_HORZ_FILTER_SH(src4, mask0, mask1, mask2);
2482 for (loop_cnt = (height >> 2); loop_cnt--;) {
2483 LD_SB4(src, src_stride, src0, src1, src2, src3);
2484 XORI_B4_128_SB(src0, src1, src2, src3);
2485 src += (4 * src_stride);
2487 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2489 hz_out5 = AVC_HORZ_FILTER_SH(src0, mask0, mask1, mask2);
2490 hz_out6 = AVC_HORZ_FILTER_SH(src1, mask0, mask1, mask2);
2491 hz_out7 = AVC_HORZ_FILTER_SH(src2, mask0, mask1, mask2);
2492 hz_out8 = AVC_HORZ_FILTER_SH(src3, mask0, mask1, mask2);
/* vertical 6-tap over the sliding window -> 4 even-indexed results */
2494 res0 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out0, hz_out1, hz_out2,
2495 hz_out3, hz_out4, hz_out5);
2496 res2 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out1, hz_out2, hz_out3,
2497 hz_out4, hz_out5, hz_out6);
2498 res4 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out2, hz_out3, hz_out4,
2499 hz_out5, hz_out6, hz_out7);
2500 res6 = AVC_CALC_DPADD_H_6PIX_2COEFF_SH(hz_out3, hz_out4, hz_out5,
2501 hz_out6, hz_out7, hz_out8);
/* horizontal-only half-pel rows (rounded by 5); the two variants belong
 * to the (elided) vert_offset if/else branches */
2504 res1 = __msa_srari_h(hz_out3, 5);
2505 res3 = __msa_srari_h(hz_out4, 5);
2506 res5 = __msa_srari_h(hz_out5, 5);
2507 res7 = __msa_srari_h(hz_out6, 5);
2509 res1 = __msa_srari_h(hz_out2, 5);
2510 res3 = __msa_srari_h(hz_out3, 5);
2511 res5 = __msa_srari_h(hz_out4, 5);
2512 res7 = __msa_srari_h(hz_out5, 5);
2515 SAT_SH4_SH(res1, res3, res5, res7, 7);
/* quarter-pel: average each 2-D row with its half-pel counterpart */
2517 res0 = __msa_aver_s_h(res0, res1);
2518 res1 = __msa_aver_s_h(res2, res3);
2519 res2 = __msa_aver_s_h(res4, res5);
2520 res3 = __msa_aver_s_h(res6, res7);
/* convert to bytes, average with dst, store 8x4 */
2522 CONVERT_UB_AVG_ST8x4_UB(res0, res1, res2, res3, dst0, dst1, dst2, dst3,
2524 dst += (4 * dst_stride);
/* 16-wide wrapper: runs the 8-wide mid-vertical-quarter kernel twice.
 * NOTE(review): the per-iteration src/dst advance lines are elided in
 * this extract — confirm against the complete file. */
2534 static void avc_luma_midv_qrt_and_aver_dst_16w_msa(const uint8_t *src,
2539 uint8_t vert_offset)
2541 int32_t multiple8_cnt;
2543 for (multiple8_cnt = 2; multiple8_cnt--;) {
2544 avc_luma_midv_qrt_and_aver_dst_8w_msa(src, src_stride, dst, dst_stride,
2545 height, vert_offset);
/* H.264 diagonal quarter-pel luma MC for a 4x4 block with dst averaging:
 * computes a horizontal half-pel prediction from src_x and a vertical
 * half-pel prediction from src_y, averages the two (rounded by 1), then
 * averages with dst. Used by the mc11/mc31/mc13/mc33 wrappers, which pick
 * the src_x/src_y origins. NOTE(review): parts of the signature and some
 * braces are elided in this extract. */
2552 static void avc_luma_hv_qrt_and_aver_dst_4x4_msa(const uint8_t *src_x,
2553 const uint8_t *src_y,
2558 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
2559 v16u8 dst0, dst1, dst2, dst3;
2560 v16i8 src_vt0, src_vt1, src_vt2, src_vt3, src_vt4;
2561 v16i8 src_vt5, src_vt6, src_vt7, src_vt8;
2562 v16i8 mask0, mask1, mask2;
2563 v8i16 hz_out0, hz_out1, vert_out0, vert_out1;
/* shuffle masks for the 4-wide horizontal filter variant (offset 48) */
2567 LD_SB3(&luma_mask_arr[48], 16, mask0, mask1, mask2);
2568 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
2569 src_y += (5 * src_stride);
/* pair consecutive vertical-source rows into 2x4-byte lanes */
2571 src_vt0 = (v16i8) __msa_insve_w((v4i32) src_vt0, 1, (v4i32) src_vt1);
2572 src_vt1 = (v16i8) __msa_insve_w((v4i32) src_vt1, 1, (v4i32) src_vt2);
2573 src_vt2 = (v16i8) __msa_insve_w((v4i32) src_vt2, 1, (v4i32) src_vt3);
2574 src_vt3 = (v16i8) __msa_insve_w((v4i32) src_vt3, 1, (v4i32) src_vt4);
2576 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
/* horizontal half-pel: 6-tap filter on the 4 src_x rows, rounded by 5 */
2577 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
2578 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2579 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
2580 hz_out0 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz0, src_hz1,
2581 mask0, mask1, mask2);
2582 hz_out1 = AVC_XOR_VSHF_B_AND_APPLY_6TAP_HORIZ_FILT_SH(src_hz2, src_hz3,
2583 mask0, mask1, mask2);
2584 SRARI_H2_SH(hz_out0, hz_out1, 5);
2585 SAT_SH2_SH(hz_out0, hz_out1, 7);
2586 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
2588 src_vt4 = (v16i8) __msa_insve_w((v4i32) src_vt4, 1, (v4i32) src_vt5);
2589 src_vt5 = (v16i8) __msa_insve_w((v4i32) src_vt5, 1, (v4i32) src_vt6);
2590 src_vt6 = (v16i8) __msa_insve_w((v4i32) src_vt6, 1, (v4i32) src_vt7);
2591 src_vt7 = (v16i8) __msa_insve_w((v4i32) src_vt7, 1, (v4i32) src_vt8);
2593 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
/* vertical half-pel: 6-tap filter down the src_y columns, rounded by 5 */
2596 vert_out0 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt0, src_vt1, src_vt2,
2597 src_vt3, src_vt4, src_vt5);
2598 vert_out1 = AVC_CALC_DPADD_B_6PIX_2COEFF_R_SH(src_vt2, src_vt3, src_vt4,
2599 src_vt5, src_vt6, src_vt7);
2600 SRARI_H2_SH(vert_out0, vert_out1, 5);
2601 SAT_SH2_SH(vert_out0, vert_out1, 7);
/* diagonal quarter-pel = rounded average of the two half-pel results */
2603 res1 = __msa_srari_h((hz_out1 + vert_out1), 1);
2604 res0 = __msa_srari_h((hz_out0 + vert_out0), 1);
2606 SAT_SH2_SH(res0, res1, 7);
2607 res = PCKEV_XORI128_UB(res0, res1);
/* collect the four 4-byte dst rows, average, store 4x4 */
2609 dst0 = (v16u8) __msa_insve_w((v4i32) dst0, 1, (v4i32) dst1);
2610 dst1 = (v16u8) __msa_insve_w((v4i32) dst2, 1, (v4i32) dst3);
2611 dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
2612 dst0 = __msa_aver_u_b(res, dst0);
2614 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* H.264 diagonal quarter-pel luma MC for an 8x8 block with dst averaging:
 * horizontal half-pel from src_x and vertical half-pel from src_y are
 * averaged together (rounded by 1), then averaged with dst; 4 rows per
 * iteration over two iterations. NOTE(review): parts of the signature and
 * the loop/function closers are elided in this extract. */
2617 static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
2618 const uint8_t *src_y,
2624 v16i8 src_hz0, src_hz1, src_hz2, src_hz3;
2625 v16u8 dst0, dst1, dst2, dst3;
2626 v16i8 src_vt0, src_vt1, src_vt2, src_vt3;
2627 v16i8 src_vt4, src_vt5, src_vt6, src_vt7, src_vt8;
2628 v16i8 mask0, mask1, mask2;
2629 v8i16 hz_out0, hz_out1, hz_out2, hz_out3;
2630 v8i16 vert_out0, vert_out1, vert_out2, vert_out3;
2631 v8i16 out0, out1, out2, out3;
/* shuffle masks for the 8-wide horizontal filter variant (offset 0) */
2633 LD_SB3(&luma_mask_arr[0], 16, mask0, mask1, mask2);
2635 LD_SB5(src_y, src_stride, src_vt0, src_vt1, src_vt2, src_vt3, src_vt4);
2636 src_y += (5 * src_stride);
/* pair consecutive vertical-source rows into the two 8-byte lanes */
2638 src_vt0 = (v16i8) __msa_insve_d((v2i64) src_vt0, 1, (v2i64) src_vt1);
2639 src_vt1 = (v16i8) __msa_insve_d((v2i64) src_vt1, 1, (v2i64) src_vt2);
2640 src_vt2 = (v16i8) __msa_insve_d((v2i64) src_vt2, 1, (v2i64) src_vt3);
2641 src_vt3 = (v16i8) __msa_insve_d((v2i64) src_vt3, 1, (v2i64) src_vt4);
2643 XORI_B4_128_SB(src_vt0, src_vt1, src_vt2, src_vt3);
2645 for (loop_cnt = 2; loop_cnt--;) {
/* horizontal half-pel: 6-tap on the 4 src_x rows, rounded by 5 */
2646 LD_SB4(src_x, src_stride, src_hz0, src_hz1, src_hz2, src_hz3);
2647 XORI_B4_128_SB(src_hz0, src_hz1, src_hz2, src_hz3);
2648 src_x += (4 * src_stride);
2650 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
2651 hz_out0 = AVC_HORZ_FILTER_SH(src_hz0, mask0, mask1, mask2);
2652 hz_out1 = AVC_HORZ_FILTER_SH(src_hz1, mask0, mask1, mask2);
2653 hz_out2 = AVC_HORZ_FILTER_SH(src_hz2, mask0, mask1, mask2);
2654 hz_out3 = AVC_HORZ_FILTER_SH(src_hz3, mask0, mask1, mask2);
2655 SRARI_H4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 5);
2656 SAT_SH4_SH(hz_out0, hz_out1, hz_out2, hz_out3, 7);
2657 LD_SB4(src_y, src_stride, src_vt5, src_vt6, src_vt7, src_vt8);
2658 src_y += (4 * src_stride);
2660 src_vt4 = (v16i8) __msa_insve_d((v2i64) src_vt4, 1, (v2i64) src_vt5);
2661 src_vt5 = (v16i8) __msa_insve_d((v2i64) src_vt5, 1, (v2i64) src_vt6);
2662 src_vt6 = (v16i8) __msa_insve_d((v2i64) src_vt6, 1, (v2i64) src_vt7);
2663 src_vt7 = (v16i8) __msa_insve_d((v2i64) src_vt7, 1, (v2i64) src_vt8);
2665 XORI_B4_128_SB(src_vt4, src_vt5, src_vt6, src_vt7);
/* vertical half-pel: 6-tap down the src_y columns, rounded by 5 */
2666 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt0, src_vt1, src_vt2, src_vt3,
2667 src_vt4, src_vt5, vert_out0, vert_out1);
2668 AVC_CALC_DPADD_B_6PIX_2COEFF_SH(src_vt2, src_vt3, src_vt4, src_vt5,
2669 src_vt6, src_vt7, vert_out2, vert_out3);
2670 SRARI_H4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 5);
2671 SAT_SH4_SH(vert_out0, vert_out1, vert_out2, vert_out3, 7);
/* diagonal quarter-pel = rounded average of the two half-pel results */
2673 out0 = __msa_srari_h((hz_out0 + vert_out0), 1);
2674 out1 = __msa_srari_h((hz_out1 + vert_out1), 1);
2675 out2 = __msa_srari_h((hz_out2 + vert_out2), 1);
2676 out3 = __msa_srari_h((hz_out3 + vert_out3), 1);
2678 SAT_SH4_SH(out0, out1, out2, out3, 7);
/* convert to bytes, average with dst, store 8x4 */
2679 CONVERT_UB_AVG_ST8x4_UB(out0, out1, out2, out3, dst0, dst1, dst2, dst3,
2681 dst += (4 * dst_stride);
/* 16x16 diagonal quarter-pel MC with averaging: four 8x8 kernel calls in
 * two passes, with pointer adjustments between passes.
 * NOTE(review): the per-iteration advance lines inside each loop and the
 * call argument tails are elided in this extract — confirm the exact
 * traversal order against the complete file. */
2691 static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
2692 const uint8_t *src_y,
2697 uint32_t multiple8_cnt;
2699 for (multiple8_cnt = 2; multiple8_cnt--;) {
2700 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
/* step to the second pass's origin (down 8 rows, back 16 columns) */
2708 src_x += (8 * src_stride) - 16;
2709 src_y += (8 * src_stride) - 16;
2710 dst += (8 * dst_stride) - 16;
2712 for (multiple8_cnt = 2; multiple8_cnt--;) {
2713 avc_luma_hv_qrt_and_aver_dst_8x8_msa(src_x, src_y, src_stride,
/* Copy an 8-pixel-wide column of `height` rows from src to dst using
 * 64-bit lane extracts; the branches pick the largest unroll factor that
 * divides height (12, 8, 4, then 2). NOTE(review): the final stores of
 * the height%2 branch and several braces are elided in this extract. */
2722 static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
2723 uint8_t *dst, int32_t dst_stride,
2727 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
2728 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2730 if (0 == height % 12) {
/* 12 rows per iteration: 8 vector loads + 4 more below */
2731 for (cnt = (height / 12); cnt--;) {
2732 LD_UB8(src, src_stride,
2733 src0, src1, src2, src3, src4, src5, src6, src7);
2734 src += (8 * src_stride);
/* only the low 8 bytes of each row are copied */
2736 out0 = __msa_copy_u_d((v2i64) src0, 0);
2737 out1 = __msa_copy_u_d((v2i64) src1, 0);
2738 out2 = __msa_copy_u_d((v2i64) src2, 0);
2739 out3 = __msa_copy_u_d((v2i64) src3, 0);
2740 out4 = __msa_copy_u_d((v2i64) src4, 0);
2741 out5 = __msa_copy_u_d((v2i64) src5, 0);
2742 out6 = __msa_copy_u_d((v2i64) src6, 0);
2743 out7 = __msa_copy_u_d((v2i64) src7, 0);
2745 SD4(out0, out1, out2, out3, dst, dst_stride);
2746 dst += (4 * dst_stride);
2747 SD4(out4, out5, out6, out7, dst, dst_stride);
2748 dst += (4 * dst_stride);
/* remaining 4 of the 12 rows */
2750 LD_UB4(src, src_stride, src0, src1, src2, src3);
2751 src += (4 * src_stride);
2753 out0 = __msa_copy_u_d((v2i64) src0, 0);
2754 out1 = __msa_copy_u_d((v2i64) src1, 0);
2755 out2 = __msa_copy_u_d((v2i64) src2, 0);
2756 out3 = __msa_copy_u_d((v2i64) src3, 0);
2758 SD4(out0, out1, out2, out3, dst, dst_stride);
2759 dst += (4 * dst_stride);
2761 } else if (0 == height % 8) {
2762 for (cnt = height >> 3; cnt--;) {
2763 LD_UB8(src, src_stride,
2764 src0, src1, src2, src3, src4, src5, src6, src7);
2765 src += (8 * src_stride);
2767 out0 = __msa_copy_u_d((v2i64) src0, 0);
2768 out1 = __msa_copy_u_d((v2i64) src1, 0);
2769 out2 = __msa_copy_u_d((v2i64) src2, 0);
2770 out3 = __msa_copy_u_d((v2i64) src3, 0);
2771 out4 = __msa_copy_u_d((v2i64) src4, 0);
2772 out5 = __msa_copy_u_d((v2i64) src5, 0);
2773 out6 = __msa_copy_u_d((v2i64) src6, 0);
2774 out7 = __msa_copy_u_d((v2i64) src7, 0);
2776 SD4(out0, out1, out2, out3, dst, dst_stride);
2777 dst += (4 * dst_stride);
2778 SD4(out4, out5, out6, out7, dst, dst_stride);
2779 dst += (4 * dst_stride);
2781 } else if (0 == height % 4) {
2782 for (cnt = (height / 4); cnt--;) {
2783 LD_UB4(src, src_stride, src0, src1, src2, src3);
2784 src += (4 * src_stride);
2785 out0 = __msa_copy_u_d((v2i64) src0, 0);
2786 out1 = __msa_copy_u_d((v2i64) src1, 0);
2787 out2 = __msa_copy_u_d((v2i64) src2, 0);
2788 out3 = __msa_copy_u_d((v2i64) src3, 0);
2790 SD4(out0, out1, out2, out3, dst, dst_stride);
2791 dst += (4 * dst_stride);
2793 } else if (0 == height % 2) {
2794 for (cnt = (height / 2); cnt--;) {
2795 LD_UB2(src, src_stride, src0, src1);
2796 src += (2 * src_stride);
2797 out0 = __msa_copy_u_d((v2i64) src0, 0);
2798 out1 = __msa_copy_u_d((v2i64) src1, 0);
/* Copy a block whose width is a multiple of 16 and height a multiple of
 * 8: one 16-byte column strip at a time, 8 rows per inner iteration.
 * NOTE(review): the src_tmp/dst_tmp initialisation and the per-strip
 * src/dst advance lines are elided in this extract. */
2808 static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
2809 uint8_t *dst, int32_t dst_stride,
2810 int32_t height, int32_t width)
2812 int32_t cnt, loop_cnt;
2813 const uint8_t *src_tmp;
2815 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
/* outer loop: one 16-byte-wide strip per iteration */
2817 for (cnt = (width >> 4); cnt--;) {
2821 for (loop_cnt = (height >> 3); loop_cnt--;) {
2822 LD_UB8(src_tmp, src_stride,
2823 src0, src1, src2, src3, src4, src5, src6, src7);
2824 src_tmp += (8 * src_stride);
2826 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2827 dst_tmp, dst_stride);
2828 dst_tmp += (8 * dst_stride);
/* Copy a 16-pixel-wide column of `height` rows; branches choose the
 * largest unroll (12 rows, delegate to copy_16multx8mult_msa for
 * multiples of 8, else 4 rows). NOTE(review): some closing braces and
 * an ST_UB8 argument tail are elided in this extract. */
2836 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
2837 uint8_t *dst, int32_t dst_stride,
2841 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2843 if (0 == height % 12) {
/* 12 rows per iteration: 8 + 4 */
2844 for (cnt = (height / 12); cnt--;) {
2845 LD_UB8(src, src_stride,
2846 src0, src1, src2, src3, src4, src5, src6, src7);
2847 src += (8 * src_stride);
2848 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
2850 dst += (8 * dst_stride);
2852 LD_UB4(src, src_stride, src0, src1, src2, src3);
2853 src += (4 * src_stride);
2854 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2855 dst += (4 * dst_stride);
2857 } else if (0 == height % 8) {
2858 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
2859 } else if (0 == height % 4) {
2860 for (cnt = (height >> 2); cnt--;) {
2861 LD_UB4(src, src_stride, src0, src1, src2, src3);
2862 src += (4 * src_stride);
2864 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
2865 dst += (4 * dst_stride);
/* Average a 4-pixel-wide column of src into dst (dst = avg(src, dst)),
 * 4 rows per iteration when height%4 == 0, else 2 rows per iteration.
 * NOTE(review): the final stores of the height%2 branch and closing
 * braces are elided in this extract. */
2870 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
2871 uint8_t *dst, int32_t dst_stride,
2875 uint32_t out0, out1, out2, out3;
2876 v16u8 src0, src1, src2, src3;
2877 v16u8 dst0, dst1, dst2, dst3;
2879 if (0 == (height % 4)) {
2880 for (cnt = (height / 4); cnt--;) {
2881 LD_UB4(src, src_stride, src0, src1, src2, src3);
2882 src += (4 * src_stride);
2884 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* rounded unsigned byte average of each src/dst row pair */
2886 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2887 dst0, dst1, dst2, dst3);
/* only the low 4 bytes of each row are written back */
2889 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2890 out1 = __msa_copy_u_w((v4i32) dst1, 0);
2891 out2 = __msa_copy_u_w((v4i32) dst2, 0);
2892 out3 = __msa_copy_u_w((v4i32) dst3, 0);
2893 SW4(out0, out1, out2, out3, dst, dst_stride);
2894 dst += (4 * dst_stride);
2896 } else if (0 == (height % 2)) {
2897 for (cnt = (height / 2); cnt--;) {
2898 LD_UB2(src, src_stride, src0, src1);
2899 src += (2 * src_stride);
2901 LD_UB2(dst, dst_stride, dst0, dst1);
2903 AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
2905 out0 = __msa_copy_u_w((v4i32) dst0, 0);
2906 out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* Average an 8-pixel-wide column of src into dst (dst = avg(src, dst)),
 * 4 rows per iteration; stores via 64-bit lane extracts.
 * NOTE(review): the loop/function closers are elided in this extract. */
2915 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
2916 uint8_t *dst, int32_t dst_stride,
2920 uint64_t out0, out1, out2, out3;
2921 v16u8 src0, src1, src2, src3;
2922 v16u8 dst0, dst1, dst2, dst3;
2924 for (cnt = (height / 4); cnt--;) {
2925 LD_UB4(src, src_stride, src0, src1, src2, src3);
2926 src += (4 * src_stride);
2927 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
/* rounded unsigned byte average of each src/dst row pair */
2929 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2930 dst0, dst1, dst2, dst3);
/* only the low 8 bytes of each row are written back */
2932 out0 = __msa_copy_u_d((v2i64) dst0, 0);
2933 out1 = __msa_copy_u_d((v2i64) dst1, 0);
2934 out2 = __msa_copy_u_d((v2i64) dst2, 0);
2935 out3 = __msa_copy_u_d((v2i64) dst3, 0);
2936 SD4(out0, out1, out2, out3, dst, dst_stride);
2937 dst += (4 * dst_stride);
/* Average a 16-pixel-wide column of src into dst (dst = avg(src, dst)),
 * 8 full-vector rows per iteration.
 * NOTE(review): the loop/function closers are elided in this extract. */
2941 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
2942 uint8_t *dst, int32_t dst_stride,
2946 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
2947 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2949 for (cnt = (height / 8); cnt--;) {
2950 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2951 src += (8 * src_stride);
2952 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
/* rounded unsigned byte average, 8 rows, then store all 8 */
2954 AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
2955 dst0, dst1, dst2, dst3);
2956 AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
2957 dst4, dst5, dst6, dst7);
2958 ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
2959 dst += (8 * dst_stride);
/* Full-pel (0,0) MC entry points: "put" is a plain copy, "avg" averages
 * the source into dst. NOTE(review): the trailing stride parameter and
 * function braces are elided from each wrapper in this extract. */
2963 void ff_put_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
2966 copy_width16_msa(src, stride, dst, stride, 16);
2969 void ff_put_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
2972 copy_width8_msa(src, stride, dst, stride, 8);
2975 void ff_avg_h264_qpel16_mc00_msa(uint8_t *dst, const uint8_t *src,
2978 avg_width16_msa(src, stride, dst, stride, 16);
2981 void ff_avg_h264_qpel8_mc00_msa(uint8_t *dst, const uint8_t *src,
2984 avg_width8_msa(src, stride, dst, stride, 8);
2987 void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
2990 avg_width4_msa(src, stride, dst, stride, 4);
/* Horizontal MC entry points: mc10/mc30 are the quarter-pel positions on
 * either side of the half-pel column (last argument 0/1 selects the
 * side); mc20 is the horizontal half-pel. `src - 2` centres the 6-tap
 * window. NOTE(review): trailing stride parameters and braces are elided
 * from each wrapper in this extract. */
2993 void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
2996 avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 0);
2999 void ff_put_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
3002 avc_luma_hz_qrt_16w_msa(src - 2, stride, dst, stride, 16, 1);
3005 void ff_put_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
3008 avc_luma_hz_qrt_8w_msa(src - 2, stride, dst, stride, 8, 0);
3011 void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
3014 avc_luma_hz_qrt_8w_msa(src - 2, stride, dst, stride, 8, 1);
3017 void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
3020 avc_luma_hz_qrt_4w_msa(src - 2, stride, dst, stride, 4, 0);
3023 void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
3026 avc_luma_hz_qrt_4w_msa(src - 2, stride, dst, stride, 4, 1);
3029 void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
3032 avc_luma_hz_16w_msa(src - 2, stride, dst, stride, 16);
3035 void ff_put_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
3038 avc_luma_hz_8w_msa(src - 2, stride, dst, stride, 8);
3041 void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
3044 avc_luma_hz_4w_msa(src - 2, stride, dst, stride, 4);
/* Vertical quarter-pel MC entry points: mc01/mc03 are above/below the
 * half-pel row (last argument 0/1 selects the side); `src - 2*stride`
 * centres the 6-tap window vertically. NOTE(review): trailing stride
 * parameters and braces are elided from each wrapper in this extract. */
3047 void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
3050 avc_luma_vt_qrt_16w_msa(src - (stride * 2), stride, dst, stride, 16, 0);
3053 void ff_put_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
3056 avc_luma_vt_qrt_16w_msa(src - (stride * 2), stride, dst, stride, 16, 1);
3059 void ff_put_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
3062 avc_luma_vt_qrt_8w_msa(src - (stride * 2), stride, dst, stride, 8, 0);
3065 void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
3068 avc_luma_vt_qrt_8w_msa(src - (stride * 2), stride, dst, stride, 8, 1);
3071 void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
3074 avc_luma_vt_qrt_4w_msa(src - (stride * 2), stride, dst, stride, 4, 0);
3077 void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
3080 avc_luma_vt_qrt_4w_msa(src - (stride * 2), stride, dst, stride, 4, 1);
/* Diagonal quarter-pel luma MC (mcXY, X,Y in {1,3}): average of a
 * horizontally filtered and a vertically filtered prediction.
 * First argument: horizontal-filter source (row offset +stride for Y = 3).
 * Second argument: vertical-filter source (column offset +1 for X = 3). */
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src - 2,
                            src - (stride * 2), stride, dst, stride, 16);
}

void ff_put_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src - 2,
                            src - (stride * 2) +
                            sizeof(uint8_t), stride, dst, stride, 16);
}

void ff_put_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src + stride - 2,
                            src - (stride * 2), stride, dst, stride, 16);
}

void ff_put_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_16w_msa(src + stride - 2,
                            src - (stride * 2) +
                            sizeof(uint8_t), stride, dst, stride, 16);
}

void ff_put_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src - 2, src - (stride * 2), stride, dst, stride, 8);
}

void ff_put_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 8);
}

void ff_put_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src + stride - 2,
                           src - (stride * 2), stride, dst, stride, 8);
}

void ff_put_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_8w_msa(src + stride - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 8);
}

void ff_put_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src - 2, src - (stride * 2), stride, dst, stride, 4);
}

void ff_put_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 4);
}

void ff_put_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src + stride - 2,
                           src - (stride * 2), stride, dst, stride, 4);
}

void ff_put_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_4w_msa(src + stride - 2,
                           src - (stride * 2) +
                           sizeof(uint8_t), stride, dst, stride, 4);
}
/* Quarter-pel positions (2,1) and (2,3): horizontal half-pel combined with
 * a vertical quarter-pel phase (flag 0 = upper, 1 = lower). Source is
 * rewound 2 rows and 2 columns for the 2-D filter context. */
void ff_put_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2,
                              stride, dst, stride, 16, 0);
}

void ff_put_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midv_qrt_16w_msa(src - (2 * stride) - 2,
                              stride, dst, stride, 16, 1);
}

void ff_put_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
}

void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
}

void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
}

void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
}
/* Half-pel vertical luma MC (mc02): src rewound 2 rows for filter context. */
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_vt_16w_msa(src - (stride * 2), stride, dst, stride, 16);
}

void ff_put_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_8w_msa(src - (stride * 2), stride, dst, stride, 8);
}

void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_4w_msa(src - (stride * 2), stride, dst, stride, 4);
}
/* Quarter-pel positions (1,2) and (3,2): vertical half-pel combined with a
 * horizontal quarter-pel phase (flag 0 = left, 1 = right). */
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
                              stride, dst, stride, 16, 0);
}

void ff_put_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_16w_msa(src - (2 * stride) - 2,
                              stride, dst, stride, 16, 1);
}

void ff_put_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 0);
}

void ff_put_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8, 1);
}

void ff_put_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 0);
}

void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4, 1);
}
/* Centre half-pel position (2,2): full 2-D filter, src rewound 2 rows and
 * 2 columns. */
void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_mid_16w_msa(src - (2 * stride) - 2, stride, dst, stride, 16);
}

void ff_put_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_8w_msa(src - (2 * stride) - 2, stride, dst, stride, 8);
}

void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_4w_msa(src - (2 * stride) - 2, stride, dst, stride, 4);
}
/* Averaging variants of the quarter-pel horizontal MC: the filtered result
 * is averaged into the existing dst contents. */
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 0);
}

void ff_avg_h264_qpel16_mc30_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_16x16_msa(src - 2, stride, dst, stride, 1);
}

void ff_avg_h264_qpel8_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 0);
}

void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_8x8_msa(src - 2, stride, dst, stride, 1);
}

void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 0);
}

void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_qrt_and_aver_dst_4x4_msa(src - 2, stride, dst, stride, 1);
}
/* Averaging variants of the half-pel horizontal MC (mc20). */
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hz_and_aver_dst_16x16_msa(src - 2, stride, dst, stride);
}

void ff_avg_h264_qpel8_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_and_aver_dst_8x8_msa(src - 2, stride, dst, stride);
}

void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hz_and_aver_dst_4x4_msa(src - 2, stride, dst, stride);
}
/* Averaging variants of the quarter-pel vertical MC (mc01/mc03). */
void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
                                           stride, dst, stride, 0);
}

void ff_avg_h264_qpel16_mc03_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_16x16_msa(src - (stride * 2),
                                           stride, dst, stride, 1);
}

void ff_avg_h264_qpel8_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
                                         stride, dst, stride, 0);
}

void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_8x8_msa(src - (stride * 2),
                                         stride, dst, stride, 1);
}

void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
                                         stride, dst, stride, 0);
}

void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_qrt_and_aver_dst_4x4_msa(src - (stride * 2),
                                         stride, dst, stride, 1);
}
/* Averaging variants of the diagonal quarter-pel MC (mcXY, X,Y in {1,3}).
 * First argument: horizontal-filter source (row +stride for Y = 3).
 * Second argument: vertical-filter source (column +1 for X = 3).
 * NOTE(review): several argument lines were dropped in the mangled paste;
 * the second source pointer is reconstructed to mirror the put_ variants. */
void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
                                           src - (stride * 2),
                                           stride, dst, stride);
}

void ff_avg_h264_qpel16_mc31_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src - 2,
                                           src - (stride * 2) +
                                           sizeof(uint8_t), stride,
                                           dst, stride);
}

void ff_avg_h264_qpel16_mc13_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
                                           src - (stride * 2),
                                           stride, dst, stride);
}

void ff_avg_h264_qpel16_mc33_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_16x16_msa(src + stride - 2,
                                           src - (stride * 2) +
                                           sizeof(uint8_t), stride,
                                           dst, stride);
}

void ff_avg_h264_qpel8_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel8_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}

void ff_avg_h264_qpel8_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel8_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_8x8_msa(src + stride - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}

void ff_avg_h264_qpel4_mc11_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel4_mc31_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}

void ff_avg_h264_qpel4_mc13_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
                                         src - (stride * 2),
                                         stride, dst, stride);
}

void ff_avg_h264_qpel4_mc33_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_hv_qrt_and_aver_dst_4x4_msa(src + stride - 2,
                                         src - (stride * 2) +
                                         sizeof(uint8_t), stride, dst, stride);
}
/* Averaging variants of the (2,1)/(2,3) positions: horizontal half-pel plus
 * vertical quarter-pel phase (flag 0 = upper, 1 = lower). */
void ff_avg_h264_qpel16_mc21_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 0);
}

void ff_avg_h264_qpel16_mc23_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 1);
}

void ff_avg_h264_qpel8_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 0);
}

void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 1);
}

void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 0);
}

void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midv_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 1);
}
/* Averaging variants of the half-pel vertical MC (mc02). */
void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_vt_and_aver_dst_16x16_msa(src - (stride * 2), stride, dst, stride);
}

void ff_avg_h264_qpel8_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_and_aver_dst_8x8_msa(src - (stride * 2), stride, dst, stride);
}

void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_vt_and_aver_dst_4x4_msa(src - (stride * 2), stride, dst, stride);
}
/* Averaging variants of the (1,2)/(3,2) positions: vertical half-pel plus
 * horizontal quarter-pel phase (flag 0 = left, 1 = right). */
void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 0);
}

void ff_avg_h264_qpel16_mc32_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_16w_msa(src - (2 * stride) - 2,
                                           stride, dst, stride, 16, 1);
}

void ff_avg_h264_qpel8_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 0);
}

void ff_avg_h264_qpel8_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 8, 1);
}

void ff_avg_h264_qpel4_mc12_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 0);
}

void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_midh_qrt_and_aver_dst_4w_msa(src - (2 * stride) - 2,
                                          stride, dst, stride, 4, 1);
}
/* Averaging variants of the centre half-pel position (2,2). */
void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
                                 ptrdiff_t stride)
{
    avc_luma_mid_and_aver_dst_16x16_msa(src - (2 * stride) - 2,
                                        stride, dst, stride);
}

void ff_avg_h264_qpel8_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_and_aver_dst_8w_msa(src - (2 * stride) - 2,
                                     stride, dst, stride, 8);
}

void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
                                ptrdiff_t stride)
{
    avc_luma_mid_and_aver_dst_4x4_msa(src - (2 * stride) - 2,
                                      stride, dst, stride);
}