2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Copy an 8-byte-wide block of pixels from src to dst.
 * Rows are loaded as MSA vectors and only the low 64 bits of each row are
 * stored, via unrolled paths for heights divisible by 12, 8, 4 and 2.
 * NOTE(review): several lines (height parameter, braces, cnt declaration)
 * are elided in this view of the source. */
25 static void copy_width8_msa(uint8_t *src, int32_t src_stride,
26 uint8_t *dst, int32_t dst_stride,
30 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
31 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
33 if (0 == height % 12) {
34 for (cnt = (height / 12); cnt--;) {
/* first 8 of 12 rows: load vectors, extract low doubleword of each */
35 LD_UB8(src, src_stride,
36 src0, src1, src2, src3, src4, src5, src6, src7);
37 src += (8 * src_stride);
39 out0 = __msa_copy_u_d((v2i64) src0, 0);
40 out1 = __msa_copy_u_d((v2i64) src1, 0);
41 out2 = __msa_copy_u_d((v2i64) src2, 0);
42 out3 = __msa_copy_u_d((v2i64) src3, 0);
43 out4 = __msa_copy_u_d((v2i64) src4, 0);
44 out5 = __msa_copy_u_d((v2i64) src5, 0);
45 out6 = __msa_copy_u_d((v2i64) src6, 0);
46 out7 = __msa_copy_u_d((v2i64) src7, 0);
48 SD4(out0, out1, out2, out3, dst, dst_stride);
49 dst += (4 * dst_stride);
50 SD4(out4, out5, out6, out7, dst, dst_stride);
51 dst += (4 * dst_stride);
/* remaining 4 of 12 rows */
53 LD_UB4(src, src_stride, src0, src1, src2, src3);
54 src += (4 * src_stride);
56 out0 = __msa_copy_u_d((v2i64) src0, 0);
57 out1 = __msa_copy_u_d((v2i64) src1, 0);
58 out2 = __msa_copy_u_d((v2i64) src2, 0);
59 out3 = __msa_copy_u_d((v2i64) src3, 0);
61 SD4(out0, out1, out2, out3, dst, dst_stride);
62 dst += (4 * dst_stride);
64 } else if (0 == height % 8) {
/* 8 rows per iteration */
65 for (cnt = height >> 3; cnt--;) {
66 LD_UB8(src, src_stride,
67 src0, src1, src2, src3, src4, src5, src6, src7);
68 src += (8 * src_stride);
70 out0 = __msa_copy_u_d((v2i64) src0, 0);
71 out1 = __msa_copy_u_d((v2i64) src1, 0);
72 out2 = __msa_copy_u_d((v2i64) src2, 0);
73 out3 = __msa_copy_u_d((v2i64) src3, 0);
74 out4 = __msa_copy_u_d((v2i64) src4, 0);
75 out5 = __msa_copy_u_d((v2i64) src5, 0);
76 out6 = __msa_copy_u_d((v2i64) src6, 0);
77 out7 = __msa_copy_u_d((v2i64) src7, 0);
79 SD4(out0, out1, out2, out3, dst, dst_stride);
80 dst += (4 * dst_stride);
81 SD4(out4, out5, out6, out7, dst, dst_stride);
82 dst += (4 * dst_stride);
84 } else if (0 == height % 4) {
/* 4 rows per iteration */
85 for (cnt = (height / 4); cnt--;) {
86 LD_UB4(src, src_stride, src0, src1, src2, src3);
87 src += (4 * src_stride);
88 out0 = __msa_copy_u_d((v2i64) src0, 0);
89 out1 = __msa_copy_u_d((v2i64) src1, 0);
90 out2 = __msa_copy_u_d((v2i64) src2, 0);
91 out3 = __msa_copy_u_d((v2i64) src3, 0);
93 SD4(out0, out1, out2, out3, dst, dst_stride);
94 dst += (4 * dst_stride);
96 } else if (0 == height % 2) {
/* 2 rows per iteration; stores for this branch are elided below */
97 for (cnt = (height / 2); cnt--;) {
98 LD_UB2(src, src_stride, src0, src1);
99 src += (2 * src_stride);
100 out0 = __msa_copy_u_d((v2i64) src0, 0);
101 out1 = __msa_copy_u_d((v2i64) src1, 0);
/* Copy a 12-byte-wide block: two batches of 8 rows stored with
 * ST12x8_UB (16 rows total in the visible body). */
111 static void copy_width12_msa(uint8_t *src, int32_t src_stride,
112 uint8_t *dst, int32_t dst_stride,
115 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
117 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
118 src += (8 * src_stride);
119 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
120 dst += (8 * dst_stride);
121 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
122 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
/* Generic copy for widths that are multiples of 16 and heights that are
 * multiples of 8: processes one 16-column stripe at a time, 8 rows per
 * inner iteration.  (src_tmp/dst_tmp setup and the per-stripe +16
 * advance are elided in this view.) */
125 static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
126 uint8_t *dst, int32_t dst_stride,
127 int32_t height, int32_t width)
129 int32_t cnt, loop_cnt;
130 uint8_t *src_tmp, *dst_tmp;
131 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
133 for (cnt = (width >> 4); cnt--;) {
137 for (loop_cnt = (height >> 3); loop_cnt--;) {
138 LD_UB8(src_tmp, src_stride,
139 src0, src1, src2, src3, src4, src5, src6, src7);
140 src_tmp += (8 * src_stride);
142 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
143 dst_tmp, dst_stride);
144 dst_tmp += (8 * dst_stride);
/* Copy a 16-byte-wide block.  Whole vectors are loaded and stored;
 * heights divisible by 12 and 4 are handled inline, heights divisible
 * by 8 are delegated to copy_16multx8mult_msa. */
152 static void copy_width16_msa(uint8_t *src, int32_t src_stride,
153 uint8_t *dst, int32_t dst_stride,
157 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
159 if (0 == height % 12) {
160 for (cnt = (height / 12); cnt--;) {
161 LD_UB8(src, src_stride,
162 src0, src1, src2, src3, src4, src5, src6, src7);
163 src += (8 * src_stride);
164 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
166 dst += (8 * dst_stride);
/* remaining 4 of 12 rows */
168 LD_UB4(src, src_stride, src0, src1, src2, src3);
169 src += (4 * src_stride);
170 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
171 dst += (4 * dst_stride);
173 } else if (0 == height % 8) {
174 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
175 } else if (0 == height % 4) {
176 for (cnt = (height >> 2); cnt--;) {
177 LD_UB4(src, src_stride, src0, src1, src2, src3);
178 src += (4 * src_stride);
180 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
181 dst += (4 * dst_stride);
/* Copy a 24-byte-wide block: 16-wide vector copy plus an 8-wide copy
 * for the remaining columns. */
186 static void copy_width24_msa(uint8_t *src, int32_t src_stride,
187 uint8_t *dst, int32_t dst_stride,
190 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
191 copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height);
/* Copy a 32-byte-wide block: two 16-byte vectors per row.  The height
 * % 12 path unrolls three 4-row groups per iteration; height % 8 is
 * delegated to copy_16multx8mult_msa. */
194 static void copy_width32_msa(uint8_t *src, int32_t src_stride,
195 uint8_t *dst, int32_t dst_stride,
199 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
201 if (0 == height % 12) {
202 for (cnt = (height / 12); cnt--;) {
/* rows 0-3 of the 12-row group */
203 LD_UB4(src, src_stride, src0, src1, src2, src3);
204 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
205 src += (4 * src_stride);
206 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
207 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
208 dst += (4 * dst_stride);
/* rows 4-7 */
210 LD_UB4(src, src_stride, src0, src1, src2, src3);
211 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
212 src += (4 * src_stride);
213 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
214 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
215 dst += (4 * dst_stride);
/* rows 8-11 */
217 LD_UB4(src, src_stride, src0, src1, src2, src3);
218 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
219 src += (4 * src_stride);
220 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
221 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
222 dst += (4 * dst_stride);
224 } else if (0 == height % 8) {
225 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
226 } else if (0 == height % 4) {
227 for (cnt = (height >> 2); cnt--;) {
228 LD_UB4(src, src_stride, src0, src1, src2, src3);
229 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
230 src += (4 * src_stride);
231 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
232 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
233 dst += (4 * dst_stride);
/* Copy a 48-byte-wide block via the generic 16-multiple copy. */
238 static void copy_width48_msa(uint8_t *src, int32_t src_stride,
239 uint8_t *dst, int32_t dst_stride,
242 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
/* Copy a 64-byte-wide block via the generic 16-multiple copy. */
245 static void copy_width64_msa(uint8_t *src, int32_t src_stride,
246 uint8_t *dst, int32_t dst_stride,
249 copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
/* Byte-shuffle masks for VSHF-based filter input rearrangement.
 * First 16 entries: presumably the 8-pixel-wide case; next two rows of
 * 16 look like 4-pixel-wide cases packing two rows per vector (indices
 * >= 16 select from the second shuffle operand) — TODO confirm against
 * the elided original comments. */
252 static const uint8_t mc_filt_mask_arr[16 * 3] = {
254 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
256 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
258 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
/* 8-tap filter core: two dot products (taps 0-1 and taps 2-3 pairs via
 * dotp/dpadd on byte vectors) whose halfword partial sums are combined
 * with a saturating add.  Comments cannot be inserted inside the macro
 * body without breaking line continuation. */
261 #define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3, \
262 filt0, filt1, filt2, filt3) \
266 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
267 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
268 tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2); \
269 tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3); \
270 tmp0 = __msa_adds_s_h(tmp0, tmp1); \
/* Horizontal 8-tap filtering of four 4-wide rows packed two-per-vector:
 * shuffles source bytes with mask0..mask3, accumulates the four tap
 * pairs with dotp/dpadd, and combines partial sums with saturating adds
 * into out0/out1. */
275 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
276 mask0, mask1, mask2, mask3, \
277 filt0, filt1, filt2, filt3, \
280 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
281 v8i16 res0_m, res1_m, res2_m, res3_m; \
283 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
284 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m); \
285 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
286 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m); \
287 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
288 DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m); \
289 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
290 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m); \
291 ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1); \
/* Horizontal 8-tap filtering of four 8-wide rows (one row per vector):
 * even tap pairs accumulate into res0..3, odd pairs into res4..7, then
 * saturating adds produce out0..out3. */
294 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
295 mask0, mask1, mask2, mask3, \
296 filt0, filt1, filt2, filt3, \
297 out0, out1, out2, out3) \
299 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
300 v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m; \
302 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
303 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
304 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
305 res0_m, res1_m, res2_m, res3_m); \
306 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
307 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
308 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
309 res4_m, res5_m, res6_m, res7_m); \
310 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
311 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
312 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
313 res0_m, res1_m, res2_m, res3_m); \
314 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
315 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
316 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
317 res4_m, res5_m, res6_m, res7_m); \
318 ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m, \
319 res7_m, out0, out1, out2, out3); \
/* 4-tap filter core: dot product on taps 0-1 followed by a dpadd on
 * taps 2-3; evaluates to the tmp0 accumulator. */
322 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \
326 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
327 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
/* Horizontal 4-tap filtering of four 4-wide rows packed two-per-vector;
 * results accumulate directly into out0/out1. */
332 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
333 mask0, mask1, filt0, filt1, \
336 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
338 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
339 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
340 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
341 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
/* Horizontal 4-tap filtering of four 8-wide rows (one row per vector);
 * results accumulate directly into out0..out3. */
344 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
345 mask0, mask1, filt0, filt1, \
346 out0, out1, out2, out3) \
348 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
350 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
351 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
352 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
353 out0, out1, out2, out3); \
354 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
355 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
356 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
357 out0, out1, out2, out3); \
/* Horizontal 8-tap filter, 4x4 block: filter, round by rnd_val
 * (SRAR), saturate to 8-bit signed range (SAT ... 7), pack with
 * xor-128 to unsigned and store 4x4. */
360 static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
361 uint8_t *dst, int32_t dst_stride,
362 const int8_t *filter, uint8_t rnd_val)
364 v16u8 mask0, mask1, mask2, mask3, out;
365 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
366 v8i16 filt, out0, out1;
369 mask0 = LD_UB(&mc_filt_mask_arr[16]);
371 rnd_vec = __msa_fill_h(rnd_val);
373 /* rearranging filter */
374 filt = LD_SH(filter);
375 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
381 LD_SB4(src, src_stride, src0, src1, src2, src3);
382 XORI_B4_128_SB(src0, src1, src2, src3);
383 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
384 mask3, filt0, filt1, filt2, filt3, out0, out1);
385 SRAR_H2_SH(out0, out1, rnd_vec);
386 SAT_SH2_SH(out0, out1, 7);
387 out = PCKEV_XORI128_UB(out0, out1);
388 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal 8-tap filter, 4x8 block: two filtered 4-row batches,
 * rounded/saturated together, stored as two 4x4 groups. */
391 static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
392 uint8_t *dst, int32_t dst_stride,
393 const int8_t *filter, uint8_t rnd_val)
395 v16i8 filt0, filt1, filt2, filt3;
396 v16i8 src0, src1, src2, src3;
397 v16u8 mask0, mask1, mask2, mask3, out;
398 v8i16 filt, out0, out1, out2, out3;
401 mask0 = LD_UB(&mc_filt_mask_arr[16]);
403 rnd_vec = __msa_fill_h(rnd_val);
405 /* rearranging filter */
406 filt = LD_SH(filter);
407 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
413 LD_SB4(src, src_stride, src0, src1, src2, src3);
414 XORI_B4_128_SB(src0, src1, src2, src3);
415 src += (4 * src_stride);
416 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
417 mask3, filt0, filt1, filt2, filt3, out0, out1);
418 LD_SB4(src, src_stride, src0, src1, src2, src3);
419 XORI_B4_128_SB(src0, src1, src2, src3);
420 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
421 mask3, filt0, filt1, filt2, filt3, out2, out3);
422 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
423 SAT_SH4_SH(out0, out1, out2, out3, 7);
424 out = PCKEV_XORI128_UB(out0, out1);
425 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
426 dst += (4 * dst_stride);
427 out = PCKEV_XORI128_UB(out2, out3);
428 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal 8-tap filter, 4x16 block: four filtered 4-row batches
 * processed as two 4x8 halves. */
431 static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
432 uint8_t *dst, int32_t dst_stride,
433 const int8_t *filter, uint8_t rnd_val)
435 v16u8 mask0, mask1, mask2, mask3, out;
436 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
437 v8i16 filt, out0, out1, out2, out3;
440 mask0 = LD_UB(&mc_filt_mask_arr[16]);
442 rnd_vec = __msa_fill_h(rnd_val);
444 /* rearranging filter */
445 filt = LD_SH(filter);
446 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* rows 0-7 */
452 LD_SB4(src, src_stride, src0, src1, src2, src3);
453 XORI_B4_128_SB(src0, src1, src2, src3);
454 src += (4 * src_stride);
455 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
456 mask3, filt0, filt1, filt2, filt3, out0, out1);
457 LD_SB4(src, src_stride, src0, src1, src2, src3);
458 XORI_B4_128_SB(src0, src1, src2, src3);
459 src += (4 * src_stride);
460 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
461 mask3, filt0, filt1, filt2, filt3, out2, out3);
462 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
463 SAT_SH4_SH(out0, out1, out2, out3, 7);
464 out = PCKEV_XORI128_UB(out0, out1);
465 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
466 dst += (4 * dst_stride);
467 out = PCKEV_XORI128_UB(out2, out3);
468 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
469 dst += (4 * dst_stride);
/* rows 8-15 */
471 LD_SB4(src, src_stride, src0, src1, src2, src3);
472 XORI_B4_128_SB(src0, src1, src2, src3);
473 src += (4 * src_stride);
474 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
475 mask3, filt0, filt1, filt2, filt3, out0, out1);
476 LD_SB4(src, src_stride, src0, src1, src2, src3);
477 XORI_B4_128_SB(src0, src1, src2, src3);
478 src += (4 * src_stride);
479 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
480 mask3, filt0, filt1, filt2, filt3, out2, out3);
482 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
483 SAT_SH4_SH(out0, out1, out2, out3, 7);
484 out = PCKEV_XORI128_UB(out0, out1);
485 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
486 dst += (4 * dst_stride);
487 out = PCKEV_XORI128_UB(out2, out3);
488 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* Dispatch horizontal 8-tap 4-wide filtering by height (4, 8 or 16). */
491 static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
492 uint8_t *dst, int32_t dst_stride,
493 const int8_t *filter, int32_t height, uint8_t rnd_val)
496 common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
497 } else if (8 == height) {
498 common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
499 } else if (16 == height) {
500 common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter,
/* Horizontal 8-tap filter, 8x4 block: filter four 8-wide rows, round,
 * saturate, pack and store as 8x4. */
505 static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
506 uint8_t *dst, int32_t dst_stride,
507 const int8_t *filter, uint8_t rnd_val)
509 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
510 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
511 v8i16 filt, out0, out1, out2, out3;
514 mask0 = LD_UB(&mc_filt_mask_arr[0]);
516 rnd_vec = __msa_fill_h(rnd_val);
518 /* rearranging filter */
519 filt = LD_SH(filter);
520 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
526 LD_SB4(src, src_stride, src0, src1, src2, src3);
527 XORI_B4_128_SB(src0, src1, src2, src3);
528 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
529 mask3, filt0, filt1, filt2, filt3, out0, out1,
531 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
532 SAT_SH4_SH(out0, out1, out2, out3, 7);
533 tmp0 = PCKEV_XORI128_UB(out0, out1);
534 tmp1 = PCKEV_XORI128_UB(out2, out3);
535 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
/* Horizontal 8-tap filter, 8-wide, for heights that are multiples of 4:
 * loops over 4-row groups filtering/rounding/storing each. */
538 static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
539 uint8_t *dst, int32_t dst_stride,
540 const int8_t *filter, int32_t height,
544 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
545 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
546 v8i16 filt, out0, out1, out2, out3;
549 mask0 = LD_UB(&mc_filt_mask_arr[0]);
551 rnd_vec = __msa_fill_h(rnd_val);
553 /* rearranging filter */
554 filt = LD_SH(filter);
555 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
561 for (loop_cnt = (height >> 2); loop_cnt--;) {
562 LD_SB4(src, src_stride, src0, src1, src2, src3);
563 XORI_B4_128_SB(src0, src1, src2, src3);
564 src += (4 * src_stride);
565 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
566 mask3, filt0, filt1, filt2, filt3, out0,
568 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
569 SAT_SH4_SH(out0, out1, out2, out3, 7);
570 tmp0 = PCKEV_XORI128_UB(out0, out1);
571 tmp1 = PCKEV_XORI128_UB(out2, out3);
572 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
573 dst += (4 * dst_stride);
/* Dispatch horizontal 8-tap 8-wide filtering: 8x4 special case or the
 * generic multiple-of-4-rows path. */
577 static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
578 uint8_t *dst, int32_t dst_stride,
579 const int8_t *filter, int32_t height,
583 common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
585 common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
/* Horizontal 8-tap filter, 12-wide: an 8-wide pass (src1_ptr/dst1,
 * masks mask00/mask1..3) followed by a 4-wide pass on columns 8..11
 * (src/dst, masks mask0/mask4..6).  Pointer setup and the mask4..6
 * loads are elided in this view. */
590 static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
591 uint8_t *dst, int32_t dst_stride,
592 const int8_t *filter, int32_t height,
595 uint8_t *src1_ptr, *dst1;
597 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
598 v8i16 filt, out0, out1, out2, out3;
599 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;
602 mask00 = LD_UB(&mc_filt_mask_arr[0]);
603 mask0 = LD_UB(&mc_filt_mask_arr[16]);
604 rnd_vec = __msa_fill_h(rnd_val);
612 /* rearranging filter */
613 filt = LD_SH(filter);
614 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
623 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* 8-wide portion */
625 LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
626 XORI_B4_128_SB(src0, src1, src2, src3);
627 src1_ptr += (4 * src_stride);
628 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2,
629 mask3, filt0, filt1, filt2, filt3, out0,
631 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
632 SAT_SH4_SH(out0, out1, out2, out3, 7);
633 tmp0 = PCKEV_XORI128_UB(out0, out1);
634 tmp1 = PCKEV_XORI128_UB(out2, out3);
635 ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
636 dst1 += (4 * dst_stride);
/* 4-wide portion */
639 LD_SB4(src, src_stride, src0, src1, src2, src3);
640 XORI_B4_128_SB(src0, src1, src2, src3);
641 src += (4 * src_stride);
642 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
643 mask6, filt0, filt1, filt2, filt3, out0,
645 SRAR_H2_SH(out0, out1, rnd_vec);
646 SAT_SH2_SH(out0, out1, 7);
647 tmp0 = PCKEV_XORI128_UB(out0, out1);
648 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
649 dst += (4 * dst_stride);
/* Horizontal 8-tap filter, 16-wide: two rows per iteration, each row
 * split into two 8-wide vector loads (offset 0 and 8).  Store lines are
 * elided in this view. */
653 static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
654 uint8_t *dst, int32_t dst_stride,
655 const int8_t *filter, int32_t height,
659 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
660 v16u8 mask0, mask1, mask2, mask3, out;
661 v8i16 filt, out0, out1, out2, out3;
664 mask0 = LD_UB(&mc_filt_mask_arr[0]);
666 rnd_vec = __msa_fill_h(rnd_val);
668 /* rearranging filter */
669 filt = LD_SH(filter);
670 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
676 for (loop_cnt = (height >> 1); loop_cnt--;) {
677 LD_SB2(src, src_stride, src0, src2);
678 LD_SB2(src + 8, src_stride, src1, src3);
679 XORI_B4_128_SB(src0, src1, src2, src3);
680 src += (2 * src_stride);
681 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
682 mask3, filt0, filt1, filt2, filt3, out0,
684 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
685 SAT_SH4_SH(out0, out1, out2, out3, 7);
686 out = PCKEV_XORI128_UB(out0, out1);
689 out = PCKEV_XORI128_UB(out2, out3);
/* Horizontal 8-tap filter, 24-wide: two rows per iteration.  The
 * 16-wide part uses per-row masks (mask0..3) while the 8-column
 * remainder packs two rows and uses the cross-vector masks mask4..7;
 * tap partial sums are merged with saturating adds before
 * rounding/saturation.  Some declarations (vec11, filt, rnd_vec,
 * out11, mask loads for mask4..7) and stores are elided in this view. */
695 static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
696 uint8_t *dst, int32_t dst_stride,
697 const int8_t *filter, int32_t height,
701 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
702 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
703 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
705 v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
709 mask0 = LD_UB(&mc_filt_mask_arr[0]);
711 rnd_vec = __msa_fill_h(rnd_val);
713 /* rearranging filter */
714 filt = LD_SH(filter);
715 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
725 for (loop_cnt = (height >> 1); loop_cnt--;) {
726 LD_SB2(src, src_stride, src0, src2);
727 LD_SB2(src + 16, src_stride, src1, src3);
728 XORI_B4_128_SB(src0, src1, src2, src3);
729 src += (2 * src_stride);
730 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
731 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
732 VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
733 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
735 DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
736 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
737 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
738 VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
739 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
741 DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7);
742 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
743 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
744 VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
745 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
746 out0, out8, out2, out9);
747 DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
748 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
749 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
750 VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
751 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
752 out4, out10, out6, out11);
753 DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7);
754 ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
756 ADDS_SH2_SH(out1, out5, out3, out7, out1, out3);
757 SRAR_H4_SH(out0, out8, out2, out9, rnd_vec);
758 SRAR_H2_SH(out1, out3, rnd_vec);
759 SAT_SH4_SH(out0, out8, out2, out9, 7);
760 SAT_SH2_SH(out1, out3, 7);
761 out = PCKEV_XORI128_UB(out8, out9);
762 ST8x2_UB(out, dst + 16, dst_stride);
763 out = PCKEV_XORI128_UB(out0, out1);
766 out = PCKEV_XORI128_UB(out2, out3);
/* Horizontal 8-tap filter, 32-wide: two rows per iteration; each row's
 * four 8-wide input vectors are built from three loads plus an sldi_b
 * shift to synthesize the overlapping middle vector.  Some loads,
 * stores and the src/dst advances are elided in this view. */
772 static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
773 uint8_t *dst, int32_t dst_stride,
774 const int8_t *filter, int32_t height,
778 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
779 v16u8 mask0, mask1, mask2, mask3, out;
780 v8i16 filt, out0, out1, out2, out3;
783 mask0 = LD_UB(&mc_filt_mask_arr[0]);
785 rnd_vec = __msa_fill_h(rnd_val);
787 /* rearranging filter */
788 filt = LD_SH(filter);
789 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
795 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* first row */
797 src2 = LD_SB(src + 16);
798 src3 = LD_SB(src + 24);
799 src1 = __msa_sldi_b(src2, src0, 8);
801 XORI_B4_128_SB(src0, src1, src2, src3);
802 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
803 mask3, filt0, filt1, filt2, filt3, out0,
805 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
806 SAT_SH4_SH(out0, out1, out2, out3, 7);
/* second row */
809 src2 = LD_SB(src + 16);
810 src3 = LD_SB(src + 24);
811 src1 = __msa_sldi_b(src2, src0, 8);
814 out = PCKEV_XORI128_UB(out0, out1);
816 out = PCKEV_XORI128_UB(out2, out3);
817 ST_UB(out, dst + 16);
820 XORI_B4_128_SB(src0, src1, src2, src3);
821 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
822 mask3, filt0, filt1, filt2, filt3, out0,
824 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
825 SAT_SH4_SH(out0, out1, out2, out3, 7);
826 out = PCKEV_XORI128_UB(out0, out1);
828 out = PCKEV_XORI128_UB(out2, out3);
829 ST_UB(out, dst + 16);
/* Horizontal 8-tap filter, 48-wide: one row per iteration, processed
 * as three 16-column thirds using VSHF_B3/DOTP_SB3/DPADD_SB2 triples;
 * the last third reloads src+40 and uses the cross-vector masks
 * mask4..7 for the seam.  Mask loads for mask1..7, some declarations
 * and the dst store/advance are elided in this view. */
834 static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
835 uint8_t *dst, int32_t dst_stride,
836 const int8_t *filter, int32_t height,
840 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
841 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
842 v8i16 filt, out0, out1, out2, out3, out4, out5, out6;
845 mask0 = LD_UB(&mc_filt_mask_arr[0]);
847 rnd_vec = __msa_fill_h(rnd_val);
849 /* rearranging filter */
850 filt = LD_SH(filter);
851 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
861 for (loop_cnt = height; loop_cnt--;) {
/* first 32 columns */
862 LD_SB3(src, 16, src0, src2, src3);
863 src1 = __msa_sldi_b(src2, src0, 8);
865 XORI_B4_128_SB(src0, src1, src2, src3);
866 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
868 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
869 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
871 DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
872 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
873 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
875 DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
876 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
878 DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
879 out5 = __msa_dpadd_s_h(out5, vec2, filt3);
880 ADDS_SH2_SH(out0, out3, out1, out4, out0, out1);
881 out2 = __msa_adds_s_h(out2, out5);
882 SRAR_H2_SH(out0, out1, rnd_vec);
883 out6 = __msa_srar_h(out2, rnd_vec);
884 SAT_SH3_SH(out0, out1, out6, 7);
885 out = PCKEV_XORI128_UB(out0, out1);
/* last 16 columns, crossing the src2/src3 seam */
888 src1 = LD_SB(src + 40);
890 src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);
892 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
894 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
895 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
897 DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
898 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
899 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
901 DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
902 VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
904 DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
905 out5 = __msa_dpadd_s_h(out5, vec2, filt3);
906 ADDS_SH2_SH(out0, out3, out1, out4, out3, out4);
907 out5 = __msa_adds_s_h(out2, out5);
908 SRAR_H3_SH(out3, out4, out5, rnd_vec);
909 SAT_SH3_SH(out3, out4, out5, 7);
910 out = PCKEV_XORI128_UB(out6, out3);
911 ST_UB(out, dst + 16);
912 out = PCKEV_XORI128_UB(out4, out5);
913 ST_UB(out, dst + 32);
/* Horizontal 8-tap filter, 64-wide: one row per iteration, processed
 * as two 32-column halves, each using three loads plus an sldi_b shift
 * for the overlapping middle vector.  Some loads/stores and pointer
 * advances are elided in this view. */
918 static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
919 uint8_t *dst, int32_t dst_stride,
920 const int8_t *filter, int32_t height,
924 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
925 v16u8 mask0, mask1, mask2, mask3, out;
926 v8i16 filt, out0, out1, out2, out3;
929 mask0 = LD_UB(&mc_filt_mask_arr[0]);
931 rnd_vec = __msa_fill_h(rnd_val);
933 /* rearranging filter */
934 filt = LD_SH(filter);
935 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
941 for (loop_cnt = height; loop_cnt--;) {
/* columns 0-31 */
943 src2 = LD_SB(src + 16);
944 src3 = LD_SB(src + 24);
945 src1 = __msa_sldi_b(src2, src0, 8);
947 XORI_B4_128_SB(src0, src1, src2, src3);
948 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
949 mask2, mask3, filt0, filt1, filt2, filt3,
950 out0, out1, out2, out3);
951 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
952 SAT_SH4_SH(out0, out1, out2, out3, 7);
953 out = PCKEV_XORI128_UB(out0, out1);
955 out = PCKEV_XORI128_UB(out2, out3);
956 ST_UB(out, dst + 16);
/* columns 32-63 */
958 src0 = LD_SB(src + 32);
959 src2 = LD_SB(src + 48);
960 src3 = LD_SB(src + 56);
961 src1 = __msa_sldi_b(src2, src0, 8);
964 XORI_B4_128_SB(src0, src1, src2, src3);
965 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
966 mask2, mask3, filt0, filt1, filt2, filt3,
967 out0, out1, out2, out3);
968 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
969 SAT_SH4_SH(out0, out1, out2, out3, 7);
970 out = PCKEV_XORI128_UB(out0, out1);
971 ST_UB(out, dst + 32);
972 out = PCKEV_XORI128_UB(out2, out3);
973 ST_UB(out, dst + 48);
/* Vertical 8-tap filter, 4-wide: interleaves pairs of rows (ILVR_B)
 * and packs two row-pairs per vector (ILVR_D) so each FILT_8TAP call
 * produces two output rows; a 7-row prologue primes the pipeline.
 * Tail updates of src2110/src4332/src6554 across iterations are elided
 * in this view. */
978 static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
979 uint8_t *dst, int32_t dst_stride,
980 const int8_t *filter, int32_t height,
984 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
985 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
986 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
987 v16i8 src10998, filt0, filt1, filt2, filt3;
989 v8i16 filt, out10, out32;
/* back up 3 rows so taps are centered on the output row */
992 src -= (3 * src_stride);
993 rnd_vec = __msa_fill_h(rnd_val);
995 filt = LD_SH(filter);
996 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
998 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
999 src += (7 * src_stride);
1001 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1003 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1004 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
1006 XORI_B3_128_SB(src2110, src4332, src6554);
1008 for (loop_cnt = (height >> 2); loop_cnt--;) {
1009 LD_SB4(src, src_stride, src7, src8, src9, src10);
1010 src += (4 * src_stride);
1012 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1013 src87_r, src98_r, src109_r);
1014 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
1015 XORI_B2_128_SB(src8776, src10998);
1016 out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
1017 filt1, filt2, filt3);
1018 out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
1019 filt1, filt2, filt3);
1020 SRAR_H2_SH(out10, out32, rnd_vec);
1021 SAT_SH2_SH(out10, out32, 7);
1022 out = PCKEV_XORI128_UB(out10, out32);
1023 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1024 dst += (4 * dst_stride);
/* Vertical 8-tap filter, 8-wide: 7-row prologue builds interleaved
 * row-pairs (src10_r..src65_r); the loop filters 4 rows per iteration
 * via FILT_8TAP_DPADD_S_H on sliding window pairs.  The end-of-loop
 * window rotation (src10_r = src54_r, etc.) is elided in this view. */
1033 static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
1034 uint8_t *dst, int32_t dst_stride,
1035 const int8_t *filter, int32_t height,
1039 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1040 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1041 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1043 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
/* back up 3 rows so taps are centered on the output row */
1046 src -= (3 * src_stride);
1047 rnd_vec = __msa_fill_h(rnd_val);
1049 filt = LD_SH(filter);
1050 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1052 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1053 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1054 src += (7 * src_stride);
1055 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1057 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1059 for (loop_cnt = (height >> 2); loop_cnt--;) {
1060 LD_SB4(src, src_stride, src7, src8, src9, src10);
1061 XORI_B4_128_SB(src7, src8, src9, src10);
1062 src += (4 * src_stride);
1064 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1065 src87_r, src98_r, src109_r);
1066 out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1067 filt1, filt2, filt3);
1068 out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1069 filt1, filt2, filt3);
1070 out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1071 filt1, filt2, filt3);
1072 out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1073 filt1, filt2, filt3);
1074 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1075 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1076 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
1077 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
1078 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1079 dst += (4 * dst_stride);
/* 8-tap vertical MC filter, 12 pixels wide (MIPS MSA).
 * Handled as an 8-wide column (tmp0/tmp1, stored via SD) plus a 4-wide
 * column at x = 8: the word-shuffle mask {2,6,2,6} gathers the extra
 4 pixels of consecutive rows into one vector (vec0..vec7).
 * Two output rows per loop iteration.
 * NOTE(review): sampled view — the rnd_val parameter, loop_cnt/rnd_vec
 * declarations, some statement tails and the SD/ST lines for the 8-wide
 * part are elided between the visible lines.
 */
1091 static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1092 uint8_t *dst, int32_t dst_stride,
1093 const int8_t *filter, int32_t height,
1097 uint32_t out2, out3;
1098 uint64_t out0, out1;
1099 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
1100 v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1101 v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
1102 v8i16 filt, filt0, filt1, filt2, filt3;
/* gathers word 2 of two rows: the rightmost 4-pixel column */
1104 v4i32 mask = { 2, 6, 2, 6 };
1106 src -= (3 * src_stride);
1107 rnd_vec = __msa_fill_h(rnd_val);
1109 /* rearranging filter_y */
1110 filt = LD_SH(filter);
1111 SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1113 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1114 src += (7 * src_stride);
1116 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* pre-gather the 4-wide right column for the setup rows */
1119 VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
1120 VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
1121 VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
1123 for (loop_cnt = (height >> 1); loop_cnt--;) {
1124 LD_SB2(src, src_stride, src7, src8);
1125 XORI_B2_128_SB(src7, src8);
1126 src += (2 * src_stride);
/* 8-wide part: two output rows */
1128 ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
1129 vec01, vec23, vec45, vec67);
1130 tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
1132 ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
1134 tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
/* 4-wide right column: both rows come out of one dot product (tmp2) */
1138 VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
1139 ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
1141 tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
1143 SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
1144 SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
1145 PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
1146 XORI_B3_128_SB(res0, res1, res2);
1148 out0 = __msa_copy_u_d((v2i64) res0, 0);
1149 out1 = __msa_copy_u_d((v2i64) res1, 0);
1150 out2 = __msa_copy_u_w((v4i32) res2, 0);
1151 out3 = __msa_copy_u_w((v4i32) res2, 1);
/* store the 4-wide column at x = 8 for each of the two rows */
1153 SW(out2, (dst + 8));
1156 SW(out3, (dst + 8));
/* 8-tap vertical MC filter, 16 pixels wide (MIPS MSA).
 * Same scheme as the 8-wide variant, but each row is filtered in two
 * halves: right-interleaved (_r) and left-interleaved (_l) byte pairs.
 * 4 output rows per loop iteration, rounded, saturated, packed and
 * stored as full 16-byte vectors.
 * NOTE(review): sampled view — rnd_val parameter, loop_cnt/rnd_vec
 * declarations and closing braces are elided between visible lines.
 */
1175 static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1176 uint8_t *dst, int32_t dst_stride,
1177 const int8_t *filter, int32_t height,
1181 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1182 v16i8 filt0, filt1, filt2, filt3;
1183 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1184 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1185 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1186 v16u8 tmp0, tmp1, tmp2, tmp3;
1187 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1190 src -= (3 * src_stride);
1191 rnd_vec = __msa_fill_h(rnd_val);
1193 filt = LD_SH(filter);
1194 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1196 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1197 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1198 src += (7 * src_stride);
/* interleave setup rows: right halves (_r) and left halves (_l) */
1199 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1201 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1202 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1204 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1206 for (loop_cnt = (height >> 2); loop_cnt--;) {
1207 LD_SB4(src, src_stride, src7, src8, src9, src10);
1208 XORI_B4_128_SB(src7, src8, src9, src10);
1209 src += (4 * src_stride);
1211 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1212 src87_r, src98_r, src109_r);
1213 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1214 src87_l, src98_l, src109_l);
/* right (low 8 pixels) halves of the 4 output rows */
1215 out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
1216 filt1, filt2, filt3);
1217 out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
1218 filt1, filt2, filt3);
1219 out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
1220 filt1, filt2, filt3);
1221 out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
1222 filt1, filt2, filt3);
/* left (high 8 pixels) halves of the 4 output rows */
1223 out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
1224 filt1, filt2, filt3);
1225 out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
1226 filt1, filt2, filt3);
1227 out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
1228 filt1, filt2, filt3);
1229 out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
1230 filt1, filt2, filt3);
1231 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1232 SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
1233 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1234 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1235 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1236 out3_r, tmp0, tmp1, tmp2, tmp3);
1237 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1238 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1239 dst += (4 * dst_stride);
/* 8-tap vertical MC filter for widths that are multiples of 16
 * (MIPS MSA). The outer loop walks 16-column stripes (width >> 4),
 * restarting the row pipeline per stripe via src_tmp/dst_tmp; the
 * inner loop is identical to common_vt_8t_16w_msa.
 * NOTE(review): sampled view — local declarations (src_tmp, dst_tmp,
 * rnd_vec), the per-stripe src/dst advance and closing braces are
 * elided between the visible lines.
 */
1257 static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
1258 uint8_t *dst, int32_t dst_stride,
1259 const int8_t *filter, int32_t height,
1260 uint8_t rnd_val, int32_t width)
1264 uint32_t loop_cnt, cnt;
1265 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1266 v16i8 filt0, filt1, filt2, filt3;
1267 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1268 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1269 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1270 v16u8 tmp0, tmp1, tmp2, tmp3;
1271 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1274 src -= (3 * src_stride);
1275 rnd_vec = __msa_fill_h(rnd_val);
1277 filt = LD_SH(filter);
1278 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* one iteration per 16-column stripe */
1280 for (cnt = (width >> 4); cnt--;) {
1284 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1285 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1286 src_tmp += (7 * src_stride);
1287 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1288 src32_r, src54_r, src21_r);
1289 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1290 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1291 src32_l, src54_l, src21_l);
1292 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1294 for (loop_cnt = (height >> 2); loop_cnt--;) {
1295 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1296 XORI_B4_128_SB(src7, src8, src9, src10);
1297 src_tmp += (4 * src_stride);
1298 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1299 src87_r, src98_r, src109_r);
1300 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1301 src87_l, src98_l, src109_l);
1302 out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
1303 filt0, filt1, filt2, filt3);
1304 out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
1305 filt0, filt1, filt2, filt3);
1306 out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
1307 filt0, filt1, filt2, filt3);
1308 out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
1309 filt0, filt1, filt2, filt3);
1310 out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
1311 filt0, filt1, filt2, filt3);
1312 out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
1313 filt0, filt1, filt2, filt3);
1314 out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
1315 filt0, filt1, filt2, filt3);
1316 out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
1317 filt0, filt1, filt2, filt3);
1318 SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
1319 SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
1320 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1321 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1322 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1323 out3_r, tmp0, tmp1, tmp2, tmp3);
1324 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1325 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1326 dst_tmp += (4 * dst_stride);
/* 24-wide vertical 8-tap: one 16-wide stripe via the _mult helper plus
 * an 8-wide pass at x = 16. (Trailing call arguments elided in this
 * sampled view.) */
1348 static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1349 uint8_t *dst, int32_t dst_stride,
1350 const int8_t *filter, int32_t height, uint8_t rnd_val)
1352 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1355 common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
/* 32-wide vertical 8-tap: delegates to the 16-wide-multiple helper.
 * (Trailing call arguments elided in this sampled view.) */
1359 static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1360 uint8_t *dst, int32_t dst_stride,
1361 const int8_t *filter, int32_t height, uint8_t rnd_val)
1363 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
/* 48-wide vertical 8-tap: delegates to the 16-wide-multiple helper.
 * (Trailing call arguments elided in this sampled view.) */
1367 static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1368 uint8_t *dst, int32_t dst_stride,
1369 const int8_t *filter, int32_t height, uint8_t rnd_val)
1371 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
/* 64-wide vertical 8-tap: delegates to the 16-wide-multiple helper.
 * (Trailing call arguments elided in this sampled view.) */
1375 static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1376 uint8_t *dst, int32_t dst_stride,
1377 const int8_t *filter, int32_t height, uint8_t rnd_val)
1379 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
/* HEVC uni-pred 8-tap horizontal+vertical (HV) filter, 4 pixels wide
 * (MIPS MSA). Stage 1 filters rows horizontally with filter_x into
 * 16-bit intermediates (two rows packed per vector, e.g. dst30 = rows
 * 3&0); stage 2 applies filter_y vertically on interleaved column
 * pairs, then rounds by 6, clips to [0,255] and stores 4x2 per loop.
 * NOTE(review): sampled view — dst/stride parameter lines, several
 * local declarations (loop_cnt, rnd/shift vectors), mask1..mask3 setup
 * and the loop's pipeline-shift statements are elided between the
 * visible lines.
 */
1383 static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
1387 const int8_t *filter_x,
1388 const int8_t *filter_y,
1392 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1393 v8i16 filt0, filt1, filt2, filt3;
1394 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1395 v16i8 mask1, mask2, mask3;
1396 v8i16 filter_vec, const_vec;
1397 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1398 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1399 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1400 v4i32 dst0_r, dst1_r;
1401 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1402 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
/* two-row horizontal gather mask: 4 taps for row a, then row b */
1403 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1404 v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* rewind 3 rows up and 3 pixels left (8-tap support in both dims) */
1406 src -= ((3 * src_stride) + 3);
1407 filter_vec = LD_SH(filter_x);
1408 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend the 8-bit y taps to 16 bits before splatting as words */
1410 filter_vec = LD_SH(filter_y);
1411 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1412 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1414 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1420 const_vec = __msa_ldi_h(128);
1423 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1424 src += (7 * src_stride);
1425 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* stage 1: horizontal filtering of the 7 setup rows, two rows/vector */
1427 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1428 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1429 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1430 vec8, vec9, vec10, vec11);
1431 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1432 vec12, vec13, vec14, vec15);
1435 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1436 dst30, dst30, dst30, dst30);
1438 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1439 dst41, dst41, dst41, dst41);
1441 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1442 dst52, dst52, dst52, dst52);
1444 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1445 dst63, dst63, dst63, dst63);
/* build the vertical column pairs from the packed row results */
1447 ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1448 dst10_r, dst21_r, dst32_r);
1449 dst43_r = __msa_ilvl_h(dst41, dst30);
1450 dst54_r = __msa_ilvl_h(dst52, dst41);
1451 dst65_r = __msa_ilvl_h(dst63, dst52);
1452 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1454 for (loop_cnt = height >> 1; loop_cnt--;) {
1455 LD_SB2(src, src_stride, src7, src8);
1456 src += 2 * src_stride;
1457 XORI_B2_128_SB(src7, src8);
1459 VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1460 vec0, vec1, vec2, vec3);
1462 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1463 dst87, dst87, dst87, dst87);
/* stage 2: vertical 8-tap on the interleaved intermediates */
1465 dst76_r = __msa_ilvr_h(dst87, dst66);
1466 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1467 filt_h0, filt_h1, filt_h2, filt_h3);
1468 dst87_r = __msa_vshf_h(mask4, dst87, dst87);
1469 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1470 filt_h0, filt_h1, filt_h2, filt_h3);
1474 SRARI_W2_SW(dst0_r, dst1_r, 6);
1475 dst0_r = CLIP_SW_0_255(dst0_r);
1476 dst1_r = CLIP_SW_0_255(dst1_r);
1478 HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
1479 ST4x2_UB(dst0_r, dst, dst_stride);
1480 dst += (2 * dst_stride);
/* carry row 8's intermediate into the next iteration's pipeline */
1488 dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
/* HEVC uni-pred 8-tap HV filter for widths that are multiples of 8
 * (MIPS MSA). Outer loop walks 8-column stripes (width >> 3); per
 * stripe: horizontal 8-tap (filter_x) over 7 setup rows into 16-bit
 * intermediates dst0..dst6, then 2 output rows per inner iteration
 * via vertical 8-tap (filter_y) on right/left interleaved halves,
 * round by 6, clip to [0,255], store 8x2.
 * NOTE(review): sampled view — dst/stride parameters, src_tmp/dst_tmp
 * declarations, mask1..mask3 setup, pipeline-shift statements at the
 * loop tail and the per-stripe src/dst advance are elided between the
 * visible lines.
 */
1492 static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
1496 const int8_t *filter_x,
1497 const int8_t *filter_y,
1498 int32_t height, int32_t width)
1500 uint32_t loop_cnt, cnt;
1503 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1504 v8i16 filt0, filt1, filt2, filt3;
1505 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1506 v16i8 mask1, mask2, mask3;
1507 v8i16 filter_vec, const_vec;
1508 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1509 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1510 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1511 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1512 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1513 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1514 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1515 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1516 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
/* rewind 3 rows up and 3 pixels left (8-tap support in both dims) */
1518 src -= ((3 * src_stride) + 3);
1519 const_vec = __msa_ldi_h(128);
1522 filter_vec = LD_SH(filter_x);
1523 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend the 8-bit y taps to 16 bits before splatting as words */
1525 filter_vec = LD_SH(filter_y);
1526 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1527 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1529 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* one iteration per 8-column stripe */
1535 for (cnt = width >> 3; cnt--;) {
1539 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1540 src_tmp += (7 * src_stride);
1541 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1543 /* row 0 row 1 row 2 row 3 */
1544 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1545 vec0, vec1, vec2, vec3);
1546 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1547 vec4, vec5, vec6, vec7);
1548 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1549 vec8, vec9, vec10, vec11);
1550 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1551 vec12, vec13, vec14, vec15);
1553 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1554 dst0, dst0, dst0, dst0);
1556 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1557 dst1, dst1, dst1, dst1);
1559 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1560 dst2, dst2, dst2, dst2);
1562 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1563 dst3, dst3, dst3, dst3);
/* rows 4..6 of the horizontal stage */
1565 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1566 vec0, vec1, vec2, vec3);
1567 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1568 vec4, vec5, vec6, vec7);
1569 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1570 vec8, vec9, vec10, vec11);
1572 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1573 dst4, dst4, dst4, dst4);
1575 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1576 dst5, dst5, dst5, dst5);
1578 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1579 dst6, dst6, dst6, dst6);
/* build vertical column pairs, right and left halves */
1581 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1582 dst10_r, dst32_r, dst54_r, dst21_r);
1583 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1584 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1585 dst10_l, dst32_l, dst54_l, dst21_l);
1586 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1588 for (loop_cnt = height >> 1; loop_cnt--;) {
1589 LD_SB2(src_tmp, src_stride, src7, src8);
1590 XORI_B2_128_SB(src7, src8);
1591 src_tmp += 2 * src_stride;
1593 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1594 vec0, vec1, vec2, vec3);
1596 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1597 dst7, dst7, dst7, dst7);
/* first output row: vertical 8-tap on rows 0..7 intermediates */
1599 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1600 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1601 filt_h0, filt_h1, filt_h2, filt_h3);
1602 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1603 filt_h0, filt_h1, filt_h2, filt_h3);
1607 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1608 vec0, vec1, vec2, vec3);
1610 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1611 dst8, dst8, dst8, dst8);
/* second output row: vertical 8-tap on rows 1..8 intermediates */
1613 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1614 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1615 filt_h0, filt_h1, filt_h2, filt_h3);
1616 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1617 filt_h0, filt_h1, filt_h2, filt_h3);
1620 SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1621 dst0_r = CLIP_SW_0_255(dst0_r);
1622 dst0_l = CLIP_SW_0_255(dst0_l);
1623 dst1_r = CLIP_SW_0_255(dst1_r);
1624 dst1_l = CLIP_SW_0_255(dst1_l);
1626 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
1627 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
1628 dst_tmp += (2 * dst_stride);
/* 8-wide HV wrapper: one 8-column stripe via the multx2 helper.
 * (dst/stride parameter lines elided in this sampled view.) */
1650 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1654 const int8_t *filter_x,
1655 const int8_t *filter_y,
1658 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1659 filter_x, filter_y, height, 8);
/* 12-wide HV: first 8 columns via the multx2 helper, remaining 4
 * columns via the dedicated 4-wide version at x = 8.
 * (dst/stride parameter lines elided in this sampled view.) */
1662 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1666 const int8_t *filter_x,
1667 const int8_t *filter_y,
1670 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1671 filter_x, filter_y, height, 8);
1673 hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
1674 filter_x, filter_y, height);
/* 16-wide HV wrapper: two 8-column stripes via the multx2 helper.
 * (dst/stride parameter lines elided in this sampled view.) */
1677 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1681 const int8_t *filter_x,
1682 const int8_t *filter_y,
1685 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1686 filter_x, filter_y, height, 16);
/* 24-wide HV wrapper: three 8-column stripes via the multx2 helper.
 * (dst/stride parameter lines elided in this sampled view.) */
1689 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1693 const int8_t *filter_x,
1694 const int8_t *filter_y,
1697 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1698 filter_x, filter_y, height, 24);
/* 32-wide HV wrapper: four 8-column stripes via the multx2 helper.
 * (dst/stride parameter lines elided in this sampled view.) */
1701 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1705 const int8_t *filter_x,
1706 const int8_t *filter_y,
1709 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1710 filter_x, filter_y, height, 32);
/* 48-wide HV wrapper: six 8-column stripes via the multx2 helper.
 * (dst/stride parameter lines elided in this sampled view.) */
1713 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1717 const int8_t *filter_x,
1718 const int8_t *filter_y,
1721 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1722 filter_x, filter_y, height, 48);
/* 64-wide HV wrapper: eight 8-column stripes via the multx2 helper.
 * (dst/stride parameter lines elided in this sampled view.) */
1725 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1729 const int8_t *filter_x,
1730 const int8_t *filter_y,
1733 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1734 filter_x, filter_y, height, 64);
/* 4-tap horizontal MC filter, 4 pixels wide x 2 rows (MIPS MSA).
 * Both rows are gathered into one vector pair via mc_filt_mask_arr[16],
 * filtered with one dot-product chain, rounded (rnd_vec), saturated,
 * packed and stored 4x2.
 * NOTE(review): sampled view — mask1 setup and the filt/res0/out/
 * rnd_vec declarations are elided between the visible lines.
 */
1737 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1738 uint8_t *dst, int32_t dst_stride,
1739 const int8_t *filter, uint8_t rnd_val)
1741 v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1746 mask0 = LD_SB(&mc_filt_mask_arr[16]);
1748 rnd_vec = __msa_fill_h(rnd_val);
1750 /* rearranging filter */
1751 filt = LD_SH(filter);
1752 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1756 LD_SB2(src, src_stride, src0, src1);
1757 XORI_B2_128_SB(src0, src1);
1758 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1759 res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
1760 res0 = __msa_srar_h(res0, rnd_vec);
1761 res0 = __msa_sat_s_h(res0, 7);
1762 out = PCKEV_XORI128_UB(res0, res0);
1763 ST4x2_UB(out, dst, dst_stride);
/* 4-tap horizontal MC filter, 4 pixels wide x 4 rows (MIPS MSA).
 * Four rows are filtered with the 4-wide/4-vector helper macro, then
 * rounded, saturated, packed and stored as 4x4.
 * NOTE(review): sampled view — mask1 setup and the out/rnd_vec
 * declarations are elided between the visible lines.
 */
1766 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1767 uint8_t *dst, int32_t dst_stride,
1768 const int8_t *filter, uint8_t rnd_val)
1770 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1771 v8i16 filt, out0, out1;
1775 mask0 = LD_SB(&mc_filt_mask_arr[16]);
1777 rnd_vec = __msa_fill_h(rnd_val);
1779 /* rearranging filter */
1780 filt = LD_SH(filter);
1781 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1785 LD_SB4(src, src_stride, src0, src1, src2, src3);
1786 XORI_B4_128_SB(src0, src1, src2, src3);
1787 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1788 filt0, filt1, out0, out1);
1789 SRAR_H2_SH(out0, out1, rnd_vec);
1790 SAT_SH2_SH(out0, out1, 7);
1791 out = PCKEV_XORI128_UB(out0, out1);
1792 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* 4-tap horizontal MC filter, 4 pixels wide x 8 rows (MIPS MSA).
 * Two 4-row batches through the 4-wide helper macro; rounding and
 * saturation are done once over all four result vectors, then the two
 * batches are stored as 4x4 each.
 * NOTE(review): sampled view — mask1 setup and the out/rnd_vec
 * declarations are elided between the visible lines.
 */
1795 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1796 uint8_t *dst, int32_t dst_stride,
1797 const int8_t *filter, uint8_t rnd_val)
1799 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1801 v8i16 filt, out0, out1, out2, out3;
1804 mask0 = LD_SB(&mc_filt_mask_arr[16]);
1806 rnd_vec = __msa_fill_h(rnd_val);
1808 /* rearranging filter */
1809 filt = LD_SH(filter);
1810 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1814 LD_SB4(src, src_stride, src0, src1, src2, src3);
1815 src += (4 * src_stride);
1817 XORI_B4_128_SB(src0, src1, src2, src3);
1818 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1819 filt0, filt1, out0, out1);
/* second batch of 4 rows */
1820 LD_SB4(src, src_stride, src0, src1, src2, src3);
1821 XORI_B4_128_SB(src0, src1, src2, src3);
1822 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1823 filt0, filt1, out2, out3);
1824 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1825 SAT_SH4_SH(out0, out1, out2, out3, 7);
1826 out = PCKEV_XORI128_UB(out0, out1);
1827 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1828 dst += (4 * dst_stride);
1829 out = PCKEV_XORI128_UB(out2, out3);
1830 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* 4-tap horizontal MC filter, 4 pixels wide x 16 rows (MIPS MSA).
 * Two unrolled passes of 8 rows each; within a pass, 8 rows are loaded
 * at once and filtered as two 4-row batches, then rounded, saturated
 * and stored as two 4x4 groups.
 * NOTE(review): sampled view — mask1 setup and the out/rnd_vec
 * declarations are elided between the visible lines.
 */
1833 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
1834 uint8_t *dst, int32_t dst_stride,
1835 const int8_t *filter, uint8_t rnd_val)
1837 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1838 v16i8 filt0, filt1, mask0, mask1;
1840 v8i16 filt, out0, out1, out2, out3;
1843 mask0 = LD_SB(&mc_filt_mask_arr[16]);
1845 rnd_vec = __msa_fill_h(rnd_val);
1847 /* rearranging filter */
1848 filt = LD_SH(filter);
1849 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
/* rows 0..7 */
1853 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1854 src += (8 * src_stride);
1855 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1856 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1857 filt0, filt1, out0, out1);
1858 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
1859 filt0, filt1, out2, out3);
1860 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1861 SAT_SH4_SH(out0, out1, out2, out3, 7);
1862 out = PCKEV_XORI128_UB(out0, out1);
1863 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1864 dst += (4 * dst_stride);
1865 out = PCKEV_XORI128_UB(out2, out3);
1866 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1867 dst += (4 * dst_stride);
/* rows 8..15 (same sequence repeated) */
1869 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1870 src += (8 * src_stride);
1871 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1872 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1873 filt0, filt1, out0, out1);
1874 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
1875 filt0, filt1, out2, out3);
1876 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1877 SAT_SH4_SH(out0, out1, out2, out3, 7);
1878 out = PCKEV_XORI128_UB(out0, out1);
1879 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1880 dst += (4 * dst_stride);
1881 out = PCKEV_XORI128_UB(out2, out3);
1882 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* 4-wide horizontal 4-tap dispatcher: selects the unrolled variant for
 * height 2/4/8/16. (The leading `if (2 == height)` line and the final
 * call's argument tail are elided in this sampled view.) */
1885 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
1886 uint8_t *dst, int32_t dst_stride,
1887 const int8_t *filter, int32_t height,
1891 common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
1892 } else if (4 == height) {
1893 common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
1894 } else if (8 == height) {
1895 common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
1896 } else if (16 == height) {
1897 common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter,
/* 4-tap horizontal MC filter, 6 pixels wide (MIPS MSA).
 * Filters 4 rows per iteration with the 8-wide helper macro, then
 * stores only the leftmost 6 bytes per row via ST6x4_UB.
 * NOTE(review): sampled view — rnd_val parameter, mask1 setup and the
 * loop_cnt/out4/out5/rnd_vec declarations are elided between lines.
 */
1902 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
1903 uint8_t *dst, int32_t dst_stride,
1904 const int8_t *filter, int32_t height,
1908 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1910 v8i16 filt, out0, out1, out2, out3;
1913 mask0 = LD_SB(&mc_filt_mask_arr[0]);
1915 rnd_vec = __msa_fill_h(rnd_val);
1917 /* rearranging filter */
1918 filt = LD_SH(filter);
1919 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1923 for (loop_cnt = (height >> 2); loop_cnt--;) {
1924 LD_SB4(src, src_stride, src0, src1, src2, src3);
1925 src += (4 * src_stride);
1927 XORI_B4_128_SB(src0, src1, src2, src3);
1928 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
1929 filt1, out0, out1, out2, out3);
1930 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
1931 SAT_SH4_SH(out0, out1, out2, out3, 7);
1933 out4 = PCKEV_XORI128_UB(out0, out1);
1934 out5 = PCKEV_XORI128_UB(out2, out3);
/* 6-wide store: only 6 of the 8 filtered bytes per row are written */
1935 ST6x4_UB(out4, out5, dst, dst_stride);
1936 dst += (4 * dst_stride);
/* 4-tap horizontal MC filter, 8 pixels wide, 2 rows per iteration
 * (MIPS MSA). Per pair of rows: filt0 dot-product via mask0, then the
 * filt1 contribution accumulated via mask1, rounded, saturated,
 * packed, stored 8x2.
 * NOTE(review): sampled view — rnd_val parameter, mask1 setup and the
 * loop_cnt/out/rnd_vec declarations are elided between lines.
 */
1940 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
1941 uint8_t *dst, int32_t dst_stride,
1942 const int8_t *filter, int32_t height,
1946 v16i8 src0, src1, filt0, filt1, mask0, mask1;
1948 v8i16 filt, vec0, vec1, vec2, vec3;
1951 mask0 = LD_SB(&mc_filt_mask_arr[0]);
1953 rnd_vec = __msa_fill_h(rnd_val);
1955 filt = LD_SH(filter);
1956 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1960 for (loop_cnt = (height >> 1); loop_cnt--;) {
1961 LD_SB2(src, src_stride, src0, src1);
1962 src += (2 * src_stride);
1964 XORI_B2_128_SB(src0, src1);
1965 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1966 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
1967 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
1968 DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
1969 SRAR_H2_SH(vec0, vec1, rnd_vec);
1970 SAT_SH2_SH(vec0, vec1, 7);
1971 out = PCKEV_XORI128_UB(vec0, vec1);
1972 ST8x2_UB(out, dst, dst_stride);
1973 dst += (2 * dst_stride);
/* 4-tap horizontal MC filter, 8 pixels wide, 4 rows per iteration
 * (MIPS MSA). Uses the 8-wide/4-vector helper macro, then rounds,
 * saturates, packs and stores 8x4.
 * NOTE(review): sampled view — rnd_val parameter, mask1 setup and the
 * loop_cnt/tmp0/tmp1/rnd_vec declarations are elided between lines.
 */
1977 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
1978 uint8_t *dst, int32_t dst_stride,
1979 const int8_t *filter, int32_t height,
1983 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1985 v8i16 filt, out0, out1, out2, out3;
1988 mask0 = LD_SB(&mc_filt_mask_arr[0]);
1990 rnd_vec = __msa_fill_h(rnd_val);
1992 /* rearranging filter */
1993 filt = LD_SH(filter);
1994 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1998 for (loop_cnt = (height >> 2); loop_cnt--;) {
1999 LD_SB4(src, src_stride, src0, src1, src2, src3);
2000 src += (4 * src_stride);
2002 XORI_B4_128_SB(src0, src1, src2, src3);
2003 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2004 filt1, out0, out1, out2, out3);
2005 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2006 SAT_SH4_SH(out0, out1, out2, out3, 7);
2007 tmp0 = PCKEV_XORI128_UB(out0, out1);
2008 tmp1 = PCKEV_XORI128_UB(out2, out3);
2009 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2010 dst += (4 * dst_stride);
/* 8-wide horizontal 4-tap dispatcher: heights 2 and 6 use the 2-row
 * loop, everything else the 4-row loop. (The else branch line and the
 * calls' trailing arguments are elided in this sampled view.) */
2014 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2015 uint8_t *dst, int32_t dst_stride,
2016 const int8_t *filter, int32_t height,
2019 if ((2 == height) || (6 == height)) {
2020 common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2023 common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
/* 4-tap horizontal MC filter, 12 pixels wide (MIPS MSA).
 * Per 4 rows: the 8-wide left part is filtered into out2..out5 using
 * mask0/mask1, while the 4-wide right column (x = 8) is gathered with
 * mask2/mask3 from mc_filt_mask_arr[32] into out0/out1; the 8-wide
 * part is stored 8x4 and the column 4x4 at dst + 8.
 * NOTE(review): sampled view — rnd_val parameter, mask1/mask3 setup
 * and the loop_cnt/tmp0/tmp1/vec10/vec11/rnd_vec declarations are
 * elided between the visible lines.
 */
2028 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2029 uint8_t *dst, int32_t dst_stride,
2030 const int8_t *filter, int32_t height,
2034 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2035 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2038 v8i16 filt, out0, out1, out2, out3, out4, out5;
2041 mask0 = LD_SB(&mc_filt_mask_arr[0]);
2042 mask2 = LD_SB(&mc_filt_mask_arr[32]);
2046 /* rearranging filter */
2047 filt = LD_SH(filter);
2048 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2053 rnd_vec = __msa_fill_h(rnd_val);
2055 for (loop_cnt = (height >> 2); loop_cnt--;) {
2056 LD_SB4(src, src_stride, src0, src1, src2, src3);
2057 src += (4 * src_stride);
2059 XORI_B4_128_SB(src0, src1, src2, src3);
/* filt0 contribution: 8-wide part (vec4..7) and 4-wide column (vec0/1) */
2060 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2061 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2062 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2063 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2064 out2, out3, out4, out5);
2065 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
/* filt1 contribution accumulated on top */
2066 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2067 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2068 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2069 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2070 out2, out3, out4, out5);
2071 DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2072 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2073 SRAR_H2_SH(out4, out5, rnd_vec);
2074 SAT_SH4_SH(out0, out1, out2, out3, 7);
2075 SAT_SH2_SH(out4, out5, 7);
2076 tmp0 = PCKEV_XORI128_UB(out2, out3);
2077 tmp1 = PCKEV_XORI128_UB(out4, out5);
2078 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
/* 4-wide column stored at x = 8 */
2079 tmp0 = PCKEV_XORI128_UB(out0, out1);
2080 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2081 dst += (4 * dst_stride);
/* 4-tap horizontal MC filter, 16 pixels wide (MIPS MSA).
 * Each of the 4 rows per iteration is loaded as two overlapping 8-wide
 * halves (src at x=0 and x=8), run through the 8-wide helper macro,
 * then rounded, saturated, packed and stored.
 * NOTE(review): sampled view — rnd_val parameter, mask1 setup, the
 * loop_cnt/out/rnd_vec declarations and the per-row ST_UB/dst-advance
 * statements are elided between the visible lines.
 */
2085 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2086 uint8_t *dst, int32_t dst_stride,
2087 const int8_t *filter, int32_t height,
2091 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2092 v16i8 filt0, filt1, mask0, mask1;
2093 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2097 mask0 = LD_SB(&mc_filt_mask_arr[0]);
2099 rnd_vec = __msa_fill_h(rnd_val);
2101 /* rearranging filter */
2102 filt = LD_SH(filter);
2103 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2107 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* even vectors = left half of each row, odd = right half (x+8) */
2108 LD_SB4(src, src_stride, src0, src2, src4, src6);
2109 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2110 src += (4 * src_stride);
2112 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2113 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2114 filt1, out0, out1, out2, out3);
2115 HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
2116 filt1, out4, out5, out6, out7);
2117 SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
2118 SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
2119 SAT_SH4_SH(out0, out1, out2, out3, 7);
2120 SAT_SH4_SH(out4, out5, out6, out7, 7);
2121 out = PCKEV_XORI128_UB(out0, out1);
2124 out = PCKEV_XORI128_UB(out2, out3);
2127 out = PCKEV_XORI128_UB(out4, out5);
2130 out = PCKEV_XORI128_UB(out6, out7);
/* Horizontal 4-tap filter for 24-pixel-wide blocks, 4 rows per iteration.
 * The first 16 columns are produced with cross-register shuffle masks
 * (mask00/mask11 reach into the adjacent 16-byte source register); the
 * remaining 8 columns are written through dst1 = dst + 16. */
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
    uint8_t *dst1 = dst + 16;     /* output pointer for columns 16..23 */
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);

    rnd_vec = __msa_fill_h(rnd_val);

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* mask11: second-tap shuffle spanning the src0/src1 register boundary */
    mask11 = mask0 + 10;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* bytes 0..15 of 4 rows in even regs, bytes 16..31 in odd regs */
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        /* rows 0 and 1, columns 0..15 */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp0 = PCKEV_XORI128_UB(out2, out3);

        /* rows 2 and 3, columns 0..15 */
        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp0 = PCKEV_XORI128_UB(out2, out3);

        /* all 4 rows, trailing 8 columns (16..23) from the odd registers */
        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);

        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);

        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
        dst1 += (4 * dst_stride);
/* Horizontal 4-tap filter for 32-pixel-wide blocks, 2 rows per iteration.
 * Per row, three 16-byte loads cover the 32 columns plus the filter
 * overlap; SLDI_B2_SB builds the shifted middle vectors so each
 * HORIZ_4TAP_8WID_4VECS_FILT call produces a 16-wide half. */
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);

    rnd_vec = __msa_fill_h(rnd_val);

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* row 0: middle and right 16-byte chunks */
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        /* row 1: middle and right 16-byte chunks */
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        /* derive the byte-8-shifted vectors bridging the chunk boundary */
        SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   filt0, filt1, out0, out1, out2, out3);
        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                                   filt0, filt1, out4, out5, out6, out7);
        /* round, saturate and store both rows, 16 bytes per store */
        SRAR_H4_SH(out0, out1, out2, out3, rnd_vec);
        SRAR_H4_SH(out4, out5, out6, out7, rnd_vec);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out4, out5);
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst + 16);
/* Vertical 4-tap filter for a 4-wide, 2-row block (no loop).
 * Two 4-wide rows are packed per 16-byte vector: adjacent rows are
 * interleaved (ILVR) then two interleaved pairs are fused with ilvr_d,
 * so one FILT_4TAP_DPADD_S_H produces both output rows. */
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, uint8_t rnd_val)
    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332, filt0, filt1;

    rnd_vec = __msa_fill_h(rnd_val);

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 3 leading rows establish the filter history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);   /* to signed */
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);   /* to signed */
    out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
    /* round, saturate, pack and store the two 4-byte rows */
    out10 = __msa_srar_h(out10, rnd_vec);
    out10 = __msa_sat_s_h(out10, 7);
    out = PCKEV_XORI128_UB(out10, out10);
    ST4x2_UB(out, dst, dst_stride);
/* Vertical 4-tap filter for 4-wide blocks, height a multiple of 4.
 * Same double-row packing as the 4x2 variant (ilvr_b + ilvr_d), producing
 * 4 output rows per iteration; the last interleaved pair is carried over
 * in src2110 as history for the next iteration. */
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride,
                                         const int8_t *filter, int32_t height,
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;

    rnd_vec = __msa_fill_h(rnd_val);

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 3 leading rows establish the filter history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);   /* to signed */

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        /* rows 0 and 1 of this iteration */
        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);

        src += (src_stride);
        /* src2 here holds the freshly loaded 4th row (load elided above) */
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        /* rows 2 and 3 of this iteration */
        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
        SRAR_H2_SH(out10, out32, rnd_vec);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
/* Dispatcher for vertical 4-tap, 4-wide filtering: the 2-row case takes
 * the loop-free path, everything else the multiple-of-4-rows loop. */
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height,
    common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
    common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
/* Vertical 4-tap filter for 6-wide blocks, 4 rows per iteration.
 * Keeps three sign-biased rows (vec0..vec2) as rolling history; each new
 * row is biased, interleaved with its predecessor and fed to
 * FILT_4TAP_DPADD_S_H. ST6x4_UB stores only 6 of the 8 packed bytes. */
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height,
    v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
    v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, filt0, filt1;

    rnd_vec = __msa_fill_h(rnd_val);

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    /* 3 leading rows establish the filter history */
    LD_UB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    /* bias into signed range once; reused across iterations */
    vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
    vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
    vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* note rotated register roles: src3 then src0/src1/src2 */
        LD_UB4(src, src_stride, src3, src0, src1, src2);
        src += (4 * src_stride);

        /* output row 0 */
        vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
        ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
        tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);

        /* output row 1 */
        vec0 = __msa_xori_b((v16u8) src0, 128);
        ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
        tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);

        /* output row 2 */
        vec1 = __msa_xori_b((v16u8) src1, 128);
        vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);

        /* output row 3 */
        vec2 = __msa_xori_b((v16u8) src2, 128);
        vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
        tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);

        /* round, saturate, pack and store 6 bytes x 4 rows */
        SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
/* Vertical 4-tap filter for an 8-wide, 2-row block (no loop).
 * Loads the 5 rows needed for 2 outputs, interleaves row pairs and
 * applies the two filter tap pairs via FILT_4TAP_DPADD_S_H. */
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, uint8_t rnd_val)
    v16i8 src0, src1, src2, src3, src4;
    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;

    rnd_vec = __msa_fill_h(rnd_val);

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);   /* bias to signed */
    /* output row 0: taps on rows 0..3 */
    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
    tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1);
    /* output row 1: taps on rows 1..4 */
    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
    tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1);
    SRAR_H2_SH(tmp0, tmp1, rnd_vec);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST8x2_UB(out, dst, dst_stride);
/* Vertical 4-tap filter for an 8-wide, 6-row block: two passes of
 * 3 rows each. Packed results are extracted to 64-bit scalars
 * (out0..out2) for the stores. */
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, uint8_t rnd_val)
    uint64_t out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1;

    rnd_vec = __msa_fill_h(rnd_val);

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    /* 3 leading rows establish the filter history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);

    for (loop_cnt = 2; loop_cnt--;) {   /* 2 passes x 3 rows = 6 rows */
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);

        XORI_B3_128_SB(src3, src4, src5);
        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
        SRAR_H3_SH(tmp0, tmp1, tmp2, rnd_vec);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
        XORI_B2_128_SH(tmp0, tmp2);   /* undo the signed bias */

        /* extract the three packed 8-byte rows for scalar stores */
        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
/* Vertical 4-tap filter for 8-wide blocks, height a multiple of 4.
 * src10_r/src21_r carry the interleaved history between iterations;
 * 4 output rows are produced per pass and stored with ST8x4_UB. */
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height,
    v16i8 src0, src1, src2, src7, src8, src9, src10;
    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    rnd_vec = __msa_fill_h(rnd_val);

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 3 leading rows establish the filter history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
                   src72_r, src87_r, src98_r, src109_r);
        /* each output row combines two interleaved row-pairs */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
/* Dispatcher for vertical 4-tap, 8-wide filtering: special-cases
 * heights 2 and 6, otherwise uses the multiple-of-4-rows loop. */
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height,
    common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
    } else if (6 == height) {
    common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter, rnd_val);
    common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
                             filter, height, rnd_val);
/* Vertical 4-tap filter for 12-wide blocks, 4 rows per iteration.
 * Columns 0..7 use the plain interleave path (tmp0..tmp3); columns 8..11
 * are gathered from two rows at a time via VSHF_W with word mask
 * { 2, 6, 2, 6 } (tmp4/tmp5), then stored through dst + 8. */
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
    v4u32 mask = { 2, 6, 2, 6 };   /* picks word 2 (cols 8..11) of each row */

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    rnd_vec = __msa_fill_h(rnd_val);

    /* 3 leading rows establish the filter history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
        /* right-column pairs for rows 2..6 */
        VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
        VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
        tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
        ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
                   src21, src43, src54, src65);
        tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
        /* columns 8..11, two rows per vector */
        ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
        tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
        tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
        SRAR_H4_SH(tmp0, tmp1, tmp2, tmp3, rnd_vec);
        SRAR_H2_SH(tmp4, tmp5, rnd_vec);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH2_SH(tmp4, tmp5, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST8x4_UB(out0, out1, dst, dst_stride);       /* columns 0..7 */
        out0 = PCKEV_XORI128_UB(tmp4, tmp5);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride); /* cols 8..11 */
        dst += (4 * dst_stride);
/* Vertical 4-tap filter for 16-wide blocks, 4 rows per iteration.
 * Each 16-wide row needs both the low (ILVR) and high (ILVL) byte
 * interleaves; _r results cover columns 0..7, _l results columns 8..15. */
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    rnd_vec = __msa_fill_h(rnd_val);

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 3 leading rows establish the filter history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        /* low half (columns 0..7), 4 output rows */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
        /* high half (columns 8..15), 4 output rows */
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
        SRAR_H4_SH(out0_l, out1_l, out2_l, out3_l, rnd_vec);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        /* merge halves, undo bias, store 4 full 16-byte rows */
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);
/* Vertical 4-tap filter for 24-wide blocks, 4 rows per iteration split
 * into two 2-row halves. Columns 0..15 use low+high interleaves
 * (out*_r/out*_l); columns 16..23 use a second 3-row history (src6..src8)
 * and are stored as 8-byte chunks at dst + 16. Register roles rotate
 * between the two 2-row halves of each iteration. */
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, filt0, filt1;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    rnd_vec = __msa_fill_h(rnd_val);

    /* history for columns 0..15 */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* history for columns 16..23 */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* first 2 rows, columns 0..15 */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* first 2 rows, columns 16..23 */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);

        out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);

        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
        SRAR_H2_SH(out0_l, out1_l, rnd_vec);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        /* pack the 8-wide right-column results to scalar stores */
        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
        XORI_B2_128_SH(out2_r, out3_r);
        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
        out = PCKEV_XORI128_UB(out1_r, out1_l);

        /* second 2 rows, columns 0..15 (history regs rotated) */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        /* second 2 rows, columns 16..23 */
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);

        out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);

        SRAR_H4_SH(out0_r, out1_r, out2_r, out3_r, rnd_vec);
        SRAR_H2_SH(out0_l, out1_l, rnd_vec);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        out = PCKEV_XORI128_UB(out2_r, out2_r);
        ST8x1_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        out = PCKEV_XORI128_UB(out3_r, out3_r);
        ST8x1_UB(out, dst + 16);
/* Vertical 4-tap filter for width a multiple of 32: the outer loop walks
 * 32-column strips (cnt = width >> 5), the inner loop produces 2 rows per
 * pass. Within a strip, columns 0..15 and 16..31 each keep their own
 * low/high interleaved 3-row history. */
static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      uint8_t rnd_val, int32_t width)
    uint32_t loop_cnt, cnt;
    uint8_t *dst_tmp, *src_tmp;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;

    rnd_vec = __msa_fill_h(rnd_val);

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    for (cnt = (width >> 5); cnt--;) {   /* one 32-column strip per pass */
        /* history for strip columns 0..15 */
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        XORI_B3_128_SB(src0, src1, src2);

        ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
        ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

        /* history for strip columns 16..31 */
        LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src6, src7, src8);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            /* 2 new rows, columns 0..15 */
            LD_SB2(src_tmp, src_stride, src3, src4);
            XORI_B2_128_SB(src3, src4);
            ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
            ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

            out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
            out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
            out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
            out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);

            SRAR_H4_SH(out0_r, out1_r, out0_l, out1_l, rnd_vec);
            SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
            out = PCKEV_XORI128_UB(out0_r, out0_l);
            ST_UB(out, dst_tmp);
            out = PCKEV_XORI128_UB(out1_r, out1_l);
            ST_UB(out, dst_tmp + dst_stride);

            /* 2 new rows, columns 16..31 */
            LD_SB2(src_tmp + 16, src_stride, src9, src10);
            src_tmp += (2 * src_stride);
            XORI_B2_128_SB(src9, src10);
            ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
            ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

            out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
            out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
            out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
            out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);

            SRAR_H4_SH(out2_r, out3_r, out2_l, out3_l, rnd_vec);
            SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
            out = PCKEV_XORI128_UB(out2_r, out2_l);
            ST_UB(out, dst_tmp + 16);
            out = PCKEV_XORI128_UB(out3_r, out3_l);
            ST_UB(out, dst_tmp + 16 + dst_stride);

            dst_tmp += 2 * dst_stride;
/* Vertical 4-tap filter, fixed 32-pixel width: thin wrapper around the
 * generic multiple-of-32 implementation with width = 32. */
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height,
    common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
                              filter, height, rnd_val, 32);
/* 2-D (horizontal then vertical) 4-tap filter, 4-wide x 2 rows.
 * Horizontal pass: VSHF byte shuffles + signed dot-products into 16-bit
 * intermediates (dst0..dst4). Vertical pass: HEVC_FILT_4TAP on
 * halfword-interleaved columns. Final srari by 6, clip to [0,255],
 * pack and store 4x2 bytes. src is rewound by one row and one column
 * to center the 4-tap window. */
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);   /* center the 4-tap window */

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical filter taps to halfwords */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);

    /* 3 leading rows establish the vertical history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    /* horizontal pass for rows 0..2 */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    /* row 3: horizontal then vertical -> output row 0 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);

    /* row 4: horizontal then vertical -> output row 1 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);

    /* round >>6, clip to pixel range, pack and store */
    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);

    ST4x2_UB(dst0_r, dst, dst_stride);
/* 2-D (horizontal then vertical) 4-tap filter, 4-wide x 4 rows.
 * Same structure as the 4x2 variant, extended to rows 3..6 and four
 * vertical outputs (dst0_r..dst3_r); final srari by 6, clip to [0,255],
 * pack and store 4x4 bytes. */
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 out0_r, out1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);   /* center the 4-tap window */

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical filter taps to halfwords */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);

    /* 3 leading rows establish the vertical history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    /* horizontal pass for rows 0..2 */
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    XORI_B4_128_SB(src3, src4, src5, src6);

    /* row 3 -> output row 0 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);

    /* row 4 -> output row 1 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);

    /* row 5 -> output row 2 (dst10_r recycled as the 5|4 interleave) */
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    dst10_r = __msa_ilvr_h(dst5, dst4);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);

    /* row 6 -> output row 3 (dst2 recycled for the new row) */
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

    dst21_r = __msa_ilvr_h(dst2, dst5);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);

    /* round >>6, clip to pixel range, pack and store 4x4 */
    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    CLIP_SH2_0_255(out0_r, out1_r);
    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);

    ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
/* 2-D (horizontal then vertical) 4-tap filter, 4-wide, height a multiple
 * of 8. Each loop iteration loads 8 new rows, runs the horizontal pass
 * per row and chains the vertical 4-tap across the 16-bit intermediates;
 * dst10_r/dst21_r and dst2 carry the history into the next iteration. */
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);   /* center the 4-tap window */

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical filter taps to halfwords */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    const_vec = __msa_ldi_h(128);

    /* 3 leading rows establish the vertical history */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        /* one horizontal pass + one vertical tap-chain per new row */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        dst32_r = __msa_ilvr_h(dst3, dst2);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

        dst43_r = __msa_ilvr_h(dst4, dst3);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        dst54_r = __msa_ilvr_h(dst5, dst4);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

        dst65_r = __msa_ilvr_h(dst6, dst5);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        dst76_r = __msa_ilvr_h(dst7, dst6);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);

        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);

        dst87_r = __msa_ilvr_h(dst8, dst7);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);

        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);

        /* dst10_r recycled to hold the 9|8 interleave for next history */
        dst10_r = __msa_ilvr_h(dst9, dst8);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);

        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        /* dst2 recycled: becomes history row for the next iteration */
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        dst21_r = __msa_ilvr_h(dst2, dst9);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);

        /* round >>6, clip, pack and store 8 rows of 4 bytes */
        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    out0_r, out1_r, out2_r, out3_r);

        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
        ST4x8_UB(out0_r, out1_r, dst, dst_stride);
        dst += (8 * dst_stride);
3256 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3260 const int8_t *filter_x,
3261 const int8_t *filter_y,
3265 hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3266 filter_x, filter_y, height);
3267 } else if (4 == height) {
3268 hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3269 filter_x, filter_y, height);
3270 } else if (0 == (height % 8)) {
3271 hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3272 filter_x, filter_y, height);
3276 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3280 const int8_t *filter_x,
3281 const int8_t *filter_y,
3285 v16i8 src0, src1, src2, src3, src4, src5, src6;
3287 v4i32 filt_h0, filt_h1;
3288 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3290 v8i16 filter_vec, const_vec;
3291 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3292 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3293 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3294 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3295 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3296 v8i16 out0_r, out1_r, out2_r, out3_r;
3298 src -= (src_stride + 1);
3300 filter_vec = LD_SH(filter_x);
3301 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3303 filter_vec = LD_SH(filter_y);
3304 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3305 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3307 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3311 const_vec = __msa_ldi_h(128);
3314 LD_SB3(src, src_stride, src0, src1, src2);
3315 src += (3 * src_stride);
3317 XORI_B3_128_SB(src0, src1, src2);
3319 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3320 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3321 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3324 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3326 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3328 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3330 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3331 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3333 for (loop_cnt = height >> 2; loop_cnt--;) {
3334 LD_SB4(src, src_stride, src3, src4, src5, src6);
3335 src += (4 * src_stride);
3337 XORI_B4_128_SB(src3, src4, src5, src6);
3340 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3342 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3344 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3345 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3346 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3351 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3353 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3355 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3356 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3357 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3362 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3364 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3366 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3367 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3368 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3374 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3376 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3378 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3379 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3380 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3385 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3386 dst2_l, dst2_r, dst3_l, dst3_r,
3387 out0_r, out1_r, out2_r, out3_r);
3389 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3390 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3392 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3393 ST6x4_UB(out0_r, out1_r, dst, dst_stride);
3394 dst += (4 * dst_stride);
3398 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3402 const int8_t *filter_x,
3403 const int8_t *filter_y,
3406 v16i8 src0, src1, src2, src3, src4;
3408 v4i32 filt_h0, filt_h1;
3409 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3411 v8i16 filter_vec, const_vec;
3412 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3413 v8i16 dst0, dst1, dst2, dst3, dst4;
3414 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3415 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3416 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3417 v8i16 out0_r, out1_r;
3419 src -= (src_stride + 1);
3421 filter_vec = LD_SH(filter_x);
3422 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3424 filter_vec = LD_SH(filter_y);
3425 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3426 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3428 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3432 const_vec = __msa_ldi_h(128);
3435 LD_SB3(src, src_stride, src0, src1, src2);
3436 src += (3 * src_stride);
3438 XORI_B3_128_SB(src0, src1, src2);
3440 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3441 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3442 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3445 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3447 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3449 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3451 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3452 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3454 LD_SB2(src, src_stride, src3, src4);
3455 XORI_B2_128_SB(src3, src4);
3458 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3460 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3462 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3463 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3464 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3469 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3471 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3473 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3474 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3475 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3479 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3480 SRARI_H2_SH(out0_r, out1_r, 6);
3481 CLIP_SH2_0_255(out0_r, out1_r);
3482 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3484 ST8x2_UB(out0_r, dst, dst_stride);
3487 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3491 const int8_t *filter_x,
3492 const int8_t *filter_y,
3495 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3497 v4i32 filt_h0, filt_h1;
3498 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3500 v8i16 filter_vec, const_vec;
3501 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3502 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3503 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3504 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3505 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3506 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3507 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3508 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3509 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3511 src -= (src_stride + 1);
3513 filter_vec = LD_SH(filter_x);
3514 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3516 filter_vec = LD_SH(filter_y);
3517 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3518 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3520 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3524 const_vec = __msa_ldi_h(128);
3527 LD_SB3(src, src_stride, src0, src1, src2);
3528 src += (3 * src_stride);
3530 XORI_B3_128_SB(src0, src1, src2);
3532 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3533 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3534 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3537 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3539 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3541 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3543 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3544 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3546 LD_SB2(src, src_stride, src3, src4);
3547 src += (2 * src_stride);
3549 XORI_B2_128_SB(src3, src4);
3552 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3554 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3556 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3557 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3558 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3564 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3566 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3568 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3569 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3570 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3574 LD_SB2(src, src_stride, src5, src6);
3575 src += (2 * src_stride);
3577 XORI_B2_128_SB(src5, src6);
3580 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3582 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3584 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3585 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3586 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3591 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3593 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3595 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3596 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3597 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3601 LD_SB2(src, src_stride, src7, src8);
3602 src += (2 * src_stride);
3604 XORI_B2_128_SB(src7, src8);
3607 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3609 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3611 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3612 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3613 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3619 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3621 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3623 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3624 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3625 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3629 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3630 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3631 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3632 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3633 SRARI_H2_SH(out4_r, out5_r, 6);
3634 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3635 CLIP_SH2_0_255(out4_r, out5_r);
3637 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3638 out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
3640 ST8x4_UB(out0_r, out1_r, dst, dst_stride);
3641 dst += (4 * dst_stride);
3642 ST8x2_UB(out2_r, dst, dst_stride);
3645 static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
3649 const int8_t *filter_x,
3650 const int8_t *filter_y,
3654 uint32_t loop_cnt, cnt;
3657 v16i8 src0, src1, src2, src3, src4, src5, src6;
3659 v4i32 filt_h0, filt_h1;
3660 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3662 v8i16 filter_vec, const_vec;
3663 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3664 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3665 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3666 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3667 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3668 v8i16 out0_r, out1_r, out2_r, out3_r;
3670 src -= (src_stride + 1);
3672 filter_vec = LD_SH(filter_x);
3673 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3675 filter_vec = LD_SH(filter_y);
3676 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3677 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3679 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3683 const_vec = __msa_ldi_h(128);
3686 for (cnt = width >> 3; cnt--;) {
3690 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3691 src_tmp += (3 * src_stride);
3693 XORI_B3_128_SB(src0, src1, src2);
3695 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3696 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3697 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3700 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3702 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3704 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3706 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3707 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3709 for (loop_cnt = height >> 2; loop_cnt--;) {
3710 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3711 src_tmp += (4 * src_stride);
3713 XORI_B4_128_SB(src3, src4, src5, src6);
3716 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3718 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3720 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3721 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3722 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3728 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3730 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3732 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3733 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3734 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3739 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3741 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3743 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3744 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3745 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3751 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3753 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3755 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3756 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3757 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3762 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3763 dst2_l, dst2_r, dst3_l, dst3_r,
3764 out0_r, out1_r, out2_r, out3_r);
3766 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3767 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3769 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3770 ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
3771 dst_tmp += (4 * dst_stride);
3779 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3783 const int8_t *filter_x,
3784 const int8_t *filter_y,
3788 hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3789 filter_x, filter_y, height);
3790 } else if (6 == height) {
3791 hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3792 filter_x, filter_y, height);
3793 } else if (0 == (height % 4)) {
3794 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3795 filter_x, filter_y, height, 8);
3799 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3803 const int8_t *filter_x,
3804 const int8_t *filter_y,
3807 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3808 filter_x, filter_y, height, 8);
3810 hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
3811 filter_x, filter_y, height);
3814 static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
3818 const int8_t *filter_x,
3819 const int8_t *filter_y,
3822 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3823 filter_x, filter_y, height, 16);
3826 static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
3830 const int8_t *filter_x,
3831 const int8_t *filter_y,
3834 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3835 filter_x, filter_y, height, 24);
3838 static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
3842 const int8_t *filter_x,
3843 const int8_t *filter_y,
3846 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3847 filter_x, filter_y, height, 32);
3850 #define UNI_MC_COPY(WIDTH) \
3851 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
3852 ptrdiff_t dst_stride, \
3854 ptrdiff_t src_stride, \
3860 copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
3873 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3874 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3885 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
3887 common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
3888 filter, height, 6); \
3891 UNI_MC(qpel, h, 4, 8, hz, mx);
3892 UNI_MC(qpel, h, 8, 8, hz, mx);
3893 UNI_MC(qpel, h, 12, 8, hz, mx);
3894 UNI_MC(qpel, h, 16, 8, hz, mx);
3895 UNI_MC(qpel, h, 24, 8, hz, mx);
3896 UNI_MC(qpel, h, 32, 8, hz, mx);
3897 UNI_MC(qpel, h, 48, 8, hz, mx);
3898 UNI_MC(qpel, h, 64, 8, hz, mx);
3900 UNI_MC(qpel, v, 4, 8, vt, my);
3901 UNI_MC(qpel, v, 8, 8, vt, my);
3902 UNI_MC(qpel, v, 12, 8, vt, my);
3903 UNI_MC(qpel, v, 16, 8, vt, my);
3904 UNI_MC(qpel, v, 24, 8, vt, my);
3905 UNI_MC(qpel, v, 32, 8, vt, my);
3906 UNI_MC(qpel, v, 48, 8, vt, my);
3907 UNI_MC(qpel, v, 64, 8, vt, my);
3909 UNI_MC(epel, h, 4, 4, hz, mx);
3910 UNI_MC(epel, h, 6, 4, hz, mx);
3911 UNI_MC(epel, h, 8, 4, hz, mx);
3912 UNI_MC(epel, h, 12, 4, hz, mx);
3913 UNI_MC(epel, h, 16, 4, hz, mx);
3914 UNI_MC(epel, h, 24, 4, hz, mx);
3915 UNI_MC(epel, h, 32, 4, hz, mx);
3917 UNI_MC(epel, v, 4, 4, vt, my);
3918 UNI_MC(epel, v, 6, 4, vt, my);
3919 UNI_MC(epel, v, 8, 4, vt, my);
3920 UNI_MC(epel, v, 12, 4, vt, my);
3921 UNI_MC(epel, v, 16, 4, vt, my);
3922 UNI_MC(epel, v, 24, 4, vt, my);
3923 UNI_MC(epel, v, 32, 4, vt, my);
3927 #define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
3928 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3939 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
3940 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
3942 hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
3943 dst_stride, filter_x, \
3944 filter_y, height); \
3947 UNI_MC_HV(qpel, hv, 4, 8, hv);
3948 UNI_MC_HV(qpel, hv, 8, 8, hv);
3949 UNI_MC_HV(qpel, hv, 12, 8, hv);
3950 UNI_MC_HV(qpel, hv, 16, 8, hv);
3951 UNI_MC_HV(qpel, hv, 24, 8, hv);
3952 UNI_MC_HV(qpel, hv, 32, 8, hv);
3953 UNI_MC_HV(qpel, hv, 48, 8, hv);
3954 UNI_MC_HV(qpel, hv, 64, 8, hv);
3956 UNI_MC_HV(epel, hv, 4, 4, hv);
3957 UNI_MC_HV(epel, hv, 6, 4, hv);
3958 UNI_MC_HV(epel, hv, 8, 4, hv);
3959 UNI_MC_HV(epel, hv, 12, 4, hv);
3960 UNI_MC_HV(epel, hv, 16, 4, hv);
3961 UNI_MC_HV(epel, hv, 24, 4, hv);
3962 UNI_MC_HV(epel, hv, 32, 4, hv);