/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"
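
/* Plain block-copy helpers for the HEVC uni-prediction "pel_pixels" paths.
 * Each one copies a width x height block from src to dst, using 64-bit
 * element copies for 8-wide blocks and full 16-byte vector loads/stores for
 * wider blocks. */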
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);

            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 4) {
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);

            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 2) {
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);

            SD(out0, dst);
            dst += dst_stride;
            SD(out1, dst);
            dst += dst_stride;
        }
    }
}
static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}
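
/* Generic copy for widths that are a multiple of 16 and heights that are a
 * multiple of 8; the copy_width16/24/32/48/64 functions below reuse it per
 * 16-column stripe. */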
static void copy_16multx8mult_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width)
{
    int32_t cnt, loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_UB8(src_tmp, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src_tmp += (8 * src_stride);

            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst_tmp, dst_stride);
            dst_tmp += (8 * dst_stride);
        }

        src += 16;
        dst += 16;
    }
}
static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst, dst_stride);
            dst += (8 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    copy_width8_msa(src + 16, src_stride, dst + 16, dst_stride, height);
}
static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        for (cnt = (height / 12); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
            dst += (4 * dst_stride);

            LD_UB4(src, src_stride, src0, src1, src2, src3);
            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 32);
    } else if (0 == height % 4) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 48);
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 64);
}
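
/* Byte-shuffle control vectors for VSHF_B: each index pair selects the two
 * adjacent source bytes that feed one dot-product lane of the horizontal
 * filters. The first row serves the 8-sample wide cases; the second and
 * third rows combine bytes from two source vectors for the 4-sample wide
 * cases. */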
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
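
/* Helper macros that evaluate the 8-tap and 4-tap filters with MSA
 * dot-product (dotp) and dot-product-accumulate (dpadd) instructions,
 * combining the partial sums with signed saturating adds. */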
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,                \
                            filt0, filt1, filt2, filt3)            \
( {                                                                \
    v8i16 tmp0, tmp1;                                              \
                                                                   \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);            \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);     \
    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);            \
    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);     \
    tmp0 = __msa_adds_s_h(tmp0, tmp1);                             \
                                                                   \
    tmp0;                                                          \
} )
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
    v8i16 res0_m, res1_m, res2_m, res3_m;                                  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, res0_m, res1_m);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, res0_m, res1_m);            \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DOTP_SB2_SH(vec4_m, vec5_m, filt2, filt2, res2_m, res3_m);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, res2_m, res3_m);            \
    ADDS_SH2_SH(res0_m, res2_m, res1_m, res3_m, out0, out1);               \
}
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;   \
    v8i16 res0_m, res1_m, res2_m, res3_m, res4_m, res5_m, res6_m, res7_m;   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                res0_m, res1_m, res2_m, res3_m);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                res4_m, res5_m, res6_m, res7_m);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1,       \
                 filt1, res0_m, res1_m, res2_m, res3_m);                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3,       \
                 filt3, res4_m, res5_m, res6_m, res7_m);                    \
    ADDS_SH4_SH(res0_m, res4_m, res1_m, res5_m, res2_m, res6_m, res3_m,     \
                res7_m, out0, out1, out2, out3);                            \
}
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)              \
( {                                                                \
    v8i16 tmp0;                                                    \
                                                                   \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);            \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);     \
                                                                   \
    tmp0;                                                          \
} )

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, filt0, filt1,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);       \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1,       \
                 filt1, out0, out1, out2, out3);                            \
}
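
/* Horizontal 8-tap (qpel) uni filters. Input pixels are biased into signed
 * range with xor 128, filtered, rounded with a shift of 6, saturated and
 * packed back to unsigned bytes. The 4x*/8x* variants handle fixed block
 * heights; the *_4w/_8w dispatchers below select among them. */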
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}
static void common_hz_8t_8x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1,
                               out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    tmp0 = PCKEV_XORI128_UB(out0, out1);
    tmp1 = PCKEV_XORI128_UB(out2, out3);
    ST8x4_UB(tmp0, tmp1, dst, dst_stride);
}
static void common_hz_8t_8x8mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_8x4_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_hz_8t_8x8mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint8_t *src1_ptr, *dst1;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00, tmp0, tmp1;

    mask00 = LD_UB(&mc_filt_mask_arr[0]);
    mask0 = LD_UB(&mc_filt_mask_arr[16]);

    src1_ptr = src - 3;
    dst1 = dst;

    dst = dst1 + 8;
    src = src1_ptr + 8;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 8 width */
        LD_SB4(src1_ptr, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src1_ptr += (4 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask00, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
        dst1 += (4 * dst_stride);

        /* 4 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);
        HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask4, mask5,
                                   mask6, filt0, filt1, filt2, filt3, out0,
                                   out1);
        SRARI_H2_SH(out0, out1, 6);
        SAT_SH2_SH(out0, out1, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out4, out5, out6, out7, out8, out9, out10;
    v8i16 out11, filt;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2, out4,
                    out10, out6, out11);
        DOTP_SB2_SH(vec1, vec3, filt2, filt2, out5, out7);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out4, out10, out6, out11);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out5, out7);
        ADDS_SH4_SH(out0, out4, out8, out10, out2, out6, out9, out11, out0,
                    out8, out2, out9);
        ADDS_SH2_SH(out1, out5, out3, out7, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST8x2_UB(out, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB3(src, 16, src0, src2, src3);
        src1 = __msa_sldi_b(src2, src0, 8);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
        ADDS_SH2_SH(out0, out3, out1, out4, out0, out1);
        out2 = __msa_adds_s_h(out2, out5);
        SRARI_H2_SH(out0, out1, 6);
        out6 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out6, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        src1 = LD_SB(src + 40);
        src += src_stride;
        src1 = (v16i8) __msa_xori_b((v16u8) src1, 128);

        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt2, filt2, filt2, out3, out4, out5);
        VSHF_B3_SB(src2, src3, src3, src3, src1, src1, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out3, out4);
        out5 = __msa_dpadd_s_h(out5, vec2, filt3);
        ADDS_SH2_SH(out0, out3, out1, out4, out3, out4);
        out5 = __msa_adds_s_h(out2, out5);
        SRARI_H2_SH(out3, out4, 6);
        out5 = __msa_srari_h(out5, 6);
        SAT_SH3_SH(out3, out4, out5, 7);
        out = PCKEV_XORI128_UB(out6, out3);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&mc_filt_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src1 = __msa_sldi_b(src2, src0, 8);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);

        src0 = LD_SB(src + 32);
        src2 = LD_SB(src + 48);
        src3 = LD_SB(src + 56);
        src1 = __msa_sldi_b(src2, src0, 8);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   mask2, mask3, filt0, filt1, filt2, filt3,
                                   out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}
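
/* Vertical 8-tap (qpel) uni filters. Source rows are byte-interleaved so the
 * column taps can be evaluated with the same dot-product helpers as the
 * horizontal code; the interleaved vectors are then slid along as the loop
 * advances down the block. */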
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v16u8 out;
    v8i16 filt, out10, out32;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        XORI_B2_128_SB(src8776, src10998);
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        SRARI_H2_SH(out10, out32, 6);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1, filt2, filt3;
    v4i32 mask = { 2, 6, 2, 6 };

    src -= (3 * src_stride);

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
    VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
    VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        XORI_B2_128_SB(src7, src8);
        src += (2 * src_stride);

        ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
                   vec01, vec23, vec45, vec67);
        tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);
        ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
                   vec45, vec67);
        tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);

        VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
        ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
                   vec45, vec67);
        tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);
        SRARI_H2_SH(tmp0, tmp1, 6);
        tmp2 = __msa_srari_h(tmp2, 6);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
        XORI_B3_128_SB(res0, res1, res2);

        out0 = __msa_copy_u_d((v2i64) res0, 0);
        out1 = __msa_copy_u_d((v2i64) res1, 0);
        out2 = __msa_copy_u_w((v4i32) res2, 0);
        out3 = __msa_copy_u_w((v4i32) res2, 1);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
        src5 = src7;
        src6 = src8;
        vec0 = vec2;
        vec1 = vec3;
        vec2 = vec4;
        vec3 = vec5;
        vec4 = vec6;
        vec5 = vec7;
    }
}
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}
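
/* Vertical 8-tap filter for any width that is a multiple of 16; the 24/32/
 * 48/64-wide wrappers below call this once per 16-column stripe. */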
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp, *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}
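
/* 2D (horizontal + vertical) 8-tap uni filters. The horizontal pass keeps
 * 16-bit intermediates (offset by 128 << 6 to undo the xor-128 input bias),
 * and the vertical pass works on 32-bit sums which are scaled down, rounded
 * by 6, clipped to 0..255 and packed back to bytes. */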
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += 2 * src_stride;
        XORI_B2_128_SB(src7, src8);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst87 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);

        dst76_r = __msa_ilvr_h(dst87, dst66);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h(mask4, dst87, dst87);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        dst0_r >>= 6;
        dst1_r >>= 6;

        SRARI_W2_SW(dst0_r, dst1_r, 6);
        dst0_r = CLIP_SW_0_255(dst0_r);
        dst1_r = CLIP_SW_0_255(dst1_r);

        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
        ST4x2_UB(dst0_r, dst, dst_stride);
        dst += (2 * dst_stride);

        dst10_r = dst32_r;
        dst32_r = dst54_r;
        dst54_r = dst76_r;
        dst21_r = dst43_r;
        dst43_r = dst65_r;
        dst65_r = dst87_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
    }
}
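
/* 2D 8-tap uni filter for widths that are a multiple of 8, processed as
 * 8-column stripes producing two output rows per inner-loop iteration. */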
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        dst3 = const_vec;
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        dst5 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        dst6 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src_tmp += 2 * src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;
            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            dst0_r = CLIP_SW_0_255(dst0_r);
            dst0_l = CLIP_SW_0_255(dst0_l);
            dst1_r = CLIP_SW_0_255(dst1_r);
            dst1_l = CLIP_SW_0_255(dst1_l);

            HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            dst10_r = dst32_r;
            dst32_r = dst54_r;
            dst54_r = dst76_r;
            dst10_l = dst32_l;
            dst32_l = dst54_l;
            dst54_l = dst76_l;
            dst21_r = dst43_r;
            dst43_r = dst65_r;
            dst65_r = dst87_r;
            dst21_l = dst43_l;
            dst43_l = dst65_l;
            dst65_l = dst87_l;
            dst6 = dst8;
        }

        src += 8;
        dst += 8;
    }
}
static void hevc_hv_uni_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y, int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
}

static void hevc_hv_uni_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);

    hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
                          filter_x, filter_y, height);
}

static void hevc_hv_uni_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 16);
}

static void hevc_hv_uni_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 24);
}

static void hevc_hv_uni_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 32);
}

static void hevc_hv_uni_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 48);
}

static void hevc_hv_uni_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                   uint8_t *dst, int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y, int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 64);
}
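
/* Horizontal 4-tap (epel) uni filters, structured like the 8-tap code above
 * but with only two filter-tap pairs per dot product and a source offset of
 * one pixel. */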
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
    v16u8 out;
    v8i16 filt, res0;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
    res0 = __msa_srari_h(res0, 6);
    res0 = __msa_sat_s_h(res0, 7);
    out = PCKEV_XORI128_UB(res0, res0);
    ST4x2_UB(out, dst, dst_stride);
}
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1;
    v16u8 out;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (4 == height) {
        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}
1849 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
1850 uint8_t *dst, int32_t dst_stride,
1851 const int8_t *filter, int32_t height)
1854 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1856 v8i16 filt, out0, out1, out2, out3;
1858 mask0 = LD_SB(&mc_filt_mask_arr[0]);
1861 /* rearranging filter */
1862 filt = LD_SH(filter);
1863 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1867 for (loop_cnt = (height >> 2); loop_cnt--;) {
1868 LD_SB4(src, src_stride, src0, src1, src2, src3);
1869 src += (4 * src_stride);
1871 XORI_B4_128_SB(src0, src1, src2, src3);
1872 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
1873 filt1, out0, out1, out2, out3);
1874 SRARI_H4_SH(out0, out1, out2, out3, 6);
1875 SAT_SH4_SH(out0, out1, out2, out3, 7);
1877 out4 = PCKEV_XORI128_UB(out0, out1);
1878 out5 = PCKEV_XORI128_UB(out2, out3);
1879 ST6x4_UB(out4, out5, dst, dst_stride);
1880 dst += (4 * dst_stride);
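/* Horizontal 4-tap filter, 8 columns, for heights that are multiples of 2:
 * two rows per iteration, each shuffled with mask0/mask1, dot-product
 * accumulated, rounded by 6, saturated and stored as an 8x2 block. */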
1884 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
1885 uint8_t *dst, int32_t dst_stride,
1886 const int8_t *filter, int32_t height)
1889 v16i8 src0, src1, filt0, filt1, mask0, mask1;
1891 v8i16 filt, vec0, vec1, vec2, vec3;
1893 mask0 = LD_SB(&mc_filt_mask_arr[0]);
1896 filt = LD_SH(filter);
1897 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1901 for (loop_cnt = (height >> 1); loop_cnt--;) {
1902 LD_SB2(src, src_stride, src0, src1);
1903 src += (2 * src_stride);
1905 XORI_B2_128_SB(src0, src1);
1906 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
1907 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
1908 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
1909 DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
1910 SRARI_H2_SH(vec0, vec1, 6);
1911 SAT_SH2_SH(vec0, vec1, 7);
1912 out = PCKEV_XORI128_UB(vec0, vec1);
1913 ST8x2_UB(out, dst, dst_stride);
1914 dst += (2 * dst_stride);
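/* Horizontal 4-tap filter, 8 columns, for heights that are multiples of 4:
 * same scheme as the 8x2 variant, unrolled to four rows per iteration. */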
1918 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
1919 uint8_t *dst, int32_t dst_stride,
1920 const int8_t *filter, int32_t height)
1923 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1925 v8i16 filt, out0, out1, out2, out3;
1927 mask0 = LD_SB(&mc_filt_mask_arr[0]);
1930 /* rearranging filter */
1931 filt = LD_SH(filter);
1932 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1936 for (loop_cnt = (height >> 2); loop_cnt--;) {
1937 LD_SB4(src, src_stride, src0, src1, src2, src3);
1938 src += (4 * src_stride);
1940 XORI_B4_128_SB(src0, src1, src2, src3);
1941 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
1942 filt1, out0, out1, out2, out3);
1943 SRARI_H4_SH(out0, out1, out2, out3, 6);
1944 SAT_SH4_SH(out0, out1, out2, out3, 7);
1945 tmp0 = PCKEV_XORI128_UB(out0, out1);
1946 tmp1 = PCKEV_XORI128_UB(out2, out3);
1947 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1948 dst += (4 * dst_stride);
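/* Width-8 horizontal 4-tap dispatcher: heights 2 and 6 use the two-row
 * variant, all other heights the four-row variant. */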
1952 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
1953 uint8_t *dst, int32_t dst_stride,
1954 const int8_t *filter, int32_t height)
1956 if ((2 == height) || (6 == height)) {
1957 common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
1960 common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
1965 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
1966 uint8_t *dst, int32_t dst_stride,
1967 const int8_t *filter, int32_t height)
1970 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
1971 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
1974 v8i16 filt, out0, out1, out2, out3, out4, out5;
1976 mask0 = LD_SB(&mc_filt_mask_arr[0]);
1977 mask2 = LD_SB(&mc_filt_mask_arr[32]);
1981 /* rearranging filter */
1982 filt = LD_SH(filter);
1983 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1988 for (loop_cnt = (height >> 2); loop_cnt--;) {
1989 LD_SB4(src, src_stride, src0, src1, src2, src3);
1990 src += (4 * src_stride);
1992 XORI_B4_128_SB(src0, src1, src2, src3);
1993 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
1994 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
1995 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
1996 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
1997 out2, out3, out4, out5);
1998 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
1999 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2000 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2001 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2002 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2003 out2, out3, out4, out5);
2004 DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2005 SRARI_H4_SH(out0, out1, out2, out3, 6);
2006 SRARI_H2_SH(out4, out5, 6);
2007 SAT_SH4_SH(out0, out1, out2, out3, 7);
2008 SAT_SH2_SH(out4, out5, 7);
2009 tmp0 = PCKEV_XORI128_UB(out2, out3);
2010 tmp1 = PCKEV_XORI128_UB(out4, out5);
2011 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2012 tmp0 = PCKEV_XORI128_UB(out0, out1);
2013 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2014 dst += (4 * dst_stride);
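/* Horizontal 4-tap filter for 16-pixel-wide blocks: each row is processed
 * as two 8-column halves (src and src + 8), and the filtered halves are
 * packed back together for a single 16-byte store per row. */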
2018 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2019 uint8_t *dst, int32_t dst_stride,
2020 const int8_t *filter, int32_t height)
2023 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2024 v16i8 filt0, filt1, mask0, mask1;
2025 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2028 mask0 = LD_SB(&mc_filt_mask_arr[0]);
2031 /* rearranging filter */
2032 filt = LD_SH(filter);
2033 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2037 for (loop_cnt = (height >> 2); loop_cnt--;) {
2038 LD_SB4(src, src_stride, src0, src2, src4, src6);
2039 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2040 src += (4 * src_stride);
2042 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2043 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2044 filt1, out0, out1, out2, out3);
2045 HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
2046 filt1, out4, out5, out6, out7);
2047 SRARI_H4_SH(out0, out1, out2, out3, 6);
2048 SRARI_H4_SH(out4, out5, out6, out7, 6);
2049 SAT_SH4_SH(out0, out1, out2, out3, 7);
2050 SAT_SH4_SH(out4, out5, out6, out7, 7);
2051 out = PCKEV_XORI128_UB(out0, out1);
2054 out = PCKEV_XORI128_UB(out2, out3);
2057 out = PCKEV_XORI128_UB(out4, out5);
2060 out = PCKEV_XORI128_UB(out6, out7);
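/* Horizontal 4-tap filter for 24-pixel-wide blocks: the left 16 columns are
 * filtered with mask0/mask1 and the cross-vector masks mask00/mask11, the
 * remaining 8 columns (written through dst1 = dst + 16) with mask0/mask1
 * only. */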
2066 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2067 uint8_t *dst, int32_t dst_stride,
2068 const int8_t *filter, int32_t height)
2070 uint8_t *dst1 = dst + 16;
2072 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2073 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2074 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2075 v8i16 filt, out0, out1, out2, out3;
2078 mask0 = LD_SB(&mc_filt_mask_arr[0]);
2081 /* rearranging filter */
2082 filt = LD_SH(filter);
2083 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2087 mask11 = mask0 + 10;
2089 for (loop_cnt = (height >> 2); loop_cnt--;) {
2090 LD_SB4(src, src_stride, src0, src2, src4, src6);
2091 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2092 src += (4 * src_stride);
2094 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2095 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2096 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2097 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2098 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2099 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2100 out0, out1, out2, out3);
2101 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2102 out0, out1, out2, out3);
2103 SRARI_H4_SH(out0, out1, out2, out3, 6);
2104 SAT_SH4_SH(out0, out1, out2, out3, 7);
2105 tmp0 = PCKEV_XORI128_UB(out0, out1);
2108 tmp0 = PCKEV_XORI128_UB(out2, out3);
2112 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2113 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2114 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2115 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2116 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2117 out0, out1, out2, out3);
2118 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2119 out0, out1, out2, out3);
2120 SRARI_H4_SH(out0, out1, out2, out3, 6);
2121 SAT_SH4_SH(out0, out1, out2, out3, 7);
2122 tmp0 = PCKEV_XORI128_UB(out0, out1);
2125 tmp0 = PCKEV_XORI128_UB(out2, out3);
2130 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2131 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2132 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2133 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2135 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2136 out0, out1, out2, out3);
2137 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2138 out0, out1, out2, out3);
2140 SRARI_H4_SH(out0, out1, out2, out3, 6);
2141 SAT_SH4_SH(out0, out1, out2, out3, 7);
2142 tmp0 = PCKEV_XORI128_UB(out0, out1);
2143 tmp1 = PCKEV_XORI128_UB(out2, out3);
2144 ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2145 dst1 += (4 * dst_stride);
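/* Horizontal 4-tap filter for 32-pixel-wide blocks: two rows per iteration,
 * each row handled as four 8-column vectors (the vectors straddling the
 * 16-byte loads are built with SLDI byte shifts), producing two 16-byte
 * stores per row. */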
2149 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2150 uint8_t *dst, int32_t dst_stride,
2151 const int8_t *filter, int32_t height)
2154 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2155 v16i8 filt0, filt1, mask0, mask1;
2157 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2159 mask0 = LD_SB(&mc_filt_mask_arr[0]);
2162 /* rearranging filter */
2163 filt = LD_SH(filter);
2164 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2168 for (loop_cnt = (height >> 1); loop_cnt--;) {
2170 src2 = LD_SB(src + 16);
2171 src3 = LD_SB(src + 24);
2174 src6 = LD_SB(src + 16);
2175 src7 = LD_SB(src + 24);
2176 SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
2179 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2180 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2181 filt0, filt1, out0, out1, out2, out3);
2182 HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2183 filt0, filt1, out4, out5, out6, out7);
2184 SRARI_H4_SH(out0, out1, out2, out3, 6);
2185 SRARI_H4_SH(out4, out5, out6, out7, 6);
2186 SAT_SH4_SH(out0, out1, out2, out3, 7);
2187 SAT_SH4_SH(out4, out5, out6, out7, 7);
2188 out = PCKEV_XORI128_UB(out0, out1);
2190 out = PCKEV_XORI128_UB(out2, out3);
2191 ST_UB(out, dst + 16);
2193 out = PCKEV_XORI128_UB(out4, out5);
2195 out = PCKEV_XORI128_UB(out6, out7);
2196 ST_UB(out, dst + 16);
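/* Vertical 4-tap (EPEL) filters. Consecutive rows are byte-interleaved
 * (ILVR/ILVL) so FILT_4TAP_DPADD_S_H can form the column-wise 4-tap sums;
 * results are rounded by 6, saturated and packed back to bytes. */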
2201 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2202 uint8_t *dst, int32_t dst_stride,
2203 const int8_t *filter)
2205 v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2206 v16i8 src2110, src4332, filt0, filt1;
2212 filt = LD_SH(filter);
2213 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2215 LD_SB3(src, src_stride, src0, src1, src2);
2216 src += (3 * src_stride);
2218 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2219 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2220 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2221 LD_SB2(src, src_stride, src3, src4);
2222 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2223 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2224 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2225 out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
2226 out10 = __msa_srari_h(out10, 6);
2227 out10 = __msa_sat_s_h(out10, 7);
2228 out = PCKEV_XORI128_UB(out10, out10);
2229 ST4x2_UB(out, dst, dst_stride);
2232 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2233 uint8_t *dst, int32_t dst_stride,
2234 const int8_t *filter, int32_t height)
2237 v16i8 src0, src1, src2, src3, src4, src5;
2238 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2239 v16i8 src2110, src4332, filt0, filt1;
2240 v8i16 filt, out10, out32;
2245 filt = LD_SH(filter);
2246 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2248 LD_SB3(src, src_stride, src0, src1, src2);
2249 src += (3 * src_stride);
2251 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2253 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2254 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2256 for (loop_cnt = (height >> 2); loop_cnt--;) {
2257 LD_SB3(src, src_stride, src3, src4, src5);
2258 src += (3 * src_stride);
2259 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2260 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2261 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2262 out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
2265 src += (src_stride);
2266 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2267 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2268 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2269 out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
2270 SRARI_H2_SH(out10, out32, 6);
2271 SAT_SH2_SH(out10, out32, 7);
2272 out = PCKEV_XORI128_UB(out10, out32);
2273 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2274 dst += (4 * dst_stride);
2278 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2279 uint8_t *dst, int32_t dst_stride,
2280 const int8_t *filter, int32_t height)
2283 common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2285 common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2290 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2291 uint8_t *dst, int32_t dst_stride,
2292 const int8_t *filter, int32_t height)
2295 v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
2296 v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
2297 v8i16 filt, filt0, filt1;
2301 /* rearranging filter_y */
2302 filt = LD_SH(filter);
2303 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2305 LD_UB3(src, src_stride, src0, src1, src2);
2306 src += (3 * src_stride);
2308 vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
2309 vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
2310 vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);
2312 for (loop_cnt = (height >> 2); loop_cnt--;) {
2313 LD_UB4(src, src_stride, src3, src0, src1, src2);
2314 src += (4 * src_stride);
2316 vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
2317 ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
2318 tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);
2320 vec0 = __msa_xori_b((v16u8) src0, 128);
2321 ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
2322 tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);
2324 vec1 = __msa_xori_b((v16u8) src1, 128);
2325 vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
2326 tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);
2328 vec2 = __msa_xori_b((v16u8) src2, 128);
2329 vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
2330 tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);
2332 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
2333 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
2334 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
2335 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
2336 ST6x4_UB(out0, out1, dst, dst_stride);
2337 dst += (4 * dst_stride);
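/* Vertical 4-tap filter, 8 columns x 2 rows: five input rows produce the
 * two output rows in a single shot. */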
2341 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2342 uint8_t *dst, int32_t dst_stride,
2343 const int8_t *filter)
2345 v16i8 src0, src1, src2, src3, src4;
2346 v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2351 /* rearranging filter_y */
2352 filt = LD_SH(filter);
2353 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2355 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2356 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2357 ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2358 tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1);
2359 ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2360 tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1);
2361 SRARI_H2_SH(tmp0, tmp1, 6);
2362 SAT_SH2_SH(tmp0, tmp1, 7);
2363 out = PCKEV_XORI128_UB(tmp0, tmp1);
2364 ST8x2_UB(out, dst, dst_stride);
2367 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2368 uint8_t *dst, int32_t dst_stride,
2369 const int8_t *filter)
2372 uint64_t out0, out1, out2;
2373 v16i8 src0, src1, src2, src3, src4, src5;
2374 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2375 v8i16 filt, filt0, filt1;
2379 /* rearranging filter_y */
2380 filt = LD_SH(filter);
2381 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2383 LD_SB3(src, src_stride, src0, src1, src2);
2384 src += (3 * src_stride);
2386 XORI_B3_128_SB(src0, src1, src2);
2387 ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2389 for (loop_cnt = 2; loop_cnt--;) {
2390 LD_SB3(src, src_stride, src3, src4, src5);
2391 src += (3 * src_stride);
2393 XORI_B3_128_SB(src3, src4, src5);
2394 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2395 tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
2396 tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
2397 tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
2398 SRARI_H2_SH(tmp0, tmp1, 6);
2399 tmp2 = __msa_srari_h(tmp2, 6);
2400 SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2401 PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2402 XORI_B2_128_SH(tmp0, tmp2);
2404 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2405 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2406 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2420 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2421 uint8_t *dst, int32_t dst_stride,
2422 const int8_t *filter, int32_t height)
2425 v16i8 src0, src1, src2, src7, src8, src9, src10;
2426 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2428 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2432 filt = LD_SH(filter);
2433 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2435 LD_SB3(src, src_stride, src0, src1, src2);
2436 src += (3 * src_stride);
2438 XORI_B3_128_SB(src0, src1, src2);
2439 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2441 for (loop_cnt = (height >> 2); loop_cnt--;) {
2442 LD_SB4(src, src_stride, src7, src8, src9, src10);
2443 src += (4 * src_stride);
2445 XORI_B4_128_SB(src7, src8, src9, src10);
2446 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2447 src72_r, src87_r, src98_r, src109_r);
2448 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
2449 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
2450 out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
2451 out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2452 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2453 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2454 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2455 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2456 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2457 dst += (4 * dst_stride);
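/* Width-8 vertical 4-tap dispatcher: 2-row, 6-row or the generic
 * multiple-of-4-height variant. */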
2465 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2466 uint8_t *dst, int32_t dst_stride,
2467 const int8_t *filter, int32_t height)
2470 common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2471 } else if (6 == height) {
2472 common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2474 common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2479 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2480 uint8_t *dst, int32_t dst_stride,
2481 const int8_t *filter, int32_t height)
2484 v16i8 src0, src1, src2, src3, src4, src5, src6;
2485 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2487 v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
2488 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
2489 v4u32 mask = { 2, 6, 2, 6 };
2491 /* rearranging filter_y */
2492 filt = LD_SH(filter);
2493 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2497 LD_SB3(src, src_stride, src0, src1, src2);
2498 src += (3 * src_stride);
2500 XORI_B3_128_SB(src0, src1, src2);
2501 VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
2503 for (loop_cnt = (height >> 2); loop_cnt--;) {
2504 LD_SB4(src, src_stride, src3, src4, src5, src6);
2505 src += (4 * src_stride);
2507 XORI_B4_128_SB(src3, src4, src5, src6);
2508 ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
2509 VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
2510 VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
2511 tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
2512 ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
2513 src21, src43, src54, src65);
2514 tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
2515 tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
2516 tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
2517 ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
2518 tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
2519 tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
2520 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
2521 SRARI_H2_SH(tmp4, tmp5, 6);
2522 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
2523 SAT_SH2_SH(tmp4, tmp5, 7);
2524 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
2525 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
2526 ST8x4_UB(out0, out1, dst, dst_stride);
2527 out0 = PCKEV_XORI128_UB(tmp4, tmp5);
2528 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2529 dst += (4 * dst_stride);
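/* Vertical 4-tap filter for 16-pixel-wide blocks: both the low (ILVR) and
 * high (ILVL) interleaves of each row pair are filtered, so a full 16-byte
 * row can be packed and stored per output line. */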
2540 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2541 uint8_t *dst, int32_t dst_stride,
2542 const int8_t *filter, int32_t height)
2545 v16i8 src0, src1, src2, src3, src4, src5, src6;
2546 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2547 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2548 v16u8 tmp0, tmp1, tmp2, tmp3;
2549 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2553 filt = LD_SH(filter);
2554 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2556 LD_SB3(src, src_stride, src0, src1, src2);
2557 src += (3 * src_stride);
2559 XORI_B3_128_SB(src0, src1, src2);
2560 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2561 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2563 for (loop_cnt = (height >> 2); loop_cnt--;) {
2564 LD_SB4(src, src_stride, src3, src4, src5, src6);
2565 src += (4 * src_stride);
2567 XORI_B4_128_SB(src3, src4, src5, src6);
2568 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2569 src32_r, src43_r, src54_r, src65_r);
2570 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2571 src32_l, src43_l, src54_l, src65_l);
2572 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2573 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2574 out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
2575 out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
2576 out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2577 out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2578 out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
2579 out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
2580 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2581 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2582 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2583 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2584 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2585 out3_r, tmp0, tmp1, tmp2, tmp3);
2586 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2587 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2588 dst += (4 * dst_stride);
2598 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2599 uint8_t *dst, int32_t dst_stride,
2600 const int8_t *filter, int32_t height)
2603 uint64_t out0, out1;
2604 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2605 v16i8 src11, filt0, filt1;
2606 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2607 v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2609 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2613 filt = LD_SH(filter);
2614 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2617 LD_SB3(src, src_stride, src0, src1, src2);
2618 XORI_B3_128_SB(src0, src1, src2);
2619 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2620 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2623 LD_SB3(src + 16, src_stride, src6, src7, src8);
2624 src += (3 * src_stride);
2625 XORI_B3_128_SB(src6, src7, src8);
2626 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2628 for (loop_cnt = (height >> 2); loop_cnt--;) {
2630 LD_SB2(src, src_stride, src3, src4);
2631 XORI_B2_128_SB(src3, src4);
2632 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2633 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2636 LD_SB2(src + 16, src_stride, src9, src10);
2637 src += (2 * src_stride);
2638 XORI_B2_128_SB(src9, src10);
2639 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2642 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2643 out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2644 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2645 out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2648 out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
2649 out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2652 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2653 SRARI_H2_SH(out0_l, out1_l, 6);
2654 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2655 SAT_SH2_SH(out0_l, out1_l, 7);
2656 out = PCKEV_XORI128_UB(out0_r, out0_l);
2658 PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2659 XORI_B2_128_SH(out2_r, out3_r);
2660 out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2661 out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2664 out = PCKEV_XORI128_UB(out1_r, out1_l);
2670 LD_SB2(src, src_stride, src5, src2);
2671 XORI_B2_128_SB(src5, src2);
2672 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2673 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2676 LD_SB2(src + 16, src_stride, src11, src8);
2677 src += (2 * src_stride);
2678 XORI_B2_128_SB(src11, src8);
2679 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2682 out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
2683 out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
2684 out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
2685 out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);
2688 out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
2689 out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);
2692 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2693 SRARI_H2_SH(out0_l, out1_l, 6);
2694 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2695 SAT_SH2_SH(out0_l, out1_l, 7);
2696 out = PCKEV_XORI128_UB(out0_r, out0_l);
2698 out = PCKEV_XORI128_UB(out2_r, out2_r);
2699 ST8x1_UB(out, dst + 16);
2701 out = PCKEV_XORI128_UB(out1_r, out1_l);
2703 out = PCKEV_XORI128_UB(out3_r, out3_r);
2704 ST8x1_UB(out, dst + 16);
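/* Vertical 4-tap filter for widths that are multiples of 32: the outer loop
 * walks 32-column tiles (two 16-column halves each), the inner loop emits
 * two output rows per iteration. */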
2709 static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
2710 uint8_t *dst, int32_t dst_stride,
2711 const int8_t *filter, int32_t height,
2714 uint32_t loop_cnt, cnt;
2715 uint8_t *dst_tmp, *src_tmp;
2716 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
2717 v16i8 src10_r, src32_r, src76_r, src98_r;
2718 v16i8 src21_r, src43_r, src87_r, src109_r;
2719 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2720 v16i8 src10_l, src32_l, src76_l, src98_l;
2721 v16i8 src21_l, src43_l, src87_l, src109_l;
2728 filt = LD_SH(filter);
2729 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2731 for (cnt = (width >> 5); cnt--;) {
2736 LD_SB3(src_tmp, src_stride, src0, src1, src2);
2737 XORI_B3_128_SB(src0, src1, src2);
2739 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2740 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2743 LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
2744 src_tmp += (3 * src_stride);
2746 XORI_B3_128_SB(src6, src7, src8);
2747 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2748 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
2750 for (loop_cnt = (height >> 1); loop_cnt--;) {
2752 LD_SB2(src_tmp, src_stride, src3, src4);
2753 XORI_B2_128_SB(src3, src4);
2754 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2755 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2758 out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
2759 out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
2760 out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
2761 out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
2764 SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
2765 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
2766 out = PCKEV_XORI128_UB(out0_r, out0_l);
2767 ST_UB(out, dst_tmp);
2768 out = PCKEV_XORI128_UB(out1_r, out1_l);
2769 ST_UB(out, dst_tmp + dst_stride);
2778 LD_SB2(src_tmp + 16, src_stride, src9, src10);
2779 src_tmp += (2 * src_stride);
2780 XORI_B2_128_SB(src9, src10);
2781 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2782 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
2785 out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
2786 out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
2787 out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
2788 out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);
2791 SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
2792 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
2793 out = PCKEV_XORI128_UB(out2_r, out2_l);
2794 ST_UB(out, dst_tmp + 16);
2795 out = PCKEV_XORI128_UB(out3_r, out3_l);
2796 ST_UB(out, dst_tmp + 16 + dst_stride);
2798 dst_tmp += 2 * dst_stride;
2812 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2813 uint8_t *dst, int32_t dst_stride,
2814 const int8_t *filter, int32_t height)
2816 common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
2817 filter, height, 32);
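/* 2-D (hv) uni 4-tap filters. Each block is filtered horizontally first,
 * into 16-bit intermediates, then those intermediates go through the
 * vertical HEVC_FILT_4TAP stage; the final values are rounded by 6
 * (SRARI, i.e. (x + 32) >> 6), clipped to [0, 255] and packed to bytes. */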
2820 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
2824 const int8_t *filter_x,
2825 const int8_t *filter_y,
2828 v16i8 src0, src1, src2, src3, src4;
2830 v4i32 filt_h0, filt_h1;
2831 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2833 v8i16 filter_vec, const_vec;
2834 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2835 v8i16 dst0, dst1, dst2, dst3, dst4;
2836 v4i32 dst0_r, dst1_r;
2837 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2839 src -= (src_stride + 1);
2841 filter_vec = LD_SH(filter_x);
2842 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2844 filter_vec = LD_SH(filter_y);
2845 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2846 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2848 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
2852 const_vec = __msa_ldi_h(128);
2855 LD_SB3(src, src_stride, src0, src1, src2);
2856 src += (3 * src_stride);
2858 XORI_B3_128_SB(src0, src1, src2);
2860 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2861 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2862 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2865 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2867 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
2869 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
2871 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2872 LD_SB2(src, src_stride, src3, src4);
2873 XORI_B2_128_SB(src3, src4);
2876 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2878 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2880 dst32_r = __msa_ilvr_h(dst3, dst2);
2881 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
2885 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2887 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2889 dst43_r = __msa_ilvr_h(dst4, dst3);
2890 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
2893 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
2894 dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
2895 dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
2896 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
2898 ST4x2_UB(dst0_r, dst, dst_stride);
2901 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
2905 const int8_t *filter_x,
2906 const int8_t *filter_y,
2909 v16i8 src0, src1, src2, src3, src4, src5, src6;
2911 v4i32 filt_h0, filt_h1;
2912 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2914 v8i16 filter_vec, const_vec;
2915 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2916 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2917 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
2918 v8i16 out0_r, out1_r;
2919 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
2921 src -= (src_stride + 1);
2923 filter_vec = LD_SH(filter_x);
2924 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2926 filter_vec = LD_SH(filter_y);
2927 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2928 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2930 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
2934 const_vec = __msa_ldi_h(128);
2937 LD_SB3(src, src_stride, src0, src1, src2);
2938 src += (3 * src_stride);
2940 XORI_B3_128_SB(src0, src1, src2);
2942 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2943 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
2944 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
2947 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2949 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
2951 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
2953 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
2954 LD_SB4(src, src_stride, src3, src4, src5, src6);
2955 XORI_B4_128_SB(src3, src4, src5, src6);
2958 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2960 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2962 dst32_r = __msa_ilvr_h(dst3, dst2);
2963 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
2967 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2969 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2971 dst43_r = __msa_ilvr_h(dst4, dst3);
2972 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
2976 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2978 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2980 dst10_r = __msa_ilvr_h(dst5, dst4);
2981 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
2985 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2987 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2989 dst21_r = __msa_ilvr_h(dst2, dst5);
2990 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
2993 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
2994 SRARI_H2_SH(out0_r, out1_r, 6);
2995 CLIP_SH2_0_255(out0_r, out1_r);
2996 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
2998 ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
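/* 4-column hv filter for heights that are multiples of 8: the last two
 * horizontal intermediates are carried across iterations, and each loop
 * produces eight output rows stored via ST4x8_UB. */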
3001 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3005 const int8_t *filter_x,
3006 const int8_t *filter_y,
3010 v16i8 src0, src1, src2, src3, src4, src5;
3011 v16i8 src6, src7, src8, src9, src10;
3013 v4i32 filt_h0, filt_h1;
3014 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3016 v8i16 filter_vec, const_vec;
3017 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3018 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3019 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3020 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3021 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3022 v8i16 out0_r, out1_r, out2_r, out3_r;
3024 src -= (src_stride + 1);
3026 filter_vec = LD_SH(filter_x);
3027 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3029 filter_vec = LD_SH(filter_y);
3030 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3031 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3033 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3037 const_vec = __msa_ldi_h(128);
3040 LD_SB3(src, src_stride, src0, src1, src2);
3041 src += (3 * src_stride);
3043 XORI_B3_128_SB(src0, src1, src2);
3045 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3046 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3047 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3050 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3052 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3054 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3056 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3058 for (loop_cnt = height >> 3; loop_cnt--;) {
3059 LD_SB8(src, src_stride,
3060 src3, src4, src5, src6, src7, src8, src9, src10);
3061 src += (8 * src_stride);
3063 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3066 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3068 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3070 dst32_r = __msa_ilvr_h(dst3, dst2);
3071 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3075 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3077 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3079 dst43_r = __msa_ilvr_h(dst4, dst3);
3080 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3084 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3086 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3088 dst54_r = __msa_ilvr_h(dst5, dst4);
3089 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3093 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3095 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3097 dst65_r = __msa_ilvr_h(dst6, dst5);
3098 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3102 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3104 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3106 dst76_r = __msa_ilvr_h(dst7, dst6);
3107 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3111 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3113 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3115 dst87_r = __msa_ilvr_h(dst8, dst7);
3116 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3120 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3122 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
3124 dst10_r = __msa_ilvr_h(dst9, dst8);
3125 dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
3129 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3131 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3133 dst21_r = __msa_ilvr_h(dst2, dst9);
3134 dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
3137 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3138 dst5_r, dst4_r, dst7_r, dst6_r,
3139 out0_r, out1_r, out2_r, out3_r);
3141 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3142 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3144 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3145 ST4x8_UB(out0_r, out1_r, dst, dst_stride);
3146 dst += (8 * dst_stride);
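/* Width-4 hv dispatcher: height 2, height 4, or any multiple of 8. */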
3150 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3154 const int8_t *filter_x,
3155 const int8_t *filter_y,
3159 hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3160 filter_x, filter_y, height);
3161 } else if (4 == height) {
3162 hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3163 filter_x, filter_y, height);
3164 } else if (0 == (height % 8)) {
3165 hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3166 filter_x, filter_y, height);
3170 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3174 const int8_t *filter_x,
3175 const int8_t *filter_y,
3179 v16i8 src0, src1, src2, src3, src4, src5, src6;
3181 v4i32 filt_h0, filt_h1;
3182 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3184 v8i16 filter_vec, const_vec;
3185 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3186 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3187 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3188 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3189 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3190 v8i16 out0_r, out1_r, out2_r, out3_r;
3192 src -= (src_stride + 1);
3194 filter_vec = LD_SH(filter_x);
3195 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3197 filter_vec = LD_SH(filter_y);
3198 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3199 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3201 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3205 const_vec = __msa_ldi_h(128);
3208 LD_SB3(src, src_stride, src0, src1, src2);
3209 src += (3 * src_stride);
3211 XORI_B3_128_SB(src0, src1, src2);
3213 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3214 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3215 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3218 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3220 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3222 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3224 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3225 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3227 for (loop_cnt = height >> 2; loop_cnt--;) {
3228 LD_SB4(src, src_stride, src3, src4, src5, src6);
3229 src += (4 * src_stride);
3231 XORI_B4_128_SB(src3, src4, src5, src6);
3234 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3236 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3238 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3239 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3240 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3245 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3247 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3249 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3250 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3251 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3256 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3258 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3260 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3261 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3262 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3268 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3270 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3272 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3273 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3274 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3279 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3280 dst2_l, dst2_r, dst3_l, dst3_r,
3281 out0_r, out1_r, out2_r, out3_r);
3283 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3284 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3286 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3287 ST6x4_UB(out0_r, out1_r, dst, dst_stride);
3288 dst += (4 * dst_stride);
3292 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3296 const int8_t *filter_x,
3297 const int8_t *filter_y,
3300 v16i8 src0, src1, src2, src3, src4;
3302 v4i32 filt_h0, filt_h1;
3303 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3305 v8i16 filter_vec, const_vec;
3306 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3307 v8i16 dst0, dst1, dst2, dst3, dst4;
3308 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3309 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3310 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3311 v8i16 out0_r, out1_r;
3313 src -= (src_stride + 1);
3315 filter_vec = LD_SH(filter_x);
3316 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3318 filter_vec = LD_SH(filter_y);
3319 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3320 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3322 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3326 const_vec = __msa_ldi_h(128);
3329 LD_SB3(src, src_stride, src0, src1, src2);
3330 src += (3 * src_stride);
3332 XORI_B3_128_SB(src0, src1, src2);
3334 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3335 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3336 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3339 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3341 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3343 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3345 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3346 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3348 LD_SB2(src, src_stride, src3, src4);
3349 XORI_B2_128_SB(src3, src4);
3352 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3354 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3356 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3357 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3358 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3363 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3365 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3367 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3368 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3369 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3373 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3374 SRARI_H2_SH(out0_r, out1_r, 6);
3375 CLIP_SH2_0_255(out0_r, out1_r);
3376 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
3378 ST8x2_UB(out0_r, dst, dst_stride);
3381 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3385 const int8_t *filter_x,
3386 const int8_t *filter_y,
3389 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3391 v4i32 filt_h0, filt_h1;
3392 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3394 v8i16 filter_vec, const_vec;
3395 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3396 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3397 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3398 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3399 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3400 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3401 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3402 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3403 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3405 src -= (src_stride + 1);
3407 filter_vec = LD_SH(filter_x);
3408 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3410 filter_vec = LD_SH(filter_y);
3411 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3412 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3414 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3418 const_vec = __msa_ldi_h(128);
3421 LD_SB3(src, src_stride, src0, src1, src2);
3422 src += (3 * src_stride);
3424 XORI_B3_128_SB(src0, src1, src2);
3426 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3427 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3428 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3431 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3433 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3435 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3437 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3438 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3440 LD_SB2(src, src_stride, src3, src4);
3441 src += (2 * src_stride);
3443 XORI_B2_128_SB(src3, src4);
3446 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3448 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3450 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3451 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3452 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3458 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3460 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3462 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3463 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3464 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3468 LD_SB2(src, src_stride, src5, src6);
3469 src += (2 * src_stride);
3471 XORI_B2_128_SB(src5, src6);
3474 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3476 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3478 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3479 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3480 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3485 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3487 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3489 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3490 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3491 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3495 LD_SB2(src, src_stride, src7, src8);
3496 src += (2 * src_stride);
3498 XORI_B2_128_SB(src7, src8);
3501 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3503 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3505 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3506 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3507 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3513 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3515 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3517 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3518 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3519 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
3523 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3524 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3525 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3526 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3527 SRARI_H2_SH(out4_r, out5_r, 6);
3528 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3529 CLIP_SH2_0_255(out4_r, out5_r);
3531 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3532 out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);
3534 ST8x4_UB(out0_r, out1_r, dst, dst_stride);
3535 dst += (4 * dst_stride);
3536 ST8x2_UB(out2_r, dst, dst_stride);
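/* Generic hv filter for widths that are multiples of 8 and heights that are
 * multiples of 4: outer loop over 8-column tiles, inner loop over groups of
 * four output rows. */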
3539 static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
3543 const int8_t *filter_x,
3544 const int8_t *filter_y,
3548 uint32_t loop_cnt, cnt;
3551 v16i8 src0, src1, src2, src3, src4, src5, src6;
3553 v4i32 filt_h0, filt_h1;
3554 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3556 v8i16 filter_vec, const_vec;
3557 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3558 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3559 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3560 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3561 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3562 v8i16 out0_r, out1_r, out2_r, out3_r;
3564 src -= (src_stride + 1);
3566 filter_vec = LD_SH(filter_x);
3567 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3569 filter_vec = LD_SH(filter_y);
3570 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3571 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3573 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3577 const_vec = __msa_ldi_h(128);
3580 for (cnt = width >> 3; cnt--;) {
3584 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3585 src_tmp += (3 * src_stride);
3587 XORI_B3_128_SB(src0, src1, src2);
3589 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3590 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3591 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3594 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3596 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3598 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3600 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3601 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3603 for (loop_cnt = height >> 2; loop_cnt--;) {
3604 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3605 src_tmp += (4 * src_stride);
3607 XORI_B4_128_SB(src3, src4, src5, src6);
3610 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3612 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3614 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3615 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3616 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3622 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3624 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3626 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3627 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3628 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3633 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3635 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3637 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3638 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3639 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3645 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3647 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3649 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3650 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3651 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3656 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3657 dst2_l, dst2_r, dst3_l, dst3_r,
3658 out0_r, out1_r, out2_r, out3_r);
3660 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3661 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3663 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3664 ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
3665 dst_tmp += (4 * dst_stride);
3673 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3677 const int8_t *filter_x,
3678 const int8_t *filter_y,
3682 hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3683 filter_x, filter_y, height);
3684 } else if (6 == height) {
3685 hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3686 filter_x, filter_y, height);
3687 } else if (0 == (height % 4)) {
3688 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3689 filter_x, filter_y, height, 8);
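/* 12-wide hv: the left 8 columns go through the generic 8-column routine,
 * the remaining 4 columns through the width-4 path. */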
3693 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3697 const int8_t *filter_x,
3698 const int8_t *filter_y,
3701 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3702 filter_x, filter_y, height, 8);
3704 hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
3705 filter_x, filter_y, height);
3708 static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
3712 const int8_t *filter_x,
3713 const int8_t *filter_y,
3716 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3717 filter_x, filter_y, height, 16);
3720 static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
3724 const int8_t *filter_x,
3725 const int8_t *filter_y,
3728 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3729 filter_x, filter_y, height, 24);
3732 static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
3736 const int8_t *filter_x,
3737 const int8_t *filter_y,
3740 hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
3741 filter_x, filter_y, height, 32);
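/* Entry-point generators. The macros below expand into the
 * ff_hevc_put_hevc_uni_* functions: UNI_MC_COPY wraps the plain
 * copy_width* routines, UNI_MC wraps the one-dimensional common_hz_* /
 * common_vt_* filters (selecting the filter table entry from mx or my),
 * and UNI_MC_HV wraps the 2-D hevc_hv_uni_* filters. */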
3744 #define UNI_MC_COPY(WIDTH) \
3745 void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
3746 ptrdiff_t dst_stride, \
3748 ptrdiff_t src_stride, \
3754 copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height); \
3767 #define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3768 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3779 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
3781 common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
3785 UNI_MC(qpel, h, 4, 8, hz, mx);
3786 UNI_MC(qpel, h, 8, 8, hz, mx);
3787 UNI_MC(qpel, h, 12, 8, hz, mx);
3788 UNI_MC(qpel, h, 16, 8, hz, mx);
3789 UNI_MC(qpel, h, 24, 8, hz, mx);
3790 UNI_MC(qpel, h, 32, 8, hz, mx);
3791 UNI_MC(qpel, h, 48, 8, hz, mx);
3792 UNI_MC(qpel, h, 64, 8, hz, mx);
3794 UNI_MC(qpel, v, 4, 8, vt, my);
3795 UNI_MC(qpel, v, 8, 8, vt, my);
3796 UNI_MC(qpel, v, 12, 8, vt, my);
3797 UNI_MC(qpel, v, 16, 8, vt, my);
3798 UNI_MC(qpel, v, 24, 8, vt, my);
3799 UNI_MC(qpel, v, 32, 8, vt, my);
3800 UNI_MC(qpel, v, 48, 8, vt, my);
3801 UNI_MC(qpel, v, 64, 8, vt, my);
3803 UNI_MC(epel, h, 4, 4, hz, mx);
3804 UNI_MC(epel, h, 6, 4, hz, mx);
3805 UNI_MC(epel, h, 8, 4, hz, mx);
3806 UNI_MC(epel, h, 12, 4, hz, mx);
3807 UNI_MC(epel, h, 16, 4, hz, mx);
3808 UNI_MC(epel, h, 24, 4, hz, mx);
3809 UNI_MC(epel, h, 32, 4, hz, mx);
3811 UNI_MC(epel, v, 4, 4, vt, my);
3812 UNI_MC(epel, v, 6, 4, vt, my);
3813 UNI_MC(epel, v, 8, 4, vt, my);
3814 UNI_MC(epel, v, 12, 4, vt, my);
3815 UNI_MC(epel, v, 16, 4, vt, my);
3816 UNI_MC(epel, v, 24, 4, vt, my);
3817 UNI_MC(epel, v, 32, 4, vt, my);
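/* A rough sketch of what the UNI_MC instantiations above generate (the
 * parameter list is abbreviated here, and the trailing call arguments are
 * assumed to be filter and height): UNI_MC(epel, h, 4, 4, hz, mx) expands
 * to roughly
 *
 *     void ff_hevc_put_hevc_uni_epel_h4_8_msa(uint8_t *dst, ...)
 *     {
 *         const int8_t *filter = ff_hevc_epel_filters[mx - 1];
 *
 *         common_hz_4t_4w_msa(src, src_stride, dst, dst_stride,
 *                             filter, height);
 *     }
 */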
3821 #define UNI_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
3822 void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
3833 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
3834 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
3836 hevc_##DIR1##_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
3837 dst_stride, filter_x, \
3838 filter_y, height); \
3841 UNI_MC_HV(qpel, hv, 4, 8, hv);
3842 UNI_MC_HV(qpel, hv, 8, 8, hv);
3843 UNI_MC_HV(qpel, hv, 12, 8, hv);
3844 UNI_MC_HV(qpel, hv, 16, 8, hv);
3845 UNI_MC_HV(qpel, hv, 24, 8, hv);
3846 UNI_MC_HV(qpel, hv, 32, 8, hv);
3847 UNI_MC_HV(qpel, hv, 48, 8, hv);
3848 UNI_MC_HV(qpel, hv, 64, 8, hv);
3850 UNI_MC_HV(epel, hv, 4, 4, hv);
3851 UNI_MC_HV(epel, hv, 6, 4, hv);
3852 UNI_MC_HV(epel, hv, 8, 4, hv);
3853 UNI_MC_HV(epel, hv, 12, 4, hv);
3854 UNI_MC_HV(epel, hv, 16, 4, hv);
3855 UNI_MC_HV(epel, hv, 24, 4, hv);
3856 UNI_MC_HV(epel, hv, 32, 4, hv);