2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Multiply four v4i32 vectors by the weight, round-shift-average by rnd,
 * add the offset and clamp each lane to [0, 255].
 * NOTE(review): this chunk is missing lines (embedded original line numbers
 * jump, e.g. the macro body's brace lines) — code kept byte-identical. */
25 #define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd, \
26 out0, out1, out2, out3) \
28 MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3); \
29 SRAR_W4_SW(out0, out1, out2, out3, rnd); \
30 ADD4(out0, offset, out1, offset, out2, offset, out3, offset, \
31 out0, out1, out2, out3); \
32 out0 = CLIP_SW_0_255(out0); \
33 out1 = CLIP_SW_0_255(out1); \
34 out2 = CLIP_SW_0_255(out2); \
35 out3 = CLIP_SW_0_255(out3); \
/* Weight two v8i16 inputs: interleave each input with itself into
 * right/left v4i32 halves, dot-product with the weight, round-shift,
 * add offset and clamp every lane to [0, 255].
 * NOTE(review): macro brace lines are elided in this chunk (embedded line
 * numbers jump) — code kept byte-identical. */
38 #define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd, \
39 out0_r, out1_r, out0_l, out1_l) \
41 ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r); \
42 ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l); \
43 DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt, \
44 out0_r, out1_r, out0_l, out1_l); \
45 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
46 ADD4(out0_r, offset, out1_r, offset, \
47 out0_l, offset, out1_l, offset, \
48 out0_r, out1_r, out0_l, out1_l); \
49 out0_r = CLIP_SW_0_255(out0_r); \
50 out1_r = CLIP_SW_0_255(out1_r); \
51 out0_l = CLIP_SW_0_255(out0_l); \
52 out1_l = CLIP_SW_0_255(out1_l); \
/* Four-input variant: applies HEVC_UNIW_RND_CLIP2 twice, to (in0, in1)
 * and (in2, in3), producing right/left weighted-and-clamped halves.
 * NOTE(review): macro brace lines are elided in this chunk (embedded line
 * numbers jump) — code kept byte-identical. */
55 #define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd, \
56 out0_r, out1_r, out2_r, out3_r, \
57 out0_l, out1_l, out2_l, out3_l) \
59 HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd, \
60 out0_r, out1_r, out0_l, out1_l); \
61 HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd, \
62 out2_r, out3_r, out2_l, out3_l); \
/* Saturating halfword variant: weight, round-shift, pack the 32-bit halves
 * back to v8i16, add the halfword offset with saturation, then clamp to
 * [0, 255] via CLIP_SH2_0_255_MAX_SATU.
 * NOTE(review): the output-parameter line and brace lines of this macro are
 * elided in this chunk (embedded line numbers jump, out0_h/out1_h appear
 * unbound here) — code kept byte-identical. */
65 #define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
68 v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m; \
70 ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m); \
71 ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m); \
72 DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w, \
73 wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m); \
74 SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w); \
75 PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \
76 ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h); \
77 CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h); \
/* Four-input saturating variant: two applications of
 * HEVC_UNIW_RND_CLIP2_MAX_SATU_H on (in0_h, in1_h) and (in2_h, in3_h).
 * NOTE(review): parameter-list continuation and brace lines are elided in
 * this chunk (embedded line numbers jump) — code kept byte-identical. */
80 #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \
81 offset_h, rnd_w, out0_h, out1_h, \
84 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
86 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w, \
/* Uni-directional weighted copy for 4-pixel-wide blocks.
 * Loads 4-byte rows, zero-extends to 16 bit, shifts left by 6 (HEVC
 * intermediate precision), applies weight/round/offset and stores clamped
 * 8-bit output. Special-cases height 2 and 4; multiples of 8 loop.
 * NOTE(review): parameter lines, the height==2 `if` header, and some closing
 * braces are elided in this chunk (embedded original line numbers jump) —
 * code kept byte-identical. */
90 static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
99 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
102 v16i8 src0 = { 0 }, src1 = { 0 };
103 v8i16 dst0, dst1, dst2, dst3, offset_vec;
104 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
106 weight = weight & 0x0000FFFF;
107 weight_vec = __msa_fill_w(weight);
108 offset_vec = __msa_fill_h(offset);
109 rnd_vec = __msa_fill_w(rnd_val);
112 v4i32 dst0_r, dst0_l;
114 LW2(src, src_stride, tp0, tp1);
115 INSERT_W2_SB(tp0, tp1, src0);
116 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
119 ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
120 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
121 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
122 dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
124 dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
125 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
126 ST4x2_UB(out0, dst, dst_stride);
127 } else if (4 == height) {
128 LW4(src, src_stride, tp0, tp1, tp2, tp3);
129 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
130 ILVRL_B2_SH(zero, src0, dst0, dst1);
131 SLLI_2V(dst0, dst1, 6);
132 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
133 rnd_vec, dst0, dst1);
134 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
135 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
136 } else if (0 == (height % 8)) {
137 for (loop_cnt = (height >> 3); loop_cnt--;) {
138 LW4(src, src_stride, tp0, tp1, tp2, tp3);
139 src += 4 * src_stride;
140 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
141 LW4(src, src_stride, tp0, tp1, tp2, tp3);
142 src += 4 * src_stride;
143 INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
144 ILVRL_B2_SH(zero, src0, dst0, dst1);
145 ILVRL_B2_SH(zero, src1, dst2, dst3);
146 SLLI_4V(dst0, dst1, dst2, dst3, 6);
147 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
148 offset_vec, rnd_vec, dst0, dst1,
150 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
151 ST4x8_UB(out0, out1, dst, dst_stride);
152 dst += 8 * dst_stride;
/* Uni-directional weighted copy for 6-pixel-wide blocks, 8 rows per
 * iteration: loads doubleword rows, zero-extends, scales by 64, applies
 * weight/round/offset, clamps and stores 6-byte rows via ST6x4_UB.
 * NOTE(review): parameter lines and closing braces are elided in this chunk
 * (embedded original line numbers jump) — code kept byte-identical. */
157 static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
167 uint64_t tp0, tp1, tp2, tp3;
169 v16u8 out0, out1, out2, out3;
170 v16i8 src0, src1, src2, src3;
171 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
172 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
174 weight = weight & 0x0000FFFF;
175 weight_vec = __msa_fill_w(weight);
176 offset_vec = __msa_fill_h(offset);
177 rnd_vec = __msa_fill_w(rnd_val);
179 for (loop_cnt = (height >> 3); loop_cnt--;) {
180 LD4(src, src_stride, tp0, tp1, tp2, tp3);
181 src += (4 * src_stride);
182 INSERT_D2_SB(tp0, tp1, src0);
183 INSERT_D2_SB(tp2, tp3, src1);
184 LD4(src, src_stride, tp0, tp1, tp2, tp3);
185 src += (4 * src_stride);
186 INSERT_D2_SB(tp0, tp1, src2);
187 INSERT_D2_SB(tp2, tp3, src3);
189 ILVRL_B2_SH(zero, src0, dst0, dst1);
190 ILVRL_B2_SH(zero, src1, dst2, dst3);
191 ILVRL_B2_SH(zero, src2, dst4, dst5);
192 ILVRL_B2_SH(zero, src3, dst6, dst7);
194 SLLI_4V(dst0, dst1, dst2, dst3, 6);
195 SLLI_4V(dst4, dst5, dst6, dst7, 6);
197 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
198 offset_vec, rnd_vec, dst0, dst1, dst2,
200 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
201 offset_vec, rnd_vec, dst4, dst5, dst6,
203 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
204 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
206 ST6x4_UB(out0, out1, dst, dst_stride);
207 dst += (4 * dst_stride);
208 ST6x4_UB(out2, out3, dst, dst_stride);
209 dst += (4 * dst_stride);
/* Uni-directional weighted copy for 8-pixel-wide blocks. Special-cases
 * heights 2, 4 and 6, and loops for multiples of 8. Each path loads
 * doubleword rows, zero-extends, shifts left by 6, weights/rounds/offsets
 * and stores clamped 8-bit rows.
 * NOTE(review): parameter lines, the height==2 `if` header, and closing
 * braces are elided in this chunk (embedded original line numbers jump) —
 * code kept byte-identical. */
213 static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
223 uint64_t tp0, tp1, tp2, tp3;
224 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
226 v16u8 out0, out1, out2, out3;
227 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
228 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
230 weight = weight & 0x0000FFFF;
231 weight_vec = __msa_fill_w(weight);
232 offset_vec = __msa_fill_h(offset);
233 rnd_vec = __msa_fill_w(rnd_val);
236 LD2(src, src_stride, tp0, tp1);
237 INSERT_D2_SB(tp0, tp1, src0);
238 ILVRL_B2_SH(zero, src0, dst0, dst1);
239 SLLI_2V(dst0, dst1, 6);
240 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
241 rnd_vec, dst0, dst1);
242 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
243 ST8x2_UB(out0, dst, dst_stride);
244 } else if (4 == height) {
245 LD4(src, src_stride, tp0, tp1, tp2, tp3);
246 INSERT_D2_SB(tp0, tp1, src0);
247 INSERT_D2_SB(tp2, tp3, src1);
248 ILVRL_B2_SH(zero, src0, dst0, dst1);
249 ILVRL_B2_SH(zero, src1, dst2, dst3);
250 SLLI_4V(dst0, dst1, dst2, dst3, 6);
251 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
252 offset_vec, rnd_vec, dst0, dst1, dst2,
254 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
255 ST8x4_UB(out0, out1, dst, dst_stride);
256 } else if (6 == height) {
257 LD4(src, src_stride, tp0, tp1, tp2, tp3);
258 src += 4 * src_stride;
259 INSERT_D2_SB(tp0, tp1, src0);
260 INSERT_D2_SB(tp2, tp3, src1);
261 LD2(src, src_stride, tp0, tp1);
262 INSERT_D2_SB(tp0, tp1, src2);
263 ILVRL_B2_SH(zero, src0, dst0, dst1);
264 ILVRL_B2_SH(zero, src1, dst2, dst3);
265 ILVRL_B2_SH(zero, src2, dst4, dst5);
266 SLLI_4V(dst0, dst1, dst2, dst3, 6);
267 SLLI_2V(dst4, dst5, 6);
268 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
269 offset_vec, rnd_vec, dst0, dst1, dst2,
271 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
272 rnd_vec, dst4, dst5);
273 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
274 ST8x4_UB(out0, out1, dst, dst_stride);
275 dst += (4 * dst_stride);
276 ST8x2_UB(out2, dst, dst_stride);
277 } else if (0 == height % 8) {
278 for (loop_cnt = (height >> 3); loop_cnt--;) {
279 LD4(src, src_stride, tp0, tp1, tp2, tp3);
280 src += 4 * src_stride;
281 INSERT_D2_SB(tp0, tp1, src0);
282 INSERT_D2_SB(tp2, tp3, src1);
283 LD4(src, src_stride, tp0, tp1, tp2, tp3);
284 src += 4 * src_stride;
285 INSERT_D2_SB(tp0, tp1, src2);
286 INSERT_D2_SB(tp2, tp3, src3);
288 ILVRL_B2_SH(zero, src0, dst0, dst1);
289 ILVRL_B2_SH(zero, src1, dst2, dst3);
290 ILVRL_B2_SH(zero, src2, dst4, dst5);
291 ILVRL_B2_SH(zero, src3, dst6, dst7);
292 SLLI_4V(dst0, dst1, dst2, dst3, 6);
293 SLLI_4V(dst4, dst5, dst6, dst7, 6);
294 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
295 offset_vec, rnd_vec, dst0, dst1,
297 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
298 offset_vec, rnd_vec, dst4, dst5,
300 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
301 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
302 ST8x4_UB(out0, out1, dst, dst_stride);
303 dst += (4 * dst_stride);
304 ST8x4_UB(out2, out3, dst, dst_stride);
305 dst += (4 * dst_stride);
/* Uni-directional weighted copy for 12-pixel-wide blocks, 4 rows per
 * iteration: the left 8 columns are processed as four full vectors, the
 * right 4 columns of row pairs are interleaved (ILVL_W2_SB) into two more
 * vectors; all are weighted/rounded/offset and stored via ST12x4_UB.
 * NOTE(review): parameter lines and closing braces are elided in this chunk
 * (embedded original line numbers jump); the fixed loop_cnt of 4 implies a
 * fixed height — TODO confirm against the elided signature. */
310 static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
320 v16u8 out0, out1, out2;
321 v16i8 src0, src1, src2, src3;
322 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
325 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
327 weight = weight & 0x0000FFFF;
328 weight_vec = __msa_fill_w(weight);
329 offset_vec = __msa_fill_h(offset);
330 rnd_vec = __msa_fill_w(rnd_val);
332 for (loop_cnt = 4; loop_cnt--;) {
333 LD_SB4(src, src_stride, src0, src1, src2, src3);
334 src += (4 * src_stride);
335 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
336 dst0, dst1, dst2, dst3);
338 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
339 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
340 SLLI_4V(dst0, dst1, dst2, dst3, 6);
341 SLLI_2V(dst4, dst5, 6);
342 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
343 offset_vec, rnd_vec, dst0, dst1, dst2,
345 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
346 rnd_vec, dst4, dst5);
348 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
349 ST12x4_UB(out0, out1, out2, dst, dst_stride);
350 dst += (4 * dst_stride);
/* Generic uni-weighted copy for widths that are multiples of 16 and heights
 * that are multiples of 4. Outer loop walks 16-column strips; inner loop
 * handles 4 rows, using the non-saturating HEVC_UNIW_RND_CLIP4 path with
 * a word (v4i32) offset vector, unlike the _MAX_SATU_H halfword variants.
 * NOTE(review): parameter lines, the src_tmp/dst_tmp setup and closing
 * braces are elided in this chunk (embedded original line numbers jump) —
 * code kept byte-identical. */
354 static void hevc_uniwgt_copy_16multx4mult_msa(uint8_t *src,
364 uint32_t loop_cnt, cnt;
367 v16i8 src0, src1, src2, src3;
368 v8i16 tmp0, tmp1, tmp2, tmp3;
369 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
371 v4i32 weight_vec, offset_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors (word offset here). */
373 weight = weight & 0x0000FFFF;
374 weight_vec = __msa_fill_w(weight);
375 offset_vec = __msa_fill_w(offset);
376 rnd_vec = __msa_fill_w(rnd_val);
378 for (cnt = width >> 4; cnt--;) {
382 for (loop_cnt = height >> 2; loop_cnt--;) {
383 LD_SB4(src_tmp, src_stride, src0, src1, src2, src3);
384 src_tmp += (4 * src_stride);
385 ILVR_B2_SH(zero, src0, zero, src1, tmp0, tmp1);
386 ILVL_B2_SH(zero, src0, zero, src1, tmp2, tmp3);
388 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
389 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
390 weight_vec, offset_vec, rnd_vec,
391 dst0_r, dst1_r, dst2_r, dst3_r,
392 dst0_l, dst1_l, dst2_l, dst3_l);
394 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
395 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
397 ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
398 dst_tmp += (2 * dst_stride);
400 ILVR_B2_SH(zero, src2, zero, src3, tmp0, tmp1);
401 ILVL_B2_SH(zero, src2, zero, src3, tmp2, tmp3);
403 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
404 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
405 weight_vec, offset_vec, rnd_vec,
406 dst0_r, dst1_r, dst2_r, dst3_r,
407 dst0_l, dst1_l, dst2_l, dst3_l);
409 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
410 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
412 ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
413 dst_tmp += (2 * dst_stride);
/* Uni-directional weighted copy for 16-pixel-wide blocks, 4 rows per
 * iteration: each 16-byte row is zero-extended into two v8i16 halves,
 * scaled by 64, weighted/rounded/offset, clamped and stored as full rows.
 * NOTE(review): parameter lines and closing braces are elided in this chunk
 * (embedded original line numbers jump) — code kept byte-identical. */
421 static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
431 v16u8 out0, out1, out2, out3;
432 v16i8 src0, src1, src2, src3;
434 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
435 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
437 weight = weight & 0x0000FFFF;
438 weight_vec = __msa_fill_w(weight);
439 offset_vec = __msa_fill_h(offset);
440 rnd_vec = __msa_fill_w(rnd_val);
442 for (loop_cnt = height >> 2; loop_cnt--;) {
443 LD_SB4(src, src_stride, src0, src1, src2, src3);
444 src += (4 * src_stride);
445 ILVRL_B2_SH(zero, src0, dst0, dst1);
446 ILVRL_B2_SH(zero, src1, dst2, dst3);
447 ILVRL_B2_SH(zero, src2, dst4, dst5);
448 ILVRL_B2_SH(zero, src3, dst6, dst7);
449 SLLI_4V(dst0, dst1, dst2, dst3, 6);
450 SLLI_4V(dst4, dst5, dst6, dst7, 6);
451 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
452 offset_vec, rnd_vec, dst0, dst1, dst2,
454 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
455 offset_vec, rnd_vec, dst4, dst5, dst6,
457 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
458 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
459 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
460 dst += (4 * dst_stride);
/* Uni-directional weighted copy for 24-pixel-wide blocks, 4 rows per
 * iteration: left 16 columns loaded as full vectors, right 8 columns
 * loaded at src + 16 and processed with right-interleaves only; stores
 * 16-byte rows plus ST8x4_UB for the trailing 8 columns.
 * NOTE(review): parameter lines and closing braces are elided in this chunk
 * (embedded original line numbers jump) — code kept byte-identical. */
464 static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
474 v16u8 out0, out1, out2, out3, out4, out5;
475 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
477 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
478 v8i16 dst8, dst9, dst10, dst11;
479 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
481 weight = weight & 0x0000FFFF;
482 weight_vec = __msa_fill_w(weight);
483 offset_vec = __msa_fill_h(offset);
484 rnd_vec = __msa_fill_w(rnd_val);
486 for (loop_cnt = (height >> 2); loop_cnt--;) {
487 LD_SB4(src, src_stride, src0, src1, src4, src5);
488 LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
489 src += (4 * src_stride);
491 ILVRL_B2_SH(zero, src0, dst0, dst1);
492 ILVRL_B2_SH(zero, src1, dst2, dst3);
493 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
494 ILVRL_B2_SH(zero, src4, dst6, dst7);
495 ILVRL_B2_SH(zero, src5, dst8, dst9);
496 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
497 SLLI_4V(dst0, dst1, dst2, dst3, 6);
498 SLLI_4V(dst4, dst5, dst6, dst7, 6);
499 SLLI_4V(dst8, dst9, dst10, dst11, 6);
500 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
501 offset_vec, rnd_vec, dst0, dst1, dst2,
503 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
504 offset_vec, rnd_vec, dst4, dst5, dst6,
506 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
507 offset_vec, rnd_vec, dst8, dst9, dst10,
509 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
510 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
511 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
512 ST8x4_UB(out2, out5, dst + 16, dst_stride);
513 dst += (4 * dst_stride);
/* Uni-directional weighted copy for 32-pixel-wide blocks, 2 rows per
 * iteration: loads two 16-byte vectors per row, zero-extends, scales by 64,
 * weights/rounds/offsets and stores two 16-byte vectors per row.
 * NOTE(review): parameter lines and closing braces are elided in this chunk
 * (embedded original line numbers jump) — code kept byte-identical. */
517 static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
527 v16u8 out0, out1, out2, out3;
528 v16i8 src0, src1, src2, src3;
530 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
531 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
533 weight = weight & 0x0000FFFF;
534 weight_vec = __msa_fill_w(weight);
535 offset_vec = __msa_fill_h(offset);
536 rnd_vec = __msa_fill_w(rnd_val);
538 for (loop_cnt = (height >> 1); loop_cnt--;) {
539 LD_SB2(src, src_stride, src0, src1);
540 LD_SB2(src + 16, src_stride, src2, src3);
541 src += (2 * src_stride);
543 ILVRL_B2_SH(zero, src0, dst0, dst1);
544 ILVRL_B2_SH(zero, src1, dst2, dst3);
545 ILVRL_B2_SH(zero, src2, dst4, dst5);
546 ILVRL_B2_SH(zero, src3, dst6, dst7);
547 SLLI_4V(dst0, dst1, dst2, dst3, 6);
548 SLLI_4V(dst4, dst5, dst6, dst7, 6);
549 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
550 offset_vec, rnd_vec, dst0, dst1, dst2,
552 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
553 offset_vec, rnd_vec, dst4, dst5, dst6,
555 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
556 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
557 ST_UB2(out0, out1, dst, dst_stride);
558 ST_UB2(out2, out3, dst + 16, dst_stride);
559 dst += (2 * dst_stride);
/* Uni-directional weighted copy for 48-pixel-wide blocks, 2 rows per
 * iteration: three 16-byte loads per row, zero-extend, scale by 64,
 * weight/round/offset, clamp and store 48 bytes per row.
 * NOTE(review): parameter lines, the src/dst stride advances between row
 * loads/stores, and closing braces are elided in this chunk (embedded
 * original line numbers jump) — code kept byte-identical. */
563 static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
573 v16u8 out0, out1, out2, out3, out4, out5;
574 v16i8 src0, src1, src2, src3, src4, src5;
576 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
577 v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
578 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
580 weight = weight & 0x0000FFFF;
581 weight_vec = __msa_fill_w(weight);
582 offset_vec = __msa_fill_h(offset);
583 rnd_vec = __msa_fill_w(rnd_val);
585 for (loop_cnt = (height >> 1); loop_cnt--;) {
586 LD_SB3(src, 16, src0, src1, src2);
588 LD_SB3(src, 16, src3, src4, src5);
591 ILVRL_B2_SH(zero, src0, dst0, dst1);
592 ILVRL_B2_SH(zero, src1, dst2, dst3);
593 ILVRL_B2_SH(zero, src2, dst4, dst5);
594 ILVRL_B2_SH(zero, src3, dst6, dst7);
595 ILVRL_B2_SH(zero, src4, dst8, dst9);
596 ILVRL_B2_SH(zero, src5, dst10, dst11);
597 SLLI_4V(dst0, dst1, dst2, dst3, 6);
598 SLLI_4V(dst4, dst5, dst6, dst7, 6);
599 SLLI_4V(dst8, dst9, dst10, dst11, 6);
600 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
601 offset_vec, rnd_vec, dst0, dst1, dst2,
603 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
604 offset_vec, rnd_vec, dst4, dst5, dst6,
606 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
607 offset_vec, rnd_vec, dst8, dst9, dst10,
609 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
610 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
611 ST_UB2(out0, out1, dst, 16);
612 ST_UB(out2, dst + 32);
614 ST_UB2(out3, out4, dst, 16);
615 ST_UB(out5, dst + 32);
/* Uni-directional weighted copy for 64-pixel-wide blocks, 2 rows per
 * iteration: four 16-byte loads per row, zero-extend to 16 v8i16 vectors,
 * scale by 64, weight/round/offset, clamp and store 64 bytes per row.
 * NOTE(review): parameter lines, the src/dst stride advances between row
 * loads/stores, and closing braces are elided in this chunk (embedded
 * original line numbers jump) — code kept byte-identical. */
620 static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
630 v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
631 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
633 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
634 v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
635 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
637 weight = weight & 0x0000FFFF;
638 weight_vec = __msa_fill_w(weight);
639 offset_vec = __msa_fill_h(offset);
640 rnd_vec = __msa_fill_w(rnd_val);
642 for (loop_cnt = (height >> 1); loop_cnt--;) {
643 LD_SB4(src, 16, src0, src1, src2, src3);
645 LD_SB4(src, 16, src4, src5, src6, src7);
648 ILVRL_B2_SH(zero, src0, dst0, dst1);
649 ILVRL_B2_SH(zero, src1, dst2, dst3);
650 ILVRL_B2_SH(zero, src2, dst4, dst5);
651 ILVRL_B2_SH(zero, src3, dst6, dst7);
652 ILVRL_B2_SH(zero, src4, dst8, dst9);
653 ILVRL_B2_SH(zero, src5, dst10, dst11);
654 ILVRL_B2_SH(zero, src6, dst12, dst13);
655 ILVRL_B2_SH(zero, src7, dst14, dst15);
656 SLLI_4V(dst0, dst1, dst2, dst3, 6);
657 SLLI_4V(dst4, dst5, dst6, dst7, 6);
658 SLLI_4V(dst8, dst9, dst10, dst11, 6);
659 SLLI_4V(dst12, dst13, dst14, dst15, 6);
660 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
661 offset_vec, rnd_vec, dst0, dst1, dst2,
663 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
664 offset_vec, rnd_vec, dst4, dst5, dst6,
666 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
667 offset_vec, rnd_vec, dst8, dst9, dst10,
669 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
670 offset_vec, rnd_vec, dst12, dst13, dst14,
672 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
673 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
674 PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
675 PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
676 ST_UB4(out0, out1, out2, out3, dst, 16);
678 ST_UB4(out4, out5, out6, out7, dst, 16);
/* Horizontal 8-tap uni-weighted filter for 4-pixel-wide blocks, 8 rows per
 * iteration: pairs of rows share one source vector (mask0 spans two rows),
 * bytes are XORed with 128 for signed dot-products, filtered with four
 * taps pairs via DPADD, then weighted/rounded/offset and clamped.
 * NOTE(review): parameter lines, mask1..mask3 setup, the dst init from
 * const_vec, and closing braces are elided in this chunk (embedded original
 * line numbers jump) — code kept byte-identical. */
683 static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
687 const int8_t *filter,
694 v8i16 filt0, filt1, filt2, filt3;
695 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
696 v16i8 mask1, mask2, mask3;
697 v8i16 filter_vec, const_vec;
698 v16i8 vec0, vec1, vec2, vec3;
699 v8i16 dst0, dst1, dst2, dst3;
700 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
701 v4i32 weight_vec, offset_vec, rnd_vec;
702 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
705 weight = weight & 0x0000FFFF;
706 const_vec = __msa_ldi_h(128);
709 weight_vec = __msa_fill_w(weight);
710 offset_vec = __msa_fill_w(offset);
711 rnd_vec = __msa_fill_w(rnd_val);
/* Load the 8 filter taps and splat them into four tap-pair vectors. */
713 filter_vec = LD_SH(filter);
714 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
720 for (loop_cnt = (height >> 3); loop_cnt--;) {
721 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
722 src += (8 * src_stride);
723 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
725 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
726 vec0, vec1, vec2, vec3);
729 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
730 dst0, dst0, dst0, dst0);
731 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
732 vec0, vec1, vec2, vec3);
734 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
735 dst1, dst1, dst1, dst1);
736 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
737 vec0, vec1, vec2, vec3);
739 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
740 dst2, dst2, dst2, dst2);
741 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
742 vec0, vec1, vec2, vec3);
744 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
745 dst3, dst3, dst3, dst3);
747 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
748 weight_vec, offset_vec, rnd_vec,
749 dst0_r, dst1_r, dst2_r, dst3_r,
750 dst0_l, dst1_l, dst2_l, dst3_l);
752 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
753 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
754 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
755 dst += (8 * dst_stride);
/* Horizontal 8-tap uni-weighted filter for 8-pixel-wide blocks, 4 rows per
 * iteration: each row filtered independently (mask0 covers one row),
 * filtered via DPADD against the four tap pairs, then weighted/rounded/
 * offset and clamped before ST8x4_UB.
 * NOTE(review): parameter lines, mask1..mask3 setup, the dst init from
 * const_vec, and closing braces are elided in this chunk (embedded original
 * line numbers jump) — code kept byte-identical. */
759 static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
763 const int8_t *filter,
770 v16i8 src0, src1, src2, src3;
771 v8i16 filt0, filt1, filt2, filt3;
772 v16i8 mask1, mask2, mask3;
773 v8i16 filter_vec, const_vec;
774 v16i8 vec0, vec1, vec2, vec3;
775 v8i16 dst0, dst1, dst2, dst3;
776 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
777 v4i32 weight_vec, offset_vec, rnd_vec;
778 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
781 weight = weight & 0x0000FFFF;
782 const_vec = __msa_ldi_h(128);
785 weight_vec = __msa_fill_w(weight);
786 offset_vec = __msa_fill_w(offset);
787 rnd_vec = __msa_fill_w(rnd_val);
/* Load the 8 filter taps and splat them into four tap-pair vectors. */
789 filter_vec = LD_SH(filter);
790 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
796 for (loop_cnt = (height >> 2); loop_cnt--;) {
797 LD_SB4(src, src_stride, src0, src1, src2, src3);
798 src += (4 * src_stride);
799 XORI_B4_128_SB(src0, src1, src2, src3);
801 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
802 vec0, vec1, vec2, vec3);
804 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
805 dst0, dst0, dst0, dst0);
806 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
807 vec0, vec1, vec2, vec3);
809 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
810 dst1, dst1, dst1, dst1);
811 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
812 vec0, vec1, vec2, vec3);
814 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
815 dst2, dst2, dst2, dst2);
816 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
817 vec0, vec1, vec2, vec3);
819 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
820 dst3, dst3, dst3, dst3);
822 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
823 weight_vec, offset_vec, rnd_vec,
824 dst0_r, dst1_r, dst2_r, dst3_r,
825 dst0_l, dst1_l, dst2_l, dst3_l);
827 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
828 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
829 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
830 dst += (4 * dst_stride);
/* Horizontal 8-tap uni-weighted filter for 12-pixel-wide blocks:
 * delegates to the 8-wide kernel for columns 0..7 and the 4-wide kernel
 * for columns 8..11, with identical filter/weight parameters.
 * NOTE(review): parameter lines and the closing brace are elided in this
 * chunk (embedded original line numbers jump) — code kept byte-identical. */
834 static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
838 const int8_t *filter,
844 hevc_hz_uniwgt_8t_8w_msa(src, src_stride, dst, dst_stride,
845 filter, height, weight, offset, rnd_val);
846 hevc_hz_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
847 filter, height, weight, offset, rnd_val);
/* Horizontal 8-tap uni-weighted filter for 16-pixel-wide blocks, 2 rows
 * per iteration: each row is split into two overlapping 8-wide filter
 * windows (loads at src and src + 8), filtered via DPADD, then weighted/
 * rounded/offset, clamped and stored as full 16-byte rows.
 * NOTE(review): parameter lines, mask1..mask3 setup, the dst init from
 * const_vec, and closing braces are elided in this chunk (embedded original
 * line numbers jump) — code kept byte-identical. */
850 static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
854 const int8_t *filter,
861 v16i8 src0, src1, src2, src3;
862 v8i16 filt0, filt1, filt2, filt3;
863 v16i8 mask1, mask2, mask3;
864 v8i16 filter_vec, const_vec;
865 v16i8 vec0, vec1, vec2, vec3;
866 v8i16 dst0, dst1, dst2, dst3;
867 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
868 v4i32 weight_vec, offset_vec, rnd_vec;
869 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
872 const_vec = __msa_ldi_h(128);
875 weight = weight & 0x0000FFFF;
876 weight_vec = __msa_fill_w(weight);
877 offset_vec = __msa_fill_w(offset);
878 rnd_vec = __msa_fill_w(rnd_val);
/* Load the 8 filter taps and splat them into four tap-pair vectors. */
880 filter_vec = LD_SH(filter);
881 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
887 for (loop_cnt = (height >> 1); loop_cnt--;) {
888 LD_SB2(src, src_stride, src0, src2);
889 LD_SB2(src + 8, src_stride, src1, src3);
890 src += (2 * src_stride);
891 XORI_B4_128_SB(src0, src1, src2, src3);
893 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
894 vec0, vec1, vec2, vec3);
896 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
897 dst0, dst0, dst0, dst0);
898 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
899 vec0, vec1, vec2, vec3);
901 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
902 dst1, dst1, dst1, dst1);
903 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
904 vec0, vec1, vec2, vec3);
906 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
907 dst2, dst2, dst2, dst2);
908 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
909 vec0, vec1, vec2, vec3);
911 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
912 dst3, dst3, dst3, dst3);
914 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
915 weight_vec, offset_vec, rnd_vec,
916 dst0_r, dst1_r, dst2_r, dst3_r,
917 dst0_l, dst1_l, dst2_l, dst3_l);
919 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
920 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
921 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
922 dst += (2 * dst_stride);
/* Horizontal 8-tap uni-weighted filter for 24-pixel-wide blocks, 2 rows
 * per iteration: six 8-wide filter windows per row pair, the cross-vector
 * windows using mask4..mask7 (which select across the src0/src1 boundary);
 * results are weighted/rounded/offset, clamped, and stored as 16 + 8 bytes.
 * NOTE(review): parameter lines, mask1..mask7 setup, dst inits, the src
 * stride advance between the two row loads, and closing braces are elided
 * in this chunk (embedded original line numbers jump) — code byte-identical. */
926 static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
930 const int8_t *filter,
937 v16i8 src0, src1, src2, src3;
938 v8i16 filt0, filt1, filt2, filt3;
939 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
940 v16i8 vec0, vec1, vec2, vec3;
941 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
942 v8i16 filter_vec, const_vec;
943 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
944 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
945 v4i32 weight_vec, offset_vec, rnd_vec;
946 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
949 const_vec = __msa_ldi_h(128);
952 weight = weight & 0x0000FFFF;
953 weight_vec = __msa_fill_w(weight);
954 offset_vec = __msa_fill_w(offset);
955 rnd_vec = __msa_fill_w(rnd_val);
/* Load the 8 filter taps and splat them into four tap-pair vectors. */
957 filter_vec = LD_SH(filter);
958 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
968 for (loop_cnt = (height >> 1); loop_cnt--;) {
969 LD_SB2(src, 16, src0, src1);
971 LD_SB2(src, 16, src2, src3);
973 XORI_B4_128_SB(src0, src1, src2, src3);
974 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
975 vec0, vec1, vec2, vec3);
978 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
979 dst0, dst0, dst0, dst0);
980 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
981 vec0, vec1, vec2, vec3);
983 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
984 dst1, dst1, dst1, dst1);
985 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
986 vec0, vec1, vec2, vec3);
988 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
989 dst2, dst2, dst2, dst2);
990 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
991 vec0, vec1, vec2, vec3);
993 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
994 dst3, dst3, dst3, dst3);
995 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
996 vec0, vec1, vec2, vec3);
998 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
999 dst4, dst4, dst4, dst4);
1000 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1001 vec0, vec1, vec2, vec3);
1003 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1004 dst5, dst5, dst5, dst5);
1006 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1007 weight_vec, offset_vec, rnd_vec,
1008 dst0_r, dst1_r, dst2_r, dst3_r,
1009 dst0_l, dst1_l, dst2_l, dst3_l);
1010 HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
1011 dst4_r, dst5_r, dst4_l, dst5_l);
1013 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1014 dst3_l, dst3_r, dst4_l, dst4_r, dst0_r, dst1_r);
1015 HEVC_PCK_SW_SB4(dst2_l, dst2_r, dst5_l, dst5_r, dst2_r);
1016 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
1017 ST8x2_UB(dst2_r, dst + 16, dst_stride);
1018 dst += (2 * dst_stride);
/* Horizontal 8-tap uni-weighted filter for 32-pixel-wide blocks, 1 row
 * per iteration: four 8-wide filter windows per row (src2 loaded at
 * src + 24 covers the tail window), with mask4..mask7 selecting across the
 * src0/src1 boundary; weighted/rounded/offset, clamped, stored as 32 bytes.
 * NOTE(review): parameter lines, mask1..mask7 setup, dst inits, stride
 * advances, and closing braces are elided in this chunk (embedded original
 * line numbers jump) — code kept byte-identical. */
1022 static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
1026 const int8_t *filter,
1033 v16i8 src0, src1, src2;
1034 v8i16 filt0, filt1, filt2, filt3;
1035 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1036 v16i8 vec0, vec1, vec2, vec3;
1037 v8i16 dst0, dst1, dst2, dst3;
1038 v8i16 filter_vec, const_vec;
1039 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1040 v4i32 weight_vec, offset_vec, rnd_vec;
1041 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1044 const_vec = __msa_ldi_h(128);
1047 weight = weight & 0x0000FFFF;
1048 weight_vec = __msa_fill_w(weight);
1049 offset_vec = __msa_fill_w(offset);
1050 rnd_vec = __msa_fill_w(rnd_val);
/* Load the 8 filter taps and splat them into four tap-pair vectors. */
1052 filter_vec = LD_SH(filter);
1053 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1063 for (loop_cnt = height; loop_cnt--;) {
1064 LD_SB2(src, 16, src0, src1);
1065 src2 = LD_SB(src + 24);
1067 XORI_B3_128_SB(src0, src1, src2);
1069 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1070 vec0, vec1, vec2, vec3);
1072 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1073 dst0, dst0, dst0, dst0);
1074 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1075 vec0, vec1, vec2, vec3);
1077 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1078 dst1, dst1, dst1, dst1);
1079 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1080 vec0, vec1, vec2, vec3);
1082 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1083 dst2, dst2, dst2, dst2);
1084 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1085 vec0, vec1, vec2, vec3);
1087 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1088 dst3, dst3, dst3, dst3);
1090 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1091 weight_vec, offset_vec, rnd_vec,
1092 dst0_r, dst1_r, dst2_r, dst3_r,
1093 dst0_l, dst1_l, dst2_l, dst3_l);
1095 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1096 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
1097 ST_SW2(dst0_r, dst1_r, dst, 16);
/* Horizontal 8-tap uni-weighted filter for 48-pixel-wide blocks, 1 row per
 * iteration: six 8-wide filter windows per row (src3 loaded at src + 40
 * covers the tail), using mask4..mask7 for the cross-vector windows;
 * weighted/rounded/offset, clamped, stored as 32 + 16 bytes.
 * NOTE(review): parameter lines, mask1..mask7 setup, dst inits, stride
 * advances, and closing braces are elided in this chunk (embedded original
 * line numbers jump) — code kept byte-identical. */
1102 static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
1106 const int8_t *filter,
1113 v16i8 src0, src1, src2, src3;
1114 v8i16 filt0, filt1, filt2, filt3;
1115 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1116 v16i8 vec0, vec1, vec2, vec3;
1117 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1118 v8i16 filter_vec, const_vec;
1119 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
1120 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
1121 v4i32 weight_vec, offset_vec, rnd_vec;
1122 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1125 const_vec = __msa_ldi_h(128);
1128 weight = weight & 0x0000FFFF;
1129 weight_vec = __msa_fill_w(weight);
1130 offset_vec = __msa_fill_w(offset);
1131 rnd_vec = __msa_fill_w(rnd_val);
/* Load the 8 filter taps and splat them into four tap-pair vectors. */
1133 filter_vec = LD_SH(filter);
1134 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1144 for (loop_cnt = height; loop_cnt--;) {
1145 LD_SB3(src, 16, src0, src1, src2);
1146 src3 = LD_SB(src + 40);
1148 XORI_B4_128_SB(src0, src1, src2, src3);
1150 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1151 vec0, vec1, vec2, vec3);
1153 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1154 dst0, dst0, dst0, dst0);
1155 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1156 vec0, vec1, vec2, vec3);
1158 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1159 dst1, dst1, dst1, dst1);
1160 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1161 vec0, vec1, vec2, vec3);
1163 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1164 dst2, dst2, dst2, dst2);
1165 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
1166 vec0, vec1, vec2, vec3);
1168 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1169 dst3, dst3, dst3, dst3);
1170 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1171 vec0, vec1, vec2, vec3);
1173 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1174 dst4, dst4, dst4, dst4);
1175 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1176 vec0, vec1, vec2, vec3);
1178 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1179 dst5, dst5, dst5, dst5);
1181 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1182 weight_vec, offset_vec, rnd_vec,
1183 dst0_r, dst1_r, dst2_r, dst3_r,
1184 dst0_l, dst1_l, dst2_l, dst3_l);
1186 HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
1187 dst4_r, dst5_r, dst4_l, dst5_l);
1189 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
1190 dst2_l, dst2_r, dst3_l, dst3_r,
1191 dst4_l, dst4_r, dst5_l, dst5_r,
1192 dst0_r, dst1_r, dst2_r);
1193 ST_SW2(dst0_r, dst1_r, dst, 16);
1194 ST_SW(dst2_r, dst + 32);
/* Horizontal 8-tap uni-weighted HEVC MC for 64-pixel-wide blocks.
 * Each row is processed as two 32-pixel halves (inner cnt loop runs twice),
 * each half filtered, weighted, rounded, offset, clipped and stored.
 * (Pointer stepping of src_tmp/dst_tmp between halves is not visible here
 *  — presumably advanced by 32; confirm against the full source.) */
1199 static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
1203 const int8_t *filter,
1211 uint32_t loop_cnt, cnt;
1212 v16i8 src0, src1, src2;
1213 v8i16 filt0, filt1, filt2, filt3;
1214 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1215 v16i8 vec0, vec1, vec2, vec3;
1216 v8i16 dst0, dst1, dst2, dst3;
1217 v8i16 filter_vec, const_vec;
1218 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1219 v4i32 weight_vec, offset_vec, rnd_vec;
1220 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1223 const_vec = __msa_ldi_h(128);
1226 weight = weight & 0x0000FFFF;
1227 weight_vec = __msa_fill_w(weight);
1228 offset_vec = __msa_fill_w(offset);
1229 rnd_vec = __msa_fill_w(rnd_val);
/* split the 8-tap filter into four halfword tap pairs */
1231 filter_vec = LD_SH(filter);
1232 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1242 for (loop_cnt = height; loop_cnt--;) {
/* two passes of 32 pixels per 64-wide row */
1246 for (cnt = 2; cnt--;) {
1247 LD_SB2(src_tmp, 16, src0, src1);
1248 src2 = LD_SB(src_tmp + 24);
/* signed-range conversion for signed byte dot products */
1250 XORI_B3_128_SB(src0, src1, src2);
1252 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1253 vec0, vec1, vec2, vec3);
1255 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1256 dst0, dst0, dst0, dst0);
1257 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1258 vec0, vec1, vec2, vec3);
1260 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1261 dst1, dst1, dst1, dst1);
1262 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1263 vec0, vec1, vec2, vec3);
1265 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1266 dst2, dst2, dst2, dst2);
1267 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1268 vec0, vec1, vec2, vec3);
1270 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1271 dst3, dst3, dst3, dst3);
/* weighted-prediction round/offset/clip on the four result vectors */
1273 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1274 weight_vec, offset_vec, rnd_vec,
1275 dst0_r, dst1_r, dst2_r, dst3_r,
1276 dst0_l, dst1_l, dst2_l, dst3_l);
1278 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1279 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
1280 ST_SW2(dst0_r, dst1_r, dst_tmp, 16);
/* Vertical 8-tap uni-weighted HEVC MC for 4-pixel-wide blocks.
 * Processes 8 output rows per iteration.  Rows are interleaved pairwise
 * (ILVR_B) and pairs of 4-wide rows are packed into one vector (ILVR_D)
 * so each dot product covers two output rows at once. */
1289 static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
1293 const int8_t *filter,
1300 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1301 v16i8 src9, src10, src11, src12, src13, src14;
1302 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1303 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1304 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1305 v16i8 src2110, src4332, src6554, src8776, src10998;
1306 v16i8 src12111110, src14131312;
1307 v8i16 dst10, dst32, dst54, dst76;
1308 v8i16 filt0, filt1, filt2, filt3;
1309 v8i16 filter_vec, const_vec;
1310 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1311 v4i32 weight_vec, offset_vec, rnd_vec;
/* back up 3 rows: the 8-tap filter needs 3 rows of history above */
1313 src -= (3 * src_stride);
1314 const_vec = __msa_ldi_h(128);
1317 weight = weight & 0x0000FFFF;
1318 weight_vec = __msa_fill_w(weight);
1319 offset_vec = __msa_fill_w(offset);
1320 rnd_vec = __msa_fill_w(rnd_val);
1322 filter_vec = LD_SH(filter);
1323 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: load the 7 history rows and pre-interleave them */
1325 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1326 src += (7 * src_stride);
1328 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1329 src10_r, src32_r, src54_r, src21_r);
1331 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1333 ILVR_D3_SB(src21_r, src10_r, src43_r,
1334 src32_r, src65_r, src54_r, src2110, src4332, src6554);
1336 XORI_B3_128_SB(src2110, src4332, src6554);
1338 for (loop_cnt = (height >> 3); loop_cnt--;) {
1339 LD_SB8(src, src_stride,
1340 src7, src8, src9, src10, src11, src12, src13, src14);
1341 src += (8 * src_stride);
1342 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1343 src76_r, src87_r, src98_r, src109_r);
1344 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1345 src1110_r, src1211_r, src1312_r, src1413_r);
1346 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1347 src1413_r, src1312_r,
1348 src8776, src10998, src12111110, src14131312);
1349 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
/* four 8-tap column dot products -> rows 0..7 (two rows per result) */
1352 DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
1353 filt2, filt3, dst10, dst10, dst10, dst10);
1355 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1356 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1358 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1359 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1361 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1362 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
/* weight, round, offset, clip, pack, and store 8 rows of 4 bytes */
1364 HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1365 weight_vec, offset_vec, rnd_vec,
1366 dst0_r, dst1_r, dst2_r, dst3_r,
1367 dst0_l, dst1_l, dst2_l, dst3_l);
1369 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1370 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
1371 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
1372 dst += (8 * dst_stride);
/* rotate filter history for the next 8 rows */
1375 src4332 = src12111110;
1376 src6554 = src14131312;
/* Vertical 8-tap uni-weighted HEVC MC for 8-pixel-wide blocks.
 * Processes 4 output rows per iteration using right-interleaved row
 * pairs; each DPADD accumulates the 8 filter taps for one output row. */
1381 static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
1385 const int8_t *filter,
1392 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1393 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1394 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1395 v8i16 tmp0, tmp1, tmp2, tmp3;
1396 v8i16 filt0, filt1, filt2, filt3;
1397 v8i16 filter_vec, const_vec;
1398 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1399 v4i32 weight_vec, offset_vec, rnd_vec;
/* 3 rows of history above the first output row */
1401 src -= (3 * src_stride);
1402 const_vec = __msa_ldi_h(128);
1405 weight = weight & 0x0000FFFF;
1406 weight_vec = __msa_fill_w(weight);
1407 offset_vec = __msa_fill_w(offset);
1408 rnd_vec = __msa_fill_w(rnd_val);
1410 filter_vec = LD_SH(filter);
1411 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 history rows, sign-converted and pairwise interleaved */
1413 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1414 src += (7 * src_stride);
1415 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1417 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1418 src10_r, src32_r, src54_r, src21_r);
1419 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1421 for (loop_cnt = (height >> 2); loop_cnt--;) {
1422 LD_SB4(src, src_stride, src7, src8, src9, src10);
1423 src += (4 * src_stride);
1424 XORI_B4_128_SB(src7, src8, src9, src10);
1425 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1426 src76_r, src87_r, src98_r, src109_r);
/* one 8-tap column filter per output row (rows 0..3) */
1429 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1430 filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
1432 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1433 filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
1435 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1436 filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
1438 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1439 filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
/* weighted-prediction round/offset/clip then pack and store 8x4 */
1441 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1442 weight_vec, offset_vec, rnd_vec,
1443 dst0_r, dst1_r, dst2_r, dst3_r,
1444 dst0_l, dst1_l, dst2_l, dst3_l);
1446 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1447 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
1448 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
1449 dst += (4 * dst_stride);
/* Vertical 8-tap uni-weighted HEVC MC for 12-pixel-wide blocks.
 * The left 8 columns use the right-interleaved (_r) row pairs; the
 * remaining 4 columns use the left-interleaved (_l) halves packed two
 * rows per vector (src2110 etc.).  4 output rows per iteration. */
1461 static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
1465 const int8_t *filter,
1472 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1473 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1474 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1475 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
1476 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1477 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1478 v16i8 src2110, src4332, src6554, src8776, src10998;
1479 v8i16 filt0, filt1, filt2, filt3;
1480 v8i16 filter_vec, const_vec;
1481 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
1482 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
1483 v4i32 weight_vec, offset_vec, rnd_vec;
/* 3 rows of history above the first output row */
1485 src -= (3 * src_stride);
1486 const_vec = __msa_ldi_h(128);
1489 weight = weight & 0x0000FFFF;
1490 weight_vec = __msa_fill_w(weight);
1491 offset_vec = __msa_fill_w(offset);
1492 rnd_vec = __msa_fill_w(rnd_val);
1494 filter_vec = LD_SH(filter);
1495 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: interleave the 7 history rows in both halves */
1497 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1498 src += (7 * src_stride);
1499 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1501 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1502 src10_r, src32_r, src54_r, src21_r);
1503 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1504 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1505 src10_l, src32_l, src54_l, src21_l);
1506 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1507 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1508 src2110, src4332, src6554);
1510 for (loop_cnt = (height >> 2); loop_cnt--;) {
1511 LD_SB4(src, src_stride, src7, src8, src9, src10);
1512 src += (4 * src_stride);
1513 XORI_B4_128_SB(src7, src8, src9, src10);
1515 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1516 src76_r, src87_r, src98_r, src109_r);
1517 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1518 src76_l, src87_l, src98_l, src109_l);
1519 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* left 8 columns: one 8-tap column filter per output row */
1522 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1523 filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
1525 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1526 filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
1528 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1529 filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
1531 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1532 filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
/* right 4 columns: two rows per vector */
1534 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1535 filt0, filt1, filt2, filt3, tmp4, tmp4, tmp4, tmp4);
1537 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1538 filt0, filt1, filt2, filt3, tmp5, tmp5, tmp5, tmp5);
/* weight/round/offset/clip all six vectors, pack to 12x4 and store */
1540 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1541 weight_vec, offset_vec, rnd_vec,
1542 dst0_r, dst1_r, dst2_r, dst3_r,
1543 dst0_l, dst1_l, dst2_l, dst3_l);
1544 HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
1545 dst4_r, dst5_r, dst4_l, dst5_l);
1547 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
1548 dst2_l, dst2_r, dst3_l, dst3_r,
1549 dst4_l, dst4_r, dst5_l, dst5_r,
1550 dst0_r, dst1_r, dst2_r);
1551 ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
1552 dst += (4 * dst_stride);
/* Vertical 8-tap uni-weighted HEVC MC for widths that are multiples of 16.
 * Outer loop walks 16-column stripes (width >> 4); the inner loop emits
 * 2 output rows per iteration, filtering the right and left byte halves
 * of each 16-wide stripe separately.  Used by the 16/24/32/48/64w
 * wrappers below. */
1567 static void hevc_vt_uniwgt_8t_16multx2mult_msa(uint8_t *src,
1571 const int8_t *filter,
1580 int32_t loop_cnt, cnt;
1581 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1582 v16i8 src10_r, src32_r, src54_r, src76_r;
1583 v16i8 src21_r, src43_r, src65_r, src87_r;
1584 v8i16 tmp0, tmp1, tmp2, tmp3;
1585 v16i8 src10_l, src32_l, src54_l, src76_l;
1586 v16i8 src21_l, src43_l, src65_l, src87_l;
1587 v8i16 filt0, filt1, filt2, filt3;
1588 v8i16 filter_vec, const_vec;
1589 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1590 v4i32 weight_vec, offset_vec, rnd_vec;
/* 3 rows of history above the first output row */
1592 src -= (3 * src_stride);
1593 const_vec = __msa_ldi_h(128);
1596 weight = weight & 0x0000FFFF;
1597 weight_vec = __msa_fill_w(weight);
1598 offset_vec = __msa_fill_w(offset);
1599 rnd_vec = __msa_fill_w(rnd_val);
1601 filter_vec = LD_SH(filter);
1602 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* one pass per 16-column stripe */
1604 for (cnt = (width >> 4); cnt--;) {
1608 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1609 src_tmp += (7 * src_stride);
1610 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1611 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1612 src10_r, src32_r, src54_r, src21_r);
1613 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1614 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1615 src10_l, src32_l, src54_l, src21_l);
1616 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1618 for (loop_cnt = (height >> 1); loop_cnt--;) {
1619 LD_SB2(src_tmp, src_stride, src7, src8);
1620 src_tmp += (2 * src_stride);
1621 XORI_B2_128_SB(src7, src8);
1622 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1623 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* 8-tap column filters: rows 0/1, right then left byte halves */
1626 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1627 filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
1629 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1630 filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
1632 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1633 filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
1635 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1636 filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
/* weight/round/offset/clip, then pack per row (note tmp0+tmp2 form
 * row 0 and tmp1+tmp3 row 1 — hence the 0,2,1,3 pack order) */
1638 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1639 weight_vec, offset_vec, rnd_vec,
1640 dst0_r, dst1_r, dst2_r, dst3_r,
1641 dst0_l, dst1_l, dst2_l, dst3_l);
1643 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
1644 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
1645 ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
1646 dst_tmp += (2 * dst_stride);
/* 16-wide vertical uni-weighted MC: one 16-column stripe of the generic
 * 16-multiple kernel. */
1668 static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
1672 const int8_t *filter,
1678 hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
1679 filter, height, weight,
1680 offset, rnd_val, 16);
/* 24-wide vertical uni-weighted MC: left 16 columns via the generic
 * 16-multiple kernel, remaining 8 columns via the 8-wide kernel. */
1683 static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
1687 const int8_t *filter,
1693 hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
1694 filter, height, weight,
1695 offset, rnd_val, 16);
1697 hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
1698 filter, height, weight, offset, rnd_val);
/* 32-wide vertical uni-weighted MC: generic 16-multiple kernel, width 32. */
1701 static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
1705 const int8_t *filter,
1711 hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
1712 filter, height, weight,
1713 offset, rnd_val, 32);
/* 48-wide vertical uni-weighted MC: generic 16-multiple kernel, width 48. */
1716 static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
1720 const int8_t *filter,
1726 hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
1727 filter, height, weight,
1728 offset, rnd_val, 48);
/* 64-wide vertical uni-weighted MC: generic 16-multiple kernel, width 64. */
1731 static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
1735 const int8_t *filter,
1741 hevc_vt_uniwgt_8t_16multx2mult_msa(src, src_stride, dst, dst_stride,
1742 filter, height, weight,
1743 offset, rnd_val, 64);
/* 2-D (horizontal + vertical) 8-tap uni-weighted HEVC MC, 4-pixel width.
 * Pass 1 filters horizontally with filter_x into 16-bit intermediates
 * (two rows per vector, e.g. dst30 holds rows 3 and 0); pass 2 filters
 * those vertically with filter_y via HEVC_FILT_8TAP.  Two output rows
 * per loop iteration, then weight/round/offset/clip/pack/store. */
1746 static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
1750 const int8_t *filter_x,
1751 const int8_t *filter_y,
1758 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1759 v8i16 filt0, filt1, filt2, filt3;
1760 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1761 v16i8 mask1, mask2, mask3;
1762 v8i16 filter_vec, const_vec;
1763 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1764 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1765 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1766 v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
1767 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1768 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
/* mask0 pairs bytes from two rows at once (lanes 16..20 index src B) */
1769 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1770 v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* back up 3 rows and 3 columns for the 8x8-tap support region */
1772 src -= ((3 * src_stride) + 3);
1773 filter_vec = LD_SH(filter_x);
1774 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
1776 filter_vec = LD_SH(filter_y);
1777 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1778 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1780 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1786 const_vec = __msa_ldi_h(128);
1789 weight_vec = __msa_fill_w(weight);
1790 offset_vec = __msa_fill_w(offset);
1791 rnd_vec = __msa_fill_w(rnd_val);
1793 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1794 src += (7 * src_stride);
1795 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1797 /* horizontal pass over the 7 history rows, two rows per shuffle */
1798 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1799 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1800 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1801 vec8, vec9, vec10, vec11);
1802 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1803 vec12, vec13, vec14, vec15);
1805 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1806 dst30, dst30, dst30, dst30);
1808 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1809 dst41, dst41, dst41, dst41);
1811 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1812 dst52, dst52, dst52, dst52);
1814 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1815 dst63, dst63, dst63, dst63);
/* interleave intermediates into vertical-filter operand pairs */
1817 ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1818 dst10_r, dst21_r, dst32_r);
1820 dst43_r = __msa_ilvl_h(dst41, dst30);
1821 dst54_r = __msa_ilvl_h(dst52, dst41);
1822 dst65_r = __msa_ilvl_h(dst63, dst52);
/* dst66 = row 6 intermediate (upper half of dst63 duplicated) */
1824 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1826 for (loop_cnt = height >> 1; loop_cnt--;) {
1827 LD_SB2(src, src_stride, src7, src8);
1828 src += (2 * src_stride);
1829 XORI_B2_128_SB(src7, src8);
/* horizontal pass for the two new rows (7 and 8 together) */
1831 VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1832 vec0, vec1, vec2, vec3);
1834 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1835 dst87, dst87, dst87, dst87);
1836 dst76_r = __msa_ilvr_h(dst87, dst66);
/* vertical 8-tap on the 16-bit intermediates -> 32-bit results */
1837 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1838 filt_h0, filt_h1, filt_h2, filt_h3);
1839 dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
1840 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1841 filt_h0, filt_h1, filt_h2, filt_h3);
/* weight, round by rnd_val, add offset, clip to [0,255] */
1845 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
1846 SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
1847 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
1848 dst0_r = CLIP_SW_0_255(dst0_r);
1849 dst1_r = CLIP_SW_0_255(dst1_r);
1851 HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
1852 ST4x2_UB(dst0_r, dst, dst_stride);
1853 dst += (2 * dst_stride);
/* carry row 8's intermediate forward as the new "row 6" history */
1861 dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
/* 2-D (horizontal + vertical) 8-tap uni-weighted HEVC MC for widths that
 * are multiples of 8.  Outer loop walks 8-column stripes; per stripe the
 * 7 history rows are horizontally filtered into 16-bit intermediates
 * (dst0..dst6), then the inner loop produces 2 output rows per iteration
 * by vertically filtering the interleaved intermediates.  Used by the
 * 8/12/16/24/32/48/64w wrappers below. */
1865 static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
1869 const int8_t *filter_x,
1870 const int8_t *filter_y,
1877 uint32_t loop_cnt, cnt;
1880 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1881 v8i16 filt0, filt1, filt2, filt3;
1882 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1883 v16i8 mask1, mask2, mask3;
1884 v8i16 filter_vec, const_vec;
1885 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1886 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1887 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1888 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1889 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1890 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1891 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1892 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1893 v4i32 weight_vec, offset_vec, rnd_vec;
1894 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
/* back up 3 rows and 3 columns for the 8x8-tap support region */
1896 src -= ((3 * src_stride) + 3);
1897 const_vec = __msa_ldi_h(128);
1900 weight_vec = __msa_fill_w(weight);
1901 offset_vec = __msa_fill_w(offset);
1902 rnd_vec = __msa_fill_w(rnd_val);
1904 filter_vec = LD_SH(filter_x);
1905 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
1907 filter_vec = LD_SH(filter_y);
1908 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1909 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1910 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* one pass per 8-column stripe */
1916 for (cnt = width >> 3; cnt--;) {
1920 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1921 src_tmp += (7 * src_stride);
1922 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* horizontal pass over history rows 0..3 */
1924 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1925 vec0, vec1, vec2, vec3);
1926 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1927 vec4, vec5, vec6, vec7);
1928 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1929 vec8, vec9, vec10, vec11);
1930 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1931 vec12, vec13, vec14, vec15);
1933 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1934 dst0, dst0, dst0, dst0);
1936 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1937 dst1, dst1, dst1, dst1);
1939 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1940 dst2, dst2, dst2, dst2);
1942 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1943 dst3, dst3, dst3, dst3);
/* horizontal pass over history rows 4..6 */
1945 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1946 vec0, vec1, vec2, vec3);
1947 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1948 vec4, vec5, vec6, vec7);
1949 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1950 vec8, vec9, vec10, vec11);
1952 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1953 dst4, dst4, dst4, dst4);
1955 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1956 dst5, dst5, dst5, dst5);
1958 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1959 dst6, dst6, dst6, dst6);
/* interleave intermediates (both halves) for the vertical filter */
1961 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1962 dst10_r, dst32_r, dst54_r, dst21_r);
1963 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1964 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1965 dst10_l, dst32_l, dst54_l, dst21_l);
1966 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1968 for (loop_cnt = height >> 1; loop_cnt--;) {
1969 LD_SB2(src_tmp, src_stride, src7, src8);
1970 src_tmp += 2 * src_stride;
1971 XORI_B2_128_SB(src7, src8);
/* horizontal pass for new row 7, then vertical 8-tap for out row 0 */
1973 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1974 vec0, vec1, vec2, vec3);
1976 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1977 dst7, dst7, dst7, dst7);
1979 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1980 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1981 filt_h0, filt_h1, filt_h2, filt_h3);
1982 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1983 filt_h0, filt_h1, filt_h2, filt_h3);
/* same for new row 8 -> out row 1 */
1988 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1989 vec0, vec1, vec2, vec3);
1991 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1992 dst8, dst8, dst8, dst8);
1994 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1995 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1996 filt_h0, filt_h1, filt_h2, filt_h3);
1997 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1998 filt_h0, filt_h1, filt_h2, filt_h3);
/* weight, round, offset, clip, pack 8x2 bytes and store */
2002 HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
2003 weight_vec, offset_vec, rnd_vec,
2004 dst0_r, dst1_r, dst0_l, dst1_l);
2006 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2007 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
2008 dst_tmp += (2 * dst_stride);
/* 8-wide 2-D uni-weighted MC: generic 8-multiple kernel, width 8. */
2030 static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
2034 const int8_t *filter_x,
2035 const int8_t *filter_y,
2041 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2042 filter_x, filter_y, height, weight,
2043 offset, rnd_val, 8);
/* 12-wide 2-D uni-weighted MC: left 8 columns via the 8-multiple kernel,
 * remaining 4 columns via the dedicated 4-wide kernel. */
2046 static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
2050 const int8_t *filter_x,
2051 const int8_t *filter_y,
2057 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2058 filter_x, filter_y, height, weight,
2059 offset, rnd_val, 8);
2060 hevc_hv_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
2061 filter_x, filter_y, height, weight, offset,
/* 16-wide 2-D uni-weighted MC: generic 8-multiple kernel, width 16. */
2065 static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
2069 const int8_t *filter_x,
2070 const int8_t *filter_y,
2076 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2077 filter_x, filter_y, height, weight,
2078 offset, rnd_val, 16);
/* 24-wide 2-D uni-weighted MC: generic 8-multiple kernel, width 24. */
2081 static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
2085 const int8_t *filter_x,
2086 const int8_t *filter_y,
2092 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2093 filter_x, filter_y, height, weight,
2094 offset, rnd_val, 24);
/* 32-wide 2-D uni-weighted MC: generic 8-multiple kernel, width 32. */
2097 static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
2101 const int8_t *filter_x,
2102 const int8_t *filter_y,
2108 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2109 filter_x, filter_y, height, weight,
2110 offset, rnd_val, 32);
/* 48-wide 2-D uni-weighted MC: generic 8-multiple kernel, width 48. */
2113 static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
2117 const int8_t *filter_x,
2118 const int8_t *filter_y,
2124 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2125 filter_x, filter_y, height, weight,
2126 offset, rnd_val, 48);
/* 64-wide 2-D uni-weighted MC: generic 8-multiple kernel, width 64. */
2129 static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
2133 const int8_t *filter_x,
2134 const int8_t *filter_y,
2140 hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2141 filter_x, filter_y, height, weight,
2142 offset, rnd_val, 64);
/* Horizontal 4-tap uni-weighted HEVC MC, 4-wide x 2-row case.
 * Both rows are filtered into a single 8-halfword vector, then the
 * weight/round/offset/clip sequence is applied inline. */
2145 static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
2149 const int8_t *filter,
2156 v16i8 src0, src1, vec0, vec1;
2159 v4i32 dst0_r, dst0_l;
2160 v8i16 filter_vec, const_vec;
2161 v4i32 weight_vec, offset_vec, rnd_vec;
/* mask0 gathers byte pairs from both rows at once (lanes >= 16 = src1) */
2162 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
/* split the 4-tap filter into two halfword tap pairs */
2166 filter_vec = LD_SH(filter);
2167 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2171 weight = weight & 0x0000FFFF;
2173 const_vec = __msa_ldi_h(128);
2176 weight_vec = __msa_fill_w(weight);
2177 offset_vec = __msa_fill_w(offset);
2178 rnd_vec = __msa_fill_w(rnd_val);
2180 LD_SB2(src, src_stride, src0, src1);
/* convert unsigned pixels to signed range for signed dot products */
2181 XORI_B2_128_SB(src0, src1);
2183 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2185 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
/* widen to 32 bits, weight, round by rnd_val, offset, clip to [0,255] */
2187 ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
2188 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
2189 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2190 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2191 dst0_r = CLIP_SW_0_255(dst0_r);
2192 dst0_l = CLIP_SW_0_255(dst0_l);
2194 HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
2195 ST4x2_UB(dst0_r, dst, dst_stride);
2196 dst += (4 * dst_stride);
/* Horizontal 4-tap uni-weighted HEVC MC, 4-wide x 4-row case.
 * Rows are filtered two-at-a-time into dst0/dst1, then weighted,
 * rounded, offset, clipped and stored as four 4-byte rows. */
2199 static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
2203 const int8_t *filter,
2210 v16i8 src0, src1, src2, src3;
2211 v16i8 mask1, vec0, vec1;
2213 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
2214 v8i16 filter_vec, const_vec;
2215 v4i32 weight_vec, offset_vec, rnd_vec;
/* mask0 gathers byte pairs from two rows at once (lanes >= 16 = src B) */
2216 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2220 /* rearranging filter */
2221 filter_vec = LD_SH(filter);
2222 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2226 weight = weight & 0x0000FFFF;
2228 const_vec = __msa_ldi_h(128);
2231 weight_vec = __msa_fill_w(weight);
2232 offset_vec = __msa_fill_w(offset);
2233 rnd_vec = __msa_fill_w(rnd_val);
2235 LD_SB4(src, src_stride, src0, src1, src2, src3);
2236 XORI_B4_128_SB(src0, src1, src2, src3);
/* rows 0+1 -> dst0, rows 2+3 -> dst1 */
2238 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2240 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2242 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2244 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
/* weight/round/offset/clip, pack and store 4x4 */
2246 HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2247 dst0_r, dst1_r, dst0_l, dst1_l);
2249 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2250 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
2251 dst += (4 * dst_stride);
/* Horizontal 4-tap uni-weighted HEVC MC, 4-wide, height a multiple of 8.
 * Each iteration filters 8 rows (two rows per dstN vector), applies the
 * weight/round/offset/clip sequence and stores eight 4-byte rows. */
2254 static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
2258 const int8_t *filter,
2266 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2267 v16i8 mask1, vec0, vec1;
2268 v8i16 dst0, dst1, dst2, dst3;
2269 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2270 v8i16 filter_vec, const_vec;
2271 v4i32 weight_vec, offset_vec, rnd_vec;
/* mask0 gathers byte pairs from two rows at once (lanes >= 16 = src B) */
2272 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2276 filter_vec = LD_SH(filter);
2277 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2279 weight = weight & 0x0000FFFF;
2280 const_vec = __msa_ldi_h(128);
2283 weight_vec = __msa_fill_w(weight);
2284 offset_vec = __msa_fill_w(offset);
2285 rnd_vec = __msa_fill_w(rnd_val);
2289 for (loop_cnt = (height >> 3); loop_cnt--;) {
2290 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2291 src += (8 * src_stride);
2293 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* rows 0..7 filtered two-at-a-time into dst0..dst3 */
2295 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2297 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2299 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2301 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2303 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2305 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2307 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2309 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* weight/round/offset/clip, pack and store 4x8 */
2311 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2312 weight_vec, offset_vec, rnd_vec,
2313 dst0_r, dst1_r, dst2_r, dst3_r,
2314 dst0_l, dst1_l, dst2_l, dst3_l);
2316 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2317 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2318 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
2319 dst += (8 * dst_stride);
/* Dispatcher for 4-wide horizontal 4-tap uni-weighted MC: picks the
 * specialised kernel for height 2, 4, or multiples of 8 (8/16). */
2323 static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
2327 const int8_t *filter,
2334 hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
2335 filter, height, weight, offset, rnd_val);
2336 } else if (4 == height) {
2337 hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
2338 filter, height, weight, offset, rnd_val);
2339 } else if (8 == height || 16 == height) {
2340 hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
2341 filter, height, weight,
/* Horizontal 4-tap uni-weighted HEVC MC for 6-pixel-wide blocks.
 * Filters 4 rows per iteration (one full 8-wide vector per row) and
 * stores only the leftmost 6 bytes of each row (ST6x4_UB). */
2346 static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
2350 const int8_t *filter,
2358 v16i8 src0, src1, src2, src3;
2359 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2362 v8i16 dst0, dst1, dst2, dst3;
2363 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2364 v8i16 filter_vec, const_vec;
2365 v4i32 weight_vec, offset_vec, rnd_vec;
2369 filter_vec = LD_SH(filter);
2370 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2372 weight = weight & 0x0000FFFF;
2373 const_vec = __msa_ldi_h(128);
2376 weight_vec = __msa_fill_w(weight);
2377 offset_vec = __msa_fill_w(offset);
2378 rnd_vec = __msa_fill_w(rnd_val);
2382 for (loop_cnt = (height >> 2); loop_cnt--;) {
2383 LD_SB4(src, src_stride, src0, src1, src2, src3);
2384 src += (4 * src_stride);
2386 XORI_B4_128_SB(src0, src1, src2, src3);
/* one 4-tap horizontal filter per row */
2388 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2390 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2392 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2394 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2396 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2398 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2400 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2402 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* weight/round/offset/clip, pack, store 6 bytes x 4 rows */
2404 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2405 weight_vec, offset_vec, rnd_vec,
2406 dst0_r, dst1_r, dst2_r, dst3_r,
2407 dst0_l, dst1_l, dst2_l, dst3_l);
2409 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2410 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2412 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
2413 dst += (4 * dst_stride);
/* Horizontal 4-tap uni-weighted MC, fixed 8x2 block: filter two rows,
 * weight/round/offset/clip (HEVC_UNIW_RND_CLIP2), pack and store 8x2. */
2417 static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
2421                                       const int8_t *filter,
2427     v8i16 filt0, filt1, dst0, dst1;
2429     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2432     v8i16 filter_vec, const_vec;
2433     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
2434     v4i32 weight_vec, offset_vec, rnd_vec;
2438     filter_vec = LD_SH(filter);
2439     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* only the low 16 bits of 'weight' are significant */
2441     weight = weight & 0x0000FFFF;
2442     const_vec = __msa_ldi_h(128);
2445     weight_vec = __msa_fill_w(weight);
2446     offset_vec = __msa_fill_w(offset);
2447     rnd_vec = __msa_fill_w(rnd_val);
2451     LD_SB2(src, src_stride, src0, src1);
2452     XORI_B2_128_SB(src0, src1);
2454     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2456     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2457     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2459     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2461     HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2462                         dst0_r, dst1_r, dst0_l, dst1_l);
2464     HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2465     ST8x2_UB(dst0_r, dst, dst_stride);
/* Horizontal 4-tap uni-weighted MC, fixed 8x6 block: load six rows once,
 * filter each horizontally, weight/round/offset/clip, then store 8x4 + 8x2.
 * Fix: the LD_SB6() load was duplicated back-to-back ('src' is not advanced
 * between the calls, so the second load re-read the same six rows) — the
 * redundant load has been removed. */
2468 static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
2472                                       const int8_t *filter,
2479     v16i8 src0, src1, src2, src3, src4, src5;
2480     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2483     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2484     v8i16 filter_vec, const_vec;
2485     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2486     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
2487     v4i32 weight_vec, offset_vec, rnd_vec;
2491     filter_vec = LD_SH(filter);
2492     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* only the low 16 bits of 'weight' are significant */
2494     weight = weight & 0x0000FFFF;
2495     const_vec = __msa_ldi_h(128);
2498     weight_vec = __msa_fill_w(weight);
2499     offset_vec = __msa_fill_w(offset);
2500     rnd_vec = __msa_fill_w(rnd_val);
     /* single load of all six input rows (duplicate load removed) */
2504     LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
2506     XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2508     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2510     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2512     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2514     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2516     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2518     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2520     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2522     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2524     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2526     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2528     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2530     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2532     HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2533                         weight_vec, offset_vec, rnd_vec,
2534                         dst0_r, dst1_r, dst2_r, dst3_r,
2535                         dst0_l, dst1_l, dst2_l, dst3_l);
2537     HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
2538                         dst4_r, dst5_r, dst4_l, dst5_l);
2540     HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
2541                      dst2_l, dst2_r, dst3_l, dst3_r,
2542                      dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
2544     ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
2545     dst += (4 * dst_stride);
2546     ST8x2_UB(dst2_r, dst, dst_stride);
/* Horizontal 4-tap uni-weighted MC, width 8, height a multiple of 4:
 * 4 rows per iteration, filtered, weighted/rounded/clipped, stored 8x4. */
2549 static void hevc_hz_uniwgt_4t_8x4multiple_msa(uint8_t *src,
2553                                               const int8_t *filter,
2561     v16i8 src0, src1, src2, src3;
2562     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2565     v8i16 dst0, dst1, dst2, dst3;
2566     v8i16 filter_vec, const_vec;
2567     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2568     v4i32 weight_vec, offset_vec, rnd_vec;
2572     filter_vec = LD_SH(filter);
2573     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* only the low 16 bits of 'weight' are significant */
2575     weight = weight & 0x0000FFFF;
2576     const_vec = __msa_ldi_h(128);
2579     weight_vec = __msa_fill_w(weight);
2580     offset_vec = __msa_fill_w(offset);
2581     rnd_vec = __msa_fill_w(rnd_val);
2585     for (loop_cnt = (height >> 2); loop_cnt--;) {
2586         LD_SB4(src, src_stride, src0, src1, src2, src3);
2587         src += (4 * src_stride);
2589         XORI_B4_128_SB(src0, src1, src2, src3);
2591         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2593         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2595         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2597         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2599         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2601         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2603         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2605         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2607         HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2608                             weight_vec, offset_vec, rnd_vec,
2609                             dst0_r, dst1_r, dst2_r, dst3_r,
2610                             dst0_l, dst1_l, dst2_l, dst3_l);
2612         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2613                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2615         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
2616         dst += (4 * dst_stride);
/* Horizontal 4-tap uni-weighted MC, width 8: dispatches on 'height' to
 * the 8x2, 8x6, or 8x4-multiple kernels. */
2620 static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
2624                                      const int8_t *filter,
2631         hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
2632                                   filter, height, weight, offset, rnd_val);
2633     } else if (6 == height) {
2634         hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
2635                                   filter, height, weight, offset, rnd_val);
2637         hevc_hz_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
2638                                           filter, height, weight, offset,
/* Horizontal 4-tap uni-weighted MC, width 12: per iteration the left 8
 * columns of 4 rows use mask0/mask1, the right 4 columns are handled two
 * rows at a time via mask2/mask3 (which index across two row registers). */
2643 static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
2647                                       const int8_t *filter,
2655     v16i8 src0, src1, src2, src3;
2656     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2657     v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2661     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2662     v8i16 filter_vec, const_vec;
2664     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2665     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
2666     v4i32 weight_vec, offset_vec, rnd_vec;
2670     filter_vec = LD_SH(filter);
2671     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* only the low 16 bits of 'weight' are significant */
2673     weight = weight & 0x0000FFFF;
2674     const_vec = __msa_ldi_h(128);
2677     weight_vec = __msa_fill_w(weight);
2678     offset_vec = __msa_fill_w(offset);
2679     rnd_vec = __msa_fill_w(rnd_val);
2684     for (loop_cnt = (height >> 2); loop_cnt--;) {
2685         LD_SB4(src, src_stride, src0, src1, src2, src3);
2686         src += (4 * src_stride);
2688         XORI_B4_128_SB(src0, src1, src2, src3);
         /* left 8 columns, one row per dst */
2690         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2692         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2694         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2696         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2698         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2700         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2702         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2704         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
         /* right 4 columns, two rows packed per dst */
2706         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2708         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2710         VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2712         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2714         HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2715                             weight_vec, offset_vec, rnd_vec,
2716                             dst0_r, dst1_r, dst2_r, dst3_r,
2717                             dst0_l, dst1_l, dst2_l, dst3_l);
2719         HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
2720                             dst4_r, dst5_r, dst4_l, dst5_l);
2722         HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
2723                          dst2_l, dst2_r, dst3_l, dst3_r,
2724                          dst4_l, dst4_r, dst5_l, dst5_r,
2725                          dst0_r, dst1_r, dst2_r);
2727         ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
2728         dst += (4 * dst_stride);
/* Horizontal 4-tap uni-weighted MC, width 16: each row is loaded as two
 * 8-wide halves (even regs = left, odd regs = right); 4 rows per iteration,
 * weighted and stored as two 16-byte rows at a time. */
2732 static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
2736                                       const int8_t *filter,
2743     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2745     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2747     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2749     v8i16 filter_vec, const_vec;
2750     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2751     v4i32 weight_vec, offset_vec, rnd_vec;
2755     filter_vec = LD_SH(filter);
2756     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* only the low 16 bits of 'weight' are significant */
2758     weight = weight & 0x0000FFFF;
2759     const_vec = __msa_ldi_h(128);
2762     weight_vec = __msa_fill_w(weight);
2763     offset_vec = __msa_fill_w(offset);
2764     rnd_vec = __msa_fill_w(rnd_val);
2768     for (loop_cnt = (height >> 2); loop_cnt--;) {
         /* left halves of 4 rows, then right halves (src + 8) */
2769         LD_SB4(src, src_stride, src0, src2, src4, src6);
2770         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2771         src += (4 * src_stride);
2773         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2775         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2777         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2779         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2781         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2783         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2785         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2787         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2789         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2791         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2793         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2795         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2797         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2799         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2801         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2803         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2805         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
         /* first two output rows */
2807         HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2808                             weight_vec, offset_vec, rnd_vec,
2809                             dst0_r, dst1_r, dst2_r, dst3_r,
2810                             dst0_l, dst1_l, dst2_l, dst3_l);
2812         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2813                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2814         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
2815         dst += (2 * dst_stride);
         /* last two output rows */
2817         HEVC_UNIW_RND_CLIP4(dst4, dst5, dst6, dst7,
2818                             weight_vec, offset_vec, rnd_vec,
2819                             dst0_r, dst1_r, dst2_r, dst3_r,
2820                             dst0_l, dst1_l, dst2_l, dst3_l);
2822         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2823                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2824         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
2825         dst += (2 * dst_stride);
/* Horizontal 4-tap uni-weighted MC, width 24: per iteration the left 16
 * columns of 2 rows are computed and stored at 'dst', then the right 8
 * columns are computed and stored at 'dst_tmp' (= dst + 16). */
2829 static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
2833                                       const int8_t *filter,
2840     uint8_t *dst_tmp = dst + 16;
2841     v16i8 src0, src1, src2, src3;
2843     v8i16 dst0, dst1, dst2, dst3;
2844     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2845     v16i8 mask1, mask2, mask3;
2847     v8i16 filter_vec, const_vec;
2848     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2849     v4i32 weight_vec, offset_vec, rnd_vec;
2853     filter_vec = LD_SH(filter);
2854     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* only the low 16 bits of 'weight' are significant */
2856     weight = weight & 0x0000FFFF;
2857     const_vec = __msa_ldi_h(128);
2860     weight_vec = __msa_fill_w(weight);
2861     offset_vec = __msa_fill_w(offset);
2862     rnd_vec = __msa_fill_w(rnd_val);
     /* 2 output rows per iteration */
2868     for (loop_cnt = (height >> 1); loop_cnt--;) {
2870         LD_SB2(src, src_stride, src0, src2);
2871         LD_SB2(src + 16, src_stride, src1, src3);
2872         src += (2 * src_stride);
2874         XORI_B4_128_SB(src0, src1, src2, src3);
         /* left 16 columns */
2876         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2878         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2880         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2882         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2884         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2886         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2888         VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2890         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2892         HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2893                             weight_vec, offset_vec, rnd_vec,
2894                             dst0_r, dst1_r, dst2_r, dst3_r,
2895                             dst0_l, dst1_l, dst2_l, dst3_l);
2897         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2898                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2899         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
2900         dst += (2 * dst_stride);
         /* right 8 columns, written via dst_tmp */
2903         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2905         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2907         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2909         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2911         HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2912                             dst0_r, dst1_r, dst0_l, dst1_l);
2914         HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2915         ST8x2_UB(dst0_r, dst_tmp, dst_stride);
2916         dst_tmp += (2 * dst_stride);
/* Horizontal 4-tap uni-weighted MC, width 32: each row is covered by two
 * 16-byte loads plus one extra load at +24 for the rightmost tap window;
 * the loop body filters one row, twice per iteration (2 rows). */
2920 static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
2924                                       const int8_t *filter,
2931     v16i8 src0, src1, src2;
2933     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2934     v16i8 mask1, mask2, mask3;
2935     v8i16 dst0, dst1, dst2, dst3;
2937     v8i16 filter_vec, const_vec;
2938     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2939     v4i32 weight_vec, offset_vec, rnd_vec;
2943     filter_vec = LD_SH(filter);
2944     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* only the low 16 bits of 'weight' are significant */
2946     weight = weight & 0x0000FFFF;
2947     const_vec = __msa_ldi_h(128);
2950     weight_vec = __msa_fill_w(weight);
2951     offset_vec = __msa_fill_w(offset);
2952     rnd_vec = __msa_fill_w(rnd_val);
2958     for (loop_cnt = (height >> 1); loop_cnt--;) {
         /* first row of the pair */
2959         LD_SB2(src, 16, src0, src1);
2960         src2 = LD_SB(src + 24);
2963         XORI_B3_128_SB(src0, src1, src2);
2965         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2967         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2969         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2971         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2973         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2975         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2977         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2979         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2981         HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2982                             weight_vec, offset_vec, rnd_vec,
2983                             dst0_r, dst1_r, dst2_r, dst3_r,
2984                             dst0_l, dst1_l, dst2_l, dst3_l);
2986         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2987                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2988         ST_SW2(dst0_r, dst1_r, dst, 16);
         /* second row of the pair */
2991         LD_SB2(src, 16, src0, src1);
2992         src2 = LD_SB(src + 24);
2995         XORI_B3_128_SB(src0, src1, src2);
2997         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2999         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3001         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3003         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
3005         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3007         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3009         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3011         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3013         HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3014                             weight_vec, offset_vec, rnd_vec,
3015                             dst0_r, dst1_r, dst2_r, dst3_r,
3016                             dst0_l, dst1_l, dst2_l, dst3_l);
3018         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3019                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3020         ST_SW2(dst0_r, dst1_r, dst, 16);
/* Vertical 4-tap uni-weighted MC, fixed 4x2 block.  Rows are interleaved
 * pairwise (ILVR) so DPADD can apply the two packed tap pairs; the result
 * is weighted, rounded, offset, clipped and stored as 4x2. */
3025 static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
3029                                       const int8_t *filter,
3035     v16i8 src0, src1, src2, src3, src4;
3036     v16i8 src10_r, src32_r, src21_r, src43_r;
3037     v16i8 src2110, src4332;
3039     v4i32 dst0_r, dst0_l;
3041     v8i16 filter_vec, const_vec;
3042     v4i32 weight_vec, offset_vec, rnd_vec;
3046     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3048     weight = weight & 0x0000FFFF;
3050     weight_vec = __msa_fill_w(weight);
3051     offset_vec = __msa_fill_w(offset);
3052     rnd_vec = __msa_fill_w(rnd_val);
3054     filter_vec = LD_SH(filter);
3055     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prologue: three rows above/at the block for the 4-tap window */
3057     LD_SB3(src, src_stride, src0, src1, src2);
3058     src += (3 * src_stride);
3059     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3060     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3061     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3062     LD_SB2(src, src_stride, src3, src4);
3063     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3064     src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3065     src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3068     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
     /* open-coded weight/round/offset/clip (same steps as HEVC_UNIW_RND_CLIP2) */
3070     ILVRL_H2_SW(dst10, dst10, dst0_r, dst0_l);
3071     DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
3072     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
3073     ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
3074     dst0_r = CLIP_SW_0_255(dst0_r);
3075     dst0_l = CLIP_SW_0_255(dst0_l);
3077     HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
3078     ST4x2_UB(dst0_r, dst, dst_stride);
/* Vertical 4-tap uni-weighted MC, fixed 4x4 block: two filtered vectors
 * (rows 0-1 and 2-3), then HEVC_UNIW_RND_CLIP2 and a 4x4 store. */
3081 static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
3085                                       const int8_t *filter,
3091     v16i8 src0, src1, src2, src3, src4, src5, src6;
3092     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3093     v16i8 src2110, src4332, src6554;
3095     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3097     v8i16 filter_vec, const_vec;
3098     v4i32 weight_vec, offset_vec, rnd_vec;
3102     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3104     weight = weight & 0x0000FFFF;
3106     weight_vec = __msa_fill_w(weight);
3107     offset_vec = __msa_fill_w(offset);
3108     rnd_vec = __msa_fill_w(rnd_val);
3110     filter_vec = LD_SH(filter);
3111     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prologue rows for the 4-tap window */
3113     LD_SB3(src, src_stride, src0, src1, src2);
3114     src += (3 * src_stride);
3115     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3116     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3117     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3119     LD_SB4(src, src_stride, src3, src4, src5, src6);
3120     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3121                src32_r, src43_r, src54_r, src65_r);
3122     ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3123     XORI_B2_128_SB(src4332, src6554);
3126     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3128     DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3129     HEVC_UNIW_RND_CLIP2(dst10, dst32, weight_vec, offset_vec, rnd_vec,
3130                         dst0_r, dst1_r, dst0_l, dst1_l);
3132     HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
3133     ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
3134     dst += (4 * dst_stride);
/* Vertical 4-tap uni-weighted MC, width 4, height a multiple of 8:
 * 8 rows per iteration; 'src2110' carries the rolling two-row history
 * across iterations so only new rows are loaded. */
3137 static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
3141                                               const int8_t *filter,
3148     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3149     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3150     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3151     v16i8 src2110, src4332, src6554, src8776;
3152     v8i16 dst10, dst32, dst54, dst76;
3153     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3155     v8i16 filter_vec, const_vec;
3156     v4i32 weight_vec, offset_vec, rnd_vec;
3160     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3162     weight = weight & 0x0000FFFF;
3164     weight_vec = __msa_fill_w(weight);
3165     offset_vec = __msa_fill_w(offset);
3166     rnd_vec = __msa_fill_w(rnd_val);
3168     filter_vec = LD_SH(filter);
3169     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prologue rows for the 4-tap window */
3171     LD_SB3(src, src_stride, src0, src1, src2);
3172     src += (3 * src_stride);
3173     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3174     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3175     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3177     for (loop_cnt = (height >> 3); loop_cnt--;) {
3178         LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3179         src += (6 * src_stride);
3180         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3181                    src32_r, src43_r, src54_r, src65_r);
3182         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3183         ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3184                    src4332, src6554, src8776);
3185         XORI_B3_128_SB(src4332, src6554, src8776);
3188         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3190         DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3192         DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
         /* last two rows of this batch; src2 keeps the newest row for reuse */
3194         LD_SB2(src, src_stride, src9, src2);
3195         src += (2 * src_stride);
3196         ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3197         src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3198         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3201         DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3202         HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3203                             weight_vec, offset_vec, rnd_vec,
3204                             dst0_r, dst1_r, dst2_r, dst3_r,
3205                             dst0_l, dst1_l, dst2_l, dst3_l);
3207         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3208                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3209         ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
3210         dst += (8 * dst_stride);
/* Vertical 4-tap uni-weighted MC, width 4: dispatches on 'height' to the
 * 4x2, 4x4, or 4x8-multiple kernels. */
3214 static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
3218                                      const int8_t *filter,
3225         hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
3226                                   filter, height, weight, offset, rnd_val);
3227     } else if (4 == height) {
3228         hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
3229                                   filter, height, weight, offset, rnd_val);
3230     } else if (0 == (height % 8)) {
3231         hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
3232                                           filter, height, weight, offset,
/* Vertical 4-tap uni-weighted MC, width 6: 4 rows per iteration, processed
 * as two row-pairs; src10_r/src21_r roll forward as the filter history. */
3237 static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
3241                                      const int8_t *filter,
3248     v16i8 src0, src1, src2, src3, src4;
3249     v16i8 src10_r, src32_r, src21_r, src43_r;
3250     v8i16 tmp0, tmp1, tmp2, tmp3;
3252     v8i16 filter_vec, const_vec;
3253     v4i32 weight_vec, offset_vec, rnd_vec;
3254     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3258     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3260     weight = weight & 0x0000FFFF;
3262     weight_vec = __msa_fill_w(weight);
3263     offset_vec = __msa_fill_w(offset);
3264     rnd_vec = __msa_fill_w(rnd_val);
3266     filter_vec = LD_SH(filter);
3267     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prologue rows for the 4-tap window */
3269     LD_SB3(src, src_stride, src0, src1, src2);
3270     src += (3 * src_stride);
3271     XORI_B3_128_SB(src0, src1, src2);
3272     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3274     for (loop_cnt = (height >> 2); loop_cnt--;) {
3275         LD_SB2(src, src_stride, src3, src4);
3276         src += (2 * src_stride);
3277         XORI_B2_128_SB(src3, src4);
3278         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3281         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3283         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
         /* next two rows; src2 keeps the newest row for the next iteration */
3285         LD_SB2(src, src_stride, src1, src2);
3286         src += (2 * src_stride);
3287         XORI_B2_128_SB(src1, src2);
3288         ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3291         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3293         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3294         HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3295                             weight_vec, offset_vec, rnd_vec,
3296                             dst0_r, dst1_r, dst2_r, dst3_r,
3297                             dst0_l, dst1_l, dst2_l, dst3_l);
3299         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3300                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3302         ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3303         dst += (4 * dst_stride);
/* Vertical 4-tap uni-weighted MC, fixed 8x2 block. */
3307 static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
3311                                       const int8_t *filter,
3317     v16i8 src0, src1, src2, src3, src4;
3318     v16i8 src10_r, src32_r, src21_r, src43_r;
3321     v8i16 filter_vec, const_vec;
3322     v4i32 weight_vec, offset_vec, rnd_vec;
3323     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3327     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3329     weight = weight & 0x0000FFFF;
3331     weight_vec = __msa_fill_w(weight);
3332     offset_vec = __msa_fill_w(offset);
3333     rnd_vec = __msa_fill_w(rnd_val);
3335     filter_vec = LD_SH(filter);
3336     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prologue rows for the 4-tap window */
3338     LD_SB3(src, src_stride, src0, src1, src2);
3339     src += (3 * src_stride);
3340     XORI_B3_128_SB(src0, src1, src2);
3341     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3342     LD_SB2(src, src_stride, src3, src4);
3343     XORI_B2_128_SB(src3, src4);
3344     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3347     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3349     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3350     HEVC_UNIW_RND_CLIP2(tmp0, tmp1, weight_vec, offset_vec, rnd_vec,
3351                         dst0_r, dst1_r, dst0_l, dst1_l);
3353     HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
3354     ST8x2_UB(dst0_r, dst, dst_stride);
/* Vertical 4-tap uni-weighted MC, fixed 8x6 block: six filtered rows,
 * weighted in a 4+2 split, stored as 8x4 followed by 8x2. */
3357 static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
3361                                       const int8_t *filter,
3367     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3368     v16i8 src10_r, src32_r, src54_r, src76_r;
3369     v16i8 src21_r, src43_r, src65_r, src87_r;
3370     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3372     v8i16 filter_vec, const_vec;
3373     v4i32 weight_vec, offset_vec, rnd_vec;
3374     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3375     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3379     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3381     weight = weight & 0x0000FFFF;
3383     weight_vec = __msa_fill_w(weight);
3384     offset_vec = __msa_fill_w(offset);
3385     rnd_vec = __msa_fill_w(rnd_val);
3387     filter_vec = LD_SH(filter);
3388     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prologue rows for the 4-tap window */
3390     LD_SB3(src, src_stride, src0, src1, src2);
3391     src += (3 * src_stride);
3392     XORI_B3_128_SB(src0, src1, src2);
3393     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3395     LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3396     XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3397     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3398                src32_r, src43_r, src54_r, src65_r);
3399     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3402     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3404     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3406     DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
3408     DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
3410     DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
3412     DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
3413     HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3414                         weight_vec, offset_vec, rnd_vec,
3415                         dst0_r, dst1_r, dst2_r, dst3_r,
3416                         dst0_l, dst1_l, dst2_l, dst3_l);
3417     HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
3418                         dst4_r, dst5_r, dst4_l, dst5_l);
3420     HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
3421                      dst2_l, dst2_r, dst3_l, dst3_r,
3422                      dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
3423     ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3424     dst += (4 * dst_stride);
3425     ST8x2_UB(dst2_r, dst, dst_stride);
/* Vertical 4-tap uni-weighted MC, width 8, height a multiple of 4:
 * 4 rows per iteration with a rolling two-row history. */
3428 static void hevc_vt_uniwgt_4t_8x4multiple_msa(uint8_t *src,
3432                                               const int8_t *filter,
3439     v16i8 src0, src1, src2, src3, src4;
3440     v16i8 src10_r, src32_r, src21_r, src43_r;
3441     v8i16 tmp0, tmp1, tmp2, tmp3;
3443     v8i16 filter_vec, const_vec;
3444     v4i32 weight_vec, offset_vec, rnd_vec;
3445     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3449     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3451     weight = weight & 0x0000FFFF;
3453     weight_vec = __msa_fill_w(weight);
3454     offset_vec = __msa_fill_w(offset);
3455     rnd_vec = __msa_fill_w(rnd_val);
3457     filter_vec = LD_SH(filter);
3458     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prologue rows for the 4-tap window */
3460     LD_SB3(src, src_stride, src0, src1, src2);
3461     src += (3 * src_stride);
3462     XORI_B3_128_SB(src0, src1, src2);
3463     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3465     for (loop_cnt = (height >> 2); loop_cnt--;) {
3466         LD_SB2(src, src_stride, src3, src4);
3467         src += (2 * src_stride);
3468         XORI_B2_128_SB(src3, src4);
3469         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3472         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3474         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
         /* next two rows; src2 keeps the newest row for the next iteration */
3476         LD_SB2(src, src_stride, src1, src2);
3477         src += (2 * src_stride);
3478         XORI_B2_128_SB(src1, src2);
3479         ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3482         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3484         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3485         HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3486                             weight_vec, offset_vec, rnd_vec,
3487                             dst0_r, dst1_r, dst2_r, dst3_r,
3488                             dst0_l, dst1_l, dst2_l, dst3_l);
3490         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3491                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3492         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3493         dst += (4 * dst_stride);
/* Vertical 4-tap uni-weighted MC, width 8: dispatches on 'height' to the
 * 8x2, 8x6, or 8x4-multiple kernels. */
3497 static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
3501                                      const int8_t *filter,
3508         hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
3509                                   filter, height, weight, offset, rnd_val);
3510     } else if (6 == height) {
3511         hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
3512                                   filter, height, weight, offset, rnd_val);
3514         hevc_vt_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
3515                                           filter, height, weight, offset,
/* Vertical 4-tap uni-weighted MC, width 12: the left 8 columns use the
 * right-interleaved (ILVR_B) row pairs; the right 4 columns use the
 * left-interleaved halves packed two rows per vector (src2110/src4332). */
3520 static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
3524                                       const int8_t *filter,
3531     v16i8 src0, src1, src2, src3, src4, src5;
3532     v16i8 src10_r, src32_r, src21_r, src43_r;
3533     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3534     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3535     v16i8 src2110, src4332;
3537     v8i16 filter_vec, const_vec;
3538     v4i32 weight_vec, offset_vec, rnd_vec;
3539     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3540     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
     /* start one row above the block so the 4-tap window is covered */
3542     src -= (1 * src_stride);
3544     const_vec = __msa_ldi_h(128);
     /* only the low 16 bits of 'weight' are significant */
3546     weight = weight & 0x0000FFFF;
3548     weight_vec = __msa_fill_w(weight);
3549     offset_vec = __msa_fill_w(offset);
3550     rnd_vec = __msa_fill_w(rnd_val);
3552     filter_vec = LD_SH(filter);
3553     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3555     LD_SB3(src, src_stride, src0, src1, src2);
3556     src += (3 * src_stride);
3557     XORI_B3_128_SB(src0, src1, src2);
3558     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3559     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3560     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3562     for (loop_cnt = (height >> 2); loop_cnt--;) {
3563         LD_SB2(src, src_stride, src3, src4);
3564         src += (2 * src_stride);
3565         XORI_B2_128_SB(src3, src4);
3566         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3567         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3568         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3571         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3573         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3575         DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
         /* next two rows; src2 keeps the newest row for the next iteration */
3577         LD_SB2(src, src_stride, src5, src2);
3578         src += (2 * src_stride);
3579         XORI_B2_128_SB(src5, src2);
3580         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3581         ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3582         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3585         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3587         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3589         DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
3590         HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3591                             weight_vec, offset_vec, rnd_vec,
3592                             dst0_r, dst1_r, dst2_r, dst3_r,
3593                             dst0_l, dst1_l, dst2_l, dst3_l);
3594         HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
3595                             dst4_r, dst5_r, dst4_l, dst5_l);
3597         HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
3598                          dst2_l, dst2_r, dst3_l, dst3_r,
3599                          dst4_l, dst4_r, dst5_l, dst5_r,
3600                          dst0_r, dst1_r, dst2_r);
3601         ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
3602         dst += (4 * dst_stride);
/* HEVC vertical 4-tap filter with uni-directional weighted prediction for
 * 16-pixel-wide blocks.  Each loop iteration produces 4 output rows in two
 * 2-row halves; the three most recent source rows are kept live across
 * halves (note src2 is reloaded in the second half so it becomes the
 * history row for the next iteration).  Per HEVC_UNIW_RND_CLIP4 the
 * filtered samples are multiplied by weight, rounded by rnd_vec, offset
 * is added, and the result is clamped to [0, 255].
 * NOTE(review): remaining parameters, the opening brace, and a few local
 * declarations (loop_cnt, filt0/filt1) plus the const_vec accumulator
 * seeds are missing from this extract of the file.
 */
static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
                                      const int8_t *filter,
    v16i8 src0, src1, src2, src3, src4, src5;
    /* right/left (low/high byte-interleave) row pairs for the 4-tap filter */
    v16i8 src10_r, src32_r, src21_r, src43_r;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 tmp0, tmp1, tmp2, tmp3;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
    const_vec = __msa_ldi_h(128);
    weight = weight & 0x0000FFFF;    /* keep only the low 16 bits of weight */
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* prologue: load the first 3 rows and build the initial row pairs */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);    /* unsigned -> signed bytes (^0x80) */
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* first half: rows 3 and 4 */
        LD_SB2(src, src_stride, src3, src4);
        src += (2 * src_stride);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
        /* second half: rows 5 and 6; the second row lands in src2 so it
         * carries over as history into the next loop iteration */
        LD_SB2(src, src_stride, src5, src2);
        src += (2 * src_stride);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
/* HEVC vertical 4-tap filter with uni-directional weighted prediction for
 * 24-pixel-wide blocks.  The block is handled as a 16-wide column
 * (src0..src5, right+left interleaves) plus an 8-wide column at x offset
 * 16 (src6..src11, right interleaves only).  4 rows are produced per loop
 * iteration in two 2-row halves.  Weighting/rounding/offset/clamp follow
 * HEVC_UNIW_RND_CLIP4 / HEVC_UNIW_RND_CLIP2.
 * NOTE(review): remaining parameters, opening brace, and some local
 * declarations/accumulator seeds are missing from this extract.
 */
static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
                                      const int8_t *filter,
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10, src11;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
    v16i8 src10_l, src32_l, src21_l, src43_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
    const_vec = __msa_ldi_h(128);
    weight = weight & 0x0000FFFF;    /* keep only the low 16 bits of weight */
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* prologue for the left 16-wide column */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* prologue for the right 8-wide column (bytes 16..23) */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* first half: rows 3 and 4 of both columns */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
                            dst4_r, dst5_r, dst4_l, dst5_l);
        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);       /* 16-wide column */
        ST8x2_UB(dst4_r, dst + 16, dst_stride);        /* 8-wide column */
        dst += (2 * dst_stride);
        /* second half: rows 5 and 6; src2 and src8 become the carried
         * history rows for the next iteration */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
        DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
        DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
        DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
                            dst4_r, dst5_r, dst4_l, dst5_l);
        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        ST8x2_UB(dst4_r, dst + 16, dst_stride);
        dst += (2 * dst_stride);
/* HEVC vertical 4-tap filter with uni-directional weighted prediction for
 * 32-pixel-wide blocks.  Handled as two independent 16-wide columns: the
 * left column writes through dst, the right column (x offset 16) through
 * dst_tmp.  2 rows are produced per loop iteration.  Weighting, rounding,
 * offset and [0,255] clamping follow HEVC_UNIW_RND_CLIP4.
 * NOTE(review): remaining parameters, opening brace, and some local
 * declarations/accumulator seeds are missing from this extract.
 */
static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
                                      const int8_t *filter,
    uint8_t *dst_tmp = dst + 16;    /* output cursor for the right column */
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filter_vec, const_vec;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
    const_vec = __msa_ldi_h(128);
    weight = weight & 0x0000FFFF;    /* keep only the low 16 bits of weight */
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    filter_vec = LD_SH(filter);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* prologue for the left column */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* prologue for the right column */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
    ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
    for (loop_cnt = (height >> 1); loop_cnt--;) {
        /* left column: filter rows 3/4 and store 2 rows */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
        DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
        DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
        DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
        HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
                            weight_vec, offset_vec, rnd_vec,
                            dst0_r, dst1_r, dst2_r, dst3_r,
                            dst0_l, dst1_l, dst2_l, dst3_l);
        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
                        dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST_SW2(dst0_r, dst1_r, dst, dst_stride);
        dst += (2 * dst_stride);
        /* right column: filter rows 9/10 and store 2 rows via dst_tmp */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
        DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
        DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
        DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
        DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
        HEVC_UNIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
                            weight_vec, offset_vec, rnd_vec,
                            dst4_r, dst5_r, dst6_r, dst7_r,
                            dst4_l, dst5_l, dst6_l, dst7_l);
        HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
                        dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
        ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);
/* HEVC horizontal+vertical (2-D) 4-tap filter with uni-directional
 * weighted prediction, 4x2 block.  Horizontal pass: byte shuffles
 * (mask0/mask1) + dot products produce 16-bit intermediate rows dst0..4.
 * Vertical pass: HEVC_FILT_4TAP combines interleaved intermediate rows
 * into 32-bit results.  Final samples are weighted, rounded, offset and
 * clamped to [0, 255] inline (MUL2/SRAR_W2_SW/ADD2/CLIP_SW_0_255).
 * NOTE(review): remaining parameters, opening brace, and a few local
 * declarations (filt0/filt1, mask1, accumulator seeds) are missing from
 * this extract.
 */
static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v4i32 dst0_r, dst1_r;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* step back one row and one column so the 4-tap window is centred */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
    const_vec = __msa_ldi_h(128);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    /* horizontal pass over the first 3 rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);
    /* row 3 -> first vertical output */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    /* row 4 -> second vertical output */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    /* weight, round, offset, clamp, pack and store the 4x2 block */
    MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
    SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
    ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
    dst0_r = CLIP_SW_0_255(dst0_r);
    dst1_r = CLIP_SW_0_255(dst1_r);
    HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
    ST4x2_UB(dst0_r, dst, dst_stride);
/* HEVC 2-D 4-tap filter with uni-directional weighted prediction, 4x4
 * block.  Same structure as the 4x2 variant: horizontal shuffles + dot
 * products build 16-bit intermediates, then HEVC_FILT_4TAP does the
 * vertical pass over interleaved pairs.  The four 32-bit results are
 * weighted/rounded/offset/clamped by HEVC_HV_UNIW_RND_CLIP4.
 * dst10_r/dst21_r and dst2/dst5 registers are recycled for later rows.
 * NOTE(review): remaining parameters, opening brace, and a few local
 * declarations/accumulator seeds are missing from this extract.
 */
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* step back one row and one column so the 4-tap window is centred */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
    const_vec = __msa_ldi_h(128);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    /* horizontal pass over the first 3 rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    XORI_B4_128_SB(src3, src4, src5, src6);
    /* rows 3..6 -> vertical outputs 0..3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    dst10_r = __msa_ilvr_h(dst5, dst4);    /* register reuse for row pair 5/4 */
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);    /* dst2 reused for row 6 */
    dst21_r = __msa_ilvr_h(dst2, dst5);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    /* weight, round, offset, clamp, then pack and store 4x4 */
    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
                           weight_vec, offset_vec, rnd_vec,
                           dst0_r, dst1_r, dst2_r, dst3_r);
    HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
    ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
/* HEVC 2-D 4-tap filter with uni-directional weighted prediction for
 * 4-wide blocks whose height is a multiple of 8.  Each loop iteration
 * consumes 8 new rows (src3..src10), pipelining the horizontal pass into
 * intermediates dst3..dst9 and producing 8 vertical outputs dst0_r..dst7_r
 * that are stored as two 4x4 groups.  dst10_r/dst21_r and dst2 are
 * recycled at the tail of the iteration to carry history into the next.
 * NOTE(review): remaining parameters, opening brace, and a few local
 * declarations/accumulator seeds are missing from this extract.
 */
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* step back one row and one column so the 4-tap window is centred */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
    const_vec = __msa_ldi_h(128);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    /* horizontal pass over the 3 prologue rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* rows 3..10: horizontal filter, interleave with previous row,
         * then vertical 4-tap */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        dst32_r = __msa_ilvr_h(dst3, dst2);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        dst43_r = __msa_ilvr_h(dst4, dst3);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        dst54_r = __msa_ilvr_h(dst5, dst4);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
        dst65_r = __msa_ilvr_h(dst6, dst5);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
        dst76_r = __msa_ilvr_h(dst7, dst6);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
        dst87_r = __msa_ilvr_h(dst8, dst7);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
        dst10_r = __msa_ilvr_h(dst9, dst8);    /* reuse as next-iter history */
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);    /* dst2 reused */
        dst21_r = __msa_ilvr_h(dst2, dst9);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
        /* first 4 rows: weight/round/offset/clamp, pack, store */
        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
                               weight_vec, offset_vec, rnd_vec,
                               dst0_r, dst1_r, dst2_r, dst3_r);
        HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        /* last 4 rows */
        HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst6_r, dst7_r,
                               weight_vec, offset_vec, rnd_vec,
                               dst4_r, dst5_r, dst6_r, dst7_r);
        HEVC_PCK_SW_SB4(dst5_r, dst4_r, dst7_r, dst6_r, dst0_r);
        ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
/* Dispatcher for 4-wide 2-D 4-tap weighted prediction: picks the
 * specialised kernel by block height (2, 4, or any multiple of 8).
 * NOTE(review): remaining parameters, opening brace, the first height
 * test, and the trailing call arguments are missing from this extract.
 */
static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
        hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, height, weight,
    } else if (4 == height) {
        hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
                                  filter_x, filter_y, height, weight,
    } else if (0 == (height % 8)) {
        hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                          filter_x, filter_y, height, weight,
/* HEVC 2-D 4-tap filter with uni-directional weighted prediction for
 * 6-wide blocks.  Like the 4-wide variants but keeps both right and left
 * halves of each interleave (ILVRL) so 8 intermediate columns are
 * filtered; only 6 are stored per row (ST6x4_UB).  4 rows per loop
 * iteration; dst10/dst21 and dst2 are recycled to carry history.
 * NOTE(review): remaining parameters, opening brace, and a few local
 * declarations/accumulator seeds are missing from this extract.
 */
static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
                                     const int8_t *filter_x,
                                     const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    /* step back one row and one column so the 4-tap window is centred */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
    const_vec = __msa_ldi_h(128);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    /* horizontal pass over the 3 prologue rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);
        /* rows 3..6: horizontal filter then vertical 4-tap on r/l halves */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);    /* reuse as history */
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);    /* dst2 reused */
        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
        /* weight/round/offset/clamp all 4 rows, pack and store 6x4 */
        HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                               weight_vec, offset_vec, rnd_vec,
                               dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
                               weight_vec, offset_vec, rnd_vec,
                               dst2_r, dst3_r, dst2_l, dst3_l);
        HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
        ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
        dst += (4 * dst_stride);
/* HEVC 2-D 4-tap filter with uni-directional weighted prediction, 8x2
 * block.  Horizontal pass builds 16-bit intermediates dst0..4; the
 * vertical pass runs HEVC_FILT_4TAP on both halves (r/l) of the
 * interleaved pairs.  Results are weighted/rounded/offset/clamped by
 * HEVC_HV_UNIW_RND_CLIP4, packed and stored as two 8-pixel rows.
 * NOTE(review): remaining parameters, opening brace, and a few local
 * declarations/accumulator seeds are missing from this extract.
 */
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* step back one row and one column so the 4-tap window is centred */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
    const_vec = __msa_ldi_h(128);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    /* horizontal pass over the 3 prologue rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);
    /* row 3 -> first output row (both halves) */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    /* row 4 -> second output row */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    /* weight/round/offset/clamp, pack, store 8x2 */
    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
    HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
    ST8x2_UB(dst0_r, dst, dst_stride);
    dst += (2 * dst_stride);
/* HEVC 2-D 4-tap filter with uni-directional weighted prediction, 8x6
 * block (fully unrolled).  Rows are loaded in 2-row pairs (src3/4, 5/6,
 * 7/8); each new intermediate row is interleaved with the previous one
 * and vertically filtered into dst0_r/l .. dst5_r/l.  All six rows are
 * weighted/rounded/offset/clamped, packed with HEVC_PCK_SW_SB12 and
 * stored as 4 + 2 rows.
 * NOTE(review): remaining parameters, opening brace, and a few local
 * declarations/accumulator seeds are missing from this extract.
 */
static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
                                      const int8_t *filter_x,
                                      const int8_t *filter_y,
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    /* step back one row and one column so the 4-tap window is centred */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
    const_vec = __msa_ldi_h(128);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    /* horizontal pass over the 3 prologue rows */
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    /* rows 3/4 -> output rows 0/1 */
    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);
    XORI_B2_128_SB(src3, src4);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    /* rows 5/6 -> output rows 2/3 */
    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);
    XORI_B2_128_SB(src5, src6);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    /* rows 7/8 -> output rows 4/5 */
    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);
    XORI_B2_128_SB(src7, src8);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    /* weight/round/offset/clamp all 6 rows, pack and store 4 + 2 rows */
    HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst0_r, dst1_r, dst0_l, dst1_l);
    HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst2_r, dst3_r, dst2_l, dst3_l);
    HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst4_l, dst5_l,
                           weight_vec, offset_vec, rnd_vec,
                           dst4_r, dst5_r, dst4_l, dst5_l);
    HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
                     dst2_l, dst2_r, dst3_l, dst3_r,
                     dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
    ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(dst2_r, dst, dst_stride);
/* HEVC 2-D 4-tap filter with uni-directional weighted prediction for
 * blocks whose width is a multiple of 8 and height a multiple of 4.
 * Outer loop walks 8-wide column strips (cnt = width >> 3) through
 * src_tmp/dst_tmp cursors; inner loop produces 4 rows per iteration.
 * Same pipeline as the other hv kernels: horizontal shuffles + dot
 * products, ILVRL interleave, vertical HEVC_FILT_4TAP, then
 * HEVC_HV_UNIW_RND_CLIP4 weighting and packing.  dst10/dst21 and dst2
 * are recycled to carry history across inner iterations.
 * NOTE(review): remaining parameters, opening brace, src_tmp/dst_tmp
 * setup and some accumulator seeds are missing from this extract.
 */
static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
                                              const int8_t *filter_x,
                                              const int8_t *filter_y,
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v4i32 weight_vec, offset_vec, rnd_vec;
    v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    /* step back one row and one column so the 4-tap window is centred */
    src -= (src_stride + 1);
    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
    const_vec = __msa_ldi_h(128);
    weight_vec = __msa_fill_w(weight);
    offset_vec = __msa_fill_w(offset);
    rnd_vec = __msa_fill_w(rnd_val);
    for (cnt = width >> 3; cnt--;) {    /* per 8-wide column strip */
        /* horizontal pass over the 3 prologue rows of this strip */
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);
        XORI_B3_128_SB(src0, src1, src2);
        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);
            XORI_B4_128_SB(src3, src4, src5, src6);
            /* rows 3..6 -> output rows 0..3 of this strip */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);    /* reuse as history */
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);    /* dst2 reused */
            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
            /* weight/round/offset/clamp, pack, store 8x4 of this strip */
            HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst0_r, dst1_r, dst0_l, dst1_l);
            HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
                                   weight_vec, offset_vec, rnd_vec,
                                   dst2_r, dst3_r, dst2_l, dst3_l);
            HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
                            dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
            ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
/*
 * Width-8 dispatcher for the 4-tap horizontal+vertical uni-weighted
 * HEVC interpolation.  Selects a height-specialised MSA kernel: an 8x2
 * kernel, an 8x6 kernel for height 6, or the generic
 * "8 columns x multiple-of-4 rows" kernel with its trailing width
 * argument fixed to 8 when the height is a multiple of 4.
 * NOTE(review): this excerpt omits several original lines (some
 * parameters, the first branch's condition, closing braces); the code
 * below is kept byte-identical to the excerpt.
 */
4776 static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
4780                                      const int8_t *filter_x,
4781                                      const int8_t *filter_y,
/* NOTE(review): the condition guarding this first branch is not visible
   in this excerpt; presumably height == 2 -- confirm in the full source */
4789     hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
4790                               filter_x, filter_y, height, weight,
4792     } else if (6 == height) {
4793         hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
4794                                   filter_x, filter_y, height, weight,
4796     } else if (0 == (height % 4)) {
         /* trailing argument 8 is the processing width */
4797         hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4798                                           filter_x, filter_y, height, weight,
4799                                           offset, rnd_val, 8);
/*
 * Width-12 dispatcher: the block is processed as a width-8 run through
 * the generic 8-column kernel plus a width-4 run over the remaining
 * columns (src + 8 / dst + 8) through the 4-column kernel.
 * NOTE(review): parameter lines and closing braces are missing from this
 * excerpt; code kept byte-identical.
 */
4803 static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
4807                                       const int8_t *filter_x,
4808                                       const int8_t *filter_y,
     /* left 8 columns */
4814     hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4815                                       filter_x, filter_y, height, weight,
4816                                       offset, rnd_val, 8);
     /* right 4 columns, advanced 8 pixels into both src and dst rows */
4817     hevc_hv_uniwgt_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
4818                              filter_x, filter_y, height, weight,
/*
 * Width-16 dispatcher: forwards directly to the generic multiple-of-8
 * column kernel with its width argument set to 16.
 * NOTE(review): some parameter lines and braces are missing from this
 * excerpt; code kept byte-identical.
 */
4822 static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
4826                                       const int8_t *filter_x,
4827                                       const int8_t *filter_y,
4833     hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4834                                       filter_x, filter_y, height, weight,
4835                                       offset, rnd_val, 16);
/*
 * Width-24 dispatcher: forwards directly to the generic multiple-of-8
 * column kernel with its width argument set to 24.
 * NOTE(review): some parameter lines and braces are missing from this
 * excerpt; code kept byte-identical.
 */
4838 static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
4842                                       const int8_t *filter_x,
4843                                       const int8_t *filter_y,
4849     hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4850                                       filter_x, filter_y, height, weight,
4851                                       offset, rnd_val, 24);
/*
 * Width-32 dispatcher: forwards directly to the generic multiple-of-8
 * column kernel with its width argument set to 32.
 * NOTE(review): some parameter lines and braces are missing from this
 * excerpt; code kept byte-identical.
 */
4854 static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
4858                                       const int8_t *filter_x,
4859                                       const int8_t *filter_y,
4865     hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4866                                       filter_x, filter_y, height, weight,
4867                                       offset, rnd_val, 32);
/*
 * UNIWGT_MC_COPY(WIDTH): emits the public entry point
 * ff_hevc_put_hevc_uni_w_pel_pixels<WIDTH>_8_msa, which derives the
 * rounding shift from the weight denominator (denom + 14 - 8 for 8-bit
 * content) and forwards to the width-specialised copy kernel
 * hevc_uniwgt_copy_<WIDTH>w_msa.
 * NOTE(review): interior macro lines (remaining parameters, body braces)
 * and the per-width instantiations between the #define and the #undef
 * are missing from this excerpt; no comments are inserted inside the
 * macro to avoid disturbing the backslash continuations.
 */
4870 #define UNIWGT_MC_COPY(WIDTH) \
4871 void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4872 ptrdiff_t dst_stride, \
4874 ptrdiff_t src_stride, \
4883 int shift = denom + 14 - 8; \
4884 hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
4885 height, weight, offset, shift); \
4898 #undef UNIWGT_MC_COPY
/*
 * UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR): emits the public
 * uni-weighted motion-compensation entry point for one pel type
 * (qpel/epel), one 1-D direction (h/v) and one width.  It looks up the
 * subpel filter from ff_hevc_<PEL>_filters using the fractional-position
 * component named by FILT_DIR (mx or my), derives the shift
 * (denom + 14 - 8) and calls hevc_<hz|vt>_uniwgt_<TAP>t_<WIDTH>w_msa.
 * NOTE(review): several macro body lines are missing from this excerpt;
 * no comments are inserted inside the macro to keep the backslash
 * continuations intact.
 */
4900 #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4901 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4915 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4916 int shift = denom + 14 - 8; \
4918 hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
4919 dst_stride, filter, height, \
4920 weight, offset, shift); \
/* 8-tap qpel, horizontal filter (mx component), widths 4..64 */
4923 UNI_W_MC(qpel, h, 4, 8, hz, mx);
4924 UNI_W_MC(qpel, h, 8, 8, hz, mx);
4925 UNI_W_MC(qpel, h, 12, 8, hz, mx);
4926 UNI_W_MC(qpel, h, 16, 8, hz, mx);
4927 UNI_W_MC(qpel, h, 24, 8, hz, mx);
4928 UNI_W_MC(qpel, h, 32, 8, hz, mx);
4929 UNI_W_MC(qpel, h, 48, 8, hz, mx);
4930 UNI_W_MC(qpel, h, 64, 8, hz, mx);
/* 8-tap qpel, vertical filter (my component), widths 4..64 */
4932 UNI_W_MC(qpel, v, 4, 8, vt, my);
4933 UNI_W_MC(qpel, v, 8, 8, vt, my);
4934 UNI_W_MC(qpel, v, 12, 8, vt, my);
4935 UNI_W_MC(qpel, v, 16, 8, vt, my);
4936 UNI_W_MC(qpel, v, 24, 8, vt, my);
4937 UNI_W_MC(qpel, v, 32, 8, vt, my);
4938 UNI_W_MC(qpel, v, 48, 8, vt, my);
4939 UNI_W_MC(qpel, v, 64, 8, vt, my);
/* 4-tap epel, horizontal filter, widths 4..32 */
4941 UNI_W_MC(epel, h, 4, 4, hz, mx);
4942 UNI_W_MC(epel, h, 6, 4, hz, mx);
4943 UNI_W_MC(epel, h, 8, 4, hz, mx);
4944 UNI_W_MC(epel, h, 12, 4, hz, mx);
4945 UNI_W_MC(epel, h, 16, 4, hz, mx);
4946 UNI_W_MC(epel, h, 24, 4, hz, mx);
4947 UNI_W_MC(epel, h, 32, 4, hz, mx);
/* 4-tap epel, vertical filter, widths 4..32 */
4949 UNI_W_MC(epel, v, 4, 4, vt, my);
4950 UNI_W_MC(epel, v, 6, 4, vt, my);
4951 UNI_W_MC(epel, v, 8, 4, vt, my);
4952 UNI_W_MC(epel, v, 12, 4, vt, my);
4953 UNI_W_MC(epel, v, 16, 4, vt, my);
4954 UNI_W_MC(epel, v, 24, 4, vt, my);
4955 UNI_W_MC(epel, v, 32, 4, vt, my);
/*
 * UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1): emits the public uni-weighted
 * 2-D (horizontal + vertical) motion-compensation entry point.  Both
 * subpel filters are fetched from ff_hevc_<PEL>_filters (indexed by
 * mx - 1 and my - 1), the shift is denom + 14 - 8, and control passes to
 * hevc_hv_uniwgt_<TAP>t_<WIDTH>w_msa.
 * NOTE(review): some macro body lines are missing from this excerpt; no
 * comments are inserted inside the macro to keep the backslash
 * continuations intact.
 */
4959 #define UNI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
4960 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4974 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4975 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4976 int shift = denom + 14 - 8; \
4978 hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
4979 dst_stride, filter_x, \
4980 filter_y, height, weight, \
/* 8-tap qpel, combined horizontal+vertical, widths 4..64 */
4984 UNI_W_MC_HV(qpel, hv, 4, 8, hv);
4985 UNI_W_MC_HV(qpel, hv, 8, 8, hv);
4986 UNI_W_MC_HV(qpel, hv, 12, 8, hv);
4987 UNI_W_MC_HV(qpel, hv, 16, 8, hv);
4988 UNI_W_MC_HV(qpel, hv, 24, 8, hv);
4989 UNI_W_MC_HV(qpel, hv, 32, 8, hv);
4990 UNI_W_MC_HV(qpel, hv, 48, 8, hv);
4991 UNI_W_MC_HV(qpel, hv, 64, 8, hv);
/* 4-tap epel, combined horizontal+vertical, widths 4..32 */
4993 UNI_W_MC_HV(epel, hv, 4, 4, hv);
4994 UNI_W_MC_HV(epel, hv, 6, 4, hv);
4995 UNI_W_MC_HV(epel, hv, 8, 4, hv);
4996 UNI_W_MC_HV(epel, hv, 12, 4, hv);
4997 UNI_W_MC_HV(epel, hv, 16, 4, hv);
4998 UNI_W_MC_HV(epel, hv, 24, 4, hv);
4999 UNI_W_MC_HV(epel, hv, 32, 4, hv);