/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"
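/*
 * HEVC bi-directional weighted prediction (explicit weighted bi-pred) for
 * MIPS MSA.  Each function combines a block derived from the 8-bit
 * reference in src0_ptr (copied or 8-tap filtered, kept at 16-bit
 * intermediate precision) with the pre-computed int16 intermediate block in
 * src1_ptr.  In scalar terms each output pixel is roughly
 *
 *     out = clip_0_255((p0 * weight0 + p1 * weight1 +
 *                      ((offset0 + offset1 + 1) << rnd_val)) >> (rnd_val + 1))
 *
 * where p0 comes from src1_ptr, p1 from src0_ptr, and rnd_val plays the
 * role of log2Wd in the HEVC weighted-prediction equations.
 */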
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
};
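/*
 * The HEVC_BIW_RND_CLIP* macros perform the weighted combination on vectors
 * of eight 16-bit samples: ILVR/ILVL pair up corresponding samples of the
 * two inputs, __msa_dpadd_s_w multiplies each pair by the packed
 * (weight0, weight1) halfwords in wgt and accumulates onto the 32-bit
 * offset, SRAR shifts right with rounding by rnd (the callers pass
 * rnd_val + 1), PCKEV packs the results back to 16 bits and the output is
 * clipped to [0, 255].
 */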
#define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset,    \
                           out0, out1)                                \
{                                                                     \
    v4i32 out0_r, out1_r, out0_l, out1_l;                             \
                                                                      \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                 \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                 \
                                                                      \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);    \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);    \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);    \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);    \
                                                                      \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                  \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);          \
    CLIP_SH2_0_255(out0, out1);                                       \
}
#define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3,       \
                           wgt, rnd, offset, out0, out1, out2, out3)         \
{                                                                            \
    HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, out0, out1);  \
    HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, out2, out3);  \
}
#define HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd,   \
                                    offset, out0, out1)               \
{                                                                     \
    v4i32 out0_r, out1_r, out0_l, out1_l;                             \
                                                                      \
    ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r);                 \
    ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l);                 \
    out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt);    \
    out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt);    \
    out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt);    \
    out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt);    \
    SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd);                  \
    PCKEV_H2_SH(out0_l, out0_r, out1_l, out1_r, out0, out1);          \
    CLIP_SH2_0_255_MAX_SATU(out0, out1);                              \
}
#define HEVC_BIW_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,   \
                                    vec3, wgt, rnd, offset, out0, out1,     \
                                    out2, out3)                             \
{                                                                           \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, wgt, rnd, offset,     \
                                out0, out1);                                \
    HEVC_BIW_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, wgt, rnd, offset,     \
                                out2, out3);                                \
}
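/*
 * biwgt "copy" path: no interpolation is required, so the 8-bit source is
 * zero-extended to 16 bits and shifted left by 6 to match the precision of
 * the int16 intermediates in src1_ptr before the weighting macros above are
 * applied.
 */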
82 static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
95 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
96 uint64_t tpd0, tpd1, tpd2, tpd3;
97 int32_t offset, weight;
100 v16i8 src0 = { 0 }, src1 = { 0 };
101 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
102 v8i16 dst0, dst1, dst2, dst3, weight_vec;
103 v4i32 dst0_r, dst0_l, offset_vec, rnd_vec;
105 offset = (offset0 + offset1) << rnd_val;
106 weight0 = weight0 & 0x0000FFFF;
107 weight = weight0 | (weight1 << 16);
109 offset_vec = __msa_fill_w(offset);
110 weight_vec = (v8i16) __msa_fill_w(weight);
111 rnd_vec = __msa_fill_w(rnd_val + 1);
    if (2 == height) {
        LW2(src0_ptr, src_stride, tp0, tp1);
115 INSERT_W2_SB(tp0, tp1, src0);
116 LD2(src1_ptr, src2_stride, tpd0, tpd1);
117 INSERT_D2_SH(tpd0, tpd1, in0);
119 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
122 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
123 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, weight_vec);
124 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, weight_vec);
125 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
126 dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
127 dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
128 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
129 ST4x2_UB(out0, dst, dst_stride);
130 } else if (4 == height) {
131 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
132 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
133 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
134 INSERT_D2_SH(tpd0, tpd1, in0);
135 INSERT_D2_SH(tpd2, tpd3, in1);
136 ILVRL_B2_SH(zero, src0, dst0, dst1);
137 SLLI_2V(dst0, dst1, 6);
138 HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
139 offset_vec, dst0, dst1);
140 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
141 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
142 } else if (0 == height % 8) {
143 for (loop_cnt = (height >> 3); loop_cnt--;) {
144 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145 src0_ptr += 4 * src_stride;
146 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
147 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
148 src0_ptr += 4 * src_stride;
149 INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
150 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
151 src1_ptr += (4 * src2_stride);
152 INSERT_D2_SH(tpd0, tpd1, in0);
153 INSERT_D2_SH(tpd2, tpd3, in1);
154 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
155 src1_ptr += (4 * src2_stride);
156 INSERT_D2_SH(tpd0, tpd1, in2);
157 INSERT_D2_SH(tpd2, tpd3, in3);
158 ILVRL_B2_SH(zero, src0, dst0, dst1);
159 ILVRL_B2_SH(zero, src1, dst2, dst3);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
161 HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
162 in3, weight_vec, rnd_vec, offset_vec,
163 dst0, dst1, dst2, dst3);
164 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
165 ST4x8_UB(out0, out1, dst, dst_stride);
166 dst += (8 * dst_stride);
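/* The wider biwgt copy variants below follow the same pattern and differ
 * only in how many samples are loaded, unpacked and stored per row. */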
171 static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
185 int32_t offset, weight;
186 uint64_t tp0, tp1, tp2, tp3;
189 v16i8 src0 = { 0 }, src1 = { 0 };
190 v8i16 in0, in1, in2, in3;
191 v8i16 dst0, dst1, dst2, dst3;
192 v4i32 offset_vec, weight_vec, rnd_vec;
194 offset = (offset0 + offset1) << rnd_val;
195 weight0 = weight0 & 0x0000FFFF;
196 weight = weight0 | (weight1 << 16);
198 weight_vec = __msa_fill_w(weight);
199 offset_vec = __msa_fill_w(offset);
200 rnd_vec = __msa_fill_w(rnd_val + 1);
202 for (loop_cnt = (height >> 2); loop_cnt--;) {
203 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
204 src0_ptr += (4 * src_stride);
205 INSERT_D2_SB(tp0, tp1, src0);
206 INSERT_D2_SB(tp2, tp3, src1);
207 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
208 src1_ptr += (4 * src2_stride);
209 ILVRL_B2_SH(zero, src0, dst0, dst1);
210 ILVRL_B2_SH(zero, src1, dst2, dst3);
211 SLLI_4V(dst0, dst1, dst2, dst3, 6);
212 HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3,
214 weight_vec, rnd_vec, offset_vec,
215 dst0, dst1, dst2, dst3);
216 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
217 ST6x4_UB(out0, out1, dst, dst_stride);
218 dst += (4 * dst_stride);
222 static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
235 uint64_t tp0, tp1, tp2, tp3;
236 int32_t offset, weight;
237 v16u8 out0, out1, out2;
239 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 };
240 v8i16 in0, in1, in2, in3, in4, in5;
241 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
242 v4i32 offset_vec, weight_vec, rnd_vec;
244 offset = (offset0 + offset1) << rnd_val;
245 weight0 = weight0 & 0x0000FFFF;
246 weight = weight0 | (weight1 << 16);
248 offset_vec = __msa_fill_w(offset);
249 weight_vec = __msa_fill_w(weight);
250 rnd_vec = __msa_fill_w(rnd_val + 1);
    if (2 == height) {
        LD2(src0_ptr, src_stride, tp0, tp1);
254 INSERT_D2_SB(tp0, tp1, src0);
255 LD_SH2(src1_ptr, src2_stride, in0, in1);
256 ILVRL_B2_SH(zero, src0, dst0, dst1);
257 SLLI_2V(dst0, dst1, 6);
259 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
260 weight_vec, rnd_vec, offset_vec,
263 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
264 ST8x2_UB(out0, dst, dst_stride);
265 } else if (6 == height) {
266 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
267 src0_ptr += 4 * src_stride;
268 INSERT_D2_SB(tp0, tp1, src0);
269 INSERT_D2_SB(tp2, tp3, src1);
270 LD2(src0_ptr, src_stride, tp0, tp1);
271 INSERT_D2_SB(tp0, tp1, src2);
272 ILVRL_B2_SH(zero, src0, dst0, dst1);
273 ILVRL_B2_SH(zero, src1, dst2, dst3);
274 ILVRL_B2_SH(zero, src2, dst4, dst5);
275 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
276 SLLI_4V(dst0, dst1, dst2, dst3, 6);
277 SLLI_2V(dst4, dst5, 6);
278 HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
279 weight_vec, rnd_vec, offset_vec, dst0, dst1,
281 HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
282 offset_vec, dst4, dst5);
283 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
284 ST8x4_UB(out0, out1, dst, dst_stride);
285 dst += (4 * dst_stride);
286 ST8x2_UB(out2, dst, dst_stride);
287 } else if (0 == height % 4) {
290 for (loop_cnt = (height >> 2); loop_cnt--;) {
291 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
292 src0_ptr += (4 * src_stride);
293 INSERT_D2_SB(tp0, tp1, src0);
294 INSERT_D2_SB(tp2, tp3, src1);
295 ILVRL_B2_SH(zero, src0, dst0, dst1);
296 ILVRL_B2_SH(zero, src1, dst2, dst3);
297 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
298 src1_ptr += (4 * src2_stride);
300 SLLI_4V(dst0, dst1, dst2, dst3, 6);
301 HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2,
302 in3, weight_vec, rnd_vec, offset_vec,
303 dst0, dst1, dst2, dst3);
304 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
305 ST8x4_UB(out0, out1, dst, dst_stride);
306 dst += (4 * dst_stride);
311 static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
325 int32_t offset, weight;
327 v16u8 out0, out1, out2;
328 v16i8 src0, src1, src2, src3;
329 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
330 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
331 v4i32 offset_vec, weight_vec, rnd_vec;
333 offset = (offset0 + offset1) << rnd_val;
334 weight0 = weight0 & 0x0000FFFF;
335 weight = weight0 | (weight1 << 16);
337 offset_vec = __msa_fill_w(offset);
338 weight_vec = __msa_fill_w(weight);
339 rnd_vec = __msa_fill_w(rnd_val + 1);
341 for (loop_cnt = (16 >> 2); loop_cnt--;) {
342 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
343 src0_ptr += (4 * src_stride);
344 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
345 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
346 src1_ptr += (4 * src2_stride);
348 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
349 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
350 dst0, dst1, dst2, dst3);
352 SLLI_4V(dst0, dst1, dst2, dst3, 6);
353 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
354 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
358 HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
359 weight_vec, rnd_vec, offset_vec, dst0, dst1,
361 HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
362 offset_vec, dst4, dst5);
363 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
364 ST12x4_UB(out0, out1, out2, dst, dst_stride);
365 dst += (4 * dst_stride);
369 static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
383 int32_t offset, weight;
384 v16u8 out0, out1, out2, out3;
386 v16i8 src0, src1, src2, src3;
387 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
388 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
389 v4i32 offset_vec, weight_vec, rnd_vec;
391 offset = (offset0 + offset1) << rnd_val;
392 weight0 = weight0 & 0x0000FFFF;
393 weight = weight0 | (weight1 << 16);
395 offset_vec = __msa_fill_w(offset);
396 weight_vec = __msa_fill_w(weight);
397 rnd_vec = __msa_fill_w(rnd_val + 1);
399 for (loop_cnt = (height >> 2); loop_cnt--;) {
400 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
401 src0_ptr += (4 * src_stride);
402 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
403 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
404 src1_ptr += (4 * src2_stride);
405 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
407 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
409 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
410 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
411 HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp1, tmp4, tmp5, in0, in1, in4, in5,
412 weight_vec, rnd_vec, offset_vec, tmp0, tmp1,
414 HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp3, tmp6, tmp7, in2, in3, in6, in7,
415 weight_vec, rnd_vec, offset_vec, tmp2, tmp3,
417 PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
418 PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
419 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
420 dst += (4 * dst_stride);
424 static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
438 int32_t offset, weight;
439 v16u8 out0, out1, out2, out3, out4, out5;
440 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
441 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
442 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
443 v4i32 offset_vec, weight_vec, rnd_vec;
445 offset = (offset0 + offset1) << rnd_val;
446 weight0 = weight0 & 0x0000FFFF;
447 weight = weight0 | (weight1 << 16);
449 offset_vec = __msa_fill_w(offset);
450 weight_vec = __msa_fill_w(weight);
451 rnd_vec = __msa_fill_w(rnd_val + 1);
453 for (loop_cnt = 8; loop_cnt--;) {
454 LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
455 LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
456 src0_ptr += (4 * src_stride);
457 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
458 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
459 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
460 src1_ptr += (4 * src2_stride);
462 ILVRL_B2_SH(zero, src0, dst0, dst1);
463 ILVRL_B2_SH(zero, src1, dst2, dst3);
464 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
465 ILVRL_B2_SH(zero, src4, dst6, dst7);
466 ILVRL_B2_SH(zero, src5, dst8, dst9);
467 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
468 SLLI_4V(dst0, dst1, dst2, dst3, 6);
469 SLLI_4V(dst4, dst5, dst6, dst7, 6);
470 SLLI_4V(dst8, dst9, dst10, dst11, 6);
471 HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in4, in1, in5,
472 weight_vec, rnd_vec, offset_vec, dst0, dst1,
474 HEVC_BIW_RND_CLIP4_MAX_SATU(dst4, dst5, dst6, dst7, in8, in9, in2, in6,
475 weight_vec, rnd_vec, offset_vec, dst4, dst5,
477 HEVC_BIW_RND_CLIP4_MAX_SATU(dst8, dst9, dst10, dst11, in3, in7, in10,
478 in11, weight_vec, rnd_vec, offset_vec,
479 dst8, dst9, dst10, dst11);
480 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
481 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
482 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
483 ST8x4_UB(out2, out5, dst + 16, dst_stride);
484 dst += (4 * dst_stride);
488 static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
502 int32_t offset, weight;
503 v16u8 out0, out1, out2, out3;
505 v16i8 src0, src1, src2, src3;
506 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
507 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
508 v4i32 offset_vec, weight_vec, rnd_vec;
510 offset = (offset0 + offset1) << rnd_val;
511 weight0 = weight0 & 0x0000FFFF;
512 weight = weight0 | (weight1 << 16);
514 offset_vec = __msa_fill_w(offset);
515 weight_vec = __msa_fill_w(weight);
516 rnd_vec = __msa_fill_w(rnd_val + 1);
518 for (loop_cnt = (height >> 1); loop_cnt--;) {
519 LD_SB2(src0_ptr, 16, src0, src1);
520 src0_ptr += src_stride;
521 LD_SB2(src0_ptr, 16, src2, src3);
522 src0_ptr += src_stride;
523 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
524 src1_ptr += src2_stride;
525 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
526 src1_ptr += src2_stride;
528 ILVRL_B2_SH(zero, src0, tmp0, tmp4);
529 ILVRL_B2_SH(zero, src1, tmp1, tmp5);
530 ILVRL_B2_SH(zero, src2, tmp2, tmp6);
531 ILVRL_B2_SH(zero, src3, tmp3, tmp7);
532 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
533 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
534 HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
535 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
537 HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
538 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
540 PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
541 PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
542 ST_UB2(out0, out1, dst, 16);
544 ST_UB2(out2, out3, dst, 16);
549 static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
563 int32_t offset, weight;
564 v16u8 out0, out1, out2;
565 v16i8 src0, src1, src2;
567 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, in0, in1, in2, in3, in4, in5;
568 v4i32 offset_vec, weight_vec, rnd_vec;
570 offset = (offset0 + offset1) << rnd_val;
571 weight0 = weight0 & 0x0000FFFF;
572 weight = weight0 | (weight1 << 16);
574 offset_vec = __msa_fill_w(offset);
575 weight_vec = __msa_fill_w(weight);
576 rnd_vec = __msa_fill_w(rnd_val + 1);
578 for (loop_cnt = 64; loop_cnt--;) {
579 LD_SB3(src0_ptr, 16, src0, src1, src2);
580 src0_ptr += src_stride;
581 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
582 src1_ptr += src2_stride;
584 ILVRL_B2_SH(zero, src0, dst0, dst1);
585 ILVRL_B2_SH(zero, src1, dst2, dst3);
586 ILVRL_B2_SH(zero, src2, dst4, dst5);
587 SLLI_4V(dst0, dst1, dst2, dst3, 6);
588 SLLI_2V(dst4, dst5, 6);
589 HEVC_BIW_RND_CLIP4_MAX_SATU(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
590 weight_vec, rnd_vec, offset_vec, dst0, dst1,
592 HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
593 offset_vec, dst4, dst5);
594 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
595 ST_UB2(out0, out1, dst, 16);
596 ST_UB(out2, dst + 32);
601 static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
615 int32_t offset, weight;
616 v16u8 out0, out1, out2, out3;
618 v16i8 src0, src1, src2, src3;
619 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
620 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
621 v4i32 offset_vec, weight_vec, rnd_vec;
623 offset = (offset0 + offset1) << rnd_val;
624 weight0 = weight0 & 0x0000FFFF;
625 weight = weight0 | (weight1 << 16);
627 offset_vec = __msa_fill_w(offset);
628 weight_vec = __msa_fill_w(weight);
629 rnd_vec = __msa_fill_w(rnd_val + 1);
631 for (loop_cnt = height; loop_cnt--;) {
632 LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
633 src0_ptr += src_stride;
634 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
635 src1_ptr += src2_stride;
637 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp0, tmp1,
639 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, tmp4, tmp5,
641 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
642 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
643 HEVC_BIW_RND_CLIP4_MAX_SATU(tmp0, tmp4, tmp1, tmp5, in0, in1, in2, in3,
644 weight_vec, rnd_vec, offset_vec, tmp0, tmp4,
646 HEVC_BIW_RND_CLIP4_MAX_SATU(tmp2, tmp6, tmp3, tmp7, in4, in5, in6, in7,
647 weight_vec, rnd_vec, offset_vec, tmp2, tmp6,
649 PCKEV_B2_UB(tmp4, tmp0, tmp5, tmp1, out0, out1);
650 PCKEV_B2_UB(tmp6, tmp2, tmp7, tmp3, out2, out3);
651 ST_UB4(out0, out1, out2, out3, dst, 16);
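/*
 * Horizontal 8-tap bi-weighted paths: XORI_B*_128 converts the unsigned
 * pixels to signed bytes (a -128 bias), VSHF_B4 with the ff_hevc_mask_arr
 * masks gathers the eight taps for every output sample and
 * HEVC_FILT_8TAP_SH does the signed-byte dot products with filt0..filt3.
 * The bias is compensated by folding a 128 * weight1 term (the "constant"
 * below, scaled by the filter gain) back into the offset.
 */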
656 static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
662 const int8_t *filter,
671 int32_t offset, weight, constant;
672 v8i16 filt0, filt1, filt2, filt3;
673 v16i8 src0, src1, src2, src3;
674 v16i8 mask1, mask2, mask3;
675 v16i8 vec0, vec1, vec2, vec3;
677 v8i16 in0, in1, in2, in3;
678 v8i16 filter_vec, out0, out1;
679 v4i32 weight_vec, offset_vec, rnd_vec;
680 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
683 filter_vec = LD_SH(filter);
684 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
690 offset = (offset0 + offset1) << rnd_val;
691 weight0 = weight0 & 0x0000FFFF;
692 weight = weight0 | (weight1 << 16);
693 constant = 128 * weight1;
697 offset_vec = __msa_fill_w(offset);
698 weight_vec = __msa_fill_w(weight);
699 rnd_vec = __msa_fill_w(rnd_val + 1);
701 for (loop_cnt = (height >> 2); loop_cnt--;) {
702 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
703 src0_ptr += (4 * src_stride);
704 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
705 src1_ptr += (4 * src2_stride);
706 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
707 XORI_B4_128_SB(src0, src1, src2, src3);
709 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
710 vec0, vec1, vec2, vec3);
711 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
713 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
714 vec0, vec1, vec2, vec3);
715 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
718 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
719 weight_vec, rnd_vec, offset_vec,
722 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
723 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
724 dst += (4 * dst_stride);
728 static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
734 const int8_t *filter,
743 int32_t offset, weight, constant;
744 v8i16 filt0, filt1, filt2, filt3;
745 v16i8 src0, src1, src2, src3;
746 v16i8 mask1, mask2, mask3;
747 v16i8 vec0, vec1, vec2, vec3;
748 v8i16 dst0, dst1, dst2, dst3;
749 v8i16 in0, in1, in2, in3;
750 v8i16 filter_vec, out0, out1, out2, out3;
751 v4i32 weight_vec, offset_vec, rnd_vec;
752 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
755 offset = (offset0 + offset1) << rnd_val;
756 weight0 = weight0 & 0x0000FFFF;
757 weight = weight0 | (weight1 << 16);
758 constant = 128 * weight1;
762 offset_vec = __msa_fill_w(offset);
763 weight_vec = __msa_fill_w(weight);
764 rnd_vec = __msa_fill_w(rnd_val + 1);
766 filter_vec = LD_SH(filter);
767 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
773 for (loop_cnt = (height >> 2); loop_cnt--;) {
774 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
775 src0_ptr += (4 * src_stride);
776 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
777 src1_ptr += (4 * src2_stride);
778 XORI_B4_128_SB(src0, src1, src2, src3);
780 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
781 vec0, vec1, vec2, vec3);
782 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
784 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
785 vec0, vec1, vec2, vec3);
786 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
788 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
789 vec0, vec1, vec2, vec3);
790 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
792 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
793 vec0, vec1, vec2, vec3);
794 dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
797 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
799 weight_vec, rnd_vec, offset_vec,
800 out0, out1, out2, out3);
802 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
803 ST8x4_UB(out0, out1, dst, dst_stride);
804 dst += (4 * dst_stride);
808 static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
814 const int8_t *filter,
823 int32_t offset, weight, constant;
824 v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
825 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
826 v8i16 filt0, filt1, filt2, filt3, out0, out1, out2, out3;
827 v8i16 dst0, dst1, dst2, dst3, in0, in1, in2, in3, filter_vec;
828 v4i32 weight_vec, offset_vec, rnd_vec;
832 weight0 = weight0 & 0x0000FFFF;
833 weight = weight0 | (weight1 << 16);
834 constant = 128 * weight1;
836 offset = (offset0 + offset1) << rnd_val;
839 offset_vec = __msa_fill_w(offset);
840 weight_vec = __msa_fill_w(weight);
841 rnd_vec = __msa_fill_w(rnd_val + 1);
843 filter_vec = LD_SH(filter);
844 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
846 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
850 mask4 = LD_SB(&ff_hevc_mask_arr[16]);
855 for (loop_cnt = 4; loop_cnt--;) {
856 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
857 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
858 XORI_B4_128_SB(src0, src1, src2, src3);
859 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
861 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
863 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
865 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
867 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
869 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
871 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
873 dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
875 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
876 weight_vec, rnd_vec, offset_vec, out0, out1, out2,
878 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
879 ST8x4_UB(out0, out1, dst, dst_stride);
881 LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
882 src0_ptr += (4 * src_stride);
883 LD_SH4(src1_ptr + 8, src2_stride, in0, in1, in2, in3);
884 src1_ptr += (4 * src2_stride);
885 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
886 XORI_B4_128_SB(src0, src1, src2, src3);
887 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
889 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
891 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
893 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
895 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
896 offset_vec, out0, out1);
897 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
898 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
899 dst += (4 * dst_stride);
903 static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
909 const int8_t *filter,
918 int32_t offset, weight, constant;
919 v16i8 src0, src1, src2, src3;
920 v8i16 in0, in1, in2, in3;
921 v8i16 filt0, filt1, filt2, filt3;
922 v16i8 mask1, mask2, mask3;
923 v8i16 filter_vec, out0, out1, out2, out3;
924 v16i8 vec0, vec1, vec2, vec3;
925 v8i16 dst0, dst1, dst2, dst3;
926 v4i32 weight_vec, offset_vec, rnd_vec;
927 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
930 offset = (offset0 + offset1) << rnd_val;
931 weight0 = weight0 & 0x0000FFFF;
932 weight = weight0 | (weight1 << 16);
933 constant = 128 * weight1;
937 offset_vec = __msa_fill_w(offset);
938 weight_vec = __msa_fill_w(weight);
939 rnd_vec = __msa_fill_w(rnd_val + 1);
941 filter_vec = LD_SH(filter);
942 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
948 for (loop_cnt = (height >> 1); loop_cnt--;) {
949 LD_SB2(src0_ptr, 8, src0, src1);
950 src0_ptr += src_stride;
951 LD_SB2(src0_ptr, 8, src2, src3);
952 src0_ptr += src_stride;
953 LD_SH2(src1_ptr, 8, in0, in1);
954 src1_ptr += src2_stride;
955 LD_SH2(src1_ptr, 8, in2, in3);
956 src1_ptr += src2_stride;
957 XORI_B4_128_SB(src0, src1, src2, src3);
959 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
960 vec0, vec1, vec2, vec3);
961 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
963 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
964 vec0, vec1, vec2, vec3);
965 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
967 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
968 vec0, vec1, vec2, vec3);
969 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
971 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
972 vec0, vec1, vec2, vec3);
973 dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
976 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
978 weight_vec, rnd_vec, offset_vec,
979 out0, out1, out2, out3);
981 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
982 ST_SH2(out0, out1, dst, dst_stride);
983 dst += (2 * dst_stride);
987 static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
993 const int8_t *filter,
1003 int32_t offset, weight, constant;
1005 v8i16 in0, in1, in2;
1006 v8i16 filt0, filt1, filt2, filt3;
1007 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1008 v16i8 vec0, vec1, vec2, vec3;
1009 v8i16 dst0, dst1, dst2;
1010 v4i32 dst2_r, dst2_l;
1011 v8i16 filter_vec, out0, out1, out2;
1012 v4i32 weight_vec, offset_vec, rnd_vec;
1013 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1015 src0_ptr = src0_ptr - 3;
1016 offset = (offset0 + offset1) << rnd_val;
1017 weight0 = weight0 & 0x0000FFFF;
1018 weight = weight0 | (weight1 << 16);
1019 constant = 128 * weight1;
1023 offset_vec = __msa_fill_w(offset);
1024 weight_vec = __msa_fill_w(weight);
1025 rnd_vec = __msa_fill_w(rnd_val + 1);
1027 filter_vec = LD_SH(filter);
1028 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1038 LD_SB2(src0_ptr, 16, src0, src1);
1039 src0_ptr += src_stride;
1040 LD_SH2(src1_ptr, 8, in0, in1);
1041 in2 = LD_SH(src1_ptr + 16);
1042 src1_ptr += src2_stride;
1043 XORI_B2_128_SB(src0, src1);
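    /* Software-pipelined row loop: the loads and XORI for the next row are
     * issued while the current row is filtered; the last row is finished
     * after the loop. */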
1045 for (loop_cnt = 31; loop_cnt--;) {
1046 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1047 vec0, vec1, vec2, vec3);
1048 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1050 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1051 vec0, vec1, vec2, vec3);
1052 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1054 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1055 vec0, vec1, vec2, vec3);
1056 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1059 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
1060 weight_vec, rnd_vec, offset_vec,
1063 ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1064 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1065 (v8i16) weight_vec);
1066 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1067 (v8i16) weight_vec);
1068 SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1069 dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1070 out2 = CLIP_SH_0_255(dst2_r);
1072 LD_SB2(src0_ptr, 16, src0, src1);
1073 src0_ptr += src_stride;
1074 LD_SH2(src1_ptr, 8, in0, in1);
1075 in2 = LD_SH(src1_ptr + 16);
1076 src1_ptr += src2_stride;
1077 XORI_B2_128_SB(src0, src1);
1078 PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1079 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1081 SD(dst_val0, dst + 16);
1085 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1086 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1088 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1089 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1091 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1092 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1094 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec, offset_vec,
1096 ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1097 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r, (v8i16) weight_vec);
1098 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l, (v8i16) weight_vec);
1099 SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1100 dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1101 out2 = CLIP_SH_0_255(dst2_r);
1102 PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1103 dst_val0 = __msa_copy_u_d((v2i64) out2, 0);
1105 SD(dst_val0, dst + 16);
1109 static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1112 int32_t src2_stride,
1115 const int8_t *filter,
1124 int32_t offset, weight, constant;
1125 v16i8 src0, src1, src2;
1126 v8i16 in0, in1, in2, in3;
1127 v8i16 filt0, filt1, filt2, filt3;
1128 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1129 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1130 v16i8 vec0, vec1, vec2, vec3;
1131 v8i16 dst0, dst1, dst2, dst3;
1132 v8i16 filter_vec, out0, out1, out2, out3;
1133 v4i32 weight_vec, offset_vec, rnd_vec;
1136 offset = (offset0 + offset1) << rnd_val;
1137 weight0 = weight0 & 0x0000FFFF;
1138 weight = weight0 | (weight1 << 16);
1139 constant = 128 * weight1;
1143 offset_vec = __msa_fill_w(offset);
1144 weight_vec = __msa_fill_w(weight);
1145 rnd_vec = __msa_fill_w(rnd_val + 1);
1147 filter_vec = LD_SH(filter);
1148 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1158 for (loop_cnt = height; loop_cnt--;) {
1159 LD_SB2(src0_ptr, 16, src0, src1);
1160 src2 = LD_SB(src0_ptr + 24);
1161 src0_ptr += src_stride;
1162 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1163 src1_ptr += src2_stride;
1165 XORI_B3_128_SB(src0, src1, src2);
1167 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1168 vec0, vec1, vec2, vec3);
1169 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1171 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1172 vec0, vec1, vec2, vec3);
1173 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1175 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1176 vec0, vec1, vec2, vec3);
1177 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1179 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1180 vec0, vec1, vec2, vec3);
1181 dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1184 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1186 weight_vec, rnd_vec, offset_vec,
1187 out0, out1, out2, out3);
1189 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1190 ST_SH2(out0, out1, dst, 16);
1195 static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1198 int32_t src2_stride,
1201 const int8_t *filter,
1210 int32_t offset, weight, constant;
1211 v16i8 src0, src1, src2, src3, src4;
1212 v8i16 in0, in1, in2, in3;
1213 v8i16 filt0, filt1, filt2, filt3;
1214 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1215 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1216 v16i8 vec0, vec1, vec2, vec3;
1217 v8i16 dst0, dst1, dst2, dst3;
1218 v8i16 filter_vec, out0, out1, out2, out3;
1219 v4i32 weight_vec, offset_vec, rnd_vec;
1222 offset = (offset0 + offset1) << rnd_val;
1223 weight0 = weight0 & 0x0000FFFF;
1224 weight = weight0 | (weight1 << 16);
1225 constant = 128 * weight1;
1229 offset_vec = __msa_fill_w(offset);
1230 weight_vec = __msa_fill_w(weight);
1231 rnd_vec = __msa_fill_w(rnd_val + 1);
1233 filter_vec = LD_SH(filter);
1234 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1244 for (loop_cnt = 64; loop_cnt--;) {
1245 LD_SB2(src0_ptr, 16, src0, src1);
1246 src2 = LD_SB(src0_ptr + 24);
1247 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1248 XORI_B3_128_SB(src0, src1, src2);
1249 LD_SB2(src0_ptr + 32, 8, src3, src4);
1250 src0_ptr += src_stride;
1251 XORI_B2_128_SB(src3, src4);
1253 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1254 vec0, vec1, vec2, vec3);
1255 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1257 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1258 vec0, vec1, vec2, vec3);
1259 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1261 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1262 vec0, vec1, vec2, vec3);
1263 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1265 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1266 vec0, vec1, vec2, vec3);
1267 dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1270 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3, in0, in1, in2, in3,
1271 weight_vec, rnd_vec, offset_vec,
1272 out0, out1, out2, out3);
1274 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1275 ST_SH2(out0, out1, dst, 16);
1277 LD_SH2(src1_ptr + 32, 8, in2, in3);
1278 src1_ptr += src2_stride;
1280 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1281 vec0, vec1, vec2, vec3);
1282 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1284 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1285 vec0, vec1, vec2, vec3);
1286 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1289 HEVC_BIW_RND_CLIP2(dst0, dst1, in2, in3,
1290 weight_vec, rnd_vec, offset_vec,
1293 out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1294 ST_SH(out0, dst + 32);
1299 static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1302 int32_t src2_stride,
1305 const int8_t *filter,
1313 uint8_t *src0_ptr_tmp;
1315 int16_t *src1_ptr_tmp;
1316 uint32_t loop_cnt, cnt;
1317 int32_t offset, weight, constant;
1318 v16i8 src0, src1, src2;
1319 v8i16 in0, in1, in2, in3;
1320 v8i16 filt0, filt1, filt2, filt3;
1321 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1322 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1323 v16i8 vec0, vec1, vec2, vec3;
1324 v8i16 dst0, dst1, dst2, dst3;
1325 v8i16 filter_vec, out0, out1, out2, out3;
1326 v4i32 weight_vec, offset_vec, rnd_vec;
1329 offset = (offset0 + offset1) << rnd_val;
1330 weight0 = weight0 & 0x0000FFFF;
1331 weight = weight0 | (weight1 << 16);
1332 constant = 128 * weight1;
1336 offset_vec = __msa_fill_w(offset);
1337 weight_vec = __msa_fill_w(weight);
1338 rnd_vec = __msa_fill_w(rnd_val + 1);
1340 filter_vec = LD_SH(filter);
1341 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1351 for (loop_cnt = height; loop_cnt--;) {
1352 src0_ptr_tmp = src0_ptr;
1354 src1_ptr_tmp = src1_ptr;
1356 for (cnt = 2; cnt--;) {
1357 LD_SB2(src0_ptr_tmp, 16, src0, src1);
1358 src2 = LD_SB(src0_ptr_tmp + 24);
1360 LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1362 XORI_B3_128_SB(src0, src1, src2);
1364 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1365 vec0, vec1, vec2, vec3);
1366 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1368 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1369 vec0, vec1, vec2, vec3);
1370 dst1 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1372 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1373 vec0, vec1, vec2, vec3);
1374 dst2 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1376 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1377 vec0, vec1, vec2, vec3);
1378 dst3 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1381 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1383 weight_vec, rnd_vec, offset_vec,
1384 out0, out1, out2, out3);
1386 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1387 ST_SH2(out0, out1, dst_tmp, 16);
1391 src0_ptr += src_stride;
1392 src1_ptr += src2_stride;
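/*
 * Vertical 8-tap bi-weighted paths: seven rows of context are loaded up
 * front and interleaved pairwise (src10_r, src21_r, ...) so that the
 * signed-byte dot-product instructions can run the filter down the columns.
 * As in the horizontal case, the XORI_B*_128 bias is compensated by folding
 * a 128 * weight1 term (const_vec * weight1_vec) into offset_vec.
 */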
1398 static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1401 int32_t src2_stride,
1404 const int8_t *filter,
1413 int32_t offset, weight;
1414 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1415 v16i8 src11, src12, src13, src14;
1416 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1417 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1418 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1419 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1420 v16i8 src2110, src4332, src6554, src8776, src10998;
1421 v16i8 src12111110, src14131312;
1422 v8i16 dst10, dst32, dst54, dst76;
1423 v8i16 filt0, filt1, filt2, filt3;
1424 v8i16 filter_vec, out0, out1, out2, out3;
1425 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1427 src0_ptr -= (3 * src_stride);
1428 offset = (offset0 + offset1) << rnd_val;
1429 weight0 = weight0 & 0x0000FFFF;
1430 weight = weight0 | (weight1 << 16);
1432 const_vec = __msa_ldi_w(128);
1434 offset_vec = __msa_fill_w(offset);
1435 weight_vec = __msa_fill_w(weight);
1436 rnd_vec = __msa_fill_w(rnd_val + 1);
1437 weight1_vec = __msa_fill_w(weight1);
1438 offset_vec += const_vec * weight1_vec;
1440 filter_vec = LD_SH(filter);
1441 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1443 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1444 src0_ptr += (7 * src_stride);
1446 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1447 src10_r, src32_r, src54_r, src21_r);
1448 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1449 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1450 src2110, src4332, src6554);
1451 XORI_B3_128_SB(src2110, src4332, src6554);
1453 for (loop_cnt = (height >> 3); loop_cnt--;) {
1454 LD_SB8(src0_ptr, src_stride,
1455 src7, src8, src9, src10, src11, src12, src13, src14);
1456 src0_ptr += (8 * src_stride);
1457 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1458 src1_ptr += (8 * src2_stride);
1460 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1461 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1462 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1463 src76_r, src87_r, src98_r, src109_r);
1464 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1465 src1110_r, src1211_r, src1312_r, src1413_r);
1466 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1467 src1413_r, src1312_r,
1468 src8776, src10998, src12111110, src14131312);
1469 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1471 DOTP_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt0, filt0,
1472 filt0, dst10, dst32, dst54, dst76);
1473 DPADD_SB4_SH(src4332, src6554, src8776, src10998, filt1, filt1, filt1,
1474 filt1, dst10, dst32, dst54, dst76);
1475 DPADD_SB4_SH(src6554, src8776, src10998, src12111110, filt2, filt2,
1476 filt2, filt2, dst10, dst32, dst54, dst76);
1477 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312, filt3, filt3,
1478 filt3, filt3, dst10, dst32, dst54, dst76);
1480 HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1482 weight_vec, rnd_vec, offset_vec,
1483 out0, out1, out2, out3);
1485 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1486 ST4x8_UB(out0, out1, dst, dst_stride);
1487 dst += (8 * dst_stride);
1490 src4332 = src12111110;
1491 src6554 = src14131312;
1496 static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
1499 int32_t src2_stride,
1502 const int8_t *filter,
1511 int32_t offset, weight;
1512 v16i8 src0, src1, src2, src3, src4, src5;
1513 v16i8 src6, src7, src8, src9, src10;
1514 v8i16 in0, in1, in2, in3;
1515 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1516 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1517 v8i16 tmp0, tmp1, tmp2, tmp3;
1518 v8i16 filt0, filt1, filt2, filt3;
1519 v8i16 filter_vec, out0, out1, out2, out3;
1520 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1522 src0_ptr -= (3 * src_stride);
1523 offset = (offset0 + offset1) << rnd_val;
1524 weight0 = weight0 & 0x0000FFFF;
1525 weight = weight0 | (weight1 << 16);
1527 const_vec = __msa_ldi_w(128);
1529 offset_vec = __msa_fill_w(offset);
1530 weight_vec = __msa_fill_w(weight);
1531 rnd_vec = __msa_fill_w(rnd_val + 1);
1532 weight1_vec = __msa_fill_w(weight1);
1533 offset_vec += const_vec * weight1_vec;
1535 filter_vec = LD_SH(filter);
1536 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1538 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1539 src0_ptr += (7 * src_stride);
1540 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1542 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1543 src10_r, src32_r, src54_r, src21_r);
1544 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1546 for (loop_cnt = (height >> 2); loop_cnt--;) {
1547 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1548 src0_ptr += (4 * src_stride);
1549 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1550 src1_ptr += (4 * src2_stride);
1552 XORI_B4_128_SB(src7, src8, src9, src10);
1553 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1554 src76_r, src87_r, src98_r, src109_r);
1556 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1557 filt0, tmp0, tmp1, tmp2, tmp3);
1558 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1559 filt1, tmp0, tmp1, tmp2, tmp3);
1560 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1561 filt2, tmp0, tmp1, tmp2, tmp3);
1562 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1563 filt3, tmp0, tmp1, tmp2, tmp3);
1565 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1567 weight_vec, rnd_vec, offset_vec,
1568 out0, out1, out2, out3);
1570 PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
1571 ST8x4_UB(out0, out1, dst, dst_stride);
1572 dst += (4 * dst_stride);
1584 static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
1587 int32_t src2_stride,
1590 const int8_t *filter,
1599 int32_t offset, weight;
1600 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1601 v8i16 in0, in1, in2, in3;
1602 v16i8 src10_r, src32_r, src54_r, src76_r;
1603 v16i8 src21_r, src43_r, src65_r, src87_r;
1604 v8i16 tmp0, tmp1, tmp2;
1605 v16i8 src10_l, src32_l, src54_l, src76_l;
1606 v16i8 src21_l, src43_l, src65_l, src87_l;
1607 v16i8 src2110, src4332, src6554, src8776;
1608 v8i16 filt0, filt1, filt2, filt3;
1609 v8i16 out0, out1, out2, filter_vec;
1610 v4i32 dst2_r, dst2_l;
1611 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1613 src0_ptr -= (3 * src_stride);
1614 offset = (offset0 + offset1) << rnd_val;
1615 weight0 = weight0 & 0x0000FFFF;
1616 weight = weight0 | (weight1 << 16);
1618 const_vec = __msa_ldi_w(128);
1620 offset_vec = __msa_fill_w(offset);
1621 weight_vec = __msa_fill_w(weight);
1622 rnd_vec = __msa_fill_w(rnd_val + 1);
1623 weight1_vec = __msa_fill_w(weight1);
1624 offset_vec += const_vec * weight1_vec;
1626 filter_vec = LD_SH(filter);
1627 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1629 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1630 src0_ptr += (7 * src_stride);
1631 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1633 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1634 src10_r, src32_r, src54_r, src21_r);
1635 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1636 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1637 src10_l, src32_l, src54_l, src21_l);
1638 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1639 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1640 src2110, src4332, src6554);
1642 for (loop_cnt = 8; loop_cnt--;) {
1643 LD_SB2(src0_ptr, src_stride, src7, src8);
1644 src0_ptr += (2 * src_stride);
1645 LD_SH2(src1_ptr, src2_stride, in0, in1);
1646 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1647 src1_ptr += (2 * src2_stride);
1648 in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1649 XORI_B2_128_SB(src7, src8);
1651 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1652 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1653 src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
1655 DOTP_SB3_SH(src10_r, src21_r, src2110, filt0, filt0, filt0,
1657 DPADD_SB2_SH(src32_r, src43_r, filt1, filt1, tmp0, tmp1);
1658 tmp2 = __msa_dpadd_s_h(tmp2, src4332, (v16i8) filt1);
1659 DPADD_SB2_SH(src54_r, src65_r, filt2, filt2, tmp0, tmp1);
1660 tmp2 = __msa_dpadd_s_h(tmp2, src6554, (v16i8) filt2);
1661 DPADD_SB2_SH(src76_r, src87_r, filt3, filt3, tmp0, tmp1);
1662 tmp2 = __msa_dpadd_s_h(tmp2, src8776, (v16i8) filt3);
1664 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1665 weight_vec, rnd_vec, offset_vec,
1668 ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1669 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1670 (v8i16) weight_vec);
1671 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1672 (v8i16) weight_vec);
1673 SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1674 dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
1675 out2 = CLIP_SH_0_255(dst2_r);
1676 PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
1677 ST8x2_UB(out0, dst, dst_stride);
1678 ST4x2_UB(out2, dst + 8, dst_stride);
1679 dst += (2 * dst_stride);
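/* Generic vertical helper: the block is processed in 16-pixel-wide columns,
 * two output rows per iteration; the 16/24/32/48/64 width wrappers below
 * route into it. */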
1694 static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
1697 int32_t src2_stride,
1700 const int8_t *filter,
1709 uint8_t *src0_ptr_tmp;
1710 int16_t *src1_ptr_tmp;
1712 uint32_t loop_cnt, cnt;
1713 int32_t offset, weight;
1714 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1715 v8i16 in0, in1, in2, in3;
1716 v16i8 src10_r, src32_r, src54_r, src76_r;
1717 v16i8 src21_r, src43_r, src65_r, src87_r;
1718 v16i8 src10_l, src32_l, src54_l, src76_l;
1719 v16i8 src21_l, src43_l, src65_l, src87_l;
1720 v8i16 tmp0, tmp1, tmp2, tmp3;
1721 v8i16 filt0, filt1, filt2, filt3;
1723 v8i16 out0, out1, out2, out3;
1724 v4i32 weight_vec, weight1_vec, offset_vec, rnd_vec, const_vec;
1726 src0_ptr -= (3 * src_stride);
1728 offset = (offset0 + offset1) << rnd_val;
1729 weight0 = weight0 & 0x0000FFFF;
1730 weight = weight0 | (weight1 << 16);
1732 const_vec = __msa_ldi_w(128);
1734 offset_vec = __msa_fill_w(offset);
1735 weight_vec = __msa_fill_w(weight);
1736 rnd_vec = __msa_fill_w(rnd_val + 1);
1737 weight1_vec = __msa_fill_w(weight1);
1738 offset_vec += const_vec * weight1_vec;
1740 filter_vec = LD_SH(filter);
1741 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1743 for (cnt = (width >> 4); cnt--;) {
1744 src0_ptr_tmp = src0_ptr;
1745 src1_ptr_tmp = src1_ptr;
1748 LD_SB7(src0_ptr_tmp, src_stride,
1749 src0, src1, src2, src3, src4, src5, src6);
1750 src0_ptr_tmp += (7 * src_stride);
1752 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1753 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1754 src10_r, src32_r, src54_r, src21_r);
1755 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1756 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1757 src10_l, src32_l, src54_l, src21_l);
1758 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1760 for (loop_cnt = (height >> 1); loop_cnt--;) {
1761 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1762 src0_ptr_tmp += (2 * src_stride);
1763 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1764 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1765 src1_ptr_tmp += (2 * src2_stride);
1767 XORI_B2_128_SB(src7, src8);
1768 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1769 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1771 DOTP_SB4_SH(src10_r, src21_r, src10_l, src21_l, filt0, filt0,
1772 filt0, filt0, tmp0, tmp1, tmp2, tmp3);
1773 DPADD_SB4_SH(src32_r, src43_r, src32_l, src43_l, filt1, filt1,
1774 filt1, filt1, tmp0, tmp1, tmp2, tmp3);
1775 DPADD_SB4_SH(src54_r, src65_r, src54_l, src65_l, filt2, filt2,
1776 filt2, filt2, tmp0, tmp1, tmp2, tmp3);
1777 DPADD_SB4_SH(src76_r, src87_r, src76_l, src87_l, filt3, filt3,
1778 filt3, filt3, tmp0, tmp1, tmp2, tmp3);
1780 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1782 weight_vec, rnd_vec, offset_vec,
1783 out0, out1, out2, out3);
1785 PCKEV_B2_SH(out2, out0, out3, out1, out0, out1);
1786 ST_SH2(out0, out1, dst_tmp, dst_stride);
1787 dst_tmp += (2 * dst_stride);
1810 static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
1813 int32_t src2_stride,
1816 const int8_t *filter,
1824 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1825 src1_ptr, src2_stride,
1826 dst, dst_stride, filter, height,
1827 weight0, weight1, offset0, offset1,
1831 static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
1834 int32_t src2_stride,
1837 const int8_t *filter,
1845 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1846 src1_ptr, src2_stride,
1847 dst, dst_stride, filter, height,
1848 weight0, weight1, offset0, offset1,
1850 hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
1851 src1_ptr + 16, src2_stride,
1852 dst + 16, dst_stride, filter, height,
1853 weight0, weight1, offset0, offset1, rnd_val);
1856 static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1859 int32_t src2_stride,
1862 const int8_t *filter,
1870 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1871 src1_ptr, src2_stride,
1872 dst, dst_stride, filter, height,
1873 weight0, weight1, offset0, offset1,
1877 static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1880 int32_t src2_stride,
1883 const int8_t *filter,
1891 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1892 src1_ptr, src2_stride,
1893 dst, dst_stride, filter, height,
1894 weight0, weight1, offset0, offset1,
1898 static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1901 int32_t src2_stride,
1904 const int8_t *filter,
1912 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1913 src1_ptr, src2_stride,
1914 dst, dst_stride, filter, height,
1915 weight0, weight1, offset0, offset1,
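/*
 * 2-D (hv) 8-tap bi-weighted paths: the horizontal filter is applied first,
 * producing 16-bit intermediates for seven rows of context, then the
 * vertical filter runs on those intermediates in 32-bit precision
 * (HEVC_FILT_8TAP) and the result is shifted right by 6 before the usual
 * weighting, rounding and clipping.
 */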
1919 static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1922 int32_t src2_stride,
1925 const int8_t *filter_x,
1926 const int8_t *filter_y,
1936 int32_t offset, weight;
1938 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1939 v8i16 in0 = { 0 }, in1 = { 0 };
1940 v8i16 filt0, filt1, filt2, filt3;
1941 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1942 v16i8 mask1, mask2, mask3;
1943 v8i16 filter_vec, weight_vec;
1944 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1945 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1946 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1947 v8i16 tmp0, tmp1, tmp2, tmp3;
1948 v8i16 dst10, dst32, dst54, dst76;
1949 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98;
1950 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
1951 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1953 src0_ptr -= ((3 * src_stride) + 3);
1955 filter_vec = LD_SH(filter_x);
1956 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1958 filter_vec = LD_SH(filter_y);
1959 UNPCK_R_SB_SH(filter_vec, filter_vec);
1961 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1967 offset = (offset0 + offset1) << rnd_val;
1968 weight0 = weight0 & 0x0000FFFF;
1969 weight = weight0 | (weight1 << 16);
1971 const_vec = __msa_fill_w((128 * weight1));
1973 offset_vec = __msa_fill_w(offset);
1974 rnd_vec = __msa_fill_w(rnd_val + 1);
1975 offset_vec += const_vec;
1976 weight_vec = (v8i16) __msa_fill_w(weight);
1978 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1979 src0_ptr += (7 * src_stride);
1981 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1983 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1984 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1985 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1986 vec8, vec9, vec10, vec11);
1987 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1988 vec12, vec13, vec14, vec15);
1990 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1992 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1994 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1996 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1999 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2000 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2001 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2003 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2005 for (loop_cnt = height >> 2; loop_cnt--;) {
2006 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2007 src0_ptr += (4 * src_stride);
2008 XORI_B4_128_SB(src7, src8, src9, src10);
2010 LD2(src1_ptr, src2_stride, tp0, tp1);
2011 INSERT_D2_SH(tp0, tp1, in0);
2012 src1_ptr += (2 * src2_stride);
2013 LD2(src1_ptr, src2_stride, tp0, tp1);
2014 INSERT_D2_SH(tp0, tp1, in1);
2015 src1_ptr += (2 * src2_stride);
2017 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
2018 vec0, vec1, vec2, vec3);
2019 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
2020 vec4, vec5, vec6, vec7);
2021 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2023 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2026 dst76 = __msa_ilvr_h(dst97, dst66);
2027 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2028 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2029 dst98 = __msa_ilvr_h(dst66, dst108);
2031 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2033 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2035 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2037 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2039 SRA_4V(dst0, dst1, dst2, dst3, 6);
2040 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2041 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2042 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2043 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2044 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2045 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2046 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2047 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2048 CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
2049 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2050 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2051 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2052 dst += (4 * dst_stride);
2060 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
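/* Same 2-D scheme for widths that are multiples of 8: each 8-pixel column
 * is processed independently, two output rows per iteration. */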
2064 static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
2067 int32_t src2_stride,
2070 const int8_t *filter_x,
2071 const int8_t *filter_y,
2080 uint32_t loop_cnt, cnt;
2081 int32_t offset, weight;
2082 uint8_t *src0_ptr_tmp;
2083 int16_t *src1_ptr_tmp;
2086 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2088 v8i16 filt0, filt1, filt2, filt3;
2089 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
2090 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2091 v16i8 mask1, mask2, mask3;
2092 v8i16 filter_vec, weight_vec;
2093 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2094 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2095 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
2096 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
2097 v8i16 tmp0, tmp1, tmp2, tmp3;
2098 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
2099 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
2100 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
2101 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
2102 v4i32 offset_vec, rnd_vec, const_vec;
2104 src0_ptr -= ((3 * src_stride) + 3);
2106 offset = (offset0 + offset1) << rnd_val;
2107 weight0 = weight0 & 0x0000FFFF;
2108 weight = weight0 | (weight1 << 16);
2110 const_vec = __msa_fill_w((128 * weight1));
2112 offset_vec = __msa_fill_w(offset);
2113 rnd_vec = __msa_fill_w(rnd_val + 1);
2114 offset_vec += const_vec;
2115 weight_vec = (v8i16) __msa_fill_w(weight);
2117 filter_vec = LD_SH(filter_x);
2118 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2120 filter_vec = LD_SH(filter_y);
2121 UNPCK_R_SB_SH(filter_vec, filter_vec);
2123 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2129 for (cnt = width8mult; cnt--;) {
2130 src0_ptr_tmp = src0_ptr;
2131 src1_ptr_tmp = src1_ptr;
2134 LD_SB7(src0_ptr_tmp, src_stride,
2135 src0, src1, src2, src3, src4, src5, src6);
2136 src0_ptr_tmp += (7 * src_stride);
2138 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2140 /* row 0 row 1 row 2 row 3 */
2141 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2142 vec0, vec1, vec2, vec3);
2143 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2144 vec4, vec5, vec6, vec7);
2145 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2146 vec8, vec9, vec10, vec11);
2147 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2148 vec12, vec13, vec14, vec15);
2150 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2152 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2154 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2156 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2159 /* row 4 row 5 row 6 */
2160 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2161 vec0, vec1, vec2, vec3);
2162 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2163 vec4, vec5, vec6, vec7);
2164 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2165 vec8, vec9, vec10, vec11);
2167 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2169 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2171 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2174 for (loop_cnt = height >> 1; loop_cnt--;) {
2175 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2176 XORI_B2_128_SB(src7, src8);
2177 src0_ptr_tmp += 2 * src_stride;
2179 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2180 src1_ptr_tmp += (2 * src2_stride);
2182 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
2183 dst32_r, dst54_r, dst21_r);
2184 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
2185 dst32_l, dst54_l, dst21_l);
2186 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2187 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2189 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2190 vec0, vec1, vec2, vec3);
2191 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2194 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2195 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2196 filt_h0, filt_h1, filt_h2, filt_h3);
2197 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2198 filt_h0, filt_h1, filt_h2, filt_h3);
2204 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2205 vec0, vec1, vec2, vec3);
2206 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
2209 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2210 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2211 filt_h0, filt_h1, filt_h2, filt_h3);
2212 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2213 filt_h0, filt_h1, filt_h2, filt_h3);
2218 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
2219 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2220 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2221 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2222 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2223 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2224 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2225 SRAR_W4_SW(dst0_l, dst0_r, dst1_l, dst1_r, rnd_vec);
2226 CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
2227 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
2228 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2229 ST8x2_UB(out, dst_tmp, dst_stride);
2230 dst_tmp += (2 * dst_stride);
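/* 8-wide hv case: a single 8-pixel column of the generic kernel above. */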
2247 static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
2250 int32_t src2_stride,
2253 const int8_t *filter_x,
2254 const int8_t *filter_y,
2262 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2263 src1_ptr, src2_stride,
2264 dst, dst_stride, filter_x, filter_y,
2265 height, weight0, weight1, offset0,
2266 offset1, rnd_val, 1);
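/* 12-wide hv case: one 8-pixel column (two rows per iteration) followed by
 * one 4-pixel column that uses the second shuffle-mask set and handles four
 * rows per iteration. */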
2269 static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
2272 int32_t src2_stride,
2275 const int8_t *filter_x,
2276 const int8_t *filter_y,
2285 uint8_t *src0_ptr_tmp, *dst_tmp;
2286 int16_t *src1_ptr_tmp;
2287 int32_t offset, weight;
2290 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2291 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2292 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
2293 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
2294 v8i16 in0 = { 0 }, in1 = { 0 };
2295 v8i16 filter_vec, weight_vec, tmp0, tmp1, tmp2, tmp3;
2296 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
2297 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
2298 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
2299 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
2300 v8i16 dst30, dst41, dst52, dst63, dst66, dst87, dst10, dst32, dst54, dst76;
2301 v8i16 dst21, dst43, dst65, dst97, dst108, dst109, dst98, dst87_r, dst87_l;
2302 v4i32 offset_vec, rnd_vec, const_vec, dst0, dst1, dst2, dst3;
2304 src0_ptr -= ((3 * src_stride) + 3);
2306 offset = (offset0 + offset1) << rnd_val;
2307 weight0 = weight0 & 0x0000FFFF;
2308 weight = weight0 | (weight1 << 16);
2310 const_vec = __msa_fill_w((128 * weight1));
2312 offset_vec = __msa_fill_w(offset);
2313 rnd_vec = __msa_fill_w(rnd_val + 1);
2314 offset_vec += const_vec;
2315 weight_vec = (v8i16) __msa_fill_w(weight);
2317 filter_vec = LD_SH(filter_x);
2318 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
2320 filter_vec = LD_SH(filter_y);
2321 UNPCK_R_SB_SH(filter_vec, filter_vec);
2323 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2325 mask0 = LD_SB(ff_hevc_mask_arr);
2330 src0_ptr_tmp = src0_ptr;
2331 src1_ptr_tmp = src1_ptr;
2334 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
2335 src0_ptr_tmp += (7 * src_stride);
2336 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2338 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2339 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2340 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2342 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
2344 dsth0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2346 dsth1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2348 dsth2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2350 dsth3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
2352 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
2353 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
2354 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
2356 dsth4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2358 dsth5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2360 dsth6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2363 for (loop_cnt = 8; loop_cnt--;) {
2364 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2365 src0_ptr_tmp += (2 * src_stride);
2366 XORI_B2_128_SB(src7, src8);
2368 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2369 src1_ptr_tmp += (2 * src2_stride);
2371 ILVR_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2372 dst10_r, dst32_r, dst54_r, dst21_r);
2373 ILVL_H4_SH(dsth1, dsth0, dsth3, dsth2, dsth5, dsth4, dsth2, dsth1,
2374 dst10_l, dst32_l, dst54_l, dst21_l);
2375 ILVR_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_r, dst65_r);
2376 ILVL_H2_SH(dsth4, dsth3, dsth6, dsth5, dst43_l, dst65_l);
2378 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2380 dsth7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2383 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
2384 dst0 = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
2385 filt_h1, filt_h2, filt_h3);
2386 dst1 = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
2387 filt_h1, filt_h2, filt_h3);
2391 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
2393 dsth8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2396 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
2397 dst2 = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
2398 filt_h1, filt_h2, filt_h3);
2399 dst3 = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l, filt_h0,
2400 filt_h1, filt_h2, filt_h3);
2404 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2405 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2406 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2407 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2408 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2409 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2410 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2411 SRAR_W4_SW(dst1, dst0, dst3, dst2, rnd_vec);
2412 CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
2413 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2414 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2415 ST8x2_UB(out, dst_tmp, dst_stride);
2416 dst_tmp += (2 * dst_stride);
2431 mask4 = LD_SB(ff_hevc_mask_arr + 16);
2436 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2437 src0_ptr += (7 * src_stride);
2438 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2440 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2441 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2442 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
2444 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
2446 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2448 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2450 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2452 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2454 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2455 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2456 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2458 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2460 for (loop_cnt = 4; loop_cnt--;) {
2461 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2462 src0_ptr += (4 * src_stride);
2463 XORI_B4_128_SB(src7, src8, src9, src10);
2465 LD2(src1_ptr, src2_stride, tp0, tp1);
2466 INSERT_D2_SH(tp0, tp1, in0);
2467 src1_ptr += (2 * src2_stride);
2468 LD2(src1_ptr, src2_stride, tp0, tp1);
2469 INSERT_D2_SH(tp0, tp1, in1);
2470 src1_ptr += (2 * src2_stride);
2472 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2474 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2476 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2478 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2481 dst76 = __msa_ilvr_h(dst97, dst66);
2482 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2483 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2484 dst98 = __msa_ilvr_h(dst66, dst108);
2486 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2488 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2490 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2492 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2494 SRA_4V(dst0, dst1, dst2, dst3, 6);
2495 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
2496 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
2497 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
2498 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
2499 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
2500 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
2501 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
2502 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
2503 CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
2504 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
2505 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2506 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2507 dst += (4 * dst_stride);
2515 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
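/* The 16/24/32/48/64-wide hv cases all reuse the 8-column kernel with
 * width8mult = width / 8. */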
2519 static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
2522 int32_t src2_stride,
2525 const int8_t *filter_x,
2526 const int8_t *filter_y,
2534 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2535 src1_ptr, src2_stride,
2536 dst, dst_stride, filter_x, filter_y,
2537 height, weight0, weight1, offset0,
2538 offset1, rnd_val, 2);
2541 static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
2544 int32_t src2_stride,
2547 const int8_t *filter_x,
2548 const int8_t *filter_y,
2556 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2557 src1_ptr, src2_stride,
2558 dst, dst_stride, filter_x, filter_y,
2559 height, weight0, weight1, offset0,
2560 offset1, rnd_val, 3);
2563 static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
2566 int32_t src2_stride,
2569 const int8_t *filter_x,
2570 const int8_t *filter_y,
2578 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2579 src1_ptr, src2_stride,
2580 dst, dst_stride, filter_x, filter_y,
2581 height, weight0, weight1, offset0,
2582 offset1, rnd_val, 4);
2585 static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
2588 int32_t src2_stride,
2591 const int8_t *filter_x,
2592 const int8_t *filter_y,
2600 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2601 src1_ptr, src2_stride,
2602 dst, dst_stride, filter_x, filter_y,
2603 height, weight0, weight1, offset0,
2604 offset1, rnd_val, 6);
2607 static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
2610 int32_t src2_stride,
2613 const int8_t *filter_x,
2614 const int8_t *filter_y,
2622 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2623 src1_ptr, src2_stride,
2624 dst, dst_stride, filter_x, filter_y,
2625 height, weight0, weight1, offset0,
2626 offset1, rnd_val, 8);
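/* 4-tap (chroma) horizontal bi-weighted filters.  Each output pixel is
 * effectively
 *     clip_0_255((in * weight0 + filt(src) * weight1 +
 *                 ((offset0 + offset1 + 1) << rnd_val)) >> (rnd_val + 1))
 * where "in" is the 16-bit intermediate read from src1_ptr.  The two weights
 * share one 32-bit lane so a single __msa_dpadd_s_w performs both multiplies;
 * the bias introduced by the xor-128 signed-byte conversion of the source is
 * compensated through the weight1-scaled constant folded into the offset. */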
2629 static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
2632 int32_t src2_stride,
2635 const int8_t *filter,
2642 int32_t offset, weight, constant;
2646 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2647 v16i8 mask1, vec0, vec1;
2649 v4i32 dst0_r, dst0_l;
2650 v8i16 out0, filter_vec;
2651 v4i32 weight_vec, offset_vec, rnd_vec;
2655 filter_vec = LD_SH(filter);
2656 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2660 offset = (offset0 + offset1) << rnd_val;
2661 weight0 = weight0 & 0x0000FFFF;
2662 weight = weight0 | (weight1 << 16);
2663 constant = 128 * weight1;
2667 offset_vec = __msa_fill_w(offset);
2668 weight_vec = __msa_fill_w(weight);
2669 rnd_vec = __msa_fill_w(rnd_val + 1);
2671 LD_SB2(src0_ptr, src_stride, src0, src1);
2672 LD_SH2(src1_ptr, src2_stride, in0, in1);
2673 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2674 XORI_B2_128_SB(src0, src1);
2676 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2677 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2679 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2680 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2681 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2682 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2683 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2684 out0 = CLIP_SH_0_255(dst0_r);
2685 out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
2686 ST4x2_UB(out0, dst, dst_stride);
2689 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
2692 int32_t src2_stride,
2695 const int8_t *filter,
2702 int32_t offset, weight, constant;
2704 v16i8 src0, src1, src2, src3;
2705 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2709 v8i16 in0, in1, in2, in3;
2711 v4i32 weight_vec, offset_vec, rnd_vec;
2715     /* load and splat the filter coefficients */
2716 filter_vec = LD_SH(filter);
2717 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2721 offset = (offset0 + offset1) << rnd_val;
2722 weight0 = weight0 & 0x0000FFFF;
2723 weight = weight0 | (weight1 << 16);
2724 constant = 128 * weight1;
2728 offset_vec = __msa_fill_w(offset);
2729 weight_vec = __msa_fill_w(weight);
2730 rnd_vec = __msa_fill_w(rnd_val + 1);
2732 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2733 XORI_B4_128_SB(src0, src1, src2, src3);
2734 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2735 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2737 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2738 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2739 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2740 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2741 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2742 weight_vec, rnd_vec, offset_vec,
2745 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2746 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
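/* 4-wide, height a multiple of 8: the cross-row shuffle mask filters two
 * source rows per vector, so eight rows are produced and stored per
 * iteration. */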
2749 static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
2752 int32_t src2_stride,
2755 const int8_t *filter,
2764 int32_t weight, offset, constant;
2766 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2767 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2770 v8i16 dst0, dst1, dst2, dst3;
2771 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2773 v4i32 weight_vec, offset_vec, rnd_vec;
2777 filter_vec = LD_SH(filter);
2778 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2780 offset = (offset0 + offset1) << rnd_val;
2781 weight0 = weight0 & 0x0000FFFF;
2782 weight = weight0 | (weight1 << 16);
2783 constant = 128 * weight1;
2787 offset_vec = __msa_fill_w(offset);
2788 weight_vec = __msa_fill_w(weight);
2789 rnd_vec = __msa_fill_w(rnd_val + 1);
2793 for (loop_cnt = (height >> 3); loop_cnt--;) {
2794 LD_SB8(src0_ptr, src_stride,
2795 src0, src1, src2, src3, src4, src5, src6, src7);
2796 src0_ptr += (8 * src_stride);
2797 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2798 src1_ptr += (4 * src2_stride);
2799 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2800 src1_ptr += (4 * src2_stride);
2801 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2802 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2803 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2805 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2806 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2807 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2808 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2809 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2810 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2811 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2812 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2813 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2815 weight_vec, rnd_vec, offset_vec,
2816 dst0, dst1, dst2, dst3);
2818 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2819 ST4x8_UB(dst0, dst1, dst, dst_stride);
2820 dst += (8 * dst_stride);
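/* 4-wide horizontal dispatcher: picks the 4x2, 4x4 or 4x8-multiple variant
 * from the block height. */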
2824 static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
2827 int32_t src2_stride,
2830 const int8_t *filter,
2839 hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2840 dst, dst_stride, filter,
2841 weight0, weight1, offset0, offset1, rnd_val);
2842 } else if (4 == height) {
2843 hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2844 dst, dst_stride, filter,
2845 weight0, weight1, offset0, offset1, rnd_val);
2846 } else if (0 == (height % 8)) {
2847 hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
2848 src1_ptr, src2_stride,
2849 dst, dst_stride, filter, height,
2850 weight0, weight1, offset0, offset1,
2855 static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
2858 int32_t src2_stride,
2861 const int8_t *filter,
2870 int32_t offset, weight, constant;
2872 v16i8 src0, src1, src2, src3;
2873 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2876 v8i16 in0, in1, in2, in3;
2877 v8i16 dst0, dst1, dst2, dst3;
2879 v4i32 weight_vec, offset_vec, rnd_vec;
2883 filter_vec = LD_SH(filter);
2884 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2886 offset = (offset0 + offset1) << rnd_val;
2887 weight0 = weight0 & 0x0000FFFF;
2888 weight = weight0 | (weight1 << 16);
2889 constant = 128 * weight1;
2893 offset_vec = __msa_fill_w(offset);
2894 weight_vec = __msa_fill_w(weight);
2895 rnd_vec = __msa_fill_w(rnd_val + 1);
2899 for (loop_cnt = 2; loop_cnt--;) {
2900 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2901 src0_ptr += (4 * src_stride);
2902 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2903 src1_ptr += (4 * src2_stride);
2904 XORI_B4_128_SB(src0, src1, src2, src3);
2906 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2907 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2908 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2909 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2910 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2911 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2912 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2913 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2915 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2917 weight_vec, rnd_vec, offset_vec,
2918 dst0, dst1, dst2, dst3);
2920 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2921 ST6x4_UB(dst0, dst1, dst, dst_stride);
2922 dst += (4 * dst_stride);
2926 static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
2929 int32_t src2_stride,
2932 const int8_t *filter,
2939 int32_t offset, weight, constant;
2943 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2944 v16i8 mask1, vec0, vec1;
2947 v4i32 weight_vec, offset_vec, rnd_vec;
2951 filter_vec = LD_SH(filter);
2952 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2954 offset = (offset0 + offset1) << rnd_val;
2955 weight0 = weight0 & 0x0000FFFF;
2956 weight = weight0 | (weight1 << 16);
2957 constant = 128 * weight1;
2961 offset_vec = __msa_fill_w(offset);
2962 weight_vec = __msa_fill_w(weight);
2963 rnd_vec = __msa_fill_w(rnd_val + 1);
2967 LD_SB2(src0_ptr, src_stride, src0, src1);
2968 LD_SH2(src1_ptr, src2_stride, in0, in1);
2969 XORI_B2_128_SB(src0, src1);
2970 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2971 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2972 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2973 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2974 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2975 weight_vec, rnd_vec, offset_vec,
2978 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2979 ST8x2_UB(dst0, dst, dst_stride);
2982 static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
2985 int32_t src2_stride,
2988 const int8_t *filter,
2995 int32_t weight, offset, constant;
2997 v16i8 src0, src1, src2, src3, src4, src5;
2998 v8i16 in0, in1, in2, in3, in4, in5;
2999 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3002 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3004 v4i32 weight_vec, offset_vec, rnd_vec;
3008 filter_vec = LD_SH(filter);
3009 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3011 offset = (offset0 + offset1) << rnd_val;
3012 weight0 = weight0 & 0x0000FFFF;
3013 weight = weight0 | (weight1 << 16);
3014 constant = 128 * weight1;
3018 offset_vec = __msa_fill_w(offset);
3019 weight_vec = __msa_fill_w(weight);
3020 rnd_vec = __msa_fill_w(rnd_val + 1);
3024 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
3026 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3027 src1_ptr += (4 * src2_stride);
3028 LD_SH2(src1_ptr, src2_stride, in4, in5);
3029 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
3030 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3031 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3032 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3033 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3034 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3035 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3036 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3037 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3038 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3039 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3040 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3041 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3042 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3044 weight_vec, rnd_vec, offset_vec,
3045 dst0, dst1, dst2, dst3);
3046 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3047 weight_vec, rnd_vec, offset_vec,
3050 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3051 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3052 ST8x4_UB(dst0, dst1, dst, dst_stride);
3053 dst += (4 * dst_stride);
3054 ST8x2_UB(dst3, dst, dst_stride);
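/* 8-wide, height a multiple of 4: four rows are filtered, weighted and
 * stored per iteration. */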
3057 static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3060 int32_t src2_stride,
3063 const int8_t *filter,
3072 int32_t offset, weight, constant;
3074 v16i8 src0, src1, src2, src3;
3075 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3078 v8i16 in0, in1, in2, in3;
3079 v8i16 dst0, dst1, dst2, dst3;
3081 v4i32 weight_vec, offset_vec, rnd_vec;
3085 filter_vec = LD_SH(filter);
3086 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3088 offset = (offset0 + offset1) << rnd_val;
3089 weight0 = weight0 & 0x0000FFFF;
3090 weight = weight0 | (weight1 << 16);
3091 constant = 128 * weight1;
3095 offset_vec = __msa_fill_w(offset);
3096 weight_vec = __msa_fill_w(weight);
3097 rnd_vec = __msa_fill_w(rnd_val + 1);
3101 for (loop_cnt = (height >> 2); loop_cnt--;) {
3102 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3103 src0_ptr += (4 * src_stride);
3104 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3105 src1_ptr += (4 * src2_stride);
3106 XORI_B4_128_SB(src0, src1, src2, src3);
3108 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3109 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3110 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3111 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3112 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3113 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3114 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3115 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3116 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3118 weight_vec, rnd_vec, offset_vec,
3119 dst0, dst1, dst2, dst3);
3121 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3122 ST8x4_UB(dst0, dst1, dst, dst_stride);
3123 dst += (4 * dst_stride);
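/* 8-wide horizontal dispatcher: picks the 8x2, 8x6 or 8x4-multiple variant
 * from the block height. */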
3127 static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
3130 int32_t src2_stride,
3133 const int8_t *filter,
3142 hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3143 dst, dst_stride, filter,
3144 weight0, weight1, offset0, offset1, rnd_val);
3145 } else if (6 == height) {
3146 hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3147 dst, dst_stride, filter,
3148 weight0, weight1, offset0, offset1, rnd_val);
3149 } else if (0 == (height % 4)) {
3150 hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
3151 src1_ptr, src2_stride,
3152 dst, dst_stride, filter, height,
3153 weight0, weight1, offset0, offset1,
3158 static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
3161 int32_t src2_stride,
3164 const int8_t *filter,
3173 int32_t offset, weight, constant;
3175 v16i8 src0, src1, src2, src3;
3176 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3177 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3179 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
3183 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3185 v4i32 weight_vec, offset_vec, rnd_vec;
3189 filter_vec = LD_SH(filter);
3190 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3192 offset = (offset0 + offset1) << rnd_val;
3193 weight0 = weight0 & 0x0000FFFF;
3194 weight = weight0 | (weight1 << 16);
3195 constant = 128 * weight1;
3199 offset_vec = __msa_fill_w(offset);
3200 weight_vec = __msa_fill_w(weight);
3201 rnd_vec = __msa_fill_w(rnd_val + 1);
3206 for (loop_cnt = 4; loop_cnt--;) {
3207 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
3208 src0_ptr += (4 * src_stride);
3209 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3210 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3211 src1_ptr += (4 * src2_stride);
3212 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3213 XORI_B4_128_SB(src0, src1, src2, src3);
3215 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3216 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3217 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3218 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3219 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3220 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3221 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3222 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3223 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3224 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3225 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3226 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3228 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3230 weight_vec, rnd_vec, offset_vec,
3231 dst0, dst1, dst2, dst3);
3232 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
3233 weight_vec, rnd_vec, offset_vec,
3236 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3237 dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
3238 ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
3239 dst += (4 * dst_stride);
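/* 16-wide: each row is filtered as two 8-pixel halves, four rows per
 * iteration. */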
3243 static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
3246 int32_t src2_stride,
3249 const int8_t *filter,
3258 int32_t offset, weight, constant;
3259 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3260 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3262 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3264 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3267 v4i32 weight_vec, offset_vec, rnd_vec;
3271 filter_vec = LD_SH(filter);
3272 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3274 offset = (offset0 + offset1) << rnd_val;
3275 weight0 = weight0 & 0x0000FFFF;
3276 weight = weight0 | (weight1 << 16);
3277 constant = 128 * weight1;
3281 offset_vec = __msa_fill_w(offset);
3282 weight_vec = __msa_fill_w(weight);
3283 rnd_vec = __msa_fill_w(rnd_val + 1);
3287 for (loop_cnt = (height >> 2); loop_cnt--;) {
3288 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3289 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3290 src0_ptr += (4 * src_stride);
3291 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3292 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3293 src1_ptr += (4 * src2_stride);
3294 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3296 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3297 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3298 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3299 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3300 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3301 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3302 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3303 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3304 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3305 dst4 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3306 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3307 dst5 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3308 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3309 dst6 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3310 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3311 dst7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3312 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3314 weight_vec, rnd_vec, offset_vec,
3315 dst0, dst1, dst2, dst3);
3317 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3318 ST_SH2(dst0, dst1, dst, dst_stride);
3319 dst += (2 * dst_stride);
3321 HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3323 weight_vec, rnd_vec, offset_vec,
3324 dst0, dst1, dst2, dst3);
3326 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3327 ST_SH2(dst0, dst1, dst, dst_stride);
3328 dst += (2 * dst_stride);
3332 static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
3335 int32_t src2_stride,
3338 const int8_t *filter,
3347 int32_t offset, weight, constant;
3348 v16i8 src0, src1, src2, src3;
3350 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3351 v16i8 mask1, mask2, mask3;
3353 v8i16 dst0, dst1, dst2, dst3;
3354 v8i16 in0, in1, in2, in3, in4, in5;
3356 v4i32 weight_vec, offset_vec, rnd_vec;
3360 filter_vec = LD_SH(filter);
3361 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3363 offset = (offset0 + offset1) << rnd_val;
3364 weight0 = weight0 & 0x0000FFFF;
3365 weight = weight0 | (weight1 << 16);
3366 constant = 128 * weight1;
3370 offset_vec = __msa_fill_w(offset);
3371 weight_vec = __msa_fill_w(weight);
3372 rnd_vec = __msa_fill_w(rnd_val + 1);
3378 for (loop_cnt = 16; loop_cnt--;) {
3379 LD_SB2(src0_ptr, src_stride, src0, src2);
3380 LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3381 src0_ptr += (2 * src_stride);
3382 LD_SH2(src1_ptr, src2_stride, in0, in2);
3383 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3384 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3385 src1_ptr += (2 * src2_stride);
3386 XORI_B4_128_SB(src0, src1, src2, src3);
3388 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3389 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3390 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3391 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3392 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3393 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3394 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3395 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3396 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3398 weight_vec, rnd_vec, offset_vec,
3399 dst0, dst1, dst2, dst3);
3401 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3402 ST_SH2(dst0, dst1, dst, dst_stride);
3405 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3406 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3407 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3408 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3409 HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3410 weight_vec, rnd_vec, offset_vec,
3413 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
3414 ST8x2_UB(dst0, (dst + 16), dst_stride);
3415 dst += (2 * dst_stride);
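/* 32-wide: one row per iteration, loaded as vectors at offsets 0, 16 and 24
 * so the shuffles can reach the pixels needed past each 16-byte boundary. */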
3419 static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
3422 int32_t src2_stride,
3425 const int8_t *filter,
3434 int32_t offset, weight, constant;
3435 v16i8 src0, src1, src2;
3437 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
3438 v16i8 mask1, mask2, mask3;
3439 v8i16 dst0, dst1, dst2, dst3;
3441 v8i16 in0, in1, in2, in3;
3443 v4i32 weight_vec, offset_vec, rnd_vec;
3447 filter_vec = LD_SH(filter);
3448 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3450 offset = (offset0 + offset1) << rnd_val;
3451 weight0 = weight0 & 0x0000FFFF;
3452 weight = weight0 | (weight1 << 16);
3453 constant = 128 * weight1;
3457 offset_vec = __msa_fill_w(offset);
3458 weight_vec = __msa_fill_w(weight);
3459 rnd_vec = __msa_fill_w(rnd_val + 1);
3465 for (loop_cnt = height; loop_cnt--;) {
3466 LD_SB2(src0_ptr, 16, src0, src1);
3467 src2 = LD_SB(src0_ptr + 24);
3468 src0_ptr += src_stride;
3469 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3470 src1_ptr += src2_stride;
3471 XORI_B3_128_SB(src0, src1, src2);
3473 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3474 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3475 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3476 dst1 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3477 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3478 dst2 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3479 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3480 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3481 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3483 weight_vec, rnd_vec, offset_vec,
3484 dst0, dst1, dst2, dst3);
3486 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
3487 ST_SH2(dst0, dst1, dst, 16);
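/* 4-tap vertical bi-weighted filters.  One row above the block is preloaded
 * together with the first two block rows, and the byte interleaves of
 * consecutive rows (src10_r, src21_r, ...) are carried between iterations so
 * each new row is interleaved only once. */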
3492 static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
3495 int32_t src2_stride,
3498 const int8_t *filter,
3505 int32_t weight, offset, constant;
3506 v16i8 src0, src1, src2, src3, src4;
3507 v8i16 in0, in1, dst10;
3508 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3509 v4i32 dst10_r, dst10_l;
3511 v8i16 filter_vec, out;
3512 v4i32 weight_vec, offset_vec, rnd_vec;
3514 src0_ptr -= src_stride;
3516 offset = (offset0 + offset1) << rnd_val;
3517 weight0 = weight0 & 0x0000FFFF;
3518 weight = weight0 | (weight1 << 16);
3519 constant = 128 * weight1;
3523 offset_vec = __msa_fill_w(offset);
3524 weight_vec = __msa_fill_w(weight);
3525 rnd_vec = __msa_fill_w(rnd_val + 1);
3527 filter_vec = LD_SH(filter);
3528 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3530 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3531 src0_ptr += (3 * src_stride);
3532 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3533 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3534 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3535 LD_SB2(src0_ptr, src_stride, src3, src4);
3536 src0_ptr += (2 * src_stride);
3537 LD_SH2(src1_ptr, src2_stride, in0, in1);
3538 src1_ptr += (2 * src2_stride);
3540 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3541 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3542 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3543 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3545 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3547 ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3548 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3549 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3550 SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3551 dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
3552 out = CLIP_SH_0_255(dst10_r);
3553 out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
3554 ST4x2_UB(out, dst, dst_stride);
3557 static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
3560 int32_t src2_stride,
3563 const int8_t *filter,
3570 int32_t weight, offset, constant;
3571 v16i8 src0, src1, src2, src3, src4, src5, src6;
3572 v8i16 in0, in1, in2, in3;
3573 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3574 v16i8 src2110, src4332, src6554;
3578 v4i32 weight_vec, offset_vec, rnd_vec;
3580 src0_ptr -= src_stride;
3582 offset = (offset0 + offset1) << rnd_val;
3583 weight0 = weight0 & 0x0000FFFF;
3584 weight = weight0 | (weight1 << 16);
3585 constant = 128 * weight1;
3589 offset_vec = __msa_fill_w(offset);
3590 weight_vec = __msa_fill_w(weight);
3591 rnd_vec = __msa_fill_w(rnd_val + 1);
3593 filter_vec = LD_SH(filter);
3594 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3596 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3597 src0_ptr += (3 * src_stride);
3598 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3599 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3600 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3602 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3603 src0_ptr += (4 * src_stride);
3604 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3605 src1_ptr += (4 * src2_stride);
3606 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3607 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3608 src32_r, src43_r, src54_r, src65_r);
3609 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3610 XORI_B2_128_SB(src4332, src6554);
3612 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3613 dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3615 HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3616 weight_vec, rnd_vec, offset_vec,
3619 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3620 ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
3621 dst += (4 * dst_stride);
3624 static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
3627 int32_t src2_stride,
3630 const int8_t *filter,
3639 int32_t weight, offset, constant;
3640 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3641 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3642 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3643 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3644 v16i8 src2110, src4332, src6554, src8776;
3645 v8i16 dst10, dst32, dst54, dst76;
3648 v4i32 weight_vec, offset_vec, rnd_vec;
3650 src0_ptr -= src_stride;
3652 offset = (offset0 + offset1) << rnd_val;
3653 weight0 = weight0 & 0x0000FFFF;
3654 weight = weight0 | (weight1 << 16);
3655 constant = 128 * weight1;
3659 offset_vec = __msa_fill_w(offset);
3660 weight_vec = __msa_fill_w(weight);
3661 rnd_vec = __msa_fill_w(rnd_val + 1);
3663 filter_vec = LD_SH(filter);
3664 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3666 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3667 src0_ptr += (3 * src_stride);
3668 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3669 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3670 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3672 for (loop_cnt = (height >> 3); loop_cnt--;) {
3673 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3674 src0_ptr += (6 * src_stride);
3675 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3676 src1_ptr += (8 * src2_stride);
3678 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3679 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3681 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3682 src32_r, src43_r, src54_r, src65_r);
3683 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3684 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3685 src4332, src6554, src8776);
3686 XORI_B3_128_SB(src4332, src6554, src8776);
3688 dst10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
3689 dst32 = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
3690 dst54 = HEVC_FILT_4TAP_SH(src6554, src8776, filt0, filt1);
3692 LD_SB2(src0_ptr, src_stride, src9, src2);
3693 src0_ptr += (2 * src_stride);
3694 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3695 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3696 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3698 dst76 = HEVC_FILT_4TAP_SH(src8776, src2110, filt0, filt1);
3699 HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3701 weight_vec, rnd_vec, offset_vec,
3702 dst10, dst32, dst54, dst76);
3704 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
3705 ST4x8_UB(dst10, dst32, dst, dst_stride);
3706 dst += (8 * dst_stride);
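/* 4-wide vertical dispatcher: picks the 4x2, 4x4 or 4x8-multiple variant
 * from the block height. */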
3710 static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
3713 int32_t src2_stride,
3716 const int8_t *filter,
3725 hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3726 dst, dst_stride, filter,
3727 weight0, weight1, offset0, offset1, rnd_val);
3728 } else if (4 == height) {
3729 hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3730 dst, dst_stride, filter,
3731 weight0, weight1, offset0, offset1, rnd_val);
3732 } else if (0 == (height % 8)) {
3733 hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
3734 src1_ptr, src2_stride,
3735 dst, dst_stride, filter, height,
3736 weight0, weight1, offset0, offset1,
3741 static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
3744 int32_t src2_stride,
3747 const int8_t *filter,
3756 int32_t offset, weight, constant;
3757 v16i8 src0, src1, src2, src3, src4;
3758 v8i16 in0, in1, in2, in3;
3759 v16i8 src10_r, src32_r, src21_r, src43_r;
3760 v8i16 tmp0, tmp1, tmp2, tmp3;
3763 v4i32 weight_vec, offset_vec, rnd_vec;
3765 src0_ptr -= src_stride;
3767 offset = (offset0 + offset1) << rnd_val;
3768 weight0 = weight0 & 0x0000FFFF;
3769 weight = weight0 | (weight1 << 16);
3770 constant = 128 * weight1;
3774 offset_vec = __msa_fill_w(offset);
3775 weight_vec = __msa_fill_w(weight);
3776 rnd_vec = __msa_fill_w(rnd_val + 1);
3778 filter_vec = LD_SH(filter);
3779 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3781 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3782 src0_ptr += (3 * src_stride);
3783 XORI_B3_128_SB(src0, src1, src2);
3784 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3786 for (loop_cnt = (height >> 2); loop_cnt--;) {
3787 LD_SB2(src0_ptr, src_stride, src3, src4);
3788 src0_ptr += (2 * src_stride);
3789 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3790 src1_ptr += (4 * src2_stride);
3791 XORI_B2_128_SB(src3, src4);
3792 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3794 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3795 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3797 LD_SB2(src0_ptr, src_stride, src1, src2);
3798 src0_ptr += (2 * src_stride);
3799 XORI_B2_128_SB(src1, src2);
3800 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3802 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
3803 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
3804 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3806 weight_vec, rnd_vec, offset_vec,
3807 tmp0, tmp1, tmp2, tmp3);
3809 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3810 ST6x4_UB(tmp0, tmp1, dst, dst_stride);
3811 dst += (4 * dst_stride);
3815 static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
3818 int32_t src2_stride,
3821 const int8_t *filter,
3828 int32_t offset, weight, constant;
3829 v16i8 src0, src1, src2, src3, src4;
3830 v8i16 in0, in1, tmp0, tmp1;
3831 v16i8 src10_r, src32_r, src21_r, src43_r;
3834 v4i32 weight_vec, offset_vec, rnd_vec;
3836 src0_ptr -= src_stride;
3838 offset = (offset0 + offset1) << rnd_val;
3839 weight0 = weight0 & 0x0000FFFF;
3840 weight = weight0 | (weight1 << 16);
3841 constant = 128 * weight1;
3845 offset_vec = __msa_fill_w(offset);
3846 weight_vec = __msa_fill_w(weight);
3847 rnd_vec = __msa_fill_w(rnd_val + 1);
3849 filter_vec = LD_SH(filter);
3850 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3852 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3853 src0_ptr += (3 * src_stride);
3854 XORI_B3_128_SB(src0, src1, src2);
3855 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3857 LD_SB2(src0_ptr, src_stride, src3, src4);
3858 LD_SH2(src1_ptr, src2_stride, in0, in1);
3859 XORI_B2_128_SB(src3, src4);
3860 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3862 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3863 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3864 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3865 weight_vec, rnd_vec, offset_vec,
3868 tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3869 ST8x2_UB(tmp0, dst, dst_stride);
3872 static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
3875 int32_t src2_stride,
3878 const int8_t *filter,
3885 int32_t offset, weight, constant;
3886 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3887 v8i16 in0, in1, in2, in3, in4, in5;
3888 v16i8 src10_r, src32_r, src54_r, src76_r;
3889 v16i8 src21_r, src43_r, src65_r, src87_r;
3890 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3893 v4i32 weight_vec, offset_vec, rnd_vec;
3895 src0_ptr -= src_stride;
3897 offset = (offset0 + offset1) << rnd_val;
3898 weight0 = weight0 & 0x0000FFFF;
3899 weight = weight0 | (weight1 << 16);
3900 constant = 128 * weight1;
3904 offset_vec = __msa_fill_w(offset);
3905 weight_vec = __msa_fill_w(weight);
3906 rnd_vec = __msa_fill_w(rnd_val + 1);
3908 filter_vec = LD_SH(filter);
3909 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3911 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3912 src0_ptr += (3 * src_stride);
3913 XORI_B3_128_SB(src0, src1, src2);
3914 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3916 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3917 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3918 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3919 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3920 src32_r, src43_r, src54_r, src65_r);
3921 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3923 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3924 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3925 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
3926 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
3927 tmp4 = HEVC_FILT_4TAP_SH(src54_r, src76_r, filt0, filt1);
3928 tmp5 = HEVC_FILT_4TAP_SH(src65_r, src87_r, filt0, filt1);
3929 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3931 weight_vec, rnd_vec, offset_vec,
3932 tmp0, tmp1, tmp2, tmp3);
3933 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
3934 weight_vec, rnd_vec, offset_vec,
3937 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3938 tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
3939 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
3940 dst += (4 * dst_stride);
3941 ST8x2_UB(tmp3, dst, dst_stride);
3944 static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3947 int32_t src2_stride,
3950 const int8_t *filter,
3959 int32_t offset, weight, constant;
3960 v16i8 src0, src1, src2, src3, src4;
3961 v8i16 in0, in1, in2, in3;
3962 v16i8 src10_r, src32_r, src21_r, src43_r;
3963 v8i16 tmp0, tmp1, tmp2, tmp3;
3966 v4i32 weight_vec, offset_vec, rnd_vec;
3968 src0_ptr -= src_stride;
3970 offset = (offset0 + offset1) << rnd_val;
3971 weight0 = weight0 & 0x0000FFFF;
3972 weight = weight0 | (weight1 << 16);
3973 constant = 128 * weight1;
3977 offset_vec = __msa_fill_w(offset);
3978 weight_vec = __msa_fill_w(weight);
3979 rnd_vec = __msa_fill_w(rnd_val + 1);
3981 filter_vec = LD_SH(filter);
3982 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3984 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3985 src0_ptr += (3 * src_stride);
3986 XORI_B3_128_SB(src0, src1, src2);
3987 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3989 for (loop_cnt = (height >> 2); loop_cnt--;) {
3990 LD_SB2(src0_ptr, src_stride, src3, src4);
3991 src0_ptr += (2 * src_stride);
3992 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3993 src1_ptr += (4 * src2_stride);
3994 XORI_B2_128_SB(src3, src4);
3995 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3997 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3998 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4000 LD_SB2(src0_ptr, src_stride, src1, src2);
4001 src0_ptr += (2 * src_stride);
4002 XORI_B2_128_SB(src1, src2);
4003 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
4005 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4006 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4007 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4009 weight_vec, rnd_vec, offset_vec,
4010 tmp0, tmp1, tmp2, tmp3);
4012 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4013 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
4014 dst += (4 * dst_stride);
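/* 8-wide vertical dispatcher: picks the 8x2, 8x6 or 8x4-multiple variant
 * from the block height. */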
4018 static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
4021 int32_t src2_stride,
4024 const int8_t *filter,
4033 hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4034 dst, dst_stride, filter,
4035 weight0, weight1, offset0, offset1, rnd_val);
4036 } else if (6 == height) {
4037 hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4038 dst, dst_stride, filter,
4039 weight0, weight1, offset0, offset1, rnd_val);
4041 hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
4042 src1_ptr, src2_stride,
4043 dst, dst_stride, filter, height,
4044 weight0, weight1, offset0, offset1,
4049 static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
4052 int32_t src2_stride,
4055 const int8_t *filter,
4064 int32_t offset, weight, constant;
4065 v16i8 src0, src1, src2, src3, src4, src5;
4066 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4067 v16i8 src10_r, src32_r, src21_r, src43_r;
4068 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4069 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
4070 v16i8 src2110, src4332;
4073 v4i32 weight_vec, offset_vec, rnd_vec;
4075 src0_ptr -= (1 * src_stride);
4077 offset = (offset0 + offset1) << rnd_val;
4078 weight0 = weight0 & 0x0000FFFF;
4079 weight = weight0 | (weight1 << 16);
4080 constant = 128 * weight1;
4084 offset_vec = __msa_fill_w(offset);
4085 weight_vec = __msa_fill_w(weight);
4086 rnd_vec = __msa_fill_w(rnd_val + 1);
4088 filter_vec = LD_SH(filter);
4089 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4091 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4092 src0_ptr += (3 * src_stride);
4093 XORI_B3_128_SB(src0, src1, src2);
4094 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4095 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4096 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
4098 for (loop_cnt = (height >> 2); loop_cnt--;) {
4099 LD_SB2(src0_ptr, src_stride, src3, src4);
4100 src0_ptr += (2 * src_stride);
4101 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4102 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
4103 src1_ptr += (4 * src2_stride);
4104 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
4105 XORI_B2_128_SB(src3, src4);
4107 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4108 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4109 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
4111 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4112 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4113 tmp4 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
4115 LD_SB2(src0_ptr, src_stride, src5, src2);
4116 src0_ptr += (2 * src_stride);
4117 XORI_B2_128_SB(src5, src2);
4118 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4119 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
4120 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
4122 tmp2 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4123 tmp3 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4124 tmp5 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
4125 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4127 weight_vec, rnd_vec, offset_vec,
4128 tmp0, tmp1, tmp2, tmp3);
4129 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
4130 weight_vec, rnd_vec, offset_vec,
4133 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4134 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4135 ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
4136 dst += (4 * dst_stride);
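/* 16-wide vertical: the right and left interleaves of each row pair cover
 * all 16 columns; four rows are produced per iteration. */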
4140 static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
4143 int32_t src2_stride,
4146 const int8_t *filter,
4155 int32_t offset, weight, constant;
4156 v16i8 src0, src1, src2, src3, src4, src5;
4157 v8i16 in0, in1, in2, in3;
4158 v16i8 src10_r, src32_r, src21_r, src43_r;
4159 v16i8 src10_l, src32_l, src21_l, src43_l;
4160 v8i16 tmp0, tmp1, tmp2, tmp3;
4163 v4i32 weight_vec, offset_vec, rnd_vec;
4165 src0_ptr -= src_stride;
4167 offset = (offset0 + offset1) << rnd_val;
4168 weight0 = weight0 & 0x0000FFFF;
4169 weight = weight0 | (weight1 << 16);
4170 constant = 128 * weight1;
4174 offset_vec = __msa_fill_w(offset);
4175 weight_vec = __msa_fill_w(weight);
4176 rnd_vec = __msa_fill_w(rnd_val + 1);
4178 filter_vec = LD_SH(filter);
4179 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4181 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4182 src0_ptr += (3 * src_stride);
4183 XORI_B3_128_SB(src0, src1, src2);
4184 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4185 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4187 for (loop_cnt = (height >> 2); loop_cnt--;) {
4188 LD_SB2(src0_ptr, src_stride, src3, src4);
4189 src0_ptr += (2 * src_stride);
4190 LD_SH2(src1_ptr, src2_stride, in0, in1);
4191 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4192 src1_ptr += (2 * src2_stride);
4193 XORI_B2_128_SB(src3, src4);
4194 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4195 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4197 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4198 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4199 tmp2 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4200 tmp3 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4202 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4204 weight_vec, rnd_vec, offset_vec,
4205 tmp0, tmp1, tmp2, tmp3);
4206 PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4207 ST_SH2(tmp0, tmp1, dst, dst_stride);
4208 dst += (2 * dst_stride);
4209 LD_SB2(src0_ptr, src_stride, src5, src2);
4210 src0_ptr += (2 * src_stride);
4212 LD_SH2(src1_ptr, src2_stride, in0, in1);
4213 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4214 src1_ptr += (2 * src2_stride);
4215 XORI_B2_128_SB(src5, src2);
4216 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4217 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4219 tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4220 tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4221 tmp2 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4222 tmp3 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4223 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4224 in0, in1, in2, in3,
4225 weight_vec, rnd_vec, offset_vec,
4226 tmp0, tmp1, tmp2, tmp3);
4228 PCKEV_B2_SH(tmp2, tmp0, tmp3, tmp1, tmp0, tmp1);
4229 ST_SH2(tmp0, tmp1, dst, dst_stride);
4230 dst += (2 * dst_stride);
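/* Vertical 4-tap bi-weighted filter for 24-column blocks: each pair of rows
 * is handled as a 16-column part (right and left interleaves) plus an
 * 8-column part taken from src0_ptr + 16 and stored with ST8x2_UB. */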
4234 static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
4237 int32_t src2_stride,
4240 const int8_t *filter,
4249 int32_t offset, weight, constant;
4250 v16i8 src0, src1, src2, src3, src4, src5;
4251 v16i8 src6, src7, src8, src9, src10, src11;
4252 v8i16 in0, in1, in2, in3, in4, in5;
4253 v16i8 src10_r, src32_r, src76_r, src98_r;
4254 v16i8 src10_l, src32_l, src21_l, src43_l;
4255 v16i8 src21_r, src43_r, src87_r, src109_r;
4256 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4259 v4i32 weight_vec, offset_vec, rnd_vec;
4261 src0_ptr -= src_stride;
4263 offset = (offset0 + offset1) << rnd_val;
4264 weight0 = weight0 & 0x0000FFFF;
4265 weight = weight0 | (weight1 << 16);
4266 constant = 128 * weight1;
4267 constant <<= 6;
4268 offset += constant;
4270 offset_vec = __msa_fill_w(offset);
4271 weight_vec = __msa_fill_w(weight);
4272 rnd_vec = __msa_fill_w(rnd_val + 1);
4274 filter_vec = LD_SH(filter);
4275 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4278 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4279 XORI_B3_128_SB(src0, src1, src2);
4280 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4281 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4283 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4284 src0_ptr += (3 * src_stride);
4285 XORI_B3_128_SB(src6, src7, src8);
4286 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4288 for (loop_cnt = (height >> 2); loop_cnt--;) {
4290 LD_SB2(src0_ptr, src_stride, src3, src4);
4291 LD_SH2(src1_ptr, src2_stride, in0, in1);
4292 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4293 XORI_B2_128_SB(src3, src4);
4294 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4295 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4298 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4299 src0_ptr += (2 * src_stride);
4300 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4301 src1_ptr += (2 * src2_stride);
4302 XORI_B2_128_SB(src9, src10);
4303 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4305 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4306 tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4307 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4308 tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4310 tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4311 tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4313 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4314 in0, in1, in2, in3,
4315 weight_vec, rnd_vec, offset_vec,
4316 tmp0, tmp1, tmp4, tmp5);
4318 HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4319 weight_vec, rnd_vec, offset_vec,
4320 tmp2, tmp3);
4322 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4324 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4325 ST_SH2(tmp0, tmp1, dst, dst_stride);
4326 ST8x2_UB(tmp2, dst + 16, dst_stride);
4327 dst += (2 * dst_stride);
4330 LD_SB2(src0_ptr, src_stride, src5, src2);
4331 LD_SH2(src1_ptr, src2_stride, in0, in1);
4332 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4333 XORI_B2_128_SB(src5, src2);
4334 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4335 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4337 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4338 src0_ptr += (2 * src_stride);
4339 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4340 src1_ptr += (2 * src2_stride);
4341 XORI_B2_128_SB(src11, src8);
4342 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4344 tmp0 = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
4345 tmp4 = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
4346 tmp1 = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
4347 tmp5 = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
4349 tmp2 = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
4350 tmp3 = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
4352 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4353 in0, in1, in2, in3,
4354 weight_vec, rnd_vec, offset_vec,
4355 tmp0, tmp1, tmp4, tmp5);
4357 HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4358 weight_vec, rnd_vec, offset_vec,
4359 tmp2, tmp3);
4361 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4364 tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
4365 ST_SH2(tmp0, tmp1, dst, dst_stride);
4366 ST8x2_UB(tmp2, dst + 16, dst_stride);
4367 dst += (2 * dst_stride);
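/* Vertical 4-tap bi-weighted filter for 32-column blocks, processed as two
 * independent 16-column halves (dst and dst_tmp = dst + 16), two rows per
 * loop iteration. */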
4371 static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
4374 int32_t src2_stride,
4377 const int8_t *filter,
4386 uint8_t *dst_tmp = dst + 16;
4387 int32_t offset, weight, constant;
4388 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4389 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4390 v16i8 src10_r, src32_r, src76_r, src98_r;
4391 v16i8 src21_r, src43_r, src87_r, src109_r;
4392 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4393 v16i8 src10_l, src32_l, src76_l, src98_l;
4394 v16i8 src21_l, src43_l, src87_l, src109_l;
4397 v4i32 weight_vec, offset_vec, rnd_vec;
4399 src0_ptr -= src_stride;
4401 offset = (offset0 + offset1) << rnd_val;
4402 weight0 = weight0 & 0x0000FFFF;
4403 weight = weight0 | (weight1 << 16);
4404 constant = 128 * weight1;
4405 constant <<= 6;
4406 offset += constant;
4408 offset_vec = __msa_fill_w(offset);
4409 weight_vec = __msa_fill_w(weight);
4410 rnd_vec = __msa_fill_w(rnd_val + 1);
4412 filter_vec = LD_SH(filter);
4413 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4416 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4417 XORI_B3_128_SB(src0, src1, src2);
4418 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4419 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4421 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4422 src0_ptr += (3 * src_stride);
4423 XORI_B3_128_SB(src6, src7, src8);
4424 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4425 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4427 for (loop_cnt = (height >> 1); loop_cnt--;) {
4429 LD_SB2(src0_ptr, src_stride, src3, src4);
4430 LD_SH2(src1_ptr, src2_stride, in0, in1);
4431 LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4432 XORI_B2_128_SB(src3, src4);
4433 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4434 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4437 tmp0 = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
4438 tmp4 = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
4439 tmp1 = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
4440 tmp5 = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
4442 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4443 in0, in1, in2, in3,
4444 weight_vec, rnd_vec, offset_vec,
4445 tmp0, tmp1, tmp4, tmp5);
4447 PCKEV_B2_SH(tmp4, tmp0, tmp5, tmp1, tmp0, tmp1);
4448 ST_SH2(tmp0, tmp1, dst, dst_stride);
4449 dst += (2 * dst_stride);
4458 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4459 src0_ptr += (2 * src_stride);
4460 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4461 LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4462 src1_ptr += (2 * src2_stride);
4463 XORI_B2_128_SB(src9, src10);
4464 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4465 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4467 tmp2 = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
4468 tmp6 = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
4469 tmp3 = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
4470 tmp7 = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
4472 HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4473 in4, in5, in6, in7,
4474 weight_vec, rnd_vec, offset_vec,
4475 tmp2, tmp3, tmp6, tmp7);
4478 PCKEV_B2_SH(tmp6, tmp2, tmp7, tmp3, tmp2, tmp3);
4479 ST_SH2(tmp2, tmp3, dst_tmp, dst_stride);
4480 dst_tmp += (2 * dst_stride);
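/* Horizontal + vertical (hv) 4-tap bi-weighted prediction, 4x2 block.
 * The horizontal 4-tap filter is applied first via VSHF_B/dot-product, the
 * vertical 4-tap filter is applied to that intermediate (then >> 6), and
 * the result is weighted against the src1_ptr block, rounded by
 * (rnd_val + 1) and clipped to [0, 255]. */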
4490 static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
4493 int32_t src2_stride,
4496 const int8_t *filter_x,
4497 const int8_t *filter_y,
4505 int32_t offset, weight;
4508 v16i8 src0, src1, src2, src3, src4;
4510 v8i16 filt_h0, filt_h1;
4511 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4513 v8i16 filter_vec, tmp, weight_vec;
4514 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4515 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp0, tmp1;
4516 v4i32 dst0, dst1, offset_vec, rnd_vec, const_vec;
4518 src0_ptr -= (src_stride + 1);
4520 filter_vec = LD_SH(filter_x);
4521 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4523 filter_vec = LD_SH(filter_y);
4524 UNPCK_R_SB_SH(filter_vec, filter_vec);
4526 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4530 offset = (offset0 + offset1) << rnd_val;
4531 weight0 = weight0 & 0x0000FFFF;
4532 weight = weight0 | (weight1 << 16);
4534 const_vec = __msa_fill_w((128 * weight1));
4536 offset_vec = __msa_fill_w(offset);
4537 weight_vec = (v8i16) __msa_fill_w(weight);
4538 rnd_vec = __msa_fill_w(rnd_val + 1);
4539 offset_vec += const_vec;
4541 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4542 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4544 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
4545 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
4546 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
4548 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4549 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4550 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4552 ILVRL_H2_SH(dst31, dst20, dst10, dst32);
4553 ILVRL_H2_SH(dst42, dst31, dst21, dst43);
4555 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4556 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4557 dst0 >>= 6;
4558 dst1 >>= 6;
4559 dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4561 LD2(src1_ptr, src2_stride, tp0, tp1);
4562 INSERT_D2_SH(tp0, tp1, in0);
4564 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4565 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4566 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4567 SRAR_W2_SW(dst0, dst1, rnd_vec);
4568 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
4569 tmp = CLIP_SH_0_255_MAX_SATU(tmp);
4570 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
4571 ST4x2_UB(out, dst, dst_stride);
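/* Same hv 4-tap bi-weighted filtering for a 4x4 block: seven source rows
 * give four vertically filtered rows, which are weighted, rounded and
 * written as four 4-byte stores. */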
4574 static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
4577 int32_t src2_stride,
4580 const int8_t *filter_x,
4581 const int8_t *filter_y,
4589 int32_t offset, weight;
4591 v8i16 in0 = { 0 }, in1 = { 0 };
4592 v16i8 src0, src1, src2, src3, src4, src5, src6;
4594 v8i16 filt_h0, filt_h1;
4595 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4597 v8i16 filter_vec, weight_vec;
4598 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4599 v8i16 tmp0, tmp1, tmp2, tmp3;
4600 v8i16 dst30, dst41, dst52, dst63;
4601 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
4602 v4i32 offset_vec, rnd_vec, const_vec;
4603 v4i32 dst0, dst1, dst2, dst3;
4605 src0_ptr -= (src_stride + 1);
4607 filter_vec = LD_SH(filter_x);
4608 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4610 filter_vec = LD_SH(filter_y);
4611 UNPCK_R_SB_SH(filter_vec, filter_vec);
4613 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4617 offset = (offset0 + offset1) << rnd_val;
4618 weight0 = weight0 & 0x0000FFFF;
4619 weight = weight0 | (weight1 << 16);
4621 const_vec = __msa_fill_w((128 * weight1));
4623 offset_vec = __msa_fill_w(offset);
4624 weight_vec = (v8i16) __msa_fill_w(weight);
4625 rnd_vec = __msa_fill_w(rnd_val + 1);
4626 offset_vec += const_vec;
4628 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4629 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
4631 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
4632 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
4633 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
4634 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
4636 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4637 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4638 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4639 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4641 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
4642 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
4643 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
4644 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
4645 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
4646 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
4647 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
4648 SRA_4V(dst0, dst1, dst2, dst3, 6);
4649 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp1, tmp3);
4651 LD2(src1_ptr, src2_stride, tp0, tp1);
4652 INSERT_D2_SH(tp0, tp1, in0);
4653 src1_ptr += (2 * src2_stride);
4654 LD2(src1_ptr, src2_stride, tp0, tp1);
4655 INSERT_D2_SH(tp0, tp1, in1);
4657 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
4658 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
4660 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4661 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4662 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4663 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4664 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4665 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
4666 CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
4667 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4668 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
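/* hv 4-tap bi-weighted filtering for 4-column blocks whose height is a
 * multiple of 8: eight new rows are loaded per iteration and the last two
 * horizontally filtered rows are carried over to the next iteration. */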
4671 static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
4674 int32_t src2_stride,
4677 const int8_t *filter_x,
4678 const int8_t *filter_y,
4688 int32_t offset, weight;
4690 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4691 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4693 v8i16 filt_h0, filt_h1;
4694 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4695 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
4697 v8i16 filter_vec, weight_vec;
4698 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4699 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4700 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4701 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4702 v8i16 dst98_r, dst109_r;
4703 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4704 v4i32 offset_vec, rnd_vec, const_vec;
4706 src0_ptr -= (src_stride + 1);
4708 filter_vec = LD_SH(filter_x);
4709 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4711 filter_vec = LD_SH(filter_y);
4712 UNPCK_R_SB_SH(filter_vec, filter_vec);
4714 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4718 offset = (offset0 + offset1) << rnd_val;
4719 weight0 = weight0 & 0x0000FFFF;
4720 weight = weight0 | (weight1 << 16);
4722 const_vec = __msa_fill_w((128 * weight1));
4724 offset_vec = __msa_fill_w(offset);
4725 weight_vec = (v8i16) __msa_fill_w(weight);
4726 rnd_vec = __msa_fill_w(rnd_val + 1);
4727 offset_vec += const_vec;
4729 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4730 src0_ptr += (3 * src_stride);
4731 XORI_B3_128_SB(src0, src1, src2);
4733 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
4734 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
4735 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4736 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4737 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4738 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
4740 for (loop_cnt = height >> 3; loop_cnt--;) {
4741 LD_SB8(src0_ptr, src_stride,
4742 src3, src4, src5, src6, src7, src8, src9, src10);
4743 src0_ptr += (8 * src_stride);
4744 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4745 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
4746 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
4747 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
4748 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
4750 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4751 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4752 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4753 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4755 dst32_r = __msa_ilvr_h(dst73, dst22);
4756 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4757 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4758 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4759 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4760 dst76_r = __msa_ilvr_h(dst22, dst106);
4762 LD2(src1_ptr, src2_stride, tp0, tp1);
4763 src1_ptr += 2 * src2_stride;
4764 INSERT_D2_SH(tp0, tp1, in0);
4765 LD2(src1_ptr, src2_stride, tp0, tp1);
4766 src1_ptr += 2 * src2_stride;
4767 INSERT_D2_SH(tp0, tp1, in1);
4769 LD2(src1_ptr, src2_stride, tp0, tp1);
4770 src1_ptr += 2 * src2_stride;
4771 INSERT_D2_SH(tp0, tp1, in2);
4772 LD2(src1_ptr, src2_stride, tp0, tp1);
4773 src1_ptr += 2 * src2_stride;
4774 INSERT_D2_SH(tp0, tp1, in3);
4776 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4777 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4778 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4779 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4780 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4781 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4782 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4783 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4784 SRA_4V(dst0, dst1, dst2, dst3, 6);
4785 SRA_4V(dst4, dst5, dst6, dst7, 6);
4786 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, dst0, dst1,
4787 dst2, dst3);
4788 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4789 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4790 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4791 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4792 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4793 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4794 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4795 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4796 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4797 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4798 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4799 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4800 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4801 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
4802 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
4803 tmp2, tmp3);
4804 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4805 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4806 ST4x8_UB(out0, out1, dst, dst_stride);
4807 dst += (8 * dst_stride);
4809 dst10_r = dst98_r;
4810 dst21_r = dst109_r;
4811 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
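/* Width 4 dispatcher: selects the 4x2, 4x4 or 4xN (N % 8 == 0) variant
 * based on height. */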
4815 static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
4818 int32_t src2_stride,
4821 const int8_t *filter_x,
4822 const int8_t *filter_y,
4830 if (2 == height) {
4831 hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4832 dst, dst_stride, filter_x, filter_y,
4833 weight0, weight1, offset0, offset1, rnd_val);
4834 } else if (4 == height) {
4835 hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4836 dst, dst_stride, filter_x, filter_y,
4837 weight0, weight1, offset0, offset1, rnd_val);
4838 } else if (0 == (height % 8)) {
4839 hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4840 src1_ptr, src2_stride,
4841 dst, dst_stride, filter_x, filter_y,
4842 height, weight0, weight1,
4843 offset0, offset1, rnd_val);
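/* hv 4-tap bi-weighted filtering for 6-column blocks (8 rows): the left
 * 4 columns come from the right-interleaved data and are stored with
 * ST4x8, the remaining 2 columns from the left-interleaved data with two
 * ST2x4 stores at dst + 4. */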
4847 static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
4850 int32_t src2_stride,
4853 const int8_t *filter_x,
4854 const int8_t *filter_y,
4862 uint32_t tpw0, tpw1, tpw2, tpw3;
4864 int32_t offset, weight;
4865 v16u8 out0, out1, out2;
4866 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4867 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4868 v8i16 in4 = { 0 }, in5 = { 0 };
4870 v8i16 filt_h0, filt_h1, filter_vec;
4871 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4872 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4874 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4875 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, weight_vec;
4876 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
4877 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
4878 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
4879 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4880 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4881 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
4882 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4883 v4i32 offset_vec, rnd_vec, const_vec;
4885 src0_ptr -= (src_stride + 1);
4887 filter_vec = LD_SH(filter_x);
4888 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4890 filter_vec = LD_SH(filter_y);
4891 UNPCK_R_SB_SH(filter_vec, filter_vec);
4893 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4897 offset = (offset0 + offset1) << rnd_val;
4898 weight0 = weight0 & 0x0000FFFF;
4899 weight = weight0 | (weight1 << 16);
4901 const_vec = __msa_fill_w((128 * weight1));
4903 offset_vec = __msa_fill_w(offset);
4904 weight_vec = (v8i16) __msa_fill_w(weight);
4905 rnd_vec = __msa_fill_w(rnd_val + 1);
4906 offset_vec += const_vec;
4908 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4909 src0_ptr += (3 * src_stride);
4910 XORI_B3_128_SB(src0, src1, src2);
4912 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4913 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4914 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4915 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4916 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4917 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4919 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4920 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
4922 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
4923 src10);
4924 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4926 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4927 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4928 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4929 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4931 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4932 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4933 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4934 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4936 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4937 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4938 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4939 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4941 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4942 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4943 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4944 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4946 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4947 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4948 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4949 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4950 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4951 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4952 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4953 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
4954 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4955 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4956 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
4958 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4959 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4960 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4961 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4962 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4963 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4964 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4965 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4966 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4967 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4968 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4969 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
4970 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4971 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4972 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4973 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0, dst1);
4974 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst2, dst3);
4976 LD2(src1_ptr, src2_stride, tp0, tp1);
4977 INSERT_D2_SH(tp0, tp1, in0);
4978 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4979 INSERT_D2_SH(tp0, tp1, in1);
4981 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4982 INSERT_D2_SH(tp0, tp1, in2);
4983 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4984 INSERT_D2_SH(tp0, tp1, in3);
4986 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
4987 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
4988 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
4989 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
4990 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
4991 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
4992 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
4993 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
4994 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
4995 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
4996 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
4997 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
4998 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
4999 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5000 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6, tmp0, tmp1,
5001 tmp2, tmp3);
5002 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5003 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5004 ST4x8_UB(out0, out1, dst, dst_stride);
5006 PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
5008 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5009 src1_ptr += (4 * src2_stride);
5010 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
5011 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
5012 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
5014 ILVRL_H2_SH(dst4, in4, tmp0, tmp1);
5015 ILVRL_H2_SH(dst5, in5, tmp2, tmp3);
5017 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5018 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5019 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5020 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5021 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5022 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5024 CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
5025 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5026 ST2x4_UB(out2, 0, dst + 4, dst_stride);
5027 dst += 4 * dst_stride;
5028 ST2x4_UB(out2, 4, dst + 4, dst_stride);
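/* hv 4-tap bi-weighted filtering of an 8x2 block: five source rows give two
 * horizontally then vertically filtered rows, combined with two rows of
 * src1_ptr data before weighting and clipping. */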
5031 static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
5034 int32_t src2_stride,
5037 const int8_t *filter_x,
5038 const int8_t *filter_y,
5045 int32_t weight, offset;
5047 v16i8 src0, src1, src2, src3, src4;
5049 v8i16 filt_h0, filt_h1;
5050 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5052 v8i16 filter_vec, weight_vec;
5053 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5054 v8i16 dst0, dst1, dst2, dst3, dst4;
5056 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
5057 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5058 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5059 v8i16 tmp0, tmp1, tmp2, tmp3;
5060 v4i32 offset_vec, rnd_vec, const_vec;
5062 src0_ptr -= (src_stride + 1);
5064 filter_vec = LD_SH(filter_x);
5065 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5067 filter_vec = LD_SH(filter_y);
5068 UNPCK_R_SB_SH(filter_vec, filter_vec);
5070 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5074 offset = (offset0 + offset1) << rnd_val;
5075 weight0 = weight0 & 0x0000FFFF;
5076 weight = weight0 | (weight1 << 16);
5078 const_vec = __msa_fill_w((128 * weight1));
5080 offset_vec = __msa_fill_w(offset);
5081 weight_vec = (v8i16) __msa_fill_w(weight);
5082 rnd_vec = __msa_fill_w(rnd_val + 1);
5083 offset_vec += const_vec;
5085 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5086 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5088 LD_SH2(src1_ptr, src2_stride, in0, in1);
5090 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5091 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5092 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5093 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5094 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5096 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5097 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5098 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5099 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5100 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5102 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5103 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
5104 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5105 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5106 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5107 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5108 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5109 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5110 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5111 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp1, tmp3);
5113 ILVRL_H2_SH(tmp1, in0, tmp0, tmp1);
5114 ILVRL_H2_SH(tmp3, in1, tmp2, tmp3);
5116 dst0_r = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5117 dst0_l = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5118 dst1_r = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5119 dst1_l = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5120 SRAR_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, rnd_vec);
5121 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
5122 CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
5123 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
5124 ST8x2_UB(out, dst, dst_stride);
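/* hv 4-tap bi-weighted filtering of (width8mult * 8) columns by 4 rows;
 * each 8-column stripe is processed independently inside the cnt loop. */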
5127 static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
5130 int32_t src2_stride,
5133 const int8_t *filter_x,
5134 const int8_t *filter_y,
5142 int32_t weight, offset;
5145 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
5146 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5147 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, weight_vec;
5148 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5149 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, in0, in1, in2, in3;
5150 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5151 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5152 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5153 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5154 v4i32 offset_vec, rnd_vec, const_vec;
5156 src0_ptr -= (src_stride + 1);
5158 filter_vec = LD_SH(filter_x);
5159 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5161 filter_vec = LD_SH(filter_y);
5162 UNPCK_R_SB_SH(filter_vec, filter_vec);
5164 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5166 mask0 = LD_SB(ff_hevc_mask_arr);
5169 offset = (offset0 + offset1) << rnd_val;
5170 weight0 = weight0 & 0x0000FFFF;
5171 weight = weight0 | (weight1 << 16);
5173 const_vec = __msa_fill_w((128 * weight1));
5175 offset_vec = __msa_fill_w(offset);
5176 rnd_vec = __msa_fill_w(rnd_val + 1);
5177 offset_vec += const_vec;
5178 weight_vec = (v8i16) __msa_fill_w(weight);
5180 for (cnt = width8mult; cnt--;) {
5181 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
5183 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
5185 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
5188 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5189 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5190 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5192 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5193 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5194 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5196 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5197 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5199 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5200 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5201 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5202 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5204 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5205 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5206 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5207 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5209 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5210 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5211 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5212 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5214 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5215 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5216 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5217 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5218 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5219 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5220 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5221 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5223 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5224 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5225 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5226 dst3_r, dst0, dst1, dst2, dst3);
5228 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5229 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5230 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5231 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5232 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5233 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5234 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5235 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5236 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5237 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5238 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5239 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5240 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5241 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5242 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5243 tmp0, tmp1, tmp2, tmp3);
5244 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5245 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5246 ST8x4_UB(out0, out1, dst, dst_stride);
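/* hv 4-tap bi-weighted filtering of an 8x6 block: nine source rows give six
 * filtered rows, stored as a group of 4 rows followed by 2 rows. */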
5251 static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
5254 int32_t src2_stride,
5257 const int8_t *filter_x,
5258 const int8_t *filter_y,
5265 uint32_t offset, weight;
5266 v16u8 out0, out1, out2;
5267 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
5269 v8i16 filt_h0, filt_h1;
5270 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5272 v8i16 filter_vec, weight_vec;
5273 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
5274 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
5275 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8;
5276 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5277 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
5278 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5279 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5280 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5281 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5282 v8i16 in0, in1, in2, in3, in4, in5;
5283 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5284 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5285 v4i32 offset_vec, rnd_vec, const_vec;
5287 src0_ptr -= (src_stride + 1);
5289 filter_vec = LD_SH(filter_x);
5290 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5292 filter_vec = LD_SH(filter_y);
5293 UNPCK_R_SB_SH(filter_vec, filter_vec);
5295 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5299 offset = (offset0 + offset1) << rnd_val;
5300 weight0 = weight0 & 0x0000FFFF;
5301 weight = weight0 | (weight1 << 16);
5303 const_vec = __msa_fill_w((128 * weight1));
5305 offset_vec = __msa_fill_w(offset);
5306 weight_vec = (v8i16) __msa_fill_w(weight);
5307 rnd_vec = __msa_fill_w(rnd_val + 1);
5308 offset_vec += const_vec;
5310 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
5311 src0_ptr += (5 * src_stride);
5312 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
5314 XORI_B5_128_SB(src0, src1, src2, src3, src4);
5315 XORI_B4_128_SB(src5, src6, src7, src8);
5317 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5319 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5320 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5321 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5322 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
5323 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
5324 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
5325 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
5326 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
5327 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
5329 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5330 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5331 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5332 dsth3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5333 dsth4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
5334 dsth5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
5335 dsth6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
5336 dsth7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
5337 dsth8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
5339 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5340 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5341 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5342 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5343 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5344 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5345 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
5346 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
5348 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5349 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5350 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5351 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5352 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5353 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5354 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5355 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5356 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5357 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5358 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5359 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5361 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5362 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5363 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
5364 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
5365 dst0, dst1, dst2, dst3);
5367 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5368 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5369 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5370 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5371 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5372 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5373 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5374 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5375 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5376 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5377 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5378 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5379 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5380 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5381 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5382 tmp0, tmp1, tmp2, tmp3);
5383 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5384 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5386 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst0, dst1);
5387 ILVRL_H2_SH(dst0, in4, tmp0, tmp1);
5388 ILVRL_H2_SH(dst1, in5, tmp2, tmp3);
5389 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5390 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5391 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5392 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5393 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5394 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
5395 CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
5396 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
5397 ST8x4_UB(out0, out1, dst, dst_stride);
5398 dst += (4 * dst_stride);
5399 ST8x2_UB(out2, dst, dst_stride);
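/* General hv 4-tap bi-weighted path: width is a multiple of 8 and height a
 * multiple of 4; each 8-column stripe steps 4 rows at a time, reusing the
 * previously computed horizontally filtered rows as vertical-filter
 * history. */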
5402 static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
5405 int32_t src2_stride,
5408 const int8_t *filter_x,
5409 const int8_t *filter_y,
5420 int32_t offset, weight;
5421 uint8_t *src0_ptr_tmp;
5422 int16_t *src1_ptr_tmp;
5425 v16i8 src0, src1, src2, src3, src4, src5, src6;
5426 v8i16 in0, in1, in2, in3;
5428 v8i16 filt_h0, filt_h1;
5429 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
5432 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5433 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
5434 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5435 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5436 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5437 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5438 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l, weight_vec;
5439 v4i32 offset_vec, rnd_vec, const_vec;
5441 src0_ptr -= (src_stride + 1);
5443 filter_vec = LD_SH(filter_x);
5444 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5446 filter_vec = LD_SH(filter_y);
5447 UNPCK_R_SB_SH(filter_vec, filter_vec);
5449 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5453 offset = (offset0 + offset1) << rnd_val;
5454 weight0 = weight0 & 0x0000FFFF;
5455 weight = weight0 | (weight1 << 16);
5457 const_vec = __msa_fill_w((128 * weight1));
5459 offset_vec = __msa_fill_w(offset);
5460 weight_vec = (v8i16) __msa_fill_w(weight);
5461 rnd_vec = __msa_fill_w(rnd_val + 1);
5462 offset_vec += const_vec;
5464 for (cnt = width >> 3; cnt--;) {
5465 src0_ptr_tmp = src0_ptr;
5466 src1_ptr_tmp = src1_ptr;
5469 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5470 src0_ptr_tmp += (3 * src_stride);
5471 XORI_B3_128_SB(src0, src1, src2);
5473 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5474 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5475 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5476 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5477 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5478 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5480 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5481 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5483 for (loop_cnt = height >> 2; loop_cnt--;) {
5484 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5485 src0_ptr_tmp += (4 * src_stride);
5486 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5487 src1_ptr_tmp += (4 * src2_stride);
5488 XORI_B4_128_SB(src3, src4, src5, src6);
5490 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5491 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5492 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5493 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5495 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5496 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5497 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5498 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5500 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5501 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5502 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5503 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5505 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5506 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5507 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5508 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5509 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5510 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5511 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5512 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5514 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5515 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5516 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5517 dst3_r, dst0, dst1, dst2, dst3);
5518 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5519 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5520 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5521 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5522 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5523 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5524 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5525 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5526 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5527 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5528 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5529 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5530 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5531 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5532 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5533 tmp0, tmp1, tmp2, tmp3);
5534 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5535 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5536 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
5537 dst_tmp += (4 * dst_stride);
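/* Width 8 dispatcher: 8x2, 8x4, 8x6 or the generic 8xN (N % 4 == 0)
 * variant, selected by height. */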
5552 static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
5555 int32_t src2_stride,
5558 const int8_t *filter_x,
5559 const int8_t *filter_y,
5567 if (2 == height) {
5568 hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5569 dst, dst_stride, filter_x, filter_y,
5570 weight0, weight1, offset0, offset1, rnd_val);
5571 } else if (4 == height) {
5572 hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5573 src2_stride, dst, dst_stride, filter_x,
5574 filter_y, weight0, weight1, offset0,
5575 offset1, rnd_val, 1);
5576 } else if (6 == height) {
5577 hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5578 dst, dst_stride, filter_x, filter_y,
5579 weight0, weight1, offset0, offset1, rnd_val);
5580 } else if (0 == (height % 4)) {
5581 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5582 src1_ptr, src2_stride,
5583 dst, dst_stride, filter_x, filter_y,
5585 weight1, offset0, offset1, rnd_val, 8);
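/* Width 12: the left 8 columns are filtered in four 4-row iterations using
 * the first mask pair, the remaining 4 columns in two 8-row iterations
 * using the second mask pair from ff_hevc_mask_arr. */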
5589 static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
5592 int32_t src2_stride,
5595 const int8_t *filter_x,
5596 const int8_t *filter_y,
5606 int32_t offset, weight;
5607 uint8_t *src0_ptr_tmp, *dst_tmp;
5608 int16_t *src1_ptr_tmp;
5610 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
5611 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
5612 v16i8 mask0, mask1, mask2, mask3;
5613 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
5614 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
5615 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, weight_vec;
5616 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
5617 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
5618 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
5619 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
5620 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
5621 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5622 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
5623 v4i32 offset_vec, rnd_vec, const_vec;
5625 src0_ptr -= (src_stride + 1);
5627 filter_vec = LD_SH(filter_x);
5628 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
5630 filter_vec = LD_SH(filter_y);
5631 UNPCK_R_SB_SH(filter_vec, filter_vec);
5633 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
5635 mask0 = LD_SB(ff_hevc_mask_arr);
5638 offset = (offset0 + offset1) << rnd_val;
5639 weight0 = weight0 & 0x0000FFFF;
5640 weight = weight0 | (weight1 << 16);
5642 const_vec = __msa_fill_w((128 * weight1));
5644 offset_vec = __msa_fill_w(offset);
5645 rnd_vec = __msa_fill_w(rnd_val + 1);
5646 offset_vec += const_vec;
5647 weight_vec = (v8i16) __msa_fill_w(weight);
5649 src0_ptr_tmp = src0_ptr;
5651 src1_ptr_tmp = src1_ptr;
5653 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5654 src0_ptr_tmp += (3 * src_stride);
5656 XORI_B3_128_SB(src0, src1, src2);
5658 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5659 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5660 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5662 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5663 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5664 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5666 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
5667 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
5669 for (loop_cnt = 4; loop_cnt--;) {
5670 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5671 src0_ptr_tmp += (4 * src_stride);
5672 XORI_B4_128_SB(src3, src4, src5, src6);
5674 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5675 src1_ptr_tmp += (4 * src2_stride);
5677 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5678 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
5679 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
5680 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
5682 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5683 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5684 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5685 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5687 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
5688 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
5689 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
5690 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
5692 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5693 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5694 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5695 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5696 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5697 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5698 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5699 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5701 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
5702 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
5703 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
5704 dst3_r, dst0, dst1, dst2, dst3);
5705 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5706 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5707 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5708 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5709 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5710 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5711 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5712 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5713 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5714 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5715 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5716 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5717 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5718 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5719 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5720 tmp0, tmp1, tmp2, tmp3);
5721 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5722 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5723 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
5724 dst_tmp += (4 * dst_stride);
5737 mask2 = LD_SB(ff_hevc_mask_arr + 16);
5738 mask3 = mask2 + 2;
5740 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5741 src0_ptr += (3 * src_stride);
5742 XORI_B3_128_SB(src0, src1, src2);
5743 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
5744 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
5746 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5747 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5749 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
5750 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
5752 for (loop_cnt = 2; loop_cnt--;) {
5753 LD_SB8(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8, src9,
5754 src10);
5755 src0_ptr += (8 * src_stride);
5756 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
5757 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
5758 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
5759 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
5760 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
5762 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
5763 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
5764 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
5765 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
5767 dst32_r = __msa_ilvr_h(dst73, dst22);
5768 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
5769 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
5770 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
5771 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
5772 dst76_r = __msa_ilvr_h(dst22, dst106);
5774 LD2(src1_ptr, src2_stride, tp0, tp1);
5775 src1_ptr += 2 * src2_stride;
5776 INSERT_D2_SH(tp0, tp1, in0);
5777 LD2(src1_ptr, src2_stride, tp0, tp1);
5778 src1_ptr += 2 * src2_stride;
5779 INSERT_D2_SH(tp0, tp1, in1);
5781 LD2(src1_ptr, src2_stride, tp0, tp1);
5782 src1_ptr += 2 * src2_stride;
5783 INSERT_D2_SH(tp0, tp1, in2);
5784 LD2(src1_ptr, src2_stride, tp0, tp1);
5785 src1_ptr += 2 * src2_stride;
5786 INSERT_D2_SH(tp0, tp1, in3);
5788 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5789 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5790 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5791 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5792 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5793 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5794 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
5795 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
5797 SRA_4V(dst0, dst1, dst2, dst3, 6);
5798 SRA_4V(dst4, dst5, dst6, dst7, 6);
5799 PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5800 dst0, dst1, dst2, dst3);
5801 ILVRL_H2_SH(dst0, in0, tmp0, tmp1);
5802 ILVRL_H2_SH(dst1, in1, tmp2, tmp3);
5803 ILVRL_H2_SH(dst2, in2, tmp4, tmp5);
5804 ILVRL_H2_SH(dst3, in3, tmp6, tmp7);
5805 dst0 = __msa_dpadd_s_w(offset_vec, tmp0, weight_vec);
5806 dst1 = __msa_dpadd_s_w(offset_vec, tmp1, weight_vec);
5807 dst2 = __msa_dpadd_s_w(offset_vec, tmp2, weight_vec);
5808 dst3 = __msa_dpadd_s_w(offset_vec, tmp3, weight_vec);
5809 dst4 = __msa_dpadd_s_w(offset_vec, tmp4, weight_vec);
5810 dst5 = __msa_dpadd_s_w(offset_vec, tmp5, weight_vec);
5811 dst6 = __msa_dpadd_s_w(offset_vec, tmp6, weight_vec);
5812 dst7 = __msa_dpadd_s_w(offset_vec, tmp7, weight_vec);
5813 SRAR_W4_SW(dst0, dst1, dst2, dst3, rnd_vec);
5814 SRAR_W4_SW(dst4, dst5, dst6, dst7, rnd_vec);
5815 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
5816 tmp0, tmp1, tmp2, tmp3);
5817 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
5818 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
5819 ST4x8_UB(out0, out1, dst, dst_stride);
5820 dst += (8 * dst_stride);
5822 dst10_r = dst98_r;
5823 dst21_r = dst109_r;
5824 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
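/* Width 16 dispatcher: height 4 uses the 8multx4 variant with two 8-column
 * stripes, everything else the generic 8multx4mult path. */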
5828 static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
5831 int32_t src2_stride,
5834 const int8_t *filter_x,
5835 const int8_t *filter_y,
5843 if (4 == height) {
5844 hevc_hv_biwgt_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr,
5845 src2_stride, dst, dst_stride, filter_x,
5846 filter_y, weight0, weight1, offset0,
5847 offset1, rnd_val, 2);
5848 } else {
5849 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
5850 src2_stride, dst, dst_stride,
5851 filter_x, filter_y, height, weight0,
5852 weight1, offset0, offset1, rnd_val, 16);
5856 static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
5859 int32_t src2_stride,
5862 const int8_t *filter_x,
5863 const int8_t *filter_y,
5871 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5872 src1_ptr, src2_stride,
5874 filter_x, filter_y, height, weight0,
5875 weight1, offset0, offset1, rnd_val, 24);
5878 static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
5881 int32_t src2_stride,
5884 const int8_t *filter_x,
5885 const int8_t *filter_y,
5893 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5894 src1_ptr, src2_stride,
5896 filter_x, filter_y, height, weight0,
5897 weight1, offset0, offset1, rnd_val, 32);
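/* The wrapper macros below map the ff_hevc_put_hevc_bi_w_* entry points
 * onto the MSA helpers above.  For 8-bit data they implement the HEVC
 * explicit bi-prediction weighting, roughly
 *     dst = clip8((p0 * w0 + p1 * w1 + ((o0 + o1 + 1) << log2Wd)) >> (log2Wd + 1))
 * with log2Wd = denom + 14 - 8; the "+1" rounding term is folded into the
 * helpers' SRAR (rounding shift) step. */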
5900 #define BI_W_MC_COPY(WIDTH) \
5901 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5902 ptrdiff_t dst_stride, \
5904 ptrdiff_t src_stride, \
5905 int16_t *src_16bit, \
5916 int shift = 14 + 1 - 8; \
5917 int log2Wd = denom + shift - 1; \
5919 hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5920 dst, dst_stride, height, \
5921 weight0, weight1, offset0, \
5922 offset1, log2Wd); \
5923 }
5937 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5938 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
5944 int16_t *src_16bit, \
5955 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5956 int log2Wd = denom + 14 - 8; \
5958 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
5959 MAX_PB_SIZE, dst, dst_stride, \
5960 filter, height, weight0, \
5961 weight1, offset0, offset1, \
5962 log2Wd); \
5963 }
5965 BI_W_MC(qpel, h, 4, 8, hz, mx);
5966 BI_W_MC(qpel, h, 8, 8, hz, mx);
5967 BI_W_MC(qpel, h, 12, 8, hz, mx);
5968 BI_W_MC(qpel, h, 16, 8, hz, mx);
5969 BI_W_MC(qpel, h, 24, 8, hz, mx);
5970 BI_W_MC(qpel, h, 32, 8, hz, mx);
5971 BI_W_MC(qpel, h, 48, 8, hz, mx);
5972 BI_W_MC(qpel, h, 64, 8, hz, mx);
5974 BI_W_MC(qpel, v, 4, 8, vt, my);
5975 BI_W_MC(qpel, v, 8, 8, vt, my);
5976 BI_W_MC(qpel, v, 12, 8, vt, my);
5977 BI_W_MC(qpel, v, 16, 8, vt, my);
5978 BI_W_MC(qpel, v, 24, 8, vt, my);
5979 BI_W_MC(qpel, v, 32, 8, vt, my);
5980 BI_W_MC(qpel, v, 48, 8, vt, my);
5981 BI_W_MC(qpel, v, 64, 8, vt, my);
5983 BI_W_MC(epel, h, 4, 4, hz, mx);
5984 BI_W_MC(epel, h, 8, 4, hz, mx);
5985 BI_W_MC(epel, h, 6, 4, hz, mx);
5986 BI_W_MC(epel, h, 12, 4, hz, mx);
5987 BI_W_MC(epel, h, 16, 4, hz, mx);
5988 BI_W_MC(epel, h, 24, 4, hz, mx);
5989 BI_W_MC(epel, h, 32, 4, hz, mx);
5991 BI_W_MC(epel, v, 4, 4, vt, my);
5992 BI_W_MC(epel, v, 8, 4, vt, my);
5993 BI_W_MC(epel, v, 6, 4, vt, my);
5994 BI_W_MC(epel, v, 12, 4, vt, my);
5995 BI_W_MC(epel, v, 16, 4, vt, my);
5996 BI_W_MC(epel, v, 24, 4, vt, my);
5997 BI_W_MC(epel, v, 32, 4, vt, my);
6001 #define BI_W_MC_HV(PEL, WIDTH, TAP) \
6002 void ff_hevc_put_hevc_bi_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
6003 ptrdiff_t dst_stride, \
6005 ptrdiff_t src_stride, \
6006 int16_t *src_16bit, \
6017 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
6018 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
6019 int log2Wd = denom + 14 - 8; \
6021 hevc_hv_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
6022 MAX_PB_SIZE, dst, dst_stride, \
6023 filter_x, filter_y, height, \
6024 weight0, weight1, offset0, \
6025 offset1, log2Wd); \
6026 }
6028 BI_W_MC_HV(qpel, 4, 8);
6029 BI_W_MC_HV(qpel, 8, 8);
6030 BI_W_MC_HV(qpel, 12, 8);
6031 BI_W_MC_HV(qpel, 16, 8);
6032 BI_W_MC_HV(qpel, 24, 8);
6033 BI_W_MC_HV(qpel, 32, 8);
6034 BI_W_MC_HV(qpel, 48, 8);
6035 BI_W_MC_HV(qpel, 64, 8);
6037 BI_W_MC_HV(epel, 4, 4);
6038 BI_W_MC_HV(epel, 8, 4);
6039 BI_W_MC_HV(epel, 6, 4);
6040 BI_W_MC_HV(epel, 12, 4);
6041 BI_W_MC_HV(epel, 16, 4);
6042 BI_W_MC_HV(epel, 24, 4);
6043 BI_W_MC_HV(epel, 32, 4);