2 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/*
 * HEVC_BIW_RND_CLIP2: bi-weighted round-and-clip for two 8x16-bit vectors.
 * Interleaves each input (in*) with its companion intermediate (vec*),
 * multiply-accumulates the packed weight pair 'wgt' onto 'offset' via dpadd,
 * arithmetic-rounds the 32-bit lanes by 'rnd', then clamps to [0, 255].
 * NOTE(review): gapped listing - the original brace/continuation lines of
 * this macro are not shown here, so the visible body is partial.
 */
25 #define HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \
26 out0_r, out1_r, out0_l, out1_l) \
28 ILVR_H2_SW(in0, vec0, in1, vec1, out0_r, out1_r); \
29 ILVL_H2_SW(in0, vec0, in1, vec1, out0_l, out1_l); \
31 out0_r = __msa_dpadd_s_w(offset, (v8i16) out0_r, (v8i16) wgt); \
32 out1_r = __msa_dpadd_s_w(offset, (v8i16) out1_r, (v8i16) wgt); \
33 out0_l = __msa_dpadd_s_w(offset, (v8i16) out0_l, (v8i16) wgt); \
34 out1_l = __msa_dpadd_s_w(offset, (v8i16) out1_l, (v8i16) wgt); \
36 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
38 out0_r = CLIP_SW_0_255(out0_r); \
39 out1_r = CLIP_SW_0_255(out1_r); \
40 out0_l = CLIP_SW_0_255(out0_l); \
41 out1_l = CLIP_SW_0_255(out1_l); \
/*
 * HEVC_BIW_RND_CLIP4: four-vector variant, implemented as two calls to
 * HEVC_BIW_RND_CLIP2 (same weight/round/offset operands for both halves).
 */
44 #define HEVC_BIW_RND_CLIP4(in0, in1, in2, in3, vec0, vec1, vec2, vec3, \
46 out0_r, out1_r, out2_r, out3_r, \
47 out0_l, out1_l, out2_l, out3_l) \
49 HEVC_BIW_RND_CLIP2(in0, in1, vec0, vec1, wgt, rnd, offset, \
50 out0_r, out1_r, out0_l, out1_l) \
51 HEVC_BIW_RND_CLIP2(in2, in3, vec2, vec3, wgt, rnd, offset, \
52 out2_r, out3_r, out2_l, out3_l) \
/*
 * HEVC_BI_RND_CLIP2: un-weighted bi-prediction round-and-clip for two
 * vectors: saturating add of each (vec, in) pair, arithmetic round by
 * 'rnd_val', clamp the 16-bit lanes to [0, 255].
 */
55 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \
57 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
58 SRARI_H2_SH(out0, out1, rnd_val); \
59 CLIP_SH2_0_255(out0, out1); \
/*
 * HEVC_BI_RND_CLIP4: four-vector variant, two HEVC_BI_RND_CLIP2 invocations.
 */
62 #define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \
63 vec0, vec1, vec2, vec3, rnd_val, \
64 out0, out1, out2, out3) \
66 HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \
67 HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/*
 * hevc_biwgt_copy_4w_msa: bi-directional weighted "copy" (no filter) for
 * 4-pixel-wide blocks.  Zero-extended 8-bit pels from src0_ptr are shifted
 * up by 6 to match the 16-bit intermediates read from src1_ptr; the packed
 * weight pair (weight0 | weight1 << 16) is applied with the combined offset
 * ((offset0 + offset1) << rnd_val) and rounding by (rnd_val + 1), and the
 * clamped result is stored 4 bytes per row.  Three paths: height 2,
 * height 4, and height a multiple of 8.
 * NOTE(review): gapped listing - the parameter list, some branch headers,
 * local declarations and closing braces of the original are not shown.
 */
70 static void hevc_biwgt_copy_4w_msa(uint8_t *src0_ptr,
83 int32_t offset, weight;
85 v4i32 weight_vec, offset_vec, rnd_vec;
87 offset = (offset0 + offset1) << rnd_val;
88 weight0 = weight0 & 0x0000FFFF;
89 weight = weight0 | (weight1 << 16);
91 offset_vec = __msa_fill_w(offset);
92 weight_vec = __msa_fill_w(weight);
93 rnd_vec = __msa_fill_w(rnd_val + 1);
/* height == 2 path (its 'if' header line is missing from this listing) */
100 LD_SB2(src0_ptr, src_stride, src0, src1);
101 LD_SH2(src1_ptr, src2_stride, in0, in1);
102 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
103 src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
105 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
108 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
109 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r,
111 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l,
113 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
114 dst0_r = CLIP_SW_0_255(dst0_r);
115 dst0_l = CLIP_SW_0_255(dst0_l);
117 HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
118 ST4x2_UB(dst0_r, dst, dst_stride);
119 } else if (4 == height) {
120 v16i8 src0, src1, src2, src3;
121 v8i16 in0, in1, in2, in3;
123 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
125 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
126 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
127 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
128 ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
129 ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
132 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
133 weight_vec, rnd_vec, offset_vec,
134 dst0_r, dst1_r, dst0_l, dst1_l);
136 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
137 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
138 } else if (0 == height % 8) {
140 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
141 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
142 v8i16 dst0, dst1, dst2, dst3;
143 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
/* 8 rows per iteration; 4-wide rows are packed pairwise into vectors */
145 for (loop_cnt = (height >> 3); loop_cnt--;) {
146 LD_SB8(src0_ptr, src_stride,
147 src0, src1, src2, src3, src4, src5, src6, src7);
148 src0_ptr += (8 * src_stride);
149 LD_SH8(src1_ptr, src2_stride,
150 in0, in1, in2, in3, in4, in5, in6, in7);
151 src1_ptr += (8 * src2_stride);
153 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
154 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
155 ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
156 src0, src1, src2, src3);
157 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
158 dst0, dst1, dst2, dst3);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
161 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
163 weight_vec, rnd_vec, offset_vec,
164 dst0_r, dst1_r, dst2_r, dst3_r,
165 dst0_l, dst1_l, dst2_l, dst3_l);
167 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
168 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
169 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
170 dst += (8 * dst_stride);
/*
 * hevc_biwgt_copy_6w_msa: bi-directional weighted copy for 6-pixel-wide
 * blocks, 4 rows per loop iteration.  Same weight/offset/rounding setup as
 * the 4w variant; ST6x4_UB stores only the 6 leftmost bytes of each row.
 * NOTE(review): gapped listing - parameter list and closing braces of the
 * original are not shown.
 */
175 static void hevc_biwgt_copy_6w_msa(uint8_t *src0_ptr,
189 int32_t offset, weight;
191 v16i8 src0, src1, src2, src3;
192 v8i16 in0, in1, in2, in3;
193 v8i16 dst0, dst1, dst2, dst3;
194 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
195 v4i32 offset_vec, weight_vec, rnd_vec;
197 offset = (offset0 + offset1) << rnd_val;
198 weight0 = weight0 & 0x0000FFFF;
199 weight = weight0 | (weight1 << 16);
201 weight_vec = __msa_fill_w(weight);
202 offset_vec = __msa_fill_w(offset);
203 rnd_vec = __msa_fill_w(rnd_val + 1);
205 for (loop_cnt = (height >> 2); loop_cnt--;) {
206 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
207 src0_ptr += (4 * src_stride);
208 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
209 src1_ptr += (4 * src2_stride);
210 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
211 dst0, dst1, dst2, dst3);
213 SLLI_4V(dst0, dst1, dst2, dst3, 6);
214 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
216 weight_vec, rnd_vec, offset_vec,
217 dst0_r, dst1_r, dst2_r, dst3_r,
218 dst0_l, dst1_l, dst2_l, dst3_l);
220 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
221 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
222 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
223 dst += (4 * dst_stride);
/*
 * hevc_biwgt_copy_8w_msa: bi-directional weighted copy for 8-pixel-wide
 * blocks.  Three paths: height 2, height 6, and height a multiple of 4.
 * Setup is identical to the 4w/6w variants (packed weights, combined
 * offset, rounding by rnd_val + 1).
 * NOTE(review): gapped listing - parameter list, the first branch header
 * and closing braces of the original are not shown.
 */
227 static void hevc_biwgt_copy_8w_msa(uint8_t *src0_ptr,
240 int32_t offset, weight;
242 v4i32 offset_vec, weight_vec, rnd_vec;
244 offset = (offset0 + offset1) << rnd_val;
245 weight0 = weight0 & 0x0000FFFF;
246 weight = weight0 | (weight1 << 16);
248 offset_vec = __msa_fill_w(offset);
249 weight_vec = __msa_fill_w(weight);
250 rnd_vec = __msa_fill_w(rnd_val + 1);
/* height == 2 path (branch header line missing from this listing) */
254 v8i16 in0, in1, dst0, dst1;
255 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
257 LD_SB2(src0_ptr, src_stride, src0, src1);
258 LD_SH2(src1_ptr, src2_stride, in0, in1);
260 ILVR_B2_SH(zero, src0, zero, src1, dst0, dst1);
264 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
265 weight_vec, rnd_vec, offset_vec,
266 dst0_r, dst1_r, dst0_l, dst1_l);
268 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
269 ST8x2_UB(dst0_r, dst, dst_stride);
270 } else if (6 == height) {
271 v16i8 src0, src1, src2, src3, src4, src5;
272 v8i16 in0, in1, in2, in3, in4, in5;
273 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
274 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
275 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
277 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
278 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
279 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
280 dst0, dst1, dst2, dst3);
281 ILVR_B2_SH(zero, src4, zero, src5, dst4, dst5);
283 SLLI_4V(dst0, dst1, dst2, dst3, 6);
/* 6 rows processed as a group of 4 plus a group of 2 */
286 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
288 weight_vec, rnd_vec, offset_vec,
289 dst0_r, dst1_r, dst2_r, dst3_r,
290 dst0_l, dst1_l, dst2_l, dst3_l);
291 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
292 weight_vec, rnd_vec, offset_vec,
293 dst4_r, dst5_r, dst4_l, dst5_l);
295 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
296 dst2_l, dst2_r, dst3_l, dst3_r,
297 dst4_l, dst4_r, dst5_l, dst5_r,
298 dst0_r, dst1_r, dst2_r);
299 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
300 dst += (4 * dst_stride);
301 ST8x2_UB(dst2_r, dst, dst_stride);
302 } else if (0 == height % 4) {
304 v16i8 src0, src1, src2, src3;
305 v8i16 in0, in1, in2, in3;
306 v8i16 dst0, dst1, dst2, dst3;
307 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
309 for (loop_cnt = (height >> 2); loop_cnt--;) {
310 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
311 src0_ptr += (4 * src_stride);
312 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
313 src1_ptr += (4 * src2_stride);
314 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
315 dst0, dst1, dst2, dst3);
317 SLLI_4V(dst0, dst1, dst2, dst3, 6);
318 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
320 weight_vec, rnd_vec, offset_vec,
321 dst0_r, dst1_r, dst2_r, dst3_r,
322 dst0_l, dst1_l, dst2_l, dst3_l);
324 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
325 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
326 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
327 dst += (4 * dst_stride);
/*
 * hevc_biwgt_copy_12w_msa: bi-directional weighted copy for 12-pixel-wide
 * blocks, 4 rows per iteration; each row is handled as left 8 pixels
 * (dst0..dst3) plus right 4 pixels packed pairwise (dst4/dst5).
 * NOTE(review): the loop count is hard-coded as (16 >> 2) rather than
 * (height >> 2), i.e. exactly 16 rows are processed regardless of the
 * 'height' argument.  This is only correct if every caller uses
 * height == 16 for 12-wide blocks - verify against the callers.
 * NOTE(review): gapped listing - parameter list and closing braces of the
 * original are not shown.
 */
332 static void hevc_biwgt_copy_12w_msa(uint8_t *src0_ptr,
346 int32_t offset, weight;
348 v16i8 src0, src1, src2, src3;
349 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
350 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
351 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
352 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
353 v4i32 offset_vec, weight_vec, rnd_vec;
355 offset = (offset0 + offset1) << rnd_val;
356 weight0 = weight0 & 0x0000FFFF;
357 weight = weight0 | (weight1 << 16);
359 offset_vec = __msa_fill_w(offset);
360 weight_vec = __msa_fill_w(weight);
361 rnd_vec = __msa_fill_w(rnd_val + 1);
363 for (loop_cnt = (16 >> 2); loop_cnt--;) {
364 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
365 src0_ptr += (4 * src_stride);
366 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
367 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
368 src1_ptr += (4 * src2_stride);
370 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
371 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
372 dst0, dst1, dst2, dst3);
374 SLLI_4V(dst0, dst1, dst2, dst3, 6);
375 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
376 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
380 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
382 weight_vec, rnd_vec, offset_vec,
383 dst0_r, dst1_r, dst2_r, dst3_r,
384 dst0_l, dst1_l, dst2_l, dst3_l);
385 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
386 weight_vec, rnd_vec, offset_vec,
387 dst4_r, dst5_r, dst4_l, dst5_l);
389 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
390 dst2_l, dst2_r, dst3_l, dst3_r,
391 dst4_l, dst4_r, dst5_l, dst5_r,
392 dst0_r, dst1_r, dst2_r);
393 ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
394 dst += (4 * dst_stride);
/*
 * hevc_biwgt_copy_16multx4mult_msa: bi-directional weighted copy for block
 * widths that are multiples of 16 (the trailing 'width' argument), height a
 * multiple of 4.  Outer loop walks 16-column strips; inner loop processes
 * 4 rows, splitting each 16-byte load into low/high halves (tmp0..3 /
 * tmp4..7) before weighting, rounding and packing.
 * NOTE(review): gapped listing - parameter list, the dst_tmp declaration /
 * per-strip advance and closing braces of the original are not shown.
 */
398 static void hevc_biwgt_copy_16multx4mult_msa(uint8_t *src0_ptr,
412 uint32_t loop_cnt, cnt;
413 uint8_t *src0_ptr_tmp;
414 int16_t *src1_ptr_tmp;
416 int32_t offset, weight;
418 v4i32 offset_vec, weight_vec, rnd_vec;
420 offset = (offset0 + offset1) << rnd_val;
421 weight0 = weight0 & 0x0000FFFF;
422 weight = weight0 | (weight1 << 16);
424 offset_vec = __msa_fill_w(offset);
425 weight_vec = __msa_fill_w(weight);
426 rnd_vec = __msa_fill_w(rnd_val + 1);
428 for (cnt = (width >> 4); cnt--;) {
429 src0_ptr_tmp = src0_ptr;
430 src1_ptr_tmp = src1_ptr;
433 for (loop_cnt = (height >> 2); loop_cnt--;) {
434 v16i8 src0, src1, src2, src3;
435 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
436 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
437 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
438 v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
440 LD_SB4(src0_ptr_tmp, src_stride, src0, src1, src2, src3);
441 src0_ptr_tmp += (4 * src_stride);
442 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
443 LD_SH4(src1_ptr_tmp + 8, src2_stride, in4, in5, in6, in7);
444 src1_ptr_tmp += (4 * src2_stride);
446 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
447 tmp0, tmp1, tmp2, tmp3);
448 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
449 tmp4, tmp5, tmp6, tmp7);
451 SLLI_4V(tmp0, tmp1, tmp2, tmp3, 6);
452 SLLI_4V(tmp4, tmp5, tmp6, tmp7, 6);
/* rows 0-1: low halves (tmp0/tmp1) with high halves (tmp4/tmp5) */
453 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
455 weight_vec, rnd_vec, offset_vec,
456 dst0_r, dst1_r, dst2_r, dst3_r,
457 dst0_l, dst1_l, dst2_l, dst3_l);
459 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
460 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
461 ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
462 dst_tmp += (2 * dst_stride);
/* rows 2-3: remaining halves */
464 HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
466 weight_vec, rnd_vec, offset_vec,
467 dst0_r, dst1_r, dst2_r, dst3_r,
468 dst0_l, dst1_l, dst2_l, dst3_l);
470 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
471 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
472 ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
473 dst_tmp += (2 * dst_stride);
/* hevc_biwgt_copy_16w_msa: thin wrapper - 16-wide case of the 16-multiple
 * helper (width argument fixed at 16). */
482 static void hevc_biwgt_copy_16w_msa(uint8_t *src0_ptr,
495 hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
496 src1_ptr, src2_stride,
497 dst, dst_stride, height, weight0,
498 weight1, offset0, offset1, rnd_val, 16);
/* hevc_biwgt_copy_24w_msa: 24-wide block split as a 16-wide strip (via the
 * 16-multiple helper) plus an 8-wide strip at column offset 16. */
501 static void hevc_biwgt_copy_24w_msa(uint8_t *src0_ptr,
514 hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
515 src1_ptr, src2_stride,
516 dst, dst_stride, height, weight0,
517 weight1, offset0, offset1, rnd_val, 16);
518 hevc_biwgt_copy_8w_msa(src0_ptr + 16, src_stride,
519 src1_ptr + 16, src2_stride,
520 dst + 16, dst_stride, height, weight0,
521 weight1, offset0, offset1, rnd_val);
/* hevc_biwgt_copy_32w_msa: thin wrapper - width 32 via the 16-multiple
 * helper. */
524 static void hevc_biwgt_copy_32w_msa(uint8_t *src0_ptr,
537 hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
538 src1_ptr, src2_stride,
539 dst, dst_stride, height, weight0,
540 weight1, offset0, offset1, rnd_val, 32);
/* hevc_biwgt_copy_48w_msa: thin wrapper - width 48 via the 16-multiple
 * helper. */
543 static void hevc_biwgt_copy_48w_msa(uint8_t *src0_ptr,
556 hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
557 src1_ptr, src2_stride,
558 dst, dst_stride, height, weight0,
559 weight1, offset0, offset1, rnd_val, 48);
/* hevc_biwgt_copy_64w_msa: thin wrapper - width 64 via the 16-multiple
 * helper. */
562 static void hevc_biwgt_copy_64w_msa(uint8_t *src0_ptr,
575 hevc_biwgt_copy_16multx4mult_msa(src0_ptr, src_stride,
576 src1_ptr, src2_stride,
577 dst, dst_stride, height, weight0,
578 weight1, offset0, offset1, rnd_val, 64);
/*
 * hevc_hz_biwgt_8t_4w_msa: horizontal 8-tap filter + bi-directional
 * weighting for 4-pixel-wide blocks, 4 rows per iteration.  Two rows share
 * one shuffle (mask0 crosses the 16-byte lane boundary: indices 16..20
 * select from the second source register).  Filter taps come from
 * 'filter' via SPLATI_H4; weighting follows the common packed-weight /
 * offset / (rnd_val + 1) scheme.
 * NOTE(review): gapped listing - parameter list, src pointer pre-decrement,
 * mask1..mask3 setup, dst0/dst1 initialisation from const_vec and closing
 * braces of the original are not shown.
 */
581 static void hevc_hz_biwgt_8t_4w_msa(uint8_t *src0_ptr,
587 const int8_t *filter,
596 int32_t offset, weight;
597 v8i16 filt0, filt1, filt2, filt3;
598 v16i8 src0, src1, src2, src3;
599 v16i8 mask1, mask2, mask3;
600 v16i8 vec0, vec1, vec2, vec3;
602 v8i16 in0, in1, in2, in3;
603 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
604 v8i16 filter_vec, const_vec;
605 v4i32 weight_vec, offset_vec, rnd_vec;
606 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
609 filter_vec = LD_SH(filter);
610 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
616 offset = (offset0 + offset1) << rnd_val;
617 weight0 = weight0 & 0x0000FFFF;
618 weight = weight0 | (weight1 << 16);
620 const_vec = __msa_ldi_h(128);
622 offset_vec = __msa_fill_w(offset);
623 weight_vec = __msa_fill_w(weight);
624 rnd_vec = __msa_fill_w(rnd_val + 1);
626 for (loop_cnt = (height >> 2); loop_cnt--;) {
627 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
628 src0_ptr += (4 * src_stride);
629 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
630 src1_ptr += (4 * src2_stride);
631 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
632 XORI_B4_128_SB(src0, src1, src2, src3);
634 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
635 vec0, vec1, vec2, vec3);
637 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
638 dst0, dst0, dst0, dst0);
639 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
640 vec0, vec1, vec2, vec3);
642 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
643 dst1, dst1, dst1, dst1);
645 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
646 weight_vec, rnd_vec, offset_vec,
647 dst0_r, dst1_r, dst0_l, dst1_l);
649 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
650 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
651 dst += (4 * dst_stride);
/*
 * hevc_hz_biwgt_8t_8w_msa: horizontal 8-tap filter + bi-directional
 * weighting for 8-pixel-wide blocks, 4 rows per iteration; one shuffle /
 * dot-product chain per row (mask0 stays within a single source register).
 * NOTE(review): gapped listing - parameter list, src pointer pre-decrement,
 * mask1..mask3 setup, dstN initialisation from const_vec and closing
 * braces of the original are not shown.
 */
655 static void hevc_hz_biwgt_8t_8w_msa(uint8_t *src0_ptr,
661 const int8_t *filter,
670 int32_t offset, weight;
671 v8i16 filt0, filt1, filt2, filt3;
672 v16i8 src0, src1, src2, src3;
673 v16i8 mask1, mask2, mask3;
674 v16i8 vec0, vec1, vec2, vec3;
675 v8i16 dst0, dst1, dst2, dst3;
676 v8i16 in0, in1, in2, in3;
677 v4i32 dst0_r, dst1_r, dst0_l, dst1_l, dst2_r, dst3_r, dst2_l, dst3_l;
678 v8i16 filter_vec, const_vec;
679 v4i32 weight_vec, offset_vec, rnd_vec;
680 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
683 offset = (offset0 + offset1) << rnd_val;
684 weight0 = weight0 & 0x0000FFFF;
685 weight = weight0 | (weight1 << 16);
687 const_vec = __msa_ldi_h(128);
689 offset_vec = __msa_fill_w(offset);
690 weight_vec = __msa_fill_w(weight);
691 rnd_vec = __msa_fill_w(rnd_val + 1);
693 filter_vec = LD_SH(filter);
694 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
700 for (loop_cnt = (height >> 2); loop_cnt--;) {
701 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
702 src0_ptr += (4 * src_stride);
703 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
704 src1_ptr += (4 * src2_stride);
705 XORI_B4_128_SB(src0, src1, src2, src3);
707 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
708 vec0, vec1, vec2, vec3);
710 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
711 dst0, dst0, dst0, dst0);
712 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
713 vec0, vec1, vec2, vec3);
715 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
716 dst1, dst1, dst1, dst1);
717 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
718 vec0, vec1, vec2, vec3);
720 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
721 dst2, dst2, dst2, dst2);
722 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
723 vec0, vec1, vec2, vec3);
725 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
726 dst3, dst3, dst3, dst3);
728 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
730 weight_vec, rnd_vec, offset_vec,
731 dst0_r, dst1_r, dst2_r, dst3_r,
732 dst0_l, dst1_l, dst2_l, dst3_l);
734 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
735 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
736 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
737 dst += (4 * dst_stride);
/* hevc_hz_biwgt_8t_12w_msa: 12-wide horizontal 8-tap bi-weighted filter,
 * split as an 8-wide strip plus a 4-wide strip at column offset 8. */
741 static void hevc_hz_biwgt_8t_12w_msa(uint8_t *src0_ptr,
747 const int8_t *filter,
755 hevc_hz_biwgt_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
756 dst, dst_stride, filter, height,
757 weight0, weight1, offset0, offset1, rnd_val);
758 hevc_hz_biwgt_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
759 dst + 8, dst_stride, filter, height,
760 weight0, weight1, offset0, offset1, rnd_val);
/*
 * hevc_hz_biwgt_8t_16w_msa: horizontal 8-tap filter + bi-directional
 * weighting for 16-pixel-wide blocks, 2 rows per iteration; each row is
 * loaded as two 8-byte-offset registers and filtered independently.
 * NOTE(review): gapped listing - parameter list, src pointer
 * pre-decrement, mask1..mask3 setup, dstN initialisation from const_vec
 * and closing braces of the original are not shown.
 */
763 static void hevc_hz_biwgt_8t_16w_msa(uint8_t *src0_ptr,
769 const int8_t *filter,
778 int32_t offset, weight;
779 v16i8 src0, src1, src2, src3;
780 v8i16 in0, in1, in2, in3;
781 v8i16 filt0, filt1, filt2, filt3;
782 v16i8 mask1, mask2, mask3;
783 v8i16 filter_vec, const_vec;
784 v16i8 vec0, vec1, vec2, vec3;
785 v8i16 dst0, dst1, dst2, dst3;
786 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
787 v4i32 weight_vec, offset_vec, rnd_vec;
788 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
791 offset = (offset0 + offset1) << rnd_val;
792 weight0 = weight0 & 0x0000FFFF;
793 weight = weight0 | (weight1 << 16);
795 const_vec = __msa_ldi_h(128);
797 offset_vec = __msa_fill_w(offset);
798 weight_vec = __msa_fill_w(weight);
799 rnd_vec = __msa_fill_w(rnd_val + 1);
801 filter_vec = LD_SH(filter);
802 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
808 for (loop_cnt = (height >> 1); loop_cnt--;) {
809 LD_SB2(src0_ptr, 8, src0, src1);
810 src0_ptr += src_stride;
811 LD_SB2(src0_ptr, 8, src2, src3);
812 src0_ptr += src_stride;
813 LD_SH2(src1_ptr, 8, in0, in1);
814 src1_ptr += src2_stride;
815 LD_SH2(src1_ptr, 8, in2, in3);
816 src1_ptr += src2_stride;
817 XORI_B4_128_SB(src0, src1, src2, src3);
819 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
820 vec0, vec1, vec2, vec3);
822 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
823 dst0, dst0, dst0, dst0);
824 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
825 vec0, vec1, vec2, vec3);
827 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
828 dst1, dst1, dst1, dst1);
829 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
830 vec0, vec1, vec2, vec3);
832 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
833 dst2, dst2, dst2, dst2);
834 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
835 vec0, vec1, vec2, vec3);
837 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
838 dst3, dst3, dst3, dst3);
840 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
842 weight_vec, rnd_vec, offset_vec,
843 dst0_r, dst1_r, dst2_r, dst3_r,
844 dst0_l, dst1_l, dst2_l, dst3_l);
846 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
847 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
848 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
849 dst += (2 * dst_stride);
/*
 * hevc_hz_biwgt_8t_24w_msa: horizontal 8-tap filter + bi-directional
 * weighting for 24-pixel-wide blocks, one row per iteration.  Pixels
 * 0-7 and 16-23 use the in-register mask0 shuffle; pixels 8-15 straddle
 * the two 16-byte loads and use the cross-register masks mask4..mask7.
 * The third 8-pixel group (dst2/in2) is weighted inline rather than via
 * the CLIP2 macro.
 * NOTE(review): gapped listing - parameter list, mask1..mask7 setup,
 * dstN initialisation from const_vec, the weight_vec operands of the two
 * inline dpadds, the first 16-byte store and closing braces of the
 * original are not shown.
 */
853 static void hevc_hz_biwgt_8t_24w_msa(uint8_t *src0_ptr,
859 const int8_t *filter,
869 int32_t offset, weight;
872 v8i16 filt0, filt1, filt2, filt3;
873 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
874 v16i8 vec0, vec1, vec2, vec3;
875 v8i16 dst0, dst1, dst2;
876 v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
877 v8i16 filter_vec, const_vec;
878 v4i32 weight_vec, offset_vec, rnd_vec;
879 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
881 src0_ptr = src0_ptr - 3;
882 offset = (offset0 + offset1) << rnd_val;
883 weight0 = weight0 & 0x0000FFFF;
884 weight = weight0 | (weight1 << 16);
886 const_vec = __msa_ldi_h(128);
888 offset_vec = __msa_fill_w(offset);
889 weight_vec = __msa_fill_w(weight);
890 rnd_vec = __msa_fill_w(rnd_val + 1);
892 filter_vec = LD_SH(filter);
893 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
903 for (loop_cnt = height; loop_cnt--;) {
904 LD_SB2(src0_ptr, 16, src0, src1);
905 src0_ptr += src_stride;
906 LD_SH2(src1_ptr, 8, in0, in1);
907 in2 = LD_SH(src1_ptr + 16);
908 src1_ptr += src2_stride;
909 XORI_B2_128_SB(src0, src1);
911 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
912 vec0, vec1, vec2, vec3);
914 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
915 dst0, dst0, dst0, dst0);
916 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
917 vec0, vec1, vec2, vec3);
919 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
920 dst1, dst1, dst1, dst1);
921 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
922 vec0, vec1, vec2, vec3);
924 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
925 dst2, dst2, dst2, dst2);
927 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
928 weight_vec, rnd_vec, offset_vec,
929 dst0_r, dst1_r, dst0_l, dst1_l);
931 ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
932 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
934 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
936 SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
937 dst2_r = CLIP_SW_0_255(dst2_r);
938 dst2_l = CLIP_SW_0_255(dst2_l);
940 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
941 HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
942 dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
944 SD(dst_val0, dst + 16);
/*
 * hevc_hz_biwgt_8t_32w_msa: horizontal 8-tap filter + bi-directional
 * weighting for 32-pixel-wide blocks, one row per iteration.  Loads two
 * aligned 16-byte registers plus an extra register at +24 so the fourth
 * 8-pixel group can use the in-register mask0 shuffle; the second group
 * straddles src0/src1 via mask4..mask7.
 * NOTE(review): gapped listing - parameter list, src pointer
 * pre-decrement, mask1..mask7 setup, dstN initialisation from const_vec,
 * the dst advance at loop end and closing braces of the original are not
 * shown.
 */
949 static void hevc_hz_biwgt_8t_32w_msa(uint8_t *src0_ptr,
955 const int8_t *filter,
964 int32_t offset, weight;
965 v16i8 src0, src1, src2;
966 v8i16 in0, in1, in2, in3;
967 v8i16 filt0, filt1, filt2, filt3;
968 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
969 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
970 v16i8 vec0, vec1, vec2, vec3;
971 v8i16 dst0, dst1, dst2, dst3;
972 v8i16 filter_vec, const_vec;
973 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
974 v4i32 weight_vec, offset_vec, rnd_vec;
977 offset = (offset0 + offset1) << rnd_val;
978 weight0 = weight0 & 0x0000FFFF;
979 weight = weight0 | (weight1 << 16);
981 const_vec = __msa_ldi_h(128);
983 offset_vec = __msa_fill_w(offset);
984 weight_vec = __msa_fill_w(weight);
985 rnd_vec = __msa_fill_w(rnd_val + 1);
987 filter_vec = LD_SH(filter);
988 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
998 for (loop_cnt = height; loop_cnt--;) {
999 LD_SB2(src0_ptr, 16, src0, src1);
1000 src2 = LD_SB(src0_ptr + 24);
1001 src0_ptr += src_stride;
1002 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1003 src1_ptr += src2_stride;
1005 XORI_B3_128_SB(src0, src1, src2);
1007 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1008 vec0, vec1, vec2, vec3);
1010 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1011 dst0, dst0, dst0, dst0);
1012 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1013 vec0, vec1, vec2, vec3);
1015 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1016 dst1, dst1, dst1, dst1);
1017 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1018 vec0, vec1, vec2, vec3);
1020 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1021 dst2, dst2, dst2, dst2);
1022 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1023 vec0, vec1, vec2, vec3);
1025 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1026 dst3, dst3, dst3, dst3);
1028 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1030 weight_vec, rnd_vec, offset_vec,
1031 dst0_r, dst1_r, dst2_r, dst3_r,
1032 dst0_l, dst1_l, dst2_l, dst3_l);
1034 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1035 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
1036 ST_SW2(dst0_r, dst1_r, dst, 16);
/*
 * hevc_hz_biwgt_8t_48w_msa: horizontal 8-tap filter + bi-directional
 * weighting for 48-pixel-wide blocks, one row per iteration.  Six 8-pixel
 * groups per row: in-register shuffles (mask0..mask3) for aligned groups,
 * cross-register shuffles (mask4..mask7) for the straddling ones; an
 * extra load at +40 covers the final group.  Groups dst2 and dst5 are
 * weighted inline rather than via the CLIP2 macro.
 * NOTE(review): gapped listing - parameter list, src pointer
 * pre-decrement, mask1..mask7 setup, dstN initialisation from const_vec,
 * the first 16-byte store, the dst advance and closing braces of the
 * original are not shown.
 */
1041 static void hevc_hz_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1044 int32_t src2_stride,
1047 const int8_t *filter,
1056 int32_t offset, weight;
1058 v16i8 src0, src1, src2, src3;
1059 v8i16 in0, in1, in2, in3, in4, in5;
1060 v8i16 filt0, filt1, filt2, filt3;
1061 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1062 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1063 v16i8 vec0, vec1, vec2, vec3;
1064 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1065 v8i16 filter_vec, const_vec;
1066 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
1067 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
1068 v4i32 weight_vec, offset_vec, rnd_vec;
1071 offset = (offset0 + offset1) << rnd_val;
1072 weight0 = weight0 & 0x0000FFFF;
1073 weight = weight0 | (weight1 << 16);
1075 const_vec = __msa_ldi_h(128);
1077 offset_vec = __msa_fill_w(offset);
1078 weight_vec = __msa_fill_w(weight);
1079 rnd_vec = __msa_fill_w(rnd_val + 1);
1081 filter_vec = LD_SH(filter);
1082 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1092 for (loop_cnt = height; loop_cnt--;) {
1093 LD_SB3(src0_ptr, 16, src0, src1, src2);
1094 src3 = LD_SB(src0_ptr + 40);
1095 src0_ptr += src_stride;
1096 LD_SH2(src1_ptr, 8, in0, in1);
1097 in2 = LD_SH(src1_ptr + 16);
1098 XORI_B4_128_SB(src0, src1, src2, src3);
1100 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1101 vec0, vec1, vec2, vec3);
1103 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1104 dst0, dst0, dst0, dst0);
1105 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1106 vec0, vec1, vec2, vec3);
1108 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1109 dst1, dst1, dst1, dst1);
1110 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1111 vec0, vec1, vec2, vec3);
1113 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1114 dst2, dst2, dst2, dst2);
1115 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
1116 vec0, vec1, vec2, vec3);
1118 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1119 dst3, dst3, dst3, dst3);
1120 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1121 vec0, vec1, vec2, vec3);
1123 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1124 dst4, dst4, dst4, dst4);
1125 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1126 vec0, vec1, vec2, vec3);
1128 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1129 dst5, dst5, dst5, dst5);
/* groups 0-2 (pixels 0-23): weight, round, clip, store low 24 bytes */
1131 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
1132 weight_vec, rnd_vec, offset_vec,
1133 dst0_r, dst1_r, dst0_l, dst1_l);
1135 ILVRL_H2_SW(dst2, in2, dst2_r, dst2_l);
1136 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1137 (v8i16) weight_vec);
1138 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1139 (v8i16) weight_vec);
1140 SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1141 dst2_r = CLIP_SW_0_255(dst2_r);
1142 dst2_l = CLIP_SW_0_255(dst2_l);
1144 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
1145 HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
1146 dst_val0 = __msa_copy_u_d((v2i64) dst2_r, 0);
1148 SD(dst_val0, dst + 16);
/* groups 3-5 (pixels 24-47): second batch of intermediates and stores */
1150 LD_SH2(src1_ptr + 24, 8, in3, in4);
1151 in5 = LD_SH(src1_ptr + 40);
1152 src1_ptr += src2_stride;
1154 HEVC_BIW_RND_CLIP2(dst3, dst4, in3, in4,
1155 weight_vec, rnd_vec, offset_vec,
1156 dst3_r, dst4_r, dst3_l, dst4_l);
1158 ILVRL_H2_SW(dst5, in5, dst5_r, dst5_l);
1159 dst5_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_r,
1160 (v8i16) weight_vec);
1161 dst5_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst5_l,
1162 (v8i16) weight_vec);
1163 SRAR_W2_SW(dst5_r, dst5_l, rnd_vec);
1164 dst5_r = CLIP_SW_0_255(dst5_r);
1165 dst5_l = CLIP_SW_0_255(dst5_l);
1167 HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
1168 HEVC_PCK_SW_SB2(dst3_l, dst3_r, dst3_r);
1169 dst_val0 = __msa_copy_u_d((v2i64) dst3_r, 0);
1170 SD(dst_val0, dst + 24);
1171 ST_SW(dst4_r, dst + 32);
/*
 * hevc_hz_biwgt_8t_64w_msa: horizontal 8-tap filter + bi-directional
 * weighting for 64-pixel-wide blocks, one row per iteration; the row is
 * covered by running the 32-wide inner scheme twice (cnt = 2), with
 * temporary pointers advanced across the two halves.
 * NOTE(review): gapped listing - parameter list, dst_tmp declaration and
 * per-half pointer advances, mask1..mask7 setup, dstN initialisation from
 * const_vec and closing braces of the original are not shown.
 */
1176 static void hevc_hz_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1179 int32_t src2_stride,
1182 const int8_t *filter,
1190 uint8_t *src0_ptr_tmp;
1192 int16_t *src1_ptr_tmp;
1193 uint32_t loop_cnt, cnt;
1194 int32_t offset, weight;
1195 v16i8 src0, src1, src2;
1196 v8i16 in0, in1, in2, in3;
1197 v8i16 filt0, filt1, filt2, filt3;
1198 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1199 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1200 v16i8 vec0, vec1, vec2, vec3;
1201 v8i16 dst0, dst1, dst2, dst3;
1202 v8i16 filter_vec, const_vec;
1203 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1204 v4i32 weight_vec, offset_vec, rnd_vec;
1207 offset = (offset0 + offset1) << rnd_val;
1208 weight0 = weight0 & 0x0000FFFF;
1209 weight = weight0 | (weight1 << 16);
1211 const_vec = __msa_ldi_h(128);
1213 offset_vec = __msa_fill_w(offset);
1214 weight_vec = __msa_fill_w(weight);
1215 rnd_vec = __msa_fill_w(rnd_val + 1);
1217 filter_vec = LD_SH(filter);
1218 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1228 for (loop_cnt = height; loop_cnt--;) {
1229 src0_ptr_tmp = src0_ptr;
1231 src1_ptr_tmp = src1_ptr;
/* two 32-pixel halves per 64-wide row */
1233 for (cnt = 2; cnt--;) {
1234 LD_SB2(src0_ptr_tmp, 16, src0, src1);
1235 src2 = LD_SB(src0_ptr_tmp + 24);
1237 LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1239 XORI_B3_128_SB(src0, src1, src2);
1241 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1242 vec0, vec1, vec2, vec3);
1244 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1245 dst0, dst0, dst0, dst0);
1246 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1247 vec0, vec1, vec2, vec3);
1249 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1250 dst1, dst1, dst1, dst1);
1251 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1252 vec0, vec1, vec2, vec3);
1254 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1255 dst2, dst2, dst2, dst2);
1256 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1257 vec0, vec1, vec2, vec3);
1259 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1260 dst3, dst3, dst3, dst3);
1262 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
1264 weight_vec, rnd_vec, offset_vec,
1265 dst0_r, dst1_r, dst2_r, dst3_r,
1266 dst0_l, dst1_l, dst2_l, dst3_l);
1268 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1269 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
1270 ST_SW2(dst0_r, dst1_r, dst_tmp, 16);
1274 src0_ptr += src_stride;
1275 src1_ptr += src2_stride;
/* Vertical 8-tap bi-weighted HEVC luma MC, 4-pixel-wide block.
 * Filters src0_ptr (reference pixels) vertically with the 8-tap filter,
 * blends the result against the co-located 16-bit values from src1_ptr
 * using the packed weight pair (see below), then rounds, clips to
 * [0, 255] and stores 4x8 output pixels per loop iteration.
 * NOTE(review): processes 8 rows per iteration (height >> 3), so height
 * is presumably a multiple of 8 — confirm at call sites. */
1281 static void hevc_vt_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1284 int32_t src2_stride,
1287 const int8_t *filter,
1296 int32_t offset, weight;
1297 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1298 v16i8 src11, src12, src13, src14;
1299 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1300 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1301 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1302 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1303 v16i8 src2110, src4332, src6554, src8776, src10998;
1304 v16i8 src12111110, src14131312;
1305 v8i16 dst10, dst32, dst54, dst76;
1306 v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
1307 v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
1308 v8i16 filt0, filt1, filt2, filt3;
1309 v8i16 filter_vec, const_vec;
1310 v4i32 weight_vec, offset_vec, rnd_vec;
/* back up 3 rows: the 8-tap filter needs 3 rows of context above */
1312 src0_ptr -= (3 * src_stride);
/* combined offset, pre-scaled so it survives the final right shift */
1313 offset = (offset0 + offset1) << rnd_val;
1314 weight0 = weight0 & 0x0000FFFF;
/* pack both weights into one 32-bit lane: dpadd_s_w then computes
 * in*weight0 + dst*weight1 in a single dot-product step */
1315 weight = weight0 | (weight1 << 16);
1317 const_vec = __msa_ldi_h(128);
1318 offset_vec = __msa_fill_w(offset);
1320 weight_vec = __msa_fill_w(weight);
/* +1: bi-prediction halves the sum of the two weighted terms */
1321 rnd_vec = __msa_fill_w(rnd_val + 1);
1323 filter_vec = LD_SH(filter);
1324 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: the first 7 rows are needed before any output row exists */
1326 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1327 src0_ptr += (7 * src_stride);
/* interleave adjacent rows, then pair two 4-wide rows per vector */
1329 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1330 src10_r, src32_r, src54_r, src21_r);
1331 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1332 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1333 src2110, src4332, src6554);
1334 XORI_B3_128_SB(src2110, src4332, src6554);
1336 for (loop_cnt = (height >> 3); loop_cnt--;) {
1337 LD_SB8(src0_ptr, src_stride,
1338 src7, src8, src9, src10, src11, src12, src13, src14);
1339 src0_ptr += (8 * src_stride);
1340 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1341 src1_ptr += (8 * src2_stride);
1343 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1344 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1345 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1346 src76_r, src87_r, src98_r, src109_r);
1347 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1348 src1110_r, src1211_r, src1312_r, src1413_r);
1349 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1350 src1413_r, src1312_r,
1351 src8776, src10998, src12111110, src14131312);
1352 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
/* four 8-tap dot products, each covering two output rows */
1355 DPADD_SB4_SH(src2110, src4332, src6554, src8776, filt0, filt1,
1356 filt2, filt3, dst10, dst10, dst10, dst10);
1358 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1359 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1361 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1362 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1364 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1365 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
/* weight, round, clip both prediction sources together */
1367 HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
1369 weight_vec, rnd_vec, offset_vec,
1370 dst10_r, dst32_r, dst54_r, dst76_r,
1371 dst10_l, dst32_l, dst54_l, dst76_l);
1373 HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
1374 dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
1375 ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
1376 dst += (8 * dst_stride);
/* carry the last interleaved rows over as context for the next 8 rows */
1379 src4332 = src12111110;
1380 src6554 = src14131312;
/* Vertical 8-tap bi-weighted HEVC luma MC, 8-pixel-wide block.
 * Same scheme as the 4w variant but one full 8-wide row per vector;
 * processes 4 rows per loop iteration (height >> 2). */
1385 static void hevc_vt_biwgt_8t_8w_msa(uint8_t *src0_ptr,
1388 int32_t src2_stride,
1391 const int8_t *filter,
1400 int32_t offset, weight;
1401 v16i8 src0, src1, src2, src3, src4, src5;
1402 v16i8 src6, src7, src8, src9, src10;
1403 v8i16 in0, in1, in2, in3;
1404 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1405 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1406 v8i16 tmp0, tmp1, tmp2, tmp3;
1407 v8i16 filt0, filt1, filt2, filt3;
1408 v8i16 filter_vec, const_vec;
1409 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1410 v4i32 weight_vec, offset_vec, rnd_vec;
/* back up 3 rows for the 8-tap filter's upper context */
1412 src0_ptr -= (3 * src_stride);
1413 offset = (offset0 + offset1) << rnd_val;
1414 weight0 = weight0 & 0x0000FFFF;
/* pack (weight0, weight1) into one 32-bit lane for dpadd_s_w */
1415 weight = weight0 | (weight1 << 16);
1417 const_vec = __msa_ldi_h(128);
1419 offset_vec = __msa_fill_w(offset);
1420 weight_vec = __msa_fill_w(weight);
1421 rnd_vec = __msa_fill_w(rnd_val + 1);
1423 filter_vec = LD_SH(filter);
1424 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 context rows, converted to signed range via xor 128 */
1426 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1427 src0_ptr += (7 * src_stride);
1428 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1430 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1431 src10_r, src32_r, src54_r, src21_r);
1432 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1434 for (loop_cnt = (height >> 2); loop_cnt--;) {
1435 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1436 src0_ptr += (4 * src_stride);
1437 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1438 src1_ptr += (4 * src2_stride);
1440 XORI_B4_128_SB(src7, src8, src9, src10);
1441 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1442 src76_r, src87_r, src98_r, src109_r);
/* one 8-tap vertical dot product per output row */
1445 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1446 filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
1448 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1449 filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
1451 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1452 filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
1454 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1455 filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
/* bi-weight against src1_ptr data, round, clip, pack, store 8x4 */
1457 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1459 weight_vec, rnd_vec, offset_vec,
1460 dst0_r, dst1_r, dst2_r, dst3_r,
1461 dst0_l, dst1_l, dst2_l, dst3_l);
1463 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
1464 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
1465 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
1466 dst += (4 * dst_stride);
/* Vertical 8-tap bi-weighted HEVC luma MC, 12-pixel-wide block.
 * Columns 0-7 are handled like the 8w case (right-interleaved *_r
 * vectors); columns 8-11 of two consecutive rows are packed into one
 * vector (left-interleaved halves, ilvr_d-combined) and filtered as a
 * third lane. Two rows are produced per loop iteration. */
1478 static void hevc_vt_biwgt_8t_12w_msa(uint8_t *src0_ptr,
1481 int32_t src2_stride,
1484 const int8_t *filter,
1493 int32_t offset, weight;
1494 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1495 v8i16 in0, in1, in2, in3;
1496 v16i8 src10_r, src32_r, src54_r, src76_r;
1497 v16i8 src21_r, src43_r, src65_r, src87_r;
1498 v8i16 tmp0, tmp1, tmp2;
1499 v16i8 src10_l, src32_l, src54_l, src76_l;
1500 v16i8 src21_l, src43_l, src65_l, src87_l;
1501 v16i8 src2110, src4332, src6554, src8776;
1502 v8i16 filt0, filt1, filt2, filt3;
1503 v8i16 filter_vec, const_vec;
1504 v4i32 dst0_r, dst1_r, dst2_r, dst0_l, dst1_l, dst2_l;
1505 v4i32 weight_vec, offset_vec, rnd_vec;
1507 src0_ptr -= (3 * src_stride);
1508 offset = (offset0 + offset1) << rnd_val;
1509 weight0 = weight0 & 0x0000FFFF;
/* packed (weight0, weight1) pair consumed by dpadd_s_w */
1510 weight = weight0 | (weight1 << 16);
1512 const_vec = __msa_ldi_h(128);
1514 offset_vec = __msa_fill_w(offset);
1515 weight_vec = __msa_fill_w(weight);
1516 rnd_vec = __msa_fill_w(rnd_val + 1);
1518 filter_vec = LD_SH(filter);
1519 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 context rows */
1521 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1522 src0_ptr += (7 * src_stride);
1523 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* right halves -> columns 0-7; left halves paired -> columns 8-11 */
1525 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1526 src10_r, src32_r, src54_r, src21_r);
1527 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1528 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1529 src10_l, src32_l, src54_l, src21_l);
1530 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1531 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1532 src2110, src4332, src6554);
1534 for (loop_cnt = (height >> 1); loop_cnt--;) {
1535 LD_SB2(src0_ptr, src_stride, src7, src8);
1536 src0_ptr += (2 * src_stride);
1537 LD_SH2(src1_ptr, src2_stride, in0, in1);
1538 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
1539 src1_ptr += (2 * src2_stride);
/* merge the two rows' column 8-11 residues into one vector */
1540 in2 = (v8i16) __msa_ilvr_d((v2i64) in3, (v2i64) in2);
1541 XORI_B2_128_SB(src7, src8);
1543 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1544 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1545 src8776 = (v16i8) __msa_ilvr_d((v2i64) src87_l, (v2i64) src76_l);
/* tmp0/tmp1: rows 0/1 of cols 0-7; tmp2: both rows of cols 8-11 */
1548 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1549 filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
1551 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1552 filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
1554 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1555 filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
1557 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
1558 weight_vec, rnd_vec, offset_vec,
1559 dst0_r, dst1_r, dst0_l, dst1_l);
/* cols 8-11 weighted/rounded/clipped inline (macro handles pairs only) */
1561 ILVRL_H2_SW(tmp2, in2, dst2_r, dst2_l);
1562 dst2_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_r,
1563 (v8i16) weight_vec);
1564 dst2_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst2_l,
1565 (v8i16) weight_vec);
1566 SRAR_W2_SW(dst2_r, dst2_l, rnd_vec);
1567 dst2_r = CLIP_SW_0_255(dst2_r);
1568 dst2_l = CLIP_SW_0_255(dst2_l);
/* store 8 left pixels per row, then the 4 remaining right pixels */
1570 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
1571 HEVC_PCK_SW_SB2(dst2_l, dst2_r, dst2_r);
1572 ST8x2_UB(dst0_r, dst, dst_stride);
1573 ST4x2_UB(dst2_r, dst + 8, dst_stride);
1574 dst += (2 * dst_stride);
/* Vertical 8-tap bi-weighted HEVC luma MC for widths that are a
 * multiple of 16. Outer loop walks the block in 16-column stripes
 * (width >> 4); inner loop produces 2 rows per iteration, splitting
 * each 16-wide row into right (cols 0-7) and left (cols 8-15)
 * interleaved halves. Shared worker for the 16/24/32/48/64 wrappers. */
1589 static void hevc_vt_biwgt_8t_16multx2mult_msa(uint8_t *src0_ptr,
1592 int32_t src2_stride,
1595 const int8_t *filter,
1604 uint8_t *src0_ptr_tmp;
1605 int16_t *src1_ptr_tmp;
1607 uint32_t loop_cnt, cnt;
1608 int32_t offset, weight;
1609 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1610 v8i16 in0, in1, in2, in3;
1611 v16i8 src10_r, src32_r, src54_r, src76_r;
1612 v16i8 src21_r, src43_r, src65_r, src87_r;
1613 v16i8 src10_l, src32_l, src54_l, src76_l;
1614 v16i8 src21_l, src43_l, src65_l, src87_l;
1615 v8i16 tmp0, tmp1, tmp2, tmp3;
1616 v8i16 filt0, filt1, filt2, filt3;
1617 v8i16 filter_vec, const_vec;
1618 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
1619 v4i32 weight_vec, offset_vec, rnd_vec;
1621 src0_ptr -= (3 * src_stride);
1623 offset = (offset0 + offset1) << rnd_val;
1624 weight0 = weight0 & 0x0000FFFF;
/* packed (weight0, weight1) pair consumed by dpadd_s_w */
1625 weight = weight0 | (weight1 << 16);
1627 const_vec = __msa_ldi_h(128);
1629 offset_vec = __msa_fill_w(offset);
1630 weight_vec = __msa_fill_w(weight);
1631 rnd_vec = __msa_fill_w(rnd_val + 1);
1633 filter_vec = LD_SH(filter);
1634 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* one pass of the whole height per 16-column stripe */
1636 for (cnt = (width >> 4); cnt--;) {
1637 src0_ptr_tmp = src0_ptr;
1638 src1_ptr_tmp = src1_ptr;
/* prologue: 7 context rows for this stripe */
1641 LD_SB7(src0_ptr_tmp, src_stride,
1642 src0, src1, src2, src3, src4, src5, src6);
1643 src0_ptr_tmp += (7 * src_stride);
1645 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1646 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1647 src10_r, src32_r, src54_r, src21_r);
1648 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1649 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1650 src10_l, src32_l, src54_l, src21_l);
1651 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1653 for (loop_cnt = (height >> 1); loop_cnt--;) {
1654 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1655 src0_ptr_tmp += (2 * src_stride);
1656 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1657 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1658 src1_ptr_tmp += (2 * src2_stride);
1660 XORI_B2_128_SB(src7, src8);
1661 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1662 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* tmp0/tmp1: rows 0/1 cols 0-7; tmp2/tmp3: rows 0/1 cols 8-15 */
1665 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1666 filt0, filt1, filt2, filt3, tmp0, tmp0, tmp0, tmp0);
1668 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1669 filt0, filt1, filt2, filt3, tmp1, tmp1, tmp1, tmp1);
1671 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1672 filt0, filt1, filt2, filt3, tmp2, tmp2, tmp2, tmp2);
1674 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1675 filt0, filt1, filt2, filt3, tmp3, tmp3, tmp3, tmp3);
1677 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
1679 weight_vec, rnd_vec, offset_vec,
1680 dst0_r, dst1_r, dst2_r, dst3_r,
1681 dst0_l, dst1_l, dst2_l, dst3_l);
/* pack so dst0_r = row 0 (16 px), dst1_r = row 1 (16 px) */
1683 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
1684 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
1685 ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
1686 dst_tmp += (2 * dst_stride);
/* 16-wide vertical bi-weighted MC: thin wrapper over the
 * 16-column-multiple worker with width = 16 (final argument elided
 * in this excerpt — presumably 16; confirm in the full source). */
1709 static void hevc_vt_biwgt_8t_16w_msa(uint8_t *src0_ptr,
1712 int32_t src2_stride,
1715 const int8_t *filter,
1723 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1724 src1_ptr, src2_stride,
1725 dst, dst_stride, filter, height,
1726 weight0, weight1, offset0, offset1,
/* 24-wide vertical bi-weighted MC: 16-wide worker for columns 0-15
 * plus the 8w kernel for columns 16-23 (pointers advanced by 16). */
1730 static void hevc_vt_biwgt_8t_24w_msa(uint8_t *src0_ptr,
1733 int32_t src2_stride,
1736 const int8_t *filter,
1744 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1745 src1_ptr, src2_stride,
1746 dst, dst_stride, filter, height,
1747 weight0, weight1, offset0, offset1,
1749 hevc_vt_biwgt_8t_8w_msa(src0_ptr + 16, src_stride,
1750 src1_ptr + 16, src2_stride,
1751 dst + 16, dst_stride, filter, height,
1752 weight0, weight1, offset0, offset1, rnd_val);
/* 32-wide vertical bi-weighted MC: wrapper over the 16-column-multiple
 * worker (width argument elided in this excerpt — presumably 32). */
1755 static void hevc_vt_biwgt_8t_32w_msa(uint8_t *src0_ptr,
1758 int32_t src2_stride,
1761 const int8_t *filter,
1769 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1770 src1_ptr, src2_stride,
1771 dst, dst_stride, filter, height,
1772 weight0, weight1, offset0, offset1,
/* 48-wide vertical bi-weighted MC: wrapper over the 16-column-multiple
 * worker (width argument elided in this excerpt — presumably 48). */
1776 static void hevc_vt_biwgt_8t_48w_msa(uint8_t *src0_ptr,
1779 int32_t src2_stride,
1782 const int8_t *filter,
1790 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1791 src1_ptr, src2_stride,
1792 dst, dst_stride, filter, height,
1793 weight0, weight1, offset0, offset1,
/* 64-wide vertical bi-weighted MC: wrapper over the 16-column-multiple
 * worker (width argument elided in this excerpt — presumably 64). */
1797 static void hevc_vt_biwgt_8t_64w_msa(uint8_t *src0_ptr,
1800 int32_t src2_stride,
1803 const int8_t *filter,
1811 hevc_vt_biwgt_8t_16multx2mult_msa(src0_ptr, src_stride,
1812 src1_ptr, src2_stride,
1813 dst, dst_stride, filter, height,
1814 weight0, weight1, offset0, offset1,
/* 2-D (horizontal + vertical) 8-tap bi-weighted HEVC luma MC, 4-wide.
 * Horizontal pass: byte shuffles + dpadd produce 16-bit intermediates
 * covering two rows per vector (mask0 pairs two source rows).
 * Vertical pass: HEVC_FILT_8TAP over interleaved intermediates in
 * 32-bit precision. Unlike the 1-D kernels, the two weights are kept
 * in separate vectors: weight_vec0 applies to the src1_ptr residue
 * (via dpadd against duplicated halfwords), weight_vec1 multiplies
 * the filtered result. Two rows per loop iteration. */
1818 static void hevc_hv_biwgt_8t_4w_msa(uint8_t *src0_ptr,
1821 int32_t src2_stride,
1824 const int8_t *filter_x,
1825 const int8_t *filter_y,
1835 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1837 v8i16 filt0, filt1, filt2, filt3;
1838 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1839 v16i8 mask1, mask2, mask3;
1840 v8i16 filter_vec, const_vec;
1841 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1842 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1843 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1844 v4i32 dst0_r, dst1_r;
1846 v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
1847 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1848 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1849 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1850 v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* back up 3 rows and 3 columns for the 8-tap context in both axes */
1852 src0_ptr -= ((3 * src_stride) + 3);
1854 filter_vec = LD_SH(filter_x);
1855 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
1857 filter_vec = LD_SH(filter_y);
1858 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1859 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1861 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1867 offset = (offset0 + offset1) << rnd_val;
1868 weight0 = weight0 & 0x0000FFFF;
1870 const_vec = __msa_ldi_h(128);
1872 offset_vec = __msa_fill_w(offset);
1873 weight_vec0 = __msa_fill_w(weight0);
1874 weight_vec1 = __msa_fill_w(weight1);
1875 rnd_vec = __msa_fill_w(rnd_val + 1);
/* prologue: horizontal-filter the 7 context rows, two rows per vector
 * (e.g. dst30 holds rows 3 and 0) */
1877 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1878 src0_ptr += (7 * src_stride);
1880 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1882 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1883 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1884 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1885 vec8, vec9, vec10, vec11);
1886 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1887 vec12, vec13, vec14, vec15);
1890 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1891 dst30, dst30, dst30, dst30);
1893 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1894 dst41, dst41, dst41, dst41);
1896 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1897 dst52, dst52, dst52, dst52);
1899 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1900 dst63, dst63, dst63, dst63);
/* build the row-pair interleaves the vertical filter consumes */
1902 ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1903 dst10_r, dst21_r, dst32_r);
1904 dst43_r = __msa_ilvl_h(dst41, dst30);
1905 dst54_r = __msa_ilvl_h(dst52, dst41);
1906 dst65_r = __msa_ilvl_h(dst63, dst52);
/* dst66: duplicate row 6 intermediate for the next row-pair join */
1907 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1909 for (loop_cnt = height >> 1; loop_cnt--;) {
1910 LD_SB2(src0_ptr, src_stride, src7, src8);
1911 src0_ptr += (2 * src_stride);
1912 LD_SH2(src1_ptr, src2_stride, in0, in1);
1913 src1_ptr += (2 * src2_stride);
/* merge both rows' 4-wide residues into one vector */
1915 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
1916 XORI_B2_128_SB(src7, src8);
/* horizontal pass for the two new rows (7 and 8) in one shot */
1918 VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1919 vec0, vec1, vec2, vec3);
1921 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1922 dst87, dst87, dst87, dst87);
1923 dst76_r = __msa_ilvr_h(dst87, dst66);
1924 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1925 filt_h0, filt_h1, filt_h2, filt_h3);
1926 dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
1927 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1928 filt_h0, filt_h1, filt_h2, filt_h3);
/* residue * weight0 + offset (dpadd on duplicated halves), then add
 * the 2-D filtered term * weight1; round, clip, pack and store 4x2 */
1933 ILVRL_H2_SW(in0, in0, tmp1, tmp2);
1934 tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1, (v8i16) weight_vec0);
1935 tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2, (v8i16) weight_vec0);
1936 tmp1 += dst0_r * weight_vec1;
1937 tmp2 += dst1_r * weight_vec1;
1938 SRAR_W2_SW(tmp1, tmp2, rnd_vec);
1939 tmp1 = CLIP_SW_0_255(tmp1);
1940 tmp2 = CLIP_SW_0_255(tmp2);
1942 HEVC_PCK_SW_SB2(tmp2, tmp1, tmp1);
1943 ST4x2_UB(tmp1, dst, dst_stride);
1944 dst += (2 * dst_stride);
/* carry the newest intermediate forward for the next iteration */
1952 dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
/* 2-D (horizontal + vertical) 8-tap bi-weighted HEVC luma MC for
 * widths that are a multiple of 8. Outer loop walks 8-column stripes
 * (width >> 3); inner loop emits 2 rows per iteration. Horizontal
 * pass produces one 16-bit intermediate per row; vertical pass runs
 * HEVC_FILT_8TAP on right/left interleaved halves in 32-bit precision.
 * Weights are applied separately: weight_vec0 on the src1_ptr residue,
 * weight_vec1 on the 2-D filtered value. Shared worker for the
 * 8/12/16/24/32/48/64 HV wrappers. */
1956 static void hevc_hv_biwgt_8t_8multx2mult_msa(uint8_t *src0_ptr,
1959 int32_t src2_stride,
1962 const int8_t *filter_x,
1963 const int8_t *filter_y,
1972 uint32_t loop_cnt, cnt;
1974 uint8_t *src0_ptr_tmp;
1975 int16_t *src1_ptr_tmp;
1977 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1979 v8i16 filt0, filt1, filt2, filt3;
1980 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1981 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1982 v16i8 mask1, mask2, mask3;
1983 v8i16 filter_vec, const_vec;
1984 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1985 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1986 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1987 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1988 v4i32 tmp0, tmp1, tmp2, tmp3;
1989 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1990 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1991 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1992 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1993 v4i32 weight_vec0, weight_vec1, offset_vec, rnd_vec;
/* back up 3 rows and 3 columns for 8-tap context in both axes */
1995 src0_ptr -= ((3 * src_stride) + 3);
1997 offset = (offset0 + offset1) << rnd_val;
1998 weight0 = weight0 & 0x0000FFFF;
2000 const_vec = __msa_ldi_h(128);
2002 offset_vec = __msa_fill_w(offset);
2003 weight_vec0 = __msa_fill_w(weight0);
2004 weight_vec1 = __msa_fill_w(weight1);
2005 rnd_vec = __msa_fill_w(rnd_val + 1);
2007 filter_vec = LD_SH(filter_x);
2008 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend the 8-bit y-filter taps to 16 bits before splatting */
2010 filter_vec = LD_SH(filter_y);
2011 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
2012 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
2014 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
2020 for (cnt = width >> 3; cnt--;) {
2021 src0_ptr_tmp = src0_ptr;
2022 src1_ptr_tmp = src1_ptr;
/* prologue: load and horizontally filter 7 context rows */
2025 LD_SB7(src0_ptr_tmp, src_stride,
2026 src0, src1, src2, src3, src4, src5, src6);
2027 src0_ptr_tmp += (7 * src_stride);
2029 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2031 /* row 0 row 1 row 2 row 3 */
2032 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
2033 vec0, vec1, vec2, vec3);
2034 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
2035 vec4, vec5, vec6, vec7);
2036 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
2037 vec8, vec9, vec10, vec11);
2038 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
2039 vec12, vec13, vec14, vec15);
2042 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
2043 dst0, dst0, dst0, dst0);
2045 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
2046 dst1, dst1, dst1, dst1);
2048 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
2049 dst2, dst2, dst2, dst2);
2051 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
2052 dst3, dst3, dst3, dst3);
2054 /* row 4 row 5 row 6 */
2055 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
2056 vec0, vec1, vec2, vec3);
2057 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
2058 vec4, vec5, vec6, vec7);
2059 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
2060 vec8, vec9, vec10, vec11);
2063 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
2064 dst4, dst4, dst4, dst4);
2066 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
2067 dst5, dst5, dst5, dst5);
2069 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
2070 dst6, dst6, dst6, dst6);
/* interleave the intermediates row-pair-wise for the vertical taps */
2072 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
2073 dst10_r, dst32_r, dst54_r, dst21_r);
2074 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2075 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
2076 dst10_l, dst32_l, dst54_l, dst21_l);
2077 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2079 for (loop_cnt = height >> 1; loop_cnt--;) {
2080 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
2081 XORI_B2_128_SB(src7, src8);
2082 src0_ptr_tmp += 2 * src_stride;
2084 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
2085 src1_ptr_tmp += (2 * src2_stride);
/* horizontal pass for new row 7, then vertical 8-tap for row 0 out */
2087 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2088 vec0, vec1, vec2, vec3);
2090 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
2091 dst7, dst7, dst7, dst7);
2093 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2094 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2095 filt_h0, filt_h1, filt_h2, filt_h3);
2096 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2097 filt_h0, filt_h1, filt_h2, filt_h3);
/* same for new row 8 / second output row */
2103 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2104 vec0, vec1, vec2, vec3);
2107 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
2108 dst8, dst8, dst8, dst8);
2110 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2111 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2112 filt_h0, filt_h1, filt_h2, filt_h3);
2113 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2114 filt_h0, filt_h1, filt_h2, filt_h3);
/* residue * weight0 + offset, plus filtered * weight1;
 * round, clip to [0,255], pack and store 8x2 */
2119 ILVRL_H2_SW(in0, in0, tmp0, tmp1);
2120 ILVRL_H2_SW(in1, in1, tmp2, tmp3);
2121 tmp0 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp0,
2122 (v8i16) weight_vec0);
2123 tmp1 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp1,
2124 (v8i16) weight_vec0);
2125 tmp2 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp2,
2126 (v8i16) weight_vec0);
2127 tmp3 = __msa_dpadd_s_w(offset_vec, (v8i16) tmp3,
2128 (v8i16) weight_vec0);
2130 tmp0 += (dst0_r * weight_vec1);
2131 tmp1 += (dst0_l * weight_vec1);
2132 tmp2 += (dst1_r * weight_vec1);
2133 tmp3 += (dst1_l * weight_vec1);
2135 SRAR_W4_SW(tmp0, tmp1, tmp2, tmp3, rnd_vec);
2136 tmp0 = CLIP_SW_0_255(tmp0);
2137 tmp1 = CLIP_SW_0_255(tmp1);
2138 tmp2 = CLIP_SW_0_255(tmp2);
2139 tmp3 = CLIP_SW_0_255(tmp3);
2140 HEVC_PCK_SW_SB4(tmp1, tmp0, tmp3, tmp2, dst0_r);
2141 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
2142 dst_tmp += (2 * dst_stride);
/* 8-wide 2-D bi-weighted MC: wrapper over the 8-column-multiple
 * worker with width = 8. */
2165 static void hevc_hv_biwgt_8t_8w_msa(uint8_t *src0_ptr,
2168 int32_t src2_stride,
2171 const int8_t *filter_x,
2172 const int8_t *filter_y,
2180 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2181 src1_ptr, src2_stride,
2182 dst, dst_stride, filter_x, filter_y,
2183 height, weight0, weight1, offset0,
2184 offset1, rnd_val, 8);
/* 12-wide 2-D bi-weighted MC: 8-wide worker for columns 0-7 plus the
 * 4w kernel for columns 8-11 (pointers advanced by 8). */
2187 static void hevc_hv_biwgt_8t_12w_msa(uint8_t *src0_ptr,
2190 int32_t src2_stride,
2193 const int8_t *filter_x,
2194 const int8_t *filter_y,
2202 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2203 src1_ptr, src2_stride,
2204 dst, dst_stride, filter_x, filter_y,
2205 height, weight0, weight1, offset0,
2206 offset1, rnd_val, 8);
2207 hevc_hv_biwgt_8t_4w_msa(src0_ptr + 8, src_stride,
2208 src1_ptr + 8, src2_stride,
2209 dst + 8, dst_stride, filter_x, filter_y,
2210 height, weight0, weight1, offset0, offset1,
/* 16-wide 2-D bi-weighted MC: wrapper over the 8-column-multiple
 * worker with width = 16. */
2214 static void hevc_hv_biwgt_8t_16w_msa(uint8_t *src0_ptr,
2217 int32_t src2_stride,
2220 const int8_t *filter_x,
2221 const int8_t *filter_y,
2229 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2230 src1_ptr, src2_stride,
2231 dst, dst_stride, filter_x, filter_y,
2232 height, weight0, weight1, offset0,
2233 offset1, rnd_val, 16);
/* 24-wide 2-D bi-weighted MC: wrapper over the 8-column-multiple
 * worker with width = 24. */
2236 static void hevc_hv_biwgt_8t_24w_msa(uint8_t *src0_ptr,
2239 int32_t src2_stride,
2242 const int8_t *filter_x,
2243 const int8_t *filter_y,
2251 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2252 src1_ptr, src2_stride,
2253 dst, dst_stride, filter_x, filter_y,
2254 height, weight0, weight1, offset0,
2255 offset1, rnd_val, 24);
/* 32-wide 2-D bi-weighted MC: wrapper over the 8-column-multiple
 * worker with width = 32. */
2258 static void hevc_hv_biwgt_8t_32w_msa(uint8_t *src0_ptr,
2261 int32_t src2_stride,
2264 const int8_t *filter_x,
2265 const int8_t *filter_y,
2273 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2274 src1_ptr, src2_stride,
2275 dst, dst_stride, filter_x, filter_y,
2276 height, weight0, weight1, offset0,
2277 offset1, rnd_val, 32);
/* 48-wide 2-D bi-weighted MC: wrapper over the 8-column-multiple
 * worker with width = 48. */
2280 static void hevc_hv_biwgt_8t_48w_msa(uint8_t *src0_ptr,
2283 int32_t src2_stride,
2286 const int8_t *filter_x,
2287 const int8_t *filter_y,
2295 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2296 src1_ptr, src2_stride,
2297 dst, dst_stride, filter_x, filter_y,
2298 height, weight0, weight1, offset0,
2299 offset1, rnd_val, 48);
/* 64-wide 2-D bi-weighted MC: wrapper over the 8-column-multiple
 * worker with width = 64. */
2302 static void hevc_hv_biwgt_8t_64w_msa(uint8_t *src0_ptr,
2305 int32_t src2_stride,
2308 const int8_t *filter_x,
2309 const int8_t *filter_y,
2317 hevc_hv_biwgt_8t_8multx2mult_msa(src0_ptr, src_stride,
2318 src1_ptr, src2_stride,
2319 dst, dst_stride, filter_x, filter_y,
2320 height, weight0, weight1, offset0,
2321 offset1, rnd_val, 64);
/* Horizontal 4-tap bi-weighted HEVC chroma MC, fixed 4x2 block.
 * mask0 pairs the two source rows into one shuffle so a single pair
 * of dot products filters both rows; the result is blended with the
 * src1_ptr residue via the packed (weight0|weight1<<16) dpadd,
 * rounded, clipped to [0,255] and stored as 4x2. */
2324 static void hevc_hz_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
2327 int32_t src2_stride,
2330 const int8_t *filter,
2338 int32_t offset, weight;
2342 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2343 v16i8 mask1, vec0, vec1;
2345 v4i32 dst0_r, dst0_l;
2346 v8i16 filter_vec, const_vec;
2347 v4i32 weight_vec, offset_vec, rnd_vec;
2351 filter_vec = LD_SH(filter);
2352 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2356 offset = (offset0 + offset1) << rnd_val;
2357 weight0 = weight0 & 0x0000FFFF;
/* pack both weights into one 32-bit lane for dpadd_s_w */
2358 weight = weight0 | (weight1 << 16);
2360 const_vec = __msa_ldi_h(128);
2362 offset_vec = __msa_fill_w(offset);
2363 weight_vec = __msa_fill_w(weight);
2364 rnd_vec = __msa_fill_w(rnd_val + 1);
2366 LD_SB2(src0_ptr, src_stride, src0, src1);
2367 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* merge both rows' 4-wide residues into one vector */
2368 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2369 XORI_B2_128_SB(src0, src1);
2371 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2373 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2375 ILVRL_H2_SW(dst0, in0, dst0_r, dst0_l);
2376 dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
2377 dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
2378 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2379 dst0_r = CLIP_SW_0_255(dst0_r);
2380 dst0_l = CLIP_SW_0_255(dst0_l);
2382 HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
2383 ST4x2_UB(dst0_r, dst, dst_stride);
/* Horizontal 4-tap bi-weighted HEVC chroma MC, fixed 4x4 block.
 * Two row-pairs, each filtered through one shuffle + dot-product
 * sequence, blended with src1_ptr residues, rounded, clipped and
 * stored as four 4-pixel rows. */
2386 static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
2389 int32_t src2_stride,
2392 const int8_t *filter,
2400 int32_t offset, weight;
2402 v16i8 src0, src1, src2, src3;
2403 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2407 v8i16 in0, in1, in2, in3;
2408 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
2409 v8i16 filter_vec, const_vec;
2410 v4i32 weight_vec, offset_vec, rnd_vec;
2414 /* rearranging filter */
2415 filter_vec = LD_SH(filter);
2416 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2420 offset = (offset0 + offset1) << rnd_val;
2421 weight0 = weight0 & 0x0000FFFF;
/* pack both weights into one 32-bit lane for dpadd_s_w */
2422 weight = weight0 | (weight1 << 16);
2424 const_vec = __msa_ldi_h(128);
2426 offset_vec = __msa_fill_w(offset);
2427 weight_vec = __msa_fill_w(weight);
2428 rnd_vec = __msa_fill_w(rnd_val + 1);
2430 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2431 XORI_B4_128_SB(src0, src1, src2, src3);
2432 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* merge residue rows pairwise to match the paired filter outputs */
2433 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2435 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2437 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2438 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2440 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2441 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2442 weight_vec, rnd_vec, offset_vec,
2443 dst0_r, dst1_r, dst0_l, dst1_l);
2445 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2446 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal 4-tap bi-weighted HEVC chroma MC, 4-wide, height a
 * multiple of 8. Per iteration: 8 source rows in 4 row-pairs, each
 * pair filtered with one shuffle + dot-product, blended with the
 * src1_ptr residues, rounded, clipped and stored as 4x8. */
2449 static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
2452 int32_t src2_stride,
2455 const int8_t *filter,
2464 int32_t weight, offset;
2466 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2467 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2470 v8i16 dst0, dst1, dst2, dst3;
2471 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2472 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2473 v8i16 filter_vec, const_vec;
2474 v4i32 weight_vec, offset_vec, rnd_vec;
2478 filter_vec = LD_SH(filter);
2479 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2481 offset = (offset0 + offset1) << rnd_val;
2482 weight0 = weight0 & 0x0000FFFF;
/* pack both weights into one 32-bit lane for dpadd_s_w */
2483 weight = weight0 | (weight1 << 16);
2485 const_vec = __msa_ldi_h(128);
2487 offset_vec = __msa_fill_w(offset);
2488 weight_vec = __msa_fill_w(weight);
2489 rnd_vec = __msa_fill_w(rnd_val + 1);
2493 for (loop_cnt = (height >> 3); loop_cnt--;) {
2494 LD_SB8(src0_ptr, src_stride,
2495 src0, src1, src2, src3, src4, src5, src6, src7);
2496 src0_ptr += (8 * src_stride);
2497 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2498 src1_ptr += (4 * src2_stride);
2499 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2500 src1_ptr += (4 * src2_stride);
/* merge residue rows pairwise to match the paired filter outputs */
2501 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2502 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2503 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2505 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2507 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2508 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2510 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2511 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2513 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2514 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2516 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2517 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2519 weight_vec, rnd_vec, offset_vec,
2520 dst0_r, dst1_r, dst2_r, dst3_r,
2521 dst0_l, dst1_l, dst2_l, dst3_l);
2523 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2524 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2525 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
2526 dst += (8 * dst_stride);
/* 4-wide horizontal 4-tap bi-weighted MC dispatcher: selects the
 * fixed 4x2 / 4x4 kernels or the 8-row-multiple loop by height
 * (the initial `if (2 == height)` test is elided in this excerpt). */
2530 static void hevc_hz_biwgt_4t_4w_msa(uint8_t *src0_ptr,
2533 int32_t src2_stride,
2536 const int8_t *filter,
2545 hevc_hz_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2546 dst, dst_stride, filter, height,
2547 weight0, weight1, offset0, offset1, rnd_val);
2548 } else if (4 == height) {
2549 hevc_hz_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2550 dst, dst_stride, filter, height,
2551 weight0, weight1, offset0, offset1, rnd_val);
2552 } else if (0 == (height % 8)) {
2553 hevc_hz_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
2554 src1_ptr, src2_stride,
2555 dst, dst_stride, filter, height,
2556 weight0, weight1, offset0, offset1,
2561 static void hevc_hz_biwgt_4t_6w_msa(uint8_t *src0_ptr,
2564 int32_t src2_stride,
2567 const int8_t *filter,
2576 int32_t offset, weight;
2578 v16i8 src0, src1, src2, src3;
2579 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2582 v8i16 in0, in1, in2, in3;
2583 v8i16 dst0, dst1, dst2, dst3;
2584 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2585 v8i16 filter_vec, const_vec;
2586 v4i32 weight_vec, offset_vec, rnd_vec;
2590 filter_vec = LD_SH(filter);
2591 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2593 offset = (offset0 + offset1) << rnd_val;
2594 weight0 = weight0 & 0x0000FFFF;
2595 weight = weight0 | (weight1 << 16);
2597 const_vec = __msa_ldi_h(128);
2599 offset_vec = __msa_fill_w(offset);
2600 weight_vec = __msa_fill_w(weight);
2601 rnd_vec = __msa_fill_w(rnd_val + 1);
2605 for (loop_cnt = (height >> 2); loop_cnt--;) {
2606 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2607 src0_ptr += (4 * src_stride);
2608 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2609 src1_ptr += (4 * src2_stride);
2610 XORI_B4_128_SB(src0, src1, src2, src3);
2612 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2614 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2615 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2617 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2618 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2620 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2621 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2623 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2625 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2627 weight_vec, rnd_vec, offset_vec,
2628 dst0_r, dst1_r, dst2_r, dst3_r,
2629 dst0_l, dst1_l, dst2_l, dst3_l);
2631 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2632 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2633 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
2634 dst += (4 * dst_stride);
2638 static void hevc_hz_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
2641 int32_t src2_stride,
2644 const int8_t *filter,
2652 int32_t offset, weight;
2656 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2657 v16i8 mask1, vec0, vec1;
2659 v8i16 filter_vec, const_vec;
2660 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
2661 v4i32 weight_vec, offset_vec, rnd_vec;
2665 filter_vec = LD_SH(filter);
2666 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2668 offset = (offset0 + offset1) << rnd_val;
2669 weight0 = weight0 & 0x0000FFFF;
2670 weight = weight0 | (weight1 << 16);
2672 const_vec = __msa_ldi_h(128);
2674 offset_vec = __msa_fill_w(offset);
2675 weight_vec = __msa_fill_w(weight);
2676 rnd_vec = __msa_fill_w(rnd_val + 1);
2680 LD_SB2(src0_ptr, src_stride, src0, src1);
2681 LD_SH2(src1_ptr, src2_stride, in0, in1);
2682 XORI_B2_128_SB(src0, src1);
2683 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2685 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2686 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2688 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2689 HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1,
2690 weight_vec, rnd_vec, offset_vec,
2691 dst0_r, dst1_r, dst0_l, dst1_l);
2693 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2694 ST8x2_UB(dst0_r, dst, dst_stride);
2697 static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
2700 int32_t src2_stride,
2703 const int8_t *filter,
2711 int32_t weight, offset;
2713 v16i8 src0, src1, src2, src3, src4, src5;
2714 v8i16 in0, in1, in2, in3, in4, in5;
2715 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2718 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2719 v8i16 filter_vec, const_vec;
2720 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2721 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
2722 v4i32 weight_vec, offset_vec, rnd_vec;
2726 filter_vec = LD_SH(filter);
2727 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2729 offset = (offset0 + offset1) << rnd_val;
2730 weight0 = weight0 & 0x0000FFFF;
2731 weight = weight0 | (weight1 << 16);
2733 const_vec = __msa_ldi_h(128);
2735 offset_vec = __msa_fill_w(offset);
2736 weight_vec = __msa_fill_w(weight);
2737 rnd_vec = __msa_fill_w(rnd_val + 1);
2741 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2743 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2744 src1_ptr += (4 * src2_stride);
2745 LD_SH2(src1_ptr, src2_stride, in4, in5);
2746 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2747 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2749 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2750 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2752 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2753 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2755 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2756 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2758 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2759 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2761 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2762 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2764 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2765 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2767 weight_vec, rnd_vec, offset_vec,
2768 dst0_r, dst1_r, dst2_r, dst3_r,
2769 dst0_l, dst1_l, dst2_l, dst3_l);
2770 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
2771 weight_vec, rnd_vec, offset_vec,
2772 dst4_r, dst5_r, dst4_l, dst5_l);
2774 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
2775 dst2_l, dst2_r, dst3_l, dst3_r,
2776 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
2777 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
2778 dst += (4 * dst_stride);
2779 ST8x2_UB(dst2_r, dst, dst_stride);
2782 static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
2785 int32_t src2_stride,
2788 const int8_t *filter,
2797 int32_t offset, weight;
2799 v16i8 src0, src1, src2, src3;
2800 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2803 v8i16 in0, in1, in2, in3;
2804 v8i16 dst0, dst1, dst2, dst3;
2805 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2806 v8i16 filter_vec, const_vec;
2807 v4i32 weight_vec, offset_vec, rnd_vec;
2811 filter_vec = LD_SH(filter);
2812 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2814 offset = (offset0 + offset1) << rnd_val;
2815 weight0 = weight0 & 0x0000FFFF;
2816 weight = weight0 | (weight1 << 16);
2818 const_vec = __msa_ldi_h(128);
2820 offset_vec = __msa_fill_w(offset);
2821 weight_vec = __msa_fill_w(weight);
2822 rnd_vec = __msa_fill_w(rnd_val + 1);
2826 for (loop_cnt = (height >> 2); loop_cnt--;) {
2827 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2828 src0_ptr += (4 * src_stride);
2829 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2830 src1_ptr += (4 * src2_stride);
2831 XORI_B4_128_SB(src0, src1, src2, src3);
2833 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2835 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2836 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2838 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2839 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2841 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2842 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2844 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2845 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2847 weight_vec, rnd_vec, offset_vec,
2848 dst0_r, dst1_r, dst2_r, dst3_r,
2849 dst0_l, dst1_l, dst2_l, dst3_l);
2851 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2852 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2853 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
2854 dst += (4 * dst_stride);
2858 static void hevc_hz_biwgt_4t_8w_msa(uint8_t *src0_ptr,
2861 int32_t src2_stride,
2864 const int8_t *filter,
2873 hevc_hz_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2874 dst, dst_stride, filter, height,
2875 weight0, weight1, offset0, offset1, rnd_val);
2876 } else if (6 == height) {
2877 hevc_hz_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2878 dst, dst_stride, filter, height,
2879 weight0, weight1, offset0, offset1, rnd_val);
2880 } else if (0 == (height % 4)) {
2881 hevc_hz_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
2882 src1_ptr, src2_stride,
2883 dst, dst_stride, filter, height,
2884 weight0, weight1, offset0, offset1,
2889 static void hevc_hz_biwgt_4t_12w_msa(uint8_t *src0_ptr,
2892 int32_t src2_stride,
2895 const int8_t *filter,
2904 int32_t offset, weight;
2906 v16i8 src0, src1, src2, src3;
2907 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2908 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2910 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2914 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2915 v8i16 filter_vec, const_vec;
2916 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2917 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
2918 v4i32 weight_vec, offset_vec, rnd_vec;
2922 filter_vec = LD_SH(filter);
2923 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2925 offset = (offset0 + offset1) << rnd_val;
2926 weight0 = weight0 & 0x0000FFFF;
2927 weight = weight0 | (weight1 << 16);
2929 const_vec = __msa_ldi_h(128);
2931 offset_vec = __msa_fill_w(offset);
2932 weight_vec = __msa_fill_w(weight);
2933 rnd_vec = __msa_fill_w(rnd_val + 1);
2938 for (loop_cnt = (height >> 2); loop_cnt--;) {
2939 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2940 src0_ptr += (4 * src_stride);
2941 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2942 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2943 src1_ptr += (4 * src2_stride);
2944 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
2945 XORI_B4_128_SB(src0, src1, src2, src3);
2947 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2949 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2950 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2952 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2953 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2955 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2956 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2958 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2959 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2961 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2962 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2964 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2966 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2968 weight_vec, rnd_vec, offset_vec,
2969 dst0_r, dst1_r, dst2_r, dst3_r,
2970 dst0_l, dst1_l, dst2_l, dst3_l);
2971 HEVC_BIW_RND_CLIP2(dst4, dst5, in4, in5,
2972 weight_vec, rnd_vec, offset_vec,
2973 dst4_r, dst5_r, dst4_l, dst5_l);
2975 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
2976 dst2_l, dst2_r, dst3_l, dst3_r,
2977 dst4_l, dst4_r, dst5_l, dst5_r,
2978 dst0_r, dst1_r, dst2_r);
2979 ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
2980 dst += (4 * dst_stride);
2984 static void hevc_hz_biwgt_4t_16w_msa(uint8_t *src0_ptr,
2987 int32_t src2_stride,
2990 const int8_t *filter,
2999 int32_t offset, weight;
3000 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
3001 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3003 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3005 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3007 v8i16 filter_vec, const_vec;
3008 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3009 v4i32 weight_vec, offset_vec, rnd_vec;
3013 filter_vec = LD_SH(filter);
3014 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3016 offset = (offset0 + offset1) << rnd_val;
3017 weight0 = weight0 & 0x0000FFFF;
3018 weight = weight0 | (weight1 << 16);
3020 const_vec = __msa_ldi_h(128);
3022 offset_vec = __msa_fill_w(offset);
3023 weight_vec = __msa_fill_w(weight);
3024 rnd_vec = __msa_fill_w(rnd_val + 1);
3028 for (loop_cnt = (height >> 2); loop_cnt--;) {
3029 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
3030 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
3031 src0_ptr += (4 * src_stride);
3032 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
3033 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
3034 src1_ptr += (4 * src2_stride);
3035 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
3037 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3039 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3040 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3042 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
3043 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3045 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3046 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3048 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3049 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3051 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3052 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3054 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3055 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3057 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3058 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3060 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3061 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3063 weight_vec, rnd_vec, offset_vec,
3064 dst0_r, dst1_r, dst2_r, dst3_r,
3065 dst0_l, dst1_l, dst2_l, dst3_l);
3067 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3068 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3069 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3070 dst += (2 * dst_stride);
3072 HEVC_BIW_RND_CLIP4(dst4, dst5, dst6, dst7,
3074 weight_vec, rnd_vec, offset_vec,
3075 dst0_r, dst1_r, dst2_r, dst3_r,
3076 dst0_l, dst1_l, dst2_l, dst3_l);
3078 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3079 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3080 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3081 dst += (2 * dst_stride);
3085 static void hevc_hz_biwgt_4t_24w_msa(uint8_t *src0_ptr,
3088 int32_t src2_stride,
3091 const int8_t *filter,
3100 int32_t offset, weight;
3101 uint8_t *dst_tmp = dst + 16;
3102 v16i8 src0, src1, src2, src3;
3104 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3105 v16i8 mask1, mask2, mask3;
3107 v8i16 dst0, dst1, dst2, dst3;
3108 v8i16 in0, in1, in2, in3, in4, in5;
3109 v8i16 filter_vec, const_vec;
3110 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3111 v4i32 weight_vec, offset_vec, rnd_vec;
3115 filter_vec = LD_SH(filter);
3116 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3118 offset = (offset0 + offset1) << rnd_val;
3119 weight0 = weight0 & 0x0000FFFF;
3120 weight = weight0 | (weight1 << 16);
3122 const_vec = __msa_ldi_h(128);
3124 offset_vec = __msa_fill_w(offset);
3125 weight_vec = __msa_fill_w(weight);
3126 rnd_vec = __msa_fill_w(rnd_val + 1);
3132 for (loop_cnt = (height >> 1); loop_cnt--;) {
3133 LD_SB2(src0_ptr, src_stride, src0, src2);
3134 LD_SB2(src0_ptr + 16, src_stride, src1, src3);
3135 src0_ptr += (2 * src_stride);
3136 LD_SH2(src1_ptr, src2_stride, in0, in2);
3137 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
3138 LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
3139 src1_ptr += (2 * src2_stride);
3140 XORI_B4_128_SB(src0, src1, src2, src3);
3142 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3144 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3145 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3147 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
3148 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3150 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3151 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
3153 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3154 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3156 weight_vec, rnd_vec, offset_vec,
3157 dst0_r, dst1_r, dst2_r, dst3_r,
3158 dst0_l, dst1_l, dst2_l, dst3_l);
3160 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3161 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3162 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3163 dst += (2 * dst_stride);
3165 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3167 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3168 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3170 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
3171 HEVC_BIW_RND_CLIP2(dst0, dst1, in4, in5,
3172 weight_vec, rnd_vec, offset_vec,
3173 dst0_r, dst1_r, dst0_l, dst1_l);
3175 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
3176 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
3177 dst_tmp += (2 * dst_stride);
3181 static void hevc_hz_biwgt_4t_32w_msa(uint8_t *src0_ptr,
3184 int32_t src2_stride,
3187 const int8_t *filter,
3196 int32_t offset, weight;
3197 v16i8 src0, src1, src2;
3199 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3200 v16i8 mask1, mask2, mask3;
3201 v8i16 dst0, dst1, dst2, dst3;
3203 v8i16 in0, in1, in2, in3;
3204 v8i16 filter_vec, const_vec;
3205 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3206 v4i32 weight_vec, offset_vec, rnd_vec;
3210 filter_vec = LD_SH(filter);
3211 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3213 offset = (offset0 + offset1) << rnd_val;
3214 weight0 = weight0 & 0x0000FFFF;
3215 weight = weight0 | (weight1 << 16);
3217 const_vec = __msa_ldi_h(128);
3219 offset_vec = __msa_fill_w(offset);
3220 weight_vec = __msa_fill_w(weight);
3221 rnd_vec = __msa_fill_w(rnd_val + 1);
3227 for (loop_cnt = height; loop_cnt--;) {
3228 LD_SB2(src0_ptr, 16, src0, src1);
3229 src2 = LD_SB(src0_ptr + 24);
3230 src0_ptr += src_stride;
3231 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
3232 src1_ptr += src2_stride;
3233 XORI_B3_128_SB(src0, src1, src2);
3235 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3237 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3238 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3240 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
3241 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3243 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3244 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3246 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3247 HEVC_BIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3249 weight_vec, rnd_vec, offset_vec,
3250 dst0_r, dst1_r, dst2_r, dst3_r,
3251 dst0_l, dst1_l, dst2_l, dst3_l);
3253 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3254 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3255 ST_SW2(dst0_r, dst1_r, dst, 16);
3260 static void hevc_vt_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
3263 int32_t src2_stride,
3266 const int8_t *filter,
3274 int32_t weight, offset;
3275 v16i8 src0, src1, src2, src3, src4;
3276 v8i16 in0, in1, dst10;
3277 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
3278 v4i32 dst10_r, dst10_l;
3280 v8i16 filter_vec, const_vec;
3281 v4i32 weight_vec, offset_vec, rnd_vec;
3283 src0_ptr -= src_stride;
3285 offset = (offset0 + offset1) << rnd_val;
3286 weight0 = weight0 & 0x0000FFFF;
3287 weight = weight0 | (weight1 << 16);
3289 const_vec = __msa_ldi_h(128);
3291 offset_vec = __msa_fill_w(offset);
3292 weight_vec = __msa_fill_w(weight);
3293 rnd_vec = __msa_fill_w(rnd_val + 1);
3295 filter_vec = LD_SH(filter);
3296 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3298 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3299 src0_ptr += (3 * src_stride);
3300 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3301 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3302 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3303 LD_SB2(src0_ptr, src_stride, src3, src4);
3304 src0_ptr += (2 * src_stride);
3305 LD_SH2(src1_ptr, src2_stride, in0, in1);
3306 src1_ptr += (2 * src2_stride);
3308 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3309 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3310 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3311 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3314 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3316 ILVRL_H2_SW(dst10, in0, dst10_r, dst10_l);
3317 dst10_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_r, (v8i16) weight_vec);
3318 dst10_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst10_l, (v8i16) weight_vec);
3319 SRAR_W2_SW(dst10_r, dst10_l, rnd_vec);
3320 dst10_r = CLIP_SW_0_255(dst10_r);
3321 dst10_l = CLIP_SW_0_255(dst10_l);
3323 HEVC_PCK_SW_SB2(dst10_l, dst10_r, dst10_r);
3324 ST4x2_UB(dst10_r, dst, dst_stride);
3327 static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
3330 int32_t src2_stride,
3333 const int8_t *filter,
3341 int32_t weight, offset;
3342 v16i8 src0, src1, src2, src3, src4, src5, src6;
3343 v8i16 in0, in1, in2, in3;
3344 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3345 v16i8 src2110, src4332, src6554;
3347 v4i32 dst10_r, dst32_r, dst10_l, dst32_l;
3349 v8i16 filter_vec, const_vec;
3350 v4i32 weight_vec, offset_vec, rnd_vec;
3352 src0_ptr -= src_stride;
3354 offset = (offset0 + offset1) << rnd_val;
3355 weight0 = weight0 & 0x0000FFFF;
3356 weight = weight0 | (weight1 << 16);
3358 const_vec = __msa_ldi_h(128);
3360 offset_vec = __msa_fill_w(offset);
3361 weight_vec = __msa_fill_w(weight);
3362 rnd_vec = __msa_fill_w(rnd_val + 1);
3364 filter_vec = LD_SH(filter);
3365 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3367 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3368 src0_ptr += (3 * src_stride);
3369 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3370 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3371 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3373 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3374 src0_ptr += (4 * src_stride);
3375 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3376 src1_ptr += (4 * src2_stride);
3377 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3378 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3379 src32_r, src43_r, src54_r, src65_r);
3380 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3381 XORI_B2_128_SB(src4332, src6554);
3384 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3386 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3388 HEVC_BIW_RND_CLIP2(dst10, dst32, in0, in1,
3389 weight_vec, rnd_vec, offset_vec,
3390 dst10_r, dst32_r, dst10_l, dst32_l);
3392 HEVC_PCK_SW_SB4(dst10_l, dst10_r, dst32_l, dst32_r, dst10_r);
3393 ST4x4_UB(dst10_r, dst10_r, 0, 1, 2, 3, dst, dst_stride);
3394 dst += (4 * dst_stride);
3397 static void hevc_vt_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
3400 int32_t src2_stride,
3403 const int8_t *filter,
3412 int32_t weight, offset;
3413 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3414 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3415 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3416 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3417 v16i8 src2110, src4332, src6554, src8776;
3418 v8i16 dst10, dst32, dst54, dst76;
3419 v4i32 dst10_r, dst32_r, dst54_r, dst76_r;
3420 v4i32 dst10_l, dst32_l, dst54_l, dst76_l;
3422 v8i16 filter_vec, const_vec;
3423 v4i32 weight_vec, offset_vec, rnd_vec;
3425 src0_ptr -= src_stride;
3427 offset = (offset0 + offset1) << rnd_val;
3428 weight0 = weight0 & 0x0000FFFF;
3429 weight = weight0 | (weight1 << 16);
3431 const_vec = __msa_ldi_h(128);
3433 offset_vec = __msa_fill_w(offset);
3434 weight_vec = __msa_fill_w(weight);
3435 rnd_vec = __msa_fill_w(rnd_val + 1);
3437 filter_vec = LD_SH(filter);
3438 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3440 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3441 src0_ptr += (3 * src_stride);
3442 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3443 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3444 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3446 for (loop_cnt = (height >> 3); loop_cnt--;) {
3447 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3448 src0_ptr += (6 * src_stride);
3449 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3450 src1_ptr += (8 * src2_stride);
3452 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3453 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3455 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3456 src32_r, src43_r, src54_r, src65_r);
3457 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3458 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3459 src4332, src6554, src8776);
3460 XORI_B3_128_SB(src4332, src6554, src8776);
3463 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3465 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3467 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
3469 LD_SB2(src0_ptr, src_stride, src9, src2);
3470 src0_ptr += (2 * src_stride);
3471 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3472 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3473 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3476 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3477 HEVC_BIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3479 weight_vec, rnd_vec, offset_vec,
3480 dst10_r, dst32_r, dst54_r, dst76_r,
3481 dst10_l, dst32_l, dst54_l, dst76_l);
3483 HEVC_PCK_SW_SB8(dst10_l, dst10_r, dst32_l, dst32_r,
3484 dst54_l, dst54_r, dst76_l, dst76_r, dst10_r, dst54_r);
3485 ST4x8_UB(dst10_r, dst54_r, dst, dst_stride);
3486 dst += (8 * dst_stride);
3490 static void hevc_vt_biwgt_4t_4w_msa(uint8_t *src0_ptr,
3493 int32_t src2_stride,
3496 const int8_t *filter,
3505 hevc_vt_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3506 dst, dst_stride, filter, height,
3507 weight0, weight1, offset0, offset1, rnd_val);
3508 } else if (4 == height) {
3509 hevc_vt_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3510 dst, dst_stride, filter, height,
3511 weight0, weight1, offset0, offset1, rnd_val);
3512 } else if (0 == (height % 8)) {
3513 hevc_vt_biwgt_4t_4x8multiple_msa(src0_ptr, src_stride,
3514 src1_ptr, src2_stride,
3515 dst, dst_stride, filter, height,
3516 weight0, weight1, offset0, offset1,
3521 static void hevc_vt_biwgt_4t_6w_msa(uint8_t *src0_ptr,
3524 int32_t src2_stride,
3527 const int8_t *filter,
3536 int32_t offset, weight;
3537 v16i8 src0, src1, src2, src3, src4;
3538 v8i16 in0, in1, in2, in3;
3539 v16i8 src10_r, src32_r, src21_r, src43_r;
3540 v8i16 tmp0, tmp1, tmp2, tmp3;
3542 v8i16 filter_vec, const_vec;
3543 v4i32 weight_vec, offset_vec, rnd_vec;
3544 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3546 src0_ptr -= src_stride;
3548 offset = (offset0 + offset1) << rnd_val;
3549 weight0 = weight0 & 0x0000FFFF;
3550 weight = weight0 | (weight1 << 16);
3552 const_vec = __msa_ldi_h(128);
3554 offset_vec = __msa_fill_w(offset);
3555 weight_vec = __msa_fill_w(weight);
3556 rnd_vec = __msa_fill_w(rnd_val + 1);
3558 filter_vec = LD_SH(filter);
3559 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3561 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3562 src0_ptr += (3 * src_stride);
3563 XORI_B3_128_SB(src0, src1, src2);
3564 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3566 for (loop_cnt = (height >> 2); loop_cnt--;) {
3567 LD_SB2(src0_ptr, src_stride, src3, src4);
3568 src0_ptr += (2 * src_stride);
3569 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3570 src1_ptr += (4 * src2_stride);
3571 XORI_B2_128_SB(src3, src4);
3572 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3575 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3577 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3579 LD_SB2(src0_ptr, src_stride, src1, src2);
3580 src0_ptr += (2 * src_stride);
3581 XORI_B2_128_SB(src1, src2);
3582 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3585 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3587 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3588 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3590 weight_vec, rnd_vec, offset_vec,
3591 dst0_r, dst1_r, dst2_r, dst3_r,
3592 dst0_l, dst1_l, dst2_l, dst3_l);
3594 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3595 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3596 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3597 dst += (4 * dst_stride);
3601 static void hevc_vt_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
3604 int32_t src2_stride,
3607 const int8_t *filter,
3615 int32_t offset, weight;
3616 v16i8 src0, src1, src2, src3, src4;
3617 v8i16 in0, in1, tmp0, tmp1;
3618 v16i8 src10_r, src32_r, src21_r, src43_r;
3620 v8i16 filter_vec, const_vec;
3621 v4i32 weight_vec, offset_vec, rnd_vec;
3622 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3624 src0_ptr -= src_stride;
3626 offset = (offset0 + offset1) << rnd_val;
3627 weight0 = weight0 & 0x0000FFFF;
3628 weight = weight0 | (weight1 << 16);
3630 const_vec = __msa_ldi_h(128);
3632 offset_vec = __msa_fill_w(offset);
3633 weight_vec = __msa_fill_w(weight);
3634 rnd_vec = __msa_fill_w(rnd_val + 1);
3636 filter_vec = LD_SH(filter);
3637 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3639 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3640 src0_ptr += (3 * src_stride);
3641 XORI_B3_128_SB(src0, src1, src2);
3642 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3644 LD_SB2(src0_ptr, src_stride, src3, src4);
3645 LD_SH2(src1_ptr, src2_stride, in0, in1);
3646 XORI_B2_128_SB(src3, src4);
3647 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3650 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3652 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3653 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
3654 weight_vec, rnd_vec, offset_vec,
3655 dst0_r, dst1_r, dst0_l, dst1_l);
3657 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
3658 ST8x2_UB(dst0_r, dst, dst_stride);
3661 static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
3664 int32_t src2_stride,
3667 const int8_t *filter,
3675 int32_t offset, weight;
3676 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3677 v8i16 in0, in1, in2, in3, in4, in5;
3678 v16i8 src10_r, src32_r, src54_r, src76_r;
3679 v16i8 src21_r, src43_r, src65_r, src87_r;
3680 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3682 v8i16 filter_vec, const_vec;
3683 v4i32 weight_vec, offset_vec, rnd_vec;
3684 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3685 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3687 src0_ptr -= src_stride;
3689 offset = (offset0 + offset1) << rnd_val;
3690 weight0 = weight0 & 0x0000FFFF;
3691 weight = weight0 | (weight1 << 16);
3693 const_vec = __msa_ldi_h(128);
3695 offset_vec = __msa_fill_w(offset);
3696 weight_vec = __msa_fill_w(weight);
3697 rnd_vec = __msa_fill_w(rnd_val + 1);
3699 filter_vec = LD_SH(filter);
3700 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3702 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3703 src0_ptr += (3 * src_stride);
3704 XORI_B3_128_SB(src0, src1, src2);
3705 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3707 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3708 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3709 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3710 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3711 src32_r, src43_r, src54_r, src65_r);
3712 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3715 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3717 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3719 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
3721 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
3723 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
3725 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
3726 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3728 weight_vec, rnd_vec, offset_vec,
3729 dst0_r, dst1_r, dst2_r, dst3_r,
3730 dst0_l, dst1_l, dst2_l, dst3_l);
3731 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
3732 weight_vec, rnd_vec, offset_vec,
3733 dst4_r, dst5_r, dst4_l, dst5_l);
3735 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
3736 dst2_l, dst2_r, dst3_l, dst3_r,
3737 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
3738 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3739 dst += (4 * dst_stride);
3740 ST8x2_UB(dst2_r, dst, dst_stride);
/* Vertical 4-tap bi-weighted HEVC inter prediction for 8-pixel-wide blocks,
 * processing 4 output rows per loop iteration (height assumed multiple of 4).
 * The 8-bit source (src0_ptr) is filtered vertically, combined with the
 * 16-bit intermediate reference (src1_ptr), weighted, rounded, clipped to
 * [0, 255] and stored as bytes.
 * NOTE(review): this extract omits several original lines (full parameter
 * list, tmpN = const_vec initialisations, closing braces); comments below
 * describe only the code that is visible here. */
3743 static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
3746                                              int32_t src2_stride,
3749                                              const int8_t *filter,
3758     int32_t offset, weight;
3759     v16i8 src0, src1, src2, src3, src4;
3760     v8i16 in0, in1, in2, in3;
3761     v16i8 src10_r, src32_r, src21_r, src43_r;
3762     v8i16 tmp0, tmp1, tmp2, tmp3;
3764     v8i16 filter_vec, const_vec;
3765     v4i32 weight_vec, offset_vec, rnd_vec;
3766     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     /* step back one row so the 4-tap window covers one row above dst row 0 */
3768     src0_ptr -= src_stride;
     /* combined bi-prediction offset, pre-scaled by the rounding shift */
3770     offset = (offset0 + offset1) << rnd_val;
3771     weight0 = weight0 & 0x0000FFFF;
     /* pack both 16-bit weights into one 32-bit lane for the dpadd below */
3772     weight = weight0 | (weight1 << 16);
3774     const_vec = __msa_ldi_h(128);
3776     offset_vec = __msa_fill_w(offset);
3777     weight_vec = __msa_fill_w(weight);
3778     rnd_vec = __msa_fill_w(rnd_val + 1);
3780     filter_vec = LD_SH(filter);
3781     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prime the vertical filter with the first 3 source rows */
3783     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3784     src0_ptr += (3 * src_stride);
3785     XORI_B3_128_SB(src0, src1, src2);
3786     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3788     for (loop_cnt = (height >> 2); loop_cnt--;) {
3789         LD_SB2(src0_ptr, src_stride, src3, src4);
3790         src0_ptr += (2 * src_stride);
3791         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3792         src1_ptr += (4 * src2_stride);
3793         XORI_B2_128_SB(src3, src4);
3794         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
         /* 4-tap vertical filtering via two dot-product accumulations */
3797         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3799         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3801         LD_SB2(src0_ptr, src_stride, src1, src2);
3802         src0_ptr += (2 * src_stride);
3803         XORI_B2_128_SB(src1, src2);
         /* recycle src10_r/src21_r as the interleaves for rows 3..5 */
3804         ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3807         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3809         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
         /* weight both predictions, add offset, round and clip to 0..255 */
3810         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3812                            weight_vec, rnd_vec, offset_vec,
3813                            dst0_r, dst1_r, dst2_r, dst3_r,
3814                            dst0_l, dst1_l, dst2_l, dst3_l);
3816         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3817                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3818         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3819         dst += (4 * dst_stride);
/* Dispatcher for 8-pixel-wide vertical 4-tap bi-weighted prediction:
 * selects the specialised 8x2 / 8x6 kernels for those exact heights and
 * the 4-rows-per-iteration kernel otherwise.
 * NOTE(review): this extract omits some original lines (the opening
 * "if (2 == height)" and part of the parameter list are not visible). */
3823 static void hevc_vt_biwgt_4t_8w_msa(uint8_t *src0_ptr,
3826                                     int32_t src2_stride,
3829                                     const int8_t *filter,
3838         hevc_vt_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3839                                  dst, dst_stride, filter, height,
3840                                  weight0, weight1, offset0, offset1, rnd_val);
3841     } else if (6 == height) {
3842         hevc_vt_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3843                                  dst, dst_stride, filter, height,
3844                                  weight0, weight1, offset0, offset1, rnd_val);
     /* general case: any other height handled 4 rows at a time */
3846         hevc_vt_biwgt_4t_8x4multiple_msa(src0_ptr, src_stride,
3847                                          src1_ptr, src2_stride,
3848                                          dst, dst_stride, filter, height,
3849                                          weight0, weight1, offset0, offset1,
/* Vertical 4-tap bi-weighted prediction for 12-pixel-wide blocks,
 * 4 rows per loop iteration. The left 8 columns use right-interleaved
 * vectors; the extra 4 columns use left-interleaves packed pairwise into
 * src2110/src4332 so two rows are filtered in one vector.
 * NOTE(review): extract omits some original lines (parameter list,
 * tmpN = const_vec initialisations, closing braces); comments describe
 * visible code only. */
3854 static void hevc_vt_biwgt_4t_12w_msa(uint8_t *src0_ptr,
3857                                      int32_t src2_stride,
3860                                      const int8_t *filter,
3869     int32_t offset, weight;
3870     v16i8 src0, src1, src2, src3, src4, src5;
3871     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3872     v16i8 src10_r, src32_r, src21_r, src43_r;
3873     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3874     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3875     v16i8 src2110, src4332;
3877     v8i16 filter_vec, const_vec;
3878     v4i32 weight_vec, offset_vec, rnd_vec;
3879     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3880     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
     /* step back one row for the 4-tap window */
3882     src0_ptr -= (1 * src_stride);
     /* combined offset pre-scaled by rounding shift; weights packed into
      * one 32-bit lane for the dpadd-based weighting */
3884     offset = (offset0 + offset1) << rnd_val;
3885     weight0 = weight0 & 0x0000FFFF;
3886     weight = weight0 | (weight1 << 16);
3888     const_vec = __msa_ldi_h(128);
3890     offset_vec = __msa_fill_w(offset);
3891     weight_vec = __msa_fill_w(weight);
3892     rnd_vec = __msa_fill_w(rnd_val + 1);
3894     filter_vec = LD_SH(filter);
3895     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prime with the first 3 rows; left halves packed into src2110 */
3897     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3898     src0_ptr += (3 * src_stride);
3899     XORI_B3_128_SB(src0, src1, src2);
3900     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3901     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3902     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3904     for (loop_cnt = (height >> 2); loop_cnt--;) {
3905         LD_SB2(src0_ptr, src_stride, src3, src4);
3906         src0_ptr += (2 * src_stride);
         /* in0..in3: left 8 columns of reference; in4/in5: right 4 columns
          * of two row-pairs, merged by the ILVR_D2 below */
3907         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3908         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
3909         src1_ptr += (4 * src2_stride);
3910         ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3911         XORI_B2_128_SB(src3, src4);
3913         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3914         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3915         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
         /* 4-tap vertical filtering: rows 0/1 (left) and packed right part */
3918         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3920         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3922         DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
3924         LD_SB2(src0_ptr, src_stride, src5, src2);
3925         src0_ptr += (2 * src_stride);
3926         XORI_B2_128_SB(src5, src2);
3927         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3928         ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3929         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
         /* rows 2/3 (left) and the next packed right part */
3932         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3934         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3936         DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
         /* weight, add offset, round and clip to 0..255 */
3937         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3939                            weight_vec, rnd_vec, offset_vec,
3940                            dst0_r, dst1_r, dst2_r, dst3_r,
3941                            dst0_l, dst1_l, dst2_l, dst3_l);
3942         HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
3943                            weight_vec, rnd_vec, offset_vec,
3944                            dst4_r, dst5_r, dst4_l, dst5_l);
3946         HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
3947                          dst2_l, dst2_r, dst3_l, dst3_r,
3948                          dst4_l, dst4_r, dst5_l, dst5_r,
3949                          dst0_r, dst1_r, dst2_r);
3950         ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
3951         dst += (4 * dst_stride);
/* Vertical 4-tap bi-weighted prediction for 16-pixel-wide blocks,
 * 4 rows per loop iteration (two 2-row passes). Each 16-wide row is
 * filtered as a right-interleave half and a left-interleave half.
 * NOTE(review): extract omits some original lines (parameter list,
 * tmpN = const_vec initialisations, closing braces); comments describe
 * visible code only. */
3955 static void hevc_vt_biwgt_4t_16w_msa(uint8_t *src0_ptr,
3958                                      int32_t src2_stride,
3961                                      const int8_t *filter,
3970     int32_t offset, weight;
3971     v16i8 src0, src1, src2, src3, src4, src5;
3972     v8i16 in0, in1, in2, in3;
3973     v16i8 src10_r, src32_r, src21_r, src43_r;
3974     v16i8 src10_l, src32_l, src21_l, src43_l;
3975     v8i16 tmp0, tmp1, tmp2, tmp3;
3977     v8i16 filter_vec, const_vec;
3978     v4i32 weight_vec, offset_vec, rnd_vec;
3979     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
     /* step back one row for the 4-tap window */
3981     src0_ptr -= src_stride;
     /* combined offset pre-scaled by rounding shift; packed weight pair */
3983     offset = (offset0 + offset1) << rnd_val;
3984     weight0 = weight0 & 0x0000FFFF;
3985     weight = weight0 | (weight1 << 16);
3987     const_vec = __msa_ldi_h(128);
3989     offset_vec = __msa_fill_w(offset);
3990     weight_vec = __msa_fill_w(weight);
3991     rnd_vec = __msa_fill_w(rnd_val + 1);
3993     filter_vec = LD_SH(filter);
3994     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prime with the first 3 rows (both halves interleaved) */
3996     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3997     src0_ptr += (3 * src_stride);
3998     XORI_B3_128_SB(src0, src1, src2);
3999     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4000     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
4002     for (loop_cnt = (height >> 2); loop_cnt--;) {
         /* first pair of rows */
4003         LD_SB2(src0_ptr, src_stride, src3, src4);
4004         src0_ptr += (2 * src_stride);
4005         LD_SH2(src1_ptr, src2_stride, in0, in1);
4006         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4007         src1_ptr += (2 * src2_stride);
4008         XORI_B2_128_SB(src3, src4);
4009         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4010         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4013         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
4015         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
4017         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
4019         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
         /* weight, add offset, round, clip to 0..255, pack and store */
4021         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4023                            weight_vec, rnd_vec, offset_vec,
4024                            dst0_r, dst1_r, dst2_r, dst3_r,
4025                            dst0_l, dst1_l, dst2_l, dst3_l);
4026         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
4027                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4028         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4029         dst += (2 * dst_stride);
         /* second pair of rows (reuses src10/21 names for the new rows) */
4030         LD_SB2(src0_ptr, src_stride, src5, src2);
4031         src0_ptr += (2 * src_stride);
4033         LD_SH2(src1_ptr, src2_stride, in0, in1);
4034         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4035         src1_ptr += (2 * src2_stride);
4036         XORI_B2_128_SB(src5, src2);
4037         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4038         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4041         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
4043         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
4045         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
4047         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
4048         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4050                            weight_vec, rnd_vec, offset_vec,
4051                            dst0_r, dst1_r, dst2_r, dst3_r,
4052                            dst0_l, dst1_l, dst2_l, dst3_l);
4054         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
4055                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4056         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4057         dst += (2 * dst_stride);
/* Vertical 4-tap bi-weighted prediction for 24-pixel-wide blocks:
 * the left 16 columns are handled like the 16w kernel (right + left
 * interleaves) and the extra 8 columns (src6..src11 at +16) use a
 * separate right-interleave pipeline. 4 rows per loop iteration.
 * NOTE(review): extract omits some original lines (parameter list,
 * tmpN = const_vec initialisations, closing braces); comments describe
 * visible code only. */
4061 static void hevc_vt_biwgt_4t_24w_msa(uint8_t *src0_ptr,
4064                                      int32_t src2_stride,
4067                                      const int8_t *filter,
4076     int32_t offset, weight;
4077     v16i8 src0, src1, src2, src3, src4, src5;
4078     v16i8 src6, src7, src8, src9, src10, src11;
4079     v8i16 in0, in1, in2, in3, in4, in5;
4080     v16i8 src10_r, src32_r, src76_r, src98_r;
4081     v16i8 src10_l, src32_l, src21_l, src43_l;
4082     v16i8 src21_r, src43_r, src87_r, src109_r;
4083     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4085     v8i16 filter_vec, const_vec;
4086     v4i32 weight_vec, offset_vec, rnd_vec;
4087     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
4088     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
     /* step back one row for the 4-tap window */
4090     src0_ptr -= src_stride;
     /* combined offset pre-scaled by rounding shift; packed weight pair */
4092     offset = (offset0 + offset1) << rnd_val;
4093     weight0 = weight0 & 0x0000FFFF;
4094     weight = weight0 | (weight1 << 16);
4096     const_vec = __msa_ldi_h(128);
4098     offset_vec = __msa_fill_w(offset);
4099     weight_vec = __msa_fill_w(weight);
4100     rnd_vec = __msa_fill_w(rnd_val + 1);
4102     filter_vec = LD_SH(filter);
4103     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prime the left-16 pipeline with the first 3 rows */
4106     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4107     XORI_B3_128_SB(src0, src1, src2);
4108     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4109     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
     /* prime the right-8 pipeline (columns 16..23) */
4111     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4112     src0_ptr += (3 * src_stride);
4113     XORI_B3_128_SB(src6, src7, src8);
4114     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4116     for (loop_cnt = (height >> 2); loop_cnt--;) {
         /* rows 0/1: left 16 columns */
4118         LD_SB2(src0_ptr, src_stride, src3, src4);
4119         LD_SH2(src1_ptr, src2_stride, in0, in1);
4120         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4121         XORI_B2_128_SB(src3, src4);
4122         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4123         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
         /* rows 0/1: right 8 columns */
4126         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4127         src0_ptr += (2 * src_stride);
4128         LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4129         src1_ptr += (2 * src2_stride);
4130         XORI_B2_128_SB(src9, src10);
4131         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4134         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
4136         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
4138         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
4140         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
4143         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
4145         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
         /* weight, add offset, round, clip to 0..255 for all 24 columns */
4147         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4149                            weight_vec, rnd_vec, offset_vec,
4150                            dst0_r, dst1_r, dst2_r, dst3_r,
4151                            dst0_l, dst1_l, dst2_l, dst3_l);
4153         HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4154                            weight_vec, rnd_vec, offset_vec,
4155                            dst4_r, dst5_r, dst4_l, dst5_l);
4157         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
4158                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4160         HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
4161         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4162         ST8x2_UB(dst4_r, dst + 16, dst_stride);
4163         dst += (2 * dst_stride);
         /* rows 2/3: same structure, history registers rotated */
4166         LD_SB2(src0_ptr, src_stride, src5, src2);
4167         LD_SH2(src1_ptr, src2_stride, in0, in1);
4168         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4169         XORI_B2_128_SB(src5, src2);
4170         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
4171         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
4173         LD_SB2(src0_ptr + 16, src_stride, src11, src8);
4174         src0_ptr += (2 * src_stride);
4175         LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4176         src1_ptr += (2 * src2_stride);
4177         XORI_B2_128_SB(src11, src8);
4178         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
4181         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
4183         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
4185         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
4187         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
4190         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
4192         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
4194         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4196                            weight_vec, rnd_vec, offset_vec,
4197                            dst0_r, dst1_r, dst2_r, dst3_r,
4198                            dst0_l, dst1_l, dst2_l, dst3_l);
4200         HEVC_BIW_RND_CLIP2(tmp2, tmp3, in4, in5,
4201                            weight_vec, rnd_vec, offset_vec,
4202                            dst4_r, dst5_r, dst4_l, dst5_l);
4204         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
4205                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4208         HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
4209         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4210         ST8x2_UB(dst4_r, dst + 16, dst_stride);
4211         dst += (2 * dst_stride);
/* Vertical 4-tap bi-weighted prediction for 32-pixel-wide blocks,
 * 2 rows per loop iteration. Two independent 16-wide pipelines:
 * columns 0..15 written via dst, columns 16..31 via dst_tmp.
 * NOTE(review): extract omits some original lines (parameter list,
 * tmpN = const_vec initialisations, closing braces); comments describe
 * visible code only. */
4215 static void hevc_vt_biwgt_4t_32w_msa(uint8_t *src0_ptr,
4218                                      int32_t src2_stride,
4221                                      const int8_t *filter,
     /* second output cursor for the upper 16 columns */
4230     uint8_t *dst_tmp = dst + 16;
4231     int32_t offset, weight;
4232     v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
4233     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4234     v16i8 src10_r, src32_r, src76_r, src98_r;
4235     v16i8 src21_r, src43_r, src87_r, src109_r;
4236     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
4237     v16i8 src10_l, src32_l, src76_l, src98_l;
4238     v16i8 src21_l, src43_l, src87_l, src109_l;
4240     v8i16 filter_vec, const_vec;
4241     v4i32 weight_vec, offset_vec, rnd_vec;
4242     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4243     v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
     /* step back one row for the 4-tap window */
4245     src0_ptr -= src_stride;
     /* combined offset pre-scaled by rounding shift; packed weight pair */
4247     offset = (offset0 + offset1) << rnd_val;
4248     weight0 = weight0 & 0x0000FFFF;
4249     weight = weight0 | (weight1 << 16);
4251     const_vec = __msa_ldi_h(128);
4253     offset_vec = __msa_fill_w(offset);
4254     weight_vec = __msa_fill_w(weight);
4255     rnd_vec = __msa_fill_w(rnd_val + 1);
4257     filter_vec = LD_SH(filter);
4258     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* prime the lower-16 pipeline with the first 3 rows */
4261     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4262     XORI_B3_128_SB(src0, src1, src2);
4263     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
4264     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
     /* prime the upper-16 pipeline (columns 16..31) */
4266     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
4267     src0_ptr += (3 * src_stride);
4268     XORI_B3_128_SB(src6, src7, src8);
4269     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
4270     ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
4272     for (loop_cnt = (height >> 1); loop_cnt--;) {
         /* lower 16 columns of the two rows */
4274         LD_SB2(src0_ptr, src_stride, src3, src4);
4275         LD_SH2(src1_ptr, src2_stride, in0, in1);
4276         LD_SH2(src1_ptr + 8, src2_stride, in2, in3);
4277         XORI_B2_128_SB(src3, src4);
4278         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
4279         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
4283         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
4285         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
4287         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
4289         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
         /* weight, add offset, round, clip to 0..255, pack and store */
4291         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
4293                            weight_vec, rnd_vec, offset_vec,
4294                            dst0_r, dst1_r, dst2_r, dst3_r,
4295                            dst0_l, dst1_l, dst2_l, dst3_l);
4297         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
4298                         dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
4299         ST_SW2(dst0_r, dst1_r, dst, dst_stride);
4300         dst += (2 * dst_stride);
         /* upper 16 columns of the same two rows */
4309         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
4310         src0_ptr += (2 * src_stride);
4311         LD_SH2(src1_ptr + 16, src2_stride, in4, in5);
4312         LD_SH2(src1_ptr + 24, src2_stride, in6, in7);
4313         src1_ptr += (2 * src2_stride);
4314         XORI_B2_128_SB(src9, src10);
4315         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
4316         ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
4319         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
4321         DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
4323         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
4325         DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
4327         HEVC_BIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
4329                            weight_vec, rnd_vec, offset_vec,
4330                            dst4_r, dst5_r, dst6_r, dst7_r,
4331                            dst4_l, dst5_l, dst6_l, dst7_l);
4334         HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
4335                         dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
4336         ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
4337         dst_tmp += (2 * dst_stride);
/* Horizontal+vertical (2-D) 4-tap bi-weighted prediction, 4 wide x 2 rows.
 * Horizontal filtering uses byte shuffles (mask0/mask1) + dot products;
 * the 16-bit horizontal results are then filtered vertically with
 * HEVC_FILT_4TAP, combined with the reference, weighted, rounded and
 * clipped to [0, 255].
 * NOTE(review): extract omits some original lines (parameter list, mask1
 * derivation, dstN = const_vec initialisations, closing brace); comments
 * describe visible code only. */
4347 static void hevc_hv_biwgt_4t_4x2_msa(uint8_t *src0_ptr,
4350                                      int32_t src2_stride,
4353                                      const int8_t *filter_x,
4354                                      const int8_t *filter_y,
4362     int32_t offset, weight;
4364     v16i8 src0, src1, src2, src3, src4;
4366     v4i32 filt_h0, filt_h1;
4367     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4369     v8i16 filter_vec, const_vec;
4370     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4371     v8i16 dst0, dst1, dst2, dst3, dst4;
4372     v4i32 dst0_r, dst1_r, dst0_l;
4373     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4374     v4i32 weight_vec, offset_vec, rnd_vec;
     /* back up one row and one column so the 4-tap windows are centred */
4376     src0_ptr -= (src_stride + 1);
4378     filter_vec = LD_SH(filter_x);
4379     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* sign-extend the 8-bit vertical coeffs to 16 bits before splatting */
4381     filter_vec = LD_SH(filter_y);
4382     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4383     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4385     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
     /* combined offset pre-scaled by rounding shift; packed weight pair */
4389     offset = (offset0 + offset1) << rnd_val;
4390     weight0 = weight0 & 0x0000FFFF;
4391     weight = weight0 | (weight1 << 16);
4393     const_vec = __msa_ldi_h(128);
4395     offset_vec = __msa_fill_w(offset);
4396     weight_vec = __msa_fill_w(weight);
4397     rnd_vec = __msa_fill_w(rnd_val + 1);
     /* horizontal filtering of the first 3 rows primes the vertical taps */
4399     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4400     src0_ptr += (3 * src_stride);
4401     XORI_B3_128_SB(src0, src1, src2);
4403     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4404     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4405     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4407     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4409     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4411     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4412     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4414     LD_SB2(src0_ptr, src_stride, src3, src4);
4415     LD_SH2(src1_ptr, src2_stride, in0, in1);
     /* merge the two 4-wide reference rows into one vector */
4416     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
4417     XORI_B2_128_SB(src3, src4);
     /* row 0: horizontal then vertical 4-tap */
4419     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4421     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4422     dst32_r = __msa_ilvr_h(dst3, dst2);
4423     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
     /* row 1 */
4426     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4428     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4429     dst43_r = __msa_ilvr_h(dst4, dst3);
4430     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4432     dst1_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
     /* weight both predictions via dot-product, round, clip to 0..255 */
4434     ILVRL_H2_SW(dst1_r, in0, dst0_r, dst0_l);
4435     dst0_r = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_r, (v8i16) weight_vec);
4436     dst0_l = __msa_dpadd_s_w(offset_vec, (v8i16) dst0_l, (v8i16) weight_vec);
4437     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
4438     dst0_r = CLIP_SW_0_255(dst0_r);
4439     dst0_l = CLIP_SW_0_255(dst0_l);
4441     HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
4442     ST4x2_UB(dst0_r, dst, dst_stride);
/* Horizontal+vertical 4-tap bi-weighted prediction, 4 wide x 4 rows.
 * Same structure as the 4x2 kernel: horizontal shuffles + dot products,
 * then HEVC_FILT_4TAP vertically, then weighted round/clip via
 * HEVC_BIW_RND_CLIP2.
 * NOTE(review): extract omits some original lines (parameter list, mask1
 * derivation, dstN = const_vec initialisations, tmp0/tmp1 declarations,
 * closing brace); comments describe visible code only. */
4445 static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
4448                                      int32_t src2_stride,
4451                                      const int8_t *filter_x,
4452                                      const int8_t *filter_y,
4460     int32_t offset, weight;
4461     v8i16 in0, in1, in2, in3;
4462     v16i8 src0, src1, src2, src3, src4, src5, src6;
4464     v4i32 filt_h0, filt_h1;
4465     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4467     v8i16 filter_vec, const_vec;
4468     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4469     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4471     v4i32 dst0_l, dst1_l;
4472     v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
4473     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4474     v4i32 weight_vec, offset_vec, rnd_vec;
     /* back up one row and one column for the centred 4-tap windows */
4476     src0_ptr -= (src_stride + 1);
4478     filter_vec = LD_SH(filter_x);
4479     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* sign-extend 8-bit vertical coeffs to 16 bits before splatting */
4481     filter_vec = LD_SH(filter_y);
4482     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4483     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4485     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
     /* combined offset pre-scaled by rounding shift; packed weight pair */
4489     offset = (offset0 + offset1) << rnd_val;
4490     weight0 = weight0 & 0x0000FFFF;
4491     weight = weight0 | (weight1 << 16);
4493     const_vec = __msa_ldi_h(128);
4495     offset_vec = __msa_fill_w(offset);
4496     weight_vec = __msa_fill_w(weight);
4497     rnd_vec = __msa_fill_w(rnd_val + 1);
     /* horizontal filtering of the first 3 rows primes the vertical taps */
4499     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4500     src0_ptr += (3 * src_stride);
4501     XORI_B3_128_SB(src0, src1, src2);
4503     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4504     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4505     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4507     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4509     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4511     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4512     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4514     LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
4515     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
     /* pack the four 4-wide reference rows pairwise */
4516     ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
4517     XORI_B4_128_SB(src3, src4, src5, src6);
     /* rows 0..3: horizontal then vertical filtering, one row at a time */
4519     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4521     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4522     dst32_r = __msa_ilvr_h(dst3, dst2);
4523     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4526     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4528     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4529     dst43_r = __msa_ilvr_h(dst4, dst3);
4530     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4533     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4535     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
     /* history registers are recycled (dst10_r/dst21_r reused) */
4536     dst10_r = __msa_ilvr_h(dst5, dst4);
4537     dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4540     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4542     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4543     dst21_r = __msa_ilvr_h(dst2, dst5);
4544     dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
     /* weight, add offset, round, clip to 0..255, pack and store 4x4 */
4546     PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4547     HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
4548                        weight_vec, rnd_vec, offset_vec,
4549                        dst0_r, dst1_r, dst0_l, dst1_l);
4551     HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
4552     ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal+vertical 4-tap bi-weighted prediction, 4 wide, height a
 * multiple of 8 — processes 8 output rows per loop iteration.
 * NOTE(review): extract omits some original lines (parameter list, mask1
 * derivation, dstN = const_vec initialisations, closing braces); comments
 * describe visible code only. */
4555 static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
4558                                              int32_t src2_stride,
4561                                              const int8_t *filter_x,
4562                                              const int8_t *filter_y,
4571     int32_t offset, weight;
4572     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
4573     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4575     v4i32 filt_h0, filt_h1;
4576     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4578     v8i16 filter_vec, const_vec;
4579     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4580     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
4581     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4582     v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
4583     v8i16 tmp0, tmp1, tmp2, tmp3;
4584     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4585     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4586     v4i32 weight_vec, offset_vec, rnd_vec;
     /* back up one row and one column for the centred 4-tap windows */
4588     src0_ptr -= (src_stride + 1);
4590     filter_vec = LD_SH(filter_x);
4591     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* sign-extend 8-bit vertical coeffs to 16 bits before splatting */
4593     filter_vec = LD_SH(filter_y);
4594     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4595     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4597     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
     /* combined offset pre-scaled by rounding shift; packed weight pair */
4601     offset = (offset0 + offset1) << rnd_val;
4602     weight0 = weight0 & 0x0000FFFF;
4603     weight = weight0 | (weight1 << 16);
4605     const_vec = __msa_ldi_h(128);
4607     offset_vec = __msa_fill_w(offset);
4608     weight_vec = __msa_fill_w(weight);
4609     rnd_vec = __msa_fill_w(rnd_val + 1);
     /* horizontal filtering of the first 3 rows primes the vertical taps */
4611     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4612     src0_ptr += (3 * src_stride);
4613     XORI_B3_128_SB(src0, src1, src2);
4615     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4616     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4617     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4619     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4621     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4623     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4624     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4626     for (loop_cnt = height >> 3; loop_cnt--;) {
4627         LD_SB8(src0_ptr, src_stride,
4628                src3, src4, src5, src6, src7, src8, src9, src10);
4629         src0_ptr += (8 * src_stride);
4630         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
4631         src1_ptr += (8 * src2_stride);
         /* pack the eight 4-wide reference rows pairwise into in0..in3 */
4632         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
4633         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
4634         XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
         /* per row: horizontal dot product, then vertical HEVC_FILT_4TAP
          * over the sliding 4-row history */
4636         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4638         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4639         dst32_r = __msa_ilvr_h(dst3, dst2);
4640         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4643         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4645         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4646         dst43_r = __msa_ilvr_h(dst4, dst3);
4647         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4650         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4652         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4653         dst54_r = __msa_ilvr_h(dst5, dst4);
4654         dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4657         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4659         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
4660         dst65_r = __msa_ilvr_h(dst6, dst5);
4661         dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4663         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4665         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
4666         dst76_r = __msa_ilvr_h(dst7, dst6);
4667         dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4669         VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4671         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
4672         dst87_r = __msa_ilvr_h(dst8, dst7);
4673         dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4675         VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
4677         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
         /* history registers recycled (dst10_r / dst21_r / dst2) so the
          * loop-carried state matches the next iteration's expectations */
4678         dst10_r = __msa_ilvr_h(dst9, dst8);
4679         dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
4681         VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
4683         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4684         dst21_r = __msa_ilvr_h(dst2, dst9);
4685         dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
         /* weight, add offset, round, clip to 0..255, pack and store 4x8 */
4687         PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
4688                     dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4689         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4691                            weight_vec, rnd_vec, offset_vec,
4692                            dst0_r, dst1_r, dst2_r, dst3_r,
4693                            dst0_l, dst1_l, dst2_l, dst3_l);
4695         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
4696                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4697         ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
4698         dst += (8 * dst_stride);
/* Dispatcher for 4-pixel-wide 2-D bi-weighted prediction: exact kernels
 * for heights 2 and 4, otherwise the 8-rows-per-iteration kernel for any
 * height that is a multiple of 8.
 * NOTE(review): this extract omits some original lines (the opening
 * "if (2 == height)" and part of the parameter list are not visible). */
4702 static void hevc_hv_biwgt_4t_4w_msa(uint8_t *src0_ptr,
4705                                     int32_t src2_stride,
4708                                     const int8_t *filter_x,
4709                                     const int8_t *filter_y,
4718         hevc_hv_biwgt_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4719                                  dst, dst_stride, filter_x, filter_y,
4720                                  height, weight0, weight1, offset0, offset1,
4722     } else if (4 == height) {
4723         hevc_hv_biwgt_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4724                                  dst, dst_stride, filter_x, filter_y,
4725                                  height, weight0, weight1, offset0, offset1,
4727     } else if (0 == (height % 8)) {
4728         hevc_hv_biwgt_4t_4multx8mult_msa(src0_ptr, src_stride,
4729                                          src1_ptr, src2_stride,
4730                                          dst, dst_stride, filter_x, filter_y,
4731                                          height, weight0, weight1,
4732                                          offset0, offset1, rnd_val);
/* Horizontal+vertical 4-tap bi-weighted prediction for 6-pixel-wide
 * blocks, 4 rows per loop iteration. Unlike the 4w kernels, both the
 * right- and left-interleave halves of the horizontal results are fed
 * to the vertical filter; ST6x4_UB stores only 6 of the 8 computed
 * columns per row.
 * NOTE(review): extract omits some original lines (parameter list, mask1
 * derivation, dstN = const_vec initialisations, closing braces); comments
 * describe visible code only. */
4736 static void hevc_hv_biwgt_4t_6w_msa(uint8_t *src0_ptr,
4739                                     int32_t src2_stride,
4742                                     const int8_t *filter_x,
4743                                     const int8_t *filter_y,
4752     int32_t offset, weight;
4753     v16i8 src0, src1, src2, src3, src4, src5, src6;
4754     v8i16 in0, in1, in2, in3;
4756     v4i32 filt_h0, filt_h1;
4757     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4759     v8i16 filter_vec, const_vec;
4760     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4761     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4762     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4763     v8i16 tmp0, tmp1, tmp2, tmp3;
4764     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4765     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4766     v4i32 weight_vec, offset_vec, rnd_vec;
     /* back up one row and one column for the centred 4-tap windows */
4768     src0_ptr -= (src_stride + 1);
4770     filter_vec = LD_SH(filter_x);
4771     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* sign-extend 8-bit vertical coeffs to 16 bits before splatting */
4773     filter_vec = LD_SH(filter_y);
4774     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4775     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4777     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
     /* combined offset pre-scaled by rounding shift; packed weight pair */
4781     offset = (offset0 + offset1) << rnd_val;
4782     weight0 = weight0 & 0x0000FFFF;
4783     weight = weight0 | (weight1 << 16);
4785     const_vec = __msa_ldi_h(128);
4787     offset_vec = __msa_fill_w(offset);
4788     weight_vec = __msa_fill_w(weight);
4789     rnd_vec = __msa_fill_w(rnd_val + 1);
     /* horizontal filtering of the first 3 rows primes the vertical taps */
4791     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4792     src0_ptr += (3 * src_stride);
4793     XORI_B3_128_SB(src0, src1, src2);
4795     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4796     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4797     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4799     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4801     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4803     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4805     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4806     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4808     for (loop_cnt = height >> 2; loop_cnt--;) {
4809         LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
4810         src0_ptr += (4 * src_stride);
4811         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4812         src1_ptr += (4 * src2_stride);
4813         XORI_B4_128_SB(src3, src4, src5, src6);
         /* per row: horizontal dot product, then vertical HEVC_FILT_4TAP
          * on both interleave halves */
4815         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4817         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4819         ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4820         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4821         dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4825         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4827         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4829         ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4830         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4831         dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
         /* history registers recycled (dst10/dst21 and dst2 reused) */
4835         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4837         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4839         ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
4840         dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4841         dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
4844         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4846         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4848         ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
4849         dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
4850         dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
         /* weight, add offset, round, clip to 0..255, store 6x4 */
4853         PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
4854                     dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
4855         HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
4857                            weight_vec, rnd_vec, offset_vec,
4858                            dst0_r, dst1_r, dst2_r, dst3_r,
4859                            dst0_l, dst1_l, dst2_l, dst3_l);
4861         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
4862                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4863         ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
4864         dst += (4 * dst_stride);
/*
 * HEVC bi-directional weighted prediction, 4-tap horizontal+vertical (hv)
 * filtering of one 8x2 block, MIPS MSA.
 *
 * NOTE(review): this extract carries stale per-line numbers and the numbering
 * has gaps, so statements present in the upstream file (remaining parameter
 * declarations, the "dstN = const_vec;" accumulator initialisations,
 * "const_vec <<= 6;", closing braces) appear to be missing here — restore
 * from upstream before compiling.  Code below is byte-identical to the
 * extract; only comments were added.
 */
4868 static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
4871 int32_t src2_stride,
4874 const int8_t *filter_x,
4875 const int8_t *filter_y,
4883 int32_t weight, offset;
4884 v16i8 src0, src1, src2, src3, src4;
4886 v4i32 filt_h0, filt_h1;
4887 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4889 v8i16 filter_vec, const_vec;
4890 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4891 v8i16 dst0, dst1, dst2, dst3, dst4;
4893 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4894 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4895 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4897 v4i32 weight_vec, offset_vec, rnd_vec;
/* back up one row and one column: the 4-tap window needs one sample of
 * context before the block origin in each direction */
4899 src0_ptr -= (src_stride + 1);
/* splat horizontal tap pairs 0/1 */
4901 filter_vec = LD_SH(filter_x);
4902 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* widen the 8-bit vertical taps to 16 bits by interleaving each byte with
 * its sign mask (clti_s_b yields all-ones for negative bytes), then splat */
4904 filter_vec = LD_SH(filter_y);
4905 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4906 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4908 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
/* combined bi-pred offset, pre-scaled by rnd_val */
4912 offset = (offset0 + offset1) << rnd_val;
/* pack weight0 into the low and weight1 into the high halfword of each
 * 32-bit lane, matching the dpadd_s_w weighting in HEVC_BIW_RND_CLIP2 */
4913 weight0 = weight0 & 0x0000FFFF;
4914 weight = weight0 | (weight1 << 16);
4916 const_vec = __msa_ldi_h(128);
4918 offset_vec = __msa_fill_w(offset);
4919 weight_vec = __msa_fill_w(weight);
/* rounding shift is rnd_val + 1 — presumably the extra bit folds in the /2
 * of the bi-directional average; confirm against the scalar reference */
4920 rnd_vec = __msa_fill_w(rnd_val + 1);
/* prologue: three rows of vertical history */
4922 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4923 src0_ptr += (3 * src_stride);
/* flip the sign bit to treat u8 pixels as signed for the dot products */
4924 XORI_B3_128_SB(src0, src1, src2);
/* horizontal 4-tap filtering of rows 0..2 */
4926 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4927 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4928 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4930 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4932 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4934 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
/* interleave consecutive row results for the vertical MACs */
4936 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4937 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* rows 3..4 of the reference plus the two 16-bit second-source rows */
4939 LD_SB2(src0_ptr, src_stride, src3, src4);
4941 LD_SH2(src1_ptr, src2_stride, in0, in1);
4942 XORI_B2_128_SB(src3, src4);
4944 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4946 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* vertical 4-tap on rows {0,1,2,3} -> output row 0 */
4948 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4949 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4950 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4953 tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
4955 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4957 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
/* vertical 4-tap on rows {1,2,3,4} -> output row 1 */
4959 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4960 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4961 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4964 tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
/* weight, round and clip both rows against in0/in1, then store 8x2 */
4966 HEVC_BIW_RND_CLIP2(tmp0, tmp1, in0, in1,
4967 weight_vec, rnd_vec, offset_vec,
4968 dst0_r, dst1_r, dst0_l, dst1_l);
4969 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
4970 ST8x2_UB(dst0_r, dst, dst_stride);
/*
 * HEVC bi-directional weighted prediction, 4-tap hv filtering of one 8x6
 * block, MIPS MSA.  First four output rows go out via ST8x4, the last two
 * via ST8x2.
 *
 * NOTE(review): lossy extract — stale per-line numbers with gaps; missing
 * statements ("dstN = const_vec;" initialisations, "const_vec <<= 6;",
 * remaining parameters, braces) must be restored from upstream.  Code is
 * byte-identical to the extract; only comments were added.
 */
4973 static void hevc_hv_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
4976 int32_t src2_stride,
4979 const int8_t *filter_x,
4980 const int8_t *filter_y,
4988 uint32_t offset, weight;
4989 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4991 v4i32 filt_h0, filt_h1;
4992 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4994 v8i16 filter_vec, const_vec;
4995 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4996 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4997 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4998 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4999 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
5000 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
5001 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
5002 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
5003 v8i16 in0, in1, in2, in3, in4, in5;
5004 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
5005 v4i32 weight_vec, offset_vec, rnd_vec;
/* one row / one column of context before the block for the 4-tap window */
5007 src0_ptr -= (src_stride + 1);
/* horizontal tap pairs */
5009 filter_vec = LD_SH(filter_x);
5010 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit vertical taps to 16 bits (interleave with sign
 * mask), then splat tap pairs */
5012 filter_vec = LD_SH(filter_y);
5013 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
5014 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
5016 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
/* combined bi-pred offset, pre-scaled; weights packed lo/hi per word lane */
5020 offset = (offset0 + offset1) << rnd_val;
5021 weight0 = weight0 & 0x0000FFFF;
5022 weight = weight0 | (weight1 << 16);
5024 const_vec = __msa_ldi_h(128);
5026 offset_vec = __msa_fill_w(offset);
5027 weight_vec = __msa_fill_w(weight);
5028 rnd_vec = __msa_fill_w(rnd_val + 1);
/* prologue: three rows of vertical history, horizontally filtered */
5030 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
5031 src0_ptr += (3 * src_stride);
5032 XORI_B3_128_SB(src0, src1, src2);
5034 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5035 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5036 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5038 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
5040 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
5042 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
5044 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5045 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* all six 16-bit second-source rows are loaded up front */
5047 LD_SB2(src0_ptr, src_stride, src3, src4);
5048 src0_ptr += (2 * src_stride);
5049 XORI_B2_128_SB(src3, src4);
5050 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
5051 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5053 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* output rows 0 and 1 */
5055 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5056 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5057 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5060 tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
5062 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
5064 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
5066 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5067 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5068 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
5071 tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
/* output rows 2 and 3 */
5073 LD_SB2(src0_ptr, src_stride, src5, src6);
5074 src0_ptr += (2 * src_stride);
5075 XORI_B2_128_SB(src5, src6);
5077 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
5079 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
5081 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
5082 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
5083 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
5086 tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
5088 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
5090 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
5092 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
5093 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
5094 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
5097 tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
/* weight/round/clip rows 0..3 against in0..in3 and store 8x4
 * (NOTE(review): the line passing in0..in3 is among the missing lines) */
5099 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
5101 weight_vec, rnd_vec, offset_vec,
5102 dst0_r, dst1_r, dst2_r, dst3_r,
5103 dst0_l, dst1_l, dst2_l, dst3_l);
5105 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
5106 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
5107 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
5108 dst += (4 * dst_stride);
/* output rows 4 and 5 */
5110 LD_SB2(src0_ptr, src_stride, src7, src8);
5111 XORI_B2_128_SB(src7, src8);
5113 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
5115 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
5117 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
5118 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
5119 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
5122 tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
5124 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
5126 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
5128 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
5129 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
5130 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
5133 tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
/* weight/round/clip rows 4..5 against in4/in5 and store the final 8x2 */
5135 HEVC_BIW_RND_CLIP2(tmp4, tmp5, in4, in5,
5136 weight_vec, rnd_vec, offset_vec,
5137 dst4_r, dst5_r, dst4_l, dst5_l);
5139 HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst2_r);
5140 ST8x2_UB(dst2_r, dst, dst_stride);
/*
 * HEVC bi-directional weighted prediction, 4-tap hv filtering for blocks
 * whose width is a multiple of 8 and height a multiple of 4, MIPS MSA.
 * Processes the picture in 8-pixel-wide columns (outer loop, width >> 3),
 * four rows per inner-loop iteration (height >> 2).
 *
 * NOTE(review): lossy extract — stale per-line numbers with gaps; missing
 * statements ("dstN = const_vec;" initialisations, "const_vec <<= 6;",
 * dst_tmp setup, column-advance statements at loop end, braces) must be
 * restored from upstream.  Code is byte-identical; only comments added.
 */
5143 static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
5146 int32_t src2_stride,
5149 const int8_t *filter_x,
5150 const int8_t *filter_y,
5161 int32_t offset, weight;
5162 uint8_t *src0_ptr_tmp;
5163 int16_t *src1_ptr_tmp;
5165 v16i8 src0, src1, src2, src3, src4, src5, src6;
5166 v8i16 in0, in1, in2, in3;
5168 v4i32 filt_h0, filt_h1;
5169 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
5171 v8i16 filter_vec, const_vec;
5172 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
5173 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
5174 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
5175 v8i16 tmp0, tmp1, tmp2, tmp3;
5176 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
5177 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
5178 v4i32 weight_vec, offset_vec, rnd_vec;
/* one row / one column of context for the 4-tap window */
5180 src0_ptr -= (src_stride + 1);
5182 filter_vec = LD_SH(filter_x);
5183 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend vertical taps to 16 bits, then splat tap pairs */
5185 filter_vec = LD_SH(filter_y);
5186 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
5187 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
5189 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
/* combined offset pre-scaled; weight0/weight1 packed lo/hi per word lane */
5193 offset = (offset0 + offset1) << rnd_val;
5194 weight0 = weight0 & 0x0000FFFF;
5195 weight = weight0 | (weight1 << 16);
5197 const_vec = __msa_ldi_h(128);
5199 offset_vec = __msa_fill_w(offset);
5200 weight_vec = __msa_fill_w(weight);
5201 rnd_vec = __msa_fill_w(rnd_val + 1);
/* one 8-wide column per outer iteration */
5203 for (cnt = width >> 3; cnt--;) {
5204 src0_ptr_tmp = src0_ptr;
5205 src1_ptr_tmp = src1_ptr;
/* per-column prologue: three horizontally-filtered rows of history */
5208 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
5209 src0_ptr_tmp += (3 * src_stride);
5210 XORI_B3_128_SB(src0, src1, src2);
5212 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
5213 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
5214 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
5216 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
5218 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
5220 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
5222 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
5223 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* four output rows per inner iteration */
5225 for (loop_cnt = height >> 2; loop_cnt--;) {
5226 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
5227 src0_ptr_tmp += (4 * src_stride);
5228 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
5229 src1_ptr_tmp += (4 * src2_stride);
5230 XORI_B4_128_SB(src3, src4, src5, src6);
5232 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
5234 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
5236 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
5237 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
5238 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
5242 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
5244 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
5246 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
5247 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
5248 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* dst10_r/l are recycled here to carry src5's interleave into the next
 * iteration's history */
5252 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
5254 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
5256 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
5257 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
5258 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
/* src6's row result is deliberately written into dst2: the next inner
 * iteration's ILVRL_H2_SH(dst3, dst2, ...) reads it as vertical history */
5262 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
5264 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
5266 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
5267 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
5268 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
/* weight/round/clip the four rows against in0..in3 and store 8x4
 * (NOTE(review): the line passing in0..in3 is among the missing lines) */
5272 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
5273 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
5274 HEVC_BIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
5276 weight_vec, rnd_vec, offset_vec,
5277 dst0_r, dst1_r, dst2_r, dst3_r,
5278 dst0_l, dst1_l, dst2_l, dst3_l);
5280 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
5281 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
5282 ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
5283 dst_tmp += (4 * dst_stride);
/*
 * Width-8 hv bi-weighted dispatcher: height 2 -> 8x2 kernel, height 6 ->
 * 8x6 kernel, any multiple of 4 -> the generic 8-column kernel with
 * width = 8.
 *
 * NOTE(review): lossy extract — the opening "if (2 == height) {" line, the
 * remaining parameter declarations, the trailing rnd_val arguments and the
 * closing braces are missing (gaps in the stale numbering); restore from
 * upstream.  Code is byte-identical; only comments added.
 */
5292 static void hevc_hv_biwgt_4t_8w_msa(uint8_t *src0_ptr,
5295 int32_t src2_stride,
5298 const int8_t *filter_x,
5299 const int8_t *filter_y,
5308 hevc_hv_biwgt_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5309 dst, dst_stride, filter_x, filter_y,
5310 height, weight0, weight1, offset0, offset1,
5312 } else if (6 == height) {
5313 hevc_hv_biwgt_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
5314 dst, dst_stride, filter_x, filter_y,
5315 height, weight0, weight1, offset0, offset1,
5317 } else if (0 == (height % 4)) {
5318 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5319 src1_ptr, src2_stride,
5320 dst, dst_stride, filter_x, filter_y,
5322 weight1, offset0, offset1, rnd_val, 8);
/*
 * Width-12 hv bi-weighted prediction: one 8-wide column through the generic
 * 8-column kernel, plus the remaining 4-wide column at x = 8 through the
 * 4-wide kernel (note the +8 on src0_ptr/src1_ptr/dst).
 *
 * NOTE(review): lossy extract — parameter declarations, some argument lines
 * and closing braces are missing (gaps in the stale numbering).  Code is
 * byte-identical; only comments added.
 */
5326 static void hevc_hv_biwgt_4t_12w_msa(uint8_t *src0_ptr,
5329 int32_t src2_stride,
5332 const int8_t *filter_x,
5333 const int8_t *filter_y,
5341 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5342 src1_ptr, src2_stride,
5344 filter_x, filter_y, height, weight0,
5345 weight1, offset0, offset1, rnd_val, 8);
5347 hevc_hv_biwgt_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
5348 dst + 8, dst_stride, filter_x, filter_y,
5349 height, weight0, weight1, offset0,
/*
 * Width-16 hv bi-weighted prediction: thin wrapper over the generic
 * 8-column kernel with width = 16.
 *
 * NOTE(review): lossy extract — parameter declarations and braces missing
 * (numbering gaps).  Code is byte-identical; only comments added.
 */
5353 static void hevc_hv_biwgt_4t_16w_msa(uint8_t *src0_ptr,
5356 int32_t src2_stride,
5359 const int8_t *filter_x,
5360 const int8_t *filter_y,
5368 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5369 src1_ptr, src2_stride,
5371 filter_x, filter_y, height, weight0,
5372 weight1, offset0, offset1, rnd_val, 16);
/*
 * Width-24 hv bi-weighted prediction: thin wrapper over the generic
 * 8-column kernel with width = 24.
 *
 * NOTE(review): lossy extract — parameter declarations and braces missing
 * (numbering gaps).  Code is byte-identical; only comments added.
 */
5375 static void hevc_hv_biwgt_4t_24w_msa(uint8_t *src0_ptr,
5378 int32_t src2_stride,
5381 const int8_t *filter_x,
5382 const int8_t *filter_y,
5390 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5391 src1_ptr, src2_stride,
5393 filter_x, filter_y, height, weight0,
5394 weight1, offset0, offset1, rnd_val, 24);
/*
 * Width-32 hv bi-weighted prediction: thin wrapper over the generic
 * 8-column kernel with width = 32.
 *
 * NOTE(review): lossy extract — parameter declarations and braces missing
 * (numbering gaps).  Code is byte-identical; only comments added.
 */
5397 static void hevc_hv_biwgt_4t_32w_msa(uint8_t *src0_ptr,
5400 int32_t src2_stride,
5403 const int8_t *filter_x,
5404 const int8_t *filter_y,
5412 hevc_hv_biwgt_4t_8multx4mult_msa(src0_ptr, src_stride,
5413 src1_ptr, src2_stride,
5415 filter_x, filter_y, height, weight0,
5416 weight1, offset0, offset1, rnd_val, 32);
/*
 * Emits the public bi-weighted "copy" (pel_pixels) entry points
 * ff_hevc_put_hevc_bi_w_pel_pixels<W>_8_msa: derives log2Wd from the 8-bit
 * shift (14 + 1 - 8) and the caller's denom, then forwards to
 * hevc_biwgt_copy_<W>w_msa with MAX_PB_SIZE as the 16-bit source stride.
 *
 * NOTE(review): lossy extract — several continuation lines of this macro
 * (remaining parameters, the log2Wd argument, the closing brace) and the
 * BI_W_MC_COPY(<W>) instantiation lines are missing (numbering gaps);
 * restore from upstream.  No comments were inserted inside the macro body
 * because a non-continued line would terminate the definition.
 */
5419 #define BI_W_MC_COPY(WIDTH) \
5420 void ff_hevc_put_hevc_bi_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
5421 ptrdiff_t dst_stride, \
5423 ptrdiff_t src_stride, \
5424 int16_t *src_16bit, \
5435 int shift = 14 + 1 - 8; \
5436 int log2Wd = denom + shift - 1; \
5438 hevc_biwgt_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
5439 dst, dst_stride, height, \
5440 weight0, weight1, offset0, \
/*
 * Emits the public one-dimensional (h or v) bi-weighted entry points.
 * FILT_DIR names the mx/my parameter whose value (minus 1) indexes the
 * qpel/epel filter table; DIR1 selects the hz/vt kernel family and TAP the
 * 8- or 4-tap variant.
 *
 * NOTE(review): lossy extract — continuation lines (remaining parameters,
 * the dst/dst_stride and height arguments, the closing brace) are missing
 * (numbering gaps); restore from upstream.  No comments inserted inside the
 * macro body (a non-continued line would terminate it).
 */
5456 #define BI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
5457 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
5463 int16_t *src_16bit, \
5474 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
5476 int shift = 14 + 1 - 8; \
5477 int log2Wd = denom + shift - 1; \
5479 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, \
5480 src_16bit, MAX_PB_SIZE, \
5483 weight0, weight1, offset0, \
/* 8-tap qpel, horizontal */
5487 BI_W_MC(qpel, h, 4, 8, hz, mx);
5488 BI_W_MC(qpel, h, 8, 8, hz, mx);
5489 BI_W_MC(qpel, h, 12, 8, hz, mx);
5490 BI_W_MC(qpel, h, 16, 8, hz, mx);
5491 BI_W_MC(qpel, h, 24, 8, hz, mx);
5492 BI_W_MC(qpel, h, 32, 8, hz, mx);
5493 BI_W_MC(qpel, h, 48, 8, hz, mx);
5494 BI_W_MC(qpel, h, 64, 8, hz, mx);
/* 8-tap qpel, vertical */
5496 BI_W_MC(qpel, v, 4, 8, vt, my);
5497 BI_W_MC(qpel, v, 8, 8, vt, my);
5498 BI_W_MC(qpel, v, 12, 8, vt, my);
5499 BI_W_MC(qpel, v, 16, 8, vt, my);
5500 BI_W_MC(qpel, v, 24, 8, vt, my);
5501 BI_W_MC(qpel, v, 32, 8, vt, my);
5502 BI_W_MC(qpel, v, 48, 8, vt, my);
5503 BI_W_MC(qpel, v, 64, 8, vt, my);
/* 4-tap epel, horizontal (chroma; note width 6 listed after 8) */
5505 BI_W_MC(epel, h, 4, 4, hz, mx);
5506 BI_W_MC(epel, h, 8, 4, hz, mx);
5507 BI_W_MC(epel, h, 6, 4, hz, mx);
5508 BI_W_MC(epel, h, 12, 4, hz, mx);
5509 BI_W_MC(epel, h, 16, 4, hz, mx);
5510 BI_W_MC(epel, h, 24, 4, hz, mx);
5511 BI_W_MC(epel, h, 32, 4, hz, mx);
/* 4-tap epel, vertical */
5513 BI_W_MC(epel, v, 4, 4, vt, my);
5514 BI_W_MC(epel, v, 8, 4, vt, my);
5515 BI_W_MC(epel, v, 6, 4, vt, my);
5516 BI_W_MC(epel, v, 12, 4, vt, my);
5517 BI_W_MC(epel, v, 16, 4, vt, my);
5518 BI_W_MC(epel, v, 24, 4, vt, my);
5519 BI_W_MC(epel, v, 32, 4, vt, my);
/*
 * Emits the public two-dimensional (hv) bi-weighted entry points: looks up
 * both the horizontal (mx - 1) and vertical (my - 1) filters and forwards
 * them to the hv kernel of the requested tap count and width.
 *
 * NOTE(review): lossy extract — continuation lines (remaining parameters,
 * dst/dst_stride arguments, the closing brace) are missing (numbering
 * gaps); restore from upstream.  No comments inserted inside the macro body
 * (a non-continued line would terminate it).
 */
5523 #define BI_W_MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
5524 void ff_hevc_put_hevc_bi_w_##PEL##_##DIR####WIDTH##_8_msa(uint8_t *dst, \
5530 int16_t *src_16bit, \
5541 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
5542 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
5544 int shift = 14 + 1 - 8; \
5545 int log2Wd = denom + shift - 1; \
5547 hevc_##DIR1##_biwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, \
5548 src_16bit, MAX_PB_SIZE, \
5550 filter_x, filter_y, \
5551 height, weight0, weight1, \
5552 offset0, offset1, log2Wd); \
/* 8-tap qpel, hv */
5555 BI_W_MC_HV(qpel, hv, 4, 8, hv);
5556 BI_W_MC_HV(qpel, hv, 8, 8, hv);
5557 BI_W_MC_HV(qpel, hv, 12, 8, hv);
5558 BI_W_MC_HV(qpel, hv, 16, 8, hv);
5559 BI_W_MC_HV(qpel, hv, 24, 8, hv);
5560 BI_W_MC_HV(qpel, hv, 32, 8, hv);
5561 BI_W_MC_HV(qpel, hv, 48, 8, hv);
5562 BI_W_MC_HV(qpel, hv, 64, 8, hv);
/* 4-tap epel, hv (chroma; note width 6 listed after 8) */
5564 BI_W_MC_HV(epel, hv, 4, 4, hv);
5565 BI_W_MC_HV(epel, hv, 8, 4, hv);
5566 BI_W_MC_HV(epel, hv, 6, 4, hv);
5567 BI_W_MC_HV(epel, hv, 12, 4, hv);
5568 BI_W_MC_HV(epel, hv, 16, 4, hv);
5569 BI_W_MC_HV(epel, hv, 24, 4, hv);
5570 BI_W_MC_HV(epel, hv, 32, 4, hv);