2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Shuffle-control byte patterns for the MSA VSHF.B instruction, used by
 * the horizontal 8-tap filter paths below:
 *  - bytes [0..15]:  pair each pixel with its right neighbour within one
 *    source vector (indices 0..15 select from the first operand);
 *  - bytes [16..31]: same pairing, but the second half of the pattern
 *    (indices 16..20) pulls pixels from the second source operand.
 * NOTE(review): the closing "};" of this table is not visible in this
 * listing chunk (original lines are missing). */
25 static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
/* HEVC_HV_UNIW_RND_CLIP4: apply HEVC uni-directional weighted prediction
 * to four v4i32 vectors: multiply by 'wgt', round-shift right by 'rnd'
 * (SRAR), add 'offset', then clamp each lane to [0, 255]
 * (CLIP_SW_0_255).  Results are written back through out0..out3.
 * NOTE(review): the macro's enclosing brace/continuation lines are not
 * visible in this listing chunk; visible text kept byte-identical. */
32 #define HEVC_HV_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd, \
33 out0, out1, out2, out3) \
35 MUL4(in0, wgt, in1, wgt, in2, wgt, in3, wgt, out0, out1, out2, out3); \
36 SRAR_W4_SW(out0, out1, out2, out3, rnd); \
37 ADD4(out0, offset, out1, offset, out2, offset, out3, offset, \
38 out0, out1, out2, out3); \
39 out0 = CLIP_SW_0_255(out0); \
40 out1 = CLIP_SW_0_255(out1); \
41 out2 = CLIP_SW_0_255(out2); \
42 out3 = CLIP_SW_0_255(out3); \
/* HEVC_UNIW_RND_CLIP2: weighted-prediction step for two v8i16 inputs.
 * Each input is self-interleaved (ILVR/ILVL) to widen halfwords for a
 * 32-bit dot product with 'wgt', then round-shifted by 'rnd', offset is
 * added, and all four intermediate v4i32 results are clamped to
 * [0, 255].  Outputs are the separate right (_r) and left (_l) halves.
 * NOTE(review): enclosing brace lines of this macro are missing from
 * this listing chunk. */
45 #define HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd, \
46 out0_r, out1_r, out0_l, out1_l) \
48 ILVR_H2_SW(in0, in0, in1, in1, out0_r, out1_r); \
49 ILVL_H2_SW(in0, in0, in1, in1, out0_l, out1_l); \
50 DOTP_SH4_SW(out0_r, out1_r, out0_l, out1_l, wgt, wgt, wgt, wgt, \
51 out0_r, out1_r, out0_l, out1_l); \
52 SRAR_W4_SW(out0_r, out1_r, out0_l, out1_l, rnd); \
53 ADD4(out0_r, offset, out1_r, offset, \
54 out0_l, offset, out1_l, offset, \
55 out0_r, out1_r, out0_l, out1_l); \
56 out0_r = CLIP_SW_0_255(out0_r); \
57 out1_r = CLIP_SW_0_255(out1_r); \
58 out0_l = CLIP_SW_0_255(out0_l); \
59 out1_l = CLIP_SW_0_255(out1_l); \
/* HEVC_UNIW_RND_CLIP4: four-input variant, implemented as two calls to
 * HEVC_UNIW_RND_CLIP2 (inputs 0/1 then 2/3).
 * NOTE(review): enclosing brace lines of this macro are missing from
 * this listing chunk. */
62 #define HEVC_UNIW_RND_CLIP4(in0, in1, in2, in3, wgt, offset, rnd, \
63 out0_r, out1_r, out2_r, out3_r, \
64 out0_l, out1_l, out2_l, out3_l) \
66 HEVC_UNIW_RND_CLIP2(in0, in1, wgt, offset, rnd, \
67 out0_r, out1_r, out0_l, out1_l); \
68 HEVC_UNIW_RND_CLIP2(in2, in3, wgt, offset, rnd, \
69 out2_r, out3_r, out2_l, out3_l); \
/* HEVC_UNIW_RND_CLIP2_MAX_SATU_H: weighted prediction for two v8i16
 * inputs producing packed halfword outputs.  Widens via ILVRL, takes
 * the dot product with the weight vector, round-shifts by rnd_w, packs
 * the 32-bit halves back to halfwords (PCKEV), adds offset_h with
 * saturation (ADDS) and clips to [0, 255] with the saturating
 * CLIP_SH2_0_255_MAX_SATU variant.
 * NOTE(review): the output-parameter line and enclosing braces of this
 * macro are missing from this listing chunk; out0_h/out1_h are the
 * (invisible) output parameters. */
72 #define HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
75 v4i32 in0_r_m, in0_l_m, in1_r_m, in1_l_m; \
77 ILVRL_H2_SW(in0_h, in0_h, in0_r_m, in0_l_m); \
78 ILVRL_H2_SW(in1_h, in1_h, in1_r_m, in1_l_m); \
79 DOTP_SH4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, wgt_w, wgt_w, wgt_w, \
80 wgt_w, in0_r_m, in1_r_m, in0_l_m, in1_l_m); \
81 SRAR_W4_SW(in0_r_m, in1_r_m, in0_l_m, in1_l_m, rnd_w); \
82 PCKEV_H2_SH(in0_l_m, in0_r_m, in1_l_m, in1_r_m, out0_h, out1_h); \
83 ADDS_SH2_SH(out0_h, offset_h, out1_h, offset_h, out0_h, out1_h); \
84 CLIP_SH2_0_255_MAX_SATU(out0_h, out1_h); \
/* HEVC_UNIW_RND_CLIP4_MAX_SATU_H: four-input saturating variant, built
 * from two HEVC_UNIW_RND_CLIP2_MAX_SATU_H invocations.
 * NOTE(review): several continuation lines (remaining output parameters
 * and enclosing braces) are missing from this listing chunk. */
87 #define HEVC_UNIW_RND_CLIP4_MAX_SATU_H(in0_h, in1_h, in2_h, in3_h, wgt_w, \
88 offset_h, rnd_w, out0_h, out1_h, \
91 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in0_h, in1_h, wgt_w, offset_h, rnd_w, \
93 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(in2_h, in3_h, wgt_w, offset_h, rnd_w, \
/* hevc_uniwgt_copy_4w_msa: uni-directional weighted "copy" MC for
 * 4-pixel-wide blocks.  Source bytes are zero-extended to halfwords,
 * scaled by 64 (<< 6, the HEVC MC intermediate precision), then
 * weighted/rounded/offset and clipped to [0, 255] by the
 * HEVC_UNIW_RND_CLIP*_MAX_SATU_H macros.  Heights 2 and 4 are special
 * cased; other heights are assumed to be multiples of 8 and processed
 * 8 rows per loop iteration.
 * NOTE(review): this listing chunk omits some original lines (remaining
 * function parameters, some braces); visible code kept byte-identical. */
97 static void hevc_uniwgt_copy_4w_msa(uint8_t *src,
106 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
109 v16i8 src0 = { 0 }, src1 = { 0 };
110 v8i16 dst0, dst1, dst2, dst3, offset_vec;
111 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar weight/offset/rounding into vectors. */
113 weight = weight & 0x0000FFFF;
114 weight_vec = __msa_fill_w(weight);
115 offset_vec = __msa_fill_h(offset);
116 rnd_vec = __msa_fill_w(rnd_val);
119 v4i32 dst0_r, dst0_l;
/* height == 2: load two 32-bit rows into one vector. */
121 LW2(src, src_stride, tp0, tp1);
122 INSERT_W2_SB(tp0, tp1, src0);
123 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
126 ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
127 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
128 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
129 dst0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
131 dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
132 out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
133 ST4x2_UB(out0, dst, dst_stride);
134 } else if (4 == height) {
/* height == 4: four 32-bit rows packed into one vector. */
135 LW4(src, src_stride, tp0, tp1, tp2, tp3);
136 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
137 ILVRL_B2_SH(zero, src0, dst0, dst1);
138 SLLI_2V(dst0, dst1, 6);
139 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
140 rnd_vec, dst0, dst1);
141 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
142 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
143 } else if (0 == (height % 8)) {
/* Generic case: 8 rows (two packed vectors) per iteration. */
144 for (loop_cnt = (height >> 3); loop_cnt--;) {
145 LW4(src, src_stride, tp0, tp1, tp2, tp3);
146 src += 4 * src_stride;
147 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
148 LW4(src, src_stride, tp0, tp1, tp2, tp3);
149 src += 4 * src_stride;
150 INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
151 ILVRL_B2_SH(zero, src0, dst0, dst1);
152 ILVRL_B2_SH(zero, src1, dst2, dst3);
153 SLLI_4V(dst0, dst1, dst2, dst3, 6);
154 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
155 offset_vec, rnd_vec, dst0, dst1,
157 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
158 ST4x8_UB(out0, out1, dst, dst_stride);
159 dst += 8 * dst_stride;
/* hevc_uniwgt_copy_6w_msa: uni-weighted copy MC for 6-pixel-wide
 * blocks.  Loads full 8-byte rows, applies the same << 6 / weight /
 * round / offset / clip pipeline, and stores only 6 bytes per row via
 * ST6x4_UB.  Processes 8 rows per iteration (height assumed to be a
 * multiple of 8).
 * NOTE(review): remaining function parameters and some braces are not
 * visible in this listing chunk. */
164 static void hevc_uniwgt_copy_6w_msa(uint8_t *src,
174 uint64_t tp0, tp1, tp2, tp3;
176 v16u8 out0, out1, out2, out3;
177 v16i8 src0, src1, src2, src3;
178 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
179 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
181 weight = weight & 0x0000FFFF;
182 weight_vec = __msa_fill_w(weight);
183 offset_vec = __msa_fill_h(offset);
184 rnd_vec = __msa_fill_w(rnd_val);
186 for (loop_cnt = (height >> 3); loop_cnt--;) {
/* Load 8 rows as 64-bit lanes, two rows per vector. */
187 LD4(src, src_stride, tp0, tp1, tp2, tp3);
188 src += (4 * src_stride);
189 INSERT_D2_SB(tp0, tp1, src0);
190 INSERT_D2_SB(tp2, tp3, src1);
191 LD4(src, src_stride, tp0, tp1, tp2, tp3);
192 src += (4 * src_stride);
193 INSERT_D2_SB(tp0, tp1, src2);
194 INSERT_D2_SB(tp2, tp3, src3);
196 ILVRL_B2_SH(zero, src0, dst0, dst1);
197 ILVRL_B2_SH(zero, src1, dst2, dst3);
198 ILVRL_B2_SH(zero, src2, dst4, dst5);
199 ILVRL_B2_SH(zero, src3, dst6, dst7);
201 SLLI_4V(dst0, dst1, dst2, dst3, 6);
202 SLLI_4V(dst4, dst5, dst6, dst7, 6);
204 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
205 offset_vec, rnd_vec, dst0, dst1, dst2,
207 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
208 offset_vec, rnd_vec, dst4, dst5, dst6,
210 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
211 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
/* Store 6 bytes per row, 4 rows at a time. */
213 ST6x4_UB(out0, out1, dst, dst_stride);
214 dst += (4 * dst_stride);
215 ST6x4_UB(out2, out3, dst, dst_stride);
216 dst += (4 * dst_stride);
/* hevc_uniwgt_copy_8w_msa: uni-weighted copy MC for 8-pixel-wide
 * blocks.  Same << 6 / weight / round / offset / clip pipeline as the
 * narrower variants.  Heights 2, 4 and 6 are special cased; other
 * heights are assumed to be multiples of 8.
 * NOTE(review): remaining function parameters and some braces are not
 * visible in this listing chunk. */
220 static void hevc_uniwgt_copy_8w_msa(uint8_t *src,
230 uint64_t tp0, tp1, tp2, tp3;
231 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
233 v16u8 out0, out1, out2, out3;
234 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
235 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
237 weight = weight & 0x0000FFFF;
238 weight_vec = __msa_fill_w(weight);
239 offset_vec = __msa_fill_h(offset);
240 rnd_vec = __msa_fill_w(rnd_val);
/* height == 2 */
243 LD2(src, src_stride, tp0, tp1);
244 INSERT_D2_SB(tp0, tp1, src0);
245 ILVRL_B2_SH(zero, src0, dst0, dst1);
246 SLLI_2V(dst0, dst1, 6);
247 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
248 rnd_vec, dst0, dst1);
249 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
250 ST8x2_UB(out0, dst, dst_stride);
251 } else if (4 == height) {
252 LD4(src, src_stride, tp0, tp1, tp2, tp3);
253 INSERT_D2_SB(tp0, tp1, src0);
254 INSERT_D2_SB(tp2, tp3, src1);
255 ILVRL_B2_SH(zero, src0, dst0, dst1);
256 ILVRL_B2_SH(zero, src1, dst2, dst3);
257 SLLI_4V(dst0, dst1, dst2, dst3, 6);
258 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
259 offset_vec, rnd_vec, dst0, dst1, dst2,
261 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
262 ST8x4_UB(out0, out1, dst, dst_stride);
263 } else if (6 == height) {
/* height == 6: 4 rows then 2 rows. */
264 LD4(src, src_stride, tp0, tp1, tp2, tp3);
265 src += 4 * src_stride;
266 INSERT_D2_SB(tp0, tp1, src0);
267 INSERT_D2_SB(tp2, tp3, src1);
268 LD2(src, src_stride, tp0, tp1);
269 INSERT_D2_SB(tp0, tp1, src2);
270 ILVRL_B2_SH(zero, src0, dst0, dst1);
271 ILVRL_B2_SH(zero, src1, dst2, dst3);
272 ILVRL_B2_SH(zero, src2, dst4, dst5);
273 SLLI_4V(dst0, dst1, dst2, dst3, 6);
274 SLLI_2V(dst4, dst5, 6);
275 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
276 offset_vec, rnd_vec, dst0, dst1, dst2,
278 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
279 rnd_vec, dst4, dst5);
280 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
281 ST8x4_UB(out0, out1, dst, dst_stride);
282 dst += (4 * dst_stride);
283 ST8x2_UB(out2, dst, dst_stride);
284 } else if (0 == height % 8) {
/* Generic multiple-of-8 height: 8 rows per iteration. */
285 for (loop_cnt = (height >> 3); loop_cnt--;) {
286 LD4(src, src_stride, tp0, tp1, tp2, tp3);
287 src += 4 * src_stride;
288 INSERT_D2_SB(tp0, tp1, src0);
289 INSERT_D2_SB(tp2, tp3, src1);
290 LD4(src, src_stride, tp0, tp1, tp2, tp3);
291 src += 4 * src_stride;
292 INSERT_D2_SB(tp0, tp1, src2);
293 INSERT_D2_SB(tp2, tp3, src3);
295 ILVRL_B2_SH(zero, src0, dst0, dst1);
296 ILVRL_B2_SH(zero, src1, dst2, dst3);
297 ILVRL_B2_SH(zero, src2, dst4, dst5);
298 ILVRL_B2_SH(zero, src3, dst6, dst7);
299 SLLI_4V(dst0, dst1, dst2, dst3, 6);
300 SLLI_4V(dst4, dst5, dst6, dst7, 6);
301 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
302 offset_vec, rnd_vec, dst0, dst1,
304 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
305 offset_vec, rnd_vec, dst4, dst5,
307 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
308 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
309 ST8x4_UB(out0, out1, dst, dst_stride);
310 dst += (4 * dst_stride);
311 ST8x4_UB(out2, out3, dst, dst_stride);
312 dst += (4 * dst_stride);
/* hevc_uniwgt_copy_12w_msa: uni-weighted copy MC for 12-pixel-wide
 * blocks.  The low 8 pixels of each row go through the 4-input macro;
 * the high 4 pixels of two rows are interleaved together (ILVL_W2_SB)
 * and handled by the 2-input macro.  Processes 4 rows per iteration;
 * loop_cnt = 4 implies a fixed height of 16 rows.
 * NOTE(review): remaining function parameters and some braces are not
 * visible in this listing chunk. */
317 static void hevc_uniwgt_copy_12w_msa(uint8_t *src,
327 v16u8 out0, out1, out2;
328 v16i8 src0, src1, src2, src3;
329 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
332 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
334 weight = weight & 0x0000FFFF;
335 weight_vec = __msa_fill_w(weight);
336 offset_vec = __msa_fill_h(offset);
337 rnd_vec = __msa_fill_w(rnd_val);
339 for (loop_cnt = 4; loop_cnt--;) {
340 LD_SB4(src, src_stride, src0, src1, src2, src3);
341 src += (4 * src_stride);
342 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
343 dst0, dst1, dst2, dst3);
/* Gather the upper 4 pixels of row pairs for the narrow half. */
345 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
346 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
347 SLLI_4V(dst0, dst1, dst2, dst3, 6);
348 SLLI_2V(dst4, dst5, 6);
349 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
350 offset_vec, rnd_vec, dst0, dst1, dst2,
352 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
353 rnd_vec, dst4, dst5);
355 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
356 ST12x4_UB(out0, out1, out2, dst, dst_stride);
357 dst += (4 * dst_stride);
/* hevc_uniwgt_copy_16w_msa: uni-weighted copy MC for 16-pixel-wide
 * blocks; 4 full-vector rows per iteration through the << 6 / weight /
 * round / offset / clip pipeline.
 * NOTE(review): remaining function parameters and some braces are not
 * visible in this listing chunk. */
361 static void hevc_uniwgt_copy_16w_msa(uint8_t *src,
371 v16u8 out0, out1, out2, out3;
372 v16i8 src0, src1, src2, src3;
374 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
375 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
377 weight = weight & 0x0000FFFF;
378 weight_vec = __msa_fill_w(weight);
379 offset_vec = __msa_fill_h(offset);
380 rnd_vec = __msa_fill_w(rnd_val);
382 for (loop_cnt = height >> 2; loop_cnt--;) {
383 LD_SB4(src, src_stride, src0, src1, src2, src3);
384 src += (4 * src_stride);
385 ILVRL_B2_SH(zero, src0, dst0, dst1);
386 ILVRL_B2_SH(zero, src1, dst2, dst3);
387 ILVRL_B2_SH(zero, src2, dst4, dst5);
388 ILVRL_B2_SH(zero, src3, dst6, dst7);
389 SLLI_4V(dst0, dst1, dst2, dst3, 6);
390 SLLI_4V(dst4, dst5, dst6, dst7, 6);
391 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
392 offset_vec, rnd_vec, dst0, dst1, dst2,
394 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
395 offset_vec, rnd_vec, dst4, dst5, dst6,
397 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
398 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
399 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
400 dst += (4 * dst_stride);
/* hevc_uniwgt_copy_24w_msa: uni-weighted copy MC for 24-pixel-wide
 * blocks.  Each row is loaded as a 16-byte vector plus the next 8
 * bytes (src + 16); 4 rows are processed per iteration and stored as
 * a 16-wide ST_UB4 plus an 8-wide ST8x4_UB at dst + 16.
 * NOTE(review): remaining function parameters and some braces are not
 * visible in this listing chunk. */
404 static void hevc_uniwgt_copy_24w_msa(uint8_t *src,
414 v16u8 out0, out1, out2, out3, out4, out5;
415 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
417 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
418 v8i16 dst8, dst9, dst10, dst11;
419 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
421 weight = weight & 0x0000FFFF;
422 weight_vec = __msa_fill_w(weight);
423 offset_vec = __msa_fill_h(offset);
424 rnd_vec = __msa_fill_w(rnd_val);
426 for (loop_cnt = (height >> 2); loop_cnt--;) {
427 LD_SB4(src, src_stride, src0, src1, src4, src5);
428 LD_SB4(src + 16, src_stride, src2, src3, src6, src7);
429 src += (4 * src_stride);
431 ILVRL_B2_SH(zero, src0, dst0, dst1);
432 ILVRL_B2_SH(zero, src1, dst2, dst3);
433 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
434 ILVRL_B2_SH(zero, src4, dst6, dst7);
435 ILVRL_B2_SH(zero, src5, dst8, dst9);
436 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
437 SLLI_4V(dst0, dst1, dst2, dst3, 6);
438 SLLI_4V(dst4, dst5, dst6, dst7, 6);
439 SLLI_4V(dst8, dst9, dst10, dst11, 6);
440 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
441 offset_vec, rnd_vec, dst0, dst1, dst2,
443 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
444 offset_vec, rnd_vec, dst4, dst5, dst6,
446 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
447 offset_vec, rnd_vec, dst8, dst9, dst10,
449 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
450 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
/* 16-wide part then the trailing 8-wide column. */
451 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
452 ST8x4_UB(out2, out5, dst + 16, dst_stride);
453 dst += (4 * dst_stride);
/* hevc_uniwgt_copy_32w_msa: uni-weighted copy MC for 32-pixel-wide
 * blocks; two rows (two 16-byte vectors each) per iteration.
 * NOTE(review): remaining function parameters and some braces are not
 * visible in this listing chunk. */
457 static void hevc_uniwgt_copy_32w_msa(uint8_t *src,
467 v16u8 out0, out1, out2, out3;
468 v16i8 src0, src1, src2, src3;
470 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
471 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
473 weight = weight & 0x0000FFFF;
474 weight_vec = __msa_fill_w(weight);
475 offset_vec = __msa_fill_h(offset);
476 rnd_vec = __msa_fill_w(rnd_val);
478 for (loop_cnt = (height >> 1); loop_cnt--;) {
479 LD_SB2(src, src_stride, src0, src1);
480 LD_SB2(src + 16, src_stride, src2, src3);
481 src += (2 * src_stride);
483 ILVRL_B2_SH(zero, src0, dst0, dst1);
484 ILVRL_B2_SH(zero, src1, dst2, dst3);
485 ILVRL_B2_SH(zero, src2, dst4, dst5);
486 ILVRL_B2_SH(zero, src3, dst6, dst7);
487 SLLI_4V(dst0, dst1, dst2, dst3, 6);
488 SLLI_4V(dst4, dst5, dst6, dst7, 6);
489 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
490 offset_vec, rnd_vec, dst0, dst1, dst2,
492 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
493 offset_vec, rnd_vec, dst4, dst5, dst6,
495 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
496 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
497 ST_UB2(out0, out1, dst, dst_stride);
498 ST_UB2(out2, out3, dst + 16, dst_stride);
499 dst += (2 * dst_stride);
/* hevc_uniwgt_copy_48w_msa: uni-weighted copy MC for 48-pixel-wide
 * blocks; two rows (three 16-byte vectors each, LD_SB3 with stride 16)
 * per iteration.
 * NOTE(review): remaining function parameters, the src/dst stride
 * advances between rows and some braces are not visible in this
 * listing chunk. */
503 static void hevc_uniwgt_copy_48w_msa(uint8_t *src,
513 v16u8 out0, out1, out2, out3, out4, out5;
514 v16i8 src0, src1, src2, src3, src4, src5;
516 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, offset_vec;
517 v8i16 dst6, dst7, dst8, dst9, dst10, dst11;
518 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
520 weight = weight & 0x0000FFFF;
521 weight_vec = __msa_fill_w(weight);
522 offset_vec = __msa_fill_h(offset);
523 rnd_vec = __msa_fill_w(rnd_val);
525 for (loop_cnt = (height >> 1); loop_cnt--;) {
526 LD_SB3(src, 16, src0, src1, src2);
528 LD_SB3(src, 16, src3, src4, src5);
531 ILVRL_B2_SH(zero, src0, dst0, dst1);
532 ILVRL_B2_SH(zero, src1, dst2, dst3);
533 ILVRL_B2_SH(zero, src2, dst4, dst5);
534 ILVRL_B2_SH(zero, src3, dst6, dst7);
535 ILVRL_B2_SH(zero, src4, dst8, dst9);
536 ILVRL_B2_SH(zero, src5, dst10, dst11);
537 SLLI_4V(dst0, dst1, dst2, dst3, 6);
538 SLLI_4V(dst4, dst5, dst6, dst7, 6);
539 SLLI_4V(dst8, dst9, dst10, dst11, 6);
540 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
541 offset_vec, rnd_vec, dst0, dst1, dst2,
543 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
544 offset_vec, rnd_vec, dst4, dst5, dst6,
546 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
547 offset_vec, rnd_vec, dst8, dst9, dst10,
549 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
550 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
551 ST_UB2(out0, out1, dst, 16);
552 ST_UB(out2, dst + 32);
554 ST_UB2(out3, out4, dst, 16);
555 ST_UB(out5, dst + 32);
/* hevc_uniwgt_copy_64w_msa: uni-weighted copy MC for 64-pixel-wide
 * blocks; two rows (four 16-byte vectors each, LD_SB4 with stride 16)
 * per iteration.
 * NOTE(review): remaining function parameters, the src/dst stride
 * advances between rows and some braces are not visible in this
 * listing chunk. */
560 static void hevc_uniwgt_copy_64w_msa(uint8_t *src,
570 v16u8 out0, out1, out2, out3, out4, out5, out6, out7;
571 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
573 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, offset_vec;
574 v8i16 dst8, dst9, dst10, dst11, dst12, dst13, dst14, dst15;
575 v4i32 weight_vec, rnd_vec;
/* Broadcast scalar parameters into vectors. */
577 weight = weight & 0x0000FFFF;
578 weight_vec = __msa_fill_w(weight);
579 offset_vec = __msa_fill_h(offset);
580 rnd_vec = __msa_fill_w(rnd_val);
582 for (loop_cnt = (height >> 1); loop_cnt--;) {
583 LD_SB4(src, 16, src0, src1, src2, src3);
585 LD_SB4(src, 16, src4, src5, src6, src7);
588 ILVRL_B2_SH(zero, src0, dst0, dst1);
589 ILVRL_B2_SH(zero, src1, dst2, dst3);
590 ILVRL_B2_SH(zero, src2, dst4, dst5);
591 ILVRL_B2_SH(zero, src3, dst6, dst7);
592 ILVRL_B2_SH(zero, src4, dst8, dst9);
593 ILVRL_B2_SH(zero, src5, dst10, dst11);
594 ILVRL_B2_SH(zero, src6, dst12, dst13);
595 ILVRL_B2_SH(zero, src7, dst14, dst15);
596 SLLI_4V(dst0, dst1, dst2, dst3, 6);
597 SLLI_4V(dst4, dst5, dst6, dst7, 6);
598 SLLI_4V(dst8, dst9, dst10, dst11, 6);
599 SLLI_4V(dst12, dst13, dst14, dst15, 6);
600 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
601 offset_vec, rnd_vec, dst0, dst1, dst2,
603 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
604 offset_vec, rnd_vec, dst4, dst5, dst6,
606 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst8, dst9, dst10, dst11, weight_vec,
607 offset_vec, rnd_vec, dst8, dst9, dst10,
609 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst12, dst13, dst14, dst15, weight_vec,
610 offset_vec, rnd_vec, dst12, dst13, dst14,
612 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
613 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
614 PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
615 PCKEV_B2_UB(dst13, dst12, dst15, dst14, out6, out7);
616 ST_UB4(out0, out1, out2, out3, dst, 16);
618 ST_UB4(out4, out5, out6, out7, dst, 16);
/* hevc_hz_uniwgt_8t_4w_msa: horizontal 8-tap luma filter with
 * uni-directional weighting for 4-pixel-wide blocks.  Eight rows are
 * processed per iteration, two rows per vector (mask set from
 * ff_hevc_mask_arr[16] pairs pixels across two source vectors).
 * Filter taps come from SPLATI_H4 on the loaded filter vector; the
 * filtered halfwords then go through the weight / round / offset /
 * clip pipeline.
 * NOTE(review): remaining function parameters, the src -= 3
 * pre-adjustment and mask1..mask3 setup lines typical of this family
 * are not visible in this listing chunk. */
623 static void hevc_hz_uniwgt_8t_4w_msa(uint8_t *src,
627 const int8_t *filter,
635 v8i16 filt0, filt1, filt2, filt3;
636 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
637 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
638 v16i8 mask0, mask1, mask2, mask3, vec11, vec12, vec13, vec14, vec15;
639 v8i16 filter_vec, dst01, dst23, dst45, dst67;
640 v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
641 v4i32 weight_vec, rnd_vec;
644 weight = weight & 0x0000FFFF;
646 weight_vec = __msa_fill_w(weight);
647 rnd_vec = __msa_fill_w(rnd_val);
/* Pre-fold (offset + weight >> rnd) into offset_vec. */
652 weight_vec_h = __msa_fill_h(weight);
653 offset_vec = __msa_fill_h(offset);
654 denom_vec = __msa_fill_h(rnd_val);
656 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
657 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
659 filter_vec = LD_SH(filter);
660 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
662 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
667 for (loop_cnt = (height >> 3); loop_cnt--;) {
668 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
669 src += (8 * src_stride);
/* Convert to signed range for the signed 8-tap dot products. */
670 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
672 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
673 vec0, vec1, vec2, vec3);
674 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
675 vec4, vec5, vec6, vec7);
676 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
677 vec8, vec9, vec10, vec11);
678 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
679 vec12, vec13, vec14, vec15);
680 dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
682 dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
684 dst45 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
686 dst67 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
689 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
690 offset_vec, rnd_vec, dst0, dst1, dst2,
693 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
694 ST4x8_UB(out0, out1, dst, dst_stride);
695 dst += (8 * dst_stride);
/* hevc_hz_uniwgt_8t_8w_msa: horizontal 8-tap luma filter with
 * uni-directional weighting for 8-pixel-wide blocks; 4 rows per
 * iteration, one row per vector (mask set from ff_hevc_mask_arr[0]).
 * NOTE(review): remaining function parameters, the src -= 3
 * pre-adjustment and mask1..mask3 setup lines are not visible in this
 * listing chunk. */
699 static void hevc_hz_uniwgt_8t_8w_msa(uint8_t *src,
703 const int8_t *filter,
711 v16i8 src0, src1, src2, src3;
712 v8i16 filt0, filt1, filt2, filt3;
713 v16i8 mask0, mask1, mask2, mask3;
715 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
716 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
717 v8i16 dst0, dst1, dst2, dst3;
718 v8i16 weight_vec_h, offset_vec, denom_vec;
719 v4i32 weight_vec, rnd_vec;
722 weight = weight & 0x0000FFFF;
724 weight_vec = __msa_fill_w(weight);
725 rnd_vec = __msa_fill_w(rnd_val);
/* Pre-fold (offset + weight >> rnd) into offset_vec. */
730 weight_vec_h = __msa_fill_h(weight);
731 offset_vec = __msa_fill_h(offset);
732 denom_vec = __msa_fill_h(rnd_val);
734 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
735 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
737 filter_vec = LD_SH(filter);
738 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
740 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
745 for (loop_cnt = (height >> 2); loop_cnt--;) {
746 LD_SB4(src, src_stride, src0, src1, src2, src3);
747 src += (4 * src_stride);
748 XORI_B4_128_SB(src0, src1, src2, src3);
750 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
751 vec0, vec1, vec2, vec3);
752 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
753 vec4, vec5, vec6, vec7);
754 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
755 vec8, vec9, vec10, vec11);
756 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
757 vec12, vec13, vec14, vec15);
758 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
760 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
762 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
764 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
767 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
768 offset_vec, rnd_vec, dst0, dst1, dst2,
771 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
772 ST8x4_UB(out0, out1, dst, dst_stride);
773 dst += (4 * dst_stride);
/* hevc_hz_uniwgt_8t_12w_msa: horizontal 8-tap luma filter with
 * uni-directional weighting for 12-pixel-wide blocks.  The left 8
 * pixels use the single-vector mask set (ff_hevc_mask_arr[0]); the
 * remaining 4 pixels of two rows are filtered together using the
 * cross-vector mask set (ff_hevc_mask_arr[16], loaded into mask4).
 * 4 rows per iteration.
 * NOTE(review): remaining function parameters, the src -= 3
 * pre-adjustment and mask1..mask7 setup lines are not visible in this
 * listing chunk. */
777 static void hevc_hz_uniwgt_8t_12w_msa(uint8_t *src,
781 const int8_t *filter,
788 v16u8 out0, out1, out2;
789 v8i16 filt0, filt1, filt2, filt3;
790 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
791 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
792 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
793 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
795 v8i16 dst01, dst23, dst0, dst1, dst2, dst3, dst4, dst5;
796 v8i16 weight_vec_h, offset_vec, denom_vec;
797 v4i32 weight_vec, rnd_vec;
800 weight = weight & 0x0000FFFF;
802 weight_vec = __msa_fill_w(weight);
803 rnd_vec = __msa_fill_w(rnd_val);
/* Pre-fold (offset + weight >> rnd) into offset_vec. */
808 weight_vec_h = __msa_fill_h(weight);
809 offset_vec = __msa_fill_h(offset);
810 denom_vec = __msa_fill_h(rnd_val);
812 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
813 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
815 filter_vec = LD_SH(filter);
816 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
818 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
822 mask4 = LD_SB(&ff_hevc_mask_arr[16]);
827 for (loop_cnt = (height >> 2); loop_cnt--;) {
828 LD_SB4(src, src_stride, src0, src1, src2, src3);
829 LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
830 src += (4 * src_stride);
831 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
833 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
834 vec0, vec1, vec2, vec3);
835 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
836 vec4, vec5, vec6, vec7);
837 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
838 vec8, vec9, vec10, vec11);
839 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
840 vec12, vec13, vec14, vec15);
841 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
843 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
845 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
847 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
/* Narrow right-hand 4-pixel column, two rows per vector. */
849 VSHF_B4_SB(src4, src5, mask4, mask5, mask6, mask7,
850 vec0, vec1, vec2, vec3);
851 VSHF_B4_SB(src6, src7, mask4, mask5, mask6, mask7,
852 vec4, vec5, vec6, vec7);
853 dst01 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
855 dst23 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
858 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
859 offset_vec, rnd_vec, dst0, dst1, dst2,
861 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst01, dst23, weight_vec, offset_vec,
862 rnd_vec, dst4, dst5);
864 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
865 ST8x4_UB(out0, out1, dst, dst_stride);
866 ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
867 dst += (4 * dst_stride);
/* hevc_hz_uniwgt_8t_16w_msa: horizontal 8-tap luma filter with
 * uni-directional weighting for 16-pixel-wide blocks; 2 rows per
 * iteration, each row split into two 8-pixel halves (src and src + 8).
 * NOTE(review): remaining function parameters, the src -= 3
 * pre-adjustment and mask1..mask3 setup lines are not visible in this
 * listing chunk. */
871 static void hevc_hz_uniwgt_8t_16w_msa(uint8_t *src,
875 const int8_t *filter,
883 v16i8 src0, src1, src2, src3;
884 v8i16 filt0, filt1, filt2, filt3;
885 v16i8 mask0, mask1, mask2, mask3;
887 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
888 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
889 v8i16 dst0, dst1, dst2, dst3;
890 v8i16 weight_vec_h, offset_vec, denom_vec;
891 v4i32 weight_vec, rnd_vec;
895 weight_vec = __msa_fill_w(weight);
896 rnd_vec = __msa_fill_w(rnd_val);
/* Pre-fold (offset + weight >> rnd) into offset_vec. */
901 weight_vec_h = __msa_fill_h(weight);
902 offset_vec = __msa_fill_h(offset);
903 denom_vec = __msa_fill_h(rnd_val);
905 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
906 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
908 filter_vec = LD_SH(filter);
909 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
911 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
916 for (loop_cnt = (height >> 1); loop_cnt--;) {
917 LD_SB2(src, src_stride, src0, src2);
918 LD_SB2(src + 8, src_stride, src1, src3);
919 src += (2 * src_stride);
920 XORI_B4_128_SB(src0, src1, src2, src3);
922 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
923 vec0, vec1, vec2, vec3);
924 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
925 vec4, vec5, vec6, vec7);
926 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
927 vec8, vec9, vec10, vec11);
928 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
929 vec12, vec13, vec14, vec15);
930 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
932 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
934 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
936 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
939 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
940 offset_vec, rnd_vec, dst0, dst1, dst2,
943 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
944 ST_UB2(out0, out1, dst, dst_stride);
945 dst += (2 * dst_stride);
/* hevc_hz_uniwgt_8t_24w_msa: horizontal 8-tap luma filter with
 * uni-directional weighting for 24-pixel-wide blocks.  Uses both mask
 * sets: single-vector masks (mask0..3) for aligned 8-pixel groups and
 * cross-vector masks (mask4..7) for groups straddling two loads.
 * Two rows per iteration; loop_cnt = 16 implies a fixed height of 32.
 * NOTE(review): remaining function parameters, src/dst stride
 * advances and mask1..mask7 setup lines are not visible in this
 * listing chunk. */
949 static void hevc_hz_uniwgt_8t_24w_msa(uint8_t *src,
953 const int8_t *filter,
960 v16u8 out0, out1, out2;
961 v16i8 src0, src1, src2, src3;
962 v8i16 filt0, filt1, filt2, filt3;
963 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
964 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
965 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
966 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
967 v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
968 v4i32 weight_vec, rnd_vec;
972 weight_vec = __msa_fill_w(weight);
973 rnd_vec = __msa_fill_w(rnd_val);
/* Pre-fold (offset + weight >> rnd) into offset_vec. */
978 weight_vec_h = __msa_fill_h(weight);
979 offset_vec = __msa_fill_h(offset);
980 denom_vec = __msa_fill_h(rnd_val);
982 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
983 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
985 filter_vec = LD_SH(filter);
986 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
988 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
997 for (loop_cnt = 16; loop_cnt--;) {
998 LD_SB2(src, 16, src0, src1);
1000 LD_SB2(src, 16, src2, src3);
1002 XORI_B4_128_SB(src0, src1, src2, src3);
1003 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1004 vec0, vec1, vec2, vec3);
1005 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1006 vec4, vec5, vec6, vec7);
1007 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1008 vec8, vec9, vec10, vec11);
1009 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1010 vec12, vec13, vec14, vec15);
1011 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1013 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1015 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1017 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1020 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
1021 vec0, vec1, vec2, vec3);
1022 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1023 vec4, vec5, vec6, vec7);
1024 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1026 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1029 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1030 offset_vec, rnd_vec, dst0, dst1, dst2,
1032 HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1033 rnd_vec, dst4, dst5);
/* 16-wide stores plus the trailing 8-wide column at dst + 16. */
1035 PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
1036 ST_UB2(out0, out1, dst, dst_stride);
1037 ST8x2_UB(out2, dst + 16, dst_stride);
1038 dst += (2 * dst_stride);
/* hevc_hz_uniwgt_8t_32w_msa: horizontal 8-tap luma filter with
 * uni-directional weighting for 32-pixel-wide blocks.  Each row is
 * loaded as four overlapping 16-byte vectors at 8-byte steps
 * (LD_SB4 with stride 8), so only the single-vector mask set is
 * needed.  Two rows per iteration.
 * NOTE(review): remaining function parameters, src/dst stride
 * advances and mask1..mask3 setup lines are not visible in this
 * listing chunk. */
1042 static void hevc_hz_uniwgt_8t_32w_msa(uint8_t *src,
1046 const int8_t *filter,
1053 v16u8 out0, out1, out2, out3;
1054 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1055 v8i16 filt0, filt1, filt2, filt3;
1056 v16i8 mask0, mask1, mask2, mask3;
1057 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1058 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1060 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1061 v8i16 weight_vec_h, offset_vec, denom_vec;
1062 v4i32 weight_vec, rnd_vec;
1066 weight_vec = __msa_fill_w(weight);
1067 rnd_vec = __msa_fill_w(rnd_val);
/* Pre-fold (offset + weight >> rnd) into offset_vec. */
1072 weight_vec_h = __msa_fill_h(weight);
1073 offset_vec = __msa_fill_h(offset);
1074 denom_vec = __msa_fill_h(rnd_val);
1076 weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1077 offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1079 filter_vec = LD_SH(filter);
1080 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1082 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1087 for (loop_cnt = height >> 1; loop_cnt--;) {
1088 LD_SB4(src, 8, src0, src1, src2, src3);
1090 LD_SB4(src, 8, src4, src5, src6, src7);
1092 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1094 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1095 vec0, vec1, vec2, vec3);
1096 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1097 vec4, vec5, vec6, vec7);
1098 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1099 vec8, vec9, vec10, vec11);
1100 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1101 vec12, vec13, vec14, vec15);
1102 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1104 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1106 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1108 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1111 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1112 vec0, vec1, vec2, vec3);
1113 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1114 vec4, vec5, vec6, vec7);
1115 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1116 vec8, vec9, vec10, vec11);
1117 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1118 vec12, vec13, vec14, vec15);
1119 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1121 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1123 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1125 dst7 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1128 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1129 offset_vec, rnd_vec, dst0, dst1, dst2,
1131 HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
1132 offset_vec, rnd_vec, dst4, dst5, dst6,
1135 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1136 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
1137 ST_UB2(out0, out1, dst, 16);
1139 ST_UB2(out2, out3, dst, 16);
/* Horizontal 8-tap luma filter with uni-directional weighted prediction,
 * 48-pixel-wide blocks.
 * NOTE(review): this chunk is subsampled — the embedded original line
 * numbers jump, so parameter lines, some macro arguments and closing
 * braces are missing from this view; only the visible flow is documented.
 * Visible flow: splat weight/offset/round constants, load the 8-tap
 * filter, then per row load 48+ source bytes, horizontally filter via
 * VSHF_B4 shuffles + HEVC_FILT_8TAP_SH, weight/round/saturate, and pack
 * and store 48 output bytes. */
1143 static void hevc_hz_uniwgt_8t_48w_msa(uint8_t *src,
1148                                       const int8_t *filter,
1155     v16u8 out0, out1, out2;
1156     v16i8 src0, src1, src2, src3;
1157     v8i16 filt0, filt1, filt2, filt3;
1158     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1159     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1160     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1161     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1162     v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1163     v4i32 weight_vec, rnd_vec;
    /* keep only the low 16 bits of the weight before splatting */
1167     weight = weight & 0x0000FFFF;
1168     weight_vec = __msa_fill_w(weight);
1169     rnd_vec = __msa_fill_w(rnd_val);
1174     weight_vec_h = __msa_fill_h(weight);
1175     offset_vec = __msa_fill_h(offset);
1176     denom_vec = __msa_fill_h(rnd_val);
    /* fold the rounded weight into the offset vector (saturating add) */
1178     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1179     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1181     filter_vec = LD_SH(filter);
1182     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1184     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    /* 64 iterations — presumably one per row of a fixed-height block;
     * TODO confirm against the (not visible here) height handling */
1193     for (loop_cnt = 64; loop_cnt--;) {
1194         LD_SB3(src, 16, src0, src1, src2);
1195         src3 = LD_SB(src + 40);
1197         XORI_B4_128_SB(src0, src1, src2, src3);
    /* gather 8-tap windows; mask4..7 select across the src0/src1 and
     * src1/src2 vector boundaries */
1199         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1200                    vec0, vec1, vec2, vec3);
1201         VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1202                    vec4, vec5, vec6, vec7);
1203         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1204                    vec8, vec9, vec10, vec11);
1205         VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
1206                    vec12, vec13, vec14, vec15);
1207         dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1209         dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1211         dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1213         dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1216         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1217                    vec0, vec1, vec2, vec3);
1218         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1219                    vec4, vec5, vec6, vec7);
1220         dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1222         dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
    /* apply weight, round, add offset, saturate to unsigned 8-bit range */
1225         HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1226                                        offset_vec, rnd_vec, dst0, dst1, dst2,
1228         HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1229                                        rnd_vec, dst4, dst5);
    /* pack 16-bit results to bytes and store 48 pixels (32 + 16) */
1231         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1232         ST_UB2(out0, out1, dst, 16);
1233         ST_UB(out2, dst + 32);
/* Horizontal 8-tap luma filter with uni-directional weighted prediction,
 * 64-pixel-wide blocks.
 * NOTE(review): subsampled chunk — the embedded original line numbers
 * jump, so parameters, some macro arguments and braces are missing from
 * this view. Visible flow: per row, an inner loop runs twice processing
 * 32 pixels each pass (two 16-byte loads plus an overlapping load at
 * +24 for the filter tail), filters, weights/rounds/saturates, then
 * stores 32 bytes per pass. */
1238 static void hevc_hz_uniwgt_8t_64w_msa(uint8_t *src,
1242                                       const int8_t *filter,
1250     uint32_t loop_cnt, cnt;
1252     v16i8 src0, src1, src2;
1253     v8i16 filt0, filt1, filt2, filt3;
1254     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1255     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1256     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1257     v8i16 dst0, dst1, dst2, dst3;
1258     v8i16 filter_vec, weight_vec_h, offset_vec, denom_vec;
1259     v4i32 weight_vec, rnd_vec;
1263     weight_vec = __msa_fill_w(weight);
1264     rnd_vec = __msa_fill_w(rnd_val);
1269     weight_vec_h = __msa_fill_h(weight);
1270     offset_vec = __msa_fill_h(offset);
1271     denom_vec = __msa_fill_h(rnd_val);
    /* fold the rounded weight into the offset vector (saturating add) */
1273     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1274     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1276     filter_vec = LD_SH(filter);
1277     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1279     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1288     for (loop_cnt = height; loop_cnt--;) {
    /* two passes of 32 pixels cover the 64-pixel row */
1292         for (cnt = 2; cnt--;) {
1293             LD_SB2(src_tmp, 16, src0, src1);
1294             src2 = LD_SB(src_tmp + 24);
1296             XORI_B3_128_SB(src0, src1, src2);
1298             VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1299                        vec0, vec1, vec2, vec3);
1300             VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1301                        vec4, vec5, vec6, vec7);
1302             VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1303                        vec8, vec9, vec10, vec11);
1304             VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1305                        vec12, vec13, vec14, vec15);
1306             dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1308             dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1,
1310             dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1312             dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
    /* weight, round, offset and saturate the four 8-pixel groups */
1315             HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1316                                            offset_vec, rnd_vec, dst0, dst1,
1319             PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1320             ST_UB2(out0, out1, dst_tmp, 16);
/* Vertical 8-tap luma filter with uni-directional weighted prediction,
 * 4-pixel-wide blocks, processing 8 rows per loop iteration.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines and closing braces are missing from this view.
 * Visible flow: prime with 7 rows above the block (src -= 3*stride),
 * interleave row pairs right-half (ILVR_B*) and combine pairs into
 * doubleword-packed vectors so two 4-pixel rows share one vector, then
 * per iteration load 8 new rows, run HEVC_FILT_8TAP_SH down columns,
 * weight/round/saturate, and store 4x8 output. */
1329 static void hevc_vt_uniwgt_8t_4w_msa(uint8_t *src,
1333                                      const int8_t *filter,
1341     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1342     v16i8 src9, src10, src11, src12, src13, src14;
1343     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1344     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1345     v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1346     v16i8 src2110, src4332, src6554, src8776, src10998;
1347     v16i8 src12111110, src14131312;
1348     v8i16 filter_vec, dst01, dst23, dst45, dst67;
1349     v8i16 filt0, filt1, filt2, filt3;
1350     v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1351     v4i32 weight_vec, rnd_vec;
    /* step back 3 rows so the 8-tap window is centred on the block */
1353     src -= (3 * src_stride);
1356     weight_vec = __msa_fill_w(weight);
1357     rnd_vec = __msa_fill_w(rnd_val);
1362     weight_vec_h = __msa_fill_h(weight);
1363     offset_vec = __msa_fill_h(offset);
1364     denom_vec = __msa_fill_h(rnd_val);
    /* fold the rounded weight into the offset vector (saturating add) */
1366     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1367     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1369     filter_vec = LD_SH(filter);
1370     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    /* prologue: load the 7 context rows and pre-interleave them */
1372     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1373     src += (7 * src_stride);
1375     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1376                src10_r, src32_r, src54_r, src21_r);
1378     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1380     ILVR_D3_SB(src21_r, src10_r, src43_r,
1381                src32_r, src65_r, src54_r, src2110, src4332, src6554);
1383     XORI_B3_128_SB(src2110, src4332, src6554);
1385     for (loop_cnt = (height >> 3); loop_cnt--;) {
1386         LD_SB8(src, src_stride,
1387                src7, src8, src9, src10, src11, src12, src13, src14);
1388         src += (8 * src_stride);
1389         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1390                    src76_r, src87_r, src98_r, src109_r);
1391         ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1392                    src1110_r, src1211_r, src1312_r, src1413_r);
1393         ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1394                    src1413_r, src1312_r,
1395                    src8776, src10998, src12111110, src14131312);
1396         XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
    /* each HEVC_FILT_8TAP_SH produces two 4-pixel output rows */
1397         dst01 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
1398                                   filt1, filt2, filt3);
1399         dst23 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
1400                                   filt1, filt2, filt3);
1401         dst45 = HEVC_FILT_8TAP_SH(src6554, src8776, src10998, src12111110,
1402                                   filt0, filt1, filt2, filt3);
1403         dst67 = HEVC_FILT_8TAP_SH(src8776, src10998, src12111110, src14131312,
1404                                   filt0, filt1, filt2, filt3);
1406         HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst01, dst23, dst45, dst67, weight_vec,
1407                                        offset_vec, rnd_vec, dst0, dst1, dst2,
1410         PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1411         ST4x8_UB(out0, out1, dst, dst_stride);
1412         dst += (8 * dst_stride);
    /* carry the last interleaved rows over to the next iteration */
1415         src4332 = src12111110;
1416         src6554 = src14131312;
/* Vertical 8-tap luma filter with uni-directional weighted prediction,
 * 8-pixel-wide blocks, 4 rows per loop iteration.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines and closing braces are missing from this view.
 * Visible flow: prime with 7 context rows (sign-flipped to signed via
 * XORI 128), interleave consecutive row pairs, then per iteration load
 * 4 rows, filter vertically, weight/round/saturate, store 8x4. */
1421 static void hevc_vt_uniwgt_8t_8w_msa(uint8_t *src,
1425                                      const int8_t *filter,
1433     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1434     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1435     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1436     v8i16 filt0, filt1, filt2, filt3;
1438     v8i16 dst0, dst1, dst2, dst3, weight_vec_h, offset_vec, denom_vec;
1439     v4i32 weight_vec, rnd_vec;
    /* step back 3 rows so the 8-tap window is centred on the block */
1441     src -= (3 * src_stride);
1443     weight_vec = __msa_fill_w(weight);
1444     rnd_vec = __msa_fill_w(rnd_val);
1449     weight_vec_h = __msa_fill_h(weight);
1450     offset_vec = __msa_fill_h(offset);
1451     denom_vec = __msa_fill_h(rnd_val);
    /* fold the rounded weight into the offset vector (saturating add) */
1453     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1454     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1456     filter_vec = LD_SH(filter);
1457     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    /* prologue: 7 context rows, converted to signed and pair-interleaved */
1459     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1460     src += (7 * src_stride);
1461     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1463     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1464                src10_r, src32_r, src54_r, src21_r);
1465     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1467     for (loop_cnt = (height >> 2); loop_cnt--;) {
1468         LD_SB4(src, src_stride, src7, src8, src9, src10);
1469         src += (4 * src_stride);
1470         XORI_B4_128_SB(src7, src8, src9, src10);
1471         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1472                    src76_r, src87_r, src98_r, src109_r);
    /* one output row per filter call, sliding the 8-row window down */
1473         dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1474                                  filt1, filt2, filt3);
1475         dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1476                                  filt1, filt2, filt3);
1477         dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1478                                  filt1, filt2, filt3);
1479         dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1480                                  filt1, filt2, filt3);
1482         HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1483                                        offset_vec, rnd_vec, dst0, dst1, dst2,
1486         PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1487         ST8x4_UB(out0, out1, dst, dst_stride);
1488         dst += (4 * dst_stride);
/* Vertical 8-tap luma filter with uni-directional weighted prediction,
 * 12-pixel-wide blocks (8-wide right-interleave path plus a 4-wide
 * left-interleave path packed two rows per vector).
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines and closing braces are missing from this view.
 * Visible flow: the fixed loop count of 4 with 4 rows per iteration
 * suggests a fixed 16-row block — TODO confirm against callers. */
1500 static void hevc_vt_uniwgt_8t_12w_msa(uint8_t *src,
1504                                       const int8_t *filter,
1511     v16u8 out0, out1, out2;
1512     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1513     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1514     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1515     v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1516     v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1517     v16i8 src2110, src4332, src6554, src8776, src10998;
1518     v8i16 filt0, filt1, filt2, filt3;
1519     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
1520     v8i16 weight_vec_h, offset_vec, denom_vec, filter_vec;
1521     v4i32 weight_vec, rnd_vec;
    /* step back 3 rows so the 8-tap window is centred on the block */
1523     src -= (3 * src_stride);
1525     weight = weight & 0x0000FFFF;
1526     weight_vec = __msa_fill_w(weight);
1527     rnd_vec = __msa_fill_w(rnd_val);
1532     weight_vec_h = __msa_fill_h(weight);
1533     offset_vec = __msa_fill_h(offset);
1534     denom_vec = __msa_fill_h(rnd_val);
    /* fold the rounded weight into the offset vector (saturating add) */
1536     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1537     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1539     filter_vec = LD_SH(filter);
1540     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    /* prologue: 7 context rows; right halves feed the 8-wide path,
     * left halves (doubleword-combined) feed the extra 4-wide path */
1542     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1543     src += (7 * src_stride);
1544     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1546     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1547                src10_r, src32_r, src54_r, src21_r);
1548     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1549     ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1550                src10_l, src32_l, src54_l, src21_l);
1551     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1552     ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1553                src2110, src4332, src6554);
1555     for (loop_cnt = 4; loop_cnt--;) {
1556         LD_SB4(src, src_stride, src7, src8, src9, src10);
1557         src += (4 * src_stride);
1558         XORI_B4_128_SB(src7, src8, src9, src10);
1560         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1561                    src76_r, src87_r, src98_r, src109_r);
1562         ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1563                    src76_l, src87_l, src98_l, src109_l);
1564         ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
    /* dst0..3: 8-wide columns; dst4..5: the extra 4-wide columns
     * (two rows per vector) */
1566         dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1567                                  filt1, filt2, filt3);
1568         dst1 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1569                                  filt1, filt2, filt3);
1570         dst2 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1571                                  filt1, filt2, filt3);
1572         dst3 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1573                                  filt1, filt2, filt3);
1574         dst4 = HEVC_FILT_8TAP_SH(src2110, src4332, src6554, src8776, filt0,
1575                                  filt1, filt2, filt3);
1576         dst5 = HEVC_FILT_8TAP_SH(src4332, src6554, src8776, src10998, filt0,
1577                                  filt1, filt2, filt3);
1579         HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1580                                        offset_vec, rnd_vec, dst0, dst1, dst2,
1582         HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
1583                                        rnd_vec, dst4, dst5);
    /* store 8-wide part at dst, 4-wide part at dst + 8 */
1585         PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
1586         ST8x4_UB(out0, out1, dst, dst_stride);
1587         ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
1588         dst += (4 * dst_stride);
/* Vertical 8-tap luma filter with uni-directional weighted prediction for
 * widths that are multiples of 16: `weightmul16` 16-pixel column strips,
 * 4 rows per inner iteration. Shared worker for the 16/24/32/48/64-wide
 * wrappers below.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines, the per-strip src/dst pointer setup and closing
 * braces are missing from this view; only the visible flow is documented. */
1603 static void hevc_vt_uniwgt_8t_16multx4mult_msa(uint8_t *src,
1607                                                const int8_t *filter,
1612                                                int32_t weightmul16)
1616     int32_t loop_cnt, cnt;
1617     v16u8 out0, out1, out2, out3;
1618     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1619     v16i8 src10_r, src32_r, src54_r, src76_r;
1620     v16i8 src21_r, src43_r, src65_r, src87_r;
1621     v16i8 src10_l, src32_l, src54_l, src76_l;
1622     v16i8 src21_l, src43_l, src65_l, src87_l;
1623     v16i8 src98_r, src109_r, src98_l, src109_l;
1624     v8i16 filt0, filt1, filt2, filt3;
1626     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1627     v8i16 weight_vec_h, offset_vec, denom_vec;
1628     v4i32 weight_vec, rnd_vec;
    /* step back 3 rows so the 8-tap window is centred on the block */
1630     src -= (3 * src_stride);
1632     weight_vec = __msa_fill_w(weight);
1633     rnd_vec = __msa_fill_w(rnd_val);
1638     weight_vec_h = __msa_fill_h(weight);
1639     offset_vec = __msa_fill_h(offset);
1640     denom_vec = __msa_fill_h(rnd_val);
    /* fold the rounded weight into the offset vector (saturating add) */
1642     weight_vec_h = __msa_srar_h(weight_vec_h, denom_vec);
1643     offset_vec = __msa_adds_s_h(offset_vec, weight_vec_h);
1645     filter_vec = LD_SH(filter);
1646     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    /* outer loop: one pass per 16-pixel-wide column strip */
1648     for (cnt = weightmul16; cnt--;) {
1652         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1653         src_tmp += (7 * src_stride);
1654         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1656         for (loop_cnt = (height >> 2); loop_cnt--;) {
1657             LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1658             src_tmp += (4 * src_stride);
1659             XORI_B4_128_SB(src7, src8, src9, src10);
    /* interleave row pairs: _r covers pixels 0-7, _l pixels 8-15 */
1661             ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1662                        src10_r, src32_r, src54_r, src21_r);
1663             ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1664             ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1665                        src10_l, src32_l, src54_l, src21_l);
1666             ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1667             ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1668                        src76_r, src87_r, src98_r, src109_r);
1669             ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1670                        src76_l, src87_l, src98_l, src109_l);
    /* four output rows, each split into right/left 8-pixel halves */
1672             dst0 = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1673                                      filt1, filt2, filt3);
1674             dst1 = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1675                                      filt1, filt2, filt3);
1676             dst2 = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1677                                      filt1, filt2, filt3);
1678             dst3 = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1679                                      filt1, filt2, filt3);
1680             dst4 = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1681                                      filt1, filt2, filt3);
1682             dst5 = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1683                                      filt1, filt2, filt3);
1684             dst6 = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1685                                      filt1, filt2, filt3);
1686             dst7 = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1687                                      filt1, filt2, filt3);
1689             HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst0, dst1, dst2, dst3, weight_vec,
1690                                            offset_vec, rnd_vec, dst0, dst1,
1692             HEVC_UNIW_RND_CLIP4_MAX_SATU_H(dst4, dst5, dst6, dst7, weight_vec,
1693                                            offset_vec, rnd_vec, dst4, dst5,
1695             PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
1696             PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
1697             ST_UB4(out0, out1, out2, out3, dst_tmp, dst_stride);
1698             dst_tmp += (4 * dst_stride);
/* 16-wide vertical weighted prediction: one 16-pixel strip of the
 * generic multiple-of-16 worker.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
1714 static void hevc_vt_uniwgt_8t_16w_msa(uint8_t *src,
1718                                       const int8_t *filter,
1724     hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1725                                        filter, height, weight,
1726                                        offset, rnd_val, 1);
/* 24-wide vertical weighted prediction: one 16-pixel strip via the
 * generic worker plus an 8-wide pass for columns 16..23.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. The hard-coded height of 32 in the 8-wide
 * call suggests a fixed-size block — TODO confirm against callers. */
1729 static void hevc_vt_uniwgt_8t_24w_msa(uint8_t *src,
1733                                       const int8_t *filter,
1739     hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1741                                        offset, rnd_val, 1);
1743     hevc_vt_uniwgt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
1744                              filter, 32, weight, offset, rnd_val);
/* 32-wide vertical weighted prediction: two 16-pixel strips of the
 * generic multiple-of-16 worker.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
1747 static void hevc_vt_uniwgt_8t_32w_msa(uint8_t *src,
1751                                       const int8_t *filter,
1757     hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1758                                        filter, height, weight,
1759                                        offset, rnd_val, 2);
/* 48-wide vertical weighted prediction: three 16-pixel strips of the
 * generic multiple-of-16 worker.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
1762 static void hevc_vt_uniwgt_8t_48w_msa(uint8_t *src,
1766                                       const int8_t *filter,
1772     hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1774                                        offset, rnd_val, 3);
/* 64-wide vertical weighted prediction: four 16-pixel strips of the
 * generic multiple-of-16 worker.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
1777 static void hevc_vt_uniwgt_8t_64w_msa(uint8_t *src,
1781                                       const int8_t *filter,
1787     hevc_vt_uniwgt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1788                                        filter, height, weight,
1789                                        offset, rnd_val, 4);
/* 2-D (horizontal then vertical) 8-tap luma filter with uni-directional
 * weighted prediction, 4-pixel-wide blocks, 2 rows per loop iteration.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines, some mask setup and closing braces are missing
 * from this view. Visible flow: horizontally filter 7 context rows into
 * 16-bit intermediates (two rows per VSHF pass), then per iteration
 * horizontally filter 2 new rows, run the vertical 8-tap over the
 * interleaved intermediates, weight/round/offset/clip to 0..255 and
 * store 4x2. */
1792 static void hevc_hv_uniwgt_8t_4w_msa(uint8_t *src,
1796                                      const int8_t *filter_x,
1797                                      const int8_t *filter_y,
1804     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1805     v8i16 filt0, filt1, filt2, filt3;
1806     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1807     v16i8 mask1, mask2, mask3;
1808     v8i16 filter_vec, const_vec;
1809     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1810     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1811     v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1812     v4i32 dst0_r, dst1_r, weight_vec, offset_vec, rnd_vec;
1813     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1814     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1815     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1816     v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
    /* step back 3 rows and 3 columns for the centred 8-tap 2-D window */
1818     src -= ((3 * src_stride) + 3);
1819     filter_vec = LD_SH(filter_x);
1820     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    /* sign-extend the y filter taps to 32-bit before splatting */
1822     filter_vec = LD_SH(filter_y);
1823     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1824     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1826     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1832     const_vec = __msa_ldi_h(128);
1835     weight_vec = __msa_fill_w(weight);
1836     offset_vec = __msa_fill_w(offset);
1837     rnd_vec = __msa_fill_w(rnd_val);
1839     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1840     src += (7 * src_stride);
1841     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1843     /* row 0 row 1 row 2 row 3 */
    /* dstNM holds the horizontal result of rows N and M packed together */
1844     VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1845     VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1846     VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1847                vec8, vec9, vec10, vec11);
1848     VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1849                vec12, vec13, vec14, vec15);
1851     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1852                  dst30, dst30, dst30, dst30);
1854     DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1855                  dst41, dst41, dst41, dst41);
1857     DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1858                  dst52, dst52, dst52, dst52);
1860     DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1861                  dst63, dst63, dst63, dst63);
    /* build the vertical-filter operand pairs from the packed rows */
1863     ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1864                dst10_r, dst21_r, dst32_r);
1866     dst43_r = __msa_ilvl_h(dst41, dst30);
1867     dst54_r = __msa_ilvl_h(dst52, dst41);
1868     dst65_r = __msa_ilvl_h(dst63, dst52);
    /* dst66 = row 6 intermediate duplicated into both halves */
1870     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1872     for (loop_cnt = height >> 1; loop_cnt--;) {
1873         LD_SB2(src, src_stride, src7, src8);
1874         src += (2 * src_stride);
1875         XORI_B2_128_SB(src7, src8);
1877         VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1878                    vec0, vec1, vec2, vec3);
1880         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1881                      dst87, dst87, dst87, dst87);
1882         dst76_r = __msa_ilvr_h(dst87, dst66);
1883         dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1884                                 filt_h0, filt_h1, filt_h2, filt_h3);
1885         dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
1886         dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1887                                 filt_h0, filt_h1, filt_h2, filt_h3);
    /* apply weight, round, add offset, clip each row to 0..255 */
1891         MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
1892         SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
1893         ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
1894         dst0_r = CLIP_SW_0_255(dst0_r);
1895         dst1_r = CLIP_SW_0_255(dst1_r);
1897         HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
1898         ST4x2_UB(dst0_r, dst, dst_stride);
1899         dst += (2 * dst_stride);
    /* carry row 8's intermediate forward as the new "row 6" */
1907         dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
/* 2-D (horizontal then vertical) 8-tap luma filter with uni-directional
 * weighted prediction for widths that are multiples of 8: one 8-pixel
 * column strip per outer pass, 2 rows per inner iteration. Shared worker
 * for the 8/12/16/24/32/48/64-wide wrappers below.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines, mask1..3 setup, the per-strip pointer setup and
 * closing braces are missing from this view. */
1911 static void hevc_hv_uniwgt_8t_8multx2mult_msa(uint8_t *src,
1915                                               const int8_t *filter_x,
1916                                               const int8_t *filter_y,
1923     uint32_t loop_cnt, cnt;
1926     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1927     v8i16 filt0, filt1, filt2, filt3;
1928     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1929     v16i8 mask1, mask2, mask3;
1930     v8i16 filter_vec, const_vec;
1931     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1932     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1933     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1934     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1935     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1936     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1937     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1938     v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1939     v4i32 weight_vec, offset_vec, rnd_vec;
1940     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    /* step back 3 rows and 3 columns for the centred 8-tap 2-D window */
1942     src -= ((3 * src_stride) + 3);
1943     const_vec = __msa_ldi_h(128);
1946     weight_vec = __msa_fill_w(weight);
1947     offset_vec = __msa_fill_w(offset);
1948     rnd_vec = __msa_fill_w(rnd_val);
1950     filter_vec = LD_SH(filter_x);
1951     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
    /* sign-extend the y filter taps to 32-bit before splatting */
1953     filter_vec = LD_SH(filter_y);
1954     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1955     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1956     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
    /* outer loop: one pass per 8-pixel-wide column strip */
1962     for (cnt = width >> 3; cnt--;) {
1966         LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1967         src_tmp += (7 * src_stride);
1968         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    /* horizontal pass over the 7 context rows -> dst0..dst6 */
1970         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1971                    vec0, vec1, vec2, vec3);
1972         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1973                    vec4, vec5, vec6, vec7);
1974         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1975                    vec8, vec9, vec10, vec11);
1976         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1977                    vec12, vec13, vec14, vec15);
1979         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1980                      dst0, dst0, dst0, dst0);
1982         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1983                      dst1, dst1, dst1, dst1);
1985         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1986                      dst2, dst2, dst2, dst2);
1988         DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1989                      dst3, dst3, dst3, dst3);
1991         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1992                    vec0, vec1, vec2, vec3);
1993         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1994                    vec4, vec5, vec6, vec7);
1995         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1996                    vec8, vec9, vec10, vec11);
1998         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1999                      dst4, dst4, dst4, dst4);
2001         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
2002                      dst5, dst5, dst5, dst5);
2004         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
2005                      dst6, dst6, dst6, dst6);
    /* interleave intermediates into right/left vertical operands */
2007         ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
2008                    dst10_r, dst32_r, dst54_r, dst21_r);
2009         ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
2010         ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
2011                    dst10_l, dst32_l, dst54_l, dst21_l);
2012         ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
2014         for (loop_cnt = height >> 1; loop_cnt--;) {
2015             LD_SB2(src_tmp, src_stride, src7, src8);
2016             src_tmp += 2 * src_stride;
2017             XORI_B2_128_SB(src7, src8);
    /* horizontal pass on the two new rows, then vertical 8-tap */
2019             VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
2020                        vec0, vec1, vec2, vec3);
2022             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
2023                          dst7, dst7, dst7, dst7);
2025             ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
2026             dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
2027                                     filt_h0, filt_h1, filt_h2, filt_h3);
2028             dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
2029                                     filt_h0, filt_h1, filt_h2, filt_h3);
2034             VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
2035                        vec0, vec1, vec2, vec3);
2037             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
2038                          dst8, dst8, dst8, dst8);
2040             ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
2041             dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
2042                                     filt_h0, filt_h1, filt_h2, filt_h3);
2043             dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
2044                                     filt_h0, filt_h1, filt_h2, filt_h3);
    /* weight, round, offset and clip (0..255), then pack and store 8x2 */
2048             HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
2049                                    weight_vec, offset_vec, rnd_vec,
2050                                    dst0_r, dst1_r, dst0_l, dst1_l);
2052             HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2053             ST8x2_UB(dst0_r, dst_tmp, dst_stride);
2054             dst_tmp += (2 * dst_stride);
/* 8-wide 2-D weighted prediction: one 8-pixel strip of the generic
 * multiple-of-8 worker.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
2076 static void hevc_hv_uniwgt_8t_8w_msa(uint8_t *src,
2080                                      const int8_t *filter_x,
2081                                      const int8_t *filter_y,
2087     hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2088                                       filter_x, filter_y, height, weight,
2089                                       offset, rnd_val, 8);
/* 12-wide 2-D weighted prediction: an 8-wide pass via the generic worker
 * plus a 4-wide pass for columns 8..11.
 * NOTE(review): subsampled chunk — parameter lines (including the final
 * argument of the 4-wide call) and the closing brace are missing from
 * this view. */
2092 static void hevc_hv_uniwgt_8t_12w_msa(uint8_t *src,
2096                                       const int8_t *filter_x,
2097                                       const int8_t *filter_y,
2103     hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2104                                       filter_x, filter_y, height, weight,
2105                                       offset, rnd_val, 8);
2106     hevc_hv_uniwgt_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
2107                              filter_x, filter_y, height, weight, offset,
/* 16-wide 2-D weighted prediction: generic multiple-of-8 worker with
 * width 16.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
2111 static void hevc_hv_uniwgt_8t_16w_msa(uint8_t *src,
2115                                       const int8_t *filter_x,
2116                                       const int8_t *filter_y,
2122     hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2123                                       filter_x, filter_y, height, weight,
2124                                       offset, rnd_val, 16);
/* 24-wide 2-D weighted prediction: generic multiple-of-8 worker with
 * width 24.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
2127 static void hevc_hv_uniwgt_8t_24w_msa(uint8_t *src,
2131                                       const int8_t *filter_x,
2132                                       const int8_t *filter_y,
2138     hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2139                                       filter_x, filter_y, height, weight,
2140                                       offset, rnd_val, 24);
/* 32-wide 2-D weighted prediction: generic multiple-of-8 worker with
 * width 32.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
2143 static void hevc_hv_uniwgt_8t_32w_msa(uint8_t *src,
2147                                       const int8_t *filter_x,
2148                                       const int8_t *filter_y,
2154     hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2155                                       filter_x, filter_y, height, weight,
2156                                       offset, rnd_val, 32);
/* 48-wide 2-D weighted prediction: generic multiple-of-8 worker with
 * width 48.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
2159 static void hevc_hv_uniwgt_8t_48w_msa(uint8_t *src,
2163                                       const int8_t *filter_x,
2164                                       const int8_t *filter_y,
2170     hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2171                                       filter_x, filter_y, height, weight,
2172                                       offset, rnd_val, 48);
/* 64-wide 2-D weighted prediction: generic multiple-of-8 worker with
 * width 64.
 * NOTE(review): subsampled chunk — parameter lines and the closing brace
 * are missing from this view. */
2175 static void hevc_hv_uniwgt_8t_64w_msa(uint8_t *src,
2179                                       const int8_t *filter_x,
2180                                       const int8_t *filter_y,
2186     hevc_hv_uniwgt_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
2187                                       filter_x, filter_y, height, weight,
2188                                       offset, rnd_val, 64);
/* Horizontal 4-tap (chroma) filter with uni-directional weighted
 * prediction, 4x2 blocks: both rows processed together in one vector.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines, mask1 setup and the closing brace are missing
 * from this view. */
2191 static void hevc_hz_uniwgt_4t_4x2_msa(uint8_t *src,
2195                                       const int8_t *filter,
2202     v16i8 src0, src1, vec0, vec1;
2205     v4i32 dst0_r, dst0_l;
2206     v8i16 filter_vec, const_vec;
2207     v4i32 weight_vec, offset_vec, rnd_vec;
2208     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2212     filter_vec = LD_SH(filter);
2213     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* keep only the low 16 bits of the weight before splatting */
2217     weight = weight & 0x0000FFFF;
2219     const_vec = __msa_ldi_h(128);
2222     weight_vec = __msa_fill_w(weight);
2223     offset_vec = __msa_fill_w(offset);
2224     rnd_vec = __msa_fill_w(rnd_val);
2226     LD_SB2(src, src_stride, src0, src1);
2227     XORI_B2_128_SB(src0, src1);
    /* both 4-pixel rows shuffled into one pair of filter operands */
2229     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2231     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    /* widen to 32-bit, weight, round, offset, clip to 0..255, store 4x2 */
2233     ILVRL_H2_SW(dst0, dst0, dst0_r, dst0_l);
2234     DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
2235     SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
2236     ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
2237     dst0_r = CLIP_SW_0_255(dst0_r);
2238     dst0_l = CLIP_SW_0_255(dst0_l);
2240     HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
2241     ST4x2_UB(dst0_r, dst, dst_stride);
2242     dst += (4 * dst_stride);
/* Horizontal 4-tap (chroma) filter with uni-directional weighted
 * prediction, 4x4 blocks: rows processed in pairs, two rows per vector.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines and the closing brace are missing from this view. */
2245 static void hevc_hz_uniwgt_4t_4x4_msa(uint8_t *src,
2249                                       const int8_t *filter,
2256     v16i8 src0, src1, src2, src3;
2257     v16i8 mask1, vec0, vec1;
2259     v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
2260     v8i16 filter_vec, const_vec;
2261     v4i32 weight_vec, offset_vec, rnd_vec;
2262     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2266     /* rearranging filter */
2267     filter_vec = LD_SH(filter);
2268     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* keep only the low 16 bits of the weight before splatting */
2272     weight = weight & 0x0000FFFF;
2274     const_vec = __msa_ldi_h(128);
2277     weight_vec = __msa_fill_w(weight);
2278     offset_vec = __msa_fill_w(offset);
2279     rnd_vec = __msa_fill_w(rnd_val);
2281     LD_SB4(src, src_stride, src0, src1, src2, src3);
2282     XORI_B4_128_SB(src0, src1, src2, src3);
    /* rows 0/1 -> dst0, rows 2/3 -> dst1 */
2284     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2286     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2288     VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2290     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
    /* weight, round, offset and clip, then pack and store 4x4 */
2292     HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2293                         dst0_r, dst1_r, dst0_l, dst1_l);
2295     HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2296     ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
2297     dst += (4 * dst_stride);
/* Horizontal 4-tap (chroma) filter with uni-directional weighted
 * prediction, 4-wide blocks whose height is a multiple of 8: 8 rows per
 * loop iteration, two rows packed per vector.
 * NOTE(review): subsampled chunk — embedded original line numbers jump,
 * so parameter lines, mask1 setup and closing braces are missing from
 * this view. */
2300 static void hevc_hz_uniwgt_4t_4x8multiple_msa(uint8_t *src,
2304                                               const int8_t *filter,
2312     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2313     v16i8 mask1, vec0, vec1;
2314     v8i16 dst0, dst1, dst2, dst3;
2315     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2316     v8i16 filter_vec, const_vec;
2317     v4i32 weight_vec, offset_vec, rnd_vec;
2318     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2322     filter_vec = LD_SH(filter);
2323     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* keep only the low 16 bits of the weight before splatting */
2325     weight = weight & 0x0000FFFF;
2326     const_vec = __msa_ldi_h(128);
2329     weight_vec = __msa_fill_w(weight);
2330     offset_vec = __msa_fill_w(offset);
2331     rnd_vec = __msa_fill_w(rnd_val);
2335     for (loop_cnt = (height >> 3); loop_cnt--;) {
2336         LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2337         src += (8 * src_stride);
2339         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    /* filter row pairs: (0,1)->dst0, (2,3)->dst1, (4,5)->dst2, (6,7)->dst3 */
2341         VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2343         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2345         VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2347         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2349         VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2351         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2353         VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2355         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
    /* weight, round, offset and clip all four row pairs */
2357         HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2358                             weight_vec, offset_vec, rnd_vec,
2359                             dst0_r, dst1_r, dst2_r, dst3_r,
2360                             dst0_l, dst1_l, dst2_l, dst3_l);
2362         HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2363                         dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2364         ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
2365         dst += (8 * dst_stride);
/* Dispatcher for 4-pixel-wide uni-weighted horizontal filtering:
 * selects the 4x2, 4x4 or 4x(8 multiple) kernel based on height. */
2369 static void hevc_hz_uniwgt_4t_4w_msa(uint8_t *src,
2373 const int8_t *filter,
2380 hevc_hz_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
2381 filter, height, weight, offset, rnd_val);
2382 } else if (4 == height) {
2383 hevc_hz_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
2384 filter, height, weight, offset, rnd_val);
2385 } else if (8 == height || 16 == height) {
2386 hevc_hz_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
2387 filter, height, weight,
/* HEVC uni-weighted 4-tap horizontal filter, 6-pixel-wide blocks.
 * Four rows per iteration: per-row shuffle + dot product against the
 * two taps, then weight/round/offset/clip, pack and 6x4 store. */
2392 static void hevc_hz_uniwgt_4t_6w_msa(uint8_t *src,
2396 const int8_t *filter,
2404 v16i8 src0, src1, src2, src3;
2405 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2408 v8i16 dst0, dst1, dst2, dst3;
2409 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2410 v8i16 filter_vec, const_vec;
2411 v4i32 weight_vec, offset_vec, rnd_vec;
2415 filter_vec = LD_SH(filter);
2416 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2418 weight = weight & 0x0000FFFF;
2419 const_vec = __msa_ldi_h(128);
2422 weight_vec = __msa_fill_w(weight);
2423 offset_vec = __msa_fill_w(offset);
2424 rnd_vec = __msa_fill_w(rnd_val);
2428 for (loop_cnt = (height >> 2); loop_cnt--;) {
2429 LD_SB4(src, src_stride, src0, src1, src2, src3);
2430 src += (4 * src_stride);
/* flip sign bit so unsigned pixels work with signed dot products */
2432 XORI_B4_128_SB(src0, src1, src2, src3);
2434 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2436 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2438 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2440 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2442 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2444 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2446 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2448 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2450 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2451 weight_vec, offset_vec, rnd_vec,
2452 dst0_r, dst1_r, dst2_r, dst3_r,
2453 dst0_l, dst1_l, dst2_l, dst3_l);
2455 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2456 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2458 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
2459 dst += (4 * dst_stride);
/* HEVC uni-weighted 4-tap horizontal filter, single 8x2 block (no loop). */
2463 static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
2467 const int8_t *filter,
2473 v8i16 filt0, filt1, dst0, dst1;
2475 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2478 v8i16 filter_vec, const_vec;
2479 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
2480 v4i32 weight_vec, offset_vec, rnd_vec;
2484 filter_vec = LD_SH(filter);
2485 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2487 weight = weight & 0x0000FFFF;
2488 const_vec = __msa_ldi_h(128);
2491 weight_vec = __msa_fill_w(weight);
2492 offset_vec = __msa_fill_w(offset);
2493 rnd_vec = __msa_fill_w(rnd_val);
2497 LD_SB2(src, src_stride, src0, src1);
2498 XORI_B2_128_SB(src0, src1);
2500 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2502 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2503 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2505 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2507 HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2508 dst0_r, dst1_r, dst0_l, dst1_l);
2510 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2511 ST8x2_UB(dst0_r, dst, dst_stride);
/* HEVC uni-weighted 4-tap horizontal filter, single 8x6 block.
 * Six rows are loaded once, filtered (shuffle + dot product), then
 * weighted/rounded/clipped in a 4-row and a 2-row batch, and stored
 * as 8x4 followed by 8x2.
 * FIX(review): the original issued the same LD_SB6 twice in a row with
 * no pointer advance in between, reloading identical data into the
 * same registers; the redundant second load has been removed. */
2514 static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
2518 const int8_t *filter,
2525 v16i8 src0, src1, src2, src3, src4, src5;
2526 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2529 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2530 v8i16 filter_vec, const_vec;
2531 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2532 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
2533 v4i32 weight_vec, offset_vec, rnd_vec;
2537 filter_vec = LD_SH(filter);
2538 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* keep only the low 16 bits of the weight before splatting */
2540 weight = weight & 0x0000FFFF;
2541 const_vec = __msa_ldi_h(128);
2544 weight_vec = __msa_fill_w(weight);
2545 offset_vec = __msa_fill_w(offset);
2546 rnd_vec = __msa_fill_w(rnd_val);
/* load all six source rows once */
2550 LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
2552 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2554 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2556 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2558 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2560 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2562 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2564 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2566 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2568 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2570 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2572 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2574 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2576 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2578 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2579 weight_vec, offset_vec, rnd_vec,
2580 dst0_r, dst1_r, dst2_r, dst3_r,
2581 dst0_l, dst1_l, dst2_l, dst3_l);
2583 HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
2584 dst4_r, dst5_r, dst4_l, dst5_l);
2586 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
2587 dst2_l, dst2_r, dst3_l, dst3_r,
2588 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
/* first four rows, then the remaining two */
2590 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
2591 dst += (4 * dst_stride);
2592 ST8x2_UB(dst2_r, dst, dst_stride);
/* HEVC uni-weighted 4-tap horizontal filter, 8-wide blocks with height
 * a multiple of 4: four rows per iteration, one full vector per row. */
2595 static void hevc_hz_uniwgt_4t_8x4multiple_msa(uint8_t *src,
2599 const int8_t *filter,
2607 v16i8 src0, src1, src2, src3;
2608 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2611 v8i16 dst0, dst1, dst2, dst3;
2612 v8i16 filter_vec, const_vec;
2613 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2614 v4i32 weight_vec, offset_vec, rnd_vec;
2618 filter_vec = LD_SH(filter);
2619 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2621 weight = weight & 0x0000FFFF;
2622 const_vec = __msa_ldi_h(128);
2625 weight_vec = __msa_fill_w(weight);
2626 offset_vec = __msa_fill_w(offset);
2627 rnd_vec = __msa_fill_w(rnd_val);
2631 for (loop_cnt = (height >> 2); loop_cnt--;) {
2632 LD_SB4(src, src_stride, src0, src1, src2, src3);
2633 src += (4 * src_stride);
2635 XORI_B4_128_SB(src0, src1, src2, src3);
2637 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2639 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2641 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2643 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2645 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2647 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2649 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2651 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2653 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2654 weight_vec, offset_vec, rnd_vec,
2655 dst0_r, dst1_r, dst2_r, dst3_r,
2656 dst0_l, dst1_l, dst2_l, dst3_l);
2658 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2659 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2661 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
2662 dst += (4 * dst_stride);
/* Dispatcher for 8-pixel-wide uni-weighted horizontal filtering:
 * picks the 8x2, 8x6 or 8x(4 multiple) kernel based on height. */
2666 static void hevc_hz_uniwgt_4t_8w_msa(uint8_t *src,
2670 const int8_t *filter,
2677 hevc_hz_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
2678 filter, height, weight, offset, rnd_val);
2679 } else if (6 == height) {
2680 hevc_hz_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
2681 filter, height, weight, offset, rnd_val);
2683 hevc_hz_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
2684 filter, height, weight, offset,
/* HEVC uni-weighted 4-tap horizontal filter, 12-wide blocks.
 * Each iteration filters four rows: the left 8 columns via mask0/mask1
 * per-row shuffles, and the right 4 columns of two rows at a time via
 * mask2/mask3 (which index into the second vector of a row pair). */
2689 static void hevc_hz_uniwgt_4t_12w_msa(uint8_t *src,
2693 const int8_t *filter,
2701 v16i8 src0, src1, src2, src3;
2702 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2703 v16i8 mask2 = { 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2707 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2708 v8i16 filter_vec, const_vec;
2710 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2711 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
2712 v4i32 weight_vec, offset_vec, rnd_vec;
2716 filter_vec = LD_SH(filter);
2717 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2719 weight = weight & 0x0000FFFF;
2720 const_vec = __msa_ldi_h(128);
2723 weight_vec = __msa_fill_w(weight);
2724 offset_vec = __msa_fill_w(offset);
2725 rnd_vec = __msa_fill_w(rnd_val);
2730 for (loop_cnt = (height >> 2); loop_cnt--;) {
2731 LD_SB4(src, src_stride, src0, src1, src2, src3);
2732 src += (4 * src_stride);
2734 XORI_B4_128_SB(src0, src1, src2, src3);
/* left 8 columns, one row per vector */
2736 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2738 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2740 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2742 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2744 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2746 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2748 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2750 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* right 4 columns, two rows packed per vector */
2752 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2754 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2756 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2758 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2760 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2761 weight_vec, offset_vec, rnd_vec,
2762 dst0_r, dst1_r, dst2_r, dst3_r,
2763 dst0_l, dst1_l, dst2_l, dst3_l);
2765 HEVC_UNIW_RND_CLIP2(dst4, dst5, weight_vec, offset_vec, rnd_vec,
2766 dst4_r, dst5_r, dst4_l, dst5_l);
2768 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
2769 dst2_l, dst2_r, dst3_l, dst3_r,
2770 dst4_l, dst4_r, dst5_l, dst5_r,
2771 dst0_r, dst1_r, dst2_r);
2773 ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
2774 dst += (4 * dst_stride);
/* HEVC uni-weighted 4-tap horizontal filter, 16-wide blocks.
 * Each iteration handles 4 rows split into left/right 8-column halves
 * (even-numbered srcN = left half, odd = right half at src + 8); the
 * eight intermediate results are weighted/clipped and stored in two
 * batches of two full-width rows each. */
2778 static void hevc_hz_uniwgt_4t_16w_msa(uint8_t *src,
2782 const int8_t *filter,
2789 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2791 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2793 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2795 v8i16 filter_vec, const_vec;
2796 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2797 v4i32 weight_vec, offset_vec, rnd_vec;
2801 filter_vec = LD_SH(filter);
2802 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2804 weight = weight & 0x0000FFFF;
2805 const_vec = __msa_ldi_h(128);
2808 weight_vec = __msa_fill_w(weight);
2809 offset_vec = __msa_fill_w(offset);
2810 rnd_vec = __msa_fill_w(rnd_val);
2814 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* left halves of 4 rows, then right halves (src + 8) */
2815 LD_SB4(src, src_stride, src0, src2, src4, src6);
2816 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2817 src += (4 * src_stride);
2819 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2821 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2823 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2825 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2827 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2829 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2831 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2833 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2835 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2837 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2839 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2841 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2843 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2845 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2847 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2849 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2851 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
2853 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2854 weight_vec, offset_vec, rnd_vec,
2855 dst0_r, dst1_r, dst2_r, dst3_r,
2856 dst0_l, dst1_l, dst2_l, dst3_l);
2858 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2859 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2860 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
2861 dst += (2 * dst_stride);
/* second pair of rows reuses the same temporaries */
2863 HEVC_UNIW_RND_CLIP4(dst4, dst5, dst6, dst7,
2864 weight_vec, offset_vec, rnd_vec,
2865 dst0_r, dst1_r, dst2_r, dst3_r,
2866 dst0_l, dst1_l, dst2_l, dst3_l);
2868 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2869 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2870 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
2871 dst += (2 * dst_stride);
/* HEVC uni-weighted 4-tap horizontal filter, 24-wide blocks.
 * Two rows per iteration: the left 16 columns go through dst, the
 * rightmost 8 columns are written separately through dst_tmp. */
2875 static void hevc_hz_uniwgt_4t_24w_msa(uint8_t *src,
2879 const int8_t *filter,
/* separate output cursor for the right 8-column strip */
2886 uint8_t *dst_tmp = dst + 16;
2887 v16i8 src0, src1, src2, src3;
2889 v8i16 dst0, dst1, dst2, dst3;
2890 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2891 v16i8 mask1, mask2, mask3;
2893 v8i16 filter_vec, const_vec;
2894 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2895 v4i32 weight_vec, offset_vec, rnd_vec;
2899 filter_vec = LD_SH(filter);
2900 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2902 weight = weight & 0x0000FFFF;
2903 const_vec = __msa_ldi_h(128);
2906 weight_vec = __msa_fill_w(weight);
2907 offset_vec = __msa_fill_w(offset);
2908 rnd_vec = __msa_fill_w(rnd_val);
2914 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* srcN: even = columns 0..15, odd = columns 16..31 of a row */
2916 LD_SB2(src, src_stride, src0, src2);
2917 LD_SB2(src + 16, src_stride, src1, src3);
2918 src += (2 * src_stride);
2920 XORI_B4_128_SB(src0, src1, src2, src3);
2922 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2924 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2926 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2928 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2930 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2932 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2934 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2936 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2938 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
2939 weight_vec, offset_vec, rnd_vec,
2940 dst0_r, dst1_r, dst2_r, dst3_r,
2941 dst0_l, dst1_l, dst2_l, dst3_l);
2943 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
2944 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
2945 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
2946 dst += (2 * dst_stride);
/* rightmost 8 columns of the same two rows */
2949 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2951 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2953 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2955 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2957 HEVC_UNIW_RND_CLIP2(dst0, dst1, weight_vec, offset_vec, rnd_vec,
2958 dst0_r, dst1_r, dst0_l, dst1_l);
2960 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
2961 ST8x2_UB(dst0_r, dst_tmp, dst_stride);
2962 dst_tmp += (2 * dst_stride);
/* HEVC uni-weighted 4-tap horizontal filter, 32-wide blocks.
 * Two rows per iteration, each row processed as an unrolled copy of the
 * same sequence: two 16-byte loads plus an extra load at src + 24 to
 * supply the tap overlap for the last 8 columns. */
2966 static void hevc_hz_uniwgt_4t_32w_msa(uint8_t *src,
2970 const int8_t *filter,
2977 v16i8 src0, src1, src2;
2979 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2980 v16i8 mask1, mask2, mask3;
2981 v8i16 dst0, dst1, dst2, dst3;
2983 v8i16 filter_vec, const_vec;
2984 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
2985 v4i32 weight_vec, offset_vec, rnd_vec;
2989 filter_vec = LD_SH(filter);
2990 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2992 weight = weight & 0x0000FFFF;
2993 const_vec = __msa_ldi_h(128);
2996 weight_vec = __msa_fill_w(weight);
2997 offset_vec = __msa_fill_w(offset);
2998 rnd_vec = __msa_fill_w(rnd_val);
3004 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* first row: columns 0-15, 16-31, and the 24.. overlap vector */
3005 LD_SB2(src, 16, src0, src1);
3006 src2 = LD_SB(src + 24);
3009 XORI_B3_128_SB(src0, src1, src2);
3011 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3013 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3015 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3017 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
3019 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3021 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3023 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3025 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3027 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3028 weight_vec, offset_vec, rnd_vec,
3029 dst0_r, dst1_r, dst2_r, dst3_r,
3030 dst0_l, dst1_l, dst2_l, dst3_l);
3032 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3033 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3034 ST_SW2(dst0_r, dst1_r, dst, 16);
/* second row: identical sequence on the next source row */
3037 LD_SB2(src, 16, src0, src1);
3038 src2 = LD_SB(src + 24);
3041 XORI_B3_128_SB(src0, src1, src2);
3043 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3045 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3047 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3049 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
3051 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
3053 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3055 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
3057 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3059 HEVC_UNIW_RND_CLIP4(dst0, dst1, dst2, dst3,
3060 weight_vec, offset_vec, rnd_vec,
3061 dst0_r, dst1_r, dst2_r, dst3_r,
3062 dst0_l, dst1_l, dst2_l, dst3_l);
3064 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3065 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3066 ST_SW2(dst0_r, dst1_r, dst, 16);
/* HEVC uni-weighted 4-tap vertical filter, single 4x2 block.
 * Interleaves neighbouring rows (ILVR), packs two row-pairs per vector
 * via ilvr_d, dot-products against the two taps, then applies the
 * weight/round/offset/clip sequence inline and stores 4x2. */
3071 static void hevc_vt_uniwgt_4t_4x2_msa(uint8_t *src,
3075 const int8_t *filter,
3081 v16i8 src0, src1, src2, src3, src4;
3082 v16i8 src10_r, src32_r, src21_r, src43_r;
3083 v16i8 src2110, src4332;
3085 v4i32 dst0_r, dst0_l;
3087 v8i16 filter_vec, const_vec;
3088 v4i32 weight_vec, offset_vec, rnd_vec;
3092 const_vec = __msa_ldi_h(128);
3094 weight = weight & 0x0000FFFF;
3096 weight_vec = __msa_fill_w(weight);
3097 offset_vec = __msa_fill_w(offset);
3098 rnd_vec = __msa_fill_w(rnd_val);
3100 filter_vec = LD_SH(filter);
3101 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prologue: first three rows establish the vertical context */
3103 LD_SB3(src, src_stride, src0, src1, src2);
3104 src += (3 * src_stride);
3105 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3106 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3107 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3108 LD_SB2(src, src_stride, src3, src4);
3109 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3110 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
3111 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
3114 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
/* weight, round, offset, clip - expanded inline for this small case */
3116 ILVRL_H2_SW(dst10, dst10, dst0_r, dst0_l);
3117 DOTP_SH2_SW(dst0_r, dst0_l, weight_vec, weight_vec, dst0_r, dst0_l);
3118 SRAR_W2_SW(dst0_r, dst0_l, rnd_vec);
3119 ADD2(dst0_r, offset_vec, dst0_l, offset_vec, dst0_r, dst0_l);
3120 dst0_r = CLIP_SW_0_255(dst0_r);
3121 dst0_l = CLIP_SW_0_255(dst0_l);
3123 HEVC_PCK_SW_SB2(dst0_l, dst0_r, dst0_r);
3124 ST4x2_UB(dst0_r, dst, dst_stride);
/* HEVC uni-weighted 4-tap vertical filter, single 4x4 block.
 * Same row-interleave scheme as the 4x2 kernel, extended to four
 * output rows (two packed row-pair vectors per dot-product). */
3127 static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
3131 const int8_t *filter,
3137 v16i8 src0, src1, src2, src3, src4, src5, src6;
3138 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
3139 v16i8 src2110, src4332, src6554;
3141 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3143 v8i16 filter_vec, const_vec;
3144 v4i32 weight_vec, offset_vec, rnd_vec;
3148 const_vec = __msa_ldi_h(128);
3150 weight = weight & 0x0000FFFF;
3152 weight_vec = __msa_fill_w(weight);
3153 offset_vec = __msa_fill_w(offset);
3154 rnd_vec = __msa_fill_w(rnd_val);
3156 filter_vec = LD_SH(filter);
3157 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3159 LD_SB3(src, src_stride, src0, src1, src2);
3160 src += (3 * src_stride);
3161 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3162 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3163 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3165 LD_SB4(src, src_stride, src3, src4, src5, src6);
3166 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3167 src32_r, src43_r, src54_r, src65_r);
3168 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3169 XORI_B2_128_SB(src4332, src6554);
3172 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3174 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3175 HEVC_UNIW_RND_CLIP2(dst10, dst32, weight_vec, offset_vec, rnd_vec,
3176 dst0_r, dst1_r, dst0_l, dst1_l);
3178 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
3179 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
3180 dst += (4 * dst_stride);
/* HEVC uni-weighted 4-tap vertical filter, 4-wide blocks, 8 rows per
 * loop iteration.  Maintains src2110 as the running row-pair context
 * carried across iterations (src2 is recycled as the newest row). */
3183 static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
3187 const int8_t *filter,
3194 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
3195 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3196 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3197 v16i8 src2110, src4332, src6554, src8776;
3198 v8i16 dst10, dst32, dst54, dst76;
3199 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3201 v8i16 filter_vec, const_vec;
3202 v4i32 weight_vec, offset_vec, rnd_vec;
3206 const_vec = __msa_ldi_h(128);
3208 weight = weight & 0x0000FFFF;
3210 weight_vec = __msa_fill_w(weight);
3211 offset_vec = __msa_fill_w(offset);
3212 rnd_vec = __msa_fill_w(rnd_val);
3214 filter_vec = LD_SH(filter);
3215 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prologue before the loop: three rows of vertical context */
3217 LD_SB3(src, src_stride, src0, src1, src2);
3218 src += (3 * src_stride);
3219 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3220 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3221 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3223 for (loop_cnt = (height >> 3); loop_cnt--;) {
3224 LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3225 src += (6 * src_stride);
3226 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3227 src32_r, src43_r, src54_r, src65_r);
3228 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3229 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3230 src4332, src6554, src8776);
3231 XORI_B3_128_SB(src4332, src6554, src8776);
3234 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3236 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3238 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
/* last two rows of this batch; src2 becomes the carry for the next one */
3240 LD_SB2(src, src_stride, src9, src2);
3241 src += (2 * src_stride);
3242 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3243 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3244 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3247 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3248 HEVC_UNIW_RND_CLIP4(dst10, dst32, dst54, dst76,
3249 weight_vec, offset_vec, rnd_vec,
3250 dst0_r, dst1_r, dst2_r, dst3_r,
3251 dst0_l, dst1_l, dst2_l, dst3_l);
3253 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3254 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3255 ST4x8_UB(dst0_r, dst1_r, dst, dst_stride);
3256 dst += (8 * dst_stride);
/* Dispatcher for 4-pixel-wide uni-weighted vertical filtering:
 * selects the 4x2, 4x4 or 4x(8 multiple) kernel based on height. */
3260 static void hevc_vt_uniwgt_4t_4w_msa(uint8_t *src,
3264 const int8_t *filter,
3271 hevc_vt_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
3272 filter, height, weight, offset, rnd_val);
3273 } else if (4 == height) {
3274 hevc_vt_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
3275 filter, height, weight, offset, rnd_val);
3276 } else if (0 == (height % 8)) {
3277 hevc_vt_uniwgt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
3278 filter, height, weight, offset,
/* HEVC uni-weighted 4-tap vertical filter, 6-pixel-wide blocks.
 * Four rows per iteration in two 2-row halves; src10_r/src21_r are
 * refreshed at the end of each half so the vertical context rolls
 * forward into the next iteration. */
3283 static void hevc_vt_uniwgt_4t_6w_msa(uint8_t *src,
3287 const int8_t *filter,
3294 v16i8 src0, src1, src2, src3, src4;
3295 v16i8 src10_r, src32_r, src21_r, src43_r;
3296 v8i16 tmp0, tmp1, tmp2, tmp3;
3298 v8i16 filter_vec, const_vec;
3299 v4i32 weight_vec, offset_vec, rnd_vec;
3300 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3304 const_vec = __msa_ldi_h(128);
3306 weight = weight & 0x0000FFFF;
3308 weight_vec = __msa_fill_w(weight);
3309 offset_vec = __msa_fill_w(offset);
3310 rnd_vec = __msa_fill_w(rnd_val);
3312 filter_vec = LD_SH(filter);
3313 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3315 LD_SB3(src, src_stride, src0, src1, src2);
3316 src += (3 * src_stride);
3317 XORI_B3_128_SB(src0, src1, src2);
3318 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3320 for (loop_cnt = (height >> 2); loop_cnt--;) {
3321 LD_SB2(src, src_stride, src3, src4);
3322 src += (2 * src_stride);
3323 XORI_B2_128_SB(src3, src4);
3324 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3327 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3329 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
/* next two rows; src1/src2 are reused as the new context rows */
3331 LD_SB2(src, src_stride, src1, src2);
3332 src += (2 * src_stride);
3333 XORI_B2_128_SB(src1, src2);
3334 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3337 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3339 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3340 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3341 weight_vec, offset_vec, rnd_vec,
3342 dst0_r, dst1_r, dst2_r, dst3_r,
3343 dst0_l, dst1_l, dst2_l, dst3_l);
3345 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3346 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3348 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3349 dst += (4 * dst_stride);
/* HEVC uni-weighted 4-tap vertical filter, single 8x2 block (no loop). */
3353 static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
3357 const int8_t *filter,
3363 v16i8 src0, src1, src2, src3, src4;
3364 v16i8 src10_r, src32_r, src21_r, src43_r;
3367 v8i16 filter_vec, const_vec;
3368 v4i32 weight_vec, offset_vec, rnd_vec;
3369 v4i32 dst0_r, dst1_r, dst0_l, dst1_l;
3373 const_vec = __msa_ldi_h(128);
3375 weight = weight & 0x0000FFFF;
3377 weight_vec = __msa_fill_w(weight);
3378 offset_vec = __msa_fill_w(offset);
3379 rnd_vec = __msa_fill_w(rnd_val);
3381 filter_vec = LD_SH(filter);
3382 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3384 LD_SB3(src, src_stride, src0, src1, src2);
3385 src += (3 * src_stride);
3386 XORI_B3_128_SB(src0, src1, src2);
3387 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3388 LD_SB2(src, src_stride, src3, src4);
3389 XORI_B2_128_SB(src3, src4);
3390 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3393 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3395 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3396 HEVC_UNIW_RND_CLIP2(tmp0, tmp1, weight_vec, offset_vec, rnd_vec,
3397 dst0_r, dst1_r, dst0_l, dst1_l);
3399 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
3400 ST8x2_UB(dst0_r, dst, dst_stride);
/* HEVC uni-weighted 4-tap vertical filter, single 8x6 block.
 * All interleaved row pairs are prepared up front, six dot products
 * are computed, then a 4-row and a 2-row weight/clip batch feed the
 * 8x4 + 8x2 stores. */
3403 static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
3407 const int8_t *filter,
3413 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3414 v16i8 src10_r, src32_r, src54_r, src76_r;
3415 v16i8 src21_r, src43_r, src65_r, src87_r;
3416 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3418 v8i16 filter_vec, const_vec;
3419 v4i32 weight_vec, offset_vec, rnd_vec;
3420 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3421 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3425 const_vec = __msa_ldi_h(128);
3427 weight = weight & 0x0000FFFF;
3429 weight_vec = __msa_fill_w(weight);
3430 offset_vec = __msa_fill_w(offset);
3431 rnd_vec = __msa_fill_w(rnd_val);
3433 filter_vec = LD_SH(filter);
3434 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3436 LD_SB3(src, src_stride, src0, src1, src2);
3437 src += (3 * src_stride);
3438 XORI_B3_128_SB(src0, src1, src2);
3439 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3441 LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
3442 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3443 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3444 src32_r, src43_r, src54_r, src65_r);
3445 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3448 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3450 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3452 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, tmp2, tmp2);
3454 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, tmp3, tmp3);
3456 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, tmp4, tmp4);
3458 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, tmp5, tmp5);
3459 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3460 weight_vec, offset_vec, rnd_vec,
3461 dst0_r, dst1_r, dst2_r, dst3_r,
3462 dst0_l, dst1_l, dst2_l, dst3_l);
3463 HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
3464 dst4_r, dst5_r, dst4_l, dst5_l);
3466 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
3467 dst2_l, dst2_r, dst3_l, dst3_r,
3468 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
3469 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3470 dst += (4 * dst_stride);
3471 ST8x2_UB(dst2_r, dst, dst_stride);
/* HEVC uni-weighted 4-tap vertical filter, 8-wide blocks with height a
 * multiple of 4.  Same rolling-context structure as the 6w kernel. */
3474 static void hevc_vt_uniwgt_4t_8x4multiple_msa(uint8_t *src,
3478 const int8_t *filter,
3485 v16i8 src0, src1, src2, src3, src4;
3486 v16i8 src10_r, src32_r, src21_r, src43_r;
3487 v8i16 tmp0, tmp1, tmp2, tmp3;
3489 v8i16 filter_vec, const_vec;
3490 v4i32 weight_vec, offset_vec, rnd_vec;
3491 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3495 const_vec = __msa_ldi_h(128);
3497 weight = weight & 0x0000FFFF;
3499 weight_vec = __msa_fill_w(weight);
3500 offset_vec = __msa_fill_w(offset);
3501 rnd_vec = __msa_fill_w(rnd_val);
3503 filter_vec = LD_SH(filter);
3504 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3506 LD_SB3(src, src_stride, src0, src1, src2);
3507 src += (3 * src_stride);
3508 XORI_B3_128_SB(src0, src1, src2);
3509 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3511 for (loop_cnt = (height >> 2); loop_cnt--;) {
3512 LD_SB2(src, src_stride, src3, src4);
3513 src += (2 * src_stride);
3514 XORI_B2_128_SB(src3, src4);
3515 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3518 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3520 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
/* roll the context: src1/src2 become the newest two rows */
3522 LD_SB2(src, src_stride, src1, src2);
3523 src += (2 * src_stride);
3524 XORI_B2_128_SB(src1, src2);
3525 ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
3528 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3530 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3531 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3532 weight_vec, offset_vec, rnd_vec,
3533 dst0_r, dst1_r, dst2_r, dst3_r,
3534 dst0_l, dst1_l, dst2_l, dst3_l);
3536 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
3537 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
3538 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3539 dst += (4 * dst_stride);
/* Dispatcher for 8-pixel-wide uni-weighted vertical filtering:
 * picks the 8x2, 8x6 or 8x(4 multiple) kernel based on height. */
3543 static void hevc_vt_uniwgt_4t_8w_msa(uint8_t *src,
3547 const int8_t *filter,
3554 hevc_vt_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
3555 filter, height, weight, offset, rnd_val);
3556 } else if (6 == height) {
3557 hevc_vt_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
3558 filter, height, weight, offset, rnd_val);
3560 hevc_vt_uniwgt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
3561 filter, height, weight, offset,
/* HEVC uni-weighted 4-tap vertical filter, 12-wide blocks.
 * The left 8 columns use the right-interleaved (ILVR_B) row pairs; the
 * extra 4 columns use the left-interleaved (ILVL_B) halves packed two
 * row-pairs per vector (src2110/src4332), giving 6 dot products per
 * 4-row iteration. */
3566 static void hevc_vt_uniwgt_4t_12w_msa(uint8_t *src,
3570 const int8_t *filter,
3577 v16i8 src0, src1, src2, src3, src4, src5;
3578 v16i8 src10_r, src32_r, src21_r, src43_r;
3579 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3580 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3581 v16i8 src2110, src4332;
3583 v8i16 filter_vec, const_vec;
3584 v4i32 weight_vec, offset_vec, rnd_vec;
3585 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3586 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
/* back up one row so the first tap window is centred correctly */
3588 src -= (1 * src_stride);
3590 const_vec = __msa_ldi_h(128);
3592 weight = weight & 0x0000FFFF;
3594 weight_vec = __msa_fill_w(weight);
3595 offset_vec = __msa_fill_w(offset);
3596 rnd_vec = __msa_fill_w(rnd_val);
3598 filter_vec = LD_SH(filter);
3599 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3601 LD_SB3(src, src_stride, src0, src1, src2);
3602 src += (3 * src_stride);
3603 XORI_B3_128_SB(src0, src1, src2);
3604 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3605 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3606 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3608 for (loop_cnt = (height >> 2); loop_cnt--;) {
3609 LD_SB2(src, src_stride, src3, src4);
3610 src += (2 * src_stride);
3611 XORI_B2_128_SB(src3, src4);
3612 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3613 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3614 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3617 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3619 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3621 DPADD_SB2_SH(src2110, src4332, filt0, filt1, tmp4, tmp4);
/* next two rows; src2 becomes the carry row for the next iteration */
3623 LD_SB2(src, src_stride, src5, src2);
3624 src += (2 * src_stride);
3625 XORI_B2_128_SB(src5, src2);
3626 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3627 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3628 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3631 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp2, tmp2);
3633 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp3, tmp3);
3635 DPADD_SB2_SH(src4332, src2110, filt0, filt1, tmp5, tmp5);
3636 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3637 weight_vec, offset_vec, rnd_vec,
3638 dst0_r, dst1_r, dst2_r, dst3_r,
3639 dst0_l, dst1_l, dst2_l, dst3_l);
3640 HEVC_UNIW_RND_CLIP2(tmp4, tmp5, weight_vec, offset_vec, rnd_vec,
3641 dst4_r, dst5_r, dst4_l, dst5_l);
3643 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
3644 dst2_l, dst2_r, dst3_l, dst3_r,
3645 dst4_l, dst4_r, dst5_l, dst5_r,
3646 dst0_r, dst1_r, dst2_r);
3647 ST12x4_UB(dst0_r, dst1_r, dst2_r, dst, dst_stride);
3648 dst += (4 * dst_stride);
/* Vertical 4-tap uni-weighted filter for 16-wide blocks, 4 rows per loop
 * iteration (processed as two 2-row halves).  Right/left byte interleaves
 * (srcNM_r / srcNM_l) cover the low/high 8 columns respectively.
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
3652 static void hevc_vt_uniwgt_4t_16w_msa(uint8_t *src,
3656 const int8_t *filter,
3663 v16i8 src0, src1, src2, src3, src4, src5;
3664 v16i8 src10_r, src32_r, src21_r, src43_r;
3665 v16i8 src10_l, src32_l, src21_l, src43_l;
3666 v8i16 tmp0, tmp1, tmp2, tmp3;
3668 v8i16 filter_vec, const_vec;
3669 v4i32 weight_vec, offset_vec, rnd_vec;
3670 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
3674 const_vec = __msa_ldi_h(128);
/* Keep only the low 16 bits of the weight before splatting. */
3676 weight = weight & 0x0000FFFF;
3678 weight_vec = __msa_fill_w(weight);
3679 offset_vec = __msa_fill_w(offset);
3680 rnd_vec = __msa_fill_w(rnd_val);
3682 filter_vec = LD_SH(filter);
3683 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue: first 3 rows, signed conversion, initial interleaves. */
3685 LD_SB3(src, src_stride, src0, src1, src2);
3686 src += (3 * src_stride);
3687 XORI_B3_128_SB(src0, src1, src2);
3688 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3689 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3691 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Rows 0-1 of this iteration. */
3692 LD_SB2(src, src_stride, src3, src4);
3693 src += (2 * src_stride);
3694 XORI_B2_128_SB(src3, src4);
3695 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3696 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3699 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3701 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3703 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp2, tmp2);
3705 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp3, tmp3);
3706 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3707 weight_vec, offset_vec, rnd_vec,
3708 dst0_r, dst1_r, dst2_r, dst3_r,
3709 dst0_l, dst1_l, dst2_l, dst3_l);
3711 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
3712 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3713 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3714 dst += (2 * dst_stride);
/* Rows 2-3; src2 is overwritten with the last row so the interleave state
 * rolls over into the next iteration. */
3716 LD_SB2(src, src_stride, src5, src2);
3717 src += (2 * src_stride);
3718 XORI_B2_128_SB(src5, src2);
3719 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3720 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3723 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
3725 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
3727 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp2, tmp2);
3729 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp3, tmp3);
3730 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp2, tmp3,
3731 weight_vec, offset_vec, rnd_vec,
3732 dst0_r, dst1_r, dst2_r, dst3_r,
3733 dst0_l, dst1_l, dst2_l, dst3_l);
3735 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
3736 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3737 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3738 dst += (2 * dst_stride);
/* Vertical 4-tap uni-weighted filter for 24-wide blocks: columns 0..15 are
 * handled like the 16-wide kernel (right+left interleaves), and the extra
 * 8 columns (src + 16) use a second register set (src6..src11,
 * src76_r/src87_r/...) filtered with right interleaves only.
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
3742 static void hevc_vt_uniwgt_4t_24w_msa(uint8_t *src,
3746 const int8_t *filter,
3753 v16i8 src0, src1, src2, src3, src4, src5;
3754 v16i8 src6, src7, src8, src9, src10, src11;
3755 v16i8 src10_r, src32_r, src76_r, src98_r;
3756 v16i8 src21_r, src43_r, src87_r, src109_r;
3757 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3758 v16i8 src10_l, src32_l, src21_l, src43_l;
3760 v8i16 filter_vec, const_vec;
3761 v4i32 weight_vec, offset_vec, rnd_vec;
3762 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3763 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l;
3767 const_vec = __msa_ldi_h(128);
/* Keep only the low 16 bits of the weight before splatting. */
3769 weight = weight & 0x0000FFFF;
3771 weight_vec = __msa_fill_w(weight);
3772 offset_vec = __msa_fill_w(offset);
3773 rnd_vec = __msa_fill_w(rnd_val);
3775 filter_vec = LD_SH(filter);
3776 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue for columns 0..15. */
3778 LD_SB3(src, src_stride, src0, src1, src2);
3779 XORI_B3_128_SB(src0, src1, src2);
3780 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3781 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* Prologue for the extra columns 16..23. */
3783 LD_SB3(src + 16, src_stride, src6, src7, src8);
3784 src += (3 * src_stride);
3785 XORI_B3_128_SB(src6, src7, src8);
3786 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3788 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Rows 0-1: load both halves of the 24-pixel row pair. */
3789 LD_SB2(src, src_stride, src3, src4);
3790 XORI_B2_128_SB(src3, src4);
3791 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3792 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3793 LD_SB2(src + 16, src_stride, src9, src10);
3794 src += (2 * src_stride);
3795 XORI_B2_128_SB(src9, src10);
3796 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3799 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3801 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
3803 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3805 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
3807 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
3809 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
3811 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
3812 weight_vec, offset_vec, rnd_vec,
3813 dst0_r, dst1_r, dst2_r, dst3_r,
3814 dst0_l, dst1_l, dst2_l, dst3_l);
3815 HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
3816 dst4_r, dst5_r, dst4_l, dst5_l);
/* Store 16 + 8 pixels for two rows. */
3818 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
3819 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3820 HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
3821 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3822 ST8x2_UB(dst4_r, dst + 16, dst_stride);
3823 dst += (2 * dst_stride);
/* Rows 2-3; src2 and src8 are overwritten with the newest rows so the
 * prologue state rolls over into the next iteration. */
3825 LD_SB2(src, src_stride, src5, src2);
3826 XORI_B2_128_SB(src5, src2);
3827 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3828 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3829 LD_SB2(src + 16, src_stride, src11, src8);
3830 src += (2 * src_stride);
3831 XORI_B2_128_SB(src11, src8);
3832 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3835 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, tmp0, tmp0);
3837 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, tmp4, tmp4);
3839 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, tmp1, tmp1);
3841 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, tmp5, tmp5);
3843 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, tmp2, tmp2);
3845 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, tmp3, tmp3);
3847 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
3848 weight_vec, offset_vec, rnd_vec,
3849 dst0_r, dst1_r, dst2_r, dst3_r,
3850 dst0_l, dst1_l, dst2_l, dst3_l);
3851 HEVC_UNIW_RND_CLIP2(tmp2, tmp3, weight_vec, offset_vec, rnd_vec,
3852 dst4_r, dst5_r, dst4_l, dst5_l);
3854 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
3855 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3856 HEVC_PCK_SW_SB4(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r);
3857 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3858 ST8x2_UB(dst4_r, dst + 16, dst_stride);
3859 dst += (2 * dst_stride);
/* Vertical 4-tap uni-weighted filter for 32-wide blocks, 2 rows per loop
 * iteration: the left 16 columns go to dst, the right 16 columns (src + 16)
 * to dst_tmp = dst + 16, each with its own register set and full
 * right+left interleaves.
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
3863 static void hevc_vt_uniwgt_4t_32w_msa(uint8_t *src,
3867 const int8_t *filter,
3874 uint8_t *dst_tmp = dst + 16;
3875 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3876 v16i8 src10_r, src32_r, src76_r, src98_r;
3877 v16i8 src21_r, src43_r, src87_r, src109_r;
3878 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3879 v16i8 src10_l, src32_l, src76_l, src98_l;
3880 v16i8 src21_l, src43_l, src87_l, src109_l;
3882 v8i16 filter_vec, const_vec;
3883 v4i32 weight_vec, offset_vec, rnd_vec;
3884 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3885 v4i32 dst0_l, dst1_l, dst2_l, dst3_l, dst4_l, dst5_l, dst6_l, dst7_l;
3889 const_vec = __msa_ldi_h(128);
/* Keep only the low 16 bits of the weight before splatting. */
3891 weight = weight & 0x0000FFFF;
3893 weight_vec = __msa_fill_w(weight);
3894 offset_vec = __msa_fill_w(offset);
3895 rnd_vec = __msa_fill_w(rnd_val);
3897 filter_vec = LD_SH(filter);
3898 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue for columns 0..15. */
3900 LD_SB3(src, src_stride, src0, src1, src2);
3901 XORI_B3_128_SB(src0, src1, src2);
3902 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3903 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* Prologue for columns 16..31. */
3905 LD_SB3(src + 16, src_stride, src6, src7, src8);
3906 src += (3 * src_stride);
3907 XORI_B3_128_SB(src6, src7, src8);
3908 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3909 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3911 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* Left 16 columns for the two rows. */
3912 LD_SB2(src, src_stride, src3, src4);
3913 XORI_B2_128_SB(src3, src4);
3914 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3915 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3918 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, tmp0, tmp0);
3920 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, tmp4, tmp4);
3922 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, tmp1, tmp1);
3924 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, tmp5, tmp5);
3926 HEVC_UNIW_RND_CLIP4(tmp0, tmp1, tmp4, tmp5,
3927 weight_vec, offset_vec, rnd_vec,
3928 dst0_r, dst1_r, dst2_r, dst3_r,
3929 dst0_l, dst1_l, dst2_l, dst3_l);
3930 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst2_l, dst2_r,
3931 dst1_l, dst1_r, dst3_l, dst3_r, dst0_r, dst1_r);
3932 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3933 dst += (2 * dst_stride);
/* Right 16 columns for the same two rows, stored through dst_tmp. */
3941 LD_SB2(src + 16, src_stride, src9, src10);
3942 src += (2 * src_stride);
3943 XORI_B2_128_SB(src9, src10);
3944 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3945 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3948 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, tmp2, tmp2);
3950 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, tmp6, tmp6);
3952 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, tmp3, tmp3);
3954 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, tmp7, tmp7);
3956 HEVC_UNIW_RND_CLIP4(tmp2, tmp3, tmp6, tmp7,
3957 weight_vec, offset_vec, rnd_vec,
3958 dst4_r, dst5_r, dst6_r, dst7_r,
3959 dst4_l, dst5_l, dst6_l, dst7_l);
3961 HEVC_PCK_SW_SB8(dst4_l, dst4_r, dst6_l, dst6_r,
3962 dst5_l, dst5_r, dst7_l, dst7_r, dst4_r, dst5_r);
3963 ST_SW2(dst4_r, dst5_r, dst_tmp, dst_stride);
3964 dst_tmp += (2 * dst_stride);
/* 2-D (horizontal + vertical) 4-tap uni-weighted filter for a 4x2 block.
 * Horizontal pass: byte shuffles (mask0/mask1) + dot products produce
 * 16-bit intermediates dst0..dst4; vertical pass: HEVC_FILT_4TAP over the
 * interleaved intermediates yields 32-bit results, which are then
 * weighted, rounded, offset and clipped to 0..255.
 * NOTE(review): interior lines (parameters, const_vec-based tmp
 * initialization, braces) are missing from this extract. */
3974 static void hevc_hv_uniwgt_4t_4x2_msa(uint8_t *src,
3978 const int8_t *filter_x,
3979 const int8_t *filter_y,
3985 v16i8 src0, src1, src2, src3, src4;
3987 v4i32 filt_h0, filt_h1;
3988 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3990 v8i16 filter_vec, const_vec;
3991 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3992 v8i16 dst0, dst1, dst2, dst3, dst4;
3993 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3994 v4i32 dst0_r, dst1_r;
3995 v4i32 weight_vec, offset_vec, rnd_vec;
/* Center the 4-tap window: 1 row up, 1 column left. */
3997 src -= (src_stride + 1);
3999 filter_vec = LD_SH(filter_x);
4000 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend the y filter taps to 32 bits before splatting. */
4002 filter_vec = LD_SH(filter_y);
4003 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4004 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4006 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4010 const_vec = __msa_ldi_h(128);
4013 weight_vec = __msa_fill_w(weight);
4014 offset_vec = __msa_fill_w(offset);
4015 rnd_vec = __msa_fill_w(rnd_val);
/* Horizontal pass over the 3 prologue rows. */
4017 LD_SB3(src, src_stride, src0, src1, src2);
4018 src += (3 * src_stride);
4019 XORI_B3_128_SB(src0, src1, src2);
4021 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4022 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4023 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4025 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4027 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4029 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4031 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* Rows 3-4: horizontal pass then vertical 4-tap per output row. */
4032 LD_SB2(src, src_stride, src3, src4);
4033 XORI_B2_128_SB(src3, src4);
4036 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4038 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4040 dst32_r = __msa_ilvr_h(dst3, dst2);
4041 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4044 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4046 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4048 dst43_r = __msa_ilvr_h(dst4, dst3);
4049 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
/* Weight, round, offset, clip, pack, store 4x2. */
4052 MUL2(dst0_r, weight_vec, dst1_r, weight_vec, dst0_r, dst1_r);
4053 SRAR_W2_SW(dst0_r, dst1_r, rnd_vec);
4054 ADD2(dst0_r, offset_vec, dst1_r, offset_vec, dst0_r, dst1_r);
4055 dst0_r = CLIP_SW_0_255(dst0_r);
4056 dst1_r = CLIP_SW_0_255(dst1_r);
4058 HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
4059 ST4x2_UB(dst0_r, dst, dst_stride);
/* 2-D 4-tap uni-weighted filter for a 4x4 block: same structure as the
 * 4x2 kernel but computes four output rows (dst0_r..dst3_r) and applies
 * HEVC_HV_UNIW_RND_CLIP4 in one shot.
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
4062 static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
4066 const int8_t *filter_x,
4067 const int8_t *filter_y,
4073 v16i8 src0, src1, src2, src3, src4, src5, src6;
4075 v4i32 filt_h0, filt_h1;
4076 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4078 v8i16 filter_vec, const_vec;
4079 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4080 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4081 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
4082 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4083 v4i32 weight_vec, offset_vec, rnd_vec;
/* Center the 4-tap window: 1 row up, 1 column left. */
4085 src -= (src_stride + 1);
4087 filter_vec = LD_SH(filter_x);
4088 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend the y filter taps to 32 bits before splatting. */
4090 filter_vec = LD_SH(filter_y);
4091 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4092 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4094 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4098 const_vec = __msa_ldi_h(128);
4101 weight_vec = __msa_fill_w(weight);
4102 offset_vec = __msa_fill_w(offset);
4103 rnd_vec = __msa_fill_w(rnd_val);
/* Horizontal pass over the 3 prologue rows. */
4105 LD_SB3(src, src_stride, src0, src1, src2);
4106 src += (3 * src_stride);
4107 XORI_B3_128_SB(src0, src1, src2);
4109 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4110 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4111 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4113 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4115 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4117 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4119 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* Rows 3-6: one horizontal pass + one vertical 4-tap per output row.
 * Note dst10_r/dst21_r/dst2 are reused as rolling state for rows 5-6. */
4121 LD_SB4(src, src_stride, src3, src4, src5, src6);
4122 XORI_B4_128_SB(src3, src4, src5, src6);
4125 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4127 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4128 dst32_r = __msa_ilvr_h(dst3, dst2);
4129 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4133 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4135 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4136 dst43_r = __msa_ilvr_h(dst4, dst3);
4137 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4141 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4143 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4144 dst10_r = __msa_ilvr_h(dst5, dst4);
4145 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4149 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4151 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4152 dst21_r = __msa_ilvr_h(dst2, dst5);
4153 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
/* Weight/round/offset/clip all 4 rows, pack and store 4x4. */
4156 HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
4157 weight_vec, offset_vec, rnd_vec,
4158 dst0_r, dst1_r, dst2_r, dst3_r);
4159 HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
4160 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
/* 2-D 4-tap uni-weighted filter for 4-wide blocks whose height is a
 * multiple of 8: processes 8 rows per loop iteration, producing
 * dst0_r..dst7_r and storing them as two 4x4 tiles.
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
4163 static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
4167 const int8_t *filter_x,
4168 const int8_t *filter_y,
4175 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4177 v4i32 filt_h0, filt_h1;
4178 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4180 v8i16 filter_vec, const_vec;
4181 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4182 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
4183 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
4184 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
4185 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
4186 v4i32 weight_vec, offset_vec, rnd_vec;
/* Center the 4-tap window: 1 row up, 1 column left. */
4188 src -= (src_stride + 1);
4190 filter_vec = LD_SH(filter_x);
4191 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend the y filter taps to 32 bits before splatting. */
4193 filter_vec = LD_SH(filter_y);
4194 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4195 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4197 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4201 const_vec = __msa_ldi_h(128);
4204 weight_vec = __msa_fill_w(weight);
4205 offset_vec = __msa_fill_w(offset);
4206 rnd_vec = __msa_fill_w(rnd_val);
/* Horizontal pass over the 3 prologue rows. */
4208 LD_SB3(src, src_stride, src0, src1, src2);
4209 src += (3 * src_stride);
4210 XORI_B3_128_SB(src0, src1, src2);
4212 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4213 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4214 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4216 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4218 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4220 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4221 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
4223 for (loop_cnt = height >> 3; loop_cnt--;) {
/* 8 new rows; each gets a horizontal pass, then a vertical 4-tap over a
 * sliding window of interleaved intermediates. */
4224 LD_SB8(src, src_stride,
4225 src3, src4, src5, src6, src7, src8, src9, src10);
4226 src += (8 * src_stride);
4227 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4229 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4231 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4232 dst32_r = __msa_ilvr_h(dst3, dst2);
4233 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4236 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4238 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4239 dst43_r = __msa_ilvr_h(dst4, dst3);
4240 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4243 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4245 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4246 dst54_r = __msa_ilvr_h(dst5, dst4);
4247 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4250 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4252 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
4253 dst65_r = __msa_ilvr_h(dst6, dst5);
4254 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4257 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4259 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
4260 dst76_r = __msa_ilvr_h(dst7, dst6);
4261 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4264 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4266 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
4267 dst87_r = __msa_ilvr_h(dst8, dst7);
4268 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
/* dst10_r/dst21_r/dst2 are overwritten here so the rolling state carries
 * into the next 8-row iteration. */
4271 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
4273 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
4274 dst10_r = __msa_ilvr_h(dst9, dst8);
4275 dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
4278 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
4280 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4281 dst21_r = __msa_ilvr_h(dst2, dst9);
4282 dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
/* Weight/round/offset/clip and store the two 4x4 tiles. */
4285 HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst2_r, dst3_r,
4286 weight_vec, offset_vec, rnd_vec,
4287 dst0_r, dst1_r, dst2_r, dst3_r);
4288 HEVC_PCK_SW_SB4(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r);
4289 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
4290 dst += (4 * dst_stride);
4292 HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst6_r, dst7_r,
4293 weight_vec, offset_vec, rnd_vec,
4294 dst4_r, dst5_r, dst6_r, dst7_r);
4295 HEVC_PCK_SW_SB4(dst5_r, dst4_r, dst7_r, dst6_r, dst0_r);
4296 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
4297 dst += (4 * dst_stride);
/* 2-D 4-tap uni-weighted filter, block width 4: dispatcher selecting the
 * 4x2, 4x4, or multiple-of-8 kernel by height.
 * NOTE(review): parameter lines and some braces are not visible in this
 * extract; interface assumed to match the kernels it forwards to. */
4301 static void hevc_hv_uniwgt_4t_4w_msa(uint8_t *src,
4305 const int8_t *filter_x,
4306 const int8_t *filter_y,
/* height == 2 */
4313 hevc_hv_uniwgt_4t_4x2_msa(src, src_stride, dst, dst_stride,
4314 filter_x, filter_y, height, weight,
4316 } else if (4 == height) {
4317 hevc_hv_uniwgt_4t_4x4_msa(src, src_stride, dst, dst_stride,
4318 filter_x, filter_y, height, weight,
4320 } else if (0 == (height % 8)) {
4321 hevc_hv_uniwgt_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
4322 filter_x, filter_y, height, weight,
/* 2-D 4-tap uni-weighted filter for 6-wide blocks, 4 rows per loop
 * iteration.  Like the 8-wide kernels it computes both right and left
 * interleave halves, but stores only 6 of the 8 pixels per row
 * (ST6x4_UB).
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
4327 static void hevc_hv_uniwgt_4t_6w_msa(uint8_t *src,
4331 const int8_t *filter_x,
4332 const int8_t *filter_y,
4339 v16i8 src0, src1, src2, src3, src4, src5, src6;
4341 v4i32 filt_h0, filt_h1;
4342 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4344 v8i16 filter_vec, const_vec;
4345 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4346 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4347 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4348 v4i32 weight_vec, offset_vec, rnd_vec;
4349 v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
4350 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4351 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* Center the 4-tap window: 1 row up, 1 column left. */
4353 src -= (src_stride + 1);
4355 filter_vec = LD_SH(filter_x);
4356 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend the y filter taps to 32 bits before splatting. */
4358 filter_vec = LD_SH(filter_y);
4359 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4360 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4362 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4366 const_vec = __msa_ldi_h(128);
4369 weight_vec = __msa_fill_w(weight);
4370 offset_vec = __msa_fill_w(offset);
4371 rnd_vec = __msa_fill_w(rnd_val);
/* Horizontal pass over the 3 prologue rows. */
4373 LD_SB3(src, src_stride, src0, src1, src2);
4374 src += (3 * src_stride);
4375 XORI_B3_128_SB(src0, src1, src2);
4377 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4378 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4379 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4381 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4383 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4385 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4387 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4388 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4390 for (loop_cnt = height >> 2; loop_cnt--;) {
4391 LD_SB4(src, src_stride, src3, src4, src5, src6);
4392 src += (4 * src_stride);
4393 XORI_B4_128_SB(src3, src4, src5, src6);
/* Per row: horizontal pass, interleave with the previous intermediate,
 * then vertical 4-tap on both halves. */
4396 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4398 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4399 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4400 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4401 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4406 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4408 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4409 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4410 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4411 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* dst10_*, dst21_* and dst2 are overwritten below to roll state forward
 * into the next iteration. */
4416 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4418 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4419 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
4420 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4421 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
4426 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4428 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4429 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
4430 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
4431 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
/* Weight/round/offset/clip, pack, and store 4 rows of 6 pixels. */
4435 HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
4436 weight_vec, offset_vec, rnd_vec,
4437 dst0_r, dst1_r, dst0_l, dst1_l);
4438 HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
4439 weight_vec, offset_vec, rnd_vec,
4440 dst2_r, dst3_r, dst2_l, dst3_l);
4441 HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
4442 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4443 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
4444 dst += (4 * dst_stride);
/* 2-D 4-tap uni-weighted filter for an 8x2 block: full right+left
 * interleave halves per row, two output rows, no loop.
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
4448 static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
4452 const int8_t *filter_x,
4453 const int8_t *filter_y,
4459 v16i8 src0, src1, src2, src3, src4;
4461 v4i32 filt_h0, filt_h1;
4462 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4464 v8i16 filter_vec, const_vec;
4465 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4466 v8i16 dst0, dst1, dst2, dst3, dst4;
4467 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4468 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4469 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4470 v4i32 weight_vec, offset_vec, rnd_vec;
/* Center the 4-tap window: 1 row up, 1 column left. */
4472 src -= (src_stride + 1);
4474 filter_vec = LD_SH(filter_x);
4475 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend the y filter taps to 32 bits before splatting. */
4477 filter_vec = LD_SH(filter_y);
4478 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4479 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4481 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4485 const_vec = __msa_ldi_h(128);
4488 weight_vec = __msa_fill_w(weight);
4489 offset_vec = __msa_fill_w(offset);
4490 rnd_vec = __msa_fill_w(rnd_val);
/* Horizontal pass over the 3 prologue rows. */
4492 LD_SB3(src, src_stride, src0, src1, src2);
4493 src += (3 * src_stride);
4494 XORI_B3_128_SB(src0, src1, src2);
4496 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4497 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4498 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4500 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4502 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4504 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4506 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4507 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* Rows 3-4: horizontal then vertical pass on both halves. */
4509 LD_SB2(src, src_stride, src3, src4);
4510 src += (2 * src_stride);
4511 XORI_B2_128_SB(src3, src4);
4513 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4515 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4516 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4517 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4518 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4522 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4524 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4525 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4526 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4527 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* Weight/round/offset/clip, pack, store 8x2. */
4531 HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
4532 weight_vec, offset_vec, rnd_vec,
4533 dst0_r, dst1_r, dst0_l, dst1_l);
4534 HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
4535 ST8x2_UB(dst0_r, dst, dst_stride);
4536 dst += (2 * dst_stride);
/* 2-D 4-tap uni-weighted filter for an 8x6 block: fully unrolled — six
 * output rows (dst0..dst5 pairs), processed two source rows at a time,
 * then three weight/clip passes and a 4-row + 2-row store.
 * NOTE(review): interior lines (parameters, tmp initialization, braces)
 * are missing from this extract. */
4539 static void hevc_hv_uniwgt_4t_8x6_msa(uint8_t *src,
4543 const int8_t *filter_x,
4544 const int8_t *filter_y,
4550 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4552 v4i32 filt_h0, filt_h1;
4553 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4555 v8i16 filter_vec, const_vec;
4556 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4557 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4558 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4559 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4560 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4561 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4562 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4563 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4564 v4i32 weight_vec, offset_vec, rnd_vec;
/* Center the 4-tap window: 1 row up, 1 column left. */
4566 src -= (src_stride + 1);
4568 filter_vec = LD_SH(filter_x);
4569 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend the y filter taps to 32 bits before splatting. */
4571 filter_vec = LD_SH(filter_y);
4572 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4573 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4575 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4579 const_vec = __msa_ldi_h(128);
4582 weight_vec = __msa_fill_w(weight);
4583 offset_vec = __msa_fill_w(offset);
4584 rnd_vec = __msa_fill_w(rnd_val);
/* Horizontal pass over the 3 prologue rows. */
4586 LD_SB3(src, src_stride, src0, src1, src2);
4587 src += (3 * src_stride);
4589 XORI_B3_128_SB(src0, src1, src2);
4591 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4592 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4593 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4595 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4597 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4599 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4601 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4602 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* Output rows 0-1 (source rows 3-4). */
4604 LD_SB2(src, src_stride, src3, src4);
4605 src += (2 * src_stride);
4606 XORI_B2_128_SB(src3, src4);
4609 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4611 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4612 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4613 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4614 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4619 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4621 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4622 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4623 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4624 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* Output rows 2-3 (source rows 5-6). */
4628 LD_SB2(src, src_stride, src5, src6);
4629 src += (2 * src_stride);
4630 XORI_B2_128_SB(src5, src6);
4633 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4635 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4636 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4637 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4638 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4643 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4645 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
4646 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4647 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4648 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* Output rows 4-5 (source rows 7-8). */
4652 LD_SB2(src, src_stride, src7, src8);
4653 src += (2 * src_stride);
4654 XORI_B2_128_SB(src7, src8);
4657 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4659 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
4660 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4661 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4662 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4668 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4670 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
4671 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
4672 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4673 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
/* Weight/round/offset/clip all six rows, pack, store 8x4 then 8x2. */
4677 HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
4678 weight_vec, offset_vec, rnd_vec,
4679 dst0_r, dst1_r, dst0_l, dst1_l);
4680 HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
4681 weight_vec, offset_vec, rnd_vec,
4682 dst2_r, dst3_r, dst2_l, dst3_l);
4683 HEVC_HV_UNIW_RND_CLIP4(dst4_r, dst5_r, dst4_l, dst5_l,
4684 weight_vec, offset_vec, rnd_vec,
4685 dst4_r, dst5_r, dst4_l, dst5_l);
4686 HEVC_PCK_SW_SB12(dst0_l, dst0_r, dst1_l, dst1_r,
4687 dst2_l, dst2_r, dst3_l, dst3_r,
4688 dst4_l, dst4_r, dst5_l, dst5_r, dst0_r, dst1_r, dst2_r);
4689 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
4690 dst += (4 * dst_stride);
4691 ST8x2_UB(dst2_r, dst, dst_stride);
/*
 * hevc_hv_uniwgt_4t_8multx4mult_msa
 * 4-tap horizontal + 4-tap vertical (hv) HEVC interpolation with
 * uni-directional weighted prediction, for block widths that are a
 * multiple of 8 columns; the inner loop emits 4 output rows per pass.
 *
 * NOTE(review): this extract is garbled — original source line numbers
 * are fused onto every line and many lines (remaining parameters, the
 * src_tmp/dst_tmp setup, the `dstN = const_vec;` accumulator presets,
 * closing braces) appear to have been dropped by extraction.  Code is
 * left byte-identical to the extract; restore from upstream FFmpeg
 * before compiling.
 */
4694 static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
4698                                               const int8_t *filter_x,
4699                                               const int8_t *filter_y,
     /* cnt walks 8-column stripes, loop_cnt walks 4-row groups */
4706     uint32_t loop_cnt, cnt;
4709     v16i8 src0, src1, src2, src3, src4, src5, src6;
4711     v4i32 filt_h0, filt_h1;
     /* byte-shuffle pattern pairing adjacent pixels for the horizontal
        4-tap dot products */
4712     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4714     v8i16 filter_vec, const_vec;
4715     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4716     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4717     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4718     v4i32 weight_vec, offset_vec, rnd_vec;
4719     v4i32 dst2_r, dst2_l, dst3_r, dst3_l;
     /* dstXY_r/_l: right/left (low/high) halves of interleaved
        intermediate row pairs consumed by the vertical filter */
4720     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4721     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
     /* step back one row and one column: the 4-tap filter needs one
        sample of context before the block origin */
4723     src -= (src_stride + 1);
     /* splat the two 16-bit horizontal coefficient pairs */
4725     filter_vec = LD_SH(filter_x);
4726     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* sign-extend the 8-bit vertical coefficients to 16 bits (clti
        builds the sign mask, ilvr interleaves it in), then splat the
        two coefficient word pairs */
4728     filter_vec = LD_SH(filter_y);
4729     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4730     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4732     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
     /* bias constant; presumably shifted (`const_vec <<= 6;`) on a line
        missing from this extract — TODO confirm against upstream */
4736     const_vec = __msa_ldi_h(128);
4739     weight_vec = __msa_fill_w(weight);
4740     offset_vec = __msa_fill_w(offset);
4741     rnd_vec = __msa_fill_w(rnd_val);
     /* one iteration per 8-column stripe */
4743     for (cnt = width >> 3; cnt--;) {
         /* prologue: horizontally filter the first 3 rows and build the
            interleaved row pairs needed by the vertical filter */
4747         LD_SB3(src_tmp, src_stride, src0, src1, src2);
4748         src_tmp += (3 * src_stride);
4749         XORI_B3_128_SB(src0, src1, src2);
4751         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4752         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4753         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4755         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4757         DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4759         DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4761         ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4762         ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
         /* 4 output rows per iteration */
4764         for (loop_cnt = height >> 2; loop_cnt--;) {
4765             LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
4766             src_tmp += (4 * src_stride);
4767             XORI_B4_128_SB(src3, src4, src5, src6);
             /* row 0: horizontal pass on src3, then vertical 4-tap */
4769             VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4771             DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4772             ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4773             dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4774             dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
             /* row 1 */
4778             VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4780             DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4781             ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4782             dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4783             dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
             /* row 2 — dst10_r/_l are recycled here to carry the
                src5/src4 pair into the next iteration */
4787             VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4789             DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4790             ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
4791             dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4792             dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
             /* row 3 — dst2 and dst21_r/_l likewise recycled as the
                carried vertical-filter state for the next 4-row group */
4796             VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4798             DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4799             ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
4800             dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
4801             dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
             /* weight, round, offset, clip to [0,255], pack and store
                4 rows x 8 pixels */
4805             HEVC_HV_UNIW_RND_CLIP4(dst0_r, dst1_r, dst0_l, dst1_l,
4806                                    weight_vec, offset_vec, rnd_vec,
4807                                    dst0_r, dst1_r, dst0_l, dst1_l);
4808             HEVC_HV_UNIW_RND_CLIP4(dst2_r, dst3_r, dst2_l, dst3_l,
4809                                    weight_vec, offset_vec, rnd_vec,
4810                                    dst2_r, dst3_r, dst2_l, dst3_l);
4811             HEVC_PCK_SW_SB8(dst0_l, dst0_r, dst1_l, dst1_r,
4812                             dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r);
4813             ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
4814             dst_tmp += (4 * dst_stride);
4822 static void hevc_hv_uniwgt_4t_8w_msa(uint8_t *src,
4826 const int8_t *filter_x,
4827 const int8_t *filter_y,
4835 hevc_hv_uniwgt_4t_8x2_msa(src, src_stride, dst, dst_stride,
4836 filter_x, filter_y, height, weight,
4838 } else if (6 == height) {
4839 hevc_hv_uniwgt_4t_8x6_msa(src, src_stride, dst, dst_stride,
4840 filter_x, filter_y, height, weight,
4842 } else if (0 == (height % 4)) {
4843 hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4844 filter_x, filter_y, height, weight,
4845 offset, rnd_val, 8);
4849 static void hevc_hv_uniwgt_4t_12w_msa(uint8_t *src,
4853 const int8_t *filter_x,
4854 const int8_t *filter_y,
4860 hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4861 filter_x, filter_y, height, weight,
4862 offset, rnd_val, 8);
4863 hevc_hv_uniwgt_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
4864 filter_x, filter_y, height, weight,
4868 static void hevc_hv_uniwgt_4t_16w_msa(uint8_t *src,
4872 const int8_t *filter_x,
4873 const int8_t *filter_y,
4879 hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4880 filter_x, filter_y, height, weight,
4881 offset, rnd_val, 16);
4884 static void hevc_hv_uniwgt_4t_24w_msa(uint8_t *src,
4888 const int8_t *filter_x,
4889 const int8_t *filter_y,
4895 hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4896 filter_x, filter_y, height, weight,
4897 offset, rnd_val, 24);
4900 static void hevc_hv_uniwgt_4t_32w_msa(uint8_t *src,
4904 const int8_t *filter_x,
4905 const int8_t *filter_y,
4911 hevc_hv_uniwgt_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4912 filter_x, filter_y, height, weight,
4913 offset, rnd_val, 32);
/*
 * UNIWGT_MC_COPY(WIDTH): stamps out the public entry point
 * ff_hevc_put_hevc_uni_w_pel_pixels<WIDTH>_8_msa, which forwards to the
 * static copy kernel hevc_uniwgt_copy_<WIDTH>w_msa with the weighted-
 * prediction shift derived from `denom` (denom + 14 - 8 for 8-bit depth).
 *
 * NOTE(review): extract is garbled — several continuation lines of the
 * macro body (remaining parameters, braces) and the invocation list are
 * missing, and original line numbers are fused onto each line.  Left
 * byte-identical; restore from upstream FFmpeg before compiling.
 */
4916 #define UNIWGT_MC_COPY(WIDTH)                                                \
4917 void ff_hevc_put_hevc_uni_w_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
4918                                                       ptrdiff_t dst_stride,  \
4920                                                       ptrdiff_t src_stride,  \
4929     int shift = denom + 14 - 8;                                              \
4930     hevc_uniwgt_copy_##WIDTH##w_msa(src, src_stride, dst, dst_stride,        \
4931                                     height, weight, offset, shift);          \
4944 #undef UNIWGT_MC_COPY
/*
 * UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR): stamps out the public
 * one-directional (horizontal `h` or vertical `v`) weighted-prediction
 * entry points.  Picks the fractional-pel filter from
 * ff_hevc_<PEL>_filters using mx/my (FILT_DIR) and forwards to the
 * static hz/vt kernel with shift = denom + 14 - 8 (8-bit depth).
 *
 * NOTE(review): extract is garbled — macro-body continuation lines
 * (parameters, braces) are missing and original line numbers are fused
 * onto each line.  Left byte-identical; restore from upstream FFmpeg.
 */
4946 #define UNI_W_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                       \
4947 void ff_hevc_put_hevc_uni_w_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,       \
4961     const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
4962     int shift = denom + 14 - 8;                                              \
4964     hevc_##DIR1##_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,       \
4965                                                  dst_stride, filter, height, \
4966                                                  weight, offset, shift);     \
4969 UNI_W_MC(qpel, h, 4, 8, hz, mx);
4970 UNI_W_MC(qpel, h, 8, 8, hz, mx);
4971 UNI_W_MC(qpel, h, 12, 8, hz, mx);
4972 UNI_W_MC(qpel, h, 16, 8, hz, mx);
4973 UNI_W_MC(qpel, h, 24, 8, hz, mx);
4974 UNI_W_MC(qpel, h, 32, 8, hz, mx);
4975 UNI_W_MC(qpel, h, 48, 8, hz, mx);
4976 UNI_W_MC(qpel, h, 64, 8, hz, mx);
/* 8-tap qpel, vertical */
4978 UNI_W_MC(qpel, v, 4, 8, vt, my);
4979 UNI_W_MC(qpel, v, 8, 8, vt, my);
4980 UNI_W_MC(qpel, v, 12, 8, vt, my);
4981 UNI_W_MC(qpel, v, 16, 8, vt, my);
4982 UNI_W_MC(qpel, v, 24, 8, vt, my);
4983 UNI_W_MC(qpel, v, 32, 8, vt, my);
4984 UNI_W_MC(qpel, v, 48, 8, vt, my);
4985 UNI_W_MC(qpel, v, 64, 8, vt, my);
/* 4-tap epel, horizontal */
4987 UNI_W_MC(epel, h, 4, 4, hz, mx);
4988 UNI_W_MC(epel, h, 6, 4, hz, mx);
4989 UNI_W_MC(epel, h, 8, 4, hz, mx);
4990 UNI_W_MC(epel, h, 12, 4, hz, mx);
4991 UNI_W_MC(epel, h, 16, 4, hz, mx);
4992 UNI_W_MC(epel, h, 24, 4, hz, mx);
4993 UNI_W_MC(epel, h, 32, 4, hz, mx);
/* 4-tap epel, vertical */
4995 UNI_W_MC(epel, v, 4, 4, vt, my);
4996 UNI_W_MC(epel, v, 6, 4, vt, my);
4997 UNI_W_MC(epel, v, 8, 4, vt, my);
4998 UNI_W_MC(epel, v, 12, 4, vt, my);
4999 UNI_W_MC(epel, v, 16, 4, vt, my);
5000 UNI_W_MC(epel, v, 24, 4, vt, my);
5001 UNI_W_MC(epel, v, 32, 4, vt, my);
/*
 * UNI_W_MC_HV(PEL, WIDTH, TAP): stamps out the public 2-D (hv)
 * weighted-prediction entry points.  Selects horizontal and vertical
 * fractional-pel filters from ff_hevc_<PEL>_filters via mx/my and
 * forwards to the static hv kernel with shift = denom + 14 - 8
 * (8-bit depth).
 *
 * NOTE(review): extract is garbled — macro-body continuation lines
 * (parameters, braces) are missing and original line numbers are fused
 * onto each line.  Left byte-identical; restore from upstream FFmpeg.
 */
5005 #define UNI_W_MC_HV(PEL, WIDTH, TAP)                                          \
5006 void ff_hevc_put_hevc_uni_w_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,           \
5007                                                       ptrdiff_t dst_stride,   \
5009                                                       ptrdiff_t src_stride,   \
5018     const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];                 \
5019     const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];                 \
5020     int shift = denom + 14 - 8;                                               \
5022     hevc_hv_uniwgt_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
5023                                            filter_x, filter_y, height,        \
5024                                            weight, offset, shift);            \
5027 UNI_W_MC_HV(qpel, 4, 8);
5028 UNI_W_MC_HV(qpel, 8, 8);
5029 UNI_W_MC_HV(qpel, 12, 8);
5030 UNI_W_MC_HV(qpel, 16, 8);
5031 UNI_W_MC_HV(qpel, 24, 8);
5032 UNI_W_MC_HV(qpel, 32, 8);
5033 UNI_W_MC_HV(qpel, 48, 8);
5034 UNI_W_MC_HV(qpel, 64, 8);
/* 4-tap epel hv variants */
5036 UNI_W_MC_HV(epel, 4, 4);
5037 UNI_W_MC_HV(epel, 6, 4);
5038 UNI_W_MC_HV(epel, 8, 4);
5039 UNI_W_MC_HV(epel, 12, 4);
5040 UNI_W_MC_HV(epel, 16, 4);
5041 UNI_W_MC_HV(epel, 24, 4);
5042 UNI_W_MC_HV(epel, 32, 4);