/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
25 static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
31 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \
33 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
34 SRARI_H2_SH(out0, out1, rnd_val); \
35 CLIP_SH2_0_255(out0, out1); \
38 #define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \
39 vec0, vec1, vec2, vec3, rnd_val, \
40 out0, out1, out2, out3) \
42 HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \
43 HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
46 #define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, \
49 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
50 SRARI_H2_SH(out0, out1, rnd_val); \
51 CLIP_SH2_0_255_MAX_SATU(out0, out1); \
54 #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
55 vec3, rnd_val, out0, out1, out2, out3) \
57 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
58 HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
61 static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
69 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
70 uint64_t tpd0, tpd1, tpd2, tpd3;
71 v16i8 src0 = { 0 }, src1 = { 0 };
73 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
74 v8i16 dst0, dst1, dst2, dst3;
77 LW2(src0_ptr, src_stride, tp0, tp1);
78 INSERT_W2_SB(tp0, tp1, src0);
79 LD2(src1_ptr, src2_stride, tpd0, tpd1);
80 INSERT_D2_SH(tpd0, tpd1, in0);
82 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
85 dst0 = __msa_srari_h(dst0, 7);
86 dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
88 dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
89 ST4x2_UB(dst0, dst, dst_stride);
90 } else if (4 == height) {
91 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
92 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
93 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
94 INSERT_D2_SH(tpd0, tpd1, in0);
95 INSERT_D2_SH(tpd2, tpd3, in1);
96 ILVRL_B2_SH(zero, src0, dst0, dst1);
97 SLLI_2V(dst0, dst1, 6);
98 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
99 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
100 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
101 } else if (0 == height % 8) {
102 for (loop_cnt = (height >> 3); loop_cnt--;) {
103 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
104 src0_ptr += 4 * src_stride;
105 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
106 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
107 src0_ptr += 4 * src_stride;
108 INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
109 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
110 src1_ptr += (4 * src2_stride);
111 INSERT_D2_SH(tpd0, tpd1, in0);
112 INSERT_D2_SH(tpd2, tpd3, in1);
113 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
114 src1_ptr += (4 * src2_stride);
115 INSERT_D2_SH(tpd0, tpd1, in2);
116 INSERT_D2_SH(tpd2, tpd3, in3);
117 ILVRL_B2_SH(zero, src0, dst0, dst1);
118 ILVRL_B2_SH(zero, src1, dst2, dst3);
119 SLLI_4V(dst0, dst1, dst2, dst3, 6);
120 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
121 dst3, 7, dst0, dst1, dst2, dst3);
122 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
123 ST4x8_UB(dst0, dst1, dst, dst_stride);
124 dst += (8 * dst_stride);
129 static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
138 uint64_t tp0, tp1, tp2, tp3;
139 v16u8 out0, out1, out2, out3;
141 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
142 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
143 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
145 for (loop_cnt = (height >> 3); loop_cnt--;) {
146 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
147 src0_ptr += (4 * src_stride);
148 INSERT_D2_SB(tp0, tp1, src0);
149 INSERT_D2_SB(tp2, tp3, src1);
150 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
151 src0_ptr += (4 * src_stride);
152 INSERT_D2_SB(tp0, tp1, src2);
153 INSERT_D2_SB(tp2, tp3, src3);
154 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
155 src1_ptr += (8 * src2_stride);
156 ILVRL_B2_SH(zero, src0, dst0, dst1);
157 ILVRL_B2_SH(zero, src1, dst2, dst3);
158 ILVRL_B2_SH(zero, src2, dst4, dst5);
159 ILVRL_B2_SH(zero, src3, dst6, dst7);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
161 SLLI_4V(dst4, dst5, dst6, dst7, 6);
162 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
163 7, dst0, dst1, dst2, dst3);
164 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
165 7, dst4, dst5, dst6, dst7);
166 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
167 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
168 ST6x4_UB(out0, out1, dst, dst_stride);
169 dst += (4 * dst_stride);
170 ST6x4_UB(out2, out3, dst, dst_stride);
171 dst += (4 * dst_stride);
175 static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
183 uint64_t tp0, tp1, tp2, tp3;
184 v16u8 out0, out1, out2, out3;
185 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
187 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
188 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
191 LD2(src0_ptr, src_stride, tp0, tp1);
192 INSERT_D2_SB(tp0, tp1, src0);
193 LD_SH2(src1_ptr, src2_stride, in0, in1);
194 ILVRL_B2_SH(zero, src0, dst0, dst1);
195 SLLI_2V(dst0, dst1, 6);
196 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
197 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
198 ST8x2_UB(out0, dst, dst_stride);
199 } else if (4 == height) {
200 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
201 INSERT_D2_SB(tp0, tp1, src0);
202 INSERT_D2_SB(tp2, tp3, src1);
203 ILVRL_B2_SH(zero, src0, dst0, dst1);
204 ILVRL_B2_SH(zero, src1, dst2, dst3);
205 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
206 SLLI_4V(dst0, dst1, dst2, dst3, 6);
207 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
208 7, dst0, dst1, dst2, dst3);
209 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
210 ST8x4_UB(out0, out1, dst, dst_stride);
211 } else if (6 == height) {
212 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
213 src0_ptr += 4 * src_stride;
214 INSERT_D2_SB(tp0, tp1, src0);
215 INSERT_D2_SB(tp2, tp3, src1);
216 LD2(src0_ptr, src_stride, tp0, tp1);
217 INSERT_D2_SB(tp0, tp1, src2);
218 ILVRL_B2_SH(zero, src0, dst0, dst1);
219 ILVRL_B2_SH(zero, src1, dst2, dst3);
220 ILVRL_B2_SH(zero, src2, dst4, dst5);
221 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
222 SLLI_4V(dst0, dst1, dst2, dst3, 6);
223 SLLI_2V(dst4, dst5, 6);
224 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
225 7, dst0, dst1, dst2, dst3);
226 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
227 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
228 ST8x4_UB(out0, out1, dst, dst_stride);
229 dst += (4 * dst_stride);
230 ST8x2_UB(out2, dst, dst_stride);
231 } else if (0 == height % 8) {
234 for (loop_cnt = (height >> 3); loop_cnt--;) {
235 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
236 src0_ptr += 4 * src_stride;
237 INSERT_D2_SB(tp0, tp1, src0);
238 INSERT_D2_SB(tp2, tp3, src1);
239 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
240 src0_ptr += 4 * src_stride;
241 INSERT_D2_SB(tp0, tp1, src2);
242 INSERT_D2_SB(tp2, tp3, src3);
243 ILVRL_B2_SH(zero, src0, dst0, dst1);
244 ILVRL_B2_SH(zero, src1, dst2, dst3);
245 ILVRL_B2_SH(zero, src2, dst4, dst5);
246 ILVRL_B2_SH(zero, src3, dst6, dst7);
247 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
249 src1_ptr += (8 * src2_stride);
250 SLLI_4V(dst0, dst1, dst2, dst3, 6);
251 SLLI_4V(dst4, dst5, dst6, dst7, 6);
252 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
253 dst3, 7, dst0, dst1, dst2, dst3);
254 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
255 dst7, 7, dst4, dst5, dst6, dst7);
256 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
257 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
258 ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
259 dst += (8 * dst_stride);
264 static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
274 v16u8 out0, out1, out2;
275 v16i8 src0, src1, src2, src3;
276 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
277 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
279 for (loop_cnt = 4; loop_cnt--;) {
280 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
281 src0_ptr += (4 * src_stride);
283 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
284 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
285 src1_ptr += (4 * src2_stride);
286 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
287 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
289 SLLI_4V(dst0, dst1, dst2, dst3, 6);
290 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
291 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
292 SLLI_2V(dst4, dst5, 6);
293 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
294 7, dst0, dst1, dst2, dst3);
295 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
296 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
297 ST12x4_UB(out0, out1, out2, dst, dst_stride);
298 dst += (4 * dst_stride);
302 static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
311 v16u8 out0, out1, out2, out3;
312 v16i8 src0, src1, src2, src3;
313 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
314 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
317 for (loop_cnt = (height >> 2); loop_cnt--;) {
318 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
319 src0_ptr += (4 * src_stride);
320 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
321 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
322 src1_ptr += (4 * src2_stride);
323 ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
324 ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
325 ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
326 ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
327 SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
328 SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
329 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
330 dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
331 HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
332 dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
333 PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
334 PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
335 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
336 dst += (4 * dst_stride);
340 static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
349 v16u8 out0, out1, out2, out3, out4, out5;
350 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
351 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
352 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
354 for (loop_cnt = 8; loop_cnt--;) {
355 LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
356 LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
357 src0_ptr += (4 * src_stride);
358 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
359 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
360 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
361 src1_ptr += (4 * src2_stride);
363 ILVRL_B2_SH(zero, src0, dst0, dst1);
364 ILVRL_B2_SH(zero, src1, dst2, dst3);
365 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
366 ILVRL_B2_SH(zero, src4, dst6, dst7);
367 ILVRL_B2_SH(zero, src5, dst8, dst9);
368 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
369 SLLI_4V(dst0, dst1, dst2, dst3, 6);
370 SLLI_4V(dst4, dst5, dst6, dst7, 6);
371 SLLI_4V(dst8, dst9, dst10, dst11, 6);
372 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
373 7, dst0, dst1, dst2, dst3);
374 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
375 7, dst4, dst5, dst6, dst7);
376 HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
377 dst11, 7, dst8, dst9, dst10, dst11);
378 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
379 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
380 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
381 ST8x4_UB(out2, out5, dst + 16, dst_stride);
382 dst += (4 * dst_stride);
386 static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
395 v16u8 out0, out1, out2, out3;
396 v16i8 src0, src1, src2, src3;
398 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
399 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
401 for (loop_cnt = (height >> 1); loop_cnt--;) {
402 LD_SB2(src0_ptr, 16, src0, src1);
403 src0_ptr += src_stride;
404 LD_SB2(src0_ptr, 16, src2, src3);
405 src0_ptr += src_stride;
406 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
407 src1_ptr += src2_stride;
408 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
409 src1_ptr += src2_stride;
411 ILVRL_B2_SH(zero, src0, dst0, dst1);
412 ILVRL_B2_SH(zero, src1, dst2, dst3);
413 ILVRL_B2_SH(zero, src2, dst4, dst5);
414 ILVRL_B2_SH(zero, src3, dst6, dst7);
415 SLLI_4V(dst0, dst1, dst2, dst3, 6);
416 SLLI_4V(dst4, dst5, dst6, dst7, 6);
417 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
418 7, dst0, dst1, dst2, dst3);
419 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
420 7, dst4, dst5, dst6, dst7);
421 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
422 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
423 ST_UB2(out0, out1, dst, 16);
425 ST_UB2(out2, out3, dst, 16);
430 static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
439 v16u8 out0, out1, out2, out3, out4, out5;
440 v16i8 src0, src1, src2, src3, src4, src5;
442 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
443 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
445 for (loop_cnt = (height >> 1); loop_cnt--;) {
446 LD_SB3(src0_ptr, 16, src0, src1, src2);
447 src0_ptr += src_stride;
448 LD_SB3(src0_ptr, 16, src3, src4, src5);
449 src0_ptr += src_stride;
451 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
452 src1_ptr += src2_stride;
453 LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
454 src1_ptr += src2_stride;
456 ILVRL_B2_SH(zero, src0, dst0, dst1);
457 ILVRL_B2_SH(zero, src1, dst2, dst3);
458 ILVRL_B2_SH(zero, src2, dst4, dst5);
459 ILVRL_B2_SH(zero, src3, dst6, dst7);
460 ILVRL_B2_SH(zero, src4, dst8, dst9);
461 ILVRL_B2_SH(zero, src5, dst10, dst11);
463 SLLI_4V(dst0, dst1, dst2, dst3, 6);
464 SLLI_4V(dst4, dst5, dst6, dst7, 6);
465 SLLI_4V(dst8, dst9, dst10, dst11, 6);
467 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
468 7, dst0, dst1, dst2, dst3);
469 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
470 7, dst4, dst5, dst6, dst7);
471 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
472 dst11, 7, dst8, dst9, dst10, dst11);
473 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
474 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
475 ST_UB2(out0, out1, dst, 16);
476 ST_UB(out2, dst + 32);
478 ST_UB2(out3, out4, dst, 16);
479 ST_UB(out5, dst + 32);
484 static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
493 v16u8 out0, out1, out2, out3;
494 v16i8 src0, src1, src2, src3;
496 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
497 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
499 for (loop_cnt = height; loop_cnt--;) {
500 LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
501 src0_ptr += src_stride;
502 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
503 src1_ptr += src2_stride;
505 ILVRL_B2_SH(zero, src0, dst0, dst1);
506 ILVRL_B2_SH(zero, src1, dst2, dst3);
507 ILVRL_B2_SH(zero, src2, dst4, dst5);
508 ILVRL_B2_SH(zero, src3, dst6, dst7);
509 SLLI_4V(dst0, dst1, dst2, dst3, 6);
510 SLLI_4V(dst4, dst5, dst6, dst7, 6);
511 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
512 7, dst0, dst1, dst2, dst3);
513 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
514 7, dst4, dst5, dst6, dst7);
515 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
516 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
518 ST_UB4(out0, out1, out2, out3, dst, 16);
523 static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
529 const int8_t *filter,
533 v8i16 filt0, filt1, filt2, filt3;
534 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
535 v16i8 mask1, mask2, mask3;
536 v16i8 vec0, vec1, vec2, vec3;
537 v8i16 dst0, dst1, dst2, dst3;
538 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
539 v8i16 filter_vec, const_vec;
540 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
544 /* rearranging filter */
545 filter_vec = LD_SH(filter);
546 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
552 const_vec = __msa_ldi_h(128);
555 for (loop_cnt = (height >> 3); loop_cnt--;) {
556 LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
557 src4, src5, src6, src7);
558 src0_ptr += (8 * src_stride);
559 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
560 src1_ptr += (8 * src2_stride);
562 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
563 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
564 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
570 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
571 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
572 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
574 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
575 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
576 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
578 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
579 VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
580 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
582 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
583 VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
584 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
587 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
588 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
590 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
591 ST4x8_UB(dst0, dst1, dst, dst_stride);
592 dst += (8 * dst_stride);
596 static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
602 const int8_t *filter,
606 v8i16 filt0, filt1, filt2, filt3;
607 v16i8 src0, src1, src2, src3;
608 v16i8 mask1, mask2, mask3;
609 v16i8 vec0, vec1, vec2, vec3;
610 v8i16 dst0, dst1, dst2, dst3;
611 v8i16 in0, in1, in2, in3;
612 v8i16 filter_vec, const_vec;
613 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
617 const_vec = __msa_ldi_h(128);
620 filter_vec = LD_SH(filter);
621 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
627 for (loop_cnt = (height >> 2); loop_cnt--;) {
628 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
629 src0_ptr += (4 * src_stride);
630 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
631 src1_ptr += (4 * src2_stride);
632 XORI_B4_128_SB(src0, src1, src2, src3);
638 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
639 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
640 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
642 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
643 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
644 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
646 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
647 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
648 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
650 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
651 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
652 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
655 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
656 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
658 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
659 ST8x4_UB(dst0, dst1, dst, dst_stride);
660 dst += (4 * dst_stride);
664 static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
670 const int8_t *filter,
676 v16i8 src0, src1, src2, src3;
677 v16i8 vec0, vec1, vec2;
678 v8i16 filt0, filt1, filt2, filt3;
679 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
680 v8i16 dst0, dst1, dst2;
681 v8i16 in0, in1, in2, in3;
682 v8i16 filter_vec, const_vec;
685 const_vec = __msa_ldi_h(128);
688 filter_vec = LD_SH(filter);
689 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
691 mask0 = LD_SB(ff_hevc_mask_arr);
695 mask4 = LD_SB(&ff_hevc_mask_arr[16]);
700 for (loop_cnt = 8; loop_cnt--;) {
701 LD_SB2(src0_ptr, 8, src0, src1);
702 src0_ptr += src_stride;
703 LD_SB2(src0_ptr, 8, src2, src3);
704 src0_ptr += src_stride;
705 LD_SH2(src1_ptr, 8, in0, in1);
706 src1_ptr += src2_stride;
707 LD_SH2(src1_ptr, 8, in2, in3);
708 src1_ptr += src2_stride;
709 XORI_B4_128_SB(src0, src1, src2, src3);
715 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
717 DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
718 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
719 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
721 DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
722 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
723 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
725 DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
726 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
727 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
729 DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
730 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);
732 in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
733 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
734 dst2 = __msa_adds_s_h(in2, dst2);
735 dst2 = __msa_srari_h(dst2, 7);
736 dst2 = CLIP_SH_0_255(dst2);
737 PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);
739 tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
740 tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
741 tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
742 tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
752 static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
758 const int8_t *filter,
762 v16i8 src0, src1, src2, src3;
763 v8i16 filt0, filt1, filt2, filt3;
764 v16i8 mask1, mask2, mask3;
765 v16i8 vec0, vec1, vec2, vec3;
766 v8i16 dst0, dst1, dst2, dst3;
767 v8i16 in0, in1, in2, in3;
768 v8i16 filter_vec, const_vec;
769 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
772 const_vec = __msa_ldi_h(128);
775 filter_vec = LD_SH(filter);
776 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
782 for (loop_cnt = (height >> 1); loop_cnt--;) {
783 LD_SB2(src0_ptr, 8, src0, src1);
784 src0_ptr += src_stride;
785 LD_SB2(src0_ptr, 8, src2, src3);
786 src0_ptr += src_stride;
787 LD_SH2(src1_ptr, 8, in0, in1);
788 src1_ptr += src2_stride;
789 LD_SH2(src1_ptr, 8, in2, in3);
790 src1_ptr += src2_stride;
791 XORI_B4_128_SB(src0, src1, src2, src3);
797 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
798 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
799 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
801 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
802 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
803 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
805 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
806 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
807 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
809 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
810 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
811 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
814 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
815 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
817 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
818 ST_SH2(dst0, dst1, dst, dst_stride);
819 dst += (2 * dst_stride);
823 static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
829 const int8_t *filter,
834 v16i8 src0, src1, tmp0, tmp1;
835 v8i16 filt0, filt1, filt2, filt3;
836 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
837 v16i8 vec0, vec1, vec2, vec3;
838 v8i16 dst0, dst1, dst2;
840 v8i16 filter_vec, const_vec;
841 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
843 src0_ptr = src0_ptr - 3;
844 const_vec = __msa_ldi_h(128);
847 filter_vec = LD_SH(filter);
848 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
858 for (loop_cnt = height; loop_cnt--;) {
859 LD_SB2(src0_ptr, 16, src0, src1);
860 src0_ptr += src_stride;
861 LD_SH2(src1_ptr, 8, in0, in1);
862 in2 = LD_SH(src1_ptr + 16);
863 src1_ptr += src2_stride;
864 XORI_B2_128_SB(src0, src1);
869 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
870 VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
871 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
873 VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
874 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
875 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
877 VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
878 VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
879 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
882 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
883 dst2 = __msa_adds_s_h(dst2, in2);
884 dst2 = __msa_srari_h(dst2, 7);
885 dst2 = CLIP_SH_0_255(dst2);
887 PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
888 dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
890 SD(dst_val0, dst + 16);
895 static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
901 const int8_t *filter,
905 v16i8 src0, src1, src2, tmp0, tmp1;
906 v8i16 filt0, filt1, filt2, filt3;
907 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
908 v16i8 vec0, vec1, vec2, vec3;
909 v8i16 dst0, dst1, dst2, dst3;
910 v8i16 in0, in1, in2, in3;
911 v8i16 filter_vec, const_vec;
912 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
915 const_vec = __msa_ldi_h(128);
918 filter_vec = LD_SH(filter);
919 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
929 for (loop_cnt = height; loop_cnt--;) {
930 LD_SB2(src0_ptr, 16, src0, src1);
931 src2 = LD_SB(src0_ptr + 24);
932 src0_ptr += src_stride;
933 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
934 src1_ptr += src2_stride;
935 XORI_B3_128_SB(src0, src1, src2);
941 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
942 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
943 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
945 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
946 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
947 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
949 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
950 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
951 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
953 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
954 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
955 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
958 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
959 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
961 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
962 ST_SB2(tmp0, tmp1, dst, 16);
967 static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
973 const int8_t *filter,
977 v16i8 src0, src1, src2, src3;
978 v16i8 tmp0, tmp1, tmp2;
979 v8i16 filt0, filt1, filt2, filt3;
980 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
981 v16i8 vec0, vec1, vec2, vec3;
982 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
983 v8i16 in0, in1, in2, in3, in4, in5;
984 v8i16 filter_vec, const_vec;
985 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
989 const_vec = __msa_ldi_h(128);
992 filter_vec = LD_SH(filter);
993 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1003 for (loop_cnt = 64; loop_cnt--;) {
1004 LD_SB3(src0_ptr, 16, src0, src1, src2);
1005 src3 = LD_SB(src0_ptr + 40);
1006 src0_ptr += src_stride;
1007 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1008 XORI_B4_128_SB(src0, src1, src2, src3);
1015 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1016 VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
1017 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1019 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1020 VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
1021 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1023 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1024 VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
1025 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1027 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1028 VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
1029 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1031 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
1032 HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
1033 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1035 ST_SB(tmp1, dst + 16);
1037 LD_SH2(src1_ptr + 32, 8, in4, in5);
1038 src1_ptr += src2_stride;
1042 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
1043 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
1044 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
1046 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
1047 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
1048 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
1051 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
1053 tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
1054 ST_SB(tmp2, dst + 32);
1059 static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
1062 int32_t src2_stride,
1065 const int8_t *filter,
1069 v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
1070 v8i16 filt0, filt1, filt2, filt3;
1071 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1072 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1073 v16i8 vec0, vec1, vec2, vec3;
1074 v8i16 dst0, dst1, dst2, dst3;
1075 v8i16 in0, in1, in2, in3;
1076 v8i16 filter_vec, const_vec;
1080 const_vec = __msa_ldi_h(128);
1083 filter_vec = LD_SH(filter);
1084 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1094 for (loop_cnt = height; loop_cnt--;) {
1095 LD_SB2(src0_ptr, 16, src0, src1);
1096 src2 = LD_SB(src0_ptr + 24);
1097 LD_SB2(src0_ptr + 32, 16, src3, src4);
1098 src5 = LD_SB(src0_ptr + 56);
1099 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1100 XORI_B3_128_SB(src0, src1, src2);
1107 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1108 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1109 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1111 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1112 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1113 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1115 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1116 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1117 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1119 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1120 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1121 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1124 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1125 dst0, dst1, dst2, dst3, 7,
1126 dst0, dst1, dst2, dst3);
1128 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1129 ST_SB2(tmp0, tmp1, dst, 16);
1135 LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
1136 XORI_B3_128_SB(src0, src1, src2);
1142 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1143 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1144 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1146 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1147 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1148 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1150 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1151 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1152 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1154 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1155 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1156 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1158 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1159 dst0, dst1, dst2, dst3, 7,
1160 dst0, dst1, dst2, dst3);
1161 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1162 ST_SB2(tmp0, tmp1, dst + 32, 16);
1163 src1_ptr += src2_stride;
1164 src0_ptr += src_stride;
/* hevc_vt_bi_8t_4w_msa: vertical 8-tap bi-predictive interpolation,
 * 4 pixels wide, 8 rows per loop iteration.  Pairs of 4-wide rows are
 * interleaved (ILVR_B*) and packed two-per-vector (ILVR_D*) so each
 * DPADD_SB4_SH computes two output rows at once; the 16-bit reference
 * rows from src1_ptr are added via HEVC_BI_RND_CLIP4 before packing
 * down to bytes.  NOTE(review): listing has elided lines (numbering
 * jumps); in particular the expected "src2110 = src10998;" sliding-
 * window update before line 1249 is not visible here -- confirm. */
1169 static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
1172 int32_t src2_stride,
1175 const int8_t *filter,
1179 v16i8 src0, src1, src2, src3, src4, src5;
1180 v16i8 src6, src7, src8, src9, src10;
1181 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1182 v16i8 src11, src12, src13, src14;
1183 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1184 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1185 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1186 v16i8 src2110, src4332, src6554, src8776, src10998;
1187 v16i8 src12111110, src14131312;
1188 v8i16 dst10, dst32, dst54, dst76;
1189 v8i16 filt0, filt1, filt2, filt3;
1190 v8i16 filter_vec, const_vec;
/* back up 3 rows: the 8-tap filter needs 3 rows above the first output */
1192 src0_ptr -= (3 * src_stride);
1194 const_vec = __msa_ldi_h(128);
1197 filter_vec = LD_SH(filter);
1198 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: load and interleave the 7 context rows */
1200 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1201 src0_ptr += (7 * src_stride);
1202 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1203 src10_r, src32_r, src54_r, src21_r);
1204 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1205 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1206 src2110, src4332, src6554);
1207 XORI_B3_128_SB(src2110, src4332, src6554);
1209 for (loop_cnt = (height >> 3); loop_cnt--;) {
1210 LD_SB8(src0_ptr, src_stride,
1211 src7, src8, src9, src10, src11, src12, src13, src14);
1212 src0_ptr += (8 * src_stride);
1213 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1214 src1_ptr += (8 * src2_stride);
/* pack the eight 4-wide reference rows into four vectors */
1216 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1217 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1218 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1219 src76_r, src87_r, src98_r, src109_r);
1220 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1221 src1110_r, src1211_r, src1312_r, src1413_r);
1222 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1223 src1413_r, src1312_r,
1224 src8776, src10998, src12111110, src14131312);
1225 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
/* each DPADD produces two output rows (rows packed per vector) */
1228 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1229 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1231 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1232 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1234 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1235 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1237 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1238 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1240 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1241 dst10, dst32, dst54, dst76, 7,
1242 dst10, dst32, dst54, dst76);
1244 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
1245 ST4x8_UB(dst10, dst54, dst, dst_stride);
1246 dst += (8 * dst_stride);
/* slide the interleaved-row window forward for the next 8 rows */
1249 src4332 = src12111110;
1250 src6554 = src14131312;
/* hevc_vt_bi_8t_8w_msa: vertical 8-tap bi-predictive interpolation,
 * 8 pixels wide, 4 rows per loop iteration.  Keeps a sliding window of
 * byte-interleaved row pairs (src10_r .. src65_r), filters four new
 * rows per iteration with DPADD_SB4_SH, adds the reference rows from
 * src1_ptr and rounds/clips via HEVC_BI_RND_CLIP4.
 * NOTE(review): listing elides continuation lines (numbering jumps);
 * the per-iteration window updates after line 1320 are not visible. */
1255 static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
1258 int32_t src2_stride,
1261 const int8_t *filter,
1265 v16i8 src0, src1, src2, src3, src4, src5;
1266 v16i8 src6, src7, src8, src9, src10;
1267 v8i16 in0, in1, in2, in3;
1268 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1269 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1270 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1271 v8i16 filt0, filt1, filt2, filt3;
1272 v8i16 filter_vec, const_vec;
/* 3 rows of context above the first output row */
1274 src0_ptr -= (3 * src_stride);
1275 const_vec = __msa_ldi_h(128);
1278 filter_vec = LD_SH(filter);
1279 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1281 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1282 src0_ptr += (7 * src_stride);
1283 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1284 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1285 src10_r, src32_r, src54_r, src21_r);
1286 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1288 for (loop_cnt = (height >> 2); loop_cnt--;) {
1289 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1290 src0_ptr += (4 * src_stride);
1291 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1292 src1_ptr += (4 * src2_stride);
1293 XORI_B4_128_SB(src7, src8, src9, src10);
1294 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1295 src76_r, src87_r, src98_r, src109_r);
/* four output rows, each from four interleaved row-pair vectors */
1298 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1299 filt0, filt1, filt2, filt3,
1300 dst0_r, dst0_r, dst0_r, dst0_r);
1302 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1303 filt0, filt1, filt2, filt3,
1304 dst1_r, dst1_r, dst1_r, dst1_r);
1306 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1307 filt0, filt1, filt2, filt3,
1308 dst2_r, dst2_r, dst2_r, dst2_r);
1310 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1311 filt0, filt1, filt2, filt3,
1312 dst3_r, dst3_r, dst3_r, dst3_r);
1314 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1315 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1316 dst0_r, dst1_r, dst2_r, dst3_r);
1318 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1319 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
1320 dst += (4 * dst_stride);
/* hevc_vt_bi_8t_12w_msa: vertical 8-tap bi-predictive interpolation,
 * 12 pixels wide, 4 rows per iteration.  The right 8 columns use the
 * *_r (low-half interleave) vectors exactly like the 8w path; the
 * remaining 4 columns use the *_l (high-half) interleaves packed two
 * rows per vector (src2110 ...) so one DPADD covers two rows.
 * Results are stored with ST12x4_UB.  NOTE(review): listing elides
 * lines (numbering jumps); window-slide updates after 1424 and the
 * XORI on the initial rows' *_l pack are not all visible -- confirm. */
1333 static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
1336 int32_t src2_stride,
1339 const int8_t *filter,
1343 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1344 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1345 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1346 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1347 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1348 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1349 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1350 v16i8 src2110, src4332, src6554, src8776, src10998;
1351 v8i16 dst0_l, dst1_l;
1352 v8i16 filt0, filt1, filt2, filt3;
1353 v8i16 filter_vec, const_vec;
1355 src0_ptr -= (3 * src_stride);
1356 const_vec = __msa_ldi_h(128);
1359 filter_vec = LD_SH(filter);
1360 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 context rows, interleaved low (right 8 cols) and high
 * (left 4 cols, packed two rows per vector) */
1362 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1363 src0_ptr += (7 * src_stride);
1364 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1366 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1367 src10_r, src32_r, src54_r, src21_r);
1368 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1369 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1370 src10_l, src32_l, src54_l, src21_l);
1371 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1372 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1373 src2110, src4332, src6554);
1375 for (loop_cnt = (height >> 2); loop_cnt--;) {
1376 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1377 src0_ptr += (4 * src_stride);
1378 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1379 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
1380 src1_ptr += (4 * src2_stride);
1382 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
1383 XORI_B4_128_SB(src7, src8, src9, src10);
1384 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1385 src76_r, src87_r, src98_r, src109_r);
1386 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1387 src76_l, src87_l, src98_l, src109_l);
1388 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* right 8 columns: four rows */
1391 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1392 filt0, filt1, filt2, filt3,
1393 dst0_r, dst0_r, dst0_r, dst0_r);
1395 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1396 filt0, filt1, filt2, filt3,
1397 dst1_r, dst1_r, dst1_r, dst1_r);
1399 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1400 filt0, filt1, filt2, filt3,
1401 dst2_r, dst2_r, dst2_r, dst2_r);
1403 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1404 filt0, filt1, filt2, filt3,
1405 dst3_r, dst3_r, dst3_r, dst3_r);
/* left 4 columns: two rows per DPADD */
1407 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1408 filt0, filt1, filt2, filt3,
1409 dst0_l, dst0_l, dst0_l, dst0_l);
1411 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1412 filt0, filt1, filt2, filt3,
1413 dst1_l, dst1_l, dst1_l, dst1_l);
1415 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1416 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1417 dst0_r, dst1_r, dst2_r, dst3_r);
1418 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
1421 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1422 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
1423 ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
1424 dst += (4 * dst_stride);
/* hevc_vt_bi_8t_16multx2mult_msa: generic vertical 8-tap bi-predictive
 * interpolation for any width that is a multiple of 16.  The outer loop
 * walks 16-column tiles; the inner loop produces 2 rows per iteration,
 * filtering the low and high byte-interleaves separately (dst*_r /
 * dst*_l) and merging them with PCKEV_B2_SH before the 16-byte store.
 * NOTE(review): listing elides lines (numbering jumps); the sliding-
 * window updates after 1522 and the per-tile pointer advances at the
 * end of the outer loop are not visible here -- confirm. */
1439 static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
1442 int32_t src2_stride,
1445 const int8_t *filter,
1446 int32_t height, int32_t width)
1448 uint8_t *src0_ptr_tmp;
1449 int16_t *src1_ptr_tmp;
1453 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1454 v8i16 in0, in1, in2, in3;
1455 v16i8 src10_r, src32_r, src54_r, src76_r;
1456 v16i8 src21_r, src43_r, src65_r, src87_r;
1457 v8i16 dst0_r, dst1_r;
1458 v16i8 src10_l, src32_l, src54_l, src76_l;
1459 v16i8 src21_l, src43_l, src65_l, src87_l;
1460 v8i16 dst0_l, dst1_l;
1461 v8i16 filt0, filt1, filt2, filt3;
1462 v8i16 filter_vec, const_vec;
1464 src0_ptr -= (3 * src_stride);
1465 const_vec = __msa_ldi_h(128);
1468 filter_vec = LD_SH(filter);
1469 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* outer loop: one 16-column tile per iteration */
1471 for (cnt = (width >> 4); cnt--;) {
1472 src0_ptr_tmp = src0_ptr;
1473 src1_ptr_tmp = src1_ptr;
1476 LD_SB7(src0_ptr_tmp, src_stride,
1477 src0, src1, src2, src3, src4, src5, src6);
1478 src0_ptr_tmp += (7 * src_stride);
1479 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1481 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1482 src10_r, src32_r, src54_r, src21_r);
1483 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1484 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1485 src10_l, src32_l, src54_l, src21_l);
1486 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* inner loop: two output rows per iteration */
1488 for (loop_cnt = (height >> 1); loop_cnt--;) {
1489 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1490 src0_ptr_tmp += (2 * src_stride);
1491 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1492 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1493 src1_ptr_tmp += (2 * src2_stride);
1494 XORI_B2_128_SB(src7, src8);
1496 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1497 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1500 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1501 filt0, filt1, filt2, filt3,
1502 dst0_r, dst0_r, dst0_r, dst0_r);
1504 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1505 filt0, filt1, filt2, filt3,
1506 dst1_r, dst1_r, dst1_r, dst1_r);
1508 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1509 filt0, filt1, filt2, filt3,
1510 dst0_l, dst0_l, dst0_l, dst0_l);
1512 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1513 filt0, filt1, filt2, filt3,
1514 dst1_l, dst1_l, dst1_l, dst1_l);
1516 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1517 dst0_r, dst1_r, dst0_l, dst1_l, 7,
1518 dst0_r, dst1_r, dst0_l, dst1_l);
1520 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
1521 ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
1522 dst_tmp += (2 * dst_stride);
/* 16-wide vertical 8-tap bi-prediction: one 16-column pass. */
1545 static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
1548 int32_t src2_stride,
1551 const int8_t *filter,
1554 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1555 dst, dst_stride, filter, height, 16);
/* 24-wide vertical 8-tap bi-prediction: 16-column pass for columns
 * 0..15, then the 8w kernel for the remaining columns 16..23. */
1558 static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
1561 int32_t src2_stride,
1564 const int8_t *filter,
1567 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1568 dst, dst_stride, filter, height, 16);
1569 hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
1570 dst + 16, dst_stride, filter, height);
/* 32-wide vertical 8-tap bi-prediction: two 16-column tiles. */
1573 static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
1576 int32_t src2_stride,
1579 const int8_t *filter,
1582 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1583 dst, dst_stride, filter, height, 32);
/* 48-wide vertical 8-tap bi-prediction: three 16-column tiles. */
1586 static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
1589 int32_t src2_stride,
1592 const int8_t *filter,
1595 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1596 dst, dst_stride, filter, height, 48);
/* 64-wide vertical 8-tap bi-prediction: four 16-column tiles. */
1599 static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
1602 int32_t src2_stride,
1605 const int8_t *filter,
1608 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1609 dst, dst_stride, filter, height, 64);
/* hevc_hv_bi_8t_4w_msa: 2-D (horizontal-then-vertical) 8-tap
 * bi-predictive interpolation, 4 pixels wide, 4 rows per iteration.
 * Horizontal filtering (filt0..filt3, VSHF_B4_SB on row pairs) yields
 * 16-bit intermediates; these are re-interleaved (dst10..dst109) and
 * vertically filtered with the 32-bit filt_h taps; finally the
 * reference rows from src1_ptr and const_vec are added, rounded by 7
 * and clipped to 8 bit.  NOTE(review): listing elides lines (numbering
 * jumps, e.g. 1646->1652); mask1..mask3 setup and some declarations
 * (loop_cnt, tp0/tp1, out, out0/out1) are on lines not shown here. */
1612 static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
1615 int32_t src2_stride,
1618 const int8_t *filter_x,
1619 const int8_t *filter_y,
1625 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1626 v8i16 in0 = { 0 }, in1 = { 0 };
1627 v8i16 filt0, filt1, filt2, filt3;
1628 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1629 v16i8 mask1, mask2, mask3;
1630 v8i16 filter_vec, const_vec;
1631 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1632 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1634 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1635 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1636 v4i32 dst0, dst1, dst2, dst3;
1637 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
/* 3 rows above and 3 columns left of the first output sample */
1639 src0_ptr -= ((3 * src_stride) + 3);
1640 filter_vec = LD_SH(filter_x);
1641 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1643 filter_vec = LD_SH(filter_y);
1644 UNPCK_R_SB_SH(filter_vec, filter_vec);
1646 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1652 const_vec = __msa_ldi_h(128);
/* prologue: horizontally filter the 7 context rows (two rows per
 * VSHF_B4_SB call: rows 0+3, 1+4, 2+5, 3+6) */
1655 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1656 src0_ptr += (7 * src_stride);
1657 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1659 /* row 0 row 1 row 2 row 3 */
1660 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1661 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1662 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1663 vec8, vec9, vec10, vec11);
1664 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1665 vec12, vec13, vec14, vec15);
1667 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1669 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1671 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1673 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1676 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
1677 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
1678 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
1680 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1682 for (loop_cnt = height >> 2; loop_cnt--;) {
1683 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1684 src0_ptr += (4 * src_stride);
1685 XORI_B4_128_SB(src7, src8, src9, src10);
/* gather four 4-wide 16-bit reference rows into in0/in1 */
1687 LD2(src1_ptr, src2_stride, tp0, tp1);
1688 INSERT_D2_SH(tp0, tp1, in0);
1689 src1_ptr += (2 * src2_stride);
1690 LD2(src1_ptr, src2_stride, tp0, tp1);
1691 INSERT_D2_SH(tp0, tp1, in1);
1692 src1_ptr += (2 * src2_stride);
1694 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1695 vec0, vec1, vec2, vec3);
1696 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1697 vec4, vec5, vec6, vec7);
1698 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1700 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1703 dst76 = __msa_ilvr_h(dst97, dst66);
1704 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
1705 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1706 dst98 = __msa_ilvr_h(dst66, dst108);
/* vertical 8-tap over the interleaved horizontal results */
1708 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
1710 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
1712 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
1714 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
1717 SRA_4V(dst0, dst1, dst2, dst3, 6);
1718 PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
1719 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
1720 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
1721 SRARI_H2_SH(out0, out1, 7);
1722 CLIP_SH2_0_255_MAX_SATU(out0, out1);
1723 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1724 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1725 dst += (4 * dst_stride);
/* carry last horizontal result into the next iteration's window */
1733 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* hevc_hv_bi_8t_8multx1mult_msa: 2-D 8-tap bi-predictive interpolation
 * for widths that are a multiple of 8; outer loop over 8-column tiles,
 * inner loop producing one 8-wide row per iteration.  Per tile the 7
 * context rows are horizontally filtered once (dst0..dst6); each inner
 * iteration filters one new row (dst7), vertically filters the
 * interleaved intermediates, adds the src1_ptr reference row plus
 * const_vec, rounds by 7 and clips.  NOTE(review): listing elides
 * lines (numbering jumps); mask1..mask3 setup, dst0_r/dst0_l shifts
 * before the pckev, the dst0..dst6 window slide, and the tile-advance
 * code are on lines not shown here -- confirm before editing. */
1737 static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
1740 int32_t src2_stride,
1743 const int8_t *filter_x,
1744 const int8_t *filter_y,
1745 int32_t height, int32_t width)
1749 uint8_t *src0_ptr_tmp;
1750 int16_t *src1_ptr_tmp;
1753 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1755 v8i16 filt0, filt1, filt2, filt3;
1756 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1757 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1758 v16i8 mask1, mask2, mask3;
1759 v8i16 filter_vec, const_vec;
1760 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1761 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1762 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1763 v4i32 dst0_r, dst0_l;
1764 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1765 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1767 src0_ptr -= ((3 * src_stride) + 3);
1768 const_vec = __msa_ldi_h(128);
1771 filter_vec = LD_SH(filter_x);
1772 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1774 filter_vec = LD_SH(filter_y);
1775 UNPCK_R_SB_SH(filter_vec, filter_vec);
1777 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* one 8-column tile per outer iteration */
1783 for (cnt = width >> 3; cnt--;) {
1784 src0_ptr_tmp = src0_ptr;
1786 src1_ptr_tmp = src1_ptr;
1788 LD_SB7(src0_ptr_tmp, src_stride,
1789 src0, src1, src2, src3, src4, src5, src6);
1790 src0_ptr_tmp += (7 * src_stride);
1791 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1793 /* row 0 row 1 row 2 row 3 */
1794 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1795 vec0, vec1, vec2, vec3);
1796 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1797 vec4, vec5, vec6, vec7);
1798 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1799 vec8, vec9, vec10, vec11);
1800 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1801 vec12, vec13, vec14, vec15);
1802 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1804 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1806 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1808 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1811 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1812 vec0, vec1, vec2, vec3);
1813 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1814 vec4, vec5, vec6, vec7);
1815 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1816 vec8, vec9, vec10, vec11);
1817 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1819 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1821 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
/* one output row per inner iteration */
1824 for (loop_cnt = height; loop_cnt--;) {
1825 src7 = LD_SB(src0_ptr_tmp);
1826 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1827 src0_ptr_tmp += src_stride;
1829 in0 = LD_SH(src1_ptr_tmp);
1830 src1_ptr_tmp += src2_stride;
1832 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1833 vec0, vec1, vec2, vec3);
1834 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1836 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1837 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1838 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1839 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1840 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1841 filt_h0, filt_h1, filt_h2, filt_h3);
1842 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1843 filt_h0, filt_h1, filt_h2, filt_h3);
1847 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1848 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
1849 tmp = __msa_srari_h(tmp, 7);
1850 tmp = CLIP_SH_0_255_MAX_SATU(tmp);
1851 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
1852 ST8x1_UB(out, dst_tmp);
1853 dst_tmp += dst_stride;
/* 8-wide 2-D 8-tap bi-prediction: single 8-column tile. */
1870 static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
1873 int32_t src2_stride,
1876 const int8_t *filter_x,
1877 const int8_t *filter_y,
1880 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1881 dst, dst_stride, filter_x, filter_y,
/* hevc_hv_bi_8t_12w_msa: 2-D 8-tap bi-predictive interpolation,
 * 12 pixels wide, done in two passes: first the left 8 columns with a
 * fixed 16-row inner loop (mask0..mask3, one row at a time, like the
 * 8multx1mult kernel), then the right 4 columns with mask4..mask7
 * (dual-row VSHF variant), 4 rows x 4 iterations.
 * NOTE(review): this function appears specialized for height == 16
 * (loop_cnt = 16 and loop_cnt = 4 are hard-coded) -- confirm callers.
 * Listing elides lines (numbering jumps); mask1..3 / mask5..7 setup,
 * declarations (loop_cnt, tp0/tp1, out) and window-slide code after
 * 1999/2087 are on lines not shown here. */
1885 static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
1888 int32_t src2_stride,
1891 const int8_t *filter_x,
1892 const int8_t *filter_y,
1896 uint8_t *src0_ptr_tmp, *dst_tmp;
1897 int16_t *src1_ptr_tmp;
1900 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1901 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1902 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1903 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1904 v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
1905 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1906 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1907 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1908 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1909 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1910 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1911 v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;
1913 src0_ptr -= ((3 * src_stride) + 3);
1915 const_vec = __msa_ldi_h(128);
1918 filter_vec = LD_SH(filter_x);
1919 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1921 filter_vec = LD_SH(filter_y);
1922 UNPCK_R_SB_SH(filter_vec, filter_vec);
1924 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1926 mask0 = LD_SB(ff_hevc_mask_arr);
/* ---- pass 1: left 8 columns ---- */
1931 src0_ptr_tmp = src0_ptr;
1933 src1_ptr_tmp = src1_ptr;
1935 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
1937 src0_ptr_tmp += (7 * src_stride);
1938 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1940 /* row 0 row 1 row 2 row 3 */
1941 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1943 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1945 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1947 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1949 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1951 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1953 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1955 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1957 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1959 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1961 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1963 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1965 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1967 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1970 for (loop_cnt = 16; loop_cnt--;) {
1971 src7 = LD_SB(src0_ptr_tmp);
1972 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1973 src0_ptr_tmp += src_stride;
1975 in0 = LD_SH(src1_ptr_tmp);
1976 src1_ptr_tmp += src2_stride;
1978 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1980 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1982 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1983 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1984 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1985 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1986 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1987 filt_h1, filt_h2, filt_h3);
1988 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1989 filt_h1, filt_h2, filt_h3);
1993 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1994 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
1995 tmp = __msa_srari_h(tmp, 7);
1996 tmp = CLIP_SH_0_255_MAX_SATU(tmp);
1997 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
1998 ST8x1_UB(out, dst_tmp);
1999 dst_tmp += dst_stride;
/* ---- pass 2: right 4 columns, dual-row shuffle masks ---- */
2014 mask4 = LD_SB(ff_hevc_mask_arr + 16);
2019 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2020 src0_ptr += (7 * src_stride);
2021 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2023 /* row 0 row 1 row 2 row 3 */
2024 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2025 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2026 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
2027 vec8, vec9, vec10, vec11);
2028 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
2029 vec12, vec13, vec14, vec15);
2030 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2032 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2034 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2036 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2039 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2040 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2041 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2043 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2045 for (loop_cnt = 4; loop_cnt--;) {
2046 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2047 src0_ptr += (4 * src_stride);
2048 XORI_B4_128_SB(src7, src8, src9, src10);
2050 LD2(src1_ptr, src2_stride, tp0, tp1);
2051 INSERT_D2_SH(tp0, tp1, in0);
2052 src1_ptr += (2 * src2_stride);
2053 LD2(src1_ptr, src2_stride, tp0, tp1);
2054 INSERT_D2_SH(tp0, tp1, in1);
2055 src1_ptr += (2 * src2_stride);
2057 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2059 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2061 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2063 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2066 dst76 = __msa_ilvr_h(dst97, dst66);
2067 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2068 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2069 dst98 = __msa_ilvr_h(dst66, dst108);
2071 tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2073 tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2075 tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2077 tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2079 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
2080 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
2081 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
2082 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
2083 SRARI_H2_SH(out0, out1, 7);
2084 CLIP_SH2_0_255_MAX_SATU(out0, out1);
2085 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
2086 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2087 dst += (4 * dst_stride);
2095 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* 16-wide 2-D 8-tap bi-prediction: two 8-column tiles. */
2099 static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
2102 int32_t src2_stride,
2105 const int8_t *filter_x,
2106 const int8_t *filter_y,
2109 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2110 dst, dst_stride, filter_x, filter_y,
/* 24-wide 2-D 8-tap bi-prediction: three 8-column tiles. */
2114 static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
2117 int32_t src2_stride,
2120 const int8_t *filter_x,
2121 const int8_t *filter_y,
2124 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2125 dst, dst_stride, filter_x, filter_y,
/* 32-wide 2-D 8-tap bi-prediction: four 8-column tiles. */
2129 static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
2132 int32_t src2_stride,
2135 const int8_t *filter_x,
2136 const int8_t *filter_y,
2139 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2140 dst, dst_stride, filter_x, filter_y,
/* 48-wide 2-D 8-tap bi-prediction: six 8-column tiles. */
2144 static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
2147 int32_t src2_stride,
2150 const int8_t *filter_x,
2151 const int8_t *filter_y,
2154 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2155 dst, dst_stride, filter_x, filter_y,
/* 64-wide 2-D 8-tap bi-prediction: eight 8-column tiles. */
2159 static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
2162 int32_t src2_stride,
2165 const int8_t *filter_x,
2166 const int8_t *filter_y,
2169 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2170 dst, dst_stride, filter_x, filter_y,
/* hevc_hz_bi_4t_4x2_msa: horizontal 4-tap bi-predictive interpolation
 * for a single 4x2 block.  Both rows are packed into one vector via
 * the dual-row mask (ff_hevc_mask_arr + 16), filtered with the two tap
 * pairs, combined with the two 4-wide reference rows from src1_ptr,
 * rounded by 7 and clipped to 8 bit.  NOTE(review): listing elides
 * lines; mask1 setup and some declarations are not shown here. */
2174 static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
2177 int32_t src2_stride,
2180 const int8_t *filter,
2184 v16i8 src0, src1, dst0, vec0, vec1;
2186 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2189 v8i16 filter_vec, const_vec;
2193 const_vec = __msa_ldi_h(128);
2196 filter_vec = LD_SH(filter);
2197 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2201 LD_SB2(src0_ptr, src_stride, src0, src1);
2202 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* pack the two 4-wide reference rows into one 8x16-bit vector */
2203 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2204 XORI_B2_128_SB(src0, src1);
2205 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2207 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2209 tmp0 = __msa_adds_s_h(tmp0, in0);
2210 tmp0 = __msa_srari_h(tmp0, 7);
2211 tmp0 = CLIP_SH_0_255(tmp0);
2212 dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
2214 ST4x2_UB(dst0, dst, dst_stride);
/* hevc_hz_bi_4t_4x4_msa: horizontal 4-tap bi-predictive interpolation
 * for a single 4x4 block.  Rows are packed pairwise with the dual-row
 * mask, filtered, combined with the reference rows, rounded/clipped
 * via HEVC_BI_RND_CLIP2 and stored with ST4x4_UB.
 * NOTE(review): listing elides lines; mask1 and the vec2/vec3/tmp
 * declarations are on lines not shown here. */
2217 static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
2220 int32_t src2_stride,
2223 const int8_t *filter,
2227 v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
2228 v8i16 in0, in1, in2, in3;
2230 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2233 v8i16 filter_vec, const_vec;
2237 const_vec = __msa_ldi_h(128);
2240 filter_vec = LD_SH(filter);
2241 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2245 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2246 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* pack four 4-wide reference rows into two vectors */
2248 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2249 XORI_B4_128_SB(src0, src1, src2, src3);
2253 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2254 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2255 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
2257 HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
2258 dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2260 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* hevc_hz_bi_4t_4x8multiple_msa: horizontal 4-tap bi-predictive
 * interpolation, 4 pixels wide, for heights that are a multiple of 8;
 * processes 8 rows (four dual-row vectors) per loop iteration.
 * NOTE(review): listing elides lines; mask1 setup, the dst0/dst1 and
 * filt declarations, and trailing DPADD argument lines are not shown
 * here -- confirm against the full file. */
2263 static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
2266 int32_t src2_stride,
2269 const int8_t *filter,
2274 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2276 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2277 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2278 v16i8 mask1, vec0, vec1, vec2, vec3;
2279 v8i16 tmp0, tmp1, tmp2, tmp3;
2280 v8i16 filter_vec, const_vec;
2284 const_vec = __msa_ldi_h(128);
2287 filter_vec = LD_SH(filter);
2288 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2292 for (loop_cnt = (height >> 3); loop_cnt--;) {
2293 LD_SB8(src0_ptr, src_stride,
2294 src0, src1, src2, src3, src4, src5, src6, src7);
2295 src0_ptr += (8 * src_stride);
2296 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2297 src1_ptr += (4 * src2_stride);
2298 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2299 src1_ptr += (4 * src2_stride);
/* pack eight 4-wide reference rows into four vectors */
2300 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2301 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2302 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2308 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2309 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
2310 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
2312 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
2313 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
2314 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
2317 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2318 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
2320 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2321 ST4x8_UB(dst0, dst1, dst, dst_stride);
2322 dst += (8 * dst_stride);
2326 static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
2329 int32_t src2_stride,
2332 const int8_t *filter,
2336 hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2337 dst, dst_stride, filter, height);
2338 } else if (4 == height) {
2339 hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2340 dst, dst_stride, filter, height);
2341 } else if (8 == height || 16 == height) {
2342 hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2343 src1_ptr, src2_stride,
2344 dst, dst_stride, filter, height);
2348 static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
2351 int32_t src2_stride,
2354 const int8_t *filter,
2359 v16i8 src0, src1, src2, src3;
2360 v8i16 in0, in1, in2, in3;
2361 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2363 v16i8 vec0, vec1, vec2, vec3;
2364 v8i16 dst0, dst1, dst2, dst3;
2365 v8i16 filter_vec, const_vec;
2369 const_vec = __msa_ldi_h(128);
2372 filter_vec = LD_SH(filter);
2373 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2377 for (loop_cnt = (height >> 2); loop_cnt--;) {
2378 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2379 src0_ptr += (4 * src_stride);
2380 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2381 src1_ptr += (4 * src2_stride);
2382 XORI_B4_128_SB(src0, src1, src2, src3);
2388 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2389 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2390 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2392 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2393 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2394 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2397 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2398 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2400 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2401 ST6x4_UB(dst0, dst1, dst, dst_stride);
2402 dst += (4 * dst_stride);
2406 static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
2409 int32_t src2_stride,
2412 const int8_t *filter,
2418 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2419 v16i8 mask1, vec0, vec1, vec2, vec3;
2421 v8i16 filter_vec, const_vec;
2425 const_vec = __msa_ldi_h(128);
2428 filter_vec = LD_SH(filter);
2429 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2433 LD_SB2(src0_ptr, src_stride, src0, src1);
2434 LD_SH2(src1_ptr, src2_stride, in0, in1);
2435 XORI_B2_128_SB(src0, src1);
2439 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2440 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2441 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
2443 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
2445 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2446 ST8x2_UB(dst0, dst, dst_stride);
2449 static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
2452 int32_t src2_stride,
2455 const int8_t *filter,
2459 v16i8 src0, src1, src2, src3, src4, src5;
2460 v8i16 in0, in1, in2, in3, in4, in5;
2461 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2463 v16i8 vec0, vec1, vec2, vec3;
2464 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2465 v8i16 filter_vec, const_vec;
2469 const_vec = __msa_ldi_h(128);
2472 filter_vec = LD_SH(filter);
2473 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2477 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2478 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2479 src1_ptr += (4 * src2_stride);
2480 LD_SH2(src1_ptr, src2_stride, in4, in5);
2481 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2487 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2488 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2489 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
2491 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2492 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2493 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
2498 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2499 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
2500 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
2503 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2504 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2505 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2507 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2508 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2509 ST8x4_UB(dst0, dst1, dst, dst_stride);
2510 dst += (4 * dst_stride);
2511 ST8x2_UB(dst2, dst, dst_stride);
2514 static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
2517 int32_t src2_stride,
2520 const int8_t *filter,
2525 v16i8 src0, src1, src2, src3;
2526 v8i16 in0, in1, in2, in3;
2527 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2529 v16i8 vec0, vec1, vec2, vec3;
2530 v8i16 dst0, dst1, dst2, dst3;
2531 v8i16 filter_vec, const_vec;
2535 const_vec = __msa_ldi_h(128);
2538 filter_vec = LD_SH(filter);
2539 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2543 for (loop_cnt = (height >> 2); loop_cnt--;) {
2544 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2545 src0_ptr += (4 * src_stride);
2546 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2547 src1_ptr += (4 * src2_stride);
2548 XORI_B4_128_SB(src0, src1, src2, src3);
2554 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2555 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2556 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2558 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2559 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2560 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2563 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2564 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2566 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2567 ST8x4_UB(dst0, dst1, dst, dst_stride);
2568 dst += (4 * dst_stride);
2572 static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
2575 int32_t src2_stride,
2578 const int8_t *filter,
2582 hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2583 dst, dst_stride, filter, height);
2584 } else if (6 == height) {
2585 hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2586 dst, dst_stride, filter, height);
2587 } else if (0 == (height % 4)) {
2588 hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2589 src1_ptr, src2_stride,
2590 dst, dst_stride, filter, height);
2594 static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
2597 int32_t src2_stride,
2600 const int8_t *filter,
2605 v16i8 src0, src1, src2, src3;
2606 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2607 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2609 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2612 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2613 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2614 v8i16 filter_vec, const_vec;
2618 const_vec = __msa_ldi_h(128);
2621 filter_vec = LD_SH(filter);
2622 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2627 for (loop_cnt = (height >> 2); loop_cnt--;) {
2628 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2629 src0_ptr += (4 * src_stride);
2630 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2631 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2632 src1_ptr += (4 * src2_stride);
2634 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
2635 XORI_B4_128_SB(src0, src1, src2, src3);
2643 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2644 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2645 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2646 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2648 DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
2649 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2650 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2651 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
2652 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2654 DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
2656 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2657 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2658 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2660 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2661 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2662 ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
2663 dst += (4 * dst_stride);
2667 static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
2670 int32_t src2_stride,
2673 const int8_t *filter,
2677 v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
2678 v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
2680 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2682 v8i16 filter_vec, const_vec;
2686 const_vec = __msa_ldi_h(128);
2689 filter_vec = LD_SH(filter);
2690 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2694 for (loop_cnt = (height >> 1); loop_cnt--;) {
2695 LD_SB2(src0_ptr, src_stride, src0, src2);
2696 LD_SB2(src0_ptr + 8, src_stride, src1, src3);
2697 src0_ptr += (2 * src_stride);
2698 LD_SH2(src1_ptr, src2_stride, in0, in2);
2699 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
2700 src1_ptr += (2 * src2_stride);
2702 XORI_B4_128_SB(src0, src1, src2, src3);
2709 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2710 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2711 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2713 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2714 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2715 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2718 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2719 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2721 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2722 ST_SH2(dst0, dst1, dst, dst_stride);
2723 dst += (2 * dst_stride);
2727 static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
2730 int32_t src2_stride,
2733 const int8_t *filter,
2736 int16_t *src1_ptr_tmp;
2739 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2740 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2742 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2743 v16i8 mask1, mask2, mask3;
2744 v16i8 vec0, vec1, vec2, vec3;
2745 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2746 v8i16 filter_vec, const_vec;
2750 const_vec = __msa_ldi_h(128);
2753 filter_vec = LD_SH(filter);
2754 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2761 src1_ptr_tmp = src1_ptr + 16;
2763 for (loop_cnt = (height >> 2); loop_cnt--;) {
2764 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2765 LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2766 src0_ptr += (4 * src_stride);
2767 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2768 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2769 src1_ptr += (4 * src2_stride);
2770 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2776 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2777 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
2778 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2780 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2781 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
2782 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2789 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
2790 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
2791 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
2793 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
2794 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
2795 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
2798 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2799 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2800 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2801 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2803 PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2804 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2805 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2806 dst += (4 * dst_stride);
2808 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2809 src1_ptr_tmp += (4 * src2_stride);
2815 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2816 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2817 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2819 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
2820 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
2821 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2824 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2825 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2827 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2828 ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
2829 dst_tmp += (4 * dst_stride);
2833 static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
2836 int32_t src2_stride,
2839 const int8_t *filter,
2843 v16i8 src0, src1, src2;
2844 v8i16 in0, in1, in2, in3;
2846 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2847 v16i8 mask1, mask2, mask3;
2848 v8i16 dst0, dst1, dst2, dst3;
2849 v16i8 vec0, vec1, vec2, vec3;
2850 v8i16 filter_vec, const_vec;
2854 const_vec = __msa_ldi_h(128);
2857 filter_vec = LD_SH(filter);
2858 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2864 for (loop_cnt = height; loop_cnt--;) {
2865 LD_SB2(src0_ptr, 16, src0, src1);
2866 src2 = LD_SB(src0_ptr + 24);
2867 src0_ptr += src_stride;
2868 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2869 src1_ptr += src2_stride;
2870 XORI_B3_128_SB(src0, src1, src2);
2876 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2877 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
2878 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2880 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2881 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
2882 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2885 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2886 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2888 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2889 ST_SH2(dst0, dst1, dst, 16);
2894 static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
2897 int32_t src2_stride,
2900 const int8_t *filter,
2903 v16i8 src0, src1, src2, src3, src4;
2905 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2908 v8i16 filter_vec, const_vec;
2910 src0_ptr -= src_stride;
2912 const_vec = __msa_ldi_h(128);
2915 filter_vec = LD_SH(filter);
2916 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2918 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2919 src0_ptr += (3 * src_stride);
2921 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2922 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2923 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2925 LD_SB2(src0_ptr, src_stride, src3, src4);
2926 LD_SH2(src1_ptr, src2_stride, in0, in1);
2927 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2928 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2929 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2930 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2933 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2934 dst10 = __msa_adds_s_h(dst10, in0);
2935 dst10 = __msa_srari_h(dst10, 7);
2936 dst10 = CLIP_SH_0_255(dst10);
2938 dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
2939 ST4x2_UB(dst10, dst, dst_stride);
2942 static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
2945 int32_t src2_stride,
2948 const int8_t *filter,
2951 v16i8 src0, src1, src2, src3, src4, src5, src6;
2952 v8i16 in0, in1, in2, in3;
2953 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2954 v16i8 src2110, src4332, src6554;
2957 v8i16 filter_vec, const_vec;
2959 src0_ptr -= src_stride;
2961 const_vec = __msa_ldi_h(128);
2964 filter_vec = LD_SH(filter);
2965 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2967 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2968 src0_ptr += (3 * src_stride);
2969 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2970 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2971 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2973 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
2974 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2975 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2976 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2977 src32_r, src43_r, src54_r, src65_r);
2978 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
2979 XORI_B2_128_SB(src4332, src6554);
2982 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2984 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2985 HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
2987 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
2988 ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
2991 static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
2994 int32_t src2_stride,
2997 const int8_t *filter,
3001 v16i8 src0, src1, src2, src3, src4, src5;
3002 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3003 v16i8 src6, src7, src8, src9;
3004 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3005 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3006 v16i8 src2110, src4332, src6554, src8776;
3007 v8i16 dst10, dst32, dst54, dst76;
3009 v8i16 filter_vec, const_vec;
3011 src0_ptr -= src_stride;
3013 const_vec = __msa_ldi_h(128);
3016 filter_vec = LD_SH(filter);
3017 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3019 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3020 src0_ptr += (3 * src_stride);
3021 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3022 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3023 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3025 for (loop_cnt = (height >> 3); loop_cnt--;) {
3026 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3027 src0_ptr += (6 * src_stride);
3028 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3029 src1_ptr += (8 * src2_stride);
3030 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3031 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3032 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3033 src32_r, src43_r, src54_r, src65_r);
3034 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3035 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3036 src4332, src6554, src8776);
3037 XORI_B3_128_SB(src4332, src6554, src8776);
3040 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3042 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3044 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
3046 LD_SB2(src0_ptr, src_stride, src9, src2);
3047 src0_ptr += (2 * src_stride);
3048 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3049 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3050 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3052 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
3054 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3055 dst10, dst32, dst54, dst76, 7,
3056 dst10, dst32, dst54, dst76);
3058 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
3059 ST4x8_UB(dst10, dst54, dst, dst_stride);
3060 dst += (8 * dst_stride);
3064 static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
3067 int32_t src2_stride,
3070 const int8_t *filter,
3074 hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3075 dst, dst_stride, filter, height);
3076 } else if (4 == height) {
3077 hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3078 dst, dst_stride, filter, height);
3080 hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
3081 src1_ptr, src2_stride,
3082 dst, dst_stride, filter, height);
3086 static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
3089 int32_t src2_stride,
3092 const int8_t *filter,
3095 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3096 v8i16 in0, in1, in2, in3;
3097 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3098 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3100 v8i16 filter_vec, const_vec;
3102 src0_ptr -= src_stride;
3104 const_vec = __msa_ldi_h(128);
3107 filter_vec = LD_SH(filter);
3108 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3110 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3111 src0_ptr += (3 * src_stride);
3112 LD_SB2(src0_ptr, src_stride, src3, src4);
3113 src0_ptr += (2 * src_stride);
3114 LD_SB2(src0_ptr, src_stride, src5, src6);
3115 src0_ptr += (2 * src_stride);
3116 LD_SB2(src0_ptr, src_stride, src7, src8);
3117 src0_ptr += (2 * src_stride);
3118 LD_SB2(src0_ptr, src_stride, src9, src10);
3119 src0_ptr += (2 * src_stride);
3121 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3122 src1_ptr += (4 * src2_stride);
3124 XORI_B3_128_SB(src0, src1, src2);
3125 XORI_B2_128_SB(src3, src4);
3126 XORI_B2_128_SB(src5, src6);
3127 XORI_B2_128_SB(src7, src8);
3128 XORI_B2_128_SB(src9, src10);
3130 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3131 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3134 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3136 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3138 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3141 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3143 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3145 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3146 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3147 dst0_r, dst1_r, dst2_r, dst3_r);
3149 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3150 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3151 dst += (4 * dst_stride);
3153 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3154 src1_ptr += (4 * src2_stride);
3155 ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
3158 DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3160 DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3162 ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
3165 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3167 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3169 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3170 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3171 dst0_r, dst1_r, dst2_r, dst3_r);
3173 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3174 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3175 dst += (4 * dst_stride);
3178 static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
3181 int32_t src2_stride,
3184 const int8_t *filter,
3187 v16i8 src0, src1, src2, src3, src4;
3188 v8i16 in0, in1, dst0_r, dst1_r;
3189 v16i8 src10_r, src32_r, src21_r, src43_r;
3191 v8i16 filter_vec, const_vec;
3193 src0_ptr -= src_stride;
3195 const_vec = __msa_ldi_h(128);
3198 filter_vec = LD_SH(filter);
3199 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3201 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3202 src0_ptr += (3 * src_stride);
3203 XORI_B3_128_SB(src0, src1, src2);
3204 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3206 LD_SB2(src0_ptr, src_stride, src3, src4);
3207 LD_SH2(src1_ptr, src2_stride, in0, in1);
3208 XORI_B2_128_SB(src3, src4);
3209 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3212 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3214 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3216 HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
3217 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3219 ST8x2_UB(dst0_r, dst, dst_stride);
3222 static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
3225 int32_t src2_stride,
3228 const int8_t *filter,
3231 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3232 v8i16 in0, in1, in2, in3, in4, in5;
3233 v16i8 src10_r, src32_r, src54_r, src76_r;
3234 v16i8 src21_r, src43_r, src65_r, src87_r;
3235 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3237 v8i16 filter_vec, const_vec;
3239 src0_ptr -= src_stride;
3241 const_vec = __msa_ldi_h(128);
3244 filter_vec = LD_SH(filter);
3245 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3247 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3248 src0_ptr += (3 * src_stride);
3249 XORI_B3_128_SB(src0, src1, src2);
3250 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3252 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3253 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3254 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3255 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3256 src32_r, src43_r, src54_r, src65_r);
3257 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3260 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3262 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3264 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3266 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3268 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3270 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
3271 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3272 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3273 dst0_r, dst1_r, dst2_r, dst3_r);
3274 HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
3276 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3277 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
3278 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3279 dst += (4 * dst_stride);
3280 ST8x2_UB(dst2_r, dst, dst_stride);
3283 static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
3286 int32_t src2_stride,
3289 const int8_t *filter,
3293 v16i8 src0, src1, src2, src3, src4, src5;
3294 v8i16 in0, in1, in2, in3;
3295 v16i8 src10_r, src32_r, src21_r, src43_r;
3296 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3298 v8i16 filter_vec, const_vec;
3300 src0_ptr -= src_stride;
3302 const_vec = __msa_ldi_h(128);
3305 filter_vec = LD_SH(filter);
3306 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3308 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3309 src0_ptr += (3 * src_stride);
3310 XORI_B3_128_SB(src0, src1, src2);
3311 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3313 for (loop_cnt = (height >> 2); loop_cnt--;) {
3314 LD_SB2(src0_ptr, src_stride, src3, src4);
3315 src0_ptr += (2 * src_stride);
3316 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3317 src1_ptr += (4 * src2_stride);
3318 XORI_B2_128_SB(src3, src4);
3319 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3322 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3324 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3326 LD_SB2(src0_ptr, src_stride, src5, src2);
3327 src0_ptr += (2 * src_stride);
3328 XORI_B2_128_SB(src5, src2);
3329 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3332 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3334 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
3335 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3336 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3337 dst0_r, dst1_r, dst2_r, dst3_r);
3339 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3340 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3341 dst += (4 * dst_stride);
3345 static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
3348 int32_t src2_stride,
3351 const int8_t *filter,
3355 hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3356 dst, dst_stride, filter, height);
3357 } else if (6 == height) {
3358 hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3359 dst, dst_stride, filter, height);
3361 hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
3362 src1_ptr, src2_stride,
3363 dst, dst_stride, filter, height);
3367 static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
3370 int32_t src2_stride,
3373 const int8_t *filter,
3377 v16i8 src0, src1, src2, src3, src4, src5, src6;
3378 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3379 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3380 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3381 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3382 v16i8 src2110, src4332, src6554;
3383 v8i16 dst0_l, dst1_l, filt0, filt1;
3384 v8i16 filter_vec, const_vec;
3386 src0_ptr -= (1 * src_stride);
3388 const_vec = __msa_ldi_h(128);
3391 filter_vec = LD_SH(filter);
3392 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3394 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3395 src0_ptr += (3 * src_stride);
3396 XORI_B3_128_SB(src0, src1, src2);
3397 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3398 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3399 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3401 for (loop_cnt = (height >> 2); loop_cnt--;) {
3402 LD_SB2(src0_ptr, src_stride, src3, src4);
3403 src0_ptr += (2 * src_stride);
3404 LD_SB2(src0_ptr, src_stride, src5, src6);
3405 src0_ptr += (2 * src_stride);
3406 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3407 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3408 src1_ptr += (4 * src2_stride);
3409 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3410 XORI_B2_128_SB(src3, src4);
3411 XORI_B2_128_SB(src5, src6);
3413 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3414 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3415 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3416 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3417 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
3418 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3421 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3423 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3425 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
3427 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3429 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3431 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
3432 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3433 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3434 dst0_r, dst1_r, dst2_r, dst3_r);
3435 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
3437 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3438 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3439 ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
3440 dst += (4 * dst_stride);
/* Vertical 4-tap bi-prediction filter, 16 pels wide (MIPS MSA).
 * Filters 8-bit pels from src0_ptr vertically with the two taps splatted
 * into filt0/filt1, adds the co-located 16-bit values from src1_ptr (the
 * second, pre-filtered reference of the bi-prediction pair), then rounds by
 * 7 bits and clips to 0..255 via HEVC_BI_RND_CLIP4 before storing.
 * Processes 4 output rows per loop iteration (height >> 2), 2 rows per
 * half-iteration, with the three most recent source rows kept live in the
 * interleaved *_r / *_l register pairs.
 * NOTE(review): this listing appears lossy — the per-row
 * `dstX = const_vec;` accumulator initialisers and the closing braces seem
 * to have been dropped by extraction; verify against the repository copy. */
3449 static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
3452                                   int32_t src2_stride,
3455                                   const int8_t *filter,
3459     v16i8 src0, src1, src2, src3, src4, src5;
3460     v8i16 in0, in1, in2, in3;
3461     v16i8 src10_r, src32_r, src21_r, src43_r;
3462     v16i8 src10_l, src32_l, src21_l, src43_l;
3463     v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3465     v8i16 filter_vec, const_vec;
/* Back up one row so the 4-tap window is centred on the output row. */
3467     src0_ptr -= src_stride;
/* Bias used for the bi-prediction accumulators (presumably added into the
 * dstX seeds on lines missing from this listing — TODO confirm). */
3469     const_vec = __msa_ldi_h(128);
3472     filter_vec = LD_SH(filter);
3473     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue: load the first 3 rows and build the vertical interleaves. */
3475     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3476     src0_ptr += (3 * src_stride);
/* XOR with 128 flips the MSB so the unsigned pels can feed signed
 * dot-products (DPADD_SB2_SH). */
3477     XORI_B3_128_SB(src0, src1, src2);
3478     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3479     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3481     for (loop_cnt = (height >> 2); loop_cnt--;) {
/* First pair of output rows. */
3482         LD_SB2(src0_ptr, src_stride, src3, src4);
3483         src0_ptr += (2 * src_stride);
/* Second reference: 16 pels per row loaded as two 8-lane halves. */
3484         LD_SH2(src1_ptr, src2_stride, in0, in1);
3485         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3486         src1_ptr += (2 * src2_stride);
3487         XORI_B2_128_SB(src3, src4);
3488         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3489         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
/* 4-tap vertical filter: two dot-product/accumulate steps per row half. */
3492         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3494         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3496         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3498         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* Add 2nd-ref samples, round by 7, clip to 8-bit range. */
3499         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3500                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3501                           dst0_r, dst1_r, dst0_l, dst1_l);
3503         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3504         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3505         dst += (2 * dst_stride);
/* Second pair of output rows; note src2 is recycled as the newest row so
 * the register rotation wraps around for the next loop iteration. */
3507         LD_SB2(src0_ptr, src_stride, src5, src2);
3508         src0_ptr += (2 * src_stride);
3509         LD_SH2(src1_ptr, src2_stride, in0, in1);
3510         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3511         src1_ptr += (2 * src2_stride);
3512         XORI_B2_128_SB(src5, src2);
3513         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3514         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3517         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3519         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3521         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3523         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3524         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3525                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3526                           dst0_r, dst1_r, dst0_l, dst1_l);
3528         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3529         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3530         dst += (2 * dst_stride);
/* Vertical 4-tap bi-prediction filter, 24 pels wide (MIPS MSA).
 * Handled as a full 16-wide column (right+left interleaves, like the 16w
 * variant) plus an extra 8-wide column at offset +16 that only needs the
 * right-half interleaves (src76_r/src87_r/...). Second-reference samples
 * come from src1_ptr at offsets 0, +8 and +16. Four output rows per loop
 * iteration, two per half-iteration.
 * NOTE(review): listing appears lossy (missing `dstX = const_vec;`
 * initialisers and closing braces) — verify against the repository copy. */
3534 static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
3537                                   int32_t src2_stride,
3540                                   const int8_t *filter,
3544     v16i8 src0, src1, src2, src3, src4, src5;
3545     v16i8 src6, src7, src8, src9, src10, src11;
3546     v8i16 in0, in1, in2, in3, in4, in5;
3547     v16i8 src10_r, src32_r, src76_r, src98_r;
3548     v16i8 src21_r, src43_r, src87_r, src109_r;
3549     v16i8 src10_l, src32_l, src21_l, src43_l;
3550     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3551     v8i16 dst0_l, dst1_l;
3553     v8i16 filter_vec, const_vec;
/* Centre the 4-tap window on the output row. */
3555     src0_ptr -= src_stride;
3557     const_vec = __msa_ldi_h(128);
3560     filter_vec = LD_SH(filter);
3561     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue for the left 16-wide column. */
3564     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3565     XORI_B3_128_SB(src0, src1, src2);
3566     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3567     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* Prologue for the right 8-wide column (bytes 16..23). */
3569     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3570     src0_ptr += (3 * src_stride);
3571     XORI_B3_128_SB(src6, src7, src8);
3572     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3574     for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Rows 0-1 of this iteration: left column. */
3576         LD_SB2(src0_ptr, src_stride, src3, src4);
3577         LD_SH2(src1_ptr, src2_stride, in0, in1);
3578         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3579         LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3580         src1_ptr += (2 * src2_stride);
3581         XORI_B2_128_SB(src3, src4);
3582         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3583         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
/* Rows 0-1: right 8-wide column. */
3585         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3586         src0_ptr += (2 * src_stride);
3587         XORI_B2_128_SB(src9, src10);
3588         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
/* 4-tap vertical filter, left column (both halves). */
3591         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3593         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3595         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3597         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* 4-tap vertical filter, right column. */
3600         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3602         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
/* Bi-pred combine: add 2nd reference, round by 7, clip to 0..255. */
3604         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3605                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3606                           dst0_r, dst1_r, dst0_l, dst1_l);
3608         HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3610         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3611         dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
/* Store 16 pels then the extra 8 pels at +16. */
3612         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3613         ST8x2_UB(dst2_r, dst + 16, dst_stride);
3614         dst += (2 * dst_stride);
/* Rows 2-3: same flow with the register roles rotated (src2/src8 become
 * the newest rows so the next iteration's prologue state is ready). */
3617         LD_SB2(src0_ptr, src_stride, src5, src2);
3618         LD_SH2(src1_ptr, src2_stride, in0, in1);
3619         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3620         LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3621         src1_ptr += (2 * src2_stride);
3622         XORI_B2_128_SB(src5, src2);
3623         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3624         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3626         LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3627         src0_ptr += (2 * src_stride);
3628         XORI_B2_128_SB(src11, src8);
3629         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3632         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3634         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3636         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3638         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3641         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3643         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3645         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3646                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3647                           dst0_r, dst1_r, dst0_l, dst1_l);
3648         HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3650         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3651         dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3652         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3653         ST8x2_UB(dst2_r, dst + 16, dst_stride);
3654         dst += (2 * dst_stride);
/* Vertical 4-tap bi-prediction filter, 32 pels wide (MIPS MSA).
 * Split into two independent 16-wide columns: the left column is stored
 * through dst, the right column (offset +16) through dst_tmp. Two output
 * rows per loop iteration (height >> 1); second-reference samples are read
 * from src1_ptr at offsets 0, +8, +16 and +24.
 * NOTE(review): listing appears lossy (missing `dstX = const_vec;`
 * initialisers and closing braces) — verify against the repository copy. */
3658 static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
3661                                   int32_t src2_stride,
3664                                   const int8_t *filter,
/* Separate store cursor for the right 16-wide column. */
3668     uint8_t *dst_tmp = dst + 16;
3669     v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3670     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3671     v16i8 src10_r, src32_r, src76_r, src98_r;
3672     v16i8 src21_r, src43_r, src87_r, src109_r;
3673     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3674     v16i8 src10_l, src32_l, src76_l, src98_l;
3675     v16i8 src21_l, src43_l, src87_l, src109_l;
3676     v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3678     v8i16 filter_vec, const_vec;
3680     src0_ptr -= src_stride;
3682     const_vec = __msa_ldi_h(128);
3685     filter_vec = LD_SH(filter);
3686     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue, left column: 3 rows, sign-flip, vertical interleaves. */
3689     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3690     XORI_B3_128_SB(src0, src1, src2);
3691     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3692     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* Prologue, right column (bytes 16..31). */
3695     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3696     src0_ptr += (3 * src_stride);
3697     XORI_B3_128_SB(src6, src7, src8);
3698     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3699     ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3701     for (loop_cnt = (height >> 1); loop_cnt--;) {
/* Left column: 2 new rows + 2nd-reference rows. */
3703         LD_SB2(src0_ptr, src_stride, src3, src4);
3704         LD_SH2(src1_ptr, src2_stride, in0, in1);
3705         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3706         LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3707         LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3708         src1_ptr += (2 * src2_stride);
3709         XORI_B2_128_SB(src3, src4);
3710         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3711         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
/* Left column: 4-tap vertical filter on both 8-lane halves. */
3714         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3716         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3718         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3720         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* Bi-pred combine, round by 7, clip, pack and store left 16 pels. */
3722         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3723                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3724                           dst0_r, dst1_r, dst0_l, dst1_l);
3732         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3733         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3734         dst += (2 * dst_stride);
/* Right column: same pipeline on bytes 16..31. */
3737         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3738         src0_ptr += (2 * src_stride);
3739         XORI_B2_128_SB(src9, src10);
3740         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3741         ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3744         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3746         DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3748         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3750         DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3752         HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
3753                           dst2_r, dst3_r, dst2_l, dst3_l, 7,
3754                           dst2_r, dst3_r, dst2_l, dst3_l);
3756         PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3757         ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3758         dst_tmp += (2 * dst_stride);
/* 2-D (horizontal + vertical) 4-tap bi-prediction filter for a 4x2 block
 * (MIPS MSA). Horizontal pass: VSHF_B builds sliding 2-pel pairs which
 * HEVC_FILT_4TAP_SH reduces with filt0/filt1. Vertical pass: the
 * intermediate rows are interleaved and filtered with filt_h0/filt_h1.
 * The 16-bit second-reference rows (src1_ptr) are biased by const_vec
 * (128), added in, rounded by 7 and clipped to 0..255.
 * NOTE(review): listing appears lossy (e.g. the SRA_4V shift of
 * dst0/dst1 between the vertical filter and the pack seems to be missing
 * here) — verify against the repository copy. */
3768 static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
3771                                   int32_t src2_stride,
3774                                   const int8_t *filter_x,
3775                                   const int8_t *filter_y)
3780     v16i8 src0, src1, src2, src3, src4;
3782     v8i16 filt_h0, filt_h1;
/* Second 16-byte mask pairs lanes of two different rows (4-wide case). */
3783     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3785     v8i16 filter_vec, const_vec;
3786     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3787     v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
/* Step back one row and one column so the 4-tap windows are centred. */
3790     src0_ptr -= (src_stride + 1);
3792     filter_vec = LD_SH(filter_x);
3793     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3795     filter_vec = LD_SH(filter_y);
/* Widen the 8-bit vertical taps to 16 bits before splatting. */
3796     UNPCK_R_SB_SH(filter_vec, filter_vec);
3798     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3802     const_vec = __msa_ldi_h(128);
/* 2 output rows need 5 input rows for the 4-tap vertical window. */
3805     LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
3806     XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Two 4-pel second-reference rows packed into one v8i16. */
3808     LD2(src1_ptr, src2_stride, tp0, tp1);
3809     INSERT_D2_SH(tp0, tp1, in0);
3810     in0 = __msa_adds_s_h(in0, const_vec);
/* Horizontal pass: each vector filters two source rows at once. */
3812     VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3813     VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3814     VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3816     dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3817     dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3818     dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
/* Vertical pass on the interleaved intermediates. */
3820     ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3821     ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3823     dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3824     dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
/* Bi-pred combine: add 2nd reference, round by 7, clip, store 4x2. */
3827     tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3828     tmp = __msa_adds_s_h(tmp, in0);
3829     tmp = __msa_srari_h(tmp, 7);
3830     tmp = CLIP_SH_0_255_MAX_SATU(tmp);
3831     out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
3832     ST4x2_UB(out, dst, dst_stride);
/* 2-D (horizontal + vertical) 4-tap bi-prediction filter for a 4x4 block
 * (MIPS MSA). Same scheme as the 4x2 variant: a horizontal 4-tap pass over
 * 7 input rows (paired two rows per vector via the +16 mask), a vertical
 * 4-tap pass over the interleaved intermediates, then add the biased
 * second-reference rows, round by 7, clip to 0..255 and store 4 rows of 4.
 * NOTE(review): listing appears lossy (closing brace and some blank/decl
 * lines dropped) — verify against the repository copy. */
3835 static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
3838                                   int32_t src2_stride,
3841                                   const int8_t *filter_x,
3842                                   const int8_t *filter_y)
3846     v16i8 src0, src1, src2, src3, src4, src5, src6;
3848     v8i16 filt_h0, filt_h1;
3849     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3851     v8i16 filter_vec, const_vec;
3852     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3854     v8i16 in0 = { 0 }, in1 = { 0 };
3855     v8i16 dst30, dst41, dst52, dst63;
3856     v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3857     v4i32 dst0, dst1, dst2, dst3;
/* Centre the two 4-tap windows (one row up, one column left). */
3859     src0_ptr -= (src_stride + 1);
3861     filter_vec = LD_SH(filter_x);
3862     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3864     filter_vec = LD_SH(filter_y);
3865     UNPCK_R_SB_SH(filter_vec, filter_vec);
3867     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* 4 output rows need 7 input rows. */
3871     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
3872     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3874     const_vec = __msa_ldi_h(128);
/* Gather 4 second-reference rows (two 4-pel rows per v8i16). */
3877     LD2(src1_ptr, src2_stride, tp0, tp1);
3878     src1_ptr += 2 * src2_stride;
3879     INSERT_D2_SH(tp0, tp1, in0);
3880     LD2(src1_ptr, src2_stride, tp0, tp1);
3881     INSERT_D2_SH(tp0, tp1, in1);
3883     ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
/* Horizontal pass, two rows per vector (rows i and i+3 paired). */
3885     VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3886     VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3887     VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3888     VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3890     dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3891     dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3892     dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3893     dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Vertical pass over the interleaved horizontal results. */
3895     ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3896     ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3897     ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3898     dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3899     dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3900     dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3901     dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
/* Drop the intermediate 6-bit headroom before packing to 16 bit. */
3902     SRA_4V(dst0, dst1, dst2, dst3, 6);
3903     PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
/* Bi-pred combine: add 2nd reference, round by 7, clip, store 4x4. */
3904     ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
3905     SRARI_H2_SH(tmp0, tmp1, 7);
3906     CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
3907     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3908     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* 2-D 4-tap bi-prediction filter for 4-wide blocks whose height is a
 * multiple of 8 (MIPS MSA). Keeps the vertical 4-tap state (the last two
 * interleaved horizontal results) across iterations and produces 8 output
 * rows per loop pass: horizontal pass over 8 new rows (paired via the +16
 * mask), vertical pass, then add the biased second-reference rows, round
 * by 7, clip to 0..255 and store 4x8.
 * NOTE(review): listing appears lossy (closing braces and blank lines
 * dropped) — verify against the repository copy. */
3911 static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
3914                                           int32_t src2_stride,
3917                                           const int8_t *filter_x,
3918                                           const int8_t *filter_y,
3924     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3926     v8i16 filt_h0, filt_h1;
3927     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3929     v8i16 filter_vec, const_vec;
3930     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3931     v8i16 tmp0, tmp1, tmp2, tmp3;
3932     v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3933     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3934     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3935     v8i16 dst98_r, dst109_r;
3936     v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
3937     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
/* Centre both 4-tap windows. */
3939     src0_ptr -= (src_stride + 1);
3941     filter_vec = LD_SH(filter_x);
3942     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3944     filter_vec = LD_SH(filter_y);
3945     UNPCK_R_SB_SH(filter_vec, filter_vec);
3947     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3951     const_vec = __msa_ldi_h(128);
/* Prologue: horizontal-filter the first 3 rows to seed the vertical
 * state (dst10_r / dst21_r, plus dst22 = upper half of dst21). */
3954     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3955     src0_ptr += (3 * src_stride);
3956     XORI_B3_128_SB(src0, src1, src2);
3958     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3959     VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3960     dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3961     dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3962     ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3963     dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3966     for (loop_cnt = height >> 3; loop_cnt--;) {
3967         LD_SB8(src0_ptr, src_stride,
3968                src3, src4, src5, src6, src7, src8, src9, src10);
3969         src0_ptr += (8 * src_stride);
3970         XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
/* Horizontal pass: rows i and i+4 are paired in one vector. */
3971         VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3972         VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3973         VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3974         VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3976         dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3977         dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3978         dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3979         dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Rebuild the sliding vertical-pair chain from the packed results. */
3981         dst32_r = __msa_ilvr_h(dst73, dst22);
3982         ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3983         ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3984         ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3985         dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3986         dst76_r = __msa_ilvr_h(dst22, dst106);
/* Load 8 second-reference rows (two 4-pel rows per v8i16) and bias. */
3988         LD2(src1_ptr, src2_stride, tp0, tp1);
3989         src1_ptr += 2 * src2_stride;
3990         INSERT_D2_SH(tp0, tp1, in0);
3991         LD2(src1_ptr, src2_stride, tp0, tp1);
3992         src1_ptr += 2 * src2_stride;
3993         INSERT_D2_SH(tp0, tp1, in1);
3995         LD2(src1_ptr, src2_stride, tp0, tp1);
3996         src1_ptr += 2 * src2_stride;
3997         INSERT_D2_SH(tp0, tp1, in2);
3998         LD2(src1_ptr, src2_stride, tp0, tp1);
3999         src1_ptr += 2 * src2_stride;
4000         INSERT_D2_SH(tp0, tp1, in3);
4002         ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4003                     const_vec, in0, in1, in2, in3);
/* Vertical 4-tap pass: one result vector per output row. */
4004         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4005         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4006         dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4007         dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4008         dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4009         dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4010         dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4011         dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
/* Drop 6-bit headroom, pack, add 2nd reference, round/clip, store. */
4012         SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4013         SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4014         PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
4015                     dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4016         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
4018         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4019         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4020         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4021         ST4x8_UB(out0, out1, dst, dst_stride);
4022         dst += (8 * dst_stride);
/* Carry the newest horizontal result into the next iteration's state. */
4026         dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Dispatcher for 4-wide 2-D (hv) 4-tap bi-prediction: picks the
 * height-specialised kernel — 4x2, 4x4, or the generic multiple-of-8
 * variant. Heights outside {2, 4, 8n} fall through with no work, which
 * matches the HEVC block sizes this is registered for (presumably — TODO
 * confirm against the dsp-init table).
 * NOTE(review): listing appears lossy (parts of the parameter list, the
 * opening `if (2 == height)` and braces are missing) — verify against the
 * repository copy. */
4030 static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
4033                                  int32_t src2_stride,
4036                                  const int8_t *filter_x,
4037                                  const int8_t *filter_y,
4041         hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4042                               dst, dst_stride, filter_x, filter_y);
4043     } else if (4 == height) {
4044         hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4045                               dst, dst_stride, filter_x, filter_y);
4046     } else if (0 == (height % 8)) {
4047         hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
4048                                       src1_ptr, src2_stride,
4050                                       filter_x, filter_y, height);
/* 2-D 4-tap bi-prediction filter for 6-wide blocks (MIPS MSA).
 * Computes a full 8-wide 2-D filter for 8 rows, then stores the result as
 * a 4-wide left part (ST4x8 from tmp0..tmp3) plus a 2-wide right part
 * (ST2x4 pairs from tmp4/tmp5, written at dst + 4). The right part's
 * second-reference samples are fetched word-wise at src1_ptr + 4.
 * Uses the first ff_hevc_mask_arr mask (single-row shuffles), unlike the
 * 4-wide variants. Height appears fixed at 8 rows per call — TODO confirm
 * from the (not visible) caller.
 * NOTE(review): listing appears lossy (loop/brace lines dropped) — verify
 * against the repository copy. */
4054 static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
4057                                  int32_t src2_stride,
4060                                  const int8_t *filter_x,
4061                                  const int8_t *filter_y,
4064     uint32_t tpw0, tpw1, tpw2, tpw3;
4066     v16u8 out0, out1, out2;
4067     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4068     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4070     v8i16 filt_h0, filt_h1;
4071     v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4073     v8i16 filter_vec, const_vec;
4074     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4075     v8i16 dsth10, tmp4, tmp5;
4076     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4077     v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4078     v8i16 tmp0, tmp1, tmp2, tmp3;
4079     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4080     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4081     v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
4082     v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
4083     v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4084     v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4085     v8i16 in4 = { 0 }, in5 = { 0 };
/* Centre both 4-tap windows. */
4087     src0_ptr -= (src_stride + 1);
4089     filter_vec = LD_SH(filter_x);
4090     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4092     filter_vec = LD_SH(filter_y);
4093     UNPCK_R_SB_SH(filter_vec, filter_vec);
4095     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4099     const_vec = __msa_ldi_h(128);
/* Prologue: horizontal-filter the first 3 rows. */
4102     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4103     src0_ptr += (3 * src_stride);
4104     XORI_B3_128_SB(src0, src1, src2);
4106     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4107     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4108     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4110     dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4111     dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4112     dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4114     ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4115     ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
/* Horizontal pass over the remaining 8 rows. */
4117     LD_SB8(src0_ptr, src_stride,
4118            src3, src4, src5, src6, src7, src8, src9, src10);
4119     XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4121     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4122     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4123     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4124     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4126     dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4127     dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4128     dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4129     dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4131     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4132     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4133     VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4134     VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4136     dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4137     dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4138     dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4139     dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Build vertical pairs; _r covers pels 0..3, _l covers pels 4..7
 * (only pels 4..5 of the _l results are eventually stored). */
4141     ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4142     ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4143     ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4144     ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4145     ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4146     ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4147     ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4148     ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
/* Pack the _l halves two rows per vector for the 2-wide column. */
4149     PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4150     PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4151     dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
/* Vertical 4-tap pass: 8 row results for the left 4 pels, 4 doubled-up
 * results for the right 2 pels. */
4153     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4154     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4155     dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4156     dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4157     dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4158     dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4159     dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4160     dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4161     dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4162     dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4163     dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4164     dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
/* Drop 6-bit headroom and pack to 16-bit rows. */
4165     SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4166     SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4167     SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4168     PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4169     PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4170     PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
/* Second reference, left 4 pels: 8 rows, two per v8i16, biased by 128. */
4172     LD2(src1_ptr, src2_stride, tp0, tp1);
4173     INSERT_D2_SH(tp0, tp1, in0);
4174     LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4175     INSERT_D2_SH(tp0, tp1, in1);
4177     LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4178     INSERT_D2_SH(tp0, tp1, in2);
4179     LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4180     INSERT_D2_SH(tp0, tp1, in3);
4182     ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4183                 in0, in1, in2, in3);
4184     ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
4186     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4187     CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4188     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
/* Store the left 4-wide column, 8 rows. */
4189     ST4x8_UB(out0, out1, dst, dst_stride);
/* Second reference, right 2 pels: word loads at column offset +4. */
4191     LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4192     src1_ptr += (4 * src2_stride);
4193     INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
4194     LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4195     INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
4196     ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4197     ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
4198     SRARI_H2_SH(tmp4, tmp5, 7);
4199     CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
4200     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
/* Store the right 2-wide column, 4 rows at a time. */
4201     ST2x4_UB(out2, 0, dst + 4, dst_stride);
4202     dst += 4 * dst_stride;
4203     ST2x4_UB(out2, 4, dst + 4, dst_stride);
/* 2-D 4-tap bi-prediction filter for an 8x2 block (MIPS MSA).
 * Horizontal 4-tap pass on 5 input rows (one row per vector pair, using
 * the first ff_hevc_mask_arr mask), vertical 4-tap pass on the interleaved
 * intermediates (right and left halves separately for 8-wide data), then
 * add the biased second-reference rows, round by 7, clip to 0..255 and
 * store 8x2.
 * NOTE(review): listing appears lossy (some declaration and brace lines
 * dropped) — verify against the repository copy. */
4206 static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
4209                                   int32_t src2_stride,
4212                                   const int8_t *filter_x,
4213                                   const int8_t *filter_y)
4216     v16i8 src0, src1, src2, src3, src4;
4218     v8i16 filt_h0, filt_h1;
4219     v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4221     v8i16 filter_vec, const_vec;
4222     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4223     v8i16 dst0, dst1, dst2, dst3, dst4;
4224     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4225     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4226     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* Centre both 4-tap windows. */
4230     src0_ptr -= (src_stride + 1);
4232     filter_vec = LD_SH(filter_x);
4233     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4235     filter_vec = LD_SH(filter_y);
4236     UNPCK_R_SB_SH(filter_vec, filter_vec);
4238     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4242     const_vec = __msa_ldi_h(128);
/* 2 output rows need 5 input rows. */
4245     LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4246     XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Second reference: two 8-pel rows, biased by 128. */
4248     LD_SH2(src1_ptr, src2_stride, in0, in1);
4249     ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
/* Horizontal pass: one row per shuffle pair. */
4251     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4252     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4253     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4254     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4255     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4257     dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4258     dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4259     dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4260     dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4261     dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
/* Vertical pass on right/left interleaved halves. */
4263     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4264     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4265     ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4266     ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4267     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4268     dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4269     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4270     dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* Drop 6-bit headroom, pack, bi-pred combine, round/clip, store 8x2. */
4271     SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4272     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4273     ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
4274     SRARI_H2_SH(tmp0, tmp1, 7);
4275     CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
4276     out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4277     ST8x2_UB(out, dst, dst_stride);
/* 2-D 4-tap bi-prediction filter for blocks of width 8*width8mult and
 * height 4 (MIPS MSA). Iterates over 8-wide columns; per column:
 * horizontal 4-tap pass on 7 input rows, vertical 4-tap pass on the
 * right/left interleaved intermediates, add the biased second-reference
 * rows, round by 7, clip to 0..255 and store 8x4.
 * NOTE(review): listing appears lossy (per-column pointer advances at the
 * loop tail and closing braces seem to be missing) — verify against the
 * repository copy. */
4280 static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
4283                                       int32_t src2_stride,
4286                                       const int8_t *filter_x,
4287                                       const int8_t *filter_y,
4292     v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
4293     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4294     v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4295     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4296     v8i16 in0, in1, in2, in3;
4297     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4298     v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4299     v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
/* Centre both 4-tap windows. */
4301     src0_ptr -= (src_stride + 1);
4303     filter_vec = LD_SH(filter_x);
4304     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4306     filter_vec = LD_SH(filter_y);
4307     UNPCK_R_SB_SH(filter_vec, filter_vec);
4309     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4311     mask0 = LD_SB(ff_hevc_mask_arr);
4314     const_vec = __msa_ldi_h(128);
/* One pass per 8-wide column. */
4317     for (cnt = width8mult; cnt--;) {
/* 4 output rows need 7 input rows. */
4318         LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
4320         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* Second reference: four 8-pel rows, biased by 128. */
4322         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4324         ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4325                     const_vec, in0, in1, in2, in3);
/* Horizontal pass, prologue rows 0-2. */
4327         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4328         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4329         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4331         dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4332         dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4333         dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4335         ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4336         ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* Horizontal pass, rows 3-6. */
4338         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4339         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4340         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4341         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4343         dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4344         dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4345         dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4346         dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4348         ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4349         ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4350         ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4351         ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
/* Vertical 4-tap pass, right and left halves per output row. */
4353         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4354         dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4355         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4356         dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4357         dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4358         dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4359         dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4360         dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* Drop 6-bit headroom, pack, bi-pred combine, round/clip, store 8x4. */
4362         SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4363         SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4364         PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4365                     dst3_r, tmp0, tmp1, tmp2, tmp3);
4366         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4367                     tmp0, tmp1, tmp2, tmp3);
4368         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4369         CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4370         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4371         ST8x4_UB(out0, out1, dst, dst_stride);
/* 4-tap horizontal + 4-tap vertical (HV) bi-predictive interpolation of one
 * 8x6 block, 8-bit HEVC, MIPS MSA.
 * src0_ptr/src_stride: 8-bit reference pixels; src1_ptr/src2_stride: 16-bit
 * co-located bi-prediction samples; filter_x/filter_y: 4-tap coefficient sets.
 * The whole 8x6 block is produced in a single straight-line pass (no loops). */
4376 static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
4379 int32_t src2_stride,
4382 const int8_t *filter_x,
4383 const int8_t *filter_y)
4385 v16u8 out0, out1, out2;
4386 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4387 v8i16 in0, in1, in2, in3, in4, in5;
4389 v8i16 filt_h0, filt_h1;
4390 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4392 v8i16 filter_vec, const_vec;
4393 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4394 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4395 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4396 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4397 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4398 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4399 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4400 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4401 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4402 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
/* step back one row and one column so the 4-tap window is centred on the
 * first output sample */
4404 src0_ptr -= (src_stride + 1);
/* splat the two 16-bit horizontal coefficient pairs ... */
4406 filter_vec = LD_SH(filter_x);
4407 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* ... and the two 32-bit vertical coefficient pairs (sign-extended first) */
4409 filter_vec = LD_SH(filter_y);
4410 UNPCK_R_SB_SH(filter_vec, filter_vec);
4412 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* bias of 128 added to every 16-bit bi-pred input sample before rounding */
4416 const_vec = __msa_ldi_h(128);
/* 6 output rows with a 4-tap vertical filter need 6 + 3 = 9 source rows */
4419 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4420 src0_ptr += (5 * src_stride);
4421 LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
/* flip the sign bit (xor 128): treat unsigned pixels as signed for dotp */
4423 XORI_B5_128_SB(src0, src1, src2, src3, src4);
4424 XORI_B4_128_SB(src5, src6, src7, src8);
/* load the 6 rows of 16-bit bi-pred samples and pre-add the 128 bias */
4426 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4427 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4428 in0, in1, in2, in3);
4429 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
/* horizontal pass: mask0/mask1 gather the 4 taps for each of the 9 rows */
4431 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4432 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4433 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4434 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4435 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4436 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4437 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4438 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4439 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
4441 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4442 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4443 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4444 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4445 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4446 dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
4447 dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
4448 dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
4449 dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
/* interleave pairs of horizontally-filtered rows (low/high halves) so the
 * vertical pass can run as 32-bit dot products */
4451 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4452 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4453 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4454 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4455 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4456 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4457 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4458 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
/* vertical pass: one right/left pair of 32-bit results per output row */
4460 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4461 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4462 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4463 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4464 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4465 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4466 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4467 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4468 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4469 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4470 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4471 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
/* drop 6 bits of intermediate precision after the vertical filter */
4473 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4474 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4475 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
/* pack 32-bit results back to 16-bit rows */
4476 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4477 tmp0, tmp1, tmp2, tmp3);
4478 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
/* add the biased bi-pred samples, round with >> 7, clip to [0, 255] */
4479 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4480 tmp0, tmp1, tmp2, tmp3);
4481 ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
4482 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4483 SRARI_H2_SH(tmp4, tmp5, 7);
4484 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4485 CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
/* pack to bytes and store: 4 rows of 8, then the remaining 2 rows of 8 */
4486 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4487 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4488 ST8x4_UB(out0, out1, dst, dst_stride);
4489 dst += (4 * dst_stride);
4490 ST8x2_UB(out2, dst, dst_stride);
/* 4-tap HV bi-predictive interpolation for blocks whose width is a multiple
 * of 8 and whose height is a multiple of 4.
 * The outer loop walks 8-pixel-wide column strips (cnt = width >> 3); the
 * inner loop produces 4 output rows per iteration, carrying the last two
 * interleaved row pairs across iterations so each row is filtered once. */
4493 static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
4496 int32_t src2_stride,
4499 const int8_t *filter_x,
4500 const int8_t *filter_y,
4504 uint32_t loop_cnt, cnt;
4505 uint8_t *src0_ptr_tmp;
4506 int16_t *src1_ptr_tmp;
4509 v16i8 src0, src1, src2, src3, src4, src5, src6;
4510 v8i16 in0, in1, in2, in3;
4512 v8i16 filt_h0, filt_h1;
4513 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4515 v8i16 filter_vec, const_vec;
4516 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4517 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4518 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4519 v8i16 tmp0, tmp1, tmp2, tmp3;
4520 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4521 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4522 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
/* origin of the 4-tap HV window: one row and one column back */
4524 src0_ptr -= (src_stride + 1);
/* horizontal taps as 16-bit splats, vertical taps as 32-bit splats */
4526 filter_vec = LD_SH(filter_x);
4527 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4529 filter_vec = LD_SH(filter_y);
4530 UNPCK_R_SB_SH(filter_vec, filter_vec);
4532 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* bias of 128 added to every 16-bit bi-pred input sample */
4536 const_vec = __msa_ldi_h(128);
/* one outer iteration per 8-pixel-wide column strip */
4539 for (cnt = width >> 3; cnt--;) {
4540 src0_ptr_tmp = src0_ptr;
4542 src1_ptr_tmp = src1_ptr;
/* prologue: horizontally filter the first 3 rows to prime the vertical
 * filter's history (dst10_*, dst21_*) */
4544 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4545 src0_ptr_tmp += (3 * src_stride);
4546 XORI_B3_128_SB(src0, src1, src2);
4548 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4549 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4550 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4552 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4553 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4554 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4556 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4557 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* steady state: 4 new source rows -> 4 output rows per iteration */
4559 for (loop_cnt = height >> 2; loop_cnt--;) {
4560 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4561 src0_ptr_tmp += (4 * src_stride);
4562 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4563 src1_ptr_tmp += (4 * src2_stride);
4564 XORI_B4_128_SB(src3, src4, src5, src6);
/* pre-bias the 4 bi-pred rows */
4566 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4567 const_vec, in0, in1, in2, in3);
/* horizontal pass on the 4 new rows */
4569 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4570 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4571 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4572 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4574 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4575 dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4576 dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4577 dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4579 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4580 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4581 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4582 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
/* vertical pass for the 4 output rows */
4584 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4585 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4586 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4587 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4588 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4589 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4590 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4591 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* >> 6 intermediate scaling, pack to 16-bit, add bi-pred, round >> 7,
 * clip to [0, 255], pack to bytes and store 8x4 */
4593 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4594 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4595 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4596 dst3_r, tmp0, tmp1, tmp2, tmp3);
4597 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4598 tmp0, tmp1, tmp2, tmp3);
4599 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4600 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4601 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4602 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
4603 dst_tmp += (4 * dst_stride);
/* Width-8 HV bi-prediction dispatcher: routes to a straight-line kernel for
 * the small fixed heights (2, 4, 6) and to the generic multiple-of-4-rows
 * kernel for everything else. */
4618 static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
4621 int32_t src2_stride,
4624 const int8_t *filter_x,
4625 const int8_t *filter_y,
4629 hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4630 dst, dst_stride, filter_x, filter_y);
4631 } else if (4 == height) {
/* width multiplier 1 => a single 8-wide strip */
4632 hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4633 dst, dst_stride, filter_x, filter_y, 1);
4634 } else if (6 == height) {
4635 hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4636 dst, dst_stride, filter_x, filter_y);
/* generic path: width fixed at 8, any height that is a multiple of 4 */
4638 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
4639 src1_ptr, src2_stride,
4641 filter_x, filter_y, height, 8);
/* 4-tap HV bi-predictive interpolation for 12-pixel-wide blocks.
 * Split into two passes: first the left 8 columns with 8-wide vectors
 * (4 rows per iteration, 4 iterations), then the right 4 columns with
 * two rows packed per vector (8 rows per iteration, 2 iterations).
 * NOTE(review): both loop counts are fixed, which covers 16 rows —
 * confirm callers only use this path for height 16. */
4645 static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
4648 int32_t src2_stride,
4651 const int8_t *filter_x,
4652 const int8_t *filter_y,
4657 uint8_t *src0_ptr_tmp, *dst_tmp;
4658 int16_t *src1_ptr_tmp;
4660 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4661 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4662 v16i8 mask0, mask1, mask2, mask3;
4663 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4664 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
4665 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4666 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
4667 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4668 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4669 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4670 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4671 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* origin of the 4-tap HV window: one row and one column back */
4673 src0_ptr -= (src_stride + 1);
4675 filter_vec = LD_SH(filter_x);
4676 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4678 filter_vec = LD_SH(filter_y);
4679 UNPCK_R_SB_SH(filter_vec, filter_vec);
4681 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* mask0: single-row 4-tap gather pattern for the 8-wide pass */
4683 mask0 = LD_SB(ff_hevc_mask_arr);
/* bias of 128 added to every 16-bit bi-pred input sample */
4686 const_vec = __msa_ldi_h(128);
/* ---- pass 1: left 8 columns ---- */
4689 src0_ptr_tmp = src0_ptr;
4691 src1_ptr_tmp = src1_ptr;
/* prime the vertical filter with the first 3 horizontally-filtered rows */
4693 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4694 src0_ptr_tmp += (3 * src_stride);
4696 XORI_B3_128_SB(src0, src1, src2);
4698 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4699 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4700 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4702 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4703 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4704 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4706 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4707 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
/* 4 iterations x 4 rows for the 8-wide half */
4709 for (loop_cnt = 4; loop_cnt--;) {
4710 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4711 src0_ptr_tmp += (4 * src_stride);
4712 XORI_B4_128_SB(src3, src4, src5, src6);
4714 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4715 src1_ptr_tmp += (4 * src2_stride);
4716 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4717 const_vec, in0, in1, in2, in3);
4719 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4720 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4721 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4722 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4724 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4725 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4726 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4727 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4729 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4730 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4731 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4732 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
/* vertical 4-tap pass for 4 output rows */
4734 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4735 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4736 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4737 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4738 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4739 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4740 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4741 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* >> 6, pack, add bi-pred, round >> 7, clip, store 8x4 */
4743 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4744 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4745 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4746 dst3_r, tmp0, tmp1, tmp2, tmp3);
4747 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4748 tmp0, tmp1, tmp2, tmp3);
4749 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4750 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4751 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4752 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
4753 dst_tmp += (4 * dst_stride);
/* ---- pass 2: right 4 columns, two source rows packed per vector ----
 * mask2 gathers taps across a row pair (second table entry in
 * ff_hevc_mask_arr) */
4766 mask2 = LD_SB(ff_hevc_mask_arr + 16);
4769 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4770 src0_ptr += (3 * src_stride);
4771 XORI_B3_128_SB(src0, src1, src2);
4772 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4773 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4775 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4776 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4778 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
/* dst22: duplicate of row 2's filtered result, used to splice row pairs */
4779 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
/* 2 iterations x 8 rows for the 4-wide half */
4781 for (loop_cnt = 2; loop_cnt--;) {
4782 LD_SB8(src0_ptr, src_stride,
4783 src3, src4, src5, src6, src7, src8, src9, src10);
4784 src0_ptr += (8 * src_stride);
4785 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
/* each VSHF pairs two rows 4 apart, so one dotp filters both */
4786 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4787 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4788 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4789 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4791 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4792 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4793 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4794 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* rebuild consecutive-row interleaves from the packed row pairs */
4796 dst32_r = __msa_ilvr_h(dst73, dst22);
4797 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4798 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4799 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4800 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4801 dst76_r = __msa_ilvr_h(dst22, dst106);
/* gather 8 rows of 4 bi-pred samples as 64-bit loads into in0..in3 */
4803 LD2(src1_ptr, src2_stride, tp0, tp1);
4804 src1_ptr += 2 * src2_stride;
4805 INSERT_D2_SH(tp0, tp1, in0);
4806 LD2(src1_ptr, src2_stride, tp0, tp1);
4807 src1_ptr += 2 * src2_stride;
4808 INSERT_D2_SH(tp0, tp1, in1);
4810 LD2(src1_ptr, src2_stride, tp0, tp1);
4811 src1_ptr += 2 * src2_stride;
4812 INSERT_D2_SH(tp0, tp1, in2);
4813 LD2(src1_ptr, src2_stride, tp0, tp1);
4814 src1_ptr += 2 * src2_stride;
4815 INSERT_D2_SH(tp0, tp1, in3);
4817 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4818 const_vec, in0, in1, in2, in3);
/* vertical 4-tap pass, one 4-wide output row per result */
4820 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4821 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4822 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4823 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4824 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4825 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4826 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4827 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
/* >> 6, pack, add bi-pred, round >> 7, clip, store 4x8 */
4829 SRA_4V(dst0, dst1, dst2, dst3, 6);
4830 SRA_4V(dst4, dst5, dst6, dst7, 6);
4831 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4832 tmp0, tmp1, tmp2, tmp3);
4833 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4834 tmp0, tmp1, tmp2, tmp3);
4835 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4836 CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
4837 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4838 ST4x8_UB(out0, out1, dst, dst_stride);
4839 dst += (8 * dst_stride);
/* carry the last packed row pair into the next iteration */
4843 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Width-16 HV bi-prediction dispatcher: height 4 uses the straight-line
 * kernel with two 8-wide strips; other heights use the generic kernel. */
4847 static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
4850 int32_t src2_stride,
4853 const int8_t *filter_x,
4854 const int8_t *filter_y,
/* width multiplier 2 => two 8-wide strips */
4858 hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4859 dst, dst_stride, filter_x, filter_y, 2);
4861 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
4862 src2_stride, dst, dst_stride, filter_x,
4863 filter_y, height, 16);
/* Width-24 HV bi-prediction: thin wrapper over the generic multiple-of-8
 * kernel with width fixed at 24. */
4867 static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
4870 int32_t src2_stride,
4873 const int8_t *filter_x,
4874 const int8_t *filter_y,
4877 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4878 dst, dst_stride, filter_x, filter_y,
/* Width-32 HV bi-prediction: thin wrapper over the generic multiple-of-8
 * kernel with width fixed at 32. */
4882 static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
4885 int32_t src2_stride,
4888 const int8_t *filter_x,
4889 const int8_t *filter_y,
4892 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4893 dst, dst_stride, filter_x, filter_y,
/* Generates the public entry point for bi-predicted full-pel copy at a given
 * WIDTH: ff_hevc_put_hevc_bi_pel_pixels<WIDTH>_8_msa forwards to the local
 * hevc_bi_copy_<WIDTH>w_msa worker with the fixed MAX_PB_SIZE stride of the
 * 16-bit bi-pred buffer. (Comments must stay outside the macro body so the
 * backslash continuations remain intact.) */
4897 #define BI_MC_COPY(WIDTH) \
4898 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4899 ptrdiff_t dst_stride, \
4901 ptrdiff_t src_stride, \
4902 int16_t *src_16bit, \
4908 hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
4909 dst, dst_stride, height); \
/* Generates a public 1-D bi-prediction entry point:
 *   PEL      - filter family (qpel / epel)
 *   DIR      - public name suffix (h / v)
 *   WIDTH    - block width the wrapper handles
 *   TAP      - tap count used in the worker name (8 / 4)
 *   DIR1     - worker name direction (hz / vt)
 *   FILT_DIR - which fractional offset (mx / my) selects the filter
 * The wrapper looks up the coefficient row and forwards to
 * hevc_<DIR1>_bi_<TAP>t_<WIDTH>w_msa with the MAX_PB_SIZE bi-pred stride. */
4924 #define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4925 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst, \
4926 ptrdiff_t dst_stride, \
4928 ptrdiff_t src_stride, \
4929 int16_t *src_16bit, \
4935 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4937 hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
4938 MAX_PB_SIZE, dst, dst_stride, \
/* 8-tap qpel, horizontal, all supported widths */
4942 BI_MC(qpel, h, 4, 8, hz, mx);
4943 BI_MC(qpel, h, 8, 8, hz, mx);
4944 BI_MC(qpel, h, 12, 8, hz, mx);
4945 BI_MC(qpel, h, 16, 8, hz, mx);
4946 BI_MC(qpel, h, 24, 8, hz, mx);
4947 BI_MC(qpel, h, 32, 8, hz, mx);
4948 BI_MC(qpel, h, 48, 8, hz, mx);
4949 BI_MC(qpel, h, 64, 8, hz, mx);
/* 8-tap qpel, vertical */
4951 BI_MC(qpel, v, 4, 8, vt, my);
4952 BI_MC(qpel, v, 8, 8, vt, my);
4953 BI_MC(qpel, v, 12, 8, vt, my);
4954 BI_MC(qpel, v, 16, 8, vt, my);
4955 BI_MC(qpel, v, 24, 8, vt, my);
4956 BI_MC(qpel, v, 32, 8, vt, my);
4957 BI_MC(qpel, v, 48, 8, vt, my);
4958 BI_MC(qpel, v, 64, 8, vt, my);
/* 4-tap epel (chroma), horizontal */
4960 BI_MC(epel, h, 4, 4, hz, mx);
4961 BI_MC(epel, h, 8, 4, hz, mx);
4962 BI_MC(epel, h, 6, 4, hz, mx);
4963 BI_MC(epel, h, 12, 4, hz, mx);
4964 BI_MC(epel, h, 16, 4, hz, mx);
4965 BI_MC(epel, h, 24, 4, hz, mx);
4966 BI_MC(epel, h, 32, 4, hz, mx);
/* 4-tap epel (chroma), vertical */
4968 BI_MC(epel, v, 4, 4, vt, my);
4969 BI_MC(epel, v, 8, 4, vt, my);
4970 BI_MC(epel, v, 6, 4, vt, my);
4971 BI_MC(epel, v, 12, 4, vt, my);
4972 BI_MC(epel, v, 16, 4, vt, my);
4973 BI_MC(epel, v, 24, 4, vt, my);
4974 BI_MC(epel, v, 32, 4, vt, my);
/* Generates a public 2-D (horizontal + vertical) bi-prediction entry point:
 * looks up both coefficient rows from mx and my and forwards to
 * hevc_hv_bi_<TAP>t_<WIDTH>w_msa with the MAX_PB_SIZE bi-pred stride.
 * (Comments stay outside the macro body to preserve line continuations.) */
4979 #define BI_MC_HV(PEL, WIDTH, TAP) \
4980 void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst, \
4981 ptrdiff_t dst_stride, \
4982 ptrdiff_t src_stride, \
4983 int16_t *src_16bit, \
4989 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4990 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4992 hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit, \
4993 MAX_PB_SIZE, dst, dst_stride, \
4994 filter_x, filter_y, height); \
/* 8-tap qpel HV entry points, all supported widths */
4997 BI_MC_HV(qpel, 4, 8);
4998 BI_MC_HV(qpel, 8, 8);
4999 BI_MC_HV(qpel, 12, 8);
5000 BI_MC_HV(qpel, 16, 8);
5001 BI_MC_HV(qpel, 24, 8);
5002 BI_MC_HV(qpel, 32, 8);
5003 BI_MC_HV(qpel, 48, 8);
5004 BI_MC_HV(qpel, 64, 8);
/* 4-tap epel (chroma) HV entry points */
5006 BI_MC_HV(epel, 4, 4);
5007 BI_MC_HV(epel, 8, 4);
5008 BI_MC_HV(epel, 6, 4);
5009 BI_MC_HV(epel, 12, 4);
5010 BI_MC_HV(epel, 16, 4);
5011 BI_MC_HV(epel, 24, 4);
5012 BI_MC_HV(epel, 32, 4);