2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Shuffle-control bytes for MSA VSHF (64-byte aligned).  The first 16
 * entries build overlapping byte pairs 0..8 within a single vector
 * (8-tap horizontal filtering); the second 16 mix bytes of two source
 * vectors (indices >= 16 select from the second operand), used for
 * narrow 4-pixel columns that pack two rows per vector.
 * NOTE(review): the closing "};" line was dropped in this listing. */
25 static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
28 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
/* out[i] = clip_0_255(round(in[i] + vec[i], rnd_val)): saturating add of
 * the 16-bit bi-pred intermediate and the filtered sample, rounding
 * arithmetic shift right by rnd_val, then clamp to the 8-bit range.
 * NOTE(review): the macro's brace/continuation lines were dropped in
 * this listing — verify against the original file. */
31 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \
33 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
34 SRARI_H2_SH(out0, out1, rnd_val); \
35 CLIP_SH2_0_255(out0, out1); \
/* Four-vector variant: two applications of HEVC_BI_RND_CLIP2. */
38 #define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \
39 vec0, vec1, vec2, vec3, rnd_val, \
40 out0, out1, out2, out3) \
42 HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \
43 HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/* Same visible body as HEVC_BI_RND_CLIP2 (saturating add, rounding
 * shift, clip to 0..255); retained as a separate name for the
 * "max saturation" call sites.
 * NOTE(review): parameter-list continuation and brace lines were
 * dropped in this listing. */
46 #define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, \
49 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
50 SRARI_H2_SH(out0, out1, rnd_val); \
51 CLIP_SH2_0_255(out0, out1); \
/* Four-vector variant: two applications of HEVC_BI_RND_CLIP2_MAX_SATU. */
54 #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
55 vec3, rnd_val, out0, out1, out2, out3) \
57 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
58 HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/* Bi-prediction "copy" for 4-pixel-wide blocks: zero-extends 8-bit pels
 * from src0_ptr, shifts left by 6, adds the 16-bit intermediates from
 * src1_ptr, rounds (>> 7) and clips to 0..255, then packs and stores.
 * Special-cases height 2 and 4; otherwise processes 8 rows per loop.
 * NOTE(review): this listing dropped lines (full parameter list, the
 * "if (2 == height)" opener, closing braces) — verify against original. */
61 static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
69 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
70 uint64_t tpd0, tpd1, tpd2, tpd3;
71 v16i8 src0 = { 0 }, src1 = { 0 };
73 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
74 v8i16 dst0, dst1, dst2, dst3;
/* height == 2 path (opening if-line missing from this listing) */
77 LW2(src0_ptr, src_stride, tp0, tp1);
78 INSERT_W2_SB(tp0, tp1, src0);
79 LD2(src1_ptr, src2_stride, tpd0, tpd1);
80 INSERT_D2_SH(tpd0, tpd1, in0);
82 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
85 dst0 = __msa_srari_h(dst0, 7);
88 dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
89 ST_W2(dst0, 0, 1, dst, dst_stride);
/* height == 4: one vector holds all four 4-pixel rows */
90 } else if (4 == height) {
91 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
92 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
93 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
94 INSERT_D2_SH(tpd0, tpd1, in0);
95 INSERT_D2_SH(tpd2, tpd3, in1);
96 ILVRL_B2_SH(zero, src0, dst0, dst1);
97 SLLI_2V(dst0, dst1, 6);
98 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
99 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
100 ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
/* general case: multiples of 8 rows, two vectors of 4 rows each */
101 } else if (0 == height % 8) {
102 for (loop_cnt = (height >> 3); loop_cnt--;) {
103 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
104 src0_ptr += 4 * src_stride;
105 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
106 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
107 src0_ptr += 4 * src_stride;
108 INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
109 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
110 src1_ptr += (4 * src2_stride);
111 INSERT_D2_SH(tpd0, tpd1, in0);
112 INSERT_D2_SH(tpd2, tpd3, in1);
113 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
114 src1_ptr += (4 * src2_stride);
115 INSERT_D2_SH(tpd0, tpd1, in2);
116 INSERT_D2_SH(tpd2, tpd3, in3);
117 ILVRL_B2_SH(zero, src0, dst0, dst1);
118 ILVRL_B2_SH(zero, src1, dst2, dst3);
119 SLLI_4V(dst0, dst1, dst2, dst3, 6);
120 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
121 dst3, 7, dst0, dst1, dst2, dst3);
122 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
123 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
124 dst += (8 * dst_stride);
/* Bi-prediction copy for 6-pixel-wide blocks: same shift/add/round/clip
 * pipeline as the 4w variant, processing 8 rows per iteration.  Each row
 * is stored as a 4-byte word (ST_W2) plus a 2-byte halfword (ST_H2).
 * NOTE(review): parameter list and closing braces were dropped in this
 * listing — verify against the original file. */
129 static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
138 uint64_t tp0, tp1, tp2, tp3;
139 v16u8 out0, out1, out2, out3;
141 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
142 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
143 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
145 for (loop_cnt = (height >> 3); loop_cnt--;) {
146 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
147 src0_ptr += (4 * src_stride);
148 INSERT_D2_SB(tp0, tp1, src0);
149 INSERT_D2_SB(tp2, tp3, src1);
150 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
151 src0_ptr += (4 * src_stride);
152 INSERT_D2_SB(tp0, tp1, src2);
153 INSERT_D2_SB(tp2, tp3, src3);
154 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
155 src1_ptr += (8 * src2_stride);
156 ILVRL_B2_SH(zero, src0, dst0, dst1);
157 ILVRL_B2_SH(zero, src1, dst2, dst3);
158 ILVRL_B2_SH(zero, src2, dst4, dst5);
159 ILVRL_B2_SH(zero, src3, dst6, dst7);
160 SLLI_4V(dst0, dst1, dst2, dst3, 6);
161 SLLI_4V(dst4, dst5, dst6, dst7, 6);
162 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
163 7, dst0, dst1, dst2, dst3);
164 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
165 7, dst4, dst5, dst6, dst7);
166 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
167 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
/* 6-wide rows: 4 bytes via ST_W2 plus the trailing 2 via ST_H2 */
168 ST_W2(out0, 0, 2, dst, dst_stride);
169 ST_H2(out0, 2, 6, dst + 4, dst_stride);
170 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
171 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
172 dst += (4 * dst_stride);
173 ST_W2(out2, 0, 2, dst, dst_stride);
174 ST_H2(out2, 2, 6, dst + 4, dst_stride);
175 ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
176 ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
177 dst += (4 * dst_stride);
/* Bi-prediction copy for 8-pixel-wide blocks.  Special paths for
 * heights 2, 4 and 6; general path handles multiples of 8 rows.
 * Pipeline: zero-extend, << 6, add src1 intermediates, round >> 7,
 * clip 0..255, pack, store 8 bytes per row (ST_D*).
 * NOTE(review): parameter list, the "if (2 == height)" opener and
 * closing braces were dropped in this listing. */
181 static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
189 uint64_t tp0, tp1, tp2, tp3;
190 v16u8 out0, out1, out2, out3;
191 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
193 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
194 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* height == 2 path (opening if-line missing from this listing) */
197 LD2(src0_ptr, src_stride, tp0, tp1);
198 INSERT_D2_SB(tp0, tp1, src0);
199 LD_SH2(src1_ptr, src2_stride, in0, in1);
200 ILVRL_B2_SH(zero, src0, dst0, dst1);
201 SLLI_2V(dst0, dst1, 6);
202 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
203 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
204 ST_D2(out0, 0, 1, dst, dst_stride);
205 } else if (4 == height) {
206 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
207 INSERT_D2_SB(tp0, tp1, src0);
208 INSERT_D2_SB(tp2, tp3, src1);
209 ILVRL_B2_SH(zero, src0, dst0, dst1);
210 ILVRL_B2_SH(zero, src1, dst2, dst3);
211 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
212 SLLI_4V(dst0, dst1, dst2, dst3, 6);
213 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
214 7, dst0, dst1, dst2, dst3);
215 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
216 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
217 } else if (6 == height) {
218 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
219 src0_ptr += 4 * src_stride;
220 INSERT_D2_SB(tp0, tp1, src0);
221 INSERT_D2_SB(tp2, tp3, src1);
222 LD2(src0_ptr, src_stride, tp0, tp1);
223 INSERT_D2_SB(tp0, tp1, src2);
224 ILVRL_B2_SH(zero, src0, dst0, dst1);
225 ILVRL_B2_SH(zero, src1, dst2, dst3);
226 ILVRL_B2_SH(zero, src2, dst4, dst5);
227 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
228 SLLI_4V(dst0, dst1, dst2, dst3, 6);
229 SLLI_2V(dst4, dst5, 6);
230 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
231 7, dst0, dst1, dst2, dst3);
232 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
233 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
234 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
235 ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
/* general case: multiples of 8 rows */
236 } else if (0 == height % 8) {
239 for (loop_cnt = (height >> 3); loop_cnt--;) {
240 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
241 src0_ptr += 4 * src_stride;
242 INSERT_D2_SB(tp0, tp1, src0);
243 INSERT_D2_SB(tp2, tp3, src1);
244 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
245 src0_ptr += 4 * src_stride;
246 INSERT_D2_SB(tp0, tp1, src2);
247 INSERT_D2_SB(tp2, tp3, src3);
248 ILVRL_B2_SH(zero, src0, dst0, dst1);
249 ILVRL_B2_SH(zero, src1, dst2, dst3);
250 ILVRL_B2_SH(zero, src2, dst4, dst5);
251 ILVRL_B2_SH(zero, src3, dst6, dst7);
252 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
254 src1_ptr += (8 * src2_stride);
255 SLLI_4V(dst0, dst1, dst2, dst3, 6);
256 SLLI_4V(dst4, dst5, dst6, dst7, 6);
257 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
258 dst3, 7, dst0, dst1, dst2, dst3);
259 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
260 dst7, 7, dst4, dst5, dst6, dst7);
261 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
262 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
263 ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
264 dst += (8 * dst_stride);
/* Bi-prediction copy for 12-pixel-wide blocks, 4 rows per iteration.
 * The left 8 pixels use the right-interleave path; the remaining 4 use
 * ILVL_W2 of adjacent rows.  Stores 8 bytes (ST_D4) + 4 bytes (ST_W4)
 * per row.  loop_cnt is fixed at 4 (4x4 = 16 rows) — presumably this
 * kernel is only dispatched for height 16; verify against callers.
 * NOTE(review): parameter list and closing braces dropped in listing. */
269 static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
279 v16u8 out0, out1, out2;
280 v16i8 src0, src1, src2, src3;
281 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
282 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
284 for (loop_cnt = 4; loop_cnt--;) {
285 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
286 src0_ptr += (4 * src_stride);
288 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
289 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
290 src1_ptr += (4 * src2_stride);
291 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
292 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
294 SLLI_4V(dst0, dst1, dst2, dst3, 6);
295 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
296 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
297 SLLI_2V(dst4, dst5, 6);
298 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
299 7, dst0, dst1, dst2, dst3);
300 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
301 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
302 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
303 ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
304 dst += (4 * dst_stride);
/* Bi-prediction copy for 16-pixel-wide blocks, 4 rows per iteration.
 * Each row splits into right (_r, low 8 pels) and left (_l, high 8)
 * halves via ILVRL_B2_SH, goes through << 6 / add / round / clip, then
 * is repacked and stored as one full 16-byte vector per row (ST_UB4).
 * NOTE(review): parameter list and closing braces dropped in listing. */
308 static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
317 v16u8 out0, out1, out2, out3;
318 v16i8 src0, src1, src2, src3;
319 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
320 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
323 for (loop_cnt = (height >> 2); loop_cnt--;) {
324 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
325 src0_ptr += (4 * src_stride);
326 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
327 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
328 src1_ptr += (4 * src2_stride);
329 ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
330 ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
331 ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
332 ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
333 SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
334 SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
335 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
336 dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
337 HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
338 dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
339 PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
340 PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
341 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
342 dst += (4 * dst_stride);
/* Bi-prediction copy for 24-pixel-wide blocks, 4 rows per iteration;
 * per row: one full 16-byte store plus one 8-byte store (dst + 16).
 * loop_cnt is fixed at 8 (8x4 = 32 rows) — presumably this kernel is
 * only dispatched for height 32; verify against callers.
 * NOTE(review): parameter list and closing braces dropped in listing. */
346 static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
355 v16u8 out0, out1, out2, out3, out4, out5;
356 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
357 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
358 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
360 for (loop_cnt = 8; loop_cnt--;) {
361 LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
362 LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
363 src0_ptr += (4 * src_stride);
364 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
365 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
366 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
367 src1_ptr += (4 * src2_stride);
369 ILVRL_B2_SH(zero, src0, dst0, dst1);
370 ILVRL_B2_SH(zero, src1, dst2, dst3);
371 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
372 ILVRL_B2_SH(zero, src4, dst6, dst7);
373 ILVRL_B2_SH(zero, src5, dst8, dst9);
374 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
375 SLLI_4V(dst0, dst1, dst2, dst3, 6);
376 SLLI_4V(dst4, dst5, dst6, dst7, 6);
377 SLLI_4V(dst8, dst9, dst10, dst11, 6);
378 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
379 7, dst0, dst1, dst2, dst3);
380 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
381 7, dst4, dst5, dst6, dst7);
382 HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
383 dst11, 7, dst8, dst9, dst10, dst11);
384 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
385 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
386 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
387 ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
388 dst += (4 * dst_stride);
/* Bi-prediction copy for 32-pixel-wide blocks, 2 rows per iteration;
 * each row is two 16-byte vectors.  Same << 6 / add / round >> 7 / clip
 * pipeline as the narrower variants.
 * NOTE(review): parameter list, the dst advance between the two ST_UB2
 * calls and closing braces were dropped in this listing. */
392 static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
401 v16u8 out0, out1, out2, out3;
402 v16i8 src0, src1, src2, src3;
404 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
405 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
407 for (loop_cnt = (height >> 1); loop_cnt--;) {
408 LD_SB2(src0_ptr, 16, src0, src1);
409 src0_ptr += src_stride;
410 LD_SB2(src0_ptr, 16, src2, src3);
411 src0_ptr += src_stride;
412 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
413 src1_ptr += src2_stride;
414 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
415 src1_ptr += src2_stride;
417 ILVRL_B2_SH(zero, src0, dst0, dst1);
418 ILVRL_B2_SH(zero, src1, dst2, dst3);
419 ILVRL_B2_SH(zero, src2, dst4, dst5);
420 ILVRL_B2_SH(zero, src3, dst6, dst7);
421 SLLI_4V(dst0, dst1, dst2, dst3, 6);
422 SLLI_4V(dst4, dst5, dst6, dst7, 6);
423 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
424 7, dst0, dst1, dst2, dst3);
425 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
426 7, dst4, dst5, dst6, dst7);
427 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
428 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
429 ST_UB2(out0, out1, dst, 16);
431 ST_UB2(out2, out3, dst, 16);
/* Bi-prediction copy for 48-pixel-wide blocks, 2 rows per iteration;
 * each row is three 16-byte vectors (ST_UB2 + ST_UB at dst + 32).
 * NOTE(review): parameter list, the dst advances between stores and
 * closing braces were dropped in this listing. */
436 static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
445 v16u8 out0, out1, out2, out3, out4, out5;
446 v16i8 src0, src1, src2, src3, src4, src5;
448 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
449 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
451 for (loop_cnt = (height >> 1); loop_cnt--;) {
452 LD_SB3(src0_ptr, 16, src0, src1, src2);
453 src0_ptr += src_stride;
454 LD_SB3(src0_ptr, 16, src3, src4, src5);
455 src0_ptr += src_stride;
457 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
458 src1_ptr += src2_stride;
459 LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
460 src1_ptr += src2_stride;
462 ILVRL_B2_SH(zero, src0, dst0, dst1);
463 ILVRL_B2_SH(zero, src1, dst2, dst3);
464 ILVRL_B2_SH(zero, src2, dst4, dst5);
465 ILVRL_B2_SH(zero, src3, dst6, dst7);
466 ILVRL_B2_SH(zero, src4, dst8, dst9);
467 ILVRL_B2_SH(zero, src5, dst10, dst11);
469 SLLI_4V(dst0, dst1, dst2, dst3, 6);
470 SLLI_4V(dst4, dst5, dst6, dst7, 6);
471 SLLI_4V(dst8, dst9, dst10, dst11, 6);
473 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
474 7, dst0, dst1, dst2, dst3);
475 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
476 7, dst4, dst5, dst6, dst7);
477 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
478 dst11, 7, dst8, dst9, dst10, dst11);
479 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
480 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
481 ST_UB2(out0, out1, dst, 16);
482 ST_UB(out2, dst + 32);
484 ST_UB2(out3, out4, dst, 16);
485 ST_UB(out5, dst + 32);
/* Bi-prediction copy for 64-pixel-wide blocks, one row per iteration;
 * each row is four 16-byte vectors stored with ST_UB4(..., dst, 16).
 * NOTE(review): parameter list, the per-row dst advance and closing
 * braces were dropped in this listing. */
490 static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
499 v16u8 out0, out1, out2, out3;
500 v16i8 src0, src1, src2, src3;
502 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
503 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
505 for (loop_cnt = height; loop_cnt--;) {
506 LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
507 src0_ptr += src_stride;
508 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
509 src1_ptr += src2_stride;
511 ILVRL_B2_SH(zero, src0, dst0, dst1);
512 ILVRL_B2_SH(zero, src1, dst2, dst3);
513 ILVRL_B2_SH(zero, src2, dst4, dst5);
514 ILVRL_B2_SH(zero, src3, dst6, dst7);
515 SLLI_4V(dst0, dst1, dst2, dst3, 6);
516 SLLI_4V(dst4, dst5, dst6, dst7, 6);
517 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
518 7, dst0, dst1, dst2, dst3);
519 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
520 7, dst4, dst5, dst6, dst7);
521 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
522 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
524 ST_UB4(out0, out1, out2, out3, dst, 16);
/* Horizontal 8-tap bi-prediction filter, width 4: packs two rows per
 * vector using the pair mask at ff_hevc_mask_arr[16], applies the four
 * filter taps via VSHF + DPADD, then adds the src1 intermediates,
 * rounds >> 7 and clips.  Processes 8 rows per iteration.
 * NOTE(review): parameter list, mask1..mask3 setup, the src0_ptr offset
 * adjust and closing braces were dropped in this listing.  const_vec
 * (128 << ?) is loaded but its use is not visible here. */
529 static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
535 const int8_t *filter,
539 v8i16 filt0, filt1, filt2, filt3;
540 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
541 v16i8 mask1, mask2, mask3;
542 v16i8 vec0, vec1, vec2, vec3;
543 v8i16 dst0, dst1, dst2, dst3;
544 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
545 v8i16 filter_vec, const_vec;
546 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
550 /* rearranging filter */
551 filter_vec = LD_SH(filter);
552 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
558 const_vec = __msa_ldi_h(128);
561 for (loop_cnt = (height >> 3); loop_cnt--;) {
562 LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
563 src4, src5, src6, src7);
564 src0_ptr += (8 * src_stride);
565 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
566 src1_ptr += (8 * src2_stride);
568 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
569 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
570 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
576 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
577 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
578 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
580 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
581 VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
582 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
584 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
585 VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
586 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
588 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
589 VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
590 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
593 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
594 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
596 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
597 ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
598 dst += (8 * dst_stride);
/* Horizontal 8-tap bi-prediction filter, width 8: one row per vector
 * (mask_arr[0]), 4 rows per iteration; DPADD accumulates the four tap
 * pairs, then bi-pred add / round >> 7 / clip and 8-byte stores.
 * NOTE(review): parameter list, mask1..mask3 setup, src0_ptr offset
 * adjust, DPADD continuation lines and closing braces were dropped in
 * this listing. */
602 static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
608 const int8_t *filter,
612 v8i16 filt0, filt1, filt2, filt3;
613 v16i8 src0, src1, src2, src3;
614 v16i8 mask1, mask2, mask3;
615 v16i8 vec0, vec1, vec2, vec3;
616 v8i16 dst0, dst1, dst2, dst3;
617 v8i16 in0, in1, in2, in3;
618 v8i16 filter_vec, const_vec;
619 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
623 const_vec = __msa_ldi_h(128);
626 filter_vec = LD_SH(filter);
627 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
633 for (loop_cnt = (height >> 2); loop_cnt--;) {
634 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
635 src0_ptr += (4 * src_stride);
636 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
637 src1_ptr += (4 * src2_stride);
638 XORI_B4_128_SB(src0, src1, src2, src3);
644 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
645 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
646 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
648 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
649 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
650 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
652 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
653 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
654 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
656 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
657 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
658 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
661 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
662 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
664 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
665 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
666 dst += (4 * dst_stride);
/* Horizontal 8-tap bi-prediction filter, width 12: left 8 pixels use
 * mask0..mask3, the right 4 use the pair masks (mask4..mask7 from
 * ff_hevc_mask_arr[16]) combining two rows into one vector (dst2).
 * Two rows per iteration; loop_cnt fixed at 8 (16 rows) — presumably
 * only dispatched for height 16; verify against callers.
 * NOTE(review): parameter list, mask setup lines, clip of dst2, the
 * scalar SW/SD store tail and closing braces were dropped here. */
670 static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
676 const int8_t *filter,
682 v16i8 src0, src1, src2, src3;
683 v16i8 vec0, vec1, vec2;
684 v8i16 filt0, filt1, filt2, filt3;
685 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
686 v8i16 dst0, dst1, dst2;
687 v8i16 in0, in1, in2, in3;
688 v8i16 filter_vec, const_vec;
691 const_vec = __msa_ldi_h(128);
694 filter_vec = LD_SH(filter);
695 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
697 mask0 = LD_SB(ff_hevc_mask_arr);
701 mask4 = LD_SB(&ff_hevc_mask_arr[16]);
706 for (loop_cnt = 8; loop_cnt--;) {
707 LD_SB2(src0_ptr, 8, src0, src1);
708 src0_ptr += src_stride;
709 LD_SB2(src0_ptr, 8, src2, src3);
710 src0_ptr += src_stride;
711 LD_SH2(src1_ptr, 8, in0, in1);
712 src1_ptr += src2_stride;
713 LD_SH2(src1_ptr, 8, in2, in3);
714 src1_ptr += src2_stride;
715 XORI_B4_128_SB(src0, src1, src2, src3);
721 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
723 DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
724 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
725 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
727 DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
728 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
729 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
731 DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
732 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
733 VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
735 DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
736 dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);
738 in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
739 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
740 dst2 = __msa_adds_s_h(in2, dst2);
741 dst2 = __msa_srari_h(dst2, 7);
743 PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);
745 tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
746 tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
747 tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
748 tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
/* Horizontal 8-tap bi-prediction filter, width 16: two 8-pixel vectors
 * per row, 2 rows per iteration; full 16-byte stores via ST_SH2.
 * NOTE(review): parameter list, mask1..mask3 setup, src0_ptr offset
 * adjust, DPADD continuation lines and closing braces were dropped in
 * this listing. */
758 static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
764 const int8_t *filter,
768 v16i8 src0, src1, src2, src3;
769 v8i16 filt0, filt1, filt2, filt3;
770 v16i8 mask1, mask2, mask3;
771 v16i8 vec0, vec1, vec2, vec3;
772 v8i16 dst0, dst1, dst2, dst3;
773 v8i16 in0, in1, in2, in3;
774 v8i16 filter_vec, const_vec;
775 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
778 const_vec = __msa_ldi_h(128);
781 filter_vec = LD_SH(filter);
782 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
788 for (loop_cnt = (height >> 1); loop_cnt--;) {
789 LD_SB2(src0_ptr, 8, src0, src1);
790 src0_ptr += src_stride;
791 LD_SB2(src0_ptr, 8, src2, src3);
792 src0_ptr += src_stride;
793 LD_SH2(src1_ptr, 8, in0, in1);
794 src1_ptr += src2_stride;
795 LD_SH2(src1_ptr, 8, in2, in3);
796 src1_ptr += src2_stride;
797 XORI_B4_128_SB(src0, src1, src2, src3);
803 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
804 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
805 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
807 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
808 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
809 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
811 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
812 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
813 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
815 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
816 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
817 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
820 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
821 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
823 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
824 ST_SH2(dst0, dst1, dst, dst_stride);
825 dst += (2 * dst_stride);
/* Horizontal 8-tap bi-prediction filter, width 24, one row per
 * iteration: 16 pixels from src0/src1 plus 8 straddling pixels via the
 * pair masks; stores 16 bytes then an 8-byte SD at dst + 16.
 * NOTE(review): parameter list, mask1..mask7 setup, the in0..in2 and
 * dst_val0 declarations, dst clip/store tail and closing braces were
 * dropped in this listing. */
829 static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
835 const int8_t *filter,
840 v16i8 src0, src1, tmp0, tmp1;
841 v8i16 filt0, filt1, filt2, filt3;
842 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
843 v16i8 vec0, vec1, vec2, vec3;
844 v8i16 dst0, dst1, dst2;
846 v8i16 filter_vec, const_vec;
847 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
/* apply the 8-tap source offset (filter reaches 3 pels left) */
849 src0_ptr = src0_ptr - 3;
850 const_vec = __msa_ldi_h(128);
853 filter_vec = LD_SH(filter);
854 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
864 for (loop_cnt = height; loop_cnt--;) {
865 LD_SB2(src0_ptr, 16, src0, src1);
866 src0_ptr += src_stride;
867 LD_SH2(src1_ptr, 8, in0, in1);
868 in2 = LD_SH(src1_ptr + 16);
869 src1_ptr += src2_stride;
870 XORI_B2_128_SB(src0, src1);
875 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
876 VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
877 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
879 VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
880 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
881 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
883 VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
884 VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
885 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,
888 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
889 dst2 = __msa_adds_s_h(dst2, in2);
890 dst2 = __msa_srari_h(dst2, 7);
893 PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
894 dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
896 SD(dst_val0, dst + 16);
/* Horizontal 8-tap bi-prediction filter, width 32, one row per
 * iteration: src0/src1 cover bytes 0..31, src2 (loaded at +24) supplies
 * the filter tail for the last 8 outputs; two 16-byte stores per row.
 * NOTE(review): parameter list, mask1..mask7 setup, src0_ptr offset
 * adjust, DPADD continuation lines and closing braces were dropped in
 * this listing. */
901 static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
907 const int8_t *filter,
911 v16i8 src0, src1, src2, tmp0, tmp1;
912 v8i16 filt0, filt1, filt2, filt3;
913 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
914 v16i8 vec0, vec1, vec2, vec3;
915 v8i16 dst0, dst1, dst2, dst3;
916 v8i16 in0, in1, in2, in3;
917 v8i16 filter_vec, const_vec;
918 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
921 const_vec = __msa_ldi_h(128);
924 filter_vec = LD_SH(filter);
925 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
935 for (loop_cnt = height; loop_cnt--;) {
936 LD_SB2(src0_ptr, 16, src0, src1);
937 src2 = LD_SB(src0_ptr + 24);
938 src0_ptr += src_stride;
939 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
940 src1_ptr += src2_stride;
941 XORI_B3_128_SB(src0, src1, src2);
947 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
948 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
949 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
951 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
952 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
953 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
955 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
956 VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
957 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
959 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
960 VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
961 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
964 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
965 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
967 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
968 ST_SB2(tmp0, tmp1, dst, 16);
/* Horizontal 8-tap bi-prediction filter, width 48, one row per
 * iteration (loop_cnt fixed at 64 — presumably height 64; verify
 * against callers).  First 32 outputs come from src0..src2, the last
 * 16 from src2/src3 (src3 loaded at +40); stores 16+16+16 bytes.
 * NOTE(review): parameter list, mask1..mask7 setup, src0_ptr offset
 * adjust, several DPADD/store continuation lines, the dst advance and
 * closing braces were dropped in this listing. */
973 static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
979 const int8_t *filter,
983 v16i8 src0, src1, src2, src3;
984 v16i8 tmp0, tmp1, tmp2;
985 v8i16 filt0, filt1, filt2, filt3;
986 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
987 v16i8 vec0, vec1, vec2, vec3;
988 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
989 v8i16 in0, in1, in2, in3, in4, in5;
990 v8i16 filter_vec, const_vec;
991 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
995 const_vec = __msa_ldi_h(128);
998 filter_vec = LD_SH(filter);
999 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1009 for (loop_cnt = 64; loop_cnt--;) {
1010 LD_SB3(src0_ptr, 16, src0, src1, src2);
1011 src3 = LD_SB(src0_ptr + 40);
1012 src0_ptr += src_stride;
1013 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1014 XORI_B4_128_SB(src0, src1, src2, src3);
1021 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1022 VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
1023 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1025 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1026 VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
1027 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1029 VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1030 VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
1031 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1033 VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1034 VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
1035 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1037 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
1038 HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
1039 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1041 ST_SB(tmp1, dst + 16);
/* last 16 pixels of the row (columns 32..47) */
1043 LD_SH2(src1_ptr + 32, 8, in4, in5);
1044 src1_ptr += src2_stride;
1048 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
1049 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
1050 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
1052 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
1053 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
1054 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,
1057 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
1059 tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
1060 ST_SB(tmp2, dst + 32);
/* hevc_hz_bi_8t_64w_msa: horizontal 8-tap bi-prediction filter for
 * 64-pixel-wide blocks.  Per row: filters 8-bit pixels from src0_ptr
 * horizontally, adds the 16-bit intermediate prediction rows from
 * src1_ptr, rounds (shift 7), clips to [0,255] and stores packed bytes
 * to dst.
 * NOTE(review): this block is damaged — each line carries a stray
 * leading number and several continuation lines (declarations, macro
 * argument tails, closing braces) are missing.  Restore from the
 * upstream FFmpeg sources before building; comments describe intent
 * only. */
1065 static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
1068                                   int32_t src2_stride,
1071                                   const int8_t *filter,
1075     v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
1076     v8i16 filt0, filt1, filt2, filt3;
1077     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
1078     v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1079     v16i8 vec0, vec1, vec2, vec3;
1080     v8i16 dst0, dst1, dst2, dst3;
1081     v8i16 in0, in1, in2, in3;
1082     v8i16 filter_vec, const_vec;
1086     const_vec = __msa_ldi_h(128);
     /* split the 8-tap filter into four per-tap splat vectors */
1089     filter_vec = LD_SH(filter);
1090     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
     /* one iteration per output row */
1100     for (loop_cnt = height; loop_cnt--;) {
         /* load 64+7 source pixels split across six vectors */
1101         LD_SB2(src0_ptr, 16, src0, src1);
1102         src2 = LD_SB(src0_ptr + 24);
1103         LD_SB2(src0_ptr + 32, 16, src3, src4);
1104         src5 = LD_SB(src0_ptr + 56);
         /* 16-bit bi-pred intermediate samples for the first 32 columns */
1105         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
1106         XORI_B3_128_SB(src0, src1, src2);
1113         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1114         VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1115         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1117         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1118         VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1119         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1121         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1122         VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1123         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1125         VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1126         VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1127         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
         /* add bi-pred samples, round by 7, clip to [0,255] */
1130         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1131                           dst0, dst1, dst2, dst3, 7,
1132                           dst0, dst1, dst2, dst3);
1134         PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1135         ST_SB2(tmp0, tmp1, dst, 16);
         /* second half of the row: columns 32..63 */
1141         LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
1142         XORI_B3_128_SB(src0, src1, src2);
1148         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
1149         VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
1150         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
1152         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
1153         VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
1154         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
1156         VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
1157         VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
1158         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
1160         VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
1161         VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
1162         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
1164         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1165                           dst0, dst1, dst2, dst3, 7,
1166                           dst0, dst1, dst2, dst3);
1167         PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1168         ST_SB2(tmp0, tmp1, dst + 32, 16);
1169         src1_ptr += src2_stride;
1170         src0_ptr += src_stride;
/* hevc_vt_bi_8t_4w_msa: vertical 8-tap bi-prediction filter for
 * 4-pixel-wide blocks, 8 rows per iteration.  Interleaves consecutive
 * rows so two 4-wide columns share one vector, filters vertically,
 * adds the 16-bit intermediate prediction (src1_ptr), rounds/clips and
 * stores 4-byte words.
 * NOTE(review): block is damaged — stray leading numbers on every line
 * and missing continuation lines; restore from upstream FFmpeg before
 * building. */
1175 static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
1178                                  int32_t src2_stride,
1181                                  const int8_t *filter,
1185     v16i8 src0, src1, src2, src3, src4, src5;
1186     v16i8 src6, src7, src8, src9, src10;
1187     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1188     v16i8 src11, src12, src13, src14;
1189     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1190     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1191     v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1192     v16i8 src2110, src4332, src6554, src8776, src10998;
1193     v16i8 src12111110, src14131312;
1194     v8i16 dst10, dst32, dst54, dst76;
1195     v8i16 filt0, filt1, filt2, filt3;
1196     v8i16 filter_vec, const_vec;
     /* back up 3 rows: the 8-tap filter needs 3 rows of context above */
1198     src0_ptr -= (3 * src_stride);
1200     const_vec = __msa_ldi_h(128);
1203     filter_vec = LD_SH(filter);
1204     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
     /* prologue: 7 context rows, interleaved pairwise then by doubleword */
1206     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1207     src0_ptr += (7 * src_stride);
1208     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1209                src10_r, src32_r, src54_r, src21_r);
1210     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1211     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1212                src2110, src4332, src6554);
1213     XORI_B3_128_SB(src2110, src4332, src6554);
1215     for (loop_cnt = (height >> 3); loop_cnt--;) {
1216         LD_SB8(src0_ptr, src_stride,
1217                src7, src8, src9, src10, src11, src12, src13, src14);
1218         src0_ptr += (8 * src_stride);
         /* 8 rows of 16-bit bi-pred intermediates */
1219         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1220         src1_ptr += (8 * src2_stride);
1222         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1223         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1224         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1225                    src76_r, src87_r, src98_r, src109_r);
1226         ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1227                    src1110_r, src1211_r, src1312_r, src1413_r);
1228         ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1229                    src1413_r, src1312_r,
1230                    src8776, src10998, src12111110, src14131312);
1231         XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
         /* four 8-tap vertical dot products, two output rows each */
1234         DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1235                      filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1237         DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1238                      filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1240         DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1241                      filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1243         DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1244                      filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1246         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1247                           dst10, dst32, dst54, dst76, 7,
1248                           dst10, dst32, dst54, dst76);
1250         PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
1251         ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1252         dst += (8 * dst_stride);
         /* carry the last interleaved rows over as next iteration's context */
1255         src4332 = src12111110;
1256         src6554 = src14131312;
/* hevc_vt_bi_8t_8w_msa: vertical 8-tap bi-prediction filter for
 * 8-pixel-wide blocks, 4 rows per iteration: vertical filter on
 * src0_ptr, add 16-bit intermediates from src1_ptr, round/clip, store
 * 8-byte doublewords.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * continuation lines (including the context-shift statements at the
 * loop tail and the closing braces); restore from upstream FFmpeg. */
1261 static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
1264                                  int32_t src2_stride,
1267                                  const int8_t *filter,
1271     v16i8 src0, src1, src2, src3, src4, src5;
1272     v16i8 src6, src7, src8, src9, src10;
1273     v8i16 in0, in1, in2, in3;
1274     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1275     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1276     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1277     v8i16 filt0, filt1, filt2, filt3;
1278     v8i16 filter_vec, const_vec;
     /* 3 rows of top context for the 8-tap filter */
1280     src0_ptr -= (3 * src_stride);
1281     const_vec = __msa_ldi_h(128);
1284     filter_vec = LD_SH(filter);
1285     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1287     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1288     src0_ptr += (7 * src_stride);
1289     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1290     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1291                src10_r, src32_r, src54_r, src21_r);
1292     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1294     for (loop_cnt = (height >> 2); loop_cnt--;) {
1295         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1296         src0_ptr += (4 * src_stride);
1297         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1298         src1_ptr += (4 * src2_stride);
1299         XORI_B4_128_SB(src7, src8, src9, src10);
1300         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1301                    src76_r, src87_r, src98_r, src109_r);
         /* one 8-tap vertical dot product per output row */
1304         DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1305                      filt0, filt1, filt2, filt3,
1306                      dst0_r, dst0_r, dst0_r, dst0_r);
1308         DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1309                      filt0, filt1, filt2, filt3,
1310                      dst1_r, dst1_r, dst1_r, dst1_r);
1312         DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1313                      filt0, filt1, filt2, filt3,
1314                      dst2_r, dst2_r, dst2_r, dst2_r);
1316         DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1317                      filt0, filt1, filt2, filt3,
1318                      dst3_r, dst3_r, dst3_r, dst3_r);
         /* bi-pred combine: add, round by 7, clip to [0,255] */
1320         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1321                           dst0_r, dst1_r, dst2_r, dst3_r, 7,
1322                           dst0_r, dst1_r, dst2_r, dst3_r);
1324         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1325         ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
1326         dst += (4 * dst_stride);
/* hevc_vt_bi_8t_12w_msa: vertical 8-tap bi-prediction filter for
 * 12-pixel-wide blocks, 4 rows per iteration.  Handles the left 8
 * columns with right-interleaves (_r) and the remaining 4 columns with
 * left-interleaves (_l) packed two rows per vector.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * continuation lines (loop-tail context shifts and braces dropped);
 * restore from upstream FFmpeg before building. */
1339 static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
1342                                   int32_t src2_stride,
1345                                   const int8_t *filter,
1349     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1350     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1351     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1352     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1353     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1354     v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1355     v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1356     v16i8 src2110, src4332, src6554, src8776, src10998;
1357     v8i16 dst0_l, dst1_l;
1358     v8i16 filt0, filt1, filt2, filt3;
1359     v8i16 filter_vec, const_vec;
1361     src0_ptr -= (3 * src_stride);
1362     const_vec = __msa_ldi_h(128);
1365     filter_vec = LD_SH(filter);
1366     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1368     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1369     src0_ptr += (7 * src_stride);
1370     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
     /* right-interleaved context for columns 0..7 */
1372     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1373                src10_r, src32_r, src54_r, src21_r);
1374     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
     /* left-interleaved context, paired by doubleword, for columns 8..11 */
1375     ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1376                src10_l, src32_l, src54_l, src21_l);
1377     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1378     ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1379                src2110, src4332, src6554);
1381     for (loop_cnt = (height >> 2); loop_cnt--;) {
1382         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1383         src0_ptr += (4 * src_stride);
         /* bi-pred intermediates: first 8 columns and the +8 tail */
1384         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1385         LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
1386         src1_ptr += (4 * src2_stride);
1388         ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
1389         XORI_B4_128_SB(src7, src8, src9, src10);
1390         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1391                    src76_r, src87_r, src98_r, src109_r);
1392         ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1393                    src76_l, src87_l, src98_l, src109_l);
1394         ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
         /* 8-tap vertical filter: four rows of the left 8 columns */
1397         DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1398                      filt0, filt1, filt2, filt3,
1399                      dst0_r, dst0_r, dst0_r, dst0_r);
1401         DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1402                      filt0, filt1, filt2, filt3,
1403                      dst1_r, dst1_r, dst1_r, dst1_r);
1405         DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1406                      filt0, filt1, filt2, filt3,
1407                      dst2_r, dst2_r, dst2_r, dst2_r);
1409         DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1410                      filt0, filt1, filt2, filt3,
1411                      dst3_r, dst3_r, dst3_r, dst3_r);
         /* and the rightmost 4 columns, two rows per vector */
1413         DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1414                      filt0, filt1, filt2, filt3,
1415                      dst0_l, dst0_l, dst0_l, dst0_l);
1417         DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1418                      filt0, filt1, filt2, filt3,
1419                      dst1_l, dst1_l, dst1_l, dst1_l);
1421         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1422                           dst0_r, dst1_r, dst2_r, dst3_r, 7,
1423                           dst0_r, dst1_r, dst2_r, dst3_r);
1424         HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
1427         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1428         dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
1429         ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
1430         ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
1431         dst += (4 * dst_stride);
/* hevc_vt_bi_8t_16multx2mult_msa: vertical 8-tap bi-prediction filter
 * for widths that are multiples of 16: processes the block in
 * 16-column stripes (outer loop), two rows at a time (inner loop),
 * with right/left interleaves covering the low/high 8 columns of each
 * stripe.  Shared worker for the 16/32/48/64-wide wrappers below.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * continuation lines (context-shift statements, stripe-pointer
 * advances and closing braces dropped); restore from upstream FFmpeg
 * before building. */
1446 static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
1449                                            int32_t src2_stride,
1452                                            const int8_t *filter,
1453                                            int32_t height, int32_t width)
1455     uint8_t *src0_ptr_tmp;
1456     int16_t *src1_ptr_tmp;
1460     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1461     v8i16 in0, in1, in2, in3;
1462     v16i8 src10_r, src32_r, src54_r, src76_r;
1463     v16i8 src21_r, src43_r, src65_r, src87_r;
1464     v8i16 dst0_r, dst1_r;
1465     v16i8 src10_l, src32_l, src54_l, src76_l;
1466     v16i8 src21_l, src43_l, src65_l, src87_l;
1467     v8i16 dst0_l, dst1_l;
1468     v8i16 filt0, filt1, filt2, filt3;
1469     v8i16 filter_vec, const_vec;
1471     src0_ptr -= (3 * src_stride);
1472     const_vec = __msa_ldi_h(128);
1475     filter_vec = LD_SH(filter);
1476     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
     /* one outer iteration per 16-column stripe */
1478     for (cnt = (width >> 4); cnt--;) {
1479         src0_ptr_tmp = src0_ptr;
1480         src1_ptr_tmp = src1_ptr;
1483         LD_SB7(src0_ptr_tmp, src_stride,
1484                src0, src1, src2, src3, src4, src5, src6);
1485         src0_ptr_tmp += (7 * src_stride);
1486         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1488         ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1489                    src10_r, src32_r, src54_r, src21_r);
1490         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1491         ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1492                    src10_l, src32_l, src54_l, src21_l);
1493         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
         /* two output rows per inner iteration */
1495         for (loop_cnt = (height >> 1); loop_cnt--;) {
1496             LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1497             src0_ptr_tmp += (2 * src_stride);
1498             LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1499             LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1500             src1_ptr_tmp += (2 * src2_stride);
1501             XORI_B2_128_SB(src7, src8);
1503             ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1504             ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1507             DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1508                          filt0, filt1, filt2, filt3,
1509                          dst0_r, dst0_r, dst0_r, dst0_r);
1511             DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1512                          filt0, filt1, filt2, filt3,
1513                          dst1_r, dst1_r, dst1_r, dst1_r);
1515             DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1516                          filt0, filt1, filt2, filt3,
1517                          dst0_l, dst0_l, dst0_l, dst0_l);
1519             DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1520                          filt0, filt1, filt2, filt3,
1521                          dst1_l, dst1_l, dst1_l, dst1_l);
1523             HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1524                               dst0_r, dst1_r, dst0_l, dst1_l, 7,
1525                               dst0_r, dst1_r, dst0_l, dst1_l);
1527             PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
1528             ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
1529             dst_tmp += (2 * dst_stride);
1552 static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
1555 int32_t src2_stride,
1558 const int8_t *filter,
1561 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1562 dst, dst_stride, filter, height, 16);
1565 static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
1568 int32_t src2_stride,
1571 const int8_t *filter,
1574 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1575 dst, dst_stride, filter, height, 16);
1576 hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
1577 dst + 16, dst_stride, filter, height);
1580 static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
1583 int32_t src2_stride,
1586 const int8_t *filter,
1589 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1590 dst, dst_stride, filter, height, 32);
1593 static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
1596 int32_t src2_stride,
1599 const int8_t *filter,
1602 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1603 dst, dst_stride, filter, height, 48);
1606 static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
1609 int32_t src2_stride,
1612 const int8_t *filter,
1615 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1616 dst, dst_stride, filter, height, 64);
/* hevc_hv_bi_8t_4w_msa: 2-D (horizontal then vertical) 8-tap
 * bi-prediction filter for 4-pixel-wide blocks, 4 rows per iteration.
 * Horizontal pass produces 16-bit taps (two rows packed per vector),
 * vertical pass accumulates in 32-bit, then the 16-bit bi-pred rows
 * from src1_ptr are added, rounded by 7 and clipped to [0,255].
 * NOTE(review): block is damaged — stray leading numbers and missing
 * continuation lines (filter right-shift tail arguments, closing
 * braces); restore from upstream FFmpeg before building. */
1619 static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
1622                                  int32_t src2_stride,
1625                                  const int8_t *filter_x,
1626                                  const int8_t *filter_y,
1632     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1633     v8i16 in0 = { 0 }, in1 = { 0 };
1634     v8i16 filt0, filt1, filt2, filt3;
1635     v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1636     v16i8 mask1, mask2, mask3;
1637     v8i16 filter_vec, const_vec;
1638     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1639     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1641     v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1642     v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1643     v4i32 dst0, dst1, dst2, dst3;
1644     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
     /* back up 3 rows and 3 columns: context for both filter passes */
1646     src0_ptr -= ((3 * src_stride) + 3);
1647     filter_vec = LD_SH(filter_x);
1648     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
     /* vertical filter taps widened to 32-bit splats */
1650     filter_vec = LD_SH(filter_y);
1651     UNPCK_R_SB_SH(filter_vec, filter_vec);
1653     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1659     const_vec = __msa_ldi_h(128);
1662     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1663     src0_ptr += (7 * src_stride);
1664     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1666     /* row 0 row 1 row 2 row 3 */
1667     VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1668     VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1669     VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1670                vec8, vec9, vec10, vec11);
1671     VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1672                vec12, vec13, vec14, vec15);
     /* horizontal pass over the 7 context rows (two rows per result) */
1674     dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1676     dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1678     dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1680     dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1683     ILVRL_H2_SH(dst41, dst30, dst10, dst43);
1684     ILVRL_H2_SH(dst52, dst41, dst21, dst54);
1685     ILVRL_H2_SH(dst63, dst52, dst32, dst65);
1687     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1689     for (loop_cnt = height >> 2; loop_cnt--;) {
1690         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1691         src0_ptr += (4 * src_stride);
1692         XORI_B4_128_SB(src7, src8, src9, src10);
         /* gather four 4-wide bi-pred rows as two doubleword pairs */
1694         LD2(src1_ptr, src2_stride, tp0, tp1);
1695         INSERT_D2_SH(tp0, tp1, in0);
1696         src1_ptr += (2 * src2_stride);
1697         LD2(src1_ptr, src2_stride, tp0, tp1);
1698         INSERT_D2_SH(tp0, tp1, in1);
1699         src1_ptr += (2 * src2_stride);
1701         VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1702                    vec0, vec1, vec2, vec3);
1703         VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1704                    vec4, vec5, vec6, vec7);
1705         dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1707         dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1710         dst76 = __msa_ilvr_h(dst97, dst66);
1711         ILVRL_H2_SH(dst108, dst97, dst87, dst109);
1712         dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1713         dst98 = __msa_ilvr_h(dst66, dst108);
         /* vertical pass in 32-bit precision */
1715         dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
1717         dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
1719         dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
1721         dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
         /* scale back to 16-bit, add bi-pred rows, round/clip/pack */
1724         SRA_4V(dst0, dst1, dst2, dst3, 6);
1725         PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
1726         ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
1727         ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
1728         SRARI_H2_SH(out0, out1, 7);
1729         CLIP_SH2_0_255(out0, out1);
1730         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1731         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1732         dst += (4 * dst_stride);
1740         dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* hevc_hv_bi_8t_8multx1mult_msa: 2-D 8-tap bi-prediction filter for
 * widths that are multiples of 8: processes 8-column stripes (outer
 * loop), one output row per inner iteration.  Horizontal pass keeps
 * seven filtered context rows live; the vertical pass combines them,
 * adds the bi-pred row from src1_ptr, rounds and clips.  Shared worker
 * for the 8/16/24/32/48/64-wide hv wrappers.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * continuation lines (context-rotation statements, stripe-pointer
 * advances, closing braces); restore from upstream FFmpeg before
 * building. */
1744 static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
1747                                           int32_t src2_stride,
1750                                           const int8_t *filter_x,
1751                                           const int8_t *filter_y,
1752                                           int32_t height, int32_t width)
1756     uint8_t *src0_ptr_tmp;
1757     int16_t *src1_ptr_tmp;
1760     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1762     v8i16 filt0, filt1, filt2, filt3;
1763     v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1764     v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1765     v16i8 mask1, mask2, mask3;
1766     v8i16 filter_vec, const_vec;
1767     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1768     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1769     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1770     v4i32 dst0_r, dst0_l;
1771     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1772     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1774     src0_ptr -= ((3 * src_stride) + 3);
1775     const_vec = __msa_ldi_h(128);
1778     filter_vec = LD_SH(filter_x);
1779     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1781     filter_vec = LD_SH(filter_y);
1782     UNPCK_R_SB_SH(filter_vec, filter_vec);
1784     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
     /* one outer iteration per 8-column stripe */
1790     for (cnt = width >> 3; cnt--;) {
1791         src0_ptr_tmp = src0_ptr;
1793         src1_ptr_tmp = src1_ptr;
1795         LD_SB7(src0_ptr_tmp, src_stride,
1796                src0, src1, src2, src3, src4, src5, src6);
1797         src0_ptr_tmp += (7 * src_stride);
1798         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1800         /* row 0 row 1 row 2 row 3 */
1801         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1802                    vec0, vec1, vec2, vec3);
1803         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1804                    vec4, vec5, vec6, vec7);
1805         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1806                    vec8, vec9, vec10, vec11);
1807         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1808                    vec12, vec13, vec14, vec15);
1809         dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1811         dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1813         dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1815         dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1818         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1819                    vec0, vec1, vec2, vec3);
1820         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1821                    vec4, vec5, vec6, vec7);
1822         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1823                    vec8, vec9, vec10, vec11);
1824         dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1826         dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1828         dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
         /* one output row per inner iteration */
1831         for (loop_cnt = height; loop_cnt--;) {
1832             src7 = LD_SB(src0_ptr_tmp);
1833             src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1834             src0_ptr_tmp += src_stride;
1836             in0 = LD_SH(src1_ptr_tmp);
1837             src1_ptr_tmp += src2_stride;
1839             VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1840                        vec0, vec1, vec2, vec3);
1841             dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1843             ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1844             ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1845             ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1846             ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
             /* vertical pass in 32-bit, low/high halves separately */
1847             dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1848                                     filt_h0, filt_h1, filt_h2, filt_h3);
1849             dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1850                                     filt_h0, filt_h1, filt_h2, filt_h3);
             /* add bi-pred row plus offset, round by 7, pack to bytes */
1854             tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1855             ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
1856             tmp = __msa_srari_h(tmp, 7);
1858             out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
1859             ST_D1(out, 0, dst_tmp);
1860             dst_tmp += dst_stride;
1877 static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
1880 int32_t src2_stride,
1883 const int8_t *filter_x,
1884 const int8_t *filter_y,
1887 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1888 dst, dst_stride, filter_x, filter_y,
/* hevc_hv_bi_8t_12w_msa: 2-D 8-tap bi-prediction filter for
 * 12-pixel-wide blocks.  First pass handles the left 8 columns one row
 * at a time (mask0..mask3 shuffles, 16 rows); the second pass handles
 * the remaining 4 columns four rows at a time (mask4..mask7 shuffles,
 * two rows packed per vector), with a fixed 4-iteration loop — this
 * routine presumably serves height == 16 only; confirm against callers.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * continuation lines (mask setup, context rotations, closing braces);
 * restore from upstream FFmpeg before building. */
1892 static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
1895                                   int32_t src2_stride,
1898                                   const int8_t *filter_x,
1899                                   const int8_t *filter_y,
1903     uint8_t *src0_ptr_tmp, *dst_tmp;
1904     int16_t *src1_ptr_tmp;
1907     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1908     v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1909     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1910     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1911     v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
1912     v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1913     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1914     v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1915     v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1916     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1917     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1918     v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;
1920     src0_ptr -= ((3 * src_stride) + 3);
1922     const_vec = __msa_ldi_h(128);
1925     filter_vec = LD_SH(filter_x);
1926     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1928     filter_vec = LD_SH(filter_y);
1929     UNPCK_R_SB_SH(filter_vec, filter_vec);
1931     SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1933     mask0 = LD_SB(ff_hevc_mask_arr);
     /* ---- pass 1: left 8 columns ---- */
1938     src0_ptr_tmp = src0_ptr;
1940     src1_ptr_tmp = src1_ptr;
1942     LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
1944     src0_ptr_tmp += (7 * src_stride);
1945     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1947     /* row 0 row 1 row 2 row 3 */
1948     VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1950     VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1952     VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1954     VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1956     dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1958     dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1960     dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1962     dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1964     VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1966     VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1968     VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1970     dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1972     dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1974     dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
     /* 16 output rows, one per iteration */
1977     for (loop_cnt = 16; loop_cnt--;) {
1978         src7 = LD_SB(src0_ptr_tmp);
1979         src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1980         src0_ptr_tmp += src_stride;
1982         in0 = LD_SH(src1_ptr_tmp);
1983         src1_ptr_tmp += src2_stride;
1985         VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1987         dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1989         ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1990         ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1991         ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1992         ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1993         dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1994                                 filt_h1, filt_h2, filt_h3);
1995         dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1996                                 filt_h1, filt_h2, filt_h3);
         /* add bi-pred row + offset, round by 7, pack and store 8 bytes */
2000         tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
2001         ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
2002         tmp = __msa_srari_h(tmp, 7);
2004         out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
2005         ST_D1(out, 0, dst_tmp);
2006         dst_tmp += dst_stride;
     /* ---- pass 2: rightmost 4 columns, paired-row shuffles ---- */
2021     mask4 = LD_SB(ff_hevc_mask_arr + 16);
2026     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2027     src0_ptr += (7 * src_stride);
2028     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2030     /* row 0 row 1 row 2 row 3 */
2031     VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2032     VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2033     VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
2034                vec8, vec9, vec10, vec11);
2035     VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
2036                vec12, vec13, vec14, vec15);
2037     dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2039     dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2041     dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2043     dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2046     ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2047     ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2048     ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2050     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
2052     for (loop_cnt = 4; loop_cnt--;) {
2053         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2054         src0_ptr += (4 * src_stride);
2055         XORI_B4_128_SB(src7, src8, src9, src10);
2057         LD2(src1_ptr, src2_stride, tp0, tp1);
2058         INSERT_D2_SH(tp0, tp1, in0);
2059         src1_ptr += (2 * src2_stride);
2060         LD2(src1_ptr, src2_stride, tp0, tp1);
2061         INSERT_D2_SH(tp0, tp1, in1);
2062         src1_ptr += (2 * src2_stride);
2064         VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2066         VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2068         dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2070         dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2073         dst76 = __msa_ilvr_h(dst97, dst66);
2074         ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2075         dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2076         dst98 = __msa_ilvr_h(dst66, dst108);
2078         tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2080         tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2082         tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2084         tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2086         SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
2087         PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
2088         ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
2089         ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
2090         SRARI_H2_SH(out0, out1, 7);
2091         CLIP_SH2_0_255(out0, out1);
2092         out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
2093         ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2094         dst += (4 * dst_stride);
2102         dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
2106 static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
2109 int32_t src2_stride,
2112 const int8_t *filter_x,
2113 const int8_t *filter_y,
2116 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2117 dst, dst_stride, filter_x, filter_y,
2121 static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
2124 int32_t src2_stride,
2127 const int8_t *filter_x,
2128 const int8_t *filter_y,
2131 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2132 dst, dst_stride, filter_x, filter_y,
2136 static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
2139 int32_t src2_stride,
2142 const int8_t *filter_x,
2143 const int8_t *filter_y,
2146 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2147 dst, dst_stride, filter_x, filter_y,
2151 static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
2154 int32_t src2_stride,
2157 const int8_t *filter_x,
2158 const int8_t *filter_y,
2161 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2162 dst, dst_stride, filter_x, filter_y,
2166 static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
2169 int32_t src2_stride,
2172 const int8_t *filter_x,
2173 const int8_t *filter_y,
2176 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2177 dst, dst_stride, filter_x, filter_y,
/* hevc_hz_bi_4t_4x2_msa: horizontal 4-tap bi-prediction filter for a
 * 4x2 block: filter two rows horizontally, add the two 4-wide 16-bit
 * bi-pred rows (merged into one vector), round by 7, clip to [0,255]
 * and store two 4-byte words.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * lines (several declarations, mask1 setup, closing brace); restore
 * from upstream FFmpeg before building. */
2181 static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
2184                                   int32_t src2_stride,
2187                                   const int8_t *filter,
2191     v16i8 src0, src1, dst0, vec0, vec1;
2193     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2196     v8i16 filter_vec, const_vec;
2200     const_vec = __msa_ldi_h(128);
2203     filter_vec = LD_SH(filter);
2204     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2208     LD_SB2(src0_ptr, src_stride, src0, src1);
2209     LD_SH2(src1_ptr, src2_stride, in0, in1);
     /* pack both bi-pred rows into the low/high halves of one vector */
2210     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2211     XORI_B2_128_SB(src0, src1);
2212     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2214     DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
     /* bi-pred combine: add, round by 7, clip, pack to bytes */
2216     tmp0 = __msa_adds_s_h(tmp0, in0);
2217     tmp0 = __msa_srari_h(tmp0, 7);
2218     CLIP_SH_0_255(tmp0);
2219     dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
2221     ST_W2(dst0, 0, 1, dst, dst_stride);
/* hevc_hz_bi_4t_4x4_msa: horizontal 4-tap bi-prediction filter for a
 * 4x4 block: four rows filtered as two row-pairs, bi-pred rows added,
 * rounded by 7, clipped to [0,255] and stored as four 4-byte words.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * lines (declarations, mask1 setup, macro argument tails, closing
 * brace); restore from upstream FFmpeg before building. */
2224 static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
2227                                   int32_t src2_stride,
2230                                   const int8_t *filter,
2234     v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
2235     v8i16 in0, in1, in2, in3;
2237     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2240     v8i16 filter_vec, const_vec;
2244     const_vec = __msa_ldi_h(128);
2247     filter_vec = LD_SH(filter);
2248     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2252     LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2253     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
     /* pair the four 4-wide bi-pred rows into two vectors */
2255     ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2256     XORI_B4_128_SB(src0, src1, src2, src3);
2260     VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2261     VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2, vec3);
2262     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, tmp0, tmp1,
2264     HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
2265     dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2267     ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
/* hevc_hz_bi_4t_4x8multiple_msa: horizontal 4-tap bi-prediction filter
 * for 4-pixel-wide blocks whose height is a multiple of 8: eight rows
 * per iteration, filtered as four row-pairs, combined with the bi-pred
 * rows, rounded/clipped and stored as eight 4-byte words.
 * NOTE(review): block is damaged — stray leading numbers and missing
 * lines (declarations, mask1 setup, macro argument tails, closing
 * braces); restore from upstream FFmpeg before building. */
2270 static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
2273                                           int32_t src2_stride,
2276                                           const int8_t *filter,
2281     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2283     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2284     v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2285     v16i8 mask1, vec0, vec1, vec2, vec3;
2286     v8i16 tmp0, tmp1, tmp2, tmp3;
2287     v8i16 filter_vec, const_vec;
2291     const_vec = __msa_ldi_h(128);
2294     filter_vec = LD_SH(filter);
2295     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2299     for (loop_cnt = (height >> 3); loop_cnt--;) {
2300         LD_SB8(src0_ptr, src_stride,
2301                src0, src1, src2, src3, src4, src5, src6, src7);
2302         src0_ptr += (8 * src_stride);
         /* eight 4-wide bi-pred rows, paired by doubleword into four vectors */
2303         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2304         src1_ptr += (4 * src2_stride);
2305         LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2306         src1_ptr += (4 * src2_stride);
2307         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2308         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2309         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
         /* 4-tap horizontal filter: tap-0/1 then tap-2/3 accumulation */
2315         VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
2316         VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
2317         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, tmp0,
2319         VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
2320         VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
2321         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, tmp0,
2324         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2325                           tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
2327         PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2328         ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
2329         dst += (8 * dst_stride);
/* Dispatcher for 4-wide horizontal bi-prediction: selects the fixed-height
 * kernel (4x2, 4x4) or the multiple-of-8 loop kernel based on height.
 * NOTE(review): the height comparison for the first branch is not visible
 * in this extract. */
2333 static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
2336 int32_t src2_stride,
2339 const int8_t *filter,
2343 hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2344 dst, dst_stride, filter, height);
2345 } else if (4 == height) {
2346 hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2347 dst, dst_stride, filter, height);
2348 } else if (8 == height || 16 == height) {
2349 hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2350 src1_ptr, src2_stride,
2351 dst, dst_stride, filter, height);
/* HEVC horizontal 4-tap bi-prediction filter for 6-wide blocks (MSA).
 * 4 rows per loop iteration; each 6-pixel row is stored as one 4-byte
 * word plus one 2-byte halfword.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2355 static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
2358 int32_t src2_stride,
2361 const int8_t *filter,
2366 v16i8 src0, src1, src2, src3;
2367 v8i16 in0, in1, in2, in3;
/* mask_arr[0..] holds the shuffle pattern for 8-wide horizontal taps */
2368 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2370 v16i8 vec0, vec1, vec2, vec3;
2371 v8i16 dst0, dst1, dst2, dst3;
2372 v8i16 filter_vec, const_vec;
2376 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2379 filter_vec = LD_SH(filter);
2380 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* 4 output rows per iteration */
2384 for (loop_cnt = (height >> 2); loop_cnt--;) {
2385 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2386 src0_ptr += (4 * src_stride);
2387 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2388 src1_ptr += (4 * src2_stride);
/* unsigned -> signed bytes for DPADD */
2389 XORI_B4_128_SB(src0, src1, src2, src3);
/* tap pair 0, then accumulate tap pair 1 */
2395 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2396 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2397 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2399 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2400 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2401 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
/* add bi-pred input, round by 7 and clip to 0..255 */
2404 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2405 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* 6-wide store: 4 bytes as a word + 2 bytes as a halfword per row */
2407 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2408 ST_W2(dst0, 0, 2, dst, dst_stride);
2409 ST_H2(dst0, 2, 6, dst + 4, dst_stride);
2410 ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
2411 ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2412 dst += (4 * dst_stride);
/* HEVC horizontal 4-tap bi-prediction filter for an 8x2 block (MSA).
 * Two rows: horizontal filter, add 16-bit bi-pred samples, round by 7,
 * clip to 0..255, store two 8-byte rows.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2416 static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
2419 int32_t src2_stride,
2422 const int8_t *filter,
2428 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2429 v16i8 mask1, vec0, vec1, vec2, vec3;
2431 v8i16 filter_vec, const_vec;
2435 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2438 filter_vec = LD_SH(filter);
2439 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2443 LD_SB2(src0_ptr, src_stride, src0, src1);
2444 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* unsigned -> signed bytes for DPADD */
2445 XORI_B2_128_SB(src0, src1);
/* gather pixel pairs per tap and dot-product accumulate */
2449 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2450 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2451 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst0, dst1,
/* add bi-pred input, round by 7 and clip to 0..255 */
2453 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
/* pack to bytes and store two 8-byte rows */
2455 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2456 ST_D2(dst0, 0, 1, dst, dst_stride);
/* HEVC horizontal 4-tap bi-prediction filter for an 8x6 block (MSA).
 * Six rows in one pass: rows 0-3 and rows 4-5 are filtered separately,
 * each combined with the 16-bit bi-pred samples, rounded and clipped.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2459 static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
2462 int32_t src2_stride,
2465 const int8_t *filter,
2469 v16i8 src0, src1, src2, src3, src4, src5;
2470 v8i16 in0, in1, in2, in3, in4, in5;
2471 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2473 v16i8 vec0, vec1, vec2, vec3;
2474 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2475 v8i16 filter_vec, const_vec;
2479 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2482 filter_vec = LD_SH(filter);
2483 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2487 LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2488 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2489 src1_ptr += (4 * src2_stride);
2490 LD_SH2(src1_ptr, src2_stride, in4, in5);
/* unsigned -> signed bytes for DPADD */
2491 XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
/* rows 0-3: tap pair 0, then accumulate tap pair 1 */
2497 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2498 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2499 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0, dst1,
2501 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2502 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2503 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0, dst1,
/* rows 4-5 */
2508 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
2509 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec2, vec3);
2510 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4, dst5,
/* add bi-pred input, round by 7 and clip to 0..255 */
2513 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2514 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2515 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
/* pack to bytes and store six 8-byte rows */
2517 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2518 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2519 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2520 ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
/* HEVC horizontal 4-tap bi-prediction filter for 8xH blocks where H is a
 * multiple of 4 (MSA). 4 rows per loop iteration.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2523 static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
2526 int32_t src2_stride,
2529 const int8_t *filter,
2534 v16i8 src0, src1, src2, src3;
2535 v8i16 in0, in1, in2, in3;
2536 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
2538 v16i8 vec0, vec1, vec2, vec3;
2539 v8i16 dst0, dst1, dst2, dst3;
2540 v8i16 filter_vec, const_vec;
2544 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2547 filter_vec = LD_SH(filter);
2548 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* 4 output rows per iteration */
2552 for (loop_cnt = (height >> 2); loop_cnt--;) {
2553 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2554 src0_ptr += (4 * src_stride);
2555 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2556 src1_ptr += (4 * src2_stride);
/* unsigned -> signed bytes for DPADD */
2557 XORI_B4_128_SB(src0, src1, src2, src3);
/* tap pair 0, then accumulate tap pair 1 */
2563 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2564 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2565 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2567 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2568 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2569 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
/* add bi-pred input, round by 7 and clip to 0..255 */
2572 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2573 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* pack to bytes and store four 8-byte rows */
2575 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2576 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2577 dst += (4 * dst_stride);
/* Dispatcher for 8-wide horizontal bi-prediction: fixed-height kernels
 * for heights 2 and 6, otherwise the multiple-of-4 loop kernel.
 * NOTE(review): the height comparison for the first branch is not visible
 * in this extract. */
2581 static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
2584 int32_t src2_stride,
2587 const int8_t *filter,
2591 hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2592 dst, dst_stride, filter, height);
2593 } else if (6 == height) {
2594 hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2595 dst, dst_stride, filter, height);
2596 } else if (0 == (height % 4)) {
2597 hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2598 src1_ptr, src2_stride,
2599 dst, dst_stride, filter, height);
/* HEVC horizontal 4-tap bi-prediction filter for 12-wide blocks (MSA).
 * Each row is handled as an 8-wide part (mask0/mask1) plus a 4-wide part
 * (mask2/mask3, which spans two adjacent source rows per vector).
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2603 static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
2606 int32_t src2_stride,
2609 const int8_t *filter,
2614 v16i8 src0, src1, src2, src3;
2615 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2616 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
/* shuffle pattern for the trailing 4 pixels, crossing two row vectors */
2618 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2621 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
2622 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2623 v8i16 filter_vec, const_vec;
2627 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2630 filter_vec = LD_SH(filter);
2631 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* 4 output rows per iteration */
2636 for (loop_cnt = (height >> 2); loop_cnt--;) {
2637 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2638 src0_ptr += (4 * src_stride);
/* in0-3: first 8 columns; in4-7: columns 8-11 of the bi-pred rows */
2639 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2640 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2641 src1_ptr += (4 * src2_stride);
2643 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
/* unsigned -> signed bytes for DPADD */
2644 XORI_B4_128_SB(src0, src1, src2, src3);
/* tap pair 0 for 8-wide (dst0-3) and 4-wide (dst4-5) parts */
2652 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2653 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2654 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4, vec5);
2655 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2657 DPADD_SB2_SH(vec4, vec5, filt0, filt0, dst4, dst5);
/* accumulate tap pair 1 */
2658 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2659 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2660 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec4, vec5);
2661 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2663 DPADD_SB2_SH(vec4, vec5, filt1, filt1, dst4, dst5);
/* add bi-pred input, round by 7 and clip to 0..255 */
2665 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2666 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2667 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
/* store 8-byte rows plus the trailing 4-byte columns */
2669 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2670 dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2671 ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
2672 ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
2673 dst += (4 * dst_stride);
/* HEVC horizontal 4-tap bi-prediction filter for 16-wide blocks (MSA).
 * 2 rows per loop iteration; each row uses two 16-byte source loads
 * (offset 0 and 8) so the 4-tap window never crosses a vector boundary.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2677 static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
2680 int32_t src2_stride,
2683 const int8_t *filter,
2687 v16i8 src0, src1, src2, src3, vec0, vec1, vec2, vec3;
2688 v8i16 in0, in1, in2, in3, dst0, dst1, dst2, dst3;
2690 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2692 v8i16 filter_vec, const_vec;
2696 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2699 filter_vec = LD_SH(filter);
2700 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* 2 output rows per iteration */
2704 for (loop_cnt = (height >> 1); loop_cnt--;) {
2705 LD_SB2(src0_ptr, src_stride, src0, src2);
2706 LD_SB2(src0_ptr + 8, src_stride, src1, src3);
2707 src0_ptr += (2 * src_stride);
2708 LD_SH2(src1_ptr, src2_stride, in0, in2);
2709 LD_SH2(src1_ptr + 8, src2_stride, in1, in3);
2710 src1_ptr += (2 * src2_stride);
/* unsigned -> signed bytes for DPADD */
2712 XORI_B4_128_SB(src0, src1, src2, src3);
/* tap pair 0, then accumulate tap pair 1 */
2719 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2720 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
2721 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2723 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
2724 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
2725 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
/* add bi-pred input, round by 7 and clip to 0..255 */
2728 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2729 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* pack to bytes and store two full 16-byte rows */
2731 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2732 ST_SH2(dst0, dst1, dst, dst_stride);
2733 dst += (2 * dst_stride);
/* HEVC horizontal 4-tap bi-prediction filter for 24-wide blocks (MSA).
 * Each iteration handles 4 rows: the left 16 columns first (using mask2/
 * mask3 to bridge the two 16-byte source vectors), then the right 8
 * columns through src1_ptr_tmp/dst_tmp.
 * NOTE(review): some parameter/declaration lines (incl. dst_tmp setup)
 * are missing from this extract; comments cover only the visible code. */
2737 static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
2740 int32_t src2_stride,
2743 const int8_t *filter,
2746 int16_t *src1_ptr_tmp;
2749 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2750 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2752 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2753 v16i8 mask1, mask2, mask3;
2754 v16i8 vec0, vec1, vec2, vec3;
2755 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2756 v8i16 filter_vec, const_vec;
2760 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2763 filter_vec = LD_SH(filter);
2764 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* secondary pointer for the rightmost 8 columns */
2771 src1_ptr_tmp = src1_ptr + 16;
/* 4 output rows per iteration */
2773 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* even-numbered vectors: columns 0-15, odd: columns 16-31 */
2774 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2775 LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2776 src0_ptr += (4 * src_stride);
2777 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2778 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2779 src1_ptr += (4 * src2_stride);
/* unsigned -> signed bytes for DPADD */
2780 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* left 16 columns, rows 0-1: mask2 bridges the vector boundary */
2786 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2787 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask2, vec2, vec3);
2788 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2790 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2791 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask3, vec2, vec3);
2792 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
/* left 16 columns, rows 2-3 */
2799 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask2, vec0, vec1);
2800 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask2, vec2, vec3);
2801 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst4,
2803 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask3, vec0, vec1);
2804 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask3, vec2, vec3);
2805 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst4,
/* add bi-pred input, round by 7 and clip to 0..255 */
2808 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2809 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2810 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2811 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
/* store the left 16 columns of all 4 rows */
2813 PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2814 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2815 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2816 dst += (4 * dst_stride);
/* right 8 columns: bi-pred samples come from src1_ptr + 16 */
2818 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2819 src1_ptr_tmp += (4 * src2_stride);
2825 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2826 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2827 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2829 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec0, vec1);
2830 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec2, vec3);
2831 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
2834 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2835 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* store the right 8 columns of all 4 rows */
2837 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2838 ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
2839 dst_tmp += (4 * dst_stride);
/* HEVC horizontal 4-tap bi-prediction filter for 32-wide blocks (MSA).
 * One row per loop iteration; src2 re-loads bytes 24..39 so the 4-tap
 * window past column 24 stays inside one vector (mask0 variant usable).
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2843 static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
2846 int32_t src2_stride,
2849 const int8_t *filter,
2853 v16i8 src0, src1, src2;
2854 v8i16 in0, in1, in2, in3;
2856 v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2857 v16i8 mask1, mask2, mask3;
2858 v8i16 dst0, dst1, dst2, dst3;
2859 v16i8 vec0, vec1, vec2, vec3;
2860 v8i16 filter_vec, const_vec;
2864 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2867 filter_vec = LD_SH(filter);
2868 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* one 32-pixel row per iteration */
2874 for (loop_cnt = height; loop_cnt--;) {
2875 LD_SB2(src0_ptr, 16, src0, src1);
/* overlapping load at +24 keeps the last window in a single vector */
2876 src2 = LD_SB(src0_ptr + 24);
2877 src0_ptr += src_stride;
2878 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2879 src1_ptr += src2_stride;
/* unsigned -> signed bytes for DPADD */
2880 XORI_B3_128_SB(src0, src1, src2);
/* tap pair 0 (mask2 bridges src0/src1), then accumulate tap pair 1 */
2886 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask2, vec0, vec1);
2887 VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
2888 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
2890 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask3, vec0, vec1);
2891 VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
2892 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
/* add bi-pred input, round by 7 and clip to 0..255 */
2895 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2896 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
/* pack to bytes and store the full 32-byte row */
2898 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2899 ST_SH2(dst0, dst1, dst, 16);
/* HEVC vertical 4-tap bi-prediction filter for a 4x2 block (MSA).
 * Reads 3 context rows plus 2 new rows, interleaves vertically adjacent
 * rows for the dot product, adds the 16-bit bi-pred samples, rounds by 7,
 * clips to 0..255 and stores two 4-byte rows.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2904 static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
2907 int32_t src2_stride,
2910 const int8_t *filter,
2913 v16i8 src0, src1, src2, src3, src4;
2915 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2918 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
2920 src0_ptr -= src_stride;
2922 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2925 filter_vec = LD_SH(filter);
2926 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2928 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2929 src0_ptr += (3 * src_stride);
/* interleave row pairs (1,0) and (2,1), combine into one vector */
2931 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2932 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
/* unsigned -> signed bytes for DPADD */
2933 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2935 LD_SB2(src0_ptr, src_stride, src3, src4);
2936 LD_SH2(src1_ptr, src2_stride, in0, in1);
2937 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2938 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2939 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2940 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
/* vertical dot product, then bi-pred add / round / clip */
2943 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2944 dst10 = __msa_adds_s_h(dst10, in0);
2945 dst10 = __msa_srari_h(dst10, 7);
2946 CLIP_SH_0_255(dst10);
/* pack to bytes and store two 4-byte rows */
2948 dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
2949 ST_W2(dst10, 0, 1, dst, dst_stride);
/* HEVC vertical 4-tap bi-prediction filter for a 4x4 block (MSA).
 * Same pipeline as the 4x2 variant but produces four rows from 3 context
 * rows + 4 new rows.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
2952 static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
2955 int32_t src2_stride,
2958 const int8_t *filter,
2961 v16i8 src0, src1, src2, src3, src4, src5, src6;
2962 v8i16 in0, in1, in2, in3;
2963 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2964 v16i8 src2110, src4332, src6554;
2967 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
2969 src0_ptr -= src_stride;
2971 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
2974 filter_vec = LD_SH(filter);
2975 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2977 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2978 src0_ptr += (3 * src_stride);
/* interleave vertically adjacent context rows */
2979 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2980 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
/* unsigned -> signed bytes for DPADD */
2981 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2983 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
2984 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* pack 4-wide bi-pred rows in pairs */
2985 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2986 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2987 src32_r, src43_r, src54_r, src65_r);
2988 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
2989 XORI_B2_128_SB(src4332, src6554);
/* two vertical dot products covering rows 0-1 and 2-3 */
2992 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2994 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
/* add bi-pred input, round by 7 and clip to 0..255 */
2995 HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
/* pack to bytes and store four 4-byte rows */
2997 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
2998 ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
/* HEVC vertical 4-tap bi-prediction filter for 4xH blocks where H is a
 * multiple of 8 (MSA). Keeps a rolling 2-row context in src2110 across
 * iterations; 8 output rows per loop pass.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
3001 static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
3004 int32_t src2_stride,
3007 const int8_t *filter,
3011 v16i8 src0, src1, src2, src3, src4, src5;
3012 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3013 v16i8 src6, src7, src8, src9;
3014 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3015 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3016 v16i8 src2110, src4332, src6554, src8776;
3017 v8i16 dst10, dst32, dst54, dst76;
3019 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
3021 src0_ptr -= src_stride;
3023 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
3026 filter_vec = LD_SH(filter);
3027 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3029 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3030 src0_ptr += (3 * src_stride);
/* initial 2-row interleaved context */
3031 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3032 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
3033 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
/* 8 output rows per iteration */
3035 for (loop_cnt = (height >> 3); loop_cnt--;) {
3036 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3037 src0_ptr += (6 * src_stride);
3038 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3039 src1_ptr += (8 * src2_stride);
/* pack 4-wide bi-pred rows in pairs */
3040 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3041 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3042 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3043 src32_r, src43_r, src54_r, src65_r);
3044 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3045 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3046 src4332, src6554, src8776);
3047 XORI_B3_128_SB(src4332, src6554, src8776);
/* rows 0-5: three vertical dot products */
3050 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3052 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3054 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
/* load the last 2 rows; src2 becomes context for the next iteration */
3056 LD_SB2(src0_ptr, src_stride, src9, src2);
3057 src0_ptr += (2 * src_stride);
3058 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3059 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3060 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3062 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
/* add bi-pred input, round by 7 and clip to 0..255 */
3064 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3065 dst10, dst32, dst54, dst76, 7,
3066 dst10, dst32, dst54, dst76);
/* pack to bytes and store 8 rows of 4 pixels */
3068 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
3069 ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3070 dst += (8 * dst_stride);
/* Dispatcher for 4-wide vertical bi-prediction: fixed-height kernels for
 * heights 2 and 4, otherwise the multiple-of-8 loop kernel.
 * NOTE(review): the height comparison for the first branch is not visible
 * in this extract. */
3074 static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
3077 int32_t src2_stride,
3080 const int8_t *filter,
3084 hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3085 dst, dst_stride, filter, height);
3086 } else if (4 == height) {
3087 hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3088 dst, dst_stride, filter, height);
3090 hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
3091 src1_ptr, src2_stride,
3092 dst, dst_stride, filter, height);
/* HEVC vertical 4-tap bi-prediction filter for 6-wide blocks (MSA).
 * Loads all 11 source rows up front, then produces two groups of 4 rows;
 * each 6-pixel output row is stored as a word + halfword.
 * NOTE(review): some parameter/declaration lines (and any outer loop
 * header) are missing from this extract; comments cover visible code. */
3096 static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
3099 int32_t src2_stride,
3102 const int8_t *filter,
3105 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3106 v8i16 in0, in1, in2, in3;
3107 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3108 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3110 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
3112 src0_ptr -= src_stride;
3114 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
3117 filter_vec = LD_SH(filter);
3118 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* load 3 context rows + 8 data rows */
3120 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3121 src0_ptr += (3 * src_stride);
3122 LD_SB2(src0_ptr, src_stride, src3, src4);
3123 src0_ptr += (2 * src_stride);
3124 LD_SB2(src0_ptr, src_stride, src5, src6);
3125 src0_ptr += (2 * src_stride);
3126 LD_SB2(src0_ptr, src_stride, src7, src8);
3127 src0_ptr += (2 * src_stride);
3128 LD_SB2(src0_ptr, src_stride, src9, src10);
3129 src0_ptr += (2 * src_stride);
3131 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3132 src1_ptr += (4 * src2_stride);
/* unsigned -> signed bytes for DPADD */
3134 XORI_B3_128_SB(src0, src1, src2);
3135 XORI_B2_128_SB(src3, src4);
3136 XORI_B2_128_SB(src5, src6);
3137 XORI_B2_128_SB(src7, src8);
3138 XORI_B2_128_SB(src9, src10);
/* first group of 4 output rows */
3140 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3141 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3144 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3146 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3148 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3151 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3153 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
/* add bi-pred input, round by 7 and clip to 0..255 */
3155 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3156 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3157 dst0_r, dst1_r, dst2_r, dst3_r);
/* 6-wide store: word + halfword per row */
3159 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3160 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3161 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3162 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3163 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3164 dst += (4 * dst_stride);
/* second group of 4 output rows (reuses src54_r/src65_r as context) */
3166 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3167 src1_ptr += (4 * src2_stride);
3168 ILVR_B2_SB(src7, src6, src8, src7, src32_r, src43_r);
3171 DPADD_SB2_SH(src54_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3173 DPADD_SB2_SH(src65_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3175 ILVR_B2_SB(src9, src8, src10, src9, src54_r, src65_r);
3178 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3180 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3182 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3183 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3184 dst0_r, dst1_r, dst2_r, dst3_r);
3186 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3187 ST_W2(dst0_r, 0, 2, dst, dst_stride);
3188 ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
3189 ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
3190 ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
3191 dst += (4 * dst_stride);
/* HEVC vertical 4-tap bi-prediction filter for an 8x2 block (MSA).
 * 3 context rows + 2 new rows produce two 8-byte output rows.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
3194 static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
3197 int32_t src2_stride,
3200 const int8_t *filter,
3203 v16i8 src0, src1, src2, src3, src4;
3204 v8i16 in0, in1, dst0_r, dst1_r;
3205 v16i8 src10_r, src32_r, src21_r, src43_r;
3207 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
3209 src0_ptr -= src_stride;
3211 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
3214 filter_vec = LD_SH(filter);
3215 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3217 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3218 src0_ptr += (3 * src_stride);
/* unsigned -> signed bytes for DPADD */
3219 XORI_B3_128_SB(src0, src1, src2);
3220 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3222 LD_SB2(src0_ptr, src_stride, src3, src4);
3223 LD_SH2(src1_ptr, src2_stride, in0, in1);
3224 XORI_B2_128_SB(src3, src4);
3225 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
/* vertical dot products for both rows */
3228 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3230 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* add bi-pred input, round by 7 and clip to 0..255 */
3232 HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
3233 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3235 ST_D2(dst0_r, 0, 1, dst, dst_stride);
/* HEVC vertical 4-tap bi-prediction filter for an 8x6 block (MSA).
 * 3 context rows + 6 new rows produce six 8-byte output rows in one pass.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
3238 static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
3241 int32_t src2_stride,
3244 const int8_t *filter,
3247 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3248 v8i16 in0, in1, in2, in3, in4, in5;
3249 v16i8 src10_r, src32_r, src54_r, src76_r;
3250 v16i8 src21_r, src43_r, src65_r, src87_r;
3251 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3253 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
3255 src0_ptr -= src_stride;
3257 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
3260 filter_vec = LD_SH(filter);
3261 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3263 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3264 src0_ptr += (3 * src_stride);
/* unsigned -> signed bytes for DPADD */
3265 XORI_B3_128_SB(src0, src1, src2);
3266 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3268 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3269 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3270 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
/* interleave all vertically adjacent row pairs */
3271 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3272 src32_r, src43_r, src54_r, src65_r);
3273 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
/* six vertical dot products, one per output row */
3276 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3278 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3280 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3282 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3284 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3286 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
/* add bi-pred input, round by 7 and clip to 0..255 */
3287 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3288 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3289 dst0_r, dst1_r, dst2_r, dst3_r);
3290 HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
/* pack to bytes and store six 8-byte rows */
3292 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3293 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
3294 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3295 ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
/* HEVC vertical 4-tap bi-prediction filter for 8xH blocks where H is a
 * multiple of 4 (MSA). Keeps src2/src10_r/src21_r as rolling context;
 * 4 rows per loop iteration.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
3298 static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
3301 int32_t src2_stride,
3304 const int8_t *filter,
3308 v16i8 src0, src1, src2, src3, src4, src5;
3309 v8i16 in0, in1, in2, in3;
3310 v16i8 src10_r, src32_r, src21_r, src43_r;
3311 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3313 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
3315 src0_ptr -= src_stride;
3317 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
3320 filter_vec = LD_SH(filter);
3321 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3323 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3324 src0_ptr += (3 * src_stride);
/* unsigned -> signed bytes for DPADD */
3325 XORI_B3_128_SB(src0, src1, src2);
3326 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
/* 4 output rows per iteration */
3328 for (loop_cnt = (height >> 2); loop_cnt--;) {
3329 LD_SB2(src0_ptr, src_stride, src3, src4);
3330 src0_ptr += (2 * src_stride);
3331 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3332 src1_ptr += (4 * src2_stride);
3333 XORI_B2_128_SB(src3, src4);
3334 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
/* rows 0-1 */
3337 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3339 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* rows 2-3; src2 and src10_r/src21_r become next-iteration context */
3341 LD_SB2(src0_ptr, src_stride, src5, src2);
3342 src0_ptr += (2 * src_stride);
3343 XORI_B2_128_SB(src5, src2);
3344 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3347 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3349 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* add bi-pred input, round by 7 and clip to 0..255 */
3350 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3351 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3352 dst0_r, dst1_r, dst2_r, dst3_r);
/* pack to bytes and store four 8-byte rows */
3354 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3355 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3356 dst += (4 * dst_stride);
/* Dispatcher for 8-wide vertical bi-prediction: fixed-height kernels for
 * heights 2 and 6, otherwise the multiple-of-4 loop kernel.
 * NOTE(review): the height comparison for the first branch is not visible
 * in this extract. */
3360 static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
3363 int32_t src2_stride,
3366 const int8_t *filter,
3370 hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3371 dst, dst_stride, filter, height);
3372 } else if (6 == height) {
3373 hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3374 dst, dst_stride, filter, height);
3376 hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
3377 src1_ptr, src2_stride,
3378 dst, dst_stride, filter, height);
/* HEVC vertical 4-tap bi-prediction filter for 12-wide blocks (MSA).
 * The left 8 columns use the right-interleaved (_r) row pairs; the
 * trailing 4 columns use left-interleaved (_l) pairs folded into src2110/
 * src4332/src6554. 4 rows per loop iteration.
 * NOTE(review): some parameter/declaration lines are missing from this
 * extract; comments cover only the visible statements. */
3382 static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
3385 int32_t src2_stride,
3388 const int8_t *filter,
3392 v16i8 src0, src1, src2, src3, src4, src5, src6;
3393 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3394 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
3395 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3396 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3397 v16i8 src2110, src4332, src6554;
3398 v8i16 dst0_l, dst1_l, filt0, filt1;
3399 v8i16 filter_vec, const_vec;
/* back up one row so the first tap has its context */
3401 src0_ptr -= (1 * src_stride);
3403 const_vec = __msa_ldi_h(128);
/* broadcast the two filter coefficient pairs */
3406 filter_vec = LD_SH(filter);
3407 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3409 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3410 src0_ptr += (3 * src_stride);
/* unsigned -> signed bytes for DPADD */
3411 XORI_B3_128_SB(src0, src1, src2);
/* right-interleave for columns 0-7, left-interleave for columns 8-11 */
3412 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3413 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3414 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
/* 4 output rows per iteration */
3416 for (loop_cnt = (height >> 2); loop_cnt--;) {
3417 LD_SB2(src0_ptr, src_stride, src3, src4);
3418 src0_ptr += (2 * src_stride);
3419 LD_SB2(src0_ptr, src_stride, src5, src6);
3420 src0_ptr += (2 * src_stride);
/* in0-3: first 8 columns; in4-7: columns 8-11 of the bi-pred rows */
3421 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3422 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3423 src1_ptr += (4 * src2_stride);
3424 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3425 XORI_B2_128_SB(src3, src4);
3426 XORI_B2_128_SB(src5, src6);
3428 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3429 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3430 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3431 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
3432 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
3433 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
/* left 8 columns (dst*_r) and right 4 columns (dst*_l) dot products */
3436 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3438 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3440 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
3442 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3444 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3446 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
/* add bi-pred input, round by 7 and clip to 0..255 */
3447 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3448 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3449 dst0_r, dst1_r, dst2_r, dst3_r);
3450 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
/* store 8-byte rows plus the trailing 4-byte columns */
3452 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3453 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3454 ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
3455 ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
3456 dst += (4 * dst_stride);
/* Vertical 4-tap HEVC bi-prediction filter, 16-column case (MIPS MSA).
 * Filters 8-bit pels from src0_ptr vertically and combines them with the
 * 16-bit first-pass intermediates read from src1_ptr via HEVC_BI_RND_CLIP4
 * (saturating add, round-shift by 7, clip to 0..255), then packs back to
 * 8-bit and stores to dst.  Processes 4 output rows per loop iteration,
 * in two 2-row stages with software-pipelined row reuse.
 * NOTE(review): original line numbers in this listing are non-contiguous —
 * some lines (several parameters, closing braces, and presumably a
 * const_vec shift) are elided from this dump; confirm against the full
 * file before relying on the exact sequence. */
3465 static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
3468 int32_t src2_stride,
3471 const int8_t *filter,
3475 v16i8 src0, src1, src2, src3, src4, src5;
3476 v8i16 in0, in1, in2, in3;
3477 v16i8 src10_r, src32_r, src21_r, src43_r;
3478 v16i8 src10_l, src32_l, src21_l, src43_l;
3479 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3481 v8i16 filter_vec, const_vec;
/* start one row above the block: the 4-tap filter needs one line of context */
3483 src0_ptr -= src_stride;
/* bias compensating for the sign-flip of the pels below; a follow-up shift
 * of const_vec appears elided from this listing — TODO confirm */
3485 const_vec = __msa_ldi_h(128);
/* splat the two 4-tap coefficient pairs into filt0/filt1 */
3488 filter_vec = LD_SH(filter);
3489 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3491 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3492 src0_ptr += (3 * src_stride);
/* flip the MSB: converts unsigned pels to signed so DPADD_SB2_SH
 * (signed byte dot-product) can be used */
3493 XORI_B3_128_SB(src0, src1, src2);
/* interleave adjacent rows (right/low and left/high byte halves) so each
 * vector lane holds a vertical pel pair ready for the dot product */
3494 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3495 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3497 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* ---- first 2 output rows of this iteration ---- */
3498 LD_SB2(src0_ptr, src_stride, src3, src4);
3499 src0_ptr += (2 * src_stride);
/* 16-bit intermediates of the other prediction: 8 left + 8 right cols */
3500 LD_SH2(src1_ptr, src2_stride, in0, in1);
3501 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3502 src1_ptr += (2 * src2_stride);
3503 XORI_B2_128_SB(src3, src4);
3504 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3505 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3508 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3510 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3512 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3514 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* add intermediates, round by 7 and clip to pixel range */
3515 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3516 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3517 dst0_r, dst1_r, dst0_l, dst1_l);
/* pack 16-bit results to bytes and store 2 rows of 16 */
3519 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3520 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3521 dst += (2 * dst_stride);
/* ---- second 2 output rows; src2 is recycled as the newest row so the
 * pipelined src10/src21 state stays valid for the next iteration ---- */
3523 LD_SB2(src0_ptr, src_stride, src5, src2);
3524 src0_ptr += (2 * src_stride);
3525 LD_SH2(src1_ptr, src2_stride, in0, in1);
3526 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3527 src1_ptr += (2 * src2_stride);
3528 XORI_B2_128_SB(src5, src2);
3529 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3530 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3533 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3535 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3537 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3539 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3540 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3541 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3542 dst0_r, dst1_r, dst0_l, dst1_l);
3544 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3545 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3546 dst += (2 * dst_stride);
/* Vertical 4-tap HEVC bi-prediction filter, 24-column case (MIPS MSA).
 * Handles the left 16 columns with right+left interleaves (full 16-byte
 * vectors) and the extra 8 columns at offset +16 with right interleaves
 * only (src6..src11, *_r vectors).  Bi-prediction combine is
 * HEVC_BI_RND_CLIP{4,2}: saturating add of the 16-bit intermediates from
 * src1_ptr, round-shift by 7, clip to 0..255.  4 output rows/iteration.
 * NOTE(review): non-contiguous original line numbers — some parameter and
 * closing-brace lines are elided from this dump. */
3550 static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
3553 int32_t src2_stride,
3556 const int8_t *filter,
3560 v16i8 src0, src1, src2, src3, src4, src5;
3561 v16i8 src6, src7, src8, src9, src10, src11;
3562 v8i16 in0, in1, in2, in3, in4, in5;
3563 v16i8 src10_r, src32_r, src76_r, src98_r;
3564 v16i8 src21_r, src43_r, src87_r, src109_r;
3565 v16i8 src10_l, src32_l, src21_l, src43_l;
3566 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3567 v8i16 dst0_l, dst1_l;
3569 v8i16 filter_vec, const_vec;
/* one row of top context for the 4-tap filter */
3571 src0_ptr -= src_stride;
3573 const_vec = __msa_ldi_h(128);
3576 filter_vec = LD_SH(filter);
3577 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prologue, left 16 columns: 3 rows, sign-flip, vertical interleaves */
3580 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3581 XORI_B3_128_SB(src0, src1, src2);
3582 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3583 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* prologue, right 8 columns (offset +16): right interleave only */
3585 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3586 src0_ptr += (3 * src_stride);
3587 XORI_B3_128_SB(src6, src7, src8);
3588 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3590 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* ---- first 2 output rows ---- */
3592 LD_SB2(src0_ptr, src_stride, src3, src4);
/* intermediates: cols 0..7, 8..15, 16..23 */
3593 LD_SH2(src1_ptr, src2_stride, in0, in1);
3594 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3595 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3596 src1_ptr += (2 * src2_stride);
3597 XORI_B2_128_SB(src3, src4);
3598 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3599 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3601 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3602 src0_ptr += (2 * src_stride);
3603 XORI_B2_128_SB(src9, src10);
3604 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3607 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3609 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3611 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3613 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* right 8-column strip */
3616 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3618 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3620 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3621 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3622 dst0_r, dst1_r, dst0_l, dst1_l);
3624 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
/* store 16 cols (ST_SH2) plus the 8-col remainder (ST_D2 at dst+16) */
3626 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3627 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3628 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3629 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3630 dst += (2 * dst_stride);
/* ---- second 2 output rows; src2/src8 recycled as newest rows to keep
 * the pipelined interleave state valid across iterations ---- */
3633 LD_SB2(src0_ptr, src_stride, src5, src2);
3634 LD_SH2(src1_ptr, src2_stride, in0, in1);
3635 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3636 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3637 src1_ptr += (2 * src2_stride);
3638 XORI_B2_128_SB(src5, src2);
3639 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3640 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3642 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3643 src0_ptr += (2 * src_stride);
3644 XORI_B2_128_SB(src11, src8);
3645 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3648 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3650 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3652 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3654 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3657 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3659 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3661 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3662 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3663 dst0_r, dst1_r, dst0_l, dst1_l);
3664 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3666 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3667 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3668 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3669 ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
3670 dst += (2 * dst_stride);
/* Vertical 4-tap HEVC bi-prediction filter, 32-column case (MIPS MSA).
 * The 32 columns are processed as two independent 16-column halves:
 * the left half writes through dst, the right half (source offset +16)
 * through dst_tmp = dst + 16.  2 output rows per loop iteration.
 * Combine is HEVC_BI_RND_CLIP4 (add 16-bit intermediates from src1_ptr,
 * round-shift by 7, clip 0..255).
 * NOTE(review): original line numbers are non-contiguous here — some
 * parameter/brace lines are elided from this dump. */
3674 static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
3677 int32_t src2_stride,
3680 const int8_t *filter,
/* second write cursor for the right 16-column half */
3684 uint8_t *dst_tmp = dst + 16;
3685 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3686 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3687 v16i8 src10_r, src32_r, src76_r, src98_r;
3688 v16i8 src21_r, src43_r, src87_r, src109_r;
3689 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3690 v16i8 src10_l, src32_l, src76_l, src98_l;
3691 v16i8 src21_l, src43_l, src87_l, src109_l;
3692 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3694 v8i16 filter_vec, const_vec;
/* one row of top context for the 4-tap filter */
3696 src0_ptr -= src_stride;
3698 const_vec = __msa_ldi_h(128);
3701 filter_vec = LD_SH(filter);
3702 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prologue for the left half: 3 rows, sign-flip, vertical interleaves */
3705 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3706 XORI_B3_128_SB(src0, src1, src2);
3707 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3708 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* prologue for the right half at column offset +16 */
3711 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3712 src0_ptr += (3 * src_stride);
3713 XORI_B3_128_SB(src6, src7, src8);
3714 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3715 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3717 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* ---- left 16 columns, 2 rows ---- */
3719 LD_SB2(src0_ptr, src_stride, src3, src4);
/* intermediates: cols 0..7, 8..15, 16..23, 24..31 */
3720 LD_SH2(src1_ptr, src2_stride, in0, in1);
3721 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3722 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3723 LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3724 src1_ptr += (2 * src2_stride);
3725 XORI_B2_128_SB(src3, src4);
3726 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3727 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3730 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3732 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3734 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3736 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3738 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3739 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3740 dst0_r, dst1_r, dst0_l, dst1_l);
/* NOTE(review): lines 3741..3747 of the original are elided here
 * (presumably pipeline-state rotation for the left half) — confirm */
3748 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3749 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3750 dst += (2 * dst_stride);
/* ---- right 16 columns, same 2 rows ---- */
3753 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3754 src0_ptr += (2 * src_stride);
3755 XORI_B2_128_SB(src9, src10);
3756 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3757 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3760 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3762 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3764 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3766 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3768 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
3769 dst2_r, dst3_r, dst2_l, dst3_l, 7,
3770 dst2_r, dst3_r, dst2_l, dst3_l);
3772 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3773 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3774 dst_tmp += (2 * dst_stride);
/* 2-D (horizontal-then-vertical) 4-tap HEVC bi-prediction filter,
 * 4x2 block (MIPS MSA).  Horizontal pass: VSHF_B2 gathers neighbour pels
 * per ff_hevc_mask_arr and HEVC_FILT_4TAP_SH applies filter_x.  Vertical
 * pass: HEVC_FILT_4TAP applies filter_y on interleaved intermediate rows.
 * The result is added to the 16-bit intermediates from src1_ptr, rounded
 * by 7 and packed to bytes.
 * NOTE(review): non-contiguous line numbers — parameters and some
 * statements (e.g. the post-vertical downshift around line 3841) are
 * elided from this dump. */
3784 static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
3787 int32_t src2_stride,
3790 const int8_t *filter_x,
3791 const int8_t *filter_y)
3796 v16i8 src0, src1, src2, src3, src4;
3798 v8i16 filt_h0, filt_h1;
/* mask at +16: 4-wide variant that pairs pels from two packed rows */
3799 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3801 v8i16 filter_vec, const_vec;
3802 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3803 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43, tmp;
/* back up one row and one column: 4-tap needs one sample of context */
3806 src0_ptr -= (src_stride + 1);
3808 filter_vec = LD_SH(filter_x);
3809 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* widen the vertical taps to 32-bit lanes for HEVC_FILT_4TAP */
3811 filter_vec = LD_SH(filter_y);
3812 UNPCK_R_SB_SH(filter_vec, filter_vec);
3814 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3818 const_vec = __msa_ldi_h(128);
3821 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
/* sign-flip pels for the signed dot-product macros */
3822 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* two rows of 4 intermediates, packed into one v8i16 and biased */
3824 LD2(src1_ptr, src2_stride, tp0, tp1);
3825 INSERT_D2_SH(tp0, tp1, in0);
3826 in0 = __msa_adds_s_h(in0, const_vec);
/* horizontal pass on row pairs (0,2), (1,3), (2,4) */
3828 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3829 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3830 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3832 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3833 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3834 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
/* interleave intermediate rows into vertical pair vectors */
3836 ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3837 ILVRL_H2_SH(dst42, dst31, dst21, dst43);
/* vertical pass */
3839 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3840 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
/* combine with intermediates, round by 7 and clip via saturating ops */
3843 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3844 tmp = __msa_adds_s_h(tmp, in0);
3845 tmp = __msa_srari_h(tmp, 7);
/* pack to bytes and store two 4-pel rows */
3847 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
3848 ST_W2(out, 0, 1, dst, dst_stride);
/* 2-D 4-tap HEVC bi-prediction filter, 4x4 block (MIPS MSA).
 * Same structure as the 4x2 variant: horizontal 4-tap via VSHF + dot
 * product on packed row pairs, vertical 4-tap on interleaved 16-bit
 * intermediates, then bi-prediction combine (add src1_ptr intermediates,
 * round by 7, clip to 0..255) and 4x4 byte store.
 * NOTE(review): some lines are elided from this dump (non-contiguous
 * original numbering). */
3851 static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
3854 int32_t src2_stride,
3857 const int8_t *filter_x,
3858 const int8_t *filter_y)
3862 v16i8 src0, src1, src2, src3, src4, src5, src6;
3864 v8i16 filt_h0, filt_h1;
/* 4-wide shuffle mask operating on two packed rows at once */
3865 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3867 v8i16 filter_vec, const_vec;
3868 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3870 v8i16 in0 = { 0 }, in1 = { 0 };
3871 v8i16 dst30, dst41, dst52, dst63;
3872 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3873 v4i32 dst0, dst1, dst2, dst3;
/* one row and one column of context for the two 4-tap passes */
3875 src0_ptr -= (src_stride + 1);
3877 filter_vec = LD_SH(filter_x);
3878 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* widen vertical taps to 32-bit lanes */
3880 filter_vec = LD_SH(filter_y);
3881 UNPCK_R_SB_SH(filter_vec, filter_vec);
3883 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* 4 output rows need 7 input rows; sign-flip for signed dot products */
3887 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
3888 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3890 const_vec = __msa_ldi_h(128);
/* four rows of 4 intermediates -> two v8i16, plus bias */
3893 LD2(src1_ptr, src2_stride, tp0, tp1);
3894 src1_ptr += 2 * src2_stride;
3895 INSERT_D2_SH(tp0, tp1, in0);
3896 LD2(src1_ptr, src2_stride, tp0, tp1);
3897 INSERT_D2_SH(tp0, tp1, in1);
3899 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
/* horizontal pass on row pairs (0,3), (1,4), (2,5), (3,6) */
3901 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3902 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3903 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3904 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3906 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3907 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3908 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3909 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* rebuild consecutive-row pairs from the packed two-row vectors */
3911 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3912 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3913 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
/* vertical pass: one 32-bit result vector per output row */
3914 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3915 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3916 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3917 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
/* downshift to 16-bit, combine with intermediates, round, clip, store */
3918 SRA_4V(dst0, dst1, dst2, dst3, 6);
3919 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3920 ADDS_SH2_SH(tmp0, in0, tmp1, in1, tmp0, tmp1);
3921 SRARI_H2_SH(tmp0, tmp1, 7);
3922 CLIP_SH2_0_255(tmp0, tmp1);
3923 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
3924 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
/* 2-D 4-tap HEVC bi-prediction filter, width 4, height a multiple of 8
 * (MIPS MSA).  Prologue computes the first two horizontal-filtered rows;
 * the loop then produces 8 output rows per iteration, carrying dst10_r /
 * dst21_r / dst22 across iterations as pipeline state.
 * NOTE(review): non-contiguous original numbering — parameter lines and
 * end-of-loop state rotation (after line 4042) are partially elided. */
3927 static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
3930 int32_t src2_stride,
3933 const int8_t *filter_x,
3934 const int8_t *filter_y,
3940 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3942 v8i16 filt_h0, filt_h1;
/* 4-wide shuffle mask (two packed rows per vector) */
3943 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3945 v8i16 filter_vec, const_vec;
3946 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3947 v8i16 tmp0, tmp1, tmp2, tmp3;
3948 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3949 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3950 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3951 v8i16 dst98_r, dst109_r;
3952 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
3953 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
/* one row and one column of context for the two 4-tap passes */
3955 src0_ptr -= (src_stride + 1);
3957 filter_vec = LD_SH(filter_x);
3958 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3960 filter_vec = LD_SH(filter_y);
3961 UNPCK_R_SB_SH(filter_vec, filter_vec);
3963 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3967 const_vec = __msa_ldi_h(128);
/* prologue: horizontal-filter the first 3 rows into two pair vectors */
3970 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3971 src0_ptr += (3 * src_stride);
3972 XORI_B3_128_SB(src0, src1, src2);
3974 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3975 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3976 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3977 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3978 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
/* dst22 = upper half of dst21: newest filtered row, carried forward */
3979 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3982 for (loop_cnt = height >> 3; loop_cnt--;) {
3983 LD_SB8(src0_ptr, src_stride,
3984 src3, src4, src5, src6, src7, src8, src9, src10);
3985 src0_ptr += (8 * src_stride);
3986 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
/* horizontal pass: each vector packs two rows 4 apart */
3987 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3988 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3989 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3990 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3992 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3993 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3994 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3995 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* rebuild consecutive-row pair vectors for the vertical pass */
3997 dst32_r = __msa_ilvr_h(dst73, dst22);
3998 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3999 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4000 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4001 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4002 dst76_r = __msa_ilvr_h(dst22, dst106);
/* load 8 rows of 4 intermediates (two per 64-bit load) and add bias */
4004 LD2(src1_ptr, src2_stride, tp0, tp1);
4005 src1_ptr += 2 * src2_stride;
4006 INSERT_D2_SH(tp0, tp1, in0);
4007 LD2(src1_ptr, src2_stride, tp0, tp1);
4008 src1_ptr += 2 * src2_stride;
4009 INSERT_D2_SH(tp0, tp1, in1);
4011 LD2(src1_ptr, src2_stride, tp0, tp1);
4012 src1_ptr += 2 * src2_stride;
4013 INSERT_D2_SH(tp0, tp1, in2);
4014 LD2(src1_ptr, src2_stride, tp0, tp1);
4015 src1_ptr += 2 * src2_stride;
4016 INSERT_D2_SH(tp0, tp1, in3);
4018 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4019 const_vec, in0, in1, in2, in3);
/* vertical pass: one 32-bit vector per pair of output rows */
4020 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4021 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4022 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4023 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4024 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4025 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4026 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4027 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
/* downshift, pack, bi-predict combine (round 7, clip 0..255), store */
4028 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4029 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4030 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
4031 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4032 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1,
4034 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4035 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4036 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4037 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4038 dst += (8 * dst_stride);
/* rotate pipeline state for the next 8 rows; the matching dst10_r/dst21_r
 * updates (lines 4040-4041) appear elided from this dump — confirm */
4042 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Dispatcher for width-4 2-D 4-tap bi-prediction: routes by height to the
 * specialised 4x2 / 4x4 / 4x(8*n) kernels above.
 * NOTE(review): the `if (2 == height)` opening line (original 4056) and
 * some parameter lines are elided from this dump — the surviving branches
 * show the 2 / 4 / multiple-of-8 split. */
4046 static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
4049 int32_t src2_stride,
4052 const int8_t *filter_x,
4053 const int8_t *filter_y,
4057 hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4058 dst, dst_stride, filter_x, filter_y);
4059 } else if (4 == height) {
4060 hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4061 dst, dst_stride, filter_x, filter_y);
4062 } else if (0 == (height % 8)) {
4063 hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
4064 src1_ptr, src2_stride,
4066 filter_x, filter_y, height);
/* 2-D 4-tap HEVC bi-prediction filter, width 6 (MIPS MSA).
 * Computes a full 8-wide horizontal pass per row, then runs the vertical
 * pass separately for the left 4 columns (*_r interleaves, ST_W8 stores)
 * and columns 4..5 (packed *_l halves, ST_H8 stores at dst + 4).
 * The visible code produces exactly 8 output rows (8 rows loaded after a
 * 3-row prologue); no height loop is present in this listing.
 * NOTE(review): non-contiguous original numbering — parameter lines and
 * a few statements are elided from this dump. */
4070 static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
4073 int32_t src2_stride,
4076 const int8_t *filter_x,
4077 const int8_t *filter_y,
4080 uint32_t tpw0, tpw1, tpw2, tpw3;
4082 v16u8 out0, out1, out2;
4083 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4084 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4086 v8i16 filt_h0, filt_h1;
/* 8-wide single-row shuffle mask */
4087 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4089 v8i16 filter_vec, const_vec;
4090 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
4091 v8i16 dsth10, tmp4, tmp5;
4092 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4093 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
4094 v8i16 tmp0, tmp1, tmp2, tmp3;
4095 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4096 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4097 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
4098 v8i16 dst54_l, dst76_l, dst98_l, dst65_l, dst87_l, dst109_l;
4099 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
4100 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4101 v8i16 in4 = { 0 }, in5 = { 0 };
/* one row and one column of context for the two 4-tap passes */
4103 src0_ptr -= (src_stride + 1);
4105 filter_vec = LD_SH(filter_x);
4106 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* widen vertical taps to 32-bit lanes */
4108 filter_vec = LD_SH(filter_y);
4109 UNPCK_R_SB_SH(filter_vec, filter_vec);
4111 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4115 const_vec = __msa_ldi_h(128);
/* prologue: horizontal-filter the first 3 rows */
4118 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4119 src0_ptr += (3 * src_stride);
4120 XORI_B3_128_SB(src0, src1, src2);
4122 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4123 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4124 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4126 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4127 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4128 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4130 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4131 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
/* horizontal-filter the remaining 8 rows */
4133 LD_SB8(src0_ptr, src_stride,
4134 src3, src4, src5, src6, src7, src8, src9, src10);
4135 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
4137 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4138 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4139 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4140 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4142 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4143 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4144 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4145 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4147 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4148 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
4149 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
4150 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
4152 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4153 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4154 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4155 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* build vertical row-pair vectors (right = cols 0..3, left = cols 4..7) */
4157 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4158 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4159 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4160 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4161 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
4162 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
4163 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
4164 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
/* pack the left halves two row-pairs per vector: only cols 4..5 matter */
4165 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
4166 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
4167 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
/* vertical pass: 8 results for cols 0..3, 4 packed results for cols 4..5 */
4169 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4170 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4171 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4172 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4173 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4174 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4175 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4176 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4177 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
4178 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
4179 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
4180 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
/* downshift intermediates and pack to 16-bit */
4181 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
4182 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
4183 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
4184 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
4185 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
4186 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
/* left 4 columns: 8 rows of intermediates, bias, combine, round, clip */
4188 LD2(src1_ptr, src2_stride, tp0, tp1);
4189 INSERT_D2_SH(tp0, tp1, in0);
4190 LD2(src1_ptr + 2 * src2_stride, src2_stride, tp0, tp1);
4191 INSERT_D2_SH(tp0, tp1, in1);
4193 LD2(src1_ptr + 4 * src2_stride, src2_stride, tp0, tp1);
4194 INSERT_D2_SH(tp0, tp1, in2);
4195 LD2(src1_ptr + 6 * src2_stride, src2_stride, tp0, tp1);
4196 INSERT_D2_SH(tp0, tp1, in3);
4198 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4199 in0, in1, in2, in3);
4200 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3, tmp0, tmp1, tmp2,
4202 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4203 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4204 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4205 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
/* columns 4..5: 32-bit word loads at offset +4 grab the two remaining
 * intermediates per row; results stored as 8 half-word pairs at dst + 4 */
4207 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4208 src1_ptr += (4 * src2_stride);
4209 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in4);
4210 LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
4211 INSERT_W4_SH(tpw0, tpw1, tpw2, tpw3, in5);
4212 ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
4213 ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
4214 SRARI_H2_SH(tmp4, tmp5, 7);
4215 CLIP_SH2_0_255(tmp4, tmp5);
4216 out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4217 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
/* 2-D 4-tap HEVC bi-prediction filter, 8x2 block (MIPS MSA).
 * Horizontal 4-tap on 5 input rows (one row of context above and below
 * the 2 outputs), vertical 4-tap on interleaved 16-bit intermediates,
 * then bi-prediction combine with src1_ptr data (saturating add, round
 * by 7, clip to 0..255) and two 8-byte stores.
 * NOTE(review): some lines are elided from this dump (non-contiguous
 * original numbering). */
4220 static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
4223 int32_t src2_stride,
4226 const int8_t *filter_x,
4227 const int8_t *filter_y)
4230 v16i8 src0, src1, src2, src3, src4;
4232 v8i16 filt_h0, filt_h1;
/* 8-wide single-row shuffle mask */
4233 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4235 v8i16 filter_vec, const_vec;
4236 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4237 v8i16 dst0, dst1, dst2, dst3, dst4;
4238 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4239 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4240 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* one row and one column of context for the two 4-tap passes */
4244 src0_ptr -= (src_stride + 1);
4246 filter_vec = LD_SH(filter_x);
4247 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* widen vertical taps to 32-bit lanes */
4249 filter_vec = LD_SH(filter_y);
4250 UNPCK_R_SB_SH(filter_vec, filter_vec);
4252 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4256 const_vec = __msa_ldi_h(128);
4259 LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
/* sign-flip pels for the signed dot-product macros */
4260 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* two rows of 8 intermediates, plus bias */
4262 LD_SH2(src1_ptr, src2_stride, in0, in1);
4263 ADDS_SH2_SH(in0, const_vec, in1, const_vec, in0, in1);
/* horizontal pass on each of the 5 rows */
4265 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4266 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4267 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4268 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4269 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4271 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4272 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4273 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4274 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4275 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
/* interleave adjacent intermediate rows for the vertical pass */
4277 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4278 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4279 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4280 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4281 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4282 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4283 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4284 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* downshift, pack, combine with intermediates, round, clip, store */
4285 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4286 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4287 ADDS_SH2_SH(in0, tmp0, in1, tmp1, tmp0, tmp1);
4288 SRARI_H2_SH(tmp0, tmp1, 7);
4289 CLIP_SH2_0_255(tmp0, tmp1);
4290 out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4291 ST_D2(out, 0, 1, dst, dst_stride);
/* 2-D 4-tap HEVC bi-prediction filter, height 4, width a multiple of 8
 * (MIPS MSA).  Outer loop steps across width8mult 8-column tiles; each
 * tile needs 7 input rows for 4 output rows.  Per tile: horizontal 4-tap
 * via VSHF + HEVC_FILT_4TAP_SH, vertical 4-tap on interleaved 16-bit
 * intermediates, then bi-prediction combine (add src1_ptr data, round
 * by 7, clip 0..255) and 4 eight-byte stores.
 * NOTE(review): non-contiguous original numbering — parameter lines and
 * the per-tile pointer advance at the loop end are elided from this
 * dump. */
4294 static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
4297 int32_t src2_stride,
4300 const int8_t *filter_x,
4301 const int8_t *filter_y,
4306 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
4307 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4308 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
4309 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
4310 v8i16 in0, in1, in2, in3;
4311 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4312 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4313 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
/* one row and one column of context for the two 4-tap passes */
4315 src0_ptr -= (src_stride + 1);
4317 filter_vec = LD_SH(filter_x);
4318 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* widen vertical taps to 32-bit lanes */
4320 filter_vec = LD_SH(filter_y);
4321 UNPCK_R_SB_SH(filter_vec, filter_vec);
4323 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* 8-wide single-row shuffle mask */
4325 mask0 = LD_SB(ff_hevc_mask_arr);
4328 const_vec = __msa_ldi_h(128);
/* one iteration per 8-column tile */
4331 for (cnt = width8mult; cnt--;) {
4332 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
/* sign-flip pels for the signed dot-product macros */
4334 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* 4 rows of 8 intermediates for this tile, plus bias */
4336 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4338 ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4339 const_vec, in0, in1, in2, in3);
/* horizontal pass, prologue rows 0..2 */
4341 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4342 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4343 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4345 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4346 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4347 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4349 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4350 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* horizontal pass, rows 3..6 */
4352 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4353 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4354 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4355 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4357 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4358 dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4359 dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4360 dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* interleave adjacent intermediate rows for the vertical pass */
4362 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4363 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4364 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4365 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
/* vertical pass: 4 output rows, right and left halves */
4367 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4368 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4369 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4370 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4371 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4372 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4373 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4374 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* downshift, pack, combine with intermediates, round, clip, store */
4376 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4377 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4378 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4379 dst3_r, tmp0, tmp1, tmp2, tmp3);
4380 ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4381 tmp0, tmp1, tmp2, tmp3);
4382 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4383 CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4384 PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4385 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
/* HEVC bi-prediction 4-tap (epel) HV interpolation for one 8x6 block (MSA).
 * 8-bit pels from src0_ptr are filtered horizontally (filt0/filt1) and then
 * vertically (filt_h0/filt_h1); the 16-bit bi-pred reference samples from
 * src1_ptr are biased by +128 (const_vec), added to the filtered result,
 * rounded with SRARI >> 7, clipped to 0..255 and stored as six 8-byte rows.
 * NOTE(review): this view of the file is missing some lines of the function
 * (rest of the parameter list, closing brace) — comments only, code untouched. */
4390 static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
4393                                   int32_t src2_stride,
4396                                   const int8_t *filter_x,
4397                                   const int8_t *filter_y)
4399     v16u8 out0, out1, out2;
4400     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4401     v8i16 in0, in1, in2, in3, in4, in5;
4403     v8i16 filt_h0, filt_h1;
4404     v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4406     v8i16 filter_vec, const_vec;
4407     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
4408     v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
4409     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4410     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4411     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4412     v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4413     v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4414     v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4415     v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4416     v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    /* step back one row and one column so the 4-tap window is centred */
4418     src0_ptr -= (src_stride + 1);
    /* broadcast the two horizontal and two vertical 4-tap coefficients */
4420     filter_vec = LD_SH(filter_x);
4421     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4423     filter_vec = LD_SH(filter_y);
4424     UNPCK_R_SB_SH(filter_vec, filter_vec);
4426     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
    /* +128 bias compensates the XORI sign-flip applied to the 8-bit pels */
4430     const_vec = __msa_ldi_h(128);
    /* 6 output rows of a 4-tap vertical filter need 9 input rows */
4433     LD_SB5(src0_ptr, src_stride, src0, src1, src2, src3, src4);
4434     src0_ptr += (5 * src_stride);
4435     LD_SB4(src0_ptr, src_stride, src5, src6, src7, src8);
    /* flip sign bit: treat unsigned pels as signed for dotp-based filters */
4437     XORI_B5_128_SB(src0, src1, src2, src3, src4);
4438     XORI_B4_128_SB(src5, src6, src7, src8);
4440     LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4441     ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3, const_vec,
4442                 in0, in1, in2, in3);
4443     ADDS_SH2_SH(in4, const_vec, in5, const_vec, in4, in5);
    /* gather horizontally adjacent pel pairs for the horizontal filter */
4445     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4446     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4447     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4448     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
4449     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
4450     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
4451     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
4452     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
4453     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
    /* horizontal 4-tap pass: one 16-bit intermediate row per input row */
4455     dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4456     dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4457     dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4458     dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4459     dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
4460     dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
4461     dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
4462     dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
4463     dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
    /* interleave consecutive intermediate rows for the vertical pass */
4465     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4466     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4467     ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4468     ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4469     ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4470     ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4471     ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4472     ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    /* vertical 4-tap pass over the interleaved columns (right/left halves) */
4474     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4475     dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4476     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4477     dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4478     dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4479     dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4480     dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4481     dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4482     dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4483     dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4484     dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4485     dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    /* >> 6 drops the intermediate precision before the bi-pred add */
4487     SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4488     SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4489     SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
4490     PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l, dst3_r,
4491                 tmp0, tmp1, tmp2, tmp3);
4492     PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, tmp4, tmp5);
    /* add bi-pred reference, round (>> 7 with rounding), clip to 8 bits */
4493     ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4494                 tmp0, tmp1, tmp2, tmp3);
4495     ADDS_SH2_SH(in4, tmp4, in5, tmp5, tmp4, tmp5);
4496     SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4497     SRARI_H2_SH(tmp4, tmp5, 7);
4498     CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4499     CLIP_SH2_0_255(tmp4, tmp5);
4500     PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4501     out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
    /* store rows 0..3 then rows 4..5, 8 bytes each */
4502     ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
4503     ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
/* HEVC bi-prediction 4-tap HV interpolation for blocks whose width is a
 * multiple of 8 and height a multiple of 4 (MSA).  Outer loop walks 8-column
 * stripes (width >> 3); inner loop produces 4 output rows per iteration,
 * carrying three rows of horizontally-filtered 16-bit data across iterations
 * (dst10/dst21 interleaves set up in the prologue).  Per 4-row group: add
 * +128-biased src1_ptr samples, SRARI >> 7, clip 0..255, store 8-byte rows.
 * NOTE(review): the stripe-advance code and closing braces after the inner
 * loop are elided in this view — comments only, code untouched. */
4506 static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
4509                                           int32_t src2_stride,
4512                                           const int8_t *filter_x,
4513                                           const int8_t *filter_y,
4517     uint32_t loop_cnt, cnt;
4518     uint8_t *src0_ptr_tmp;
4519     int16_t *src1_ptr_tmp;
4522     v16i8 src0, src1, src2, src3, src4, src5, src6;
4523     v8i16 in0, in1, in2, in3;
4525     v8i16 filt_h0, filt_h1;
4526     v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
4528     v8i16 filter_vec, const_vec;
4529     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4530     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4531     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4532     v8i16 tmp0, tmp1, tmp2, tmp3;
4533     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4534     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4535     v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    /* centre the 4-tap window: one row up, one column left */
4537     src0_ptr -= (src_stride + 1);
4539     filter_vec = LD_SH(filter_x);
4540     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4542     filter_vec = LD_SH(filter_y);
4543     UNPCK_R_SB_SH(filter_vec, filter_vec);
4545     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
    /* +128 bias pairs with the XORI_B sign-flip on the source pels */
4549     const_vec = __msa_ldi_h(128);
    /* one iteration per 8-pixel-wide column stripe */
4552     for (cnt = width >> 3; cnt--;) {
4553         src0_ptr_tmp = src0_ptr;
4555         src1_ptr_tmp = src1_ptr;
        /* prologue: horizontally filter the first 3 rows of the stripe */
4557         LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4558         src0_ptr_tmp += (3 * src_stride);
4559         XORI_B3_128_SB(src0, src1, src2);
4561         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4562         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4563         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4565         dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4566         dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4567         dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4569         ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4570         ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
        /* main loop: 4 new input rows -> 4 output rows per iteration */
4572         for (loop_cnt = height >> 2; loop_cnt--;) {
4573             LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4574             src0_ptr_tmp += (4 * src_stride);
4575             LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4576             src1_ptr_tmp += (4 * src2_stride);
4577             XORI_B4_128_SB(src3, src4, src5, src6);
4579             ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4580                         const_vec, in0, in1, in2, in3);
4582             VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4583             VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4584             VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4585             VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4587             dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4588             dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4589             dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4590             dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4592             ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4593             ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4594             ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4595             ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
            /* vertical 4-tap over interleaved rows, right/left lane halves */
4597             dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4598             dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4599             dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4600             dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4601             dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4602             dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4603             dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4604             dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4606             SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4607             SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4608             PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4609                         dst3_r, tmp0, tmp1, tmp2, tmp3);
            /* bi-pred add, round >> 7, clip, pack to bytes, store 4 rows */
4610             ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4611                         tmp0, tmp1, tmp2, tmp3);
4612             SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4613             CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4614             PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4615             ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4616             dst_tmp += (4 * dst_stride);
/* Width-8 bi-pred HV entry point: dispatch on block height to the
 * specialised kernels — 8x2, 8x4 (multx4 with one stripe), 8x6 — and fall
 * back to the generic multiple-of-4-rows kernel (width = 8) otherwise.
 * NOTE(review): the leading "if (2 == height)" line is elided in this view —
 * comments only, code untouched. */
4631 static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
4634                                  int32_t src2_stride,
4637                                  const int8_t *filter_x,
4638                                  const int8_t *filter_y,
4642         hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4643                               dst, dst_stride, filter_x, filter_y);
4644     } else if (4 == height) {
4645         hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4646                                   dst, dst_stride, filter_x, filter_y, 1);
4647     } else if (6 == height) {
4648         hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4649                               dst, dst_stride, filter_x, filter_y);
4651         hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
4652                                       src1_ptr, src2_stride,
4654                                       filter_x, filter_y, height, 8);
/* Width-12 bi-pred HV interpolation (MSA): processed as an 8-column stripe
 * (first half, mask0/mask1 shuffles, loop_cnt = 4 x 4 rows) followed by a
 * 4-column stripe (second half, mask2/mask3 shuffles packing two rows per
 * vector, loop_cnt = 2 x 8 rows).  Both halves: horizontal then vertical
 * 4-tap filtering, +128-biased bi-pred add, SRARI >> 7, clip to 0..255.
 * NOTE(review): several lines are elided in this view (parts of the
 * parameter list, some declarations such as out0/out1/tp0/tp1, the pointer
 * advance between the two halves, closing braces) — comments only. */
4658 static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
4661                                   int32_t src2_stride,
4664                                   const int8_t *filter_x,
4665                                   const int8_t *filter_y,
4670     uint8_t *src0_ptr_tmp, *dst_tmp;
4671     int16_t *src1_ptr_tmp;
4673     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
4674     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
4675     v16i8 mask0, mask1, mask2, mask3;
4676     v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
4677     v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, const_vec;
4678     v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
4679     v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
4680     v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
4681     v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
4682     v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
4683     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4684     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    /* centre the 4-tap window: one row up, one column left */
4686     src0_ptr -= (src_stride + 1);
4688     filter_vec = LD_SH(filter_x);
4689     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4691     filter_vec = LD_SH(filter_y);
4692     UNPCK_R_SB_SH(filter_vec, filter_vec);
4694     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
4696     mask0 = LD_SB(ff_hevc_mask_arr);
    /* +128 bias pairs with the XORI_B sign-flip on the source pels */
4699     const_vec = __msa_ldi_h(128);
    /* ---- first half: left 8 columns, 4 rows per loop iteration ---- */
4702     src0_ptr_tmp = src0_ptr;
4704     src1_ptr_tmp = src1_ptr;
4706     LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4707     src0_ptr_tmp += (3 * src_stride);
4709     XORI_B3_128_SB(src0, src1, src2);
4711     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4712     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4713     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4715     dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4716     dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4717     dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4719     ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
4720     ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
    /* fixed 4 iterations x 4 rows = 16 output rows for the 8-wide half */
4722     for (loop_cnt = 4; loop_cnt--;) {
4723         LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4724         src0_ptr_tmp += (4 * src_stride);
4725         XORI_B4_128_SB(src3, src4, src5, src6);
4727         LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4728         src1_ptr_tmp += (4 * src2_stride);
4729         ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4730                     const_vec, in0, in1, in2, in3);
4732         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4733         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
4734         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
4735         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
4737         dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4738         dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4739         dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4740         dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4742         ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
4743         ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
4744         ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
4745         ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
4747         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4748         dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4749         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4750         dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4751         dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4752         dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4753         dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4754         dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4756         SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4757         SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4758         PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4759                     dst3_r, tmp0, tmp1, tmp2, tmp3);
4760         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4761                     tmp0, tmp1, tmp2, tmp3);
4762         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4763         CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4764         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4765         ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
4766         dst_tmp += (4 * dst_stride);
    /* ---- second half: right 4 columns, two rows packed per vector ---- */
4779     mask2 = LD_SB(ff_hevc_mask_arr + 16);
4782     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4783     src0_ptr += (3 * src_stride);
4784     XORI_B3_128_SB(src0, src1, src2);
4785     VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4786     VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4788     dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4789     dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4791     ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
4792     dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
    /* 2 iterations x 8 rows = 16 output rows for the 4-wide half */
4794     for (loop_cnt = 2; loop_cnt--;) {
4795         LD_SB8(src0_ptr, src_stride,
4796                src3, src4, src5, src6, src7, src8, src9, src10);
4797         src0_ptr += (8 * src_stride);
4798         XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* each shuffle merges two rows (e.g. src3+src7) into one vector */
4799         VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4800         VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4801         VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4802         VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4804         dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
4805         dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
4806         dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
4807         dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
4809         dst32_r = __msa_ilvr_h(dst73, dst22);
4810         ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4811         ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4812         ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4813         dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4814         dst76_r = __msa_ilvr_h(dst22, dst106);
        /* gather 8 rows of 4 bi-pred samples each, two rows per vector */
4816         LD2(src1_ptr, src2_stride, tp0, tp1);
4817         src1_ptr += 2 * src2_stride;
4818         INSERT_D2_SH(tp0, tp1, in0);
4819         LD2(src1_ptr, src2_stride, tp0, tp1);
4820         src1_ptr += 2 * src2_stride;
4821         INSERT_D2_SH(tp0, tp1, in1);
4823         LD2(src1_ptr, src2_stride, tp0, tp1);
4824         src1_ptr += 2 * src2_stride;
4825         INSERT_D2_SH(tp0, tp1, in2);
4826         LD2(src1_ptr, src2_stride, tp0, tp1);
4827         src1_ptr += 2 * src2_stride;
4828         INSERT_D2_SH(tp0, tp1, in3);
4830         ADDS_SH4_SH(in0, const_vec, in1, const_vec, in2, const_vec, in3,
4831                     const_vec, in0, in1, in2, in3);
4833         dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4834         dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4835         dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4836         dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4837         dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4838         dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4839         dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4840         dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4842         SRA_4V(dst0, dst1, dst2, dst3, 6);
4843         SRA_4V(dst4, dst5, dst6, dst7, 6);
4844         PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
4845                     tmp0, tmp1, tmp2, tmp3);
4846         ADDS_SH4_SH(in0, tmp0, in1, tmp1, in2, tmp2, in3, tmp3,
4847                     tmp0, tmp1, tmp2, tmp3);
4848         SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
4849         CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
4850         PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
4851         ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
4852         dst += (8 * dst_stride);
        /* carry last filtered row into the next iteration's window */
4856         dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Width-16 bi-pred HV entry point: small heights use the 8multx4 kernel with
 * two 8-column stripes, otherwise the generic multiple-of-4-rows kernel with
 * width = 16.  NOTE(review): the height condition line is elided in this
 * view (presumably "if (4 == height)" — confirm against full source). */
4860 static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
4863                                   int32_t src2_stride,
4866                                   const int8_t *filter_x,
4867                                   const int8_t *filter_y,
4871         hevc_hv_bi_4t_8multx4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4872                                   dst, dst_stride, filter_x, filter_y, 2);
4874         hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr,
4875                                       src2_stride, dst, dst_stride, filter_x,
4876                                       filter_y, height, 16);
/* Width-24 bi-pred HV entry point: thin wrapper over the generic
 * 8-column-multiple kernel.  NOTE(review): the final width argument is on an
 * elided line (presumably 24 — confirm against full source). */
4880 static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
4883                                   int32_t src2_stride,
4886                                   const int8_t *filter_x,
4887                                   const int8_t *filter_y,
4890     hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4891                                   dst, dst_stride, filter_x, filter_y,
/* Width-32 bi-pred HV entry point: thin wrapper over the generic
 * 8-column-multiple kernel.  NOTE(review): the final width argument is on an
 * elided line (presumably 32 — confirm against full source). */
4895 static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
4898                                   int32_t src2_stride,
4901                                   const int8_t *filter_x,
4902                                   const int8_t *filter_y,
4905     hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4906                                   dst, dst_stride, filter_x, filter_y,
/* Generates the public ff_hevc_put_hevc_bi_pel_pixels<W>_8_msa() entry
 * points: plain bi-prediction pixel copy/average delegating to
 * hevc_bi_copy_<W>w_msa with the fixed MAX_PB_SIZE stride for the 16-bit
 * source.  NOTE(review): part of the macro body is elided in this view;
 * code untouched. */
4910 #define BI_MC_COPY(WIDTH)                                                 \
4911 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
4912                                                    ptrdiff_t dst_stride,  \
4914                                                    ptrdiff_t src_stride,  \
4915                                                    int16_t *src_16bit,    \
4921     hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
4922                                 dst, dst_stride, height);                 \
/* Generates the public uni-directional (h or v) bi-prediction entry points
 * ff_hevc_put_hevc_bi_<PEL>_<DIR><W>_8_msa().  FILT_DIR selects mx or my to
 * index the filter table (index - 1, since fraction 0 means no filtering),
 * then dispatches to the static hevc_<DIR1>_bi_<TAP>t_<W>w_msa kernel.
 * NOTE(review): part of the macro body is elided in this view. */
4937 #define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                       \
4938 void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,       \
4939                                                       ptrdiff_t           \
4941                                                       ptrdiff_t           \
4942                                                       int16_t *src_16bit, \
4948     const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
4950     hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,  \
4951                                              MAX_PB_SIZE, dst, dst_stride, \
4955 BI_MC(qpel, h, 4, 8, hz, mx);
4956 BI_MC(qpel, h, 8, 8, hz, mx);
4957 BI_MC(qpel, h, 12, 8, hz, mx);
4958 BI_MC(qpel, h, 16, 8, hz, mx);
4959 BI_MC(qpel, h, 24, 8, hz, mx);
4960 BI_MC(qpel, h, 32, 8, hz, mx);
4961 BI_MC(qpel, h, 48, 8, hz, mx);
4962 BI_MC(qpel, h, 64, 8, hz, mx);
/* 8-tap qpel, vertical */
4964 BI_MC(qpel, v, 4, 8, vt, my);
4965 BI_MC(qpel, v, 8, 8, vt, my);
4966 BI_MC(qpel, v, 12, 8, vt, my);
4967 BI_MC(qpel, v, 16, 8, vt, my);
4968 BI_MC(qpel, v, 24, 8, vt, my);
4969 BI_MC(qpel, v, 32, 8, vt, my);
4970 BI_MC(qpel, v, 48, 8, vt, my);
4971 BI_MC(qpel, v, 64, 8, vt, my);
/* 4-tap epel, horizontal */
4973 BI_MC(epel, h, 4, 4, hz, mx);
4974 BI_MC(epel, h, 8, 4, hz, mx);
4975 BI_MC(epel, h, 6, 4, hz, mx);
4976 BI_MC(epel, h, 12, 4, hz, mx);
4977 BI_MC(epel, h, 16, 4, hz, mx);
4978 BI_MC(epel, h, 24, 4, hz, mx);
4979 BI_MC(epel, h, 32, 4, hz, mx);
/* 4-tap epel, vertical */
4981 BI_MC(epel, v, 4, 4, vt, my);
4982 BI_MC(epel, v, 8, 4, vt, my);
4983 BI_MC(epel, v, 6, 4, vt, my);
4984 BI_MC(epel, v, 12, 4, vt, my);
4985 BI_MC(epel, v, 16, 4, vt, my);
4986 BI_MC(epel, v, 24, 4, vt, my);
4987 BI_MC(epel, v, 32, 4, vt, my);
/* Generates the public 2-D (hv) bi-prediction entry points
 * ff_hevc_put_hevc_bi_<PEL>_hv<W>_8_msa(): looks up the horizontal (mx) and
 * vertical (my) filters (index - 1) and dispatches to the static
 * hevc_hv_bi_<TAP>t_<W>w_msa kernel with the fixed MAX_PB_SIZE 16-bit
 * source stride.  NOTE(review): part of the macro body is elided here. */
4991 #define BI_MC_HV(PEL, WIDTH, TAP)                                         \
4992 void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
4993                                                    ptrdiff_t dst_stride,  \
4995                                                    ptrdiff_t src_stride,  \
4996                                                    int16_t *src_16bit,    \
5002     const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
5003     const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
5005     hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
5006                                        MAX_PB_SIZE, dst, dst_stride,      \
5007                                        filter_x, filter_y, height);       \
5010 BI_MC_HV(qpel, 4, 8);
5011 BI_MC_HV(qpel, 8, 8);
5012 BI_MC_HV(qpel, 12, 8);
5013 BI_MC_HV(qpel, 16, 8);
5014 BI_MC_HV(qpel, 24, 8);
5015 BI_MC_HV(qpel, 32, 8);
5016 BI_MC_HV(qpel, 48, 8);
5017 BI_MC_HV(qpel, 64, 8);
/* 4-tap epel, 2-D */
5019 BI_MC_HV(epel, 4, 4);
5020 BI_MC_HV(epel, 8, 4);
5021 BI_MC_HV(epel, 6, 4);
5022 BI_MC_HV(epel, 12, 4);
5023 BI_MC_HV(epel, 16, 4);
5024 BI_MC_HV(epel, 24, 4);
5025 BI_MC_HV(epel, 32, 4);