2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Byte-shuffle masks for the MSA VSHF instructions used below.
 * Row 0: overlapping byte pairs from one source vector (8/16-wide paths).
 * Row 1: indices >= 16 select bytes from the second shuffle operand, so
 * two 4-wide rows can be filtered from a pair of vectors (4-wide paths). */
static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
/* Bi-pred combine for two v8i16 lanes: saturating add of the 16-bit
 * second-source samples (in*) to the filtered values (vec*), round-shift
 * by rnd_val, then clip to the 8-bit pixel range [0, 255]. */
#define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1)  \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255(out0, out1);                                       \
/* Four-lane variant: applies HEVC_BI_RND_CLIP2 to two independent pairs. */
#define HEVC_BI_RND_CLIP4(in0, in1, in2, in3,                          \
                          vec0, vec1, vec2, vec3, rnd_val,             \
                          out0, out1, out2, out3)                      \
    HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1);      \
    HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3);      \
/* Same add/round as HEVC_BI_RND_CLIP2 but using the MAX/min-based
 * saturating clip helper (CLIP_SH2_0_255_MAX_SATU) instead. */
#define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val,     \
    ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1);                    \
    SRARI_H2_SH(out0, out1, rnd_val);                                 \
    CLIP_SH2_0_255_MAX_SATU(out0, out1);                              \
/* Four-lane wrapper over the MAX_SATU two-lane combine. */
#define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2,   \
                                   vec3, rnd_val, out0, out1, out2, out3)  \
    HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
    HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/* Bi-pred "copy" for 4-pixel-wide blocks: zero-extend the 8-bit pels from
 * src0_ptr to 16 bits, scale by 64 (<< 6, the HEVC 14-bit intermediate
 * range), saturating-add the 16-bit samples from src1_ptr, round by 7 and
 * clip to [0, 255] before packing bytes out to dst.
 * Specialized paths for height 2, height 4 and multiples of 8. */
static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
    uint32_t loop_cnt, tp0, tp1, tp2, tp3;
    uint64_t tpd0, tpd1, tpd2, tpd3;
    v16i8 src0 = { 0 }, src1 = { 0 };
    v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
    v8i16 dst0, dst1, dst2, dst3;

        /* height == 2: two 4-byte rows packed into one vector */
        LW2(src0_ptr, src_stride, tp0, tp1);
        INSERT_W2_SB(tp0, tp1, src0);
        LD2(src1_ptr, src2_stride, tpd0, tpd1);
        INSERT_D2_SH(tpd0, tpd1, in0);

        dst0 = (v8i16) __msa_ilvr_b(zero, src0);   /* zero-extend u8 -> s16 */
        dst0 = __msa_srari_h(dst0, 7);             /* round-shift by 7 */
        dst0 = CLIP_SH_0_255_MAX_SATU(dst0);

        dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
        ST4x2_UB(dst0, dst, dst_stride);
    } else if (4 == height) {
        LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
        LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
        INSERT_D2_SH(tpd0, tpd1, in0);
        INSERT_D2_SH(tpd2, tpd3, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
    } else if (0 == height % 8) {
        /* 8 rows per iteration: 4 rows per source vector */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
            LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in0);
            INSERT_D2_SH(tpd2, tpd3, in1);
            LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
            src1_ptr += (4 * src2_stride);
            INSERT_D2_SH(tpd0, tpd1, in2);
            INSERT_D2_SH(tpd2, tpd3, in3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
            ST4x8_UB(dst0, dst1, dst, dst_stride);
            dst += (8 * dst_stride);
/* Bi-pred copy for 6-pixel-wide blocks, 8 rows per loop iteration:
 * widen src0 pels to s16, << 6, saturating-add src1 samples, round by 7,
 * clip to [0,255], pack and store 6 bytes per row via ST6x4_UB. */
static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        /* gather 8 rows of 8-bit pels (two rows per 64-bit lane) */
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += (4 * src_stride);
        INSERT_D2_SB(tp0, tp1, src2);
        INSERT_D2_SB(tp2, tp3, src3);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0, dst1);   /* zero-extend u8 -> s16 */
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);    /* to 14-bit intermediate */
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST6x4_UB(out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
/* Bi-pred copy for 8-pixel-wide blocks: widen, << 6, saturating-add the
 * second 16-bit source, round by 7, clip, pack.  Dedicated paths for
 * height 2, 4, 6 and any multiple of 8. */
static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
    uint64_t tp0, tp1, tp2, tp3;
    v16u8 out0, out1, out2, out3;
    v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

        /* height == 2 */
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src0);
        LD_SH2(src1_ptr, src2_stride, in0, in1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        SLLI_2V(dst0, dst1, 6);
        HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
        out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
        ST8x2_UB(out0, dst, dst_stride);
    } else if (4 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        ST8x4_UB(out0, out1, dst, dst_stride);
    } else if (6 == height) {
        LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
        src0_ptr += 4 * src_stride;
        INSERT_D2_SB(tp0, tp1, src0);
        INSERT_D2_SB(tp2, tp3, src1);
        LD2(src0_ptr, src_stride, tp0, tp1);
        INSERT_D2_SB(tp0, tp1, src2);
        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST8x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
        ST8x2_UB(out2, dst, dst_stride);
    } else if (0 == height % 8) {
        /* generic path: 8 rows per iteration */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src0);
            INSERT_D2_SB(tp2, tp3, src1);
            LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
            src0_ptr += 4 * src_stride;
            INSERT_D2_SB(tp0, tp1, src2);
            INSERT_D2_SB(tp2, tp3, src3);
            ILVRL_B2_SH(zero, src0, dst0, dst1);
            ILVRL_B2_SH(zero, src1, dst2, dst3);
            ILVRL_B2_SH(zero, src2, dst4, dst5);
            ILVRL_B2_SH(zero, src3, dst6, dst7);
            LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
            src1_ptr += (8 * src2_stride);
            SLLI_4V(dst0, dst1, dst2, dst3, 6);
            SLLI_4V(dst4, dst5, dst6, dst7, 6);
            HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
                                       dst3, 7, dst0, dst1, dst2, dst3);
            HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
                                       dst7, 7, dst4, dst5, dst6, dst7);
            PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
            PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
            ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
            dst += (8 * dst_stride);
/* Bi-pred copy for 12-pixel-wide blocks; fixed 16 rows (4 iterations x 4
 * rows).  The left 8 columns use full vectors; the right 4 columns of two
 * rows are merged (ILVL_W2 / ILVR_D2) into shared vectors. */
static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);

        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        /* fold the four 4-sample right-hand tails into two vectors */
        ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
        ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
        ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
        SLLI_2V(dst4, dst5, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        ST12x4_UB(out0, out1, out2, dst, dst_stride);
        dst += (4 * dst_stride);
/* Bi-pred copy for 16-pixel-wide blocks, 4 rows per iteration.  Each row
 * splits into right (_r) and left (_l) 8-sample halves via ILVRL_B2. */
static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        src1_ptr += (4 * src2_stride);
        ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
        ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
        ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
        ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
        SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
        /* in0..in3 pair with the right halves, in4..in7 with the left */
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
                                   dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
        HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
                                   dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
        PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
        PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
        ST_UB4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
/* Bi-pred copy for 24-pixel-wide blocks; fixed 32 rows (8 iterations x 4
 * rows).  First 16 columns stored as full vectors, last 8 via ST8x4. */
static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
        LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
        LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
        src1_ptr += (4 * src2_stride);

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);  /* 8-wide tails */
        ILVRL_B2_SH(zero, src4, dst6, dst7);
        ILVRL_B2_SH(zero, src5, dst8, dst9);
        ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);
        /* note the in*/dst* pairing order follows the interleaved layout */
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB4(out0, out1, out3, out4, dst, dst_stride);
        ST8x4_UB(out2, out5, dst + 16, dst_stride);
        dst += (4 * dst_stride);
/* Bi-pred copy for 32-pixel-wide blocks, 2 rows per iteration
 * (two 16-byte vectors per row). */
static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 16, src2, src3);
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);   /* zero-extend u8 -> s16 */
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
        ST_UB2(out0, out1, dst, 16);
        ST_UB2(out2, out3, dst, 16);
/* Bi-pred copy for 48-pixel-wide blocks, 2 rows per iteration
 * (three 16-byte vectors per row). */
static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
    v16u8 out0, out1, out2, out3, out4, out5;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src0_ptr += src_stride;
        LD_SB3(src0_ptr, 16, src3, src4, src5);
        src0_ptr += src_stride;

        LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
        src1_ptr += src2_stride;
        LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        ILVRL_B2_SH(zero, src4, dst8, dst9);
        ILVRL_B2_SH(zero, src5, dst10, dst11);

        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        SLLI_4V(dst8, dst9, dst10, dst11, 6);

        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
                                   dst11, 7, dst8, dst9, dst10, dst11);
        PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
        PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
        ST_UB2(out0, out1, dst, 16);
        ST_UB(out2, dst + 32);
        ST_UB2(out3, out4, dst, 16);
        ST_UB(out5, dst + 32);
/* Bi-pred copy for 64-pixel-wide blocks, one row per iteration
 * (four 16-byte vectors per row). */
static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
    v16u8 out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
        src0_ptr += src_stride;
        LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += src2_stride;

        ILVRL_B2_SH(zero, src0, dst0, dst1);
        ILVRL_B2_SH(zero, src1, dst2, dst3);
        ILVRL_B2_SH(zero, src2, dst4, dst5);
        ILVRL_B2_SH(zero, src3, dst6, dst7);
        SLLI_4V(dst0, dst1, dst2, dst3, 6);
        SLLI_4V(dst4, dst5, dst6, dst7, 6);
        HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
                                   7, dst0, dst1, dst2, dst3);
        HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
                                   7, dst4, dst5, dst6, dst7);
        PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
        PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);

        ST_UB4(out0, out1, out2, out3, dst, 16);
/* Horizontal 8-tap bi-pred filter, 4-pixel-wide blocks, 8 rows per loop.
 * Uses the two-source shuffle masks (ff_hevc_mask_arr[16]) so two rows are
 * filtered per vector.  XORI by 128 biases unsigned pels to signed for the
 * signed byte dot-product (DPADD_SB). */
static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
                                 const int8_t *filter,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[16]);

    /* rearranging filter */
    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
               src4, src5, src6, src7);
        src0_ptr += (8 * src_stride);
        LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
        src1_ptr += (8 * src2_stride);

        /* merge two 4-wide second-source rows per vector */
        ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
        ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        /* 8-tap FIR: one dot-product accumulation per tap pair */
        VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
        VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src5, src6, src7, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST4x8_UB(dst0, dst1, dst, dst_stride);
        dst += (8 * dst_stride);
/* Horizontal 8-tap bi-pred filter, 8-pixel-wide blocks, 4 rows per loop.
 * Single-source shuffle masks (ff_hevc_mask_arr[0]); one vector per row. */
static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
                                 const int8_t *filter,
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
        src0_ptr += (4 * src_stride);
        LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
        src1_ptr += (4 * src2_stride);
        XORI_B4_128_SB(src0, src1, src2, src3);  /* u8 -> biased s8 */

        /* 8-tap FIR: accumulate one tap pair per DPADD pass */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST8x4_UB(dst0, dst1, dst, dst_stride);
        dst += (4 * dst_stride);
/* Horizontal 8-tap bi-pred filter, 12-pixel-wide blocks; fixed 16 rows
 * (8 iterations x 2 rows).  The left 8 columns of each row use the
 * single-source masks; the right 4 columns of the two rows share one
 * vector via the two-source masks (mask4..mask7).
 * NOTE(review): mask1..mask7 derivations and tmp* declarations are not
 * visible in this view. */
static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
                                  const int8_t *filter,
    v16i8 src0, src1, src2, src3;
    v16i8 vec0, vec1, vec2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v8i16 dst0, dst1, dst2;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask0 = LD_SB(ff_hevc_mask_arr);        /* single-source masks */
    mask4 = LD_SB(&ff_hevc_mask_arr[16]);   /* two-source masks    */

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* three dot-product streams: row0 left, both right tails, row1 left */
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask0, mask4, mask0,
        DPADD_SB2_SH(vec0, vec1, filt0, filt0, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt0);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask1, mask5, mask1,
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt1);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask2, mask6, mask2,
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt2);
        VSHF_B3_SB(src0, src0, src1, src3, src2, src2, mask3, mask7, mask3,
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, dst0, dst1);
        dst2 = __msa_dpadd_s_h(dst2, vec2, (v16i8) filt3);

        /* fold the two right-hand 4-sample tails into one in1 vector */
        in1 = (v8i16) __msa_pckev_d((v2i64) in3, (v2i64) in1);
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(in2, dst2);
        dst2 = __msa_srari_h(dst2, 7);
        dst2 = CLIP_SH_0_255(dst2);
        PCKEV_B2_SH(dst1, dst0, dst2, dst2, dst0, dst1);

        tmp2 = __msa_copy_s_d((v2i64) dst0, 0);
        tmp0 = __msa_copy_s_w((v4i32) dst0, 2);
        tmp3 = __msa_copy_s_d((v2i64) dst1, 0);
        tmp1 = __msa_copy_s_w((v4i32) dst0, 3);
/* Horizontal 8-tap bi-pred filter, 16-pixel-wide blocks, 2 rows per loop.
 * Each row is loaded as two overlapping 16-byte vectors (offset 8). */
static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
                                  const int8_t *filter,
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src0_ptr, 8, src0, src1);
        src0_ptr += src_stride;
        LD_SB2(src0_ptr, 8, src2, src3);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        src1_ptr += src2_stride;
        LD_SH2(src1_ptr, 8, in2, in3);
        src1_ptr += src2_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* 8-tap FIR across the four 8-sample streams */
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
        ST_SH2(dst0, dst1, dst, dst_stride);
        dst += (2 * dst_stride);
/* Horizontal 8-tap bi-pred filter, 24-pixel-wide blocks, one row per loop.
 * The middle 8 samples straddle src0/src1, handled via the two-source
 * masks (mask4..mask7).  First 16 result bytes stored as a vector, the
 * last 8 via a 64-bit store.
 * NOTE(review): mask derivations, in0..in2 declarations and the first
 * 16-byte store are not visible in this view. */
static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
                                  const int8_t *filter,
    v16i8 src0, src1, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    src0_ptr = src0_ptr - 3;       /* back up to the first filter tap */
    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src0_ptr += src_stride;
        LD_SH2(src1_ptr, 8, in0, in1);
        in2 = LD_SH(src1_ptr + 16);
        src1_ptr += src2_stride;
        XORI_B2_128_SB(src0, src1);

        /* interleaved dot-product schedule over three 8-sample streams */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src0, src0, mask0, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt1, dst0,
        VSHF_B2_SB(src0, src1, src1, src1, mask5, mask1, vec0, vec1);
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt2, filt2, dst1,
        VSHF_B2_SB(src1, src1, src0, src0, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src0, src1, src1, src1, mask7, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt3, filt3, filt3, dst2,

        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        dst2 = __msa_adds_s_h(dst2, in2);
        dst2 = __msa_srari_h(dst2, 7);
        dst2 = CLIP_SH_0_255(dst2);

        PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
        dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
        SD(dst_val0, dst + 16);   /* trailing 8 pixels of the row */
/* Horizontal 8-tap bi-pred filter, 32-pixel-wide blocks, one row per loop.
 * src2 (loaded at +24) provides the extra taps for the last 8 outputs. */
static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
                                  const int8_t *filter,
    v16i8 src0, src1, src2, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);   /* covers the right-edge taps */
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        src1_ptr += src2_stride;
        XORI_B3_128_SB(src0, src1, src2);

        /* 8-tap FIR across four 8-sample output streams */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);
/* Horizontal 8-tap bi-pred filter, 48-pixel-wide blocks; fixed 64 rows,
 * one row per iteration.  Computed in two stages: first 32 outputs from
 * src0/src1/src2, then the last 16 from src2/src3 (loaded at +40). */
static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
                                  const int8_t *filter,
    v16i8 src0, src1, src2, src3;
    v16i8 tmp0, tmp1, tmp2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 in0, in1, in2, in3, in4, in5;
    v8i16 filter_vec, const_vec;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);

    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = 64; loop_cnt--;) {
        LD_SB3(src0_ptr, 16, src0, src1, src2);
        src3 = LD_SB(src0_ptr + 40);   /* right-edge taps for last block */
        src0_ptr += src_stride;
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B4_128_SB(src0, src1, src2, src3);

        /* stage 1: outputs 0..31 */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask0, mask4, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask1, mask5, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask2, mask6, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src2, mask3, mask7, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
        HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
        HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB(tmp1, dst + 16);

        LD_SH2(src1_ptr + 32, 8, in4, in5);
        src1_ptr += src2_stride;

        /* stage 2: outputs 32..47 from src2/src3 */
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt1, filt1, dst4,
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt3, filt3, dst4,

        HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);

        tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
        ST_SB(tmp2, dst + 32);
/* Horizontal 8-tap bi-pred filter, 64-pixel-wide blocks, one row per loop.
 * The row is processed as two 32-wide halves, each following the same
 * src0/src1/src2 scheme as the 32-wide function; dst0..dst3 and in0..in3
 * are reused for the second half.
 * NOTE(review): the re-assignment of src0..src2 from src3..src5 for the
 * second half is not visible in this view. */
static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
                                  int32_t src2_stride,
                                  const int8_t *filter,
    v16i8 src0, src1, src2, src3, src4, src5, tmp0, tmp1;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 in0, in1, in2, in3;
    v8i16 filter_vec, const_vec;

    const_vec = __msa_ldi_h(128);  /* compensates the XORI 128 bias */

    filter_vec = LD_SH(filter);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB2(src0_ptr, 16, src0, src1);
        src2 = LD_SB(src0_ptr + 24);
        LD_SB2(src0_ptr + 32, 16, src3, src4);   /* second 32-wide half */
        src5 = LD_SB(src0_ptr + 56);
        LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        /* first half: outputs 0..31 */
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,

        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);

        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst, 16);

        /* second half: outputs 32..63 */
        LD_SH4(src1_ptr + 32, 8, in0, in1, in2, in3);
        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask4, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask0, mask0, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask5, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask1, mask1, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt1, filt1, filt1, filt1, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask2, mask6, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, dst0,
        VSHF_B2_SB(src0, src0, src0, src1, mask3, mask7, vec0, vec1);
        VSHF_B2_SB(src1, src1, src2, src2, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt3, filt3, filt3, filt3, dst0,
        HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
                          dst0, dst1, dst2, dst3, 7,
                          dst0, dst1, dst2, dst3);
        PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
        ST_SB2(tmp0, tmp1, dst + 32, 16);
        src1_ptr += src2_stride;
        src0_ptr += src_stride;
/* hevc_vt_bi_8t_4w_msa: HEVC bi-prediction, vertical 8-tap luma filter,
 * 4-pixel-wide blocks, processing 8 output rows per loop iteration.
 * Two 4-wide rows are packed into one vector (the src2110/src4332/... names
 * denote interleaved row pairs) so each DPADD_SB4_SH filters two rows at
 * once. Results are summed with the 16-bit coefficients from src1_ptr,
 * rounded/clipped by HEVC_BI_RND_CLIP4 and stored 4x8 to dst.
 * NOTE(review): the listing is non-contiguous; the loop's history-rotation
 * appears incomplete here (only src4332/src6554 reassignments are visible;
 * presumably src2110 and src6 are also carried over) -- confirm against the
 * full upstream file.
 */
1169 static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
1172 int32_t src2_stride,
1175 const int8_t *filter,
1179 v16i8 src0, src1, src2, src3, src4, src5;
1180 v16i8 src6, src7, src8, src9, src10;
1181 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1182 v16i8 src11, src12, src13, src14;
1183 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1184 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1185 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1186 v16i8 src2110, src4332, src6554, src8776, src10998;
1187 v16i8 src12111110, src14131312;
1188 v8i16 dst10, dst32, dst54, dst76;
1189 v8i16 filt0, filt1, filt2, filt3;
1190 v8i16 filter_vec, const_vec;
/* back up 3 rows: the 8-tap filter needs 3 rows above the current one */
1192 src0_ptr -= (3 * src_stride);
1194 const_vec = __msa_ldi_h(128);
1197 filter_vec = LD_SH(filter);
1198 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: load the 7 history rows and interleave them pairwise */
1200 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1201 src0_ptr += (7 * src_stride);
1202 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1203 src10_r, src32_r, src54_r, src21_r);
1204 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1205 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1206 src2110, src4332, src6554);
1207 XORI_B3_128_SB(src2110, src4332, src6554);
1209 for (loop_cnt = (height >> 3); loop_cnt--;) {
1210 LD_SB8(src0_ptr, src_stride,
1211 src7, src8, src9, src10, src11, src12, src13, src14);
1212 src0_ptr += (8 * src_stride);
1213 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1214 src1_ptr += (8 * src2_stride);
/* pack pairs of 4-wide coefficient rows into full 8-lane vectors */
1216 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1217 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1218 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1219 src76_r, src87_r, src98_r, src109_r);
1220 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1221 src1110_r, src1211_r, src1312_r, src1413_r);
1222 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1223 src1413_r, src1312_r,
1224 src8776, src10998, src12111110, src14131312);
1225 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
/* 8-tap vertical MAC, two output rows per accumulator */
1228 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1229 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1231 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1232 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1234 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1235 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1237 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1238 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1240 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1241 dst10, dst32, dst54, dst76, 7,
1242 dst10, dst32, dst54, dst76);
1244 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
1245 ST4x8_UB(dst10, dst54, dst, dst_stride);
1246 dst += (8 * dst_stride);
/* rotate filter history for the next 8 rows */
1249 src4332 = src12111110;
1250 src6554 = src14131312;
/* hevc_vt_bi_8t_8w_msa: HEVC bi-prediction, vertical 8-tap luma filter,
 * 8-pixel-wide blocks, 4 output rows per loop iteration.
 * Interleaves consecutive source rows (right halves, *_r), runs the 8-tap
 * vertical MAC via DPADD_SB4_SH, merges with the 16-bit coefficients from
 * src1_ptr through HEVC_BI_RND_CLIP4 (round >>7, clip 0..255), then packs
 * and stores 8x4 bytes.
 * NOTE(review): excerpt is non-contiguous; the end-of-loop history rotation
 * (srcNN_r reassignments, src6 carry) is not visible here -- verify against
 * the full upstream file.
 */
1255 static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
1258 int32_t src2_stride,
1261 const int8_t *filter,
1265 v16i8 src0, src1, src2, src3, src4, src5;
1266 v16i8 src6, src7, src8, src9, src10;
1267 v8i16 in0, in1, in2, in3;
1268 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1269 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1270 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1271 v8i16 filt0, filt1, filt2, filt3;
1272 v8i16 filter_vec, const_vec;
/* 3 rows of top context for the 8-tap support */
1274 src0_ptr -= (3 * src_stride);
1275 const_vec = __msa_ldi_h(128);
1278 filter_vec = LD_SH(filter);
1279 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 history rows, sign-flipped and interleaved pairwise */
1281 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1282 src0_ptr += (7 * src_stride);
1283 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1284 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1285 src10_r, src32_r, src54_r, src21_r);
1286 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1288 for (loop_cnt = (height >> 2); loop_cnt--;) {
1289 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1290 src0_ptr += (4 * src_stride);
1291 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1292 src1_ptr += (4 * src2_stride);
1293 XORI_B4_128_SB(src7, src8, src9, src10);
1294 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1295 src76_r, src87_r, src98_r, src109_r);
/* 8-tap vertical MAC for each of the 4 output rows */
1298 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1299 filt0, filt1, filt2, filt3,
1300 dst0_r, dst0_r, dst0_r, dst0_r);
1302 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1303 filt0, filt1, filt2, filt3,
1304 dst1_r, dst1_r, dst1_r, dst1_r);
1306 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1307 filt0, filt1, filt2, filt3,
1308 dst2_r, dst2_r, dst2_r, dst2_r);
1310 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1311 filt0, filt1, filt2, filt3,
1312 dst3_r, dst3_r, dst3_r, dst3_r);
1314 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1315 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1316 dst0_r, dst1_r, dst2_r, dst3_r);
1318 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1319 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
1320 dst += (4 * dst_stride);
/* hevc_vt_bi_8t_12w_msa: HEVC bi-prediction, vertical 8-tap luma filter,
 * 12-pixel-wide blocks, 4 output rows per loop iteration.
 * The left 8 columns use the *_r (right-interleaved) path; the extra 4
 * columns use the *_l path with two 4-wide rows packed per vector
 * (src2110/src4332/...). Coefficients for columns 8..11 are read from
 * src1_ptr + 8. Output is stored with ST12x4_UB.
 * NOTE(review): excerpt is non-contiguous; the end-of-loop history rotation
 * is not visible here -- verify against the full upstream file.
 */
1333 static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
1336 int32_t src2_stride,
1339 const int8_t *filter,
1343 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1344 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1345 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1346 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1347 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1348 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1349 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1350 v16i8 src2110, src4332, src6554, src8776, src10998;
1351 v8i16 dst0_l, dst1_l;
1352 v8i16 filt0, filt1, filt2, filt3;
1353 v8i16 filter_vec, const_vec;
1355 src0_ptr -= (3 * src_stride);
1356 const_vec = __msa_ldi_h(128);
1359 filter_vec = LD_SH(filter);
1360 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 history rows, interleaved both right (cols 0..7) and left */
1362 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1363 src0_ptr += (7 * src_stride);
1364 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1366 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1367 src10_r, src32_r, src54_r, src21_r);
1368 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1369 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1370 src10_l, src32_l, src54_l, src21_l);
1371 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1372 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1373 src2110, src4332, src6554);
1375 for (loop_cnt = (height >> 2); loop_cnt--;) {
1376 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1377 src0_ptr += (4 * src_stride);
/* in0..in3: coeffs for cols 0..7, in4..in7: coeffs for cols 8..11 */
1378 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1379 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
1380 src1_ptr += (4 * src2_stride);
1382 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
1383 XORI_B4_128_SB(src7, src8, src9, src10);
1384 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1385 src76_r, src87_r, src98_r, src109_r);
1386 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1387 src76_l, src87_l, src98_l, src109_l);
1388 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* left 8 columns: one accumulator per output row */
1391 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1392 filt0, filt1, filt2, filt3,
1393 dst0_r, dst0_r, dst0_r, dst0_r);
1395 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1396 filt0, filt1, filt2, filt3,
1397 dst1_r, dst1_r, dst1_r, dst1_r);
1399 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1400 filt0, filt1, filt2, filt3,
1401 dst2_r, dst2_r, dst2_r, dst2_r);
1403 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1404 filt0, filt1, filt2, filt3,
1405 dst3_r, dst3_r, dst3_r, dst3_r);
/* remaining 4 columns: two rows per accumulator */
1407 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1408 filt0, filt1, filt2, filt3,
1409 dst0_l, dst0_l, dst0_l, dst0_l);
1411 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1412 filt0, filt1, filt2, filt3,
1413 dst1_l, dst1_l, dst1_l, dst1_l);
1415 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1416 dst0_r, dst1_r, dst2_r, dst3_r, 7,
1417 dst0_r, dst1_r, dst2_r, dst3_r);
1418 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
1421 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1422 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
1423 ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
1424 dst += (4 * dst_stride);
/* hevc_vt_bi_8t_16multx2mult_msa: HEVC bi-prediction, vertical 8-tap luma
 * filter for widths that are multiples of 16 (width passed explicitly;
 * used by the 16/24/32/48/64-wide wrappers below).
 * Outer loop walks 16-column strips; inner loop produces 2 output rows per
 * iteration, filtering the low (ILVR) and high (ILVL) byte halves
 * separately, then combining with the 16-bit coefficients via
 * HEVC_BI_RND_CLIP4 and storing 16 packed bytes per row.
 * NOTE(review): excerpt is non-contiguous; the inner-loop history rotation
 * and the per-strip pointer advance are not visible here -- verify against
 * the full upstream file.
 */
1439 static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
1442 int32_t src2_stride,
1445 const int8_t *filter,
1446 int32_t height, int32_t width)
1448 uint8_t *src0_ptr_tmp;
1449 int16_t *src1_ptr_tmp;
1453 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1454 v8i16 in0, in1, in2, in3;
1455 v16i8 src10_r, src32_r, src54_r, src76_r;
1456 v16i8 src21_r, src43_r, src65_r, src87_r;
1457 v8i16 dst0_r, dst1_r;
1458 v16i8 src10_l, src32_l, src54_l, src76_l;
1459 v16i8 src21_l, src43_l, src65_l, src87_l;
1460 v8i16 dst0_l, dst1_l;
1461 v8i16 filt0, filt1, filt2, filt3;
1462 v8i16 filter_vec, const_vec;
1464 src0_ptr -= (3 * src_stride);
1465 const_vec = __msa_ldi_h(128);
1468 filter_vec = LD_SH(filter);
1469 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* one pass per 16-column strip */
1471 for (cnt = (width >> 4); cnt--;) {
1472 src0_ptr_tmp = src0_ptr;
1473 src1_ptr_tmp = src1_ptr;
/* prologue per strip: 7 history rows */
1476 LD_SB7(src0_ptr_tmp, src_stride,
1477 src0, src1, src2, src3, src4, src5, src6);
1478 src0_ptr_tmp += (7 * src_stride);
1479 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1481 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1482 src10_r, src32_r, src54_r, src21_r);
1483 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1484 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1485 src10_l, src32_l, src54_l, src21_l);
1486 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1488 for (loop_cnt = (height >> 1); loop_cnt--;) {
1489 LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1490 src0_ptr_tmp += (2 * src_stride);
/* coeffs: low 8 columns at +0, high 8 columns at +8 */
1491 LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1492 LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1493 src1_ptr_tmp += (2 * src2_stride);
1494 XORI_B2_128_SB(src7, src8);
1496 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1497 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
1500 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1501 filt0, filt1, filt2, filt3,
1502 dst0_r, dst0_r, dst0_r, dst0_r);
1504 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1505 filt0, filt1, filt2, filt3,
1506 dst1_r, dst1_r, dst1_r, dst1_r);
1508 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1509 filt0, filt1, filt2, filt3,
1510 dst0_l, dst0_l, dst0_l, dst0_l);
1512 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1513 filt0, filt1, filt2, filt3,
1514 dst1_l, dst1_l, dst1_l, dst1_l);
1516 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1517 dst0_r, dst1_r, dst0_l, dst1_l, 7,
1518 dst0_r, dst1_r, dst0_l, dst1_l);
1520 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
1521 ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
1522 dst_tmp += (2 * dst_stride);
/* 16-wide vertical bi-pred: single 16-column strip of the generic helper. */
1545 static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
1548 int32_t src2_stride,
1551 const int8_t *filter,
1554 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1555 dst, dst_stride, filter, height, 16);
/* 24-wide vertical bi-pred: one 16-column strip via the generic helper,
 * then the remaining 8 columns via the dedicated 8-wide kernel (note the
 * +16 offsets into both the source and destination). */
1558 static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
1561 int32_t src2_stride,
1564 const int8_t *filter,
1567 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1568 dst, dst_stride, filter, height, 16);
1569 hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
1570 dst + 16, dst_stride, filter, height);
/* 32-wide vertical bi-pred: two 16-column strips of the generic helper. */
1573 static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
1576 int32_t src2_stride,
1579 const int8_t *filter,
1582 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1583 dst, dst_stride, filter, height, 32);
/* 48-wide vertical bi-pred: three 16-column strips of the generic helper. */
1586 static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
1589 int32_t src2_stride,
1592 const int8_t *filter,
1595 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1596 dst, dst_stride, filter, height, 48);
/* 64-wide vertical bi-pred: four 16-column strips of the generic helper. */
1599 static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
1602 int32_t src2_stride,
1605 const int8_t *filter,
1608 hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1609 dst, dst_stride, filter, height, 64);
/* hevc_hv_bi_8t_4w_msa: HEVC bi-prediction, 2-D (horizontal then vertical)
 * 8-tap luma filter, 4-pixel-wide blocks, 4 output rows per iteration.
 * Horizontal stage: VSHF_B4_SB + HEVC_FILT_8TAP_SH produce 16-bit
 * intermediates for pairs of rows (dst30 = rows 3&0, dst41 = rows 4&1, ...).
 * Vertical stage: HEVC_FILT_8TAP over the interleaved intermediates, then
 * >>6, add bi-pred coefficients and the -128 bias (const_vec), round >>7
 * and clip to 0..255 before the 4x4 store.
 * NOTE(review): excerpt is non-contiguous (e.g. mask1..mask3 setup and some
 * trailing filter arguments are not visible) -- verify against the full
 * upstream file.
 */
1612 static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
1615 int32_t src2_stride,
1618 const int8_t *filter_x,
1619 const int8_t *filter_y,
1625 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1626 v8i16 in0 = { 0 }, in1 = { 0 };
1627 v8i16 filt0, filt1, filt2, filt3;
1628 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1629 v16i8 mask1, mask2, mask3;
1630 v8i16 filter_vec, const_vec;
1631 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1632 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1634 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1635 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1636 v4i32 dst0, dst1, dst2, dst3;
/* mask set at +16 handles the 4-wide two-rows-per-vector shuffle */
1637 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
/* back up 3 rows and 3 columns for the 2-D 8-tap support */
1639 src0_ptr -= ((3 * src_stride) + 3);
1640 filter_vec = LD_SH(filter_x);
1641 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1643 filter_vec = LD_SH(filter_y);
1644 UNPCK_R_SB_SH(filter_vec, filter_vec);
1646 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1652 const_vec = __msa_ldi_h(128);
/* prologue: horizontal-filter the 7 history rows */
1655 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1656 src0_ptr += (7 * src_stride);
1657 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1659 /* row 0 row 1 row 2 row 3 */
1660 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1661 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1662 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1663 vec8, vec9, vec10, vec11);
1664 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1665 vec12, vec13, vec14, vec15);
1667 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1669 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1671 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1673 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
/* build the row-pair columns for the vertical stage */
1676 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
1677 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
1678 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
1680 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1682 for (loop_cnt = height >> 2; loop_cnt--;) {
1683 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1684 src0_ptr += (4 * src_stride);
1685 XORI_B4_128_SB(src7, src8, src9, src10);
/* 4 rows of bi-pred coefficients, two rows packed per vector */
1687 LD2(src1_ptr, src2_stride, tp0, tp1);
1688 INSERT_D2_SH(tp0, tp1, in0);
1689 src1_ptr += (2 * src2_stride);
1690 LD2(src1_ptr, src2_stride, tp0, tp1);
1691 INSERT_D2_SH(tp0, tp1, in1);
1692 src1_ptr += (2 * src2_stride);
1694 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1695 vec0, vec1, vec2, vec3);
1696 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1697 vec4, vec5, vec6, vec7);
1698 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1700 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1703 dst76 = __msa_ilvr_h(dst97, dst66);
1704 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
1705 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1706 dst98 = __msa_ilvr_h(dst66, dst108);
/* vertical 8-tap over the horizontal intermediates */
1708 dst0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
1710 dst1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
1712 dst2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
1714 dst3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
1717 SRA_4V(dst0, dst1, dst2, dst3, 6);
1718 PCKEV_H2_SH(dst1, dst0, dst3, dst2, out0, out1);
1719 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
1720 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
1721 SRARI_H2_SH(out0, out1, 7);
1722 CLIP_SH2_0_255_MAX_SATU(out0, out1);
1723 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
1724 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
1725 dst += (4 * dst_stride);
/* rotate vertical history for the next 4 rows */
1733 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* hevc_hv_bi_8t_8multx1mult_msa: HEVC bi-prediction, 2-D 8-tap luma filter
 * for widths that are multiples of 8 (width passed explicitly; used by the
 * 8/16/24/32/48/64-wide wrappers below). One output row per inner-loop
 * iteration: horizontal 8-tap per incoming row (VSHF_B4_SB +
 * HEVC_FILT_8TAP_SH), vertical 8-tap over the last 8 intermediates split
 * into low/high halves, then >>6, add src1_ptr coefficient row and -128
 * bias, round >>7, clip 0..255, pack and store 8 bytes.
 * NOTE(review): excerpt is non-contiguous; the inner-loop rotation of
 * dst0..dst6 and the per-strip pointer advance are not visible here --
 * verify against the full upstream file.
 */
1737 static void hevc_hv_bi_8t_8multx1mult_msa(uint8_t *src0_ptr,
1740 int32_t src2_stride,
1743 const int8_t *filter_x,
1744 const int8_t *filter_y,
1745 int32_t height, int32_t width)
1749 uint8_t *src0_ptr_tmp;
1750 int16_t *src1_ptr_tmp;
1753 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1755 v8i16 filt0, filt1, filt2, filt3;
1756 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1757 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1758 v16i8 mask1, mask2, mask3;
1759 v8i16 filter_vec, const_vec;
1760 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1761 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1762 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1763 v4i32 dst0_r, dst0_l;
1764 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1765 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
/* back up 3 rows and 3 columns for the 2-D 8-tap support */
1767 src0_ptr -= ((3 * src_stride) + 3);
1768 const_vec = __msa_ldi_h(128);
1771 filter_vec = LD_SH(filter_x);
1772 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1774 filter_vec = LD_SH(filter_y);
1775 UNPCK_R_SB_SH(filter_vec, filter_vec);
1777 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* one pass per 8-column strip */
1783 for (cnt = width >> 3; cnt--;) {
1784 src0_ptr_tmp = src0_ptr;
1786 src1_ptr_tmp = src1_ptr;
/* prologue per strip: horizontal-filter the 7 history rows */
1788 LD_SB7(src0_ptr_tmp, src_stride,
1789 src0, src1, src2, src3, src4, src5, src6);
1790 src0_ptr_tmp += (7 * src_stride);
1791 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1793 /* row 0 row 1 row 2 row 3 */
1794 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1795 vec0, vec1, vec2, vec3);
1796 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1797 vec4, vec5, vec6, vec7);
1798 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1799 vec8, vec9, vec10, vec11);
1800 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1801 vec12, vec13, vec14, vec15);
1802 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1804 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1806 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1808 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1811 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1812 vec0, vec1, vec2, vec3);
1813 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1814 vec4, vec5, vec6, vec7);
1815 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1816 vec8, vec9, vec10, vec11);
1817 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1819 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1821 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
/* steady state: one new row in, one output row out */
1824 for (loop_cnt = height; loop_cnt--;) {
1825 src7 = LD_SB(src0_ptr_tmp);
1826 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1827 src0_ptr_tmp += src_stride;
1829 in0 = LD_SH(src1_ptr_tmp);
1830 src1_ptr_tmp += src2_stride;
1832 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1833 vec0, vec1, vec2, vec3);
1834 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1836 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1837 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1838 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1839 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1840 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1841 filt_h0, filt_h1, filt_h2, filt_h3);
1842 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1843 filt_h0, filt_h1, filt_h2, filt_h3);
/* merge with bi-pred coeffs and bias, round >>7, clip, store 8 bytes */
1847 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1848 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
1849 tmp = __msa_srari_h(tmp, 7);
1850 tmp = CLIP_SH_0_255_MAX_SATU(tmp);
1851 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
1852 ST8x1_UB(out, dst_tmp);
1853 dst_tmp += dst_stride;
/* 8-wide 2-D bi-pred: single 8-column strip of the generic helper
 * (the trailing width argument is outside this excerpt). */
1870 static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
1873 int32_t src2_stride,
1876 const int8_t *filter_x,
1877 const int8_t *filter_y,
1880 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1881 dst, dst_stride, filter_x, filter_y,
/* hevc_hv_bi_8t_12w_msa: HEVC bi-prediction, 2-D 8-tap luma filter,
 * 12-pixel-wide blocks. Two passes:
 *   1) columns 0..7 -- same one-row-at-a-time scheme as the 8-wide helper
 *      (fixed 16 iterations, so this path assumes height == 16);
 *   2) columns 8..11 -- the 4-wide two-rows-per-vector scheme using the
 *      mask set at ff_hevc_mask_arr + 16 (fixed 4 iterations of 4 rows).
 * Both passes add the src1_ptr coefficients and the -128 bias, round >>7
 * and clip to 0..255 before storing.
 * NOTE(review): excerpt is non-contiguous (mask1..mask3 / mask5..mask7
 * setup and the +8 pointer offsets for the second pass are not visible) --
 * verify against the full upstream file.
 */
1885 static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
1888 int32_t src2_stride,
1891 const int8_t *filter_x,
1892 const int8_t *filter_y,
1896 uint8_t *src0_ptr_tmp, *dst_tmp;
1897 int16_t *src1_ptr_tmp;
1900 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1901 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1902 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1903 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1904 v8i16 in0, in1 = { 0 }, out0, out1, tmp, filter_vec, const_vec;
1905 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1906 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1907 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1908 v8i16 dst10, dst32, dst54, dst76, dst98, dst21, dst43, dst65, dst87, dst109;
1909 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1910 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1911 v4i32 dst0_r, dst0_l, tmp0, tmp1, tmp2, tmp3;
1913 src0_ptr -= ((3 * src_stride) + 3);
1915 const_vec = __msa_ldi_h(128);
1918 filter_vec = LD_SH(filter_x);
1919 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1921 filter_vec = LD_SH(filter_y);
1922 UNPCK_R_SB_SH(filter_vec, filter_vec);
1924 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1926 mask0 = LD_SB(ff_hevc_mask_arr);
/* ---- pass 1: columns 0..7, one output row per iteration ---- */
1931 src0_ptr_tmp = src0_ptr;
1933 src1_ptr_tmp = src1_ptr;
1935 LD_SB7(src0_ptr_tmp, src_stride, src0, src1, src2, src3, src4, src5,
1937 src0_ptr_tmp += (7 * src_stride);
1938 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1940 /* row 0 row 1 row 2 row 3 */
1941 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1943 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1945 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1947 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1949 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1951 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1953 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1955 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1957 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1959 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6,
1961 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1963 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1965 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1967 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
/* fixed 16 rows for this path */
1970 for (loop_cnt = 16; loop_cnt--;) {
1971 src7 = LD_SB(src0_ptr_tmp);
1972 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1973 src0_ptr_tmp += src_stride;
1975 in0 = LD_SH(src1_ptr_tmp);
1976 src1_ptr_tmp += src2_stride;
1978 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1980 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1982 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1983 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1984 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1985 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1986 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1987 filt_h1, filt_h2, filt_h3);
1988 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1989 filt_h1, filt_h2, filt_h3);
1993 tmp = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1994 ADDS_SH2_SH(tmp, in0, tmp, const_vec, tmp, tmp);
1995 tmp = __msa_srari_h(tmp, 7);
1996 tmp = CLIP_SH_0_255_MAX_SATU(tmp);
1997 out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
1998 ST8x1_UB(out, dst_tmp);
1999 dst_tmp += dst_stride;
/* ---- pass 2: columns 8..11, 4-wide two-rows-per-vector scheme ---- */
2014 mask4 = LD_SB(ff_hevc_mask_arr + 16);
2019 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
2020 src0_ptr += (7 * src_stride);
2021 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
2023 /* row 0 row 1 row 2 row 3 */
2024 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
2025 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
2026 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7,
2027 vec8, vec9, vec10, vec11);
2028 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7,
2029 vec12, vec13, vec14, vec15);
2030 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2032 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2034 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
2036 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
2039 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
2040 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
2041 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
2043 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
/* 4 iterations x 4 rows = 16 rows */
2045 for (loop_cnt = 4; loop_cnt--;) {
2046 LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
2047 src0_ptr += (4 * src_stride);
2048 XORI_B4_128_SB(src7, src8, src9, src10);
2050 LD2(src1_ptr, src2_stride, tp0, tp1);
2051 INSERT_D2_SH(tp0, tp1, in0);
2052 src1_ptr += (2 * src2_stride);
2053 LD2(src1_ptr, src2_stride, tp0, tp1);
2054 INSERT_D2_SH(tp0, tp1, in1);
2055 src1_ptr += (2 * src2_stride);
2057 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
2059 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
2061 dst97 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
2063 dst108 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
2066 dst76 = __msa_ilvr_h(dst97, dst66);
2067 ILVRL_H2_SH(dst108, dst97, dst87, dst109);
2068 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
2069 dst98 = __msa_ilvr_h(dst66, dst108);
2071 tmp0 = HEVC_FILT_8TAP(dst10, dst32, dst54, dst76, filt_h0, filt_h1,
2073 tmp1 = HEVC_FILT_8TAP(dst21, dst43, dst65, dst87, filt_h0, filt_h1,
2075 tmp2 = HEVC_FILT_8TAP(dst32, dst54, dst76, dst98, filt_h0, filt_h1,
2077 tmp3 = HEVC_FILT_8TAP(dst43, dst65, dst87, dst109, filt_h0, filt_h1,
2079 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
2080 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, out0, out1);
2081 ADDS_SH2_SH(out0, in0, out1, in1, out0, out1);
2082 ADDS_SH2_SH(out0, const_vec, out1, const_vec, out0, out1);
2083 SRARI_H2_SH(out0, out1, 7);
2084 CLIP_SH2_0_255_MAX_SATU(out0, out1);
2085 out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
2086 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2087 dst += (4 * dst_stride);
/* rotate vertical history for the next 4 rows */
2095 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* 16-wide 2-D bi-pred: two 8-column strips of the generic helper
 * (the trailing width argument is outside this excerpt). */
2099 static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
2102 int32_t src2_stride,
2105 const int8_t *filter_x,
2106 const int8_t *filter_y,
2109 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2110 dst, dst_stride, filter_x, filter_y,
/* 24-wide 2-D bi-pred: three 8-column strips of the generic helper
 * (the trailing width argument is outside this excerpt). */
2114 static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
2117 int32_t src2_stride,
2120 const int8_t *filter_x,
2121 const int8_t *filter_y,
2124 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2125 dst, dst_stride, filter_x, filter_y,
/* 32-wide 2-D bi-pred: four 8-column strips of the generic helper
 * (the trailing width argument is outside this excerpt). */
2129 static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
2132 int32_t src2_stride,
2135 const int8_t *filter_x,
2136 const int8_t *filter_y,
2139 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2140 dst, dst_stride, filter_x, filter_y,
/* 48-wide 2-D bi-pred: six 8-column strips of the generic helper
 * (the trailing width argument is outside this excerpt). */
2144 static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
2147 int32_t src2_stride,
2150 const int8_t *filter_x,
2151 const int8_t *filter_y,
2154 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2155 dst, dst_stride, filter_x, filter_y,
/* 64-wide 2-D bi-pred: eight 8-column strips of the generic helper
 * (the trailing width argument is outside this excerpt). */
2159 static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
2162 int32_t src2_stride,
2165 const int8_t *filter_x,
2166 const int8_t *filter_y,
2169 hevc_hv_bi_8t_8multx1mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2170 dst, dst_stride, filter_x, filter_y,
/* hevc_hz_bi_4t_4x2_msa: HEVC bi-prediction, horizontal 4-tap (chroma)
 * filter, 4x2 block. Two 4-wide rows are shuffled into one vector (mask0
 * indexes bytes 16+ of the second source register), filtered with two
 * DPADD taps, summed with the packed src1_ptr coefficient rows, rounded
 * (>>7) and clipped to 0..255, then stored as two 4-byte rows.
 * NOTE(review): excerpt is non-contiguous (mask1 setup is not visible) --
 * verify against the full upstream file.
 */
2174 static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
2177 int32_t src2_stride,
2180 const int8_t *filter,
2184 v16i8 src0, src1, dst0, vec0, vec1;
2186 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2189 v8i16 filter_vec, const_vec;
2193 const_vec = __msa_ldi_h(128);
2196 filter_vec = LD_SH(filter);
2197 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2201 LD_SB2(src0_ptr, src_stride, src0, src1);
2202 LD_SH2(src1_ptr, src2_stride, in0, in1);
/* pack both 4-wide coefficient rows into one vector */
2203 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2204 XORI_B2_128_SB(src0, src1);
2205 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2207 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2209 tmp0 = __msa_adds_s_h(tmp0, in0);
2210 tmp0 = __msa_srari_h(tmp0, 7);
2211 tmp0 = CLIP_SH_0_255(tmp0);
2212 dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
2214 ST4x2_UB(dst0, dst, dst_stride);
/* hevc_hz_bi_4t_4x4_msa: HEVC bi-prediction, horizontal 4-tap (chroma)
 * filter, 4x4 block. Same two-rows-per-vector scheme as the 4x2 variant,
 * applied to two row pairs, combined via HEVC_BI_RND_CLIP2 and stored as
 * four 4-byte rows.
 * NOTE(review): excerpt is non-contiguous (mask1 setup is not visible) --
 * verify against the full upstream file.
 */
2217 static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
2220 int32_t src2_stride,
2223 const int8_t *filter,
2227 v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
2228 v8i16 in0, in1, in2, in3;
2229 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2232 v8i16 filter_vec, const_vec;
2236 const_vec = __msa_ldi_h(128);
2239 filter_vec = LD_SH(filter);
2240 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2244 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2245 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* pack pairs of 4-wide coefficient rows */
2247 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2248 XORI_B4_128_SB(src0, src1, src2, src3);
2250 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2252 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2253 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2255 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
2256 HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
2257 dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2259 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* hevc_hz_bi_4t_4x8multiple_msa: HEVC bi-prediction, horizontal 4-tap
 * (chroma) filter, 4-wide blocks whose height is a multiple of 8.
 * Processes 8 rows per iteration: four row pairs are shuffled into
 * vectors, filtered (two taps each), combined with the packed src1_ptr
 * coefficients via HEVC_BI_RND_CLIP4 and stored 4x8.
 * NOTE(review): excerpt is non-contiguous (mask1 setup is not visible) --
 * verify against the full upstream file.
 */
2262 static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
2265 int32_t src2_stride,
2268 const int8_t *filter,
2273 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2275 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2276 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2277 v16i8 mask1, vec0, vec1;
2278 v8i16 tmp0, tmp1, tmp2, tmp3;
2279 v8i16 filter_vec, const_vec;
2283 const_vec = __msa_ldi_h(128);
2286 filter_vec = LD_SH(filter);
2287 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2291 for (loop_cnt = (height >> 3); loop_cnt--;) {
2292 LD_SB8(src0_ptr, src_stride,
2293 src0, src1, src2, src3, src4, src5, src6, src7);
2294 src0_ptr += (8 * src_stride);
2295 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2296 src1_ptr += (4 * src2_stride);
2297 LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2298 src1_ptr += (4 * src2_stride);
/* pack pairs of 4-wide coefficient rows into full vectors */
2299 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2300 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2301 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2303 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2305 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2306 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2308 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
2309 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2311 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2);
2312 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2314 DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3);
2316 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2317 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
2319 PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2320 ST4x8_UB(dst0, dst1, dst, dst_stride);
2321 dst += (8 * dst_stride);
/* Width-4 horizontal bi-pred dispatcher: selects the specialised 4x2, 4x4
 * or 4x8-multiple kernel by height.
 * NOTE(review): listing elides some original lines (numbering jumps),
 * including the height test for the first branch — confirm in full file. */
2325 static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
2328                                  int32_t src2_stride,
2331                                  const int8_t *filter,
2335         hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2336                               dst, dst_stride, filter, height);
2337     } else if (4 == height) {
2338         hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2339                               dst, dst_stride, filter, height);
2340     } else if (8 == height || 16 == height) {
2341         hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2342                                       src1_ptr, src2_stride,
2343                                       dst, dst_stride, filter, height);
/* Horizontal 4-tap bi-pred filter, width 6: four rows per loop iteration;
 * filter u8 rows, add s16 second source, round >> 7, clip to [0,255],
 * store 6 bytes per row via ST6x4_UB.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2347 static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
2350                                  int32_t src2_stride,
2353                                  const int8_t *filter,
2358     v16i8 src0, src1, src2, src3;
2359     v8i16 in0, in1, in2, in3;
     /* within-row byte-pair shuffle mask (one source row per vector) */
2360     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2363     v8i16 dst0, dst1, dst2, dst3;
2364     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2368     const_vec = __msa_ldi_h(128);
2371     filter_vec = LD_SH(filter);
2372     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2376     for (loop_cnt = (height >> 2); loop_cnt--;) {
2377         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2378         src0_ptr += (4 * src_stride);
2379         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2380         src1_ptr += (4 * src2_stride);
         /* u8 -> s8 range for signed dot products */
2381         XORI_B4_128_SB(src0, src1, src2, src3);
2383         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
         /* 4-tap dot product accumulate into 16-bit lanes */
2385         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2386         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2388         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2389         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2391         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2392         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2394         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
         /* add second source, round >> 7, clip to [0,255] */
2396         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2397                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2399         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2400         ST6x4_UB(dst0, dst1, dst, dst_stride);
2401         dst += (4 * dst_stride);
/* Horizontal 4-tap bi-pred filter, 8x2 block: filter two u8 rows, add the
 * s16 second source, round >> 7, clip to [0,255], store 8x2 u8 pixels.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2405 static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
2408                                   int32_t src2_stride,
2411                                   const int8_t *filter,
2417     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2418     v16i8 mask1, vec0, vec1;
2420     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2424     const_vec = __msa_ldi_h(128);
2427     filter_vec = LD_SH(filter);
2428     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2432     LD_SB2(src0_ptr, src_stride, src0, src1);
2433     LD_SH2(src1_ptr, src2_stride, in0, in1);
     /* u8 -> s8 range for signed dot products */
2434     XORI_B2_128_SB(src0, src1);
2436     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
     /* 4-tap dot product accumulate into 16-bit lanes */
2438     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2439     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2441     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
     /* add second source, round >> 7, clip to [0,255] */
2442     HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
2444     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2445     ST8x2_UB(dst0, dst, dst_stride);
/* Horizontal 4-tap bi-pred filter, 8x6 block: six u8 rows filtered, s16
 * second source added, round >> 7, clip to [0,255]; stores 8x4 then 8x2.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2448 static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
2451                                   int32_t src2_stride,
2454                                   const int8_t *filter,
2458     v16i8 src0, src1, src2, src3, src4, src5;
2459     v8i16 in0, in1, in2, in3, in4, in5;
2460     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2463     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2464     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2468     const_vec = __msa_ldi_h(128);
2471     filter_vec = LD_SH(filter);
2472     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2476     LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2477     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2478     src1_ptr += (4 * src2_stride);
2479     LD_SH2(src1_ptr, src2_stride, in4, in5);
     /* u8 -> s8 range for signed dot products */
2480     XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
2482     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
     /* 4-tap dot product accumulate into 16-bit lanes, one row each */
2484     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2485     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2487     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2488     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2490     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2491     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2493     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2494     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2496     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2497     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2499     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
     /* add second source, round >> 7, clip to [0,255] */
2501     HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2502                       dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2503     HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2505     PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2506     dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2507     ST8x4_UB(dst0, dst1, dst, dst_stride);
2508     dst += (4 * dst_stride);
2509     ST8x2_UB(dst2, dst, dst_stride);
/* Horizontal 4-tap bi-pred filter, width 8, height a multiple of 4:
 * four rows per iteration; filter, add s16 second source, round >> 7,
 * clip to [0,255], store 8x4 u8 pixels.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2512 static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
2515                                           int32_t src2_stride,
2518                                           const int8_t *filter,
2523     v16i8 src0, src1, src2, src3;
2524     v8i16 in0, in1, in2, in3;
2525     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2528     v8i16 dst0, dst1, dst2, dst3;
2529     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2533     const_vec = __msa_ldi_h(128);
2536     filter_vec = LD_SH(filter);
2537     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2541     for (loop_cnt = (height >> 2); loop_cnt--;) {
2542         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2543         src0_ptr += (4 * src_stride);
2544         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2545         src1_ptr += (4 * src2_stride);
         /* u8 -> s8 range for signed dot products */
2546         XORI_B4_128_SB(src0, src1, src2, src3);
2548         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
         /* 4-tap dot product accumulate into 16-bit lanes */
2550         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2551         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2553         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2554         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2556         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2557         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2559         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
         /* add second source, round >> 7, clip to [0,255] */
2561         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2562                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2564         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2565         ST8x4_UB(dst0, dst1, dst, dst_stride);
2566         dst += (4 * dst_stride);
/* Width-8 horizontal bi-pred dispatcher: 8x2, 8x6 or the generic
 * height%4==0 kernel, chosen by height.
 * NOTE(review): listing elides some original lines (numbering jumps),
 * including the height test for the first branch — confirm in full file. */
2570 static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
2573                                  int32_t src2_stride,
2576                                  const int8_t *filter,
2580         hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2581                               dst, dst_stride, filter, height);
2582     } else if (6 == height) {
2583         hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2584                               dst, dst_stride, filter, height);
2585     } else if (0 == (height % 4)) {
2586         hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2587                                       src1_ptr, src2_stride,
2588                                       dst, dst_stride, filter, height);
/* Horizontal 4-tap bi-pred filter, width 12: four rows per iteration;
 * the left 8 columns use mask0/mask1 per row, the right 4 columns of two
 * rows are handled together via mask2/mask3 across row pairs; stores via
 * ST12x4_UB.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2592 static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
2595                                   int32_t src2_stride,
2598                                   const int8_t *filter,
2603     v16i8 src0, src1, src2, src3;
2604     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2605     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
     /* upper-half shuffle spanning two source vectors (indices >= 16) */
2607         8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2611     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2612     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2616     const_vec = __msa_ldi_h(128);
2619     filter_vec = LD_SH(filter);
2620     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2625     for (loop_cnt = (height >> 2); loop_cnt--;) {
2626         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2627         src0_ptr += (4 * src_stride);
2628         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2629         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2630         src1_ptr += (4 * src2_stride);
         /* pack the 4-wide right-column s16 rows pairwise */
2632         ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
         /* u8 -> s8 range for signed dot products */
2633         XORI_B4_128_SB(src0, src1, src2, src3);
2635         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
         /* 4-tap dot product accumulate into 16-bit lanes */
2637         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2638         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2640         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2641         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2643         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2644         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2646         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
         /* right 4 columns: two rows combined per vector */
2647         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2649         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2650         VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2652         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
         /* add second source, round >> 7, clip to [0,255] */
2654         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2655                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2656         HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2658         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2659         dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2660         ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
2661         dst += (4 * dst_stride);
/* Horizontal 4-tap bi-pred filter, width 16: four rows per iteration, each
 * row split into two 8-column halves (even src vectors = left half, odd =
 * right half loaded at +8); filter, add s16 second source, round >> 7,
 * clip to [0,255], store four 16-byte rows.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2665 static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
2668                                   int32_t src2_stride,
2671                                   const int8_t *filter,
2675     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2676     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2678     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2680     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2682     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2686     const_vec = __msa_ldi_h(128);
2689     filter_vec = LD_SH(filter);
2690     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2694     for (loop_cnt = (height >> 2); loop_cnt--;) {
         /* even vectors: columns 0..15, odd vectors: columns 8..23 */
2695         LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2696         LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
2697         src0_ptr += (4 * src_stride);
2698         LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2699         LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2700         src1_ptr += (4 * src2_stride);
         /* u8 -> s8 range for signed dot products */
2701         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2703         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
         /* 4-tap dot product accumulate into 16-bit lanes */
2705         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2706         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2708         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2709         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2711         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2712         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2714         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2715         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2717         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2718         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2720         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2721         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2723         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2724         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2726         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
         /* add second source, round >> 7, clip to [0,255] */
2728         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2729                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2730         HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2731                           dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2733         PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2734                     dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2735         ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2736         dst += (4 * dst_stride);
/* Horizontal 4-tap bi-pred filter, width 24: per iteration handles four
 * rows; the left 16 columns are processed first (mask0/1 within a vector,
 * mask2/3 across the vector boundary) and stored, then the rightmost 8
 * columns (src1/3/5/7 loaded at +16) via a second pass through dst_tmp.
 * NOTE(review): listing elides some original lines (numbering jumps),
 * e.g. dst_tmp setup and mask1..mask3 derivation. */
2740 static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
2743                                   int32_t src2_stride,
2746                                   const int8_t *filter,
2749     int16_t *src1_ptr_tmp;
2752     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2753     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2755     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2756     v16i8 mask1, mask2, mask3;
2758     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2759     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2763     const_vec = __msa_ldi_h(128);
2766     filter_vec = LD_SH(filter);
2767     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
     /* second-source pointer for the right 8 columns */
2774     src1_ptr_tmp = src1_ptr + 16;
2776     for (loop_cnt = (height >> 2); loop_cnt--;) {
         /* even vectors: columns 0..15, odd vectors: columns 16..31 */
2777         LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2778         LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2779         src0_ptr += (4 * src_stride);
2780         LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2781         LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2782         src1_ptr += (4 * src2_stride);
         /* u8 -> s8 range for signed dot products */
2783         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2785         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
         /* 4-tap dot product accumulate into 16-bit lanes */
2787         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
         /* columns 8..15 straddle the src0/src1 vector boundary */
2788         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2790         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2791         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2793         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2794         VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2796         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2797         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2799         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2800         VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1);
2802         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2803         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2805         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2806         VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1);
2808         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
         /* add second source, round >> 7, clip to [0,255] */
2810         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2811                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2812         HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2813                           dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2815         PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2816                     dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2817         ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2818         dst += (4 * dst_stride);
         /* second pass: rightmost 8 columns of the same four rows */
2820         LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2821         src1_ptr_tmp += (4 * src2_stride);
2823         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2825         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2826         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2828         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2829         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2831         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2832         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2834         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2836         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2837                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2839         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2840         ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
2841         dst_tmp += (4 * dst_stride);
/* Horizontal 4-tap bi-pred filter, width 32: two rows per loop iteration,
 * each row handled identically in an unrolled pair. A row needs 35 input
 * bytes, covered by src0 (0..15), src1 (16..31) and src2 (24..39); the
 * four 8-wide output slices come from mask0/1 within a vector and mask2/3
 * across the src0/src1 boundary.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2845 static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
2848                                   int32_t src2_stride,
2851                                   const int8_t *filter,
2855     v16i8 src0, src1, src2;
2856     v8i16 in0, in1, in2, in3;
2858     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2859     v16i8 mask1, mask2, mask3;
2860     v8i16 dst0, dst1, dst2, dst3;
2862     v8i16 filter_vec, const_vec;
     /* bias compensating the XORI-by-128 of the source bytes */
2866     const_vec = __msa_ldi_h(128);
2869     filter_vec = LD_SH(filter);
2870     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2876     for (loop_cnt = (height >> 1); loop_cnt--;) {
         /* first row of the pair */
2877         LD_SB2(src0_ptr, 16, src0, src1);
2878         src2 = LD_SB(src0_ptr + 24);
2879         src0_ptr += src_stride;
2880         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2881         src1_ptr += src2_stride;
         /* u8 -> s8 range for signed dot products */
2882         XORI_B3_128_SB(src0, src1, src2);
2884         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
         /* 4-tap dot product accumulate into 16-bit lanes */
2886         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2887         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2889         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2890         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2892         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2893         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2895         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
         /* add second source, round >> 7, clip to [0,255] */
2897         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2898                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2900         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2901         ST_SH2(dst0, dst1, dst, 16);
         /* second row of the pair — identical sequence */
2904         LD_SB2(src0_ptr, 16, src0, src1);
2905         src2 = LD_SB(src0_ptr + 24);
2906         src0_ptr += src_stride;
2907         LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2908         src1_ptr += src2_stride;
2909         XORI_B3_128_SB(src0, src1, src2);
2911         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2913         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2914         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2916         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2917         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2919         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2920         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2922         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2924         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2925                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2927         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2928         ST_SH2(dst0, dst1, dst, 16);
/* Vertical 4-tap bi-pred filter, 4x2 block: needs 3 leading reference rows
 * (src0_ptr is rewound by one stride), interleaves consecutive rows for
 * the vertical taps, adds the s16 second source with saturation, rounds
 * >> 7, clips to [0,255] and stores 4x2 u8 pixels.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2933 static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
2936                                   int32_t src2_stride,
2939                                   const int8_t *filter,
2942     v16i8 src0, src1, src2, src3, src4;
2944     v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2947     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
2949     src0_ptr -= src_stride;
     /* bias compensating the XORI-by-128 of the source bytes */
2951     const_vec = __msa_ldi_h(128);
2954     filter_vec = LD_SH(filter);
2955     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2957     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2958     src0_ptr += (3 * src_stride);
     /* build row-pair interleaves for the vertical taps */
2960     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2961     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
     /* u8 -> s8 range for signed dot products */
2962     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2964     LD_SB2(src0_ptr, src_stride, src3, src4);
2965     LD_SH2(src1_ptr, src2_stride, in0, in1);
     /* pack both 4-wide s16 rows into one vector */
2966     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2967     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2968     src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2969     src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
     /* vertical 4-tap dot product accumulate */
2972     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
     /* add second source, round >> 7, clip to [0,255] */
2973     dst10 = __msa_adds_s_h(dst10, in0);
2974     dst10 = __msa_srari_h(dst10, 7);
2975     dst10 = CLIP_SH_0_255(dst10);
2977     dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
2978     ST4x2_UB(dst10, dst, dst_stride);
/* Vertical 4-tap bi-pred filter, 4x4 block: 3 leading rows plus 4 new
 * rows; row-pair interleaves feed the vertical taps, then add the s16
 * second source, round >> 7, clip to [0,255], store 4x4 u8 pixels.
 * NOTE(review): listing elides some original lines (numbering jumps). */
2981 static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
2984                                   int32_t src2_stride,
2987                                   const int8_t *filter,
2990     v16i8 src0, src1, src2, src3, src4, src5, src6;
2991     v8i16 in0, in1, in2, in3;
2992     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2993     v16i8 src2110, src4332, src6554;
2996     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
2998     src0_ptr -= src_stride;
     /* bias compensating the XORI-by-128 of the source bytes */
3000     const_vec = __msa_ldi_h(128);
3003     filter_vec = LD_SH(filter);
3004     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3006     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3007     src0_ptr += (3 * src_stride);
3008     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3009     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
     /* u8 -> s8 range for signed dot products */
3010     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3012     LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3013     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
     /* pack two 4-wide s16 rows per vector */
3014     ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3015     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3016                src32_r, src43_r, src54_r, src65_r);
3017     ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
3018     XORI_B2_128_SB(src4332, src6554);
     /* vertical 4-tap dot product accumulate, two output row-pairs */
3021     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3023     DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
     /* add second source, round >> 7, clip to [0,255] */
3024     HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
3026     dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
3027     ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
/* Vertical 4-tap bi-pred filter, width 4, height a multiple of 8: keeps a
 * sliding 3-row history (src2110) across iterations; processes 8 rows per
 * loop, adds the s16 second source, rounds >> 7, clips to [0,255], stores
 * 4x8 u8 pixels.
 * NOTE(review): listing elides some original lines (numbering jumps). */
3030 static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
3033                                           int32_t src2_stride,
3036                                           const int8_t *filter,
3040     v16i8 src0, src1, src2, src3, src4, src5;
3041     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3042     v16i8 src6, src7, src8, src9;
3043     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
3044     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
3045     v16i8 src2110, src4332, src6554, src8776;
3046     v8i16 dst10, dst32, dst54, dst76;
3048     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
3050     src0_ptr -= src_stride;
     /* bias compensating the XORI-by-128 of the source bytes */
3052     const_vec = __msa_ldi_h(128);
3055     filter_vec = LD_SH(filter);
3056     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3058     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3059     src0_ptr += (3 * src_stride);
3060     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3061     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
     /* u8 -> s8 range for signed dot products */
3062     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3064     for (loop_cnt = (height >> 3); loop_cnt--;) {
3065         LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3066         src0_ptr += (6 * src_stride);
3067         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3068         src1_ptr += (8 * src2_stride);
         /* pack two 4-wide s16 rows per vector */
3069         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3070         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3071         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3072                    src32_r, src43_r, src54_r, src65_r);
3073         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3074         ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
3075                    src4332, src6554, src8776);
3076         XORI_B3_128_SB(src4332, src6554, src8776);
         /* vertical 4-tap dot product accumulate, three row-pairs */
3079         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
3081         DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
3083         DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
         /* last two rows of the group; src2 becomes next iteration's tail */
3085         LD_SB2(src0_ptr, src_stride, src9, src2);
3086         src0_ptr += (2 * src_stride);
3087         ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
3088         src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
3089         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
3091         DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
         /* add second source, round >> 7, clip to [0,255] */
3093         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3094                           dst10, dst32, dst54, dst76, 7,
3095                           dst10, dst32, dst54, dst76);
3097         PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
3098         ST4x8_UB(dst10, dst54, dst, dst_stride);
3099         dst += (8 * dst_stride);
/* Width-4 vertical bi-pred dispatcher: 4x2, 4x4 or the 8-row-multiple
 * kernel, chosen by height.
 * NOTE(review): listing elides some original lines (numbering jumps),
 * including the height test for the first branch — confirm in full file. */
3103 static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
3106                                  int32_t src2_stride,
3109                                  const int8_t *filter,
3113         hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3114                               dst, dst_stride, filter, height);
3115     } else if (4 == height) {
3116         hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3117                               dst, dst_stride, filter, height);
3119         hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
3120                                       src1_ptr, src2_stride,
3121                                       dst, dst_stride, filter, height);
/* Vertical 4-tap bi-pred filter, width 6: four rows per iteration with a
 * sliding row history (src10_r/src21_r reused across iterations); adds
 * s16 second source, rounds >> 7, clips to [0,255], stores via ST6x4_UB.
 * NOTE(review): listing elides some original lines (numbering jumps). */
3125 static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
3128                                  int32_t src2_stride,
3131                                  const int8_t *filter,
3135     v16i8 src0, src1, src2, src3, src4, src5;
3136     v8i16 in0, in1, in2, in3;
3137     v16i8 src10_r, src32_r, src21_r, src43_r;
3138     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3140     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
3142     src0_ptr -= src_stride;
     /* bias compensating the XORI-by-128 of the source bytes */
3144     const_vec = __msa_ldi_h(128);
3147     filter_vec = LD_SH(filter);
3148     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3150     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3151     src0_ptr += (3 * src_stride);
     /* u8 -> s8 range for signed dot products */
3152     XORI_B3_128_SB(src0, src1, src2);
3153     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3155     for (loop_cnt = (height >> 2); loop_cnt--;) {
3156         LD_SB2(src0_ptr, src_stride, src3, src4);
3157         src0_ptr += (2 * src_stride);
3158         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3159         src1_ptr += (4 * src2_stride);
3160         XORI_B2_128_SB(src3, src4);
3161         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
         /* vertical 4-tap dot product accumulate, rows 0 and 1 */
3164         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3166         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
         /* rows 2 and 3; src2 becomes next iteration's history tail */
3168         LD_SB2(src0_ptr, src_stride, src5, src2);
3169         src0_ptr += (2 * src_stride);
3170         XORI_B2_128_SB(src5, src2);
3171         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3174         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3176         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
         /* add second source, round >> 7, clip to [0,255] */
3178         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3179                           dst0_r, dst1_r, dst2_r, dst3_r, 7,
3180                           dst0_r, dst1_r, dst2_r, dst3_r);
3182         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3183         ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3184         dst += (4 * dst_stride);
/* Vertical 4-tap bi-pred filter, 8x2 block: 3 leading rows plus 2 new
 * rows, row-pair interleaves feed the taps; add s16 second source, round
 * >> 7, clip to [0,255], store 8x2 u8 pixels.
 * NOTE(review): listing elides some original lines (numbering jumps). */
3188 static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
3191                                   int32_t src2_stride,
3194                                   const int8_t *filter,
3197     v16i8 src0, src1, src2, src3, src4;
3198     v8i16 in0, in1, dst0_r, dst1_r;
3199     v16i8 src10_r, src32_r, src21_r, src43_r;
3201     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
3203     src0_ptr -= src_stride;
     /* bias compensating the XORI-by-128 of the source bytes */
3205     const_vec = __msa_ldi_h(128);
3208     filter_vec = LD_SH(filter);
3209     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3211     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3212     src0_ptr += (3 * src_stride);
     /* u8 -> s8 range for signed dot products */
3213     XORI_B3_128_SB(src0, src1, src2);
3214     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3216     LD_SB2(src0_ptr, src_stride, src3, src4);
3217     LD_SH2(src1_ptr, src2_stride, in0, in1);
3218     XORI_B2_128_SB(src3, src4);
3219     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
     /* vertical 4-tap dot product accumulate */
3222     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3224     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
     /* add second source, round >> 7, clip to [0,255] */
3226     HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
3227     dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3229     ST8x2_UB(dst0_r, dst, dst_stride);
/* Vertical 4-tap bi-pred filter, 8x6 block: 3 leading rows plus 6 new
 * rows filtered in one shot; add s16 second source, round >> 7, clip to
 * [0,255]; stores 8x4 then 8x2.
 * NOTE(review): listing elides some original lines (numbering jumps). */
3232 static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
3235                                   int32_t src2_stride,
3238                                   const int8_t *filter,
3241     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3242     v8i16 in0, in1, in2, in3, in4, in5;
3243     v16i8 src10_r, src32_r, src54_r, src76_r;
3244     v16i8 src21_r, src43_r, src65_r, src87_r;
3245     v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
3247     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
3249     src0_ptr -= src_stride;
     /* bias compensating the XORI-by-128 of the source bytes */
3251     const_vec = __msa_ldi_h(128);
3254     filter_vec = LD_SH(filter);
3255     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3257     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3258     src0_ptr += (3 * src_stride);
     /* u8 -> s8 range for signed dot products */
3259     XORI_B3_128_SB(src0, src1, src2);
3260     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3262     LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3263     LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3264     XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3265     ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3266                src32_r, src43_r, src54_r, src65_r);
3267     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
     /* vertical 4-tap dot product accumulate, one output row each */
3270     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3272     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3274     DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3276     DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3278     DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3280     DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
     /* add second source, round >> 7, clip to [0,255] */
3281     HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3282                       dst0_r, dst1_r, dst2_r, dst3_r, 7,
3283                       dst0_r, dst1_r, dst2_r, dst3_r);
3284     HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
3286     PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3287     dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
3288     ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3289     dst += (4 * dst_stride);
3290     ST8x2_UB(dst2_r, dst, dst_stride);
/* Vertical 4-tap bi-pred filter, width 8, height a multiple of 4: four
 * rows per iteration with sliding row history; add s16 second source,
 * round >> 7, clip to [0,255], store 8x4 u8 pixels.
 * NOTE(review): listing elides some original lines (numbering jumps). */
3293 static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
3296                                           int32_t src2_stride,
3299                                           const int8_t *filter,
3303     v16i8 src0, src1, src2, src3, src4, src5;
3304     v8i16 in0, in1, in2, in3;
3305     v16i8 src10_r, src32_r, src21_r, src43_r;
3306     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3308     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
3310     src0_ptr -= src_stride;
     /* bias compensating the XORI-by-128 of the source bytes */
3312     const_vec = __msa_ldi_h(128);
3315     filter_vec = LD_SH(filter);
3316     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3318     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3319     src0_ptr += (3 * src_stride);
     /* u8 -> s8 range for signed dot products */
3320     XORI_B3_128_SB(src0, src1, src2);
3321     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3323     for (loop_cnt = (height >> 2); loop_cnt--;) {
3324         LD_SB2(src0_ptr, src_stride, src3, src4);
3325         src0_ptr += (2 * src_stride);
3326         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3327         src1_ptr += (4 * src2_stride);
3328         XORI_B2_128_SB(src3, src4);
3329         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
         /* vertical 4-tap dot product accumulate, rows 0 and 1 */
3332         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3334         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
         /* rows 2 and 3; src2 becomes next iteration's history tail */
3336         LD_SB2(src0_ptr, src_stride, src5, src2);
3337         src0_ptr += (2 * src_stride);
3338         XORI_B2_128_SB(src5, src2);
3339         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3342         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3344         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
         /* add second source, round >> 7, clip to [0,255] */
3345         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3346                           dst0_r, dst1_r, dst2_r, dst3_r, 7,
3347                           dst0_r, dst1_r, dst2_r, dst3_r);
3349         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3350         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3351         dst += (4 * dst_stride);
/* Width-8 vertical bi-pred dispatcher: 8x2, 8x6 or the height%4 kernel,
 * chosen by height.
 * NOTE(review): listing elides some original lines (numbering jumps),
 * including the height test for the first branch — confirm in full file. */
3355 static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
3358                                  int32_t src2_stride,
3361                                  const int8_t *filter,
3365         hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3366                               dst, dst_stride, filter, height);
3367     } else if (6 == height) {
3368         hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3369                               dst, dst_stride, filter, height);
3371         hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
3372                                       src1_ptr, src2_stride,
3373                                       dst, dst_stride, filter, height);
/* Vertical 4-tap bi-pred filter, width 12: the left 8 columns use the
 * right (ILVR) byte interleaves, the right 4 columns use the left (ILVL)
 * interleaves packed pairwise into src2110/src4332; four rows per
 * iteration, add s16 second source, round >> 7, clip to [0,255], store
 * via ST12x4_UB.
 * NOTE(review): listing elides some original lines (numbering jumps). */
3377 static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
3380                                   int32_t src2_stride,
3383                                   const int8_t *filter,
3387     v16i8 src0, src1, src2, src3, src4, src5;
3388     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3389     v16i8 src10_r, src32_r, src21_r, src43_r;
3390     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3391     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3392     v16i8 src2110, src4332;
3393     v8i16 dst0_l, dst1_l, filt0, filt1;
3394     v8i16 filter_vec, const_vec;
     /* back up one row so the 4-tap window is centred correctly */
3396     src0_ptr -= (1 * src_stride);
     /* bias compensating the XORI-by-128 of the source bytes */
3398     const_vec = __msa_ldi_h(128);
3401     filter_vec = LD_SH(filter);
3402     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3404     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3405     src0_ptr += (3 * src_stride);
     /* u8 -> s8 range for signed dot products */
3406     XORI_B3_128_SB(src0, src1, src2);
3407     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3408     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3409     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3411     for (loop_cnt = (height >> 2); loop_cnt--;) {
3412         LD_SB2(src0_ptr, src_stride, src3, src4);
3413         src0_ptr += (2 * src_stride);
3414         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3415         LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3416         src1_ptr += (4 * src2_stride);
         /* pack the 4-wide right-column s16 rows pairwise */
3417         ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3418         XORI_B2_128_SB(src3, src4);
3420         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3421         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3422         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
         /* vertical 4-tap dot product: left 8 columns then right 4 */
3425         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3427         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3429         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
         /* rows 2 and 3; src2 becomes next iteration's history tail */
3431         LD_SB2(src0_ptr, src_stride, src5, src2);
3432         src0_ptr += (2 * src_stride);
3433         XORI_B2_128_SB(src5, src2);
3435         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3436         ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3437         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3440         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3442         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
3444         DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
         /* add second source, round >> 7, clip to [0,255] */
3445         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3446                           dst0_r, dst1_r, dst2_r, dst3_r, 7,
3447                           dst0_r, dst1_r, dst2_r, dst3_r);
3448         HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
3450         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3451         dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3452         ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
3453         dst += (4 * dst_stride);
/* Vertical 4-tap bi-prediction filter for 16-column blocks (MSA).
 * Filters 8-bit reference rows vertically (right/left byte lanes split
 * into *_r / *_l interleaves), adds the 16-bit intermediates from
 * src1_ptr, rounds by 7 and clips to [0, 255].  Each loop iteration
 * produces 4 output rows in two 2-row passes.
 * NOTE(review): numbering gaps show some upstream lines are elided in
 * this view (e.g. the filt0/filt1 declaration and accumulator setup);
 * code kept byte-identical. */
3457 static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
3460                                   int32_t src2_stride,
3463                                   const int8_t *filter,
3467     v16i8 src0, src1, src2, src3, src4, src5;
3468     v8i16 in0, in1, in2, in3;
3469     v16i8 src10_r, src32_r, src21_r, src43_r;
3470     v16i8 src10_l, src32_l, src21_l, src43_l;
3471     v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3473     v8i16 filter_vec, const_vec;
    /* centre the 4-tap window */
3475     src0_ptr -= src_stride;
3477     const_vec = __msa_ldi_h(128);
3480     filter_vec = LD_SH(filter);
3481     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* prime the 3-row history */
3483     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3484     src0_ptr += (3 * src_stride);
3485     XORI_B3_128_SB(src0, src1, src2);
3486     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3487     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3489     for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* first pair of output rows */
3490         LD_SB2(src0_ptr, src_stride, src3, src4);
3491         src0_ptr += (2 * src_stride);
3492         LD_SH2(src1_ptr, src2_stride, in0, in1);
3493         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3494         src1_ptr += (2 * src2_stride);
3495         XORI_B2_128_SB(src3, src4);
3496         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3497         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* dot-product accumulate (accumulator init not visible in this view) */
3500         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3502         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3504         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3506         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* add intermediates, round by 7, clip to [0, 255] */
3507         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3508                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3509                           dst0_r, dst1_r, dst0_l, dst1_l);
3511         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3512         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3513         dst += (2 * dst_stride);
        /* second pair of output rows; src2 is reloaded so the 2-row
         * history (src4, src2) is valid for the next iteration */
3515         LD_SB2(src0_ptr, src_stride, src5, src2);
3516         src0_ptr += (2 * src_stride);
3517         LD_SH2(src1_ptr, src2_stride, in0, in1);
3518         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3519         src1_ptr += (2 * src2_stride);
3520         XORI_B2_128_SB(src5, src2);
3521         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3522         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3525         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3527         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3529         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3531         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3532         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3533                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3534                           dst0_r, dst1_r, dst0_l, dst1_l);
3536         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3537         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3538         dst += (2 * dst_stride);
/* Vertical 4-tap bi-prediction filter for 24-column blocks (MSA).
 * The first 16 columns are processed with right/left interleaves
 * (src*_r / src*_l); the extra 8 columns are handled by a parallel
 * pipeline on src6..src11 (src76_r/src87_r/...).  Intermediates from
 * src1_ptr are added, rounded by 7 and clipped to [0, 255]; 24x2
 * pixels are stored per half-iteration.
 * NOTE(review): numbering gaps indicate elided upstream lines
 * (accumulator initialisation etc.); code kept byte-identical. */
3542 static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
3545                                   int32_t src2_stride,
3548                                   const int8_t *filter,
3552     v16i8 src0, src1, src2, src3, src4, src5;
3553     v16i8 src6, src7, src8, src9, src10, src11;
3554     v8i16 in0, in1, in2, in3, in4, in5;
3555     v16i8 src10_r, src32_r, src76_r, src98_r;
3556     v16i8 src21_r, src43_r, src87_r, src109_r;
3557     v16i8 src10_l, src32_l, src21_l, src43_l;
3558     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3559     v8i16 dst0_l, dst1_l;
3561     v8i16 filter_vec, const_vec;
3563     src0_ptr -= src_stride;
3565     const_vec = __msa_ldi_h(128);
3568     filter_vec = LD_SH(filter);
3569     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* prime 3-row history for the left 16 columns */
3572     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3573     XORI_B3_128_SB(src0, src1, src2);
3574     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3575     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* prime 3-row history for columns 16..23 */
3577     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3578     src0_ptr += (3 * src_stride);
3579     XORI_B3_128_SB(src6, src7, src8);
3580     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3582     for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* rows 0-1 of this iteration */
3584         LD_SB2(src0_ptr, src_stride, src3, src4);
3585         LD_SH2(src1_ptr, src2_stride, in0, in1);
3586         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3587         LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3588         src1_ptr += (2 * src2_stride);
3589         XORI_B2_128_SB(src3, src4);
3590         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3591         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3593         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3594         src0_ptr += (2 * src_stride);
3595         XORI_B2_128_SB(src9, src10);
3596         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
        /* dot-product accumulate (accumulator init not visible in this view) */
3599         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3601         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3603         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3605         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
        /* columns 16..23 */
3608         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3610         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
        /* add intermediates, round by 7, clip to [0, 255] */
3612         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3613                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3614                           dst0_r, dst1_r, dst0_l, dst1_l);
3616         HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3618         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3619         dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3620         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3621         ST8x2_UB(dst2_r, dst + 16, dst_stride);
3622         dst += (2 * dst_stride);
        /* rows 2-3 of this iteration; src2/src8 reloaded to roll history */
3625         LD_SB2(src0_ptr, src_stride, src5, src2);
3626         LD_SH2(src1_ptr, src2_stride, in0, in1);
3627         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3628         LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3629         src1_ptr += (2 * src2_stride);
3630         XORI_B2_128_SB(src5, src2);
3631         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3632         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3634         LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3635         src0_ptr += (2 * src_stride);
3636         XORI_B2_128_SB(src11, src8);
3637         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3640         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3642         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3644         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3646         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3649         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3651         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3653         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3654                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3655                           dst0_r, dst1_r, dst0_l, dst1_l);
3656         HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3658         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3659         dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3660         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3661         ST8x2_UB(dst2_r, dst + 16, dst_stride);
3662         dst += (2 * dst_stride);
/* Vertical 4-tap bi-prediction filter for 32-column blocks (MSA).
 * Two independent 16-column pipelines: columns 0..15 go to dst,
 * columns 16..31 to dst_tmp.  Each loop iteration produces 2 output
 * rows; intermediates from src1_ptr are added, rounded by 7 and
 * clipped to [0, 255].
 * NOTE(review): numbering gaps indicate elided upstream lines
 * (history-rolling assignments, accumulator init); code kept
 * byte-identical. */
3666 static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
3669                                   int32_t src2_stride,
3672                                   const int8_t *filter,
3676     uint8_t *dst_tmp = dst + 16;
3677     v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3678     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3679     v16i8 src10_r, src32_r, src76_r, src98_r;
3680     v16i8 src21_r, src43_r, src87_r, src109_r;
3681     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3682     v16i8 src10_l, src32_l, src76_l, src98_l;
3683     v16i8 src21_l, src43_l, src87_l, src109_l;
3684     v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3686     v8i16 filter_vec, const_vec;
3688     src0_ptr -= src_stride;
3690     const_vec = __msa_ldi_h(128);
3693     filter_vec = LD_SH(filter);
3694     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* prime 3-row history, left half (columns 0..15) */
3697     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3698     XORI_B3_128_SB(src0, src1, src2);
3699     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3700     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
    /* prime 3-row history, right half (columns 16..31) */
3703     LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3704     src0_ptr += (3 * src_stride);
3705     XORI_B3_128_SB(src6, src7, src8);
3706     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3707     ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
    /* 2 output rows per iteration */
3709     for (loop_cnt = (height >> 1); loop_cnt--;) {
3711         LD_SB2(src0_ptr, src_stride, src3, src4);
3712         LD_SH2(src1_ptr, src2_stride, in0, in1);
3713         LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3714         LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3715         LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3716         src1_ptr += (2 * src2_stride);
3717         XORI_B2_128_SB(src3, src4);
3718         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3719         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
        /* left half: dot-product accumulate (init not visible here) */
3722         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3724         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3726         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3728         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3730         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3731                           dst0_r, dst1_r, dst0_l, dst1_l, 7,
3732                           dst0_r, dst1_r, dst0_l, dst1_l);
3740         PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3741         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3742         dst += (2 * dst_stride);
        /* right half */
3745         LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3746         src0_ptr += (2 * src_stride);
3747         XORI_B2_128_SB(src9, src10);
3748         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3749         ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3752         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3754         DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3756         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3758         DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3760         HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
3761                           dst2_r, dst3_r, dst2_l, dst3_l, 7,
3762                           dst2_r, dst3_r, dst2_l, dst3_l);
3764         PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3765         ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3766         dst_tmp += (2 * dst_stride);
/* 2-D (horizontal + vertical) 4-tap bi-prediction filter, 4x2 block (MSA).
 * Horizontal pass: VSHF gathers pixel pairs per mask0/mask1, DPADD applies
 * filter_x taps.  Vertical pass: HEVC_FILT_4TAP on interleaved 16-bit rows
 * with filter_y taps widened to 32 bit.  The 16-bit intermediate from
 * src1_ptr is then added, rounded by 7 and clipped to [0, 255].
 * NOTE(review): numbering gaps indicate elided upstream lines (mask1
 * setup, accumulator init); code kept byte-identical. */
3776 static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
3779                                   int32_t src2_stride,
3782                                   const int8_t *filter_x,
3783                                   const int8_t *filter_y,
3787     v16i8 src0, src1, src2, src3, src4;
3789     v4i32 filt_h0, filt_h1;
3790     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3792     v8i16 filter_vec, const_vec;
3793     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3794     v8i16 dst0, dst1, dst2, dst3, dst4;
3795     v4i32 dst0_r, dst1_r;
3796     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    /* step back one row and one column to centre both 4-tap windows */
3798     src0_ptr -= (src_stride + 1);
3800     filter_vec = LD_SH(filter_x);
3801     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend the 8-bit y taps to 16 bits (clti_s_b gives the sign
     * mask), then splat as 32-bit tap pairs */
3803     filter_vec = LD_SH(filter_y);
3804     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3805     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3807     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3811     const_vec = __msa_ldi_h(128);
    /* horizontal filtering of the 3 priming rows */
3814     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3815     src0_ptr += (3 * src_stride);
3816     XORI_B3_128_SB(src0, src1, src2);
3818     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3819     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3820     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3822     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3824     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3826     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3827     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3829     LD_SB2(src0_ptr, src_stride, src3, src4);
3830     LD_SH2(src1_ptr, src2_stride, in0, in1);
3831     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3832     XORI_B2_128_SB(src3, src4);
    /* row 3: horizontal then vertical filtering */
3834     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3836     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3837     dst32_r = __msa_ilvr_h(dst3, dst2);
3838     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    /* row 4 */
3841     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3843     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3844     dst43_r = __msa_ilvr_h(dst4, dst3);
3845     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    /* pack both rows, add intermediates, round by 7, clip, store 4x2 */
3847     dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
3848     dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
3849     dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
3850     dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
3852     dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
3853     ST4x2_UB(dst0_r, dst, dst_stride);
/* 2-D 4-tap bi-prediction filter, 4x4 block (MSA).
 * Same structure as the 4x2 variant: horizontal VSHF+DPADD pass with
 * filter_x, vertical HEVC_FILT_4TAP pass with filter_y, then add the
 * src1_ptr intermediates, round by 7 and clip to [0, 255].
 * NOTE(review): numbering gaps indicate elided upstream lines (mask1
 * setup, accumulator init); code kept byte-identical. */
3856 static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
3859                                   int32_t src2_stride,
3862                                   const int8_t *filter_x,
3863                                   const int8_t *filter_y,
3866     v8i16 in0, in1, in2, in3;
3867     v16i8 src0, src1, src2, src3, src4, src5, src6;
3869     v4i32 filt_h0, filt_h1;
3870     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3872     v8i16 filter_vec, const_vec;
3873     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3874     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3875     v8i16 dst0_r, dst1_r;
3876     v4i32 tmp0, tmp1, tmp2, tmp3;
3877     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3879     src0_ptr -= (src_stride + 1);
3881     filter_vec = LD_SH(filter_x);
3882     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend 8-bit y taps to 16 bits, splat as 32-bit tap pairs */
3884     filter_vec = LD_SH(filter_y);
3885     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3886     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3888     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3892     const_vec = __msa_ldi_h(128);
    /* horizontal filtering of the 3 priming rows */
3895     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3896     src0_ptr += (3 * src_stride);
3897     XORI_B3_128_SB(src0, src1, src2);
3899     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3900     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3901     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3903     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3905     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3907     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3908     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    /* 4 new rows; intermediates packed two rows per vector */
3910     LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3911     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3912     ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3913     XORI_B4_128_SB(src3, src4, src5, src6);
3915     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3917     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3918     dst32_r = __msa_ilvr_h(dst3, dst2);
3919     tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3922     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3924     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3925     dst43_r = __msa_ilvr_h(dst4, dst3);
3926     tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3929     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3931     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3932     dst10_r = __msa_ilvr_h(dst5, dst4);
3933     tmp2 = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3936     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3938     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3939     dst21_r = __msa_ilvr_h(dst2, dst5);
3940     tmp3 = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    /* pack, add intermediates, round by 7, clip, store 4x4 */
3942     PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
3943     HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
3945     dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3946     ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
3947     dst += (4 * dst_stride);
/* 2-D 4-tap bi-prediction filter, 4-wide blocks with height a multiple
 * of 8 (MSA).  Each loop iteration: horizontally filters 8 new rows
 * (VSHF+DPADD with filter_x), vertically filters them against the
 * rolling 16-bit row history (HEVC_FILT_4TAP with filter_y), adds the
 * src1_ptr intermediates, rounds by 7, clips to [0, 255] and stores
 * 4x8 pixels.
 * NOTE(review): numbering gaps indicate elided upstream lines (mask1
 * setup, accumulator init); code kept byte-identical. */
3950 static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
3953                                           int32_t src2_stride,
3956                                           const int8_t *filter_x,
3957                                           const int8_t *filter_y,
3961     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3962     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3964     v4i32 filt_h0, filt_h1;
3965     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3967     v8i16 filter_vec, const_vec;
3968     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3969     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3970     v8i16 tmp0, tmp1, tmp2, tmp3;
3971     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3972     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3973     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3975     src0_ptr -= (src_stride + 1);
3977     filter_vec = LD_SH(filter_x);
3978     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend 8-bit y taps to 16 bits, splat as 32-bit tap pairs */
3980     filter_vec = LD_SH(filter_y);
3981     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3982     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3984     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3988     const_vec = __msa_ldi_h(128);
    /* horizontal filtering of the 3 priming rows */
3991     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3992     src0_ptr += (3 * src_stride);
3993     XORI_B3_128_SB(src0, src1, src2);
3995     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3996     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3997     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3999     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4001     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4003     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4004     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    /* 8 output rows per iteration */
4006     for (loop_cnt = height >> 3; loop_cnt--;) {
4007         LD_SB8(src0_ptr, src_stride,
4008                src3, src4, src5, src6, src7, src8, src9, src10);
4009         src0_ptr += (8 * src_stride);
        /* intermediates packed two 4-wide rows per vector */
4010         LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
4011         src1_ptr += (8 * src2_stride);
4012         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
4013         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
4014         XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        /* per new row: horizontal pass, interleave with previous row,
         * vertical pass */
4016         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4018         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4019         dst32_r = __msa_ilvr_h(dst3, dst2);
4020         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4023         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4025         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4026         dst43_r = __msa_ilvr_h(dst4, dst3);
4027         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4030         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4032         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4033         dst54_r = __msa_ilvr_h(dst5, dst4);
4034         dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4037         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4039         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
4040         dst65_r = __msa_ilvr_h(dst6, dst5);
4041         dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4044         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4046         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
4047         dst76_r = __msa_ilvr_h(dst7, dst6);
4048         dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4051         VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4053         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
4054         dst87_r = __msa_ilvr_h(dst8, dst7);
4055         dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4058         VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
4060         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
4061         dst10_r = __msa_ilvr_h(dst9, dst8);
4062         dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
        /* last row reuses dst2 so the history is primed for next loop */
4065         VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
4067         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4068         dst21_r = __msa_ilvr_h(dst2, dst9);
4069         dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
        /* pack, add intermediates, round by 7, clip, store 4x8 */
4071         PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
4072                     dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
4073         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4074                           tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
4076         PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
4077         ST4x8_UB(tmp0, tmp1, dst, dst_stride);
4078         dst += (8 * dst_stride);
/* Dispatcher for 4-wide 2-D 4-tap bi-prediction: picks the height-
 * specific worker (4x2, 4x4, or the generic x8-multiple variant).
 * NOTE(review): the first `if` condition line appears elided from this
 * extracted view (numbering gap before the 4x2 call); code kept
 * byte-identical. */
4082 static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
4085                                  int32_t src2_stride,
4088                                  const int8_t *filter_x,
4089                                  const int8_t *filter_y,
4093         hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4094                               dst, dst_stride, filter_x, filter_y, height);
4095     } else if (4 == height) {
4096         hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4097                               dst, dst_stride, filter_x, filter_y, height);
4098     } else if (0 == (height % 8)) {
4099         hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
4100                                       src1_ptr, src2_stride,
4102                                       filter_x, filter_y, height);
/* 2-D 4-tap bi-prediction filter for 6-wide blocks (MSA).
 * Works on 8 columns internally (ILVRL splits right/left halves for the
 * vertical pass) and stores only 6 of them via ST6x4_UB.  Per iteration:
 * horizontal VSHF+DPADD with filter_x, vertical HEVC_FILT_4TAP with
 * filter_y, add src1_ptr intermediates, round by 7, clip to [0, 255].
 * NOTE(review): numbering gaps indicate elided upstream lines (mask1
 * setup, accumulator init); code kept byte-identical. */
4106 static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
4109                                  int32_t src2_stride,
4112                                  const int8_t *filter_x,
4113                                  const int8_t *filter_y,
4117     v16i8 src0, src1, src2, src3, src4, src5, src6;
4118     v8i16 in0, in1, in2, in3;
4120     v4i32 filt_h0, filt_h1;
4121     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4123     v8i16 filter_vec, const_vec;
4124     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4125     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4126     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4127     v8i16 tmp0, tmp1, tmp2, tmp3;
4128     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4129     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4131     src0_ptr -= (src_stride + 1);
4133     filter_vec = LD_SH(filter_x);
4134     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend 8-bit y taps to 16 bits, splat as 32-bit tap pairs */
4136     filter_vec = LD_SH(filter_y);
4137     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4138     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4140     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4144     const_vec = __msa_ldi_h(128);
    /* horizontal filtering of the 3 priming rows */
4147     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4148     src0_ptr += (3 * src_stride);
4149     XORI_B3_128_SB(src0, src1, src2);
4151     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4152     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4153     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4155     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4157     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4159     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4161     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4162     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    /* 4 output rows per iteration */
4164     for (loop_cnt = height >> 2; loop_cnt--;) {
4165         LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
4166         src0_ptr += (4 * src_stride);
4167         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
4168         src1_ptr += (4 * src2_stride);
4169         XORI_B4_128_SB(src3, src4, src5, src6);
        /* per row: horizontal pass, interleave with previous row,
         * vertical pass on right/left halves */
4171         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4173         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4175         ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4176         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4177         dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4181         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4183         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4185         ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4186         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4187         dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4191         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4193         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4195         ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
4196         dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4197         dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
        /* last row reuses dst2 so the history is primed for next loop */
4201         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4203         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4205         ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
4206         dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
4207         dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
        /* pack, add intermediates, round by 7, clip, store 6x4 */
4210         PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
4211                     dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
4212         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4213                           tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
4215         PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4216         ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
4217         dst += (4 * dst_stride);
/* 2-D 4-tap bi-prediction filter, 8x2 block (MSA).
 * Horizontal VSHF+DPADD pass with filter_x, vertical HEVC_FILT_4TAP
 * pass (right/left 16-bit halves) with filter_y, then add src1_ptr
 * intermediates, round by 7, clip to [0, 255] and store 8x2 pixels.
 * NOTE(review): numbering gaps indicate elided upstream lines (mask1
 * setup, in0/in1 and tmp declarations, accumulator init); code kept
 * byte-identical. */
4221 static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
4224                                   int32_t src2_stride,
4227                                   const int8_t *filter_x,
4228                                   const int8_t *filter_y,
4231     v16i8 src0, src1, src2, src3, src4;
4233     v4i32 filt_h0, filt_h1;
4234     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4236     v8i16 filter_vec, const_vec;
4237     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4238     v8i16 dst0, dst1, dst2, dst3, dst4;
4239     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
4240     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4241     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4245     src0_ptr -= (src_stride + 1);
4247     filter_vec = LD_SH(filter_x);
4248     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend 8-bit y taps to 16 bits, splat as 32-bit tap pairs */
4250     filter_vec = LD_SH(filter_y);
4251     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4252     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4254     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4258     const_vec = __msa_ldi_h(128);
    /* horizontal filtering of the 3 priming rows */
4261     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4262     src0_ptr += (3 * src_stride);
4263     XORI_B3_128_SB(src0, src1, src2);
4265     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4266     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4267     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4269     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4271     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4273     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4275     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4276     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4278     LD_SB2(src0_ptr, src_stride, src3, src4);
4279     LD_SH2(src1_ptr, src2_stride, in0, in1);
4280     XORI_B2_128_SB(src3, src4);
    /* row 3: horizontal then vertical filtering */
4282     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4284     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4286     ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4287     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4288     dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    /* row 4 */
4292     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4294     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4296     ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4297     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4298     dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    /* pack, add intermediates, round by 7, clip, store 8x2 */
4302     PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4303     HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
4305     dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4306     ST8x2_UB(dst0_r, dst, dst_stride);
/* 2-D 4-tap bi-prediction filter, 8x6 block (MSA), fully unrolled.
 * After 3 priming rows, processes rows in pairs: each new row gets the
 * horizontal VSHF+DPADD pass (filter_x) and is interleaved with the
 * previous row for the vertical HEVC_FILT_4TAP pass (filter_y).  The
 * six packed rows tmp0..tmp5 then get the src1_ptr intermediates
 * added, are rounded by 7, clipped to [0, 255] and stored as 8x4 + 8x2.
 * NOTE(review): numbering gaps indicate elided upstream lines (mask1
 * setup, accumulator init); code kept byte-identical. */
4309 static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
4312                                   int32_t src2_stride,
4315                                   const int8_t *filter_x,
4316                                   const int8_t *filter_y,
4319     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4320     v8i16 in0, in1, in2, in3, in4, in5;
4322     v4i32 filt_h0, filt_h1;
4323     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4325     v8i16 filter_vec, const_vec;
4326     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4327     v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4328     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4329     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4330     v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4331     v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4332     v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4333     v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4334     v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4336     src0_ptr -= (src_stride + 1);
4338     filter_vec = LD_SH(filter_x);
4339     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
    /* sign-extend 8-bit y taps to 16 bits, splat as 32-bit tap pairs */
4341     filter_vec = LD_SH(filter_y);
4342     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4343     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4345     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4349     const_vec = __msa_ldi_h(128);
    /* horizontal filtering of the 3 priming rows */
4352     LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4353     src0_ptr += (3 * src_stride);
4354     XORI_B3_128_SB(src0, src1, src2);
4355     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4356     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4357     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4359     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4361     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4363     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4365     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4366     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    /* rows 0-1: intermediates for all 6 rows loaded up-front */
4368     LD_SB2(src0_ptr, src_stride, src3, src4);
4369     src0_ptr += (2 * src_stride);
4370     XORI_B2_128_SB(src3, src4);
4371     LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4372     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4374     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4376     ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4377     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4378     dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4381     tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
4383     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4385     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4387     ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4388     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4389     dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4392     tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
    /* rows 2-3 */
4394     LD_SB2(src0_ptr, src_stride, src5, src6);
4395     src0_ptr += (2 * src_stride);
4396     XORI_B2_128_SB(src5, src6);
4398     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4400     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4402     ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4403     dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4404     dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4407     tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
4410     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4412     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
4414     ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4415     dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4416     dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4419     tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
    /* rows 4-5 */
4421     LD_SB2(src0_ptr, src_stride, src7, src8);
4422     XORI_B2_128_SB(src7, src8);
4424     VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4426     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
4428     ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4429     dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4430     dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4434     tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
4436     VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4438     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
4440     ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
4441     dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4442     dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
4445     tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
    /* add intermediates, round by 7, clip, store 8x4 then 8x2 */
4447     HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4448                       tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
4449     HEVC_BI_RND_CLIP2(in4, in5, tmp4, tmp5, 7, tmp4, tmp5);
4451     PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4452     dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4453     ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
4454     dst += (4 * dst_stride);
4455     ST8x2_UB(dst2_r, dst, dst_stride);
4458 static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
4461 int32_t src2_stride,
4464 const int8_t *filter_x,
4465 const int8_t *filter_y,
4469 uint32_t loop_cnt, cnt;
4470 uint8_t *src0_ptr_tmp;
4471 int16_t *src1_ptr_tmp;
4473 v16i8 src0, src1, src2, src3, src4, src5, src6;
4474 v8i16 in0, in1, in2, in3;
4476 v4i32 filt_h0, filt_h1;
4477 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4479 v8i16 filter_vec, const_vec;
4480 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4481 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4482 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4483 v8i16 tmp0, tmp1, tmp2, tmp3;
4484 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4485 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4487 src0_ptr -= (src_stride + 1);
4489 filter_vec = LD_SH(filter_x);
4490 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
4492 filter_vec = LD_SH(filter_y);
4493 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4494 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4496 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4500 const_vec = __msa_ldi_h(128);
4503 for (cnt = width >> 3; cnt--;) {
4504 src0_ptr_tmp = src0_ptr;
4506 src1_ptr_tmp = src1_ptr;
4508 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4509 src0_ptr_tmp += (3 * src_stride);
4510 XORI_B3_128_SB(src0, src1, src2);
4512 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4513 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4514 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4516 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4518 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4520 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4522 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4523 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4525 for (loop_cnt = height >> 2; loop_cnt--;) {
4526 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4527 src0_ptr_tmp += (4 * src_stride);
4528 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4529 src1_ptr_tmp += (4 * src2_stride);
4530 XORI_B4_128_SB(src3, src4, src5, src6);
4532 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4534 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4536 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4537 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4538 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4542 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4544 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4546 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4547 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4548 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4552 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4554 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4556 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
4557 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4558 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
4562 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4564 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4566 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
4567 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
4568 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
4572 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
4573 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
4574 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4575 tmp0, tmp1, tmp2, tmp3, 7,
4576 tmp0, tmp1, tmp2, tmp3);
4578 PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4579 ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
4580 dst_tmp += (4 * dst_stride);
/* Width-8 HV bi-prediction dispatcher: pick the specialized 8x2 / 8x6
 * kernels for those exact heights, otherwise the generic multiple-of-4
 * height kernel with an 8-column stripe count of one. */
static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
                                 int32_t src_stride,
                                 int16_t *src1_ptr,
                                 int32_t src2_stride,
                                 uint8_t *dst,
                                 int32_t dst_stride,
                                 const int8_t *filter_x,
                                 const int8_t *filter_y,
                                 int32_t height)
{
    if (2 == height) {
        hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                              dst, dst_stride, filter_x, filter_y, height);
    } else {
        hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
                                      src1_ptr, src2_stride,
                                      dst, dst_stride,
                                      filter_x, filter_y, height, 8);
    }
}
/* Width-12 HV bi-prediction: one 8-column stripe via the generic kernel,
 * plus the 4-column remainder handled by the 4-wide kernel. */
static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 8);
    hevc_hv_bi_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
                         dst + 8, dst_stride, filter_x, filter_y, height);
}
/* Width-16 HV bi-prediction: two 8-column stripes via the generic kernel. */
static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 16);
}
/* Width-24 HV bi-prediction: three 8-column stripes via the generic kernel. */
static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 24);
}
/* Width-32 HV bi-prediction: four 8-column stripes via the generic kernel.
 * Fix: the filter_y parameter was declared "const const int8_t *" — the
 * duplicated qualifier is removed to match every sibling signature. */
static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
                                  int32_t src_stride,
                                  int16_t *src1_ptr,
                                  int32_t src2_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
                                  dst, dst_stride, filter_x, filter_y,
                                  height, 32);
}
/* Instantiates the public bi-prediction "pixels" (no filtering) entry point
 * for a given WIDTH: averages the 8-bit reference block with the 16-bit
 * co-located prediction in src_16bit (stride MAX_PB_SIZE).  mx/my/width are
 * part of the common ff_hevc_put_hevc_* signature and are unused here. */
#define BI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE,  \
                                dst, dst_stride, height);                 \
}
/* Instantiates a uni-directional (h or v) bi-prediction entry point:
 * PEL selects the filter bank (qpel/epel), DIR1 the worker prefix (hz/vt),
 * and FILT_DIR (mx or my) selects which fractional offset indexes the
 * filter table.  The unused fractional parameter and width stay in the
 * signature for the common function-pointer interface. */
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                       \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,       \
                                                      ptrdiff_t           \
                                                      dst_stride,         \
                                                      uint8_t *src,       \
                                                      ptrdiff_t           \
                                                      src_stride,         \
                                                      int16_t *src_16bit, \
                                                      int height,         \
                                                      intptr_t mx,        \
                                                      intptr_t my,        \
                                                      int width)          \
{                                                                         \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];         \
                                                                          \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,  \
                                             MAX_PB_SIZE, dst, dst_stride, \
                                             filter, height);             \
}
4720 BI_MC(qpel, h, 4, 8, hz, mx);
4721 BI_MC(qpel, h, 8, 8, hz, mx);
4722 BI_MC(qpel, h, 12, 8, hz, mx);
4723 BI_MC(qpel, h, 16, 8, hz, mx);
4724 BI_MC(qpel, h, 24, 8, hz, mx);
4725 BI_MC(qpel, h, 32, 8, hz, mx);
4726 BI_MC(qpel, h, 48, 8, hz, mx);
4727 BI_MC(qpel, h, 64, 8, hz, mx);
4729 BI_MC(qpel, v, 4, 8, vt, my);
4730 BI_MC(qpel, v, 8, 8, vt, my);
4731 BI_MC(qpel, v, 12, 8, vt, my);
4732 BI_MC(qpel, v, 16, 8, vt, my);
4733 BI_MC(qpel, v, 24, 8, vt, my);
4734 BI_MC(qpel, v, 32, 8, vt, my);
4735 BI_MC(qpel, v, 48, 8, vt, my);
4736 BI_MC(qpel, v, 64, 8, vt, my);
4738 BI_MC(epel, h, 4, 4, hz, mx);
4739 BI_MC(epel, h, 8, 4, hz, mx);
4740 BI_MC(epel, h, 6, 4, hz, mx);
4741 BI_MC(epel, h, 12, 4, hz, mx);
4742 BI_MC(epel, h, 16, 4, hz, mx);
4743 BI_MC(epel, h, 24, 4, hz, mx);
4744 BI_MC(epel, h, 32, 4, hz, mx);
4746 BI_MC(epel, v, 4, 4, vt, my);
4747 BI_MC(epel, v, 8, 4, vt, my);
4748 BI_MC(epel, v, 6, 4, vt, my);
4749 BI_MC(epel, v, 12, 4, vt, my);
4750 BI_MC(epel, v, 16, 4, vt, my);
4751 BI_MC(epel, v, 24, 4, vt, my);
4752 BI_MC(epel, v, 32, 4, vt, my);
/* Instantiates a 2-D (horizontal+vertical) bi-prediction entry point:
 * mx and my each select a filter from the PEL (qpel/epel) table and the
 * hv worker applies both passes before averaging with src_16bit. */
#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
                                       MAX_PB_SIZE, dst, dst_stride,      \
                                       filter_x, filter_y, height);       \
}
4775 BI_MC_HV(qpel, 4, 8);
4776 BI_MC_HV(qpel, 8, 8);
4777 BI_MC_HV(qpel, 12, 8);
4778 BI_MC_HV(qpel, 16, 8);
4779 BI_MC_HV(qpel, 24, 8);
4780 BI_MC_HV(qpel, 32, 8);
4781 BI_MC_HV(qpel, 48, 8);
4782 BI_MC_HV(qpel, 64, 8);
4784 BI_MC_HV(epel, 4, 4);
4785 BI_MC_HV(epel, 8, 4);
4786 BI_MC_HV(epel, 6, 4);
4787 BI_MC_HV(epel, 12, 4);
4788 BI_MC_HV(epel, 16, 4);
4789 BI_MC_HV(epel, 24, 4);
4790 BI_MC_HV(epel, 32, 4);