2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Bi-prediction blend of a vector pair:
 * outN = clip_0_255(round_shift(vecN + inN, rnd_val)).
 * ADDS_SH2_SH saturating-adds the filtered intermediates (vec0/vec1) to the
 * second-source 16-bit samples (in0/in1); SRARI rounds; CLIP bounds to 8 bit.
 * NOTE(review): extraction artifact — stray leading numerals and the macro's
 * enclosing do/{} lines are missing from this view; code left untouched. */
25 #define HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1) \
27 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
28 SRARI_H2_SH(out0, out1, rnd_val); \
29 CLIP_SH2_0_255(out0, out1); \
/* Four-vector variant: applies HEVC_BI_RND_CLIP2 to two pairs.
 * NOTE(review): stray leading numerals / missing brace lines are an
 * extraction artifact; code left untouched. */
32 #define HEVC_BI_RND_CLIP4(in0, in1, in2, in3, \
33 vec0, vec1, vec2, vec3, rnd_val, \
34 out0, out1, out2, out3) \
36 HEVC_BI_RND_CLIP2(in0, in1, vec0, vec1, rnd_val, out0, out1); \
37 HEVC_BI_RND_CLIP2(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/* Same blend as HEVC_BI_RND_CLIP2 but clips via the MAX/SATU variant of the
 * 0..255 clip macro (saturating max-based clip from hevc_macros_msa.h).
 * NOTE(review): the continuation line naming the out0/out1 parameters and the
 * brace lines are missing from this extract; stray leading numerals are an
 * extraction artifact; code left untouched. */
40 #define HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, \
43 ADDS_SH2_SH(vec0, in0, vec1, in1, out0, out1); \
44 SRARI_H2_SH(out0, out1, rnd_val); \
45 CLIP_SH2_0_255_MAX_SATU(out0, out1); \
/* Four-vector MAX_SATU variant: two invocations of the pair macro.
 * NOTE(review): stray leading numerals / missing brace lines are an
 * extraction artifact; code left untouched. */
48 #define HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, vec0, vec1, vec2, \
49 vec3, rnd_val, out0, out1, out2, out3) \
51 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, vec0, vec1, rnd_val, out0, out1); \
52 HEVC_BI_RND_CLIP2_MAX_SATU(in2, in3, vec2, vec3, rnd_val, out2, out3); \
/* Bi-prediction "copy" for 4-pixel-wide blocks.
 * Zero-extends uint8 pels from src0_ptr to 16 bit, shifts them left by 6 to
 * the intermediate sample scale, adds the 16-bit samples from src1_ptr,
 * rounds by 7, clips to [0,255], packs back to bytes and stores to dst.
 * Three paths: height 2, height 4, and height a multiple of 8.
 * NOTE(review): extract garbled — stray leading numerals; the parameter-list
 * tail, the opening `if (2 == height)` guard, the `<<= 6` for the height-2
 * path, `zero` declaration and closing braces are missing lines from this
 * view. Code left byte-identical. */
55 static void hevc_bi_copy_4w_msa(uint8_t *src0_ptr,
63 uint32_t loop_cnt, tp0, tp1, tp2, tp3;
64 uint64_t tpd0, tpd1, tpd2, tpd3;
65 v16i8 src0 = { 0 }, src1 = { 0 };
67 v8i16 in0 = { 0 }, in1 = { 0 }, in2 = { 0 }, in3 = { 0 };
68 v8i16 dst0, dst1, dst2, dst3;
/* height == 2: two 32-bit pel loads + two 64-bit second-source loads. */
71 LW2(src0_ptr, src_stride, tp0, tp1);
72 INSERT_W2_SB(tp0, tp1, src0);
73 LD2(src1_ptr, src2_stride, tpd0, tpd1);
74 INSERT_D2_SH(tpd0, tpd1, in0);
76 dst0 = (v8i16) __msa_ilvr_b(zero, src0);
79 dst0 = __msa_srari_h(dst0, 7);
80 dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
82 dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
83 ST4x2_UB(dst0, dst, dst_stride);
84 } else if (4 == height) {
85 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
86 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
87 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
88 INSERT_D2_SH(tpd0, tpd1, in0);
89 INSERT_D2_SH(tpd2, tpd3, in1);
90 ILVRL_B2_SH(zero, src0, dst0, dst1);
91 SLLI_2V(dst0, dst1, 6);
92 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
93 dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
94 ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
95 } else if (0 == height % 8) {
/* multiple-of-8 heights: process 8 rows per iteration. */
96 for (loop_cnt = (height >> 3); loop_cnt--;) {
97 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
98 src0_ptr += 4 * src_stride;
99 INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
100 LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
101 src0_ptr += 4 * src_stride;
102 INSERT_W4_SB(tp0, tp1, tp2, tp3, src1);
103 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
104 src1_ptr += (4 * src2_stride);
105 INSERT_D2_SH(tpd0, tpd1, in0);
106 INSERT_D2_SH(tpd2, tpd3, in1);
107 LD4(src1_ptr, src2_stride, tpd0, tpd1, tpd2, tpd3);
108 src1_ptr += (4 * src2_stride);
109 INSERT_D2_SH(tpd0, tpd1, in2);
110 INSERT_D2_SH(tpd2, tpd3, in3);
111 ILVRL_B2_SH(zero, src0, dst0, dst1);
112 ILVRL_B2_SH(zero, src1, dst2, dst3);
113 SLLI_4V(dst0, dst1, dst2, dst3, 6);
114 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
115 dst3, 7, dst0, dst1, dst2, dst3);
116 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
117 ST4x8_UB(dst0, dst1, dst, dst_stride);
118 dst += (8 * dst_stride);
/* Bi-prediction "copy" for 6-pixel-wide blocks, 8 rows per loop iteration.
 * 8-byte row loads are widened (ilvr/ilvl with zero), scaled <<6, blended
 * with the 16-bit src1_ptr samples (add, round by 7, clip), packed and
 * stored 6 bytes wide via ST6x4_UB.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`zero` declarations and closing braces are missing from
 * this view. Code left byte-identical. */
123 static void hevc_bi_copy_6w_msa(uint8_t *src0_ptr,
132 uint64_t tp0, tp1, tp2, tp3;
133 v16u8 out0, out1, out2, out3;
135 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
136 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
137 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
139 for (loop_cnt = (height >> 3); loop_cnt--;) {
140 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
141 src0_ptr += (4 * src_stride);
142 INSERT_D2_SB(tp0, tp1, src0);
143 INSERT_D2_SB(tp2, tp3, src1);
144 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
145 src0_ptr += (4 * src_stride);
146 INSERT_D2_SB(tp0, tp1, src2);
147 INSERT_D2_SB(tp2, tp3, src3);
148 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
149 src1_ptr += (8 * src2_stride);
150 ILVRL_B2_SH(zero, src0, dst0, dst1);
151 ILVRL_B2_SH(zero, src1, dst2, dst3);
152 ILVRL_B2_SH(zero, src2, dst4, dst5);
153 ILVRL_B2_SH(zero, src3, dst6, dst7);
154 SLLI_4V(dst0, dst1, dst2, dst3, 6);
155 SLLI_4V(dst4, dst5, dst6, dst7, 6);
156 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
157 7, dst0, dst1, dst2, dst3);
158 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
159 7, dst4, dst5, dst6, dst7);
160 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
161 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
162 ST6x4_UB(out0, out1, dst, dst_stride);
163 dst += (4 * dst_stride);
164 ST6x4_UB(out2, out3, dst, dst_stride);
165 dst += (4 * dst_stride);
/* Bi-prediction "copy" for 8-pixel-wide blocks.
 * Same widen -> <<6 -> add src1_ptr samples -> round(7) -> clip -> pack
 * pipeline as the narrower variants; special-cased for heights 2, 4, 6 and
 * multiples of 8.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, the `if (2 == height)` guard, `loop_cnt`/`zero` declarations, the
 * in7 argument of one LD_SH8 and closing braces are missing lines from this
 * view. Code left byte-identical. */
169 static void hevc_bi_copy_8w_msa(uint8_t *src0_ptr,
177 uint64_t tp0, tp1, tp2, tp3;
178 v16u8 out0, out1, out2, out3;
179 v16i8 src0 = { 0 }, src1 = { 0 }, src2 = { 0 }, src3 = { 0 };
181 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
182 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
/* height == 2 path. */
185 LD2(src0_ptr, src_stride, tp0, tp1);
186 INSERT_D2_SB(tp0, tp1, src0);
187 LD_SH2(src1_ptr, src2_stride, in0, in1);
188 ILVRL_B2_SH(zero, src0, dst0, dst1);
189 SLLI_2V(dst0, dst1, 6);
190 HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
191 out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
192 ST8x2_UB(out0, dst, dst_stride);
193 } else if (4 == height) {
194 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
195 INSERT_D2_SB(tp0, tp1, src0);
196 INSERT_D2_SB(tp2, tp3, src1);
197 ILVRL_B2_SH(zero, src0, dst0, dst1);
198 ILVRL_B2_SH(zero, src1, dst2, dst3);
199 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
200 SLLI_4V(dst0, dst1, dst2, dst3, 6);
201 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
202 7, dst0, dst1, dst2, dst3);
203 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
204 ST8x4_UB(out0, out1, dst, dst_stride);
205 } else if (6 == height) {
206 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
207 src0_ptr += 4 * src_stride;
208 INSERT_D2_SB(tp0, tp1, src0);
209 INSERT_D2_SB(tp2, tp3, src1);
210 LD2(src0_ptr, src_stride, tp0, tp1);
211 INSERT_D2_SB(tp0, tp1, src2);
212 ILVRL_B2_SH(zero, src0, dst0, dst1);
213 ILVRL_B2_SH(zero, src1, dst2, dst3);
214 ILVRL_B2_SH(zero, src2, dst4, dst5);
215 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
216 SLLI_4V(dst0, dst1, dst2, dst3, 6);
217 SLLI_2V(dst4, dst5, 6);
218 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
219 7, dst0, dst1, dst2, dst3);
220 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
221 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
222 ST8x4_UB(out0, out1, dst, dst_stride);
223 dst += (4 * dst_stride);
224 ST8x2_UB(out2, dst, dst_stride);
225 } else if (0 == height % 8) {
/* multiple-of-8 heights: 8 rows per iteration. */
228 for (loop_cnt = (height >> 3); loop_cnt--;) {
229 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
230 src0_ptr += 4 * src_stride;
231 INSERT_D2_SB(tp0, tp1, src0);
232 INSERT_D2_SB(tp2, tp3, src1);
233 LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
234 src0_ptr += 4 * src_stride;
235 INSERT_D2_SB(tp0, tp1, src2);
236 INSERT_D2_SB(tp2, tp3, src3);
237 ILVRL_B2_SH(zero, src0, dst0, dst1);
238 ILVRL_B2_SH(zero, src1, dst2, dst3);
239 ILVRL_B2_SH(zero, src2, dst4, dst5);
240 ILVRL_B2_SH(zero, src3, dst6, dst7);
241 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6,
243 src1_ptr += (8 * src2_stride);
244 SLLI_4V(dst0, dst1, dst2, dst3, 6);
245 SLLI_4V(dst4, dst5, dst6, dst7, 6);
246 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
247 dst3, 7, dst0, dst1, dst2, dst3);
248 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6,
249 dst7, 7, dst4, dst5, dst6, dst7);
250 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
251 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
252 ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
253 dst += (8 * dst_stride);
/* Bi-prediction "copy" for 12-pixel-wide blocks, 4 rows per iteration
 * (fixed 16-row height: loop_cnt = 4). The left 8 columns use the right
 * interleaves (dst0..dst3); the remaining 4 columns are gathered with
 * ILVL_W2 + ILVR_B2 into dst4/dst5. Blend, pack and store 12 bytes wide.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`zero` declarations, the dst2/dst3 tail of one ILVR_B4
 * call and closing braces are missing from this view. Code left
 * byte-identical. */
258 static void hevc_bi_copy_12w_msa(uint8_t *src0_ptr,
268 v16u8 out0, out1, out2;
269 v16i8 src0, src1, src2, src3;
270 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
271 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
273 for (loop_cnt = 4; loop_cnt--;) {
274 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
275 src0_ptr += (4 * src_stride);
277 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
278 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
279 src1_ptr += (4 * src2_stride);
280 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
281 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, dst0, dst1,
283 SLLI_4V(dst0, dst1, dst2, dst3, 6);
284 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
285 ILVR_B2_SH(zero, src0, zero, src1, dst4, dst5);
286 SLLI_2V(dst4, dst5, 6);
287 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
288 7, dst0, dst1, dst2, dst3);
289 HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
290 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
291 ST12x4_UB(out0, out1, out2, dst, dst_stride);
292 dst += (4 * dst_stride);
/* Bi-prediction "copy" for 16-pixel-wide blocks, 4 rows per iteration.
 * Each 16-byte row splits into right (_r) and left (_l) 8-lane halves via
 * ILVRL; both halves are scaled <<6, blended with the matching src1_ptr
 * samples (in0..in3 cover columns 0-7, in4..in7 columns 8-15), rounded,
 * clipped, re-packed and stored as full 16-byte rows.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`zero` declarations and closing braces are missing from
 * this view. Code left byte-identical. */
296 static void hevc_bi_copy_16w_msa(uint8_t *src0_ptr,
305 v16u8 out0, out1, out2, out3;
306 v16i8 src0, src1, src2, src3;
307 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
308 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, dst2_l, dst3_l;
311 for (loop_cnt = (height >> 2); loop_cnt--;) {
312 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
313 src0_ptr += (4 * src_stride);
314 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
315 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
316 src1_ptr += (4 * src2_stride);
317 ILVRL_B2_SH(zero, src0, dst0_r, dst0_l);
318 ILVRL_B2_SH(zero, src1, dst1_r, dst1_l);
319 ILVRL_B2_SH(zero, src2, dst2_r, dst2_l);
320 ILVRL_B2_SH(zero, src3, dst3_r, dst3_l);
321 SLLI_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
322 SLLI_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
323 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in4, in5, dst0_r, dst1_r, dst0_l,
324 dst1_l, 7, dst0_r, dst1_r, dst0_l, dst1_l);
325 HEVC_BI_RND_CLIP4_MAX_SATU(in2, in3, in6, in7, dst2_r, dst3_r, dst2_l,
326 dst3_l, 7, dst2_r, dst3_r, dst2_l, dst3_l);
327 PCKEV_B2_UB(dst0_l, dst0_r, dst1_l, dst1_r, out0, out1);
328 PCKEV_B2_UB(dst2_l, dst2_r, dst3_l, dst3_r, out2, out3);
329 ST_UB4(out0, out1, out2, out3, dst, dst_stride);
330 dst += (4 * dst_stride);
/* Bi-prediction "copy" for 24-pixel-wide blocks, 4 rows per iteration
 * (fixed 32-row height: loop_cnt = 8). Columns 0-15 come from src0/src1/
 * src4/src5, columns 16-23 from src2/src3/src6/src7 (right interleave
 * only). After blending, 16-byte stores cover the left part and ST8x4 the
 * trailing 8 columns at dst + 16.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail and closing braces are missing from this view. Code left
 * byte-identical. */
334 static void hevc_bi_copy_24w_msa(uint8_t *src0_ptr,
343 v16u8 out0, out1, out2, out3, out4, out5;
344 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, zero = { 0 };
345 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
346 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
348 for (loop_cnt = 8; loop_cnt--;) {
349 LD_SB4(src0_ptr, src_stride, src0, src1, src4, src5);
350 LD_SB4(src0_ptr + 16, src_stride, src2, src3, src6, src7);
351 src0_ptr += (4 * src_stride);
352 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
353 LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
354 LD_SH4(src1_ptr + 16, src2_stride, in8, in9, in10, in11);
355 src1_ptr += (4 * src2_stride);
357 ILVRL_B2_SH(zero, src0, dst0, dst1);
358 ILVRL_B2_SH(zero, src1, dst2, dst3);
359 ILVR_B2_SH(zero, src2, zero, src3, dst4, dst5);
360 ILVRL_B2_SH(zero, src4, dst6, dst7);
361 ILVRL_B2_SH(zero, src5, dst8, dst9);
362 ILVR_B2_SH(zero, src6, zero, src7, dst10, dst11);
363 SLLI_4V(dst0, dst1, dst2, dst3, 6);
364 SLLI_4V(dst4, dst5, dst6, dst7, 6);
365 SLLI_4V(dst8, dst9, dst10, dst11, 6);
366 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in4, in1, in5, dst0, dst1, dst2, dst3,
367 7, dst0, dst1, dst2, dst3);
368 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in2, in6, dst4, dst5, dst6, dst7,
369 7, dst4, dst5, dst6, dst7);
370 HEVC_BI_RND_CLIP4_MAX_SATU(in3, in7, in10, in11, dst8, dst9, dst10,
371 dst11, 7, dst8, dst9, dst10, dst11);
372 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
373 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
374 ST_UB4(out0, out1, out3, out4, dst, dst_stride);
375 ST8x4_UB(out2, out5, dst + 16, dst_stride);
376 dst += (4 * dst_stride);
/* Bi-prediction "copy" for 32-pixel-wide blocks, 2 rows per iteration.
 * Each row is two 16-byte loads; the four src1_ptr vectors per row cover
 * the 32 columns. Widen <<6, blend, round(7), clip, pack, store two
 * 16-byte vectors per row.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`zero` declarations, the `dst += dst_stride` between the
 * two row stores and closing braces are missing from this view. Code left
 * byte-identical. */
380 static void hevc_bi_copy_32w_msa(uint8_t *src0_ptr,
389 v16u8 out0, out1, out2, out3;
390 v16i8 src0, src1, src2, src3;
392 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
393 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
395 for (loop_cnt = (height >> 1); loop_cnt--;) {
396 LD_SB2(src0_ptr, 16, src0, src1);
397 src0_ptr += src_stride;
398 LD_SB2(src0_ptr, 16, src2, src3);
399 src0_ptr += src_stride;
400 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
401 src1_ptr += src2_stride;
402 LD_SH4(src1_ptr, 8, in4, in5, in6, in7);
403 src1_ptr += src2_stride;
405 ILVRL_B2_SH(zero, src0, dst0, dst1);
406 ILVRL_B2_SH(zero, src1, dst2, dst3);
407 ILVRL_B2_SH(zero, src2, dst4, dst5);
408 ILVRL_B2_SH(zero, src3, dst6, dst7);
409 SLLI_4V(dst0, dst1, dst2, dst3, 6);
410 SLLI_4V(dst4, dst5, dst6, dst7, 6);
411 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
412 7, dst0, dst1, dst2, dst3);
413 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
414 7, dst4, dst5, dst6, dst7);
415 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
416 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
417 ST_UB2(out0, out1, dst, 16);
419 ST_UB2(out2, out3, dst, 16);
/* Bi-prediction "copy" for 48-pixel-wide blocks, 2 rows per iteration.
 * Three 16-byte loads per row (src0..src2 / src3..src5); six src1_ptr
 * vectors per row cover the 48 columns. Same widen/blend/pack pipeline;
 * stores are two 16-byte vectors plus one at dst + 32 per row.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`zero` declarations, the `dst += dst_stride` between the
 * row stores and closing braces are missing from this view. Code left
 * byte-identical. */
424 static void hevc_bi_copy_48w_msa(uint8_t *src0_ptr,
433 v16u8 out0, out1, out2, out3, out4, out5;
434 v16i8 src0, src1, src2, src3, src4, src5;
436 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9, dst10;
437 v8i16 in0, in1, in2, in3, in4, in5, in6, in7, in8, in9, in10, in11, dst11;
439 for (loop_cnt = (height >> 1); loop_cnt--;) {
440 LD_SB3(src0_ptr, 16, src0, src1, src2);
441 src0_ptr += src_stride;
442 LD_SB3(src0_ptr, 16, src3, src4, src5);
443 src0_ptr += src_stride;
445 LD_SH6(src1_ptr, 8, in0, in1, in2, in3, in4, in5);
446 src1_ptr += src2_stride;
447 LD_SH6(src1_ptr, 8, in6, in7, in8, in9, in10, in11);
448 src1_ptr += src2_stride;
450 ILVRL_B2_SH(zero, src0, dst0, dst1);
451 ILVRL_B2_SH(zero, src1, dst2, dst3);
452 ILVRL_B2_SH(zero, src2, dst4, dst5);
453 ILVRL_B2_SH(zero, src3, dst6, dst7);
454 ILVRL_B2_SH(zero, src4, dst8, dst9);
455 ILVRL_B2_SH(zero, src5, dst10, dst11);
457 SLLI_4V(dst0, dst1, dst2, dst3, 6);
458 SLLI_4V(dst4, dst5, dst6, dst7, 6);
459 SLLI_4V(dst8, dst9, dst10, dst11, 6);
461 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
462 7, dst0, dst1, dst2, dst3);
463 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
464 7, dst4, dst5, dst6, dst7);
465 HEVC_BI_RND_CLIP4_MAX_SATU(in8, in9, in10, in11, dst8, dst9, dst10,
466 dst11, 7, dst8, dst9, dst10, dst11);
467 PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
468 PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
469 ST_UB2(out0, out1, dst, 16);
470 ST_UB(out2, dst + 32);
472 ST_UB2(out3, out4, dst, 16);
473 ST_UB(out5, dst + 32);
/* Bi-prediction "copy" for 64-pixel-wide blocks, 1 row per iteration.
 * Four 16-byte loads and eight src1_ptr vectors cover one 64-column row;
 * same widen <<6 / blend / round(7) / clip / pack pipeline, stored as four
 * consecutive 16-byte vectors.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`zero` declarations, the trailing `dst += dst_stride`
 * and closing braces are missing from this view. Code left byte-identical. */
478 static void hevc_bi_copy_64w_msa(uint8_t *src0_ptr,
487 v16u8 out0, out1, out2, out3;
488 v16i8 src0, src1, src2, src3;
490 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
491 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
493 for (loop_cnt = height; loop_cnt--;) {
494 LD_SB4(src0_ptr, 16, src0, src1, src2, src3);
495 src0_ptr += src_stride;
496 LD_SH8(src1_ptr, 8, in0, in1, in2, in3, in4, in5, in6, in7);
497 src1_ptr += src2_stride;
499 ILVRL_B2_SH(zero, src0, dst0, dst1);
500 ILVRL_B2_SH(zero, src1, dst2, dst3);
501 ILVRL_B2_SH(zero, src2, dst4, dst5);
502 ILVRL_B2_SH(zero, src3, dst6, dst7);
503 SLLI_4V(dst0, dst1, dst2, dst3, 6);
504 SLLI_4V(dst4, dst5, dst6, dst7, 6);
505 HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
506 7, dst0, dst1, dst2, dst3);
507 HEVC_BI_RND_CLIP4_MAX_SATU(in4, in5, in6, in7, dst4, dst5, dst6, dst7,
508 7, dst4, dst5, dst6, dst7);
509 PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
510 PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
512 ST_UB4(out0, out1, out2, out3, dst, 16);
/* Horizontal 8-tap luma filter + bi-prediction blend, 4-pixel width,
 * 8 rows per iteration. Pixels are XORed with 128 (signed-byte trick for
 * the dot-product intrinsics); VSHF_B4 gathers the tap windows per mask and
 * DPADD accumulates against filt0..filt3 (filter split with SPLATI_H4).
 * Filtered rows are blended with the src1_ptr samples via
 * HEVC_BI_RND_CLIP4 (add, round by 7, clip), packed and stored 4 wide.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt` declaration, `src0_ptr -= 3`, the mask1..mask3
 * derivations, the `dstN = const_vec` initializers and closing braces are
 * missing from this view. Code left byte-identical. */
517 static void hevc_hz_bi_8t_4w_msa(uint8_t *src0_ptr,
523 const int8_t *filter,
527 v8i16 filt0, filt1, filt2, filt3;
528 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
529 v16i8 mask1, mask2, mask3;
530 v16i8 vec0, vec1, vec2, vec3;
531 v8i16 dst0, dst1, dst2, dst3;
532 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
533 v8i16 filter_vec, const_vec;
534 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
538 /* rearranging filter */
539 filter_vec = LD_SH(filter);
540 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
546 const_vec = __msa_ldi_h(128);
549 for (loop_cnt = (height >> 3); loop_cnt--;) {
550 LD_SB8(src0_ptr, src_stride, src0, src1, src2, src3,
551 src4, src5, src6, src7);
552 src0_ptr += (8 * src_stride);
553 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
554 src1_ptr += (8 * src2_stride);
556 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
557 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
558 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
560 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
561 vec0, vec1, vec2, vec3);
563 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
564 dst0, dst0, dst0, dst0);
565 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
566 vec0, vec1, vec2, vec3);
568 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
569 dst1, dst1, dst1, dst1);
570 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
571 vec0, vec1, vec2, vec3);
573 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
574 dst2, dst2, dst2, dst2);
575 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
576 vec0, vec1, vec2, vec3);
578 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
579 dst3, dst3, dst3, dst3);
581 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
582 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
584 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
585 ST4x8_UB(dst0, dst1, dst, dst_stride);
586 dst += (8 * dst_stride);
/* Horizontal 8-tap filter + bi-prediction blend, 8-pixel width, 4 rows per
 * iteration. One VSHF_B4/DPADD chain per row (src shuffled against itself),
 * then blend with src1_ptr samples, round(7), clip, pack, store 8 wide.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt` declaration, `src0_ptr -= 3`, mask1..mask3 derivations,
 * `dstN = const_vec` initializers and closing braces are missing from this
 * view. Code left byte-identical. */
590 static void hevc_hz_bi_8t_8w_msa(uint8_t *src0_ptr,
596 const int8_t *filter,
600 v8i16 filt0, filt1, filt2, filt3;
601 v16i8 src0, src1, src2, src3;
602 v16i8 mask1, mask2, mask3;
603 v16i8 vec0, vec1, vec2, vec3;
604 v8i16 dst0, dst1, dst2, dst3;
605 v8i16 in0, in1, in2, in3;
606 v8i16 filter_vec, const_vec;
607 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
611 const_vec = __msa_ldi_h(128);
614 filter_vec = LD_SH(filter);
615 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
621 for (loop_cnt = (height >> 2); loop_cnt--;) {
622 LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
623 src0_ptr += (4 * src_stride);
624 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
625 src1_ptr += (4 * src2_stride);
626 XORI_B4_128_SB(src0, src1, src2, src3);
628 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
629 vec0, vec1, vec2, vec3);
631 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
632 dst0, dst0, dst0, dst0);
633 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
634 vec0, vec1, vec2, vec3);
636 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
637 dst1, dst1, dst1, dst1);
638 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
639 vec0, vec1, vec2, vec3);
641 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
642 dst2, dst2, dst2, dst2);
643 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
644 vec0, vec1, vec2, vec3);
646 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
647 dst3, dst3, dst3, dst3);
649 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
650 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
652 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
653 ST8x4_UB(dst0, dst1, dst, dst_stride);
654 dst += (4 * dst_stride);
/* Horizontal 8-tap filter + bi-prediction blend, 12-pixel width: delegates
 * to the 8-wide kernel for columns 0-7 and the 4-wide kernel for columns
 * 8-11 (all pointers offset by 8).
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail and closing brace are missing from this view. Code left
 * byte-identical. */
658 static void hevc_hz_bi_8t_12w_msa(uint8_t *src0_ptr,
664 const int8_t *filter,
667 hevc_hz_bi_8t_8w_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
668 dst, dst_stride, filter, height);
669 hevc_hz_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
670 dst + 8, dst_stride, filter, height);
/* Horizontal 8-tap filter + bi-prediction blend, 16-pixel width, 2 rows per
 * iteration. Each row is loaded as two 8-byte-offset vectors (LD_SB2 with
 * stride 8) so each filtered output covers 8 columns; four VSHF/DPADD
 * chains per iteration, then blend, pack and store full 16-byte rows.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt` declaration, `src0_ptr -= 3`, mask1..mask3 derivations,
 * `dstN = const_vec` initializers and closing braces are missing from this
 * view. Code left byte-identical. */
673 static void hevc_hz_bi_8t_16w_msa(uint8_t *src0_ptr,
679 const int8_t *filter,
683 v16i8 src0, src1, src2, src3;
684 v8i16 filt0, filt1, filt2, filt3;
685 v16i8 mask1, mask2, mask3;
686 v16i8 vec0, vec1, vec2, vec3;
687 v8i16 dst0, dst1, dst2, dst3;
688 v8i16 in0, in1, in2, in3;
689 v8i16 filter_vec, const_vec;
690 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
693 const_vec = __msa_ldi_h(128);
696 filter_vec = LD_SH(filter);
697 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
703 for (loop_cnt = (height >> 1); loop_cnt--;) {
704 LD_SB2(src0_ptr, 8, src0, src1);
705 src0_ptr += src_stride;
706 LD_SB2(src0_ptr, 8, src2, src3);
707 src0_ptr += src_stride;
708 LD_SH2(src1_ptr, 8, in0, in1);
709 src1_ptr += src2_stride;
710 LD_SH2(src1_ptr, 8, in2, in3);
711 src1_ptr += src2_stride;
712 XORI_B4_128_SB(src0, src1, src2, src3);
714 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
715 vec0, vec1, vec2, vec3);
717 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
718 dst0, dst0, dst0, dst0);
719 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
720 vec0, vec1, vec2, vec3);
722 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
723 dst1, dst1, dst1, dst1);
724 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
725 vec0, vec1, vec2, vec3);
727 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
728 dst2, dst2, dst2, dst2);
729 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
730 vec0, vec1, vec2, vec3);
732 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
733 dst3, dst3, dst3, dst3);
735 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
736 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
738 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
739 ST_SH2(dst0, dst1, dst, dst_stride);
740 dst += (2 * dst_stride);
/* Horizontal 8-tap filter + bi-prediction blend, 24-pixel width, 1 row per
 * iteration. Columns 0-7 from src0 alone, 8-15 from the src0/src1 seam
 * (mask4..mask7), 16-23 from src1 alone. The third result is blended
 * scalar-style (adds/srari/clip on dst2) and stored with SD at dst + 16.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`dst_val0` declarations, mask1..mask7 derivations,
 * `dstN = const_vec` initializers, the ST_SB for the first 16 bytes and
 * closing braces are missing from this view. Code left byte-identical. */
744 static void hevc_hz_bi_8t_24w_msa(uint8_t *src0_ptr,
750 const int8_t *filter,
755 v16i8 src0, src1, tmp0, tmp1;
756 v8i16 filt0, filt1, filt2, filt3;
757 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
758 v16i8 vec0, vec1, vec2, vec3;
759 v8i16 dst0, dst1, dst2;
761 v8i16 filter_vec, const_vec;
762 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
764 src0_ptr = src0_ptr - 3;
765 const_vec = __msa_ldi_h(128);
768 filter_vec = LD_SH(filter);
769 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
779 for (loop_cnt = height; loop_cnt--;) {
780 LD_SB2(src0_ptr, 16, src0, src1);
781 src0_ptr += src_stride;
782 LD_SH2(src1_ptr, 8, in0, in1);
783 in2 = LD_SH(src1_ptr + 16);
784 src1_ptr += src2_stride;
785 XORI_B2_128_SB(src0, src1);
787 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
788 vec0, vec1, vec2, vec3);
790 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
791 dst0, dst0, dst0, dst0);
792 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
793 vec0, vec1, vec2, vec3);
795 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
796 dst1, dst1, dst1, dst1);
797 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
798 vec0, vec1, vec2, vec3);
800 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
801 dst2, dst2, dst2, dst2);
803 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
804 dst2 = __msa_adds_s_h(dst2, in2);
805 dst2 = __msa_srari_h(dst2, 7);
806 dst2 = CLIP_SH_0_255(dst2);
808 PCKEV_B2_SB(dst1, dst0, dst2, dst2, tmp0, tmp1);
809 dst_val0 = __msa_copy_u_d((v2i64) tmp1, 0);
811 SD(dst_val0, dst + 16);
/* Horizontal 8-tap filter + bi-prediction blend, 32-pixel width, 1 row per
 * iteration. Columns 0-7 from src0, 8-15 from the src0/src1 seam
 * (mask4..mask7), 16-23 from src1, 24-31 from src2 (extra load at
 * src0_ptr + 24). Blend, pack and store two 16-byte vectors per row.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt` declaration, `src0_ptr -= 3`, mask1..mask7 derivations,
 * `dstN = const_vec` initializers and closing braces are missing from this
 * view. Code left byte-identical. */
816 static void hevc_hz_bi_8t_32w_msa(uint8_t *src0_ptr,
822 const int8_t *filter,
826 v16i8 src0, src1, src2, tmp0, tmp1;
827 v8i16 filt0, filt1, filt2, filt3;
828 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
829 v16i8 vec0, vec1, vec2, vec3;
830 v8i16 dst0, dst1, dst2, dst3;
831 v8i16 in0, in1, in2, in3;
832 v8i16 filter_vec, const_vec;
833 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
836 const_vec = __msa_ldi_h(128);
839 filter_vec = LD_SH(filter);
840 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
850 for (loop_cnt = height; loop_cnt--;) {
851 LD_SB2(src0_ptr, 16, src0, src1);
852 src2 = LD_SB(src0_ptr + 24);
853 src0_ptr += src_stride;
854 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
855 src1_ptr += src2_stride;
856 XORI_B3_128_SB(src0, src1, src2);
858 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
859 vec0, vec1, vec2, vec3);
861 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
862 dst0, dst0, dst0, dst0);
863 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
864 vec0, vec1, vec2, vec3);
866 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
867 dst1, dst1, dst1, dst1);
868 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
869 vec0, vec1, vec2, vec3);
871 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
872 dst2, dst2, dst2, dst2);
873 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
874 vec0, vec1, vec2, vec3);
876 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
877 dst3, dst3, dst3, dst3);
879 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
880 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
882 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
883 ST_SB2(tmp0, tmp1, dst, 16);
/* Horizontal 8-tap filter + bi-prediction blend, 48-pixel width, 1 row per
 * iteration, processed in three 16-column stages: (1) src0 + src0/src1
 * seam, (2) src1 + src1/src2 seam, (3) src2 and src3 independently (src2/
 * src3 loaded 8 bytes apart from src0_ptr + 32). Each stage blends with
 * its pair of src1_ptr vectors and stores its 16 bytes.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt` declaration, `src0_ptr -= 3`, mask1..mask7 derivations,
 * `dstN = const_vec` initializers, the ST_SB(tmp0, dst) store and closing
 * braces are missing from this view. Code left byte-identical. */
888 static void hevc_hz_bi_8t_48w_msa(uint8_t *src0_ptr,
894 const int8_t *filter,
898 v16i8 src0, src1, src2, src3;
899 v16i8 tmp0, tmp1, tmp2;
900 v8i16 filt0, filt1, filt2, filt3;
901 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
902 v16i8 vec0, vec1, vec2, vec3;
903 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
904 v8i16 in0, in1, in2, in3, in4, in5;
905 v8i16 filter_vec, const_vec;
906 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
910 const_vec = __msa_ldi_h(128);
913 filter_vec = LD_SH(filter);
914 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
924 for (loop_cnt = height; loop_cnt--;) {
925 LD_SB2(src0_ptr, 16, src0, src1);
926 XORI_B2_128_SB(src0, src1);
927 LD_SH2(src1_ptr, 8, in0, in1);
929 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
930 vec0, vec1, vec2, vec3);
932 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
933 dst0, dst0, dst0, dst0);
934 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
935 vec0, vec1, vec2, vec3);
937 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
938 dst1, dst1, dst1, dst1);
940 HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
942 tmp0 = __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
/* columns 16-31 */
945 LD_SB2(src0_ptr + 32, 8, src2, src3);
946 XORI_B2_128_SB(src2, src3);
947 src0_ptr += src_stride;
949 LD_SH2(src1_ptr + 16, 8, in2, in3);
951 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
952 vec0, vec1, vec2, vec3);
954 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
955 dst2, dst2, dst2, dst2);
956 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
957 vec0, vec1, vec2, vec3);
959 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
960 dst3, dst3, dst3, dst3);
962 HEVC_BI_RND_CLIP2(in2, in3, dst2, dst3, 7, dst2, dst3);
964 tmp1 = __msa_pckev_b((v16i8) dst3, (v16i8) dst2);
965 ST_SB(tmp1, dst + 16);
/* columns 32-47 */
967 LD_SH2(src1_ptr + 32, 8, in4, in5);
968 src1_ptr += src2_stride;
970 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
971 vec0, vec1, vec2, vec3);
973 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
974 dst4, dst4, dst4, dst4);
975 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
976 vec0, vec1, vec2, vec3);
978 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
979 dst5, dst5, dst5, dst5);
981 HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
983 tmp2 = __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
984 ST_SB(tmp2, dst + 32);
/* Horizontal 8-tap filter + bi-prediction blend, 64-pixel width: per row,
 * an inner `cnt = 2` loop runs the 32-wide stage twice on temporary
 * pointers (src0_ptr_tmp / src1_ptr_tmp / dst_tmp), covering columns 0-31
 * then 32-63. Same src0 / seam / src1 / src2(+24) decomposition as the
 * 32-wide kernel.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt`/`cnt`/`dst_tmp` declarations, `src0_ptr -= 3`,
 * mask1..mask7 derivations, `dstN = const_vec` initializers, the pointer
 * advances at the end of the inner loop, `dst += dst_stride` and closing
 * braces are missing from this view. Code left byte-identical. */
989 static void hevc_hz_bi_8t_64w_msa(uint8_t *src0_ptr,
995 const int8_t *filter,
998 uint8_t *src0_ptr_tmp;
1000 int16_t *src1_ptr_tmp;
1003 v16i8 src0, src1, src2, tmp0, tmp1;
1004 v8i16 filt0, filt1, filt2, filt3;
1005 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1006 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1007 v16i8 vec0, vec1, vec2, vec3;
1008 v8i16 dst0, dst1, dst2, dst3;
1009 v8i16 in0, in1, in2, in3;
1010 v8i16 filter_vec, const_vec;
1014 const_vec = __msa_ldi_h(128);
1017 filter_vec = LD_SH(filter);
1018 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1028 for (loop_cnt = height; loop_cnt--;) {
1029 src0_ptr_tmp = src0_ptr;
1031 src1_ptr_tmp = src1_ptr;
1033 for (cnt = 2; cnt--;) {
1034 LD_SB2(src0_ptr_tmp, 16, src0, src1);
1035 src2 = LD_SB(src0_ptr_tmp + 24);
1037 LD_SH4(src1_ptr_tmp, 8, in0, in1, in2, in3);
1039 XORI_B3_128_SB(src0, src1, src2);
1041 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1042 vec0, vec1, vec2, vec3);
1044 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1045 dst0, dst0, dst0, dst0);
1046 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
1047 vec0, vec1, vec2, vec3);
1049 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1050 dst1, dst1, dst1, dst1);
1051 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1052 vec0, vec1, vec2, vec3);
1054 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1055 dst2, dst2, dst2, dst2);
1056 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1057 vec0, vec1, vec2, vec3);
1059 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1060 dst3, dst3, dst3, dst3);
1062 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1063 dst0, dst1, dst2, dst3, 7,
1064 dst0, dst1, dst2, dst3);
1066 PCKEV_B2_SB(dst1, dst0, dst3, dst2, tmp0, tmp1);
1067 ST_SB2(tmp0, tmp1, dst_tmp, 16);
1071 src1_ptr += src2_stride;
1072 src0_ptr += src_stride;
/* Vertical 8-tap filter + bi-prediction blend, 4-pixel width, 8 rows per
 * iteration. Primes the tap window with 7 rows (src0..src6), byte-
 * interleaves consecutive row pairs and packs two 4-wide columns per
 * vector (ILVR_D* -> src2110 etc.). Each output row is a DPADD of four
 * interleaved-row vectors against filt0..filt3; results are blended with
 * the src1_ptr samples, rounded by 7, clipped, packed and stored 4 wide.
 * At loop end the newest interleaves are rotated into src2110/src4332/
 * src6554 for the next 8 rows.
 * NOTE(review): extract garbled — stray leading numerals; parameter-list
 * tail, `loop_cnt` declaration, the `dstN = const_vec` initializers, the
 * `src2110 = src10998` rotation line, `src6 = src14` and closing braces
 * are missing from this view. Code left byte-identical. */
1077 static void hevc_vt_bi_8t_4w_msa(uint8_t *src0_ptr,
1080 int32_t src2_stride,
1083 const int8_t *filter,
1087 v16i8 src0, src1, src2, src3, src4, src5;
1088 v16i8 src6, src7, src8, src9, src10;
1089 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1090 v16i8 src11, src12, src13, src14;
1091 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1092 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1093 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
1094 v16i8 src2110, src4332, src6554, src8776, src10998;
1095 v16i8 src12111110, src14131312;
1096 v8i16 dst10, dst32, dst54, dst76;
1097 v8i16 filt0, filt1, filt2, filt3;
1098 v8i16 filter_vec, const_vec;
1100 src0_ptr -= (3 * src_stride);
1102 const_vec = __msa_ldi_h(128);
1105 filter_vec = LD_SH(filter);
1106 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1108 LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1109 src0_ptr += (7 * src_stride);
1110 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1111 src10_r, src32_r, src54_r, src21_r);
1112 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1113 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1114 src2110, src4332, src6554);
1115 XORI_B3_128_SB(src2110, src4332, src6554);
1117 for (loop_cnt = (height >> 3); loop_cnt--;) {
1118 LD_SB8(src0_ptr, src_stride,
1119 src7, src8, src9, src10, src11, src12, src13, src14);
1120 src0_ptr += (8 * src_stride);
1121 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
1122 src1_ptr += (8 * src2_stride);
1124 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1125 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
1126 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1127 src76_r, src87_r, src98_r, src109_r);
1128 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1129 src1110_r, src1211_r, src1312_r, src1413_r);
1130 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r, src1211_r, src1110_r,
1131 src1413_r, src1312_r,
1132 src8776, src10998, src12111110, src14131312);
1133 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
1136 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1137 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1139 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1140 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1142 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1143 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1145 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1146 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1148 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1149 dst10, dst32, dst54, dst76, 7,
1150 dst10, dst32, dst54, dst76);
1152 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
1153 ST4x8_UB(dst10, dst54, dst, dst_stride);
1154 dst += (8 * dst_stride);
/* rotate the tap window for the next 8 rows */
1157 src4332 = src12111110;
1158 src6554 = src14131312;
/* Vertical 8-tap luma filter with bi-prediction, 8-pixel-wide blocks.
 * Same scheme as the 4-wide variant but one full 8-wide row per vector:
 * vertical 8-tap filtering of src0_ptr, addition of the s16 first-pass
 * samples from src1_ptr, round-by-7 and clip to [0, 255].  Produces
 * 4 output rows per loop iteration (height processed in multiples of 4).
 * NOTE(review): parameter lines and some statements (accumulator init,
 * tap-register carry-over at loop end) are elided in this view. */
1163 static void hevc_vt_bi_8t_8w_msa(uint8_t *src0_ptr,
1166                                  int32_t src2_stride,
1169                                  const int8_t *filter,
1173     v16i8 src0, src1, src2, src3, src4, src5;
1174     v16i8 src6, src7, src8, src9, src10;
1175     v8i16 in0, in1, in2, in3;
1176     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1177     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1178     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1179     v8i16 filt0, filt1, filt2, filt3;
1180     v8i16 filter_vec, const_vec;
/* Back up 3 rows so the first output row sees all 8 taps. */
1182     src0_ptr -= (3 * src_stride);
1183     const_vec = __msa_ldi_h(128);
1186     filter_vec = LD_SH(filter);
1187     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: first 7 rows, signed-range flip, adjacent-row interleave. */
1189     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1190     src0_ptr += (7 * src_stride);
1191     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1192     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1193                src10_r, src32_r, src54_r, src21_r);
1194     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1196     for (loop_cnt = (height >> 2); loop_cnt--;) {
1197         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1198         src0_ptr += (4 * src_stride);
1199         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1200         src1_ptr += (4 * src2_stride);
1201         XORI_B4_128_SB(src7, src8, src9, src10);
1202         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1203                    src76_r, src87_r, src98_r, src109_r);
/* Four 8-tap vertical dot products, one per output row. */
1206         DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1207                      filt0, filt1, filt2, filt3,
1208                      dst0_r, dst0_r, dst0_r, dst0_r);
1210         DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1211                      filt0, filt1, filt2, filt3,
1212                      dst1_r, dst1_r, dst1_r, dst1_r);
1214         DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1215                      filt0, filt1, filt2, filt3,
1216                      dst2_r, dst2_r, dst2_r, dst2_r);
1218         DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1219                      filt0, filt1, filt2, filt3,
1220                      dst3_r, dst3_r, dst3_r, dst3_r);
/* Bi-prediction average: add first-pass, round by 7, clip to 8 bits. */
1222         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1223                           dst0_r, dst1_r, dst2_r, dst3_r, 7,
1224                           dst0_r, dst1_r, dst2_r, dst3_r);
1226         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1227         ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
1228         dst += (4 * dst_stride);
/* Vertical 8-tap luma filter with bi-prediction, 12-pixel-wide blocks.
 * Lower 8 columns are filtered from the right-interleaved (_r) vectors;
 * the upper 4 columns come from the left-interleaved (_l) halves packed
 * two-rows-per-vector (src2110/src4332/...).  Adds s16 first-pass
 * samples from src1_ptr, rounds by 7, clips to [0, 255]; 4 output rows
 * per iteration, stored with ST12x4_UB.
 * NOTE(review): parameter lines and some statements (accumulator init,
 * tap-register carry-over at loop end) are elided in this view. */
1241 static void hevc_vt_bi_8t_12w_msa(uint8_t *src0_ptr,
1244                                   int32_t src2_stride,
1247                                   const int8_t *filter,
1251     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1252     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
1253     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1254     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1255     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1256     v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1257     v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1258     v16i8 src2110, src4332, src6554, src8776, src10998;
1259     v8i16 dst0_l, dst1_l;
1260     v8i16 filt0, filt1, filt2, filt3;
1261     v8i16 filter_vec, const_vec;
/* Back up 3 rows so the first output row sees all 8 taps. */
1263     src0_ptr -= (3 * src_stride);
1264     const_vec = __msa_ldi_h(128);
1267     filter_vec = LD_SH(filter);
1268     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: first 7 rows; build both right and left interleaves. */
1270     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1271     src0_ptr += (7 * src_stride);
1272     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1274     ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1275                src10_r, src32_r, src54_r, src21_r);
1276     ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1277     ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1278                src10_l, src32_l, src54_l, src21_l);
1279     ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* Pack the 4-wide left halves two-rows-per-vector for the top columns. */
1280     ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1281                src2110, src4332, src6554);
1283     for (loop_cnt = (height >> 2); loop_cnt--;) {
1284         LD_SB4(src0_ptr, src_stride, src7, src8, src9, src10);
1285         src0_ptr += (4 * src_stride);
/* First-pass samples: 8 low columns (in0..3) + 4 high columns (in4..7). */
1286         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
1287         LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
1288         src1_ptr += (4 * src2_stride);
1290         ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
1291         XORI_B4_128_SB(src7, src8, src9, src10);
1292         ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1293                    src76_r, src87_r, src98_r, src109_r);
1294         ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1295                    src76_l, src87_l, src98_l, src109_l);
1296         ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* 8-tap vertical filtering: lower 8 columns, one row per accumulator. */
1299         DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1300                      filt0, filt1, filt2, filt3,
1301                      dst0_r, dst0_r, dst0_r, dst0_r);
1303         DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1304                      filt0, filt1, filt2, filt3,
1305                      dst1_r, dst1_r, dst1_r, dst1_r);
1307         DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1308                      filt0, filt1, filt2, filt3,
1309                      dst2_r, dst2_r, dst2_r, dst2_r);
1311         DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1312                      filt0, filt1, filt2, filt3,
1313                      dst3_r, dst3_r, dst3_r, dst3_r);
/* Upper 4 columns: two packed rows per accumulator. */
1315         DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1316                      filt0, filt1, filt2, filt3,
1317                      dst0_l, dst0_l, dst0_l, dst0_l);
1319         DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1320                      filt0, filt1, filt2, filt3,
1321                      dst1_l, dst1_l, dst1_l, dst1_l);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
1323         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1324                           dst0_r, dst1_r, dst2_r, dst3_r, 7,
1325                           dst0_r, dst1_r, dst2_r, dst3_r);
1326         HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
1329         PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1330         dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
1331         ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
1332         dst += (4 * dst_stride);
/* Vertical 8-tap luma filter with bi-prediction for widths that are a
 * multiple of 16.  Tiles the block in 16-column stripes (outer loop on
 * width >> 4); within each stripe, produces 2 output rows per inner
 * iteration using right (_r, low 8 columns) and left (_l, high 8
 * columns) byte interleaves.  Adds s16 first-pass samples, rounds by 7,
 * clips to [0, 255].
 * NOTE(review): parameter lines and some statements (dst_tmp setup,
 * accumulator init, tap carry-over, stripe pointer advance) are elided
 * in this view. */
1347 static void hevc_vt_bi_8t_16multx2mult_msa(uint8_t *src0_ptr,
1350                                            int32_t src2_stride,
1353                                            const int8_t *filter,
1354                                            int32_t height, int32_t width)
1356     uint8_t *src0_ptr_tmp;
1357     int16_t *src1_ptr_tmp;
1361     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1362     v8i16 in0, in1, in2, in3;
1363     v16i8 src10_r, src32_r, src54_r, src76_r;
1364     v16i8 src21_r, src43_r, src65_r, src87_r;
1365     v8i16 dst0_r, dst1_r;
1366     v16i8 src10_l, src32_l, src54_l, src76_l;
1367     v16i8 src21_l, src43_l, src65_l, src87_l;
1368     v8i16 dst0_l, dst1_l;
1369     v8i16 filt0, filt1, filt2, filt3;
1370     v8i16 filter_vec, const_vec;
/* Back up 3 rows so the first output row sees all 8 taps. */
1372     src0_ptr -= (3 * src_stride);
1373     const_vec = __msa_ldi_h(128);
1376     filter_vec = LD_SH(filter);
1377     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* One pass per 16-column stripe. */
1379     for (cnt = (width >> 4); cnt--;) {
1380         src0_ptr_tmp = src0_ptr;
1381         src1_ptr_tmp = src1_ptr;
/* Prologue for this stripe: first 7 rows, both interleave halves. */
1384         LD_SB7(src0_ptr_tmp, src_stride,
1385                src0, src1, src2, src3, src4, src5, src6);
1386         src0_ptr_tmp += (7 * src_stride);
1387         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1389         ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1390                    src10_r, src32_r, src54_r, src21_r);
1391         ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1392         ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1393                    src10_l, src32_l, src54_l, src21_l);
1394         ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* Two output rows per inner iteration. */
1396         for (loop_cnt = (height >> 1); loop_cnt--;) {
1397             LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1398             src0_ptr_tmp += (2 * src_stride);
/* First-pass samples: low 8 columns (in0/in1), high 8 (in2/in3). */
1399             LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1400             LD_SH2((src1_ptr_tmp + 8), src2_stride, in2, in3);
1401             src1_ptr_tmp += (2 * src2_stride);
1402             XORI_B2_128_SB(src7, src8);
1404             ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
1405             ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* 8-tap dot products: low half rows 0/1, then high half rows 0/1. */
1408             DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1409                          filt0, filt1, filt2, filt3,
1410                          dst0_r, dst0_r, dst0_r, dst0_r);
1412             DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1413                          filt0, filt1, filt2, filt3,
1414                          dst1_r, dst1_r, dst1_r, dst1_r);
1416             DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1417                          filt0, filt1, filt2, filt3,
1418                          dst0_l, dst0_l, dst0_l, dst0_l);
1420             DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1421                          filt0, filt1, filt2, filt3,
1422                          dst1_l, dst1_l, dst1_l, dst1_l);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
1424             HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
1425                               dst0_r, dst1_r, dst0_l, dst1_l, 7,
1426                               dst0_r, dst1_r, dst0_l, dst1_l);
1428             PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
1429             ST_SH2(dst0_r, dst1_r, dst_tmp, dst_stride);
1430             dst_tmp += (2 * dst_stride);
/* 16-wide vertical 8-tap bi-prediction: one 16-column stripe. */
1453 static void hevc_vt_bi_8t_16w_msa(uint8_t *src0_ptr,
1456                                   int32_t src2_stride,
1459                                   const int8_t *filter,
1462     hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1463                                    dst, dst_stride, filter, height, 16);
/* 24-wide vertical 8-tap bi-prediction: 16-column stripe followed by
 * the 8-wide kernel for the remaining columns (offset +16). */
1466 static void hevc_vt_bi_8t_24w_msa(uint8_t *src0_ptr,
1469                                   int32_t src2_stride,
1472                                   const int8_t *filter,
1475     hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1476                                    dst, dst_stride, filter, height, 16);
1477     hevc_vt_bi_8t_8w_msa(src0_ptr + 16, src_stride, src1_ptr + 16, src2_stride,
1478                          dst + 16, dst_stride, filter, height);
/* 32-wide vertical 8-tap bi-prediction: two 16-column stripes. */
1481 static void hevc_vt_bi_8t_32w_msa(uint8_t *src0_ptr,
1484                                   int32_t src2_stride,
1487                                   const int8_t *filter,
1490     hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1491                                    dst, dst_stride, filter, height, 32);
/* 48-wide vertical 8-tap bi-prediction: three 16-column stripes. */
1494 static void hevc_vt_bi_8t_48w_msa(uint8_t *src0_ptr,
1497                                   int32_t src2_stride,
1500                                   const int8_t *filter,
1503     hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1504                                    dst, dst_stride, filter, height, 48);
/* 64-wide vertical 8-tap bi-prediction: four 16-column stripes. */
1507 static void hevc_vt_bi_8t_64w_msa(uint8_t *src0_ptr,
1510                                   int32_t src2_stride,
1513                                   const int8_t *filter,
1516     hevc_vt_bi_8t_16multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1517                                    dst, dst_stride, filter, height, 64);
/* 2-D (horizontal then vertical) 8-tap luma filter with bi-prediction,
 * 4-pixel-wide blocks.  Horizontal filtering is done on bytes with
 * VSHF/DPADD (two source rows packed per vector via mask0's 0..4/16..20
 * layout); the intermediate 16-bit rows are then filtered vertically in
 * 32-bit precision with HEVC_FILT_8TAP.  The s16 first-pass samples
 * from src1_ptr are added in 32-bit, rounded by 7 and clipped to
 * [0, 255]; 2 output rows per iteration.
 * NOTE(review): parameter lines and some statements (mask1..3 setup,
 * accumulator init, tap-register shifting at loop end) are elided in
 * this view. */
1520 static void hevc_hv_bi_8t_4w_msa(uint8_t *src0_ptr,
1523                                  int32_t src2_stride,
1526                                  const int8_t *filter_x,
1527                                  const int8_t *filter_y,
1531     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1533     v8i16 filt0, filt1, filt2, filt3;
1534     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1535     v16i8 mask1, mask2, mask3;
1536     v8i16 filter_vec, const_vec;
1537     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1538     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
/* dstXY: horizontal filter results for rows X and Y packed together. */
1539     v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1540     v4i32 dst0_r, dst1_r, in0_r, in0_l;
1541     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1542     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
/* Shuffle mask pairing pixels of two packed rows for the 4-wide case. */
1543     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1544     v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* Rewind 3 rows and 3 columns so all 8x8 taps are available. */
1546     src0_ptr -= ((3 * src_stride) + 3);
1547     filter_vec = LD_SH(filter_x);
1548     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Sign-extend the vertical coefficients to 16 bits, then splat as 32. */
1550     filter_vec = LD_SH(filter_y);
1551     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1552     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1554     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1560     const_vec = __msa_ldi_h(128);
/* Prologue: horizontally filter the first 7 rows (packed in pairs). */
1563     LD_SB7(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5, src6);
1564     src0_ptr += (7 * src_stride);
1565     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1567     /* row 0 row 1 row 2 row 3 */
1568     VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1569     VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1570     VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1571                vec8, vec9, vec10, vec11);
1572     VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1573                vec12, vec13, vec14, vec15);
1576     DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1577                  dst30, dst30, dst30, dst30);
1579     DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1580                  dst41, dst41, dst41, dst41);
1582     DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1583                  dst52, dst52, dst52, dst52);
1585     DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1586                  dst63, dst63, dst63, dst63);
/* Re-interleave the packed rows into vertical-tap order. */
1588     ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1589                dst10_r, dst21_r, dst32_r);
1590     dst43_r = __msa_ilvl_h(dst41, dst30);
1591     dst54_r = __msa_ilvl_h(dst52, dst41);
1592     dst65_r = __msa_ilvl_h(dst63, dst52);
/* dst66: duplicate of row 6's result, the newest vertical tap. */
1593     dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1595     for (loop_cnt = height >> 1; loop_cnt--;) {
1596         LD_SB2(src0_ptr, src_stride, src7, src8);
1597         src0_ptr += (2 * src_stride);
1598         LD_SH2(src1_ptr, src2_stride, in0, in1);
1599         src1_ptr += (2 * src2_stride);
/* Pack two 4-wide first-pass rows into one vector. */
1601         in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
1602         XORI_B2_128_SB(src7, src8);
/* Horizontal filter of the two new rows (packed together). */
1604         VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1605                    vec0, vec1, vec2, vec3);
1607         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1608                      dst87, dst87, dst87, dst87);
/* Vertical 8-tap filter in 32-bit for each of the two output rows. */
1609         dst76_r = __msa_ilvr_h(dst87, dst66);
1610         dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1611                                 filt_h0, filt_h1, filt_h2, filt_h3);
1612         dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
1613         dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1614                                 filt_h0, filt_h1, filt_h2, filt_h3);
/* Bi-prediction in 32-bit: add first-pass, round by 7, clip, pack. */
1618         UNPCK_SH_SW(in0, in0_r, in0_l);
1619         dst0_r = __msa_adds_s_w(dst0_r, in0_r);
1620         dst1_r = __msa_adds_s_w(dst1_r, in0_l);
1621         SRARI_W2_SW(dst0_r, dst1_r, 7);
1622         dst0_r = CLIP_SW_0_255(dst0_r);
1623         dst1_r = CLIP_SW_0_255(dst1_r);
1625         HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
1626         ST4x2_UB(dst0_r, dst, dst_stride);
1627         dst += (2 * dst_stride);
/* Keep the newest horizontal result as next iteration's oldest tap. */
1635         dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
/* 2-D (horizontal then vertical) 8-tap luma filter with bi-prediction
 * for widths that are a multiple of 8.  Tiles the block in 8-column
 * stripes (outer loop on width >> 3).  Each stripe: horizontally filter
 * rows to 16-bit with VSHF/DPADD, vertically filter in 32-bit with
 * HEVC_FILT_8TAP (right and left interleave halves), add the s16
 * first-pass samples, round by 7, clip to [0, 255]; 2 output rows per
 * inner iteration.
 * NOTE(review): parameter lines and several statements (mask1..3 setup,
 * dst_tmp setup, accumulator init, tap shifting, stripe advance) are
 * elided in this view. */
1639 static void hevc_hv_bi_8t_8multx2mult_msa(uint8_t *src0_ptr,
1642                                           int32_t src2_stride,
1645                                           const int8_t *filter_x,
1646                                           const int8_t *filter_y,
1647                                           int32_t height, int32_t width)
1651     uint8_t *src0_ptr_tmp;
1652     int16_t *src1_ptr_tmp;
1654     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1656     v4i32 in0_r, in0_l, in1_r, in1_l;
1657     v8i16 filt0, filt1, filt2, filt3;
1658     v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
/* Shuffle mask pairing adjacent pixels within a single 8-wide row. */
1659     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1660     v16i8 mask1, mask2, mask3;
1661     v8i16 filter_vec, const_vec;
1662     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1663     v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1664     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1665     v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1666     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1667     v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1668     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1669     v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
/* Rewind 3 rows and 3 columns so all 8x8 taps are available. */
1671     src0_ptr -= ((3 * src_stride) + 3);
1672     const_vec = __msa_ldi_h(128);
1675     filter_vec = LD_SH(filter_x);
1676     SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Sign-extend vertical coefficients to 16 bits, splat as 32-bit. */
1678     filter_vec = LD_SH(filter_y);
1679     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1680     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1681     SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* One pass per 8-column stripe. */
1687     for (cnt = width >> 3; cnt--;) {
1688         src0_ptr_tmp = src0_ptr;
1690         src1_ptr_tmp = src1_ptr;
/* Prologue: horizontally filter the first 7 rows of the stripe. */
1692         LD_SB7(src0_ptr_tmp, src_stride,
1693                src0, src1, src2, src3, src4, src5, src6);
1694         src0_ptr_tmp += (7 * src_stride);
1695         XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1697         /* row 0 row 1 row 2 row 3 */
1698         VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1699                    vec0, vec1, vec2, vec3);
1700         VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1701                    vec4, vec5, vec6, vec7);
1702         VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1703                    vec8, vec9, vec10, vec11);
1704         VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1705                    vec12, vec13, vec14, vec15);
1707         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1708                      dst0, dst0, dst0, dst0);
1710         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1711                      dst1, dst1, dst1, dst1);
1713         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1714                      dst2, dst2, dst2, dst2);
1716         DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1717                      dst3, dst3, dst3, dst3);
/* Rows 4..6 of the prologue. */
1719         VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1720                    vec0, vec1, vec2, vec3);
1721         VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1722                    vec4, vec5, vec6, vec7);
1723         VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1724                    vec8, vec9, vec10, vec11);
1726         DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1727                      dst4, dst4, dst4, dst4);
1729         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1730                      dst5, dst5, dst5, dst5);
1732         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1733                      dst6, dst6, dst6, dst6);
/* Interleave intermediate rows into vertical-tap order (r/l halves). */
1735         ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1736                    dst10_r, dst32_r, dst54_r, dst21_r);
1737         ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1738         ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1739                    dst10_l, dst32_l, dst54_l, dst21_l);
1740         ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
/* Two output rows per inner iteration. */
1742         for (loop_cnt = height >> 1; loop_cnt--;) {
1744             LD_SB2(src0_ptr_tmp, src_stride, src7, src8);
1745             XORI_B2_128_SB(src7, src8);
1746             src0_ptr_tmp += 2 * src_stride;
1748             LD_SH2(src1_ptr_tmp, src2_stride, in0, in1);
1749             src1_ptr_tmp += (2 * src2_stride);
/* Horizontal filter of new row 7, then vertical filter for row 0. */
1751             VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1752                        vec0, vec1, vec2, vec3);
1754             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1755                          dst7, dst7, dst7, dst7);
1757             ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1758             dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1759                                     filt_h0, filt_h1, filt_h2, filt_h3);
1760             dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1761                                     filt_h0, filt_h1, filt_h2, filt_h3);
/* Same for new row 8 / output row 1. */
1765             VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1766                        vec0, vec1, vec2, vec3);
1768             DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1769                          dst8, dst8, dst8, dst8);
1771             ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1772             dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1773                                     filt_h0, filt_h1, filt_h2, filt_h3);
1774             dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1775                                     filt_h0, filt_h1, filt_h2, filt_h3);
/* Bi-prediction in 32-bit: add first-pass, round by 7, clip, pack. */
1779             UNPCK_SH_SW(in0, in0_r, in0_l);
1780             UNPCK_SH_SW(in1, in1_r, in1_l);
1781             in0_r = __msa_adds_s_w(in0_r, dst0_r);
1782             in0_l = __msa_adds_s_w(in0_l, dst0_l);
1783             in1_r = __msa_adds_s_w(in1_r, dst1_r);
1784             in1_l = __msa_adds_s_w(in1_l, dst1_l);
1785             SRARI_W4_SW(in0_r, in0_l, in1_r, in1_l, 7);
1786             in0_r = CLIP_SW_0_255(in0_r);
1787             in0_l = CLIP_SW_0_255(in0_l);
1788             in1_r = CLIP_SW_0_255(in1_r);
1789             in1_l = CLIP_SW_0_255(in1_l);
1791             HEVC_PCK_SW_SB4(in0_l, in0_r, in1_l, in1_r, dst0_r);
1792             ST8x2_UB(dst0_r, dst_tmp, dst_stride);
1793             dst_tmp += (2 * dst_stride);
/* 8-wide 2-D 8-tap bi-prediction: one 8-column stripe. */
1816 static void hevc_hv_bi_8t_8w_msa(uint8_t *src0_ptr,
1819                                  int32_t src2_stride,
1822                                  const int8_t *filter_x,
1823                                  const int8_t *filter_y,
1826     hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1827                                   dst, dst_stride, filter_x, filter_y,
/* 12-wide 2-D 8-tap bi-prediction: 8-column stripe plus the 4-wide
 * kernel for the remaining columns (offset +8). */
1831 static void hevc_hv_bi_8t_12w_msa(uint8_t *src0_ptr,
1834                                   int32_t src2_stride,
1837                                   const int8_t *filter_x,
1838                                   const int8_t *filter_y,
1841     hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1842                                   dst, dst_stride, filter_x, filter_y,
1845     hevc_hv_bi_8t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
1846                          dst + 8, dst_stride, filter_x, filter_y, height);
/* 16-wide 2-D 8-tap bi-prediction via the 8-column-stripe kernel. */
1849 static void hevc_hv_bi_8t_16w_msa(uint8_t *src0_ptr,
1852                                   int32_t src2_stride,
1855                                   const int8_t *filter_x,
1856                                   const int8_t *filter_y,
1859     hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1860                                   dst, dst_stride, filter_x, filter_y,
/* 24-wide 2-D 8-tap bi-prediction via the 8-column-stripe kernel. */
1864 static void hevc_hv_bi_8t_24w_msa(uint8_t *src0_ptr,
1867                                   int32_t src2_stride,
1870                                   const int8_t *filter_x,
1871                                   const int8_t *filter_y,
1874     hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1875                                   dst, dst_stride, filter_x, filter_y,
/* 32-wide 2-D 8-tap bi-prediction via the 8-column-stripe kernel. */
1879 static void hevc_hv_bi_8t_32w_msa(uint8_t *src0_ptr,
1882                                   int32_t src2_stride,
1885                                   const int8_t *filter_x,
1886                                   const int8_t *filter_y,
1889     hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1890                                   dst, dst_stride, filter_x, filter_y,
/* 48-wide 2-D 8-tap bi-prediction via the 8-column-stripe kernel. */
1894 static void hevc_hv_bi_8t_48w_msa(uint8_t *src0_ptr,
1897                                   int32_t src2_stride,
1900                                   const int8_t *filter_x,
1901                                   const int8_t *filter_y,
1904     hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1905                                   dst, dst_stride, filter_x, filter_y,
/* 64-wide 2-D 8-tap bi-prediction via the 8-column-stripe kernel. */
1909 static void hevc_hv_bi_8t_64w_msa(uint8_t *src0_ptr,
1912                                   int32_t src2_stride,
1915                                   const int8_t *filter_x,
1916                                   const int8_t *filter_y,
1919     hevc_hv_bi_8t_8multx2mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
1920                                   dst, dst_stride, filter_x, filter_y,
/* Horizontal 4-tap chroma filter with bi-prediction, 4x2 block.
 * Filters two rows (packed into one vector by mask0), adds the s16
 * first-pass samples from src1_ptr, rounds by 7, clips to [0, 255] and
 * stores 2 rows of 4 u8 pixels.
 * NOTE(review): parameter lines and some statements (src0_ptr -= 1,
 * mask1 setup, accumulator init) are elided in this view. */
1924 static void hevc_hz_bi_4t_4x2_msa(uint8_t *src0_ptr,
1927                                   int32_t src2_stride,
1930                                   const int8_t *filter,
1934     v16i8 src0, src1, dst0, vec0, vec1;
/* Mask pairing pixels across the two packed rows (0..4 / 16..20). */
1936     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1939     v8i16 filter_vec, const_vec;
1943     const_vec = __msa_ldi_h(128);
/* Broadcast the two 16-bit coefficient pairs of the 4-tap filter. */
1946     filter_vec = LD_SH(filter);
1947     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1951     LD_SB2(src0_ptr, src_stride, src0, src1);
1952     LD_SH2(src1_ptr, src2_stride, in0, in1);
/* Pack the two 4-wide first-pass rows into one vector. */
1953     in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
1954     XORI_B2_128_SB(src0, src1);
1955     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1957     DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
1959     tmp0 = __msa_adds_s_h(tmp0, in0);
1960     tmp0 = __msa_srari_h(tmp0, 7);
1961     tmp0 = CLIP_SH_0_255(tmp0);
1962     dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
1964     ST4x2_UB(dst0, dst, dst_stride);
/* Horizontal 4-tap chroma filter with bi-prediction, 4x4 block.
 * Filters four rows packed in pairs, adds the s16 first-pass samples,
 * rounds by 7, clips to [0, 255] and stores 4 rows of 4 u8 pixels.
 * NOTE(review): parameter lines and some statements are elided in
 * this view. */
1967 static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
1970                                   int32_t src2_stride,
1973                                   const int8_t *filter,
1977     v16i8 src0, src1, src2, src3, dst0, vec0, vec1;
1978     v8i16 in0, in1, in2, in3;
/* Mask pairing pixels across two packed rows (0..4 / 16..20). */
1979     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1982     v8i16 filter_vec, const_vec;
1986     const_vec = __msa_ldi_h(128);
1989     filter_vec = LD_SH(filter);
1990     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1994     LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
1995     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
/* Pack pairs of 4-wide first-pass rows into full vectors. */
1997     ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
1998     XORI_B4_128_SB(src0, src1, src2, src3);
/* 4-tap horizontal dot products: rows 0/1 then rows 2/3. */
2000     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2002     DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2003     VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2005     DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
2006     HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
2007     dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
2009     ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal 4-tap chroma filter with bi-prediction, 4-wide blocks
 * whose height is a multiple of 8.  Processes 8 rows per iteration,
 * two rows packed per vector; adds s16 first-pass samples, rounds by
 * 7, clips to [0, 255].
 * NOTE(review): parameter lines and some statements are elided in
 * this view. */
2012 static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
2015                                           int32_t src2_stride,
2018                                           const int8_t *filter,
2023     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2025     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* Mask pairing pixels across two packed rows (0..4 / 16..20). */
2026     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
2027     v16i8 mask1, vec0, vec1;
2028     v8i16 tmp0, tmp1, tmp2, tmp3;
2029     v8i16 filter_vec, const_vec;
2033     const_vec = __msa_ldi_h(128);
2036     filter_vec = LD_SH(filter);
2037     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2041     for (loop_cnt = (height >> 3); loop_cnt--;) {
2042         LD_SB8(src0_ptr, src_stride,
2043                src0, src1, src2, src3, src4, src5, src6, src7);
2044         src0_ptr += (8 * src_stride);
2045         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2046         src1_ptr += (4 * src2_stride);
2047         LD_SH4(src1_ptr, src2_stride, in4, in5, in6, in7);
2048         src1_ptr += (4 * src2_stride);
/* Pack pairs of 4-wide first-pass rows into full vectors. */
2049         ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2050         ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2051         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* Four 4-tap dot products, each covering a packed row pair. */
2053         VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
2055         DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp0, tmp0);
2056         VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
2058         DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp1, tmp1);
2059         VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
2061         DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp2, tmp2);
2062         VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
2064         DPADD_SB2_SH(vec0, vec1, filt0, filt1, tmp3, tmp3);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
2066         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2067                           tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
2069         PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
2070         ST4x8_UB(dst0, dst1, dst, dst_stride);
2071         dst += (8 * dst_stride);
/* Dispatcher for 4-wide horizontal 4-tap bi-prediction: picks the
 * fixed-height kernel (2 or 4) or the multiple-of-8 kernel.
 * NOTE(review): the "if (2 == height)" line is elided in this view. */
2075 static void hevc_hz_bi_4t_4w_msa(uint8_t *src0_ptr,
2078                                  int32_t src2_stride,
2081                                  const int8_t *filter,
2085         hevc_hz_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2086                               dst, dst_stride, filter, height);
2087     } else if (4 == height) {
2088         hevc_hz_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2089                               dst, dst_stride, filter, height);
2090     } else if (8 == height || 16 == height) {
2091         hevc_hz_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2092                                       src1_ptr, src2_stride,
2093                                       dst, dst_stride, filter, height);
/* Horizontal 4-tap chroma filter with bi-prediction, 6-pixel-wide
 * blocks.  Filters a full 8-wide row per vector, adds s16 first-pass
 * samples, rounds by 7, clips to [0, 255] and stores only 6 of the 8
 * columns (ST6x4_UB); 4 rows per iteration.
 * NOTE(review): parameter lines and some statements are elided in
 * this view. */
2097 static void hevc_hz_bi_4t_6w_msa(uint8_t *src0_ptr,
2100                                  int32_t src2_stride,
2103                                  const int8_t *filter,
2108     v16i8 src0, src1, src2, src3;
2109     v8i16 in0, in1, in2, in3;
/* Mask pairing adjacent pixels within a single row. */
2110     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2113     v8i16 dst0, dst1, dst2, dst3;
2114     v8i16 filter_vec, const_vec;
2118     const_vec = __msa_ldi_h(128);
2121     filter_vec = LD_SH(filter);
2122     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2126     for (loop_cnt = (height >> 2); loop_cnt--;) {
2127         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2128         src0_ptr += (4 * src_stride);
2129         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2130         src1_ptr += (4 * src2_stride);
2131         XORI_B4_128_SB(src0, src1, src2, src3);
/* One 4-tap dot product per row. */
2133         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2135         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2136         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2138         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2139         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2141         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2142         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2144         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
2146         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2147                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2149         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2150         ST6x4_UB(dst0, dst1, dst, dst_stride);
2151         dst += (4 * dst_stride);
/* Horizontal 4-tap chroma filter with bi-prediction, 8x2 block.
 * One 4-tap dot product per row; adds s16 first-pass samples, rounds
 * by 7, clips to [0, 255] and stores 2 rows of 8 u8 pixels.
 * NOTE(review): parameter lines and some statements are elided in
 * this view. */
2155 static void hevc_hz_bi_4t_8x2_msa(uint8_t *src0_ptr,
2158                                   int32_t src2_stride,
2161                                   const int8_t *filter,
/* Mask pairing adjacent pixels within a single row. */
2167     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2168     v16i8 mask1, vec0, vec1;
2170     v8i16 filter_vec, const_vec;
2174     const_vec = __msa_ldi_h(128);
2177     filter_vec = LD_SH(filter);
2178     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2182     LD_SB2(src0_ptr, src_stride, src0, src1);
2183     LD_SH2(src1_ptr, src2_stride, in0, in1);
2184     XORI_B2_128_SB(src0, src1);
2186     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2188     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2189     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2191     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
2192     HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
2194     dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
2195     ST8x2_UB(dst0, dst, dst_stride);
/* Horizontal 4-tap chroma filter with bi-prediction, 8x6 block.
 * Six 4-tap dot products (one per row); adds s16 first-pass samples,
 * rounds by 7, clips to [0, 255]; stores 4 rows then 2 rows.
 * NOTE(review): parameter lines and some statements are elided in
 * this view. */
2198 static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
2201                                   int32_t src2_stride,
2204                                   const int8_t *filter,
2208     v16i8 src0, src1, src2, src3, src4, src5;
2209     v8i16 in0, in1, in2, in3, in4, in5;
/* Mask pairing adjacent pixels within a single row. */
2210     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2213     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2214     v8i16 filter_vec, const_vec;
2218     const_vec = __msa_ldi_h(128);
2221     filter_vec = LD_SH(filter);
2222     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2226     LD_SB6(src0_ptr, src_stride, src0, src1, src2, src3, src4, src5);
2227     LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2228     src1_ptr += (4 * src2_stride);
2229     LD_SH2(src1_ptr, src2_stride, in4, in5);
2230     XORI_B6_128_SB(src0, src1, src2, src3, src4, src5);
/* One 4-tap dot product per row. */
2232     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2234     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2235     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2237     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2238     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2240     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2241     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2243     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2244     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2246     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2247     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2249     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
2251     HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2252                       dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2253     HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2255     PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2256     dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2257     ST8x4_UB(dst0, dst1, dst, dst_stride);
2258     dst += (4 * dst_stride);
2259     ST8x2_UB(dst2, dst, dst_stride);
/* Horizontal 4-tap chroma filter with bi-prediction, 8-wide blocks
 * whose height is a multiple of 4.  Four rows per iteration; adds s16
 * first-pass samples, rounds by 7, clips to [0, 255].
 * NOTE(review): parameter lines and some statements are elided in
 * this view. */
2262 static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
2265                                           int32_t src2_stride,
2268                                           const int8_t *filter,
2273     v16i8 src0, src1, src2, src3;
2274     v8i16 in0, in1, in2, in3;
/* Mask pairing adjacent pixels within a single row. */
2275     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2278     v8i16 dst0, dst1, dst2, dst3;
2279     v8i16 filter_vec, const_vec;
2283     const_vec = __msa_ldi_h(128);
2286     filter_vec = LD_SH(filter);
2287     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2291     for (loop_cnt = (height >> 2); loop_cnt--;) {
2292         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2293         src0_ptr += (4 * src_stride);
2294         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2295         src1_ptr += (4 * src2_stride);
2296         XORI_B4_128_SB(src0, src1, src2, src3);
/* One 4-tap dot product per row. */
2298         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2300         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2301         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2303         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2304         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2306         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2307         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2309         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
2311         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2312                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2314         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2315         ST8x4_UB(dst0, dst1, dst, dst_stride);
2316         dst += (4 * dst_stride);
/* Dispatcher for 8-wide horizontal 4-tap bi-prediction: picks the
 * fixed-height kernel (2 or 6) or the multiple-of-4 kernel.
 * NOTE(review): the "if (2 == height)" line is elided in this view. */
2320 static void hevc_hz_bi_4t_8w_msa(uint8_t *src0_ptr,
2323                                  int32_t src2_stride,
2326                                  const int8_t *filter,
2330         hevc_hz_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2331                               dst, dst_stride, filter, height);
2332     } else if (6 == height) {
2333         hevc_hz_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2334                               dst, dst_stride, filter, height);
2335     } else if (0 == (height % 4)) {
2336         hevc_hz_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
2337                                       src1_ptr, src2_stride,
2338                                       dst, dst_stride, filter, height);
/* Horizontal 4-tap chroma filter with bi-prediction, 12-pixel-wide
 * blocks.  Lower 8 columns use mask0/mask1 per row; the upper 4
 * columns of two rows are filtered together via mask2/mask3 (8..12 /
 * 24..28 layout).  Adds s16 first-pass samples, rounds by 7, clips to
 * [0, 255]; 4 rows per iteration, stored with ST12x4_UB.
 * NOTE(review): parameter lines and some statements (mask1 setup,
 * mask2 opening line, accumulator init) are elided in this view. */
2342 static void hevc_hz_bi_4t_12w_msa(uint8_t *src0_ptr,
2345                                   int32_t src2_stride,
2348                                   const int8_t *filter,
2353     v16i8 src0, src1, src2, src3;
2354     v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2355     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2357         8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2361     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2362     v8i16 filter_vec, const_vec;
2366     const_vec = __msa_ldi_h(128);
2369     filter_vec = LD_SH(filter);
2370     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2375     for (loop_cnt = (height >> 2); loop_cnt--;) {
2376         LD_SB4(src0_ptr, src_stride, src0, src1, src2, src3);
2377         src0_ptr += (4 * src_stride);
/* First-pass samples: 8 low columns (in0..3) + 4 high columns (in4..7). */
2378         LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2379         LD_SH4(src1_ptr + 8, src2_stride, in4, in5, in6, in7);
2380         src1_ptr += (4 * src2_stride);
2382         ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
2383         XORI_B4_128_SB(src0, src1, src2, src3);
/* Lower 8 columns: one 4-tap dot product per row. */
2385         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2387         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2388         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2390         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2391         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2393         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2394         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2396         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* Upper 4 columns of row pairs via mask2/mask3. */
2397         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2399         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2400         VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2402         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
/* Bi-prediction: add first-pass, round by 7, clip to 8 bits. */
2404         HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2405                           dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2406         HEVC_BI_RND_CLIP2(in4, in5, dst4, dst5, 7, dst4, dst5);
2408         PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2409         dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
2410         ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
2411         dst += (4 * dst_stride);
/* Horizontal 4-tap bi-prediction filter, 16 pixels wide, 4 rows/iteration.
 * Each row is split into two 8-column halves: even-numbered src vectors hold
 * the left half, odd-numbered ones the half loaded at byte offset +8, so the
 * eight dot-product accumulators cover 4 rows x 2 halves. */
2415 static void hevc_hz_bi_4t_16w_msa(uint8_t *src0_ptr,
2418 int32_t src2_stride,
2421 const int8_t *filter,
2425 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2426 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2428 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2430 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2432 v8i16 filter_vec, const_vec;
/* replicate 128 across all halfword lanes (accumulator bias) */
2436 const_vec = __msa_ldi_h(128);
2439 filter_vec = LD_SH(filter);
2440 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2444 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* left halves into src0/2/4/6, right halves (offset +8) into src1/3/5/7 */
2445 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2446 LD_SB4(src0_ptr + 8, src_stride, src1, src3, src5, src7);
2447 src0_ptr += (4 * src_stride);
2448 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2449 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2450 src1_ptr += (4 * src2_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2451 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2453 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2455 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2456 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2458 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2459 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2461 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2462 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2464 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2465 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2467 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2468 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2470 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2471 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2473 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2474 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2476 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
/* add predictor, round by 7 bits, clamp to [0, 255] */
2478 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2479 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2480 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2481 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
/* pack even bytes: each result vector now holds one full 16-pixel row */
2483 PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2484 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2485 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2486 dst += (4 * dst_stride);
/* Horizontal 4-tap bi-prediction filter, 24 pixels wide, 4 rows/iteration.
 * The left 16 columns are handled like the 16-wide case and stored through
 * dst; the remaining 8 right columns reuse the already-loaded src1/3/5/7
 * vectors (loaded at offset +16) in a second pass, with their predictor
 * samples read through src1_ptr_tmp and the output stored through dst_tmp.
 * NOTE(review): the dst_tmp declaration/initialization lines are not
 * visible in this view. */
2490 static void hevc_hz_bi_4t_24w_msa(uint8_t *src0_ptr,
2493 int32_t src2_stride,
2496 const int8_t *filter,
2499 int16_t *src1_ptr_tmp;
2502 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2503 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2505 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2506 v16i8 mask1, mask2, mask3;
2508 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2509 v8i16 filter_vec, const_vec;
2513 const_vec = __msa_ldi_h(128);
2516 filter_vec = LD_SH(filter);
2517 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* predictor pointer for the right-hand 8 columns (offset +16) */
2524 src1_ptr_tmp = src1_ptr + 16;
2526 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* src0/2/4/6: left 16 bytes of each row; src1/3/5/7: bytes at +16 */
2527 LD_SB4(src0_ptr, src_stride, src0, src2, src4, src6);
2528 LD_SB4(src0_ptr + 16, src_stride, src1, src3, src5, src7);
2529 src0_ptr += (4 * src_stride);
2530 LD_SH4(src1_ptr, src2_stride, in0, in2, in4, in6);
2531 LD_SH4(src1_ptr + 8, src2_stride, in1, in3, in5, in7);
2532 src1_ptr += (4 * src2_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2533 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* columns 0..7 per row via mask0/mask1, columns 8..15 straddle the
 * src pair and use the cross-register masks mask2/mask3 */
2535 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2537 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2538 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2540 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2541 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2543 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2544 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2546 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2547 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2549 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2550 VSHF_B2_SB(src4, src5, src4, src5, mask2, mask3, vec0, vec1);
2552 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2553 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2555 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2556 VSHF_B2_SB(src6, src7, src6, src7, mask2, mask3, vec0, vec1);
2558 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
/* add predictor, round by 7 bits, clamp to [0, 255] */
2560 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2561 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2562 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
2563 dst4, dst5, dst6, dst7, 7, dst4, dst5, dst6, dst7);
2565 PCKEV_B4_SH(dst1, dst0, dst3, dst2,
2566 dst5, dst4, dst7, dst6, dst0, dst1, dst2, dst3);
2567 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2568 dst += (4 * dst_stride);
/* second pass: the 8 right-hand columns (16..23) of the same 4 rows */
2570 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
2571 src1_ptr_tmp += (4 * src2_stride);
2573 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2575 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2576 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2578 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2579 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2581 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2582 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2584 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2586 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2587 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2589 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2590 ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
2591 dst_tmp += (4 * dst_stride);
/* Horizontal 4-tap bi-prediction filter, 32 pixels wide.
 * One row is produced per half-iteration (the loop runs height >> 1 times
 * with the body unrolled twice). Each row needs src0/src1 (two aligned
 * 16-byte loads) plus an extra unaligned load at +24 (src2) so the shuffles
 * for the last 8 columns have the look-ahead bytes they need. */
2595 static void hevc_hz_bi_4t_32w_msa(uint8_t *src0_ptr,
2598 int32_t src2_stride,
2601 const int8_t *filter,
2605 v16i8 src0, src1, src2;
2606 v8i16 in0, in1, in2, in3;
2608 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2609 v16i8 mask1, mask2, mask3;
2610 v8i16 dst0, dst1, dst2, dst3;
2612 v8i16 filter_vec, const_vec;
2616 const_vec = __msa_ldi_h(128);
2619 filter_vec = LD_SH(filter);
2620 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2626 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* first row of the pair */
2627 LD_SB2(src0_ptr, 16, src0, src1);
2628 src2 = LD_SB(src0_ptr + 24);
2629 src0_ptr += src_stride;
/* predictor: four consecutive 8-halfword chunks covering all 32 columns */
2630 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2631 src1_ptr += src2_stride;
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2632 XORI_B3_128_SB(src0, src1, src2);
2634 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2636 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
/* columns 8..15 straddle src0/src1: cross-register masks mask2/mask3 */
2637 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2639 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2640 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2642 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2643 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2645 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* add predictor, round by 7 bits, clamp to [0, 255] */
2647 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2648 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2650 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2651 ST_SH2(dst0, dst1, dst, 16);
/* second row of the pair: identical sequence */
2654 LD_SB2(src0_ptr, 16, src0, src1);
2655 src2 = LD_SB(src0_ptr + 24);
2656 src0_ptr += src_stride;
2657 LD_SH4(src1_ptr, 8, in0, in1, in2, in3);
2658 src1_ptr += src2_stride;
2659 XORI_B3_128_SB(src0, src1, src2);
2661 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2663 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2664 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2666 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2667 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2669 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2670 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2672 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2674 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2675 dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
2677 PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
2678 ST_SH2(dst0, dst1, dst, 16);
/* Vertical 4-tap bi-prediction filter, 4 pixels wide, exactly 2 rows.
 * Starts one row above the block (src0_ptr -= src_stride) so the filter's
 * leading tap is covered. Row pairs are byte-interleaved and then packed
 * two rows per vector (src2110 / src4332), so both output rows come from a
 * single pair of dot-product steps. */
2683 static void hevc_vt_bi_4t_4x2_msa(uint8_t *src0_ptr,
2686 int32_t src2_stride,
2689 const int8_t *filter,
2692 v16i8 src0, src1, src2, src3, src4;
2694 v16i8 src10_r, src32_r, src21_r, src43_r, src2110, src4332;
2697 v8i16 filter_vec, const_vec;
2699 src0_ptr -= src_stride;
2701 const_vec = __msa_ldi_h(128);
2704 filter_vec = LD_SH(filter);
2705 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* three setup rows above/at the block start */
2707 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2708 src0_ptr += (3 * src_stride);
2710 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2711 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2712 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2714 LD_SB2(src0_ptr, src_stride, src3, src4);
/* predictor for both rows folded into one vector */
2715 LD_SH2(src1_ptr, src2_stride, in0, in1);
2716 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
2717 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2718 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2719 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2722 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
/* add predictor, round by 7 bits, clamp to [0, 255], pack and store */
2723 dst10 = __msa_adds_s_h(dst10, in0);
2724 dst10 = __msa_srari_h(dst10, 7);
2725 dst10 = CLIP_SH_0_255(dst10);
2727 dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
2728 ST4x2_UB(dst10, dst, dst_stride);
/* Vertical 4-tap bi-prediction filter, 4 pixels wide, exactly 4 rows.
 * Same packing scheme as the 4x2 kernel: interleaved row pairs are merged
 * two-per-vector (src2110 / src4332 / src6554), so the four output rows
 * need only two dot-product accumulators (dst10, dst32). */
2731 static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
2734 int32_t src2_stride,
2737 const int8_t *filter,
2740 v16i8 src0, src1, src2, src3, src4, src5, src6;
2741 v8i16 in0, in1, in2, in3;
2742 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2743 v16i8 src2110, src4332, src6554;
2746 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
2748 src0_ptr -= src_stride;
2750 const_vec = __msa_ldi_h(128);
2753 filter_vec = LD_SH(filter);
2754 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2756 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2757 src0_ptr += (3 * src_stride);
2758 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2759 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2760 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2762 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
/* predictor rows folded pairwise: (in0,in1) and (in2,in3) */
2763 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2764 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2765 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2766 src32_r, src43_r, src54_r, src65_r);
2767 ILVR_D2_SB(src43_r, src32_r, src65_r, src54_r, src4332, src6554);
2768 XORI_B2_128_SB(src4332, src6554);
2771 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2773 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
/* add predictor, round by 7 bits, clamp to [0, 255] */
2774 HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
2776 dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
2777 ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
/* Vertical 4-tap bi-prediction filter, 4 pixels wide, height a multiple
 * of 8. Eight rows are produced per iteration; interleaved row pairs are
 * packed two-per-vector (src4332/src6554/src8776 plus a recycled src2110)
 * so four dot-product accumulators cover all eight rows. The last
 * interleave pair is carried in src2110 for the next iteration. */
2780 static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
2783 int32_t src2_stride,
2786 const int8_t *filter,
2790 v16i8 src0, src1, src2, src3, src4, src5;
2791 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
2792 v16i8 src6, src7, src8, src9;
2793 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
2794 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
2795 v16i8 src2110, src4332, src6554, src8776;
2796 v8i16 dst10, dst32, dst54, dst76;
2798 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
2800 src0_ptr -= src_stride;
2802 const_vec = __msa_ldi_h(128);
2805 filter_vec = LD_SH(filter);
2806 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2808 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2809 src0_ptr += (3 * src_stride);
2810 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2811 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2812 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2814 for (loop_cnt = (height >> 3); loop_cnt--;) {
2815 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
2816 src0_ptr += (6 * src_stride);
/* predictor rows folded pairwise into in0..in3 */
2817 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
2818 src1_ptr += (8 * src2_stride);
2819 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
2820 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
2821 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2822 src32_r, src43_r, src54_r, src65_r);
2823 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2824 ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
2825 src4332, src6554, src8776);
2826 XORI_B3_128_SB(src4332, src6554, src8776);
2829 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2831 DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2833 DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
/* last two rows of the group; src2110 becomes next iteration's history */
2835 LD_SB2(src0_ptr, src_stride, src9, src2);
2836 src0_ptr += (2 * src_stride);
2837 ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
2838 src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
2839 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2841 DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
/* add predictor, round by 7 bits, clamp to [0, 255] */
2843 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2844 dst10, dst32, dst54, dst76, 7,
2845 dst10, dst32, dst54, dst76);
2847 PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
2848 ST4x8_UB(dst10, dst54, dst, dst_stride);
2849 dst += (8 * dst_stride);
/* Vertical 4-tap bi-prediction filter for 4-pixel-wide blocks.
 * Pure dispatcher: height 2 and height 4 use fixed-height kernels, all
 * other heights go to the multiple-of-8-rows kernel.
 * NOTE(review): some parameter and brace lines are not visible in this view. */
2853 static void hevc_vt_bi_4t_4w_msa(uint8_t *src0_ptr,
2856 int32_t src2_stride,
2859 const int8_t *filter,
2863 hevc_vt_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2864 dst, dst_stride, filter, height);
2865 } else if (4 == height) {
2866 hevc_vt_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
2867 dst, dst_stride, filter, height);
2869 hevc_vt_bi_4t_4x8multiple_msa(src0_ptr, src_stride,
2870 src1_ptr, src2_stride,
2871 dst, dst_stride, filter, height);
/* Vertical 4-tap bi-prediction filter, 6 pixels wide, 4 rows/iteration.
 * Only the right-interleaved (low 8 byte) halves are filtered; ST6x4 then
 * stores 6 bytes of each of the 4 result rows. The interleave history
 * (src10_r/src21_r and src2) is carried across iterations. */
2875 static void hevc_vt_bi_4t_6w_msa(uint8_t *src0_ptr,
2878 int32_t src2_stride,
2881 const int8_t *filter,
2885 v16i8 src0, src1, src2, src3, src4, src5;
2886 v8i16 in0, in1, in2, in3;
2887 v16i8 src10_r, src32_r, src21_r, src43_r;
2888 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2890 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
2892 src0_ptr -= src_stride;
2894 const_vec = __msa_ldi_h(128);
2897 filter_vec = LD_SH(filter);
2898 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2900 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2901 src0_ptr += (3 * src_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2902 XORI_B3_128_SB(src0, src1, src2);
2903 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2905 for (loop_cnt = (height >> 2); loop_cnt--;) {
2906 LD_SB2(src0_ptr, src_stride, src3, src4);
2907 src0_ptr += (2 * src_stride);
2908 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
2909 src1_ptr += (4 * src2_stride);
2910 XORI_B2_128_SB(src3, src4);
2911 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2914 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2916 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* rows 3 and 4 of the group; src2 ends up holding the newest row */
2918 LD_SB2(src0_ptr, src_stride, src5, src2);
2919 src0_ptr += (2 * src_stride);
2920 XORI_B2_128_SB(src5, src2);
2921 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2924 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2926 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* add predictor, round by 7 bits, clamp to [0, 255] */
2928 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
2929 dst0_r, dst1_r, dst2_r, dst3_r, 7,
2930 dst0_r, dst1_r, dst2_r, dst3_r);
2932 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
2933 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
2934 dst += (4 * dst_stride);
/* Vertical 4-tap bi-prediction filter, 8 pixels wide, exactly 2 rows.
 * Three setup rows are interleaved pairwise, then two more rows complete
 * the filter window; one dot-product accumulator per output row. */
2938 static void hevc_vt_bi_4t_8x2_msa(uint8_t *src0_ptr,
2941 int32_t src2_stride,
2944 const int8_t *filter,
2947 v16i8 src0, src1, src2, src3, src4;
2948 v8i16 in0, in1, dst0_r, dst1_r;
2949 v16i8 src10_r, src32_r, src21_r, src43_r;
2951 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
2953 src0_ptr -= src_stride;
2955 const_vec = __msa_ldi_h(128);
2958 filter_vec = LD_SH(filter);
2959 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2961 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
2962 src0_ptr += (3 * src_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
2963 XORI_B3_128_SB(src0, src1, src2);
2964 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2966 LD_SB2(src0_ptr, src_stride, src3, src4);
2967 LD_SH2(src1_ptr, src2_stride, in0, in1);
2968 XORI_B2_128_SB(src3, src4);
2969 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2972 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2974 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* add predictor, round by 7 bits, clamp to [0, 255], pack and store */
2976 HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
2977 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
2979 ST8x2_UB(dst0_r, dst, dst_stride);
/* Vertical 4-tap bi-prediction filter, 8 pixels wide, exactly 6 rows.
 * All six rows are computed in one straight-line pass (no loop): six
 * interleave pairs feed six dot-product accumulators, then the results are
 * stored as 4 rows + 2 rows. */
2982 static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
2985 int32_t src2_stride,
2988 const int8_t *filter,
2991 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
2992 v8i16 in0, in1, in2, in3, in4, in5;
2993 v16i8 src10_r, src32_r, src54_r, src76_r;
2994 v16i8 src21_r, src43_r, src65_r, src87_r;
2995 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r;
2997 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
2999 src0_ptr -= src_stride;
3001 const_vec = __msa_ldi_h(128);
3004 filter_vec = LD_SH(filter);
3005 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3007 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3008 src0_ptr += (3 * src_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
3009 XORI_B3_128_SB(src0, src1, src2);
3010 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3012 LD_SB6(src0_ptr, src_stride, src3, src4, src5, src6, src7, src8);
3013 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
3014 XORI_B6_128_SB(src3, src4, src5, src6, src7, src8);
3015 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
3016 src32_r, src43_r, src54_r, src65_r);
3017 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3020 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3022 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3024 DPADD_SB2_SH(src32_r, src54_r, filt0, filt1, dst2_r, dst2_r);
3026 DPADD_SB2_SH(src43_r, src65_r, filt0, filt1, dst3_r, dst3_r);
3028 DPADD_SB2_SH(src54_r, src76_r, filt0, filt1, dst4_r, dst4_r);
3030 DPADD_SB2_SH(src65_r, src87_r, filt0, filt1, dst5_r, dst5_r);
/* add predictor, round by 7 bits, clamp to [0, 255] */
3031 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3032 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3033 dst0_r, dst1_r, dst2_r, dst3_r);
3034 HEVC_BI_RND_CLIP2(in4, in5, dst4_r, dst5_r, 7, dst4_r, dst5_r);
3036 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3037 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
/* store rows 0..3, then the remaining rows 4..5 */
3038 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3039 dst += (4 * dst_stride);
3040 ST8x2_UB(dst2_r, dst, dst_stride);
/* Vertical 4-tap bi-prediction filter, 8 pixels wide, height a multiple
 * of 4. Same row-recycling structure as the 6-wide kernel: src10_r/src21_r
 * and src2 carry the interleave history across iterations, with 4 output
 * rows per pass. */
3043 static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
3046 int32_t src2_stride,
3049 const int8_t *filter,
3053 v16i8 src0, src1, src2, src3, src4, src5;
3054 v8i16 in0, in1, in2, in3;
3055 v16i8 src10_r, src32_r, src21_r, src43_r;
3056 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3058 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
3060 src0_ptr -= src_stride;
3062 const_vec = __msa_ldi_h(128);
3065 filter_vec = LD_SH(filter);
3066 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3068 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3069 src0_ptr += (3 * src_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
3070 XORI_B3_128_SB(src0, src1, src2);
3071 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3073 for (loop_cnt = (height >> 2); loop_cnt--;) {
3074 LD_SB2(src0_ptr, src_stride, src3, src4);
3075 src0_ptr += (2 * src_stride);
3076 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3077 src1_ptr += (4 * src2_stride);
3078 XORI_B2_128_SB(src3, src4);
3079 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3082 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3084 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* rows 3 and 4 of the group; src2 ends up holding the newest row */
3086 LD_SB2(src0_ptr, src_stride, src5, src2);
3087 src0_ptr += (2 * src_stride);
3088 XORI_B2_128_SB(src5, src2);
3089 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3092 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3094 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* add predictor, round by 7 bits, clamp to [0, 255] */
3095 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3096 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3097 dst0_r, dst1_r, dst2_r, dst3_r);
3099 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3100 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
3101 dst += (4 * dst_stride);
/* Vertical 4-tap bi-prediction filter for 8-pixel-wide blocks.
 * Pure dispatcher: height 2 and height 6 use fixed-height kernels, all
 * other heights go to the multiple-of-4-rows kernel.
 * NOTE(review): some parameter and brace lines are not visible in this view. */
3105 static void hevc_vt_bi_4t_8w_msa(uint8_t *src0_ptr,
3108 int32_t src2_stride,
3111 const int8_t *filter,
3115 hevc_vt_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3116 dst, dst_stride, filter, height);
3117 } else if (6 == height) {
3118 hevc_vt_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3119 dst, dst_stride, filter, height);
3121 hevc_vt_bi_4t_8x4multiple_msa(src0_ptr, src_stride,
3122 src1_ptr, src2_stride,
3123 dst, dst_stride, filter, height);
/* Vertical 4-tap bi-prediction filter, 12 pixels wide, 4 rows/iteration.
 * Columns 0..7 use the right (low-byte) interleaves; columns 8..11 use the
 * left (high-byte) interleaves packed two rows per vector (src2110/src4332),
 * so the narrow right strip needs only two extra accumulators
 * (dst0_l/dst1_l) for all four rows. */
3127 static void hevc_vt_bi_4t_12w_msa(uint8_t *src0_ptr,
3130 int32_t src2_stride,
3133 const int8_t *filter,
3137 v16i8 src0, src1, src2, src3, src4, src5;
3138 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3139 v16i8 src10_r, src32_r, src21_r, src43_r;
3140 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3141 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
3142 v16i8 src2110, src4332;
3143 v8i16 dst0_l, dst1_l, filt0, filt1;
3144 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
3146 src0_ptr -= (1 * src_stride);
3148 const_vec = __msa_ldi_h(128);
3151 filter_vec = LD_SH(filter);
3152 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3154 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3155 src0_ptr += (3 * src_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
3156 XORI_B3_128_SB(src0, src1, src2);
3157 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3158 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3159 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
3161 for (loop_cnt = (height >> 2); loop_cnt--;) {
3162 LD_SB2(src0_ptr, src_stride, src3, src4);
3163 src0_ptr += (2 * src_stride);
/* predictor: left 8 columns in in0..in3, right 4 columns folded to in4/in5 */
3164 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3165 LD_SH4((src1_ptr + 8), src2_stride, in4, in5, in6, in7);
3166 src1_ptr += (4 * src2_stride);
3167 ILVR_D2_SH(in5, in4, in7, in6, in4, in5);
3168 XORI_B2_128_SB(src3, src4);
3170 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3171 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3172 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
3175 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3177 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3179 DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
/* rows 3 and 4 of the group; src2 ends up holding the newest row */
3181 LD_SB2(src0_ptr, src_stride, src5, src2);
3182 src0_ptr += (2 * src_stride);
3183 XORI_B2_128_SB(src5, src2);
3185 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3186 ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
3187 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
3190 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
3192 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
3194 DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
/* add predictor, round by 7 bits, clamp to [0, 255] */
3195 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3196 dst0_r, dst1_r, dst2_r, dst3_r, 7,
3197 dst0_r, dst1_r, dst2_r, dst3_r);
3198 HEVC_BI_RND_CLIP2(in4, in5, dst0_l, dst1_l, 7, dst0_l, dst1_l);
3200 PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3201 dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
3202 ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
3203 dst += (4 * dst_stride);
/* Vertical 4-tap bi-prediction filter, 16 pixels wide, 4 rows/iteration.
 * Each iteration produces two row pairs; each 16-byte row is filtered in
 * two halves using the right (low 8 bytes) and left (high 8 bytes)
 * interleaves. The interleave history is carried across iterations. */
3207 static void hevc_vt_bi_4t_16w_msa(uint8_t *src0_ptr,
3210 int32_t src2_stride,
3213 const int8_t *filter,
3217 v16i8 src0, src1, src2, src3, src4, src5;
3218 v8i16 in0, in1, in2, in3;
3219 v16i8 src10_r, src32_r, src21_r, src43_r;
3220 v16i8 src10_l, src32_l, src21_l, src43_l;
3221 v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
3223 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
3225 src0_ptr -= src_stride;
3227 const_vec = __msa_ldi_h(128);
3230 filter_vec = LD_SH(filter);
3231 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3233 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3234 src0_ptr += (3 * src_stride);
/* flip MSB: unsigned pixels -> signed range for the signed dot products */
3235 XORI_B3_128_SB(src0, src1, src2);
3236 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3237 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3239 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* first row pair */
3240 LD_SB2(src0_ptr, src_stride, src3, src4);
3241 src0_ptr += (2 * src_stride);
3242 LD_SH2(src1_ptr, src2_stride, in0, in1);
3243 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3244 src1_ptr += (2 * src2_stride);
3245 XORI_B2_128_SB(src3, src4);
3246 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3247 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3250 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3252 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3254 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3256 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* add predictor, round by 7 bits, clamp to [0, 255] */
3257 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3258 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3259 dst0_r, dst1_r, dst0_l, dst1_l);
3261 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3262 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3263 dst += (2 * dst_stride);
/* second row pair; src2 ends up holding the newest row */
3265 LD_SB2(src0_ptr, src_stride, src5, src2);
3266 src0_ptr += (2 * src_stride);
3267 LD_SH2(src1_ptr, src2_stride, in0, in1);
3268 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3269 src1_ptr += (2 * src2_stride);
3270 XORI_B2_128_SB(src5, src2);
3271 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3272 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3275 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3277 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3279 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3281 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3282 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3283 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3284 dst0_r, dst1_r, dst0_l, dst1_l);
3286 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3287 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3288 dst += (2 * dst_stride);
/* Vertical 4-tap bi-prediction filter, 24 pixels wide, 4 rows/iteration.
 * The left 16 columns are processed like the 16-wide kernel (right + left
 * interleaves); the remaining 8 right columns come from separate loads at
 * byte offset +16 (src6..src11) and need only right interleaves. Outputs:
 * 16 bytes via ST_SH2 plus 8 bytes via ST8x2_UB per row pair. */
3292 static void hevc_vt_bi_4t_24w_msa(uint8_t *src0_ptr,
3295 int32_t src2_stride,
3298 const int8_t *filter,
3302 v16i8 src0, src1, src2, src3, src4, src5;
3303 v16i8 src6, src7, src8, src9, src10, src11;
3304 v8i16 in0, in1, in2, in3, in4, in5;
3305 v16i8 src10_r, src32_r, src76_r, src98_r;
3306 v16i8 src21_r, src43_r, src87_r, src109_r;
3307 v16i8 src10_l, src32_l, src21_l, src43_l;
3308 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3309 v8i16 dst0_l, dst1_l;
3311 v8i16 filter_vec, const_vec;
/* back up one row so the filter's leading tap is covered */
3313 src0_ptr -= src_stride;
3315 const_vec = __msa_ldi_h(128);
3318 filter_vec = LD_SH(filter);
3319 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* setup rows: left 16 columns */
3322 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3323 XORI_B3_128_SB(src0, src1, src2);
3324 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3325 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* setup rows: right 8 columns (offset +16) */
3327 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3328 src0_ptr += (3 * src_stride);
3329 XORI_B3_128_SB(src6, src7, src8);
3330 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3332 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* first row pair */
3334 LD_SB2(src0_ptr, src_stride, src3, src4);
3335 LD_SH2(src1_ptr, src2_stride, in0, in1);
3336 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3337 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3338 src1_ptr += (2 * src2_stride);
3339 XORI_B2_128_SB(src3, src4);
3340 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3341 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3343 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3344 src0_ptr += (2 * src_stride);
3345 XORI_B2_128_SB(src9, src10);
3346 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3349 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3351 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3353 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3355 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* right-hand 8 columns of the same two rows */
3358 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3360 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
/* add predictor, round by 7 bits, clamp to [0, 255] */
3362 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3363 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3364 dst0_r, dst1_r, dst0_l, dst1_l);
3366 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3368 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3369 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3370 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3371 ST8x2_UB(dst2_r, dst + 16, dst_stride);
3372 dst += (2 * dst_stride);
/* second row pair; src2/src8 end up holding the newest rows */
3375 LD_SB2(src0_ptr, src_stride, src5, src2);
3376 LD_SH2(src1_ptr, src2_stride, in0, in1);
3377 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3378 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3379 src1_ptr += (2 * src2_stride);
3380 XORI_B2_128_SB(src5, src2);
3381 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3382 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3384 LD_SB2(src0_ptr + 16, src_stride, src11, src8);
3385 src0_ptr += (2 * src_stride);
3386 XORI_B2_128_SB(src11, src8);
3387 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3390 DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3392 DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3394 DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3396 DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3399 DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3401 DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3403 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3404 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3405 dst0_r, dst1_r, dst0_l, dst1_l);
3406 HEVC_BI_RND_CLIP2(in4, in5, dst2_r, dst3_r, 7, dst2_r, dst3_r);
3408 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3409 dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
3410 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3411 ST8x2_UB(dst2_r, dst + 16, dst_stride);
3412 dst += (2 * dst_stride);
/* Vertical 4-tap (epel) bi-prediction filter for 32-pixel-wide blocks.
 * NOTE(review): this chunk is a damaged extraction — stray original line
 * numbers are fused into the code lines and some lines are elided; the
 * comments below describe only the visible code.
 * The 32-wide block is handled as two independent 16-wide halves
 * (right half written through dst_tmp = dst + 16), two rows per loop
 * iteration.  The vertical filter output is added to the 16-bit
 * intermediate prediction (src1_ptr), rounded (>> 7 with rounding),
 * clipped to [0, 255] and packed to bytes. */
3416 static void hevc_vt_bi_4t_32w_msa(uint8_t *src0_ptr,
3419 int32_t src2_stride,
3422 const int8_t *filter,
3426 uint8_t *dst_tmp = dst + 16;
3427 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3428 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3429 v16i8 src10_r, src32_r, src76_r, src98_r;
3430 v16i8 src21_r, src43_r, src87_r, src109_r;
3431 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3432 v16i8 src10_l, src32_l, src76_l, src98_l;
3433 v16i8 src21_l, src43_l, src87_l, src109_l;
3434 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3436 v8i16 filter_vec, const_vec;
/* one row of top context for the 4-tap vertical filter */
3438 src0_ptr -= src_stride;
3440 const_vec = __msa_ldi_h(128);
3443 filter_vec = LD_SH(filter);
3444 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* prologue, left 16 columns: first 3 rows, sign-flipped (XORI 128) and
 * interleaved into right/left byte pairs for the dot-product filter */
3447 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3448 XORI_B3_128_SB(src0, src1, src2);
3449 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3450 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* prologue, right 16 columns */
3453 LD_SB3(src0_ptr + 16, src_stride, src6, src7, src8);
3454 src0_ptr += (3 * src_stride);
3455 XORI_B3_128_SB(src6, src7, src8);
3456 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3457 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3459 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* two new source rows (left half) + 2x32 16-bit intermediate samples */
3461 LD_SB2(src0_ptr, src_stride, src3, src4);
3462 LD_SH2(src1_ptr, src2_stride, in0, in1);
3463 LD_SH2((src1_ptr + 8), src2_stride, in2, in3);
3464 LD_SH2((src1_ptr + 16), src2_stride, in4, in5);
3465 LD_SH2((src1_ptr + 24), src2_stride, in6, in7);
3466 src1_ptr += (2 * src2_stride);
3467 XORI_B2_128_SB(src3, src4);
3468 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3469 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3472 DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3474 DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3476 DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3478 DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* add intermediate prediction, round, clip, pack and store left half */
3480 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3481 dst0_r, dst1_r, dst0_l, dst1_l, 7,
3482 dst0_r, dst1_r, dst0_l, dst1_l);
3490 PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3491 ST_SH2(dst0_r, dst1_r, dst, dst_stride);
3492 dst += (2 * dst_stride);
/* same two rows for the right 16 columns */
3495 LD_SB2(src0_ptr + 16, src_stride, src9, src10);
3496 src0_ptr += (2 * src_stride);
3497 XORI_B2_128_SB(src9, src10);
3498 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3499 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3502 DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3504 DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3506 DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3508 DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3510 HEVC_BI_RND_CLIP4(in4, in5, in6, in7,
3511 dst2_r, dst3_r, dst2_l, dst3_l, 7,
3512 dst2_r, dst3_r, dst2_l, dst3_l);
3514 PCKEV_B2_SH(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3515 ST_SH2(dst2_r, dst3_r, dst_tmp, dst_stride);
3516 dst_tmp += (2 * dst_stride);
/* 2-D (horizontal + vertical) 4-tap bi-prediction filter, 4x2 block.
 * NOTE(review): extraction-damaged chunk (embedded line numbers, elided
 * lines); comments describe visible code only.
 * Horizontal pass: byte shuffles (VSHF) + signed dot products (DPADD)
 * produce 16-bit intermediates.  Vertical pass: HEVC_FILT_4TAP on the
 * interleaved column pairs.  Result is combined with the 16-bit
 * intermediate prediction in0/in1, rounded (>>7), clipped to [0,255]
 * and stored as two 4-byte rows. */
3526 static void hevc_hv_bi_4t_4x2_msa(uint8_t *src0_ptr,
3529 int32_t src2_stride,
3532 const int8_t *filter_x,
3533 const int8_t *filter_y,
3537 v16i8 src0, src1, src2, src3, src4;
3539 v4i32 filt_h0, filt_h1;
3540 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3542 v8i16 filter_vec, const_vec;
3543 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3544 v8i16 dst0, dst1, dst2, dst3, dst4;
3545 v4i32 dst0_r, dst1_r;
3546 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
/* one row above and one column left of context for the 4-tap filters */
3548 src0_ptr -= (src_stride + 1);
3550 filter_vec = LD_SH(filter_x);
3551 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit y filter taps to 16 bits before splatting */
3553 filter_vec = LD_SH(filter_y);
3554 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3555 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3557 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3561 const_vec = __msa_ldi_h(128);
/* horizontal pass over the 3 prologue rows */
3564 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3565 src0_ptr += (3 * src_stride);
3566 XORI_B3_128_SB(src0, src1, src2);
3568 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3569 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3570 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3572 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3574 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3576 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3577 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* both intermediate rows packed into one vector for a single add/clip */
3579 LD_SB2(src0_ptr, src_stride, src3, src4);
3580 LD_SH2(src1_ptr, src2_stride, in0, in1);
3581 in0 = (v8i16) __msa_ilvr_d((v2i64) in1, (v2i64) in0);
3582 XORI_B2_128_SB(src3, src4);
3584 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3586 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3587 dst32_r = __msa_ilvr_h(dst3, dst2);
3588 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3591 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3593 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3594 dst43_r = __msa_ilvr_h(dst4, dst3);
3595 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
/* pack, add intermediate prediction, round, clip, store 4x2 */
3597 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
3598 dst0_r = (v4i32) __msa_adds_s_h((v8i16) dst0_r, in0);
3599 dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 7);
3600 dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
3602 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
3603 ST4x2_UB(dst0_r, dst, dst_stride);
/* 2-D 4-tap bi-prediction filter, 4x4 block.
 * NOTE(review): extraction-damaged chunk; comments describe visible code.
 * Same structure as hevc_hv_bi_4t_4x2_msa, extended to 4 output rows:
 * horizontal VSHF+DPADD pass, vertical HEVC_FILT_4TAP pass, then
 * HEVC_BI_RND_CLIP2 (add intermediate, >>7 round, clip to [0,255]). */
3606 static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
3609 int32_t src2_stride,
3612 const int8_t *filter_x,
3613 const int8_t *filter_y,
3616 v8i16 in0, in1, in2, in3;
3617 v16i8 src0, src1, src2, src3, src4, src5, src6;
3619 v4i32 filt_h0, filt_h1;
3620 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3622 v8i16 filter_vec, const_vec;
3623 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3624 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3625 v8i16 dst0_r, dst1_r;
3626 v4i32 tmp0, tmp1, tmp2, tmp3;
3627 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3629 src0_ptr -= (src_stride + 1);
3631 filter_vec = LD_SH(filter_x);
3632 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit y filter taps to 16 bits before splatting */
3634 filter_vec = LD_SH(filter_y);
3635 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3636 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3638 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3642 const_vec = __msa_ldi_h(128);
/* horizontal pass over the 3 prologue rows */
3645 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3646 src0_ptr += (3 * src_stride);
3647 XORI_B3_128_SB(src0, src1, src2);
3649 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3650 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3651 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3653 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3655 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3657 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3658 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* 4 intermediate rows are paired (ilvr_d) so two fit per v8i16 */
3660 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3661 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3662 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3663 XORI_B4_128_SB(src3, src4, src5, src6);
3665 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3667 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3668 dst32_r = __msa_ilvr_h(dst3, dst2);
3669 tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3672 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3674 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3675 dst43_r = __msa_ilvr_h(dst4, dst3);
3676 tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3679 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3681 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
/* dst10_r/dst21_r are reused here as the rolling 54/65 column pairs */
3682 dst10_r = __msa_ilvr_h(dst5, dst4);
3683 tmp2 = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3686 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3688 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3689 dst21_r = __msa_ilvr_h(dst2, dst5);
3690 tmp3 = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3692 PCKEV_H2_SH(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
3693 HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
3695 dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
3696 ST4x4_UB(dst0_r, dst0_r, 0, 1, 2, 3, dst, dst_stride);
3697 dst += (4 * dst_stride);
/* 2-D 4-tap bi-prediction filter, 4-wide blocks whose height is a
 * multiple of 8 (8 rows per loop iteration).
 * NOTE(review): extraction-damaged chunk; comments describe visible code.
 * Horizontal pass: VSHF+DPADD per row; vertical pass: HEVC_FILT_4TAP on
 * rolling interleaved column pairs; outputs combined with the 16-bit
 * intermediate prediction via HEVC_BI_RND_CLIP4. */
3700 static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
3703 int32_t src2_stride,
3706 const int8_t *filter_x,
3707 const int8_t *filter_y,
3711 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
3712 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3714 v4i32 filt_h0, filt_h1;
3715 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3717 v8i16 filter_vec, const_vec;
3718 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3719 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3720 v8i16 tmp0, tmp1, tmp2, tmp3;
3721 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3722 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3723 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3725 src0_ptr -= (src_stride + 1);
3727 filter_vec = LD_SH(filter_x);
3728 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit y filter taps to 16 bits before splatting */
3730 filter_vec = LD_SH(filter_y);
3731 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3732 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3734 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3738 const_vec = __msa_ldi_h(128);
/* horizontal pass over the 3 prologue rows */
3741 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3742 src0_ptr += (3 * src_stride);
3743 XORI_B3_128_SB(src0, src1, src2);
3745 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3746 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3747 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3749 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3751 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3753 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3754 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3756 for (loop_cnt = height >> 3; loop_cnt--;) {
3757 LD_SB8(src0_ptr, src_stride,
3758 src3, src4, src5, src6, src7, src8, src9, src10);
3759 src0_ptr += (8 * src_stride);
3760 LD_SH8(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5, in6, in7);
3761 src1_ptr += (8 * src2_stride);
/* pair the 8 4-wide intermediate rows two-per-vector */
3762 ILVR_D2_SH(in1, in0, in3, in2, in0, in1);
3763 ILVR_D2_SH(in5, in4, in7, in6, in2, in3);
3764 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3766 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3768 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3769 dst32_r = __msa_ilvr_h(dst3, dst2);
3770 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3773 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3775 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3776 dst43_r = __msa_ilvr_h(dst4, dst3);
3777 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3780 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3782 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3783 dst54_r = __msa_ilvr_h(dst5, dst4);
3784 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3787 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3789 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3790 dst65_r = __msa_ilvr_h(dst6, dst5);
3791 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3794 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3796 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3797 dst76_r = __msa_ilvr_h(dst7, dst6);
3798 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3801 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3803 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3804 dst87_r = __msa_ilvr_h(dst8, dst7);
3805 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3808 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3810 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
/* dst10_r/dst21_r carry the 98/109 pairs into the next iteration */
3811 dst10_r = __msa_ilvr_h(dst9, dst8);
3812 dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
3815 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3817 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3818 dst21_r = __msa_ilvr_h(dst2, dst9);
3819 dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
3821 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3822 dst5_r, dst4_r, dst7_r, dst6_r, tmp0, tmp1, tmp2, tmp3);
3823 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3824 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
3826 PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
3827 ST4x8_UB(tmp0, tmp1, dst, dst_stride);
3828 dst += (8 * dst_stride);
3832 static void hevc_hv_bi_4t_4w_msa(uint8_t *src0_ptr,
3835 int32_t src2_stride,
3838 const int8_t *filter_x,
3839 const int8_t *filter_y,
3843 hevc_hv_bi_4t_4x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3844 dst, dst_stride, filter_x, filter_y, height);
3845 } else if (4 == height) {
3846 hevc_hv_bi_4t_4x4_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
3847 dst, dst_stride, filter_x, filter_y, height);
3848 } else if (0 == (height % 8)) {
3849 hevc_hv_bi_4t_4multx8mult_msa(src0_ptr, src_stride,
3850 src1_ptr, src2_stride,
3852 filter_x, filter_y, height);
/* 2-D 4-tap bi-prediction filter for 6-pixel-wide blocks, 4 rows per
 * loop iteration.
 * NOTE(review): extraction-damaged chunk; comments describe visible code.
 * Uses both right and left interleaved halves (ILVRL) so the 8-wide
 * vector math covers the 6 output columns; stores via ST6x4_UB. */
3856 static void hevc_hv_bi_4t_6w_msa(uint8_t *src0_ptr,
3859 int32_t src2_stride,
3862 const int8_t *filter_x,
3863 const int8_t *filter_y,
3867 v16i8 src0, src1, src2, src3, src4, src5, src6;
3868 v8i16 in0, in1, in2, in3;
3870 v4i32 filt_h0, filt_h1;
3871 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3873 v8i16 filter_vec, const_vec;
3874 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3875 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3876 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3877 v8i16 tmp0, tmp1, tmp2, tmp3;
3878 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3879 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3881 src0_ptr -= (src_stride + 1);
3883 filter_vec = LD_SH(filter_x);
3884 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit y filter taps to 16 bits before splatting */
3886 filter_vec = LD_SH(filter_y);
3887 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3888 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3890 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3894 const_vec = __msa_ldi_h(128);
/* horizontal pass over the 3 prologue rows */
3897 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
3898 src0_ptr += (3 * src_stride);
3899 XORI_B3_128_SB(src0, src1, src2);
3901 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3902 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3903 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3905 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3907 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3909 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3911 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3912 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3914 for (loop_cnt = height >> 2; loop_cnt--;) {
3915 LD_SB4(src0_ptr, src_stride, src3, src4, src5, src6);
3916 src0_ptr += (4 * src_stride);
3917 LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
3918 src1_ptr += (4 * src2_stride);
3919 XORI_B4_128_SB(src3, src4, src5, src6);
3921 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3923 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3925 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3926 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3927 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3931 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3933 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3935 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3936 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3937 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3941 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3943 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
/* dst10/dst21 are reused as the rolling 54/65 pairs for next iteration */
3945 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3946 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3947 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3951 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3953 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3955 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3956 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3957 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
/* pack, add intermediate prediction, round, clip, store 6x4 */
3960 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3961 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
3962 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
3963 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
3965 PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
3966 ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
3967 dst += (4 * dst_stride);
/* 2-D 4-tap bi-prediction filter, 8x2 block.
 * NOTE(review): extraction-damaged chunk; comments describe visible code.
 * Full-width (right+left interleave) version of the 4-wide kernel:
 * horizontal VSHF+DPADD, vertical HEVC_FILT_4TAP on r/l halves, then
 * HEVC_BI_RND_CLIP2 and an 8x2 byte store. */
3971 static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
3974 int32_t src2_stride,
3977 const int8_t *filter_x,
3978 const int8_t *filter_y,
3981 v16i8 src0, src1, src2, src3, src4;
3983 v4i32 filt_h0, filt_h1;
3984 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3986 v8i16 filter_vec, const_vec;
3987 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3988 v8i16 dst0, dst1, dst2, dst3, dst4;
3989 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3990 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3991 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3995 src0_ptr -= (src_stride + 1);
3997 filter_vec = LD_SH(filter_x);
3998 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit y filter taps to 16 bits before splatting */
4000 filter_vec = LD_SH(filter_y);
4001 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4002 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4004 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4008 const_vec = __msa_ldi_h(128);
/* horizontal pass over the 3 prologue rows */
4011 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4012 src0_ptr += (3 * src_stride);
4013 XORI_B3_128_SB(src0, src1, src2);
4015 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4016 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4017 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4019 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4021 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4023 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4025 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4026 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4028 LD_SB2(src0_ptr, src_stride, src3, src4);
4029 LD_SH2(src1_ptr, src2_stride, in0, in1);
4030 XORI_B2_128_SB(src3, src4);
4032 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4034 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4036 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4037 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4038 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4042 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4044 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4046 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4047 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4048 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* pack, add intermediate prediction, round, clip, store 8x2 */
4052 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
4053 HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
4055 dst0_r = (v4i32) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
4056 ST8x2_UB(dst0_r, dst, dst_stride);
/* 2-D 4-tap bi-prediction filter, 8x6 block (fully unrolled: 6 output
 * rows, no loop).
 * NOTE(review): extraction-damaged chunk; comments describe visible code.
 * Per output row: horizontal VSHF+DPADD on a new source row, rolling
 * ILVRL column pairs, vertical HEVC_FILT_4TAP on r/l halves, pckev to
 * tmpN.  All six rows are then combined with in0..in5 and stored as
 * 8x4 + 8x2. */
4059 static void hevc_hv_bi_4t_8x6_msa(uint8_t *src0_ptr,
4062 int32_t src2_stride,
4065 const int8_t *filter_x,
4066 const int8_t *filter_y,
4069 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
4070 v8i16 in0, in1, in2, in3, in4, in5;
4072 v4i32 filt_h0, filt_h1;
4073 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4075 v8i16 filter_vec, const_vec;
4076 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4077 v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
4078 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
4079 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4080 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
4081 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
4082 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
4083 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
4084 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
4086 src0_ptr -= (src_stride + 1);
4088 filter_vec = LD_SH(filter_x);
4089 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit y filter taps to 16 bits before splatting */
4091 filter_vec = LD_SH(filter_y);
4092 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4093 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4095 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4099 const_vec = __msa_ldi_h(128);
/* horizontal pass over the 3 prologue rows */
4102 LD_SB3(src0_ptr, src_stride, src0, src1, src2);
4103 src0_ptr += (3 * src_stride);
4104 XORI_B3_128_SB(src0, src1, src2);
4105 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4106 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4107 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4109 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4111 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4113 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4115 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4116 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* rows 0 and 1 */
4118 LD_SB2(src0_ptr, src_stride, src3, src4);
4119 src0_ptr += (2 * src_stride);
4120 XORI_B2_128_SB(src3, src4);
4121 LD_SH6(src1_ptr, src2_stride, in0, in1, in2, in3, in4, in5);
4122 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4124 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4126 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4127 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4128 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4131 tmp0 = __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
4133 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4135 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4137 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4138 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4139 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4142 tmp1 = __msa_pckev_h((v8i16) dst1_l, (v8i16) dst1_r);
/* rows 2 and 3 */
4144 LD_SB2(src0_ptr, src_stride, src5, src6);
4145 src0_ptr += (2 * src_stride);
4146 XORI_B2_128_SB(src5, src6);
4148 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4150 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
4152 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4153 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4154 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4157 tmp2 = __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
4160 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4162 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
4164 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4165 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4166 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4169 tmp3 = __msa_pckev_h((v8i16) dst3_l, (v8i16) dst3_r);
/* rows 4 and 5 */
4171 LD_SB2(src0_ptr, src_stride, src7, src8);
4172 XORI_B2_128_SB(src7, src8);
4174 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
4176 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
4178 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
4179 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4180 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
4184 tmp4 = __msa_pckev_h((v8i16) dst4_l, (v8i16) dst4_r);
4186 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
4188 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
4190 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
4191 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4192 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
4195 tmp5 = __msa_pckev_h((v8i16) dst5_l, (v8i16) dst5_r);
/* add intermediate prediction, round, clip, store 8x4 then 8x2 */
4197 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4198 tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
4199 HEVC_BI_RND_CLIP2(in4, in5, tmp4, tmp5, 7, tmp4, tmp5);
4201 PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4202 dst2_r = (v4i32) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
4203 ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
4204 dst += (4 * dst_stride);
4205 ST8x2_UB(dst2_r, dst, dst_stride);
/* 2-D 4-tap bi-prediction filter for blocks whose width is a multiple
 * of 8 (outer loop over 8-column stripes, inner loop over 4-row groups).
 * NOTE(review): extraction-damaged chunk (the `width` parameter line and
 * the dst_tmp setup line are among the elided lines); comments describe
 * visible code only.  Used by the 8w/12w/16w/24w/32w hv wrappers. */
4208 static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
4211 int32_t src2_stride,
4214 const int8_t *filter_x,
4215 const int8_t *filter_y,
4219 uint32_t loop_cnt, cnt;
4220 uint8_t *src0_ptr_tmp;
4221 int16_t *src1_ptr_tmp;
4223 v16i8 src0, src1, src2, src3, src4, src5, src6;
4224 v8i16 in0, in1, in2, in3;
4226 v4i32 filt_h0, filt_h1;
4227 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
4229 v8i16 filter_vec, const_vec;
4230 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
4231 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
4232 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
4233 v8i16 tmp0, tmp1, tmp2, tmp3;
4234 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
4235 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
4237 src0_ptr -= (src_stride + 1);
4239 filter_vec = LD_SH(filter_x);
4240 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the 8-bit y filter taps to 16 bits before splatting */
4242 filter_vec = LD_SH(filter_y);
4243 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
4244 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
4246 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
4250 const_vec = __msa_ldi_h(128);
/* one 8-column stripe per outer iteration */
4253 for (cnt = width >> 3; cnt--;) {
4254 src0_ptr_tmp = src0_ptr;
4256 src1_ptr_tmp = src1_ptr;
/* horizontal pass over the stripe's 3 prologue rows */
4258 LD_SB3(src0_ptr_tmp, src_stride, src0, src1, src2);
4259 src0_ptr_tmp += (3 * src_stride);
4260 XORI_B3_128_SB(src0, src1, src2);
4262 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
4263 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
4264 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
4266 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
4268 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
4270 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
4272 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
4273 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
4275 for (loop_cnt = height >> 2; loop_cnt--;) {
4276 LD_SB4(src0_ptr_tmp, src_stride, src3, src4, src5, src6);
4277 src0_ptr_tmp += (4 * src_stride);
4278 LD_SH4(src1_ptr_tmp, src2_stride, in0, in1, in2, in3);
4279 src1_ptr_tmp += (4 * src2_stride);
4280 XORI_B4_128_SB(src3, src4, src5, src6);
4282 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
4284 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
4286 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4287 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4288 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4292 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
4294 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
4296 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4297 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4298 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4302 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
4304 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
/* dst10/dst21 roll forward as the 54/65 pairs for the next group */
4306 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
4307 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
4308 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
4312 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
4314 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
4316 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
4317 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
4318 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
/* pack, add intermediate prediction, round, clip, store 8x4 */
4322 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
4323 dst2_l, dst2_r, dst3_l, dst3_r, tmp0, tmp1, tmp2, tmp3);
4324 HEVC_BI_RND_CLIP4(in0, in1, in2, in3,
4325 tmp0, tmp1, tmp2, tmp3, 7,
4326 tmp0, tmp1, tmp2, tmp3);
4328 PCKEV_B2_SW(tmp1, tmp0, tmp3, tmp2, dst0_r, dst1_r);
4329 ST8x4_UB(dst0_r, dst1_r, dst_tmp, dst_stride);
4330 dst_tmp += (4 * dst_stride);
4339 static void hevc_hv_bi_4t_8w_msa(uint8_t *src0_ptr,
4342 int32_t src2_stride,
4345 const int8_t *filter_x,
4346 const int8_t *filter_y,
4350 hevc_hv_bi_4t_8x2_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4351 dst, dst_stride, filter_x, filter_y, height);
4352 } else if (6 == height) {
4353 hevc_hv_bi_4t_8x6_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4354 dst, dst_stride, filter_x, filter_y, height);
4356 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride,
4357 src1_ptr, src2_stride,
4359 filter_x, filter_y, height, 8);
4363 static void hevc_hv_bi_4t_12w_msa(uint8_t *src0_ptr,
4366 int32_t src2_stride,
4369 const int8_t *filter_x,
4370 const int8_t *filter_y,
4373 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4374 dst, dst_stride, filter_x, filter_y,
4376 hevc_hv_bi_4t_4w_msa(src0_ptr + 8, src_stride, src1_ptr + 8, src2_stride,
4377 dst + 8, dst_stride, filter_x, filter_y, height);
4380 static void hevc_hv_bi_4t_16w_msa(uint8_t *src0_ptr,
4383 int32_t src2_stride,
4386 const int8_t *filter_x,
4387 const int8_t *filter_y,
4390 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4391 dst, dst_stride, filter_x, filter_y,
4395 static void hevc_hv_bi_4t_24w_msa(uint8_t *src0_ptr,
4398 int32_t src2_stride,
4401 const int8_t *filter_x,
4402 const int8_t *filter_y,
4405 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4406 dst, dst_stride, filter_x, filter_y,
4410 static void hevc_hv_bi_4t_32w_msa(uint8_t *src0_ptr,
4413 int32_t src2_stride,
4416 const int8_t *filter_x,
4417 const const int8_t *filter_y,
4420 hevc_hv_bi_4t_8multx4mult_msa(src0_ptr, src_stride, src1_ptr, src2_stride,
4421 dst, dst_stride, filter_x, filter_y,
4425 #define BI_MC_COPY(WIDTH) \
4426 void ff_hevc_put_hevc_bi_pel_pixels##WIDTH##_8_msa(uint8_t *dst, \
4427 ptrdiff_t dst_stride, \
4429 ptrdiff_t src_stride, \
4430 int16_t *src_16bit, \
4436 hevc_bi_copy_##WIDTH##w_msa(src, src_stride, src_16bit, MAX_PB_SIZE, \
4437 dst, dst_stride, height); \
/* Glue macro: emits the public 1-D bi-MC entry point
 * ff_hevc_put_hevc_bi_<PEL>_<DIR><WIDTH>_8_msa forwarding to the static
 * kernel hevc_<DIR1>_bi_<TAP>t_<WIDTH>w_msa.  FILT_DIR selects which
 * fractional-position argument (mx or my) indexes the filter table;
 * positions are 1-based, hence the `- 1`.
 * NOTE(review): restored from the truncated extraction — parameter
 * lines, the opening brace and the closing lines were elided. */
#define BI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_bi_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                      ptrdiff_t dst_stride,  \
                                                      uint8_t *src,          \
                                                      ptrdiff_t src_stride,  \
                                                      int16_t *src_16bit,    \
                                                      int height,            \
                                                      intptr_t mx,           \
                                                      intptr_t my,           \
                                                      int width)             \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
                                                                             \
    hevc_##DIR1##_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,     \
                                             MAX_PB_SIZE, dst, dst_stride,   \
                                             filter, height);                \
}
/* Instantiate the 1-D bi-prediction entry points:
 * 8-tap qpel (luma) and 4-tap epel (chroma), horizontal (hz, indexed by
 * mx) and vertical (vt, indexed by my), one per supported block width. */
4470 BI_MC(qpel, h, 4, 8, hz, mx);
4471 BI_MC(qpel, h, 8, 8, hz, mx);
4472 BI_MC(qpel, h, 12, 8, hz, mx);
4473 BI_MC(qpel, h, 16, 8, hz, mx);
4474 BI_MC(qpel, h, 24, 8, hz, mx);
4475 BI_MC(qpel, h, 32, 8, hz, mx);
4476 BI_MC(qpel, h, 48, 8, hz, mx);
4477 BI_MC(qpel, h, 64, 8, hz, mx);
/* vertical luma */
4479 BI_MC(qpel, v, 4, 8, vt, my);
4480 BI_MC(qpel, v, 8, 8, vt, my);
4481 BI_MC(qpel, v, 12, 8, vt, my);
4482 BI_MC(qpel, v, 16, 8, vt, my);
4483 BI_MC(qpel, v, 24, 8, vt, my);
4484 BI_MC(qpel, v, 32, 8, vt, my);
4485 BI_MC(qpel, v, 48, 8, vt, my);
4486 BI_MC(qpel, v, 64, 8, vt, my);
/* horizontal chroma */
4488 BI_MC(epel, h, 4, 4, hz, mx);
4489 BI_MC(epel, h, 8, 4, hz, mx);
4490 BI_MC(epel, h, 6, 4, hz, mx);
4491 BI_MC(epel, h, 12, 4, hz, mx);
4492 BI_MC(epel, h, 16, 4, hz, mx);
4493 BI_MC(epel, h, 24, 4, hz, mx);
4494 BI_MC(epel, h, 32, 4, hz, mx);
/* vertical chroma */
4496 BI_MC(epel, v, 4, 4, vt, my);
4497 BI_MC(epel, v, 8, 4, vt, my);
4498 BI_MC(epel, v, 6, 4, vt, my);
4499 BI_MC(epel, v, 12, 4, vt, my);
4500 BI_MC(epel, v, 16, 4, vt, my);
4501 BI_MC(epel, v, 24, 4, vt, my);
4502 BI_MC(epel, v, 32, 4, vt, my);
/* Glue macro: emits the public 2-D (hor+vert) bi-MC entry point
 * ff_hevc_put_hevc_bi_<PEL>_hv<WIDTH>_8_msa forwarding to the static
 * kernel hevc_hv_bi_<TAP>t_<WIDTH>w_msa.  mx selects the horizontal
 * filter and my the vertical filter from the 1-based position tables.
 * NOTE(review): restored from the truncated extraction — parameter
 * lines, the opening brace and the closing `}` continuation were
 * elided. */
#define BI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_bi_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                   ptrdiff_t dst_stride,  \
                                                   uint8_t *src,          \
                                                   ptrdiff_t src_stride,  \
                                                   int16_t *src_16bit,    \
                                                   int height,            \
                                                   intptr_t mx,           \
                                                   intptr_t my,           \
                                                   int width)             \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_bi_##TAP##t_##WIDTH##w_msa(src, src_stride, src_16bit,        \
                                       MAX_PB_SIZE, dst, dst_stride,      \
                                       filter_x, filter_y, height);       \
}
/* Instantiate the 2-D (hor+vert) bi-prediction entry points:
 * 8-tap qpel (luma) and 4-tap epel (chroma), one per block width. */
4525 BI_MC_HV(qpel, 4, 8);
4526 BI_MC_HV(qpel, 8, 8);
4527 BI_MC_HV(qpel, 12, 8);
4528 BI_MC_HV(qpel, 16, 8);
4529 BI_MC_HV(qpel, 24, 8);
4530 BI_MC_HV(qpel, 32, 8);
4531 BI_MC_HV(qpel, 48, 8);
4532 BI_MC_HV(qpel, 64, 8);
/* chroma */
4534 BI_MC_HV(epel, 4, 4);
4535 BI_MC_HV(epel, 8, 4);
4536 BI_MC_HV(epel, 6, 4);
4537 BI_MC_HV(epel, 12, 4);
4538 BI_MC_HV(epel, 16, 4);
4539 BI_MC_HV(epel, 24, 4);
4540 BI_MC_HV(epel, 32, 4);