/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"

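/* Shuffle control vectors for the MSA VSHF.B instruction: the first row
 * serves the 8-wide filters (pairs of neighbouring bytes within one
 * register); the remaining two rows serve the 4-wide filters, where
 * indices >= 16 select bytes from the second source register so two
 * rows can share one vector. */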
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

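/* Horizontal 8-tap FIR over four rows at once, two rows packed per output
 * vector: VSHF_B2_SB gathers the byte pairs selected by each mask,
 * DOTP_SB2_SH starts the signed-byte dot product and the DPADD_SB2_SH
 * steps accumulate the remaining taps into the 16-bit sums out0/out1. */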
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1)                              \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;   \
                                                                            \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);       \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                  \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);       \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);       \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);       \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                 \
}

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3);                                    \
}

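/* 4-tap variants of the same scheme: only two mask/filter pairs are
 * needed, so each output vector takes one DOTP plus one DPADD step. */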
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}

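/* Full-pel copy helpers for widths 8..64: LD2/LD4 and SD/SD4 move 8-byte
 * chunks through general-purpose registers, LD_UB4/ST_UB4 and friends
 * move 16-byte vectors.  No filtering is applied. */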
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

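/* Horizontal 8-tap uni-prediction filters.  Source bytes are XORed with
 * 128 so unsigned pixels can feed signed-byte dot products; the sums are
 * rounded down by 6 bits (SRARI), saturated to 8-bit range (SAT_..., 7)
 * and converted back to unsigned while packing via PCKEV_XORI128_UB. */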
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
    dst += (8 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}

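/* Width-4 dispatcher: pick the unrolled kernel matching the block
 * height (4, 8 or 16). */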
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

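/* Width 8: four rows per loop iteration; the body of
 * HORIZ_8TAP_8WID_4VECS_FILT is written out inline here. */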
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

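/* Width 12 is processed as an 8-wide part (mask00 and the one-register
 * masks) plus a 4-wide part that filters two rows per vector with the
 * two-register masks (mask0, mask4..mask6). */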
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        /* 4 width */
        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

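/* Width 16: each row is covered by two loads (offset 0 and offset 8) so
 * HORIZ_8TAP_8WID_4VECS_FILT sees all taps for 16 outputs per row; two
 * rows are filtered per macro invocation. */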
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST_D2(out, 0, 1, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}

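/* Vertical uni-prediction 8-tap filters.  ILVR_B/ILVL_B interleave
 * vertically adjacent rows so the taps of each output column land in
 * consecutive bytes, ready for the same signed-byte dot-product
 * accumulation as the horizontal path. */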
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}

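/* Width 8: only four new rows are loaded per iteration; the older
 * interleaved row pairs are shifted down at the bottom of the loop. */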
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

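/* Driver for widths that are multiples of 16: the 16-wide vertical
 * kernel is run over each 16-column stripe in turn. */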
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}

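/* Combined horizontal + vertical (HV) 8-tap filters.  The horizontal
 * stage produces 16-bit intermediates; the vertical stage accumulates
 * them in 32 bits, drops the first-stage scaling with a plain >> 6 and
 * then applies the rounded >> 6 of the uni output stage (SRARI_W4_SW). */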
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}

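/* 8-wide HV kernel processing two rows per iteration, repeated for each
 * 8-column stripe; seven horizontally filtered rows stay live in
 * registers between iterations. */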
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src_tmp += 2 * src_stride;

            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_r, dst32_r, dst54_r, dst21_r);
            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_l, dst32_l, dst54_l, dst21_l);
            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;

            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
            out = PCKEV_XORI128_UB(dst0, dst1);
            ST_D2(out, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            dst0 = dst2;
            dst1 = dst3;
            dst2 = dst4;
            dst3 = dst5;
            dst4 = dst6;
            dst5 = dst7;
            dst6 = dst8;
        }

        src += 8;
        dst += 8;
    }
}

static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
}

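/* Width 12 = an 8-wide stripe filtered like the 8multx2mult kernel
 * above, followed by a 4-wide stripe that processes eight rows per
 * iteration with the two-register masks (mask4..mask7). */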
1620 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1624 const int8_t *filter_x,
1625 const int8_t *filter_y,
1629 uint8_t *src_tmp, *dst_tmp;
1631 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1632 v16i8 src11, src12, src13, src14;
1633 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1634 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1635 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1636 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1637 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1638 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1639 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1640 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1641 v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1642 v8i16 dst1413_r, dst87_l, filter_vec;
1643 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1644 v4i32 dst0_l, dst1_l;
1646 src -= ((3 * src_stride) + 3);
1648 filter_vec = LD_SH(filter_x);
1649 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1651 filter_vec = LD_SH(filter_y);
1652 UNPCK_R_SB_SH(filter_vec, filter_vec);
1654 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1656 mask0 = LD_SB(ff_hevc_mask_arr);
1664 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1665 src_tmp += (7 * src_stride);
1666 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1668 /* row 0 row 1 row 2 row 3 */
1669 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1670 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1671 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1673 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1675 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1677 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1679 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1681 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1684 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1685 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1686 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1688 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1690 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1692 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1695 for (loop_cnt = 8; loop_cnt--;) {
1696 LD_SB2(src_tmp, src_stride, src7, src8);
1697 XORI_B2_128_SB(src7, src8);
1698 src_tmp += 2 * src_stride;
1700 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1701 dst32_r, dst54_r, dst21_r);
1702 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1703 dst32_l, dst54_l, dst21_l);
1704 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1705 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1707 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1709 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1712 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1713 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1714 filt_h0, filt_h1, filt_h2, filt_h3);
1715 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1716 filt_h0, filt_h1, filt_h2, filt_h3);
1720 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1722 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1725 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1726 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1727 filt_h0, filt_h1, filt_h2, filt_h3);
1728 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1729 filt_h0, filt_h1, filt_h2, filt_h3);
1730 dst1_r >>= 6;
1731 dst1_l >>= 6;
1732 SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1733 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1735 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1736 out0 = PCKEV_XORI128_UB(dst0, dst1);
1737 ST_D2(out0, 0, 1, dst_tmp, dst_stride);
1738 dst_tmp += (2 * dst_stride);
1740 dst0 = dst2;
1741 dst1 = dst3;
1742 dst2 = dst4;
1743 dst3 = dst5;
1744 dst4 = dst6;
1745 dst5 = dst7;
1746 dst6 = dst8;
1747 }
1749 src += 8;
1750 dst += 8;
1752 mask4 = LD_SB(ff_hevc_mask_arr + 16);
1753 mask5 = mask4 + 2;
1754 mask6 = mask4 + 4;
1755 mask7 = mask4 + 6;
1757 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1758 src += (7 * src_stride);
1759 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1761 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1762 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1763 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1764 vec11);
1765 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1766 vec15);
1768 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1769 filt3);
1770 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1771 filt3);
1772 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1773 filt3);
1774 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1775 filt3);
1777 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1778 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1779 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1781 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
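/* right 4 columns: each horizontal result packs two rows, e.g. dst30 = rows 3 and 0 */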
1783 for (loop_cnt = 2; loop_cnt--;) {
1784 LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1785 src14);
1786 src += (8 * src_stride);
1787 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1789 VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1790 vec3);
1791 VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1792 vec7);
1793 VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1794 vec11);
1795 VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1796 vec14, vec15);
1798 dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1799 filt3);
1800 dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1801 filt3);
1802 dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1803 filt2, filt3);
1804 dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1805 filt2, filt3);
1807 dst76_r = __msa_ilvr_h(dst117, dst66);
1808 ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1809 ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1810 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1811 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1812 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1814 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1815 filt_h1, filt_h2, filt_h3);
1816 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1817 filt_h1, filt_h2, filt_h3);
1818 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1819 filt_h1, filt_h2, filt_h3);
1820 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1821 filt_h1, filt_h2, filt_h3);
1822 dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1823 filt_h1, filt_h2, filt_h3);
1824 dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1825 filt_h1, filt_h2, filt_h3);
1826 dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1827 filt_h1, filt_h2, filt_h3);
1828 dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1829 filt_h0, filt_h1, filt_h2, filt_h3);
1831 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1832 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1833 SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1834 SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1835 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1836 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1837 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1838 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1839 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1840 out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1841 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
1842 dst += (8 * dst_stride);
1844 dst10_r = dst98_r;
1845 dst32_r = dst1110_r;
1846 dst54_r = dst1312_r;
1847 dst21_r = dst109_r;
1848 dst43_r = dst1211_r;
1849 dst65_r = dst1413_r;
1850 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1851 }
1852 }
1854 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1855 int32_t src_stride,
1856 uint8_t *dst,
1857 int32_t dst_stride,
1858 const int8_t *filter_x,
1859 const int8_t *filter_y,
1860 int32_t height)
1861 {
1862 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1863 filter_x, filter_y, height, 16);
1864 }
1866 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1867 int32_t src_stride,
1868 uint8_t *dst,
1869 int32_t dst_stride,
1870 const int8_t *filter_x,
1871 const int8_t *filter_y,
1872 int32_t height)
1873 {
1874 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1875 filter_x, filter_y, height, 24);
1876 }
1878 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1879 int32_t src_stride,
1880 uint8_t *dst,
1881 int32_t dst_stride,
1882 const int8_t *filter_x,
1883 const int8_t *filter_y,
1884 int32_t height)
1885 {
1886 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1887 filter_x, filter_y, height, 32);
1888 }
1890 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1891 int32_t src_stride,
1892 uint8_t *dst,
1893 int32_t dst_stride,
1894 const int8_t *filter_x,
1895 const int8_t *filter_y,
1896 int32_t height)
1897 {
1898 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1899 filter_x, filter_y, height, 48);
1900 }
1902 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1903 int32_t src_stride,
1904 uint8_t *dst,
1905 int32_t dst_stride,
1906 const int8_t *filter_x,
1907 const int8_t *filter_y,
1908 int32_t height)
1909 {
1910 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1911 filter_x, filter_y, height, 64);
1912 }
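/* 4-tap horizontal filters: filter, round (SRARI by 6), saturate and pack to 8 bit */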
1914 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1915 uint8_t *dst, int32_t dst_stride,
1916 const int8_t *filter)
1917 {
1918 v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1919 v16u8 out;
1920 v8i16 filt, res0;
1922 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1923 src -= 1;
1925 /* rearranging filter */
1926 filt = LD_SH(filter);
1927 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1929 mask1 = mask0 + 2;
1931 LD_SB2(src, src_stride, src0, src1);
1932 XORI_B2_128_SB(src0, src1);
1933 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1934 res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
1935 res0 = __msa_srari_h(res0, 6);
1936 res0 = __msa_sat_s_h(res0, 7);
1937 out = PCKEV_XORI128_UB(res0, res0);
1938 ST_W2(out, 0, 1, dst, dst_stride);
1939 }
1941 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1942 uint8_t *dst, int32_t dst_stride,
1943 const int8_t *filter)
1944 {
1945 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1946 v8i16 filt, out0, out1;
1947 v16u8 out;
1949 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1950 src -= 1;
1952 /* rearranging filter */
1953 filt = LD_SH(filter);
1954 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1956 mask1 = mask0 + 2;
1958 LD_SB4(src, src_stride, src0, src1, src2, src3);
1959 XORI_B4_128_SB(src0, src1, src2, src3);
1960 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1961 filt0, filt1, out0, out1);
1962 SRARI_H2_SH(out0, out1, 6);
1963 SAT_SH2_SH(out0, out1, 7);
1964 out = PCKEV_XORI128_UB(out0, out1);
1965 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1966 }
1968 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1969 uint8_t *dst, int32_t dst_stride,
1970 const int8_t *filter)
1971 {
1972 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1973 v16u8 out;
1974 v8i16 filt, out0, out1, out2, out3;
1976 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1977 src -= 1;
1979 /* rearranging filter */
1980 filt = LD_SH(filter);
1981 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
1983 mask1 = mask0 + 2;
1985 LD_SB4(src, src_stride, src0, src1, src2, src3);
1986 src += (4 * src_stride);
1988 XORI_B4_128_SB(src0, src1, src2, src3);
1989 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1990 filt0, filt1, out0, out1);
1991 LD_SB4(src, src_stride, src0, src1, src2, src3);
1992 XORI_B4_128_SB(src0, src1, src2, src3);
1993 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1994 filt0, filt1, out2, out3);
1995 SRARI_H4_SH(out0, out1, out2, out3, 6);
1996 SAT_SH4_SH(out0, out1, out2, out3, 7);
1997 out = PCKEV_XORI128_UB(out0, out1);
1998 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
1999 out = PCKEV_XORI128_UB(out2, out3);
2000 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2001 }
2003 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2004 uint8_t *dst, int32_t dst_stride,
2005 const int8_t *filter)
2006 {
2007 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2008 v16i8 filt0, filt1, mask0, mask1;
2009 v16u8 out;
2010 v8i16 filt, out0, out1, out2, out3;
2012 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2013 src -= 1;
2015 /* rearranging filter */
2016 filt = LD_SH(filter);
2017 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2019 mask1 = mask0 + 2;
2021 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2022 src += (8 * src_stride);
2023 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2024 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2025 filt0, filt1, out0, out1);
2026 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2027 filt0, filt1, out2, out3);
2028 SRARI_H4_SH(out0, out1, out2, out3, 6);
2029 SAT_SH4_SH(out0, out1, out2, out3, 7);
2030 out = PCKEV_XORI128_UB(out0, out1);
2031 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2032 out = PCKEV_XORI128_UB(out2, out3);
2033 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2034 dst += (8 * dst_stride);
2036 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2037 src += (8 * src_stride);
2038 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2039 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2040 filt0, filt1, out0, out1);
2041 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2042 filt0, filt1, out2, out3);
2043 SRARI_H4_SH(out0, out1, out2, out3, 6);
2044 SAT_SH4_SH(out0, out1, out2, out3, 7);
2045 out = PCKEV_XORI128_UB(out0, out1);
2046 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2047 out = PCKEV_XORI128_UB(out2, out3);
2048 ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
2049 }
2051 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
2052 uint8_t *dst, int32_t dst_stride,
2053 const int8_t *filter, int32_t height)
2054 {
2055 if (2 == height) {
2056 common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2057 } else if (4 == height) {
2058 common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2059 } else if (8 == height) {
2060 common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2061 } else if (16 == height) {
2062 common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
2063 }
2064 }
2066 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2067 uint8_t *dst, int32_t dst_stride,
2068 const int8_t *filter, int32_t height)
2069 {
2070 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2071 v16u8 out4, out5;
2072 v8i16 filt, out0, out1, out2, out3;
2074 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2075 src -= 1;
2077 /* rearranging filter */
2078 filt = LD_SH(filter);
2079 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2081 mask1 = mask0 + 2;
2083 LD_SB4(src, src_stride, src0, src1, src2, src3);
2084 src += (4 * src_stride);
2086 XORI_B4_128_SB(src0, src1, src2, src3);
2087 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2088 filt1, out0, out1, out2, out3);
2089 SRARI_H4_SH(out0, out1, out2, out3, 6);
2090 SAT_SH4_SH(out0, out1, out2, out3, 7);
2091 out4 = PCKEV_XORI128_UB(out0, out1);
2092 out5 = PCKEV_XORI128_UB(out2, out3);
2093 ST_W2(out4, 0, 2, dst, dst_stride);
2094 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2095 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2096 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2097 dst += (4 * dst_stride);
2099 LD_SB4(src, src_stride, src0, src1, src2, src3);
2100 src += (4 * src_stride);
2102 XORI_B4_128_SB(src0, src1, src2, src3);
2103 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2104 filt1, out0, out1, out2, out3);
2105 SRARI_H4_SH(out0, out1, out2, out3, 6);
2106 SAT_SH4_SH(out0, out1, out2, out3, 7);
2107 out4 = PCKEV_XORI128_UB(out0, out1);
2108 out5 = PCKEV_XORI128_UB(out2, out3);
2109 ST_W2(out4, 0, 2, dst, dst_stride);
2110 ST_H2(out4, 2, 6, dst + 4, dst_stride);
2111 ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
2112 ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2113 }
2115 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2116 uint8_t *dst, int32_t dst_stride,
2117 const int8_t *filter, int32_t height)
2118 {
2119 uint32_t loop_cnt;
2120 v16i8 src0, src1, filt0, filt1, mask0, mask1;
2121 v16u8 out;
2122 v8i16 filt, vec0, vec1, vec2, vec3;
2124 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2125 src -= 1;
2127 filt = LD_SH(filter);
2128 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2130 mask1 = mask0 + 2;
2132 for (loop_cnt = (height >> 1); loop_cnt--;) {
2133 LD_SB2(src, src_stride, src0, src1);
2134 src += (2 * src_stride);
2136 XORI_B2_128_SB(src0, src1);
2137 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2138 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2139 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2140 DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2141 SRARI_H2_SH(vec0, vec1, 6);
2142 SAT_SH2_SH(vec0, vec1, 7);
2143 out = PCKEV_XORI128_UB(vec0, vec1);
2144 ST_D2(out, 0, 1, dst, dst_stride);
2145 dst += (2 * dst_stride);
2146 }
2147 }
2149 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2150 uint8_t *dst, int32_t dst_stride,
2151 const int8_t *filter, int32_t height)
2152 {
2153 uint32_t loop_cnt;
2154 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2155 v16u8 tmp0, tmp1;
2156 v8i16 filt, out0, out1, out2, out3;
2158 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2159 src -= 1;
2161 /* rearranging filter */
2162 filt = LD_SH(filter);
2163 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2165 mask1 = mask0 + 2;
2167 for (loop_cnt = (height >> 2); loop_cnt--;) {
2168 LD_SB4(src, src_stride, src0, src1, src2, src3);
2169 src += (4 * src_stride);
2171 XORI_B4_128_SB(src0, src1, src2, src3);
2172 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2173 filt1, out0, out1, out2, out3);
2174 SRARI_H4_SH(out0, out1, out2, out3, 6);
2175 SAT_SH4_SH(out0, out1, out2, out3, 7);
2176 tmp0 = PCKEV_XORI128_UB(out0, out1);
2177 tmp1 = PCKEV_XORI128_UB(out2, out3);
2178 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2179 dst += (4 * dst_stride);
2180 }
2181 }
2183 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2184 uint8_t *dst, int32_t dst_stride,
2185 const int8_t *filter, int32_t height)
2186 {
2187 if ((2 == height) || (6 == height)) {
2188 common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2189 height);
2190 } else {
2191 common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
2192 height);
2193 }
2194 }
2196 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2197 uint8_t *dst, int32_t dst_stride,
2198 const int8_t *filter, int32_t height)
2199 {
2200 uint32_t loop_cnt;
2201 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2202 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2203 v16i8 vec10, vec11;
2204 v16u8 tmp0, tmp1;
2205 v8i16 filt, out0, out1, out2, out3, out4, out5;
2207 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2208 mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2210 src -= 1;
2212 /* rearranging filter */
2213 filt = LD_SH(filter);
2214 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2216 mask1 = mask0 + 2;
2217 mask3 = mask2 + 2;
2219 for (loop_cnt = 4; loop_cnt--;) {
2220 LD_SB4(src, src_stride, src0, src1, src2, src3);
2221 src += (4 * src_stride);
2223 XORI_B4_128_SB(src0, src1, src2, src3);
2224 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2225 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2226 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2227 DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2228 SRARI_H2_SH(out0, out1, 6);
2229 SAT_SH2_SH(out0, out1, 7);
2230 tmp0 = PCKEV_XORI128_UB(out0, out1);
2231 ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
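/* columns 8..11 were handled above via mask2/mask3; the left 8 columns follow */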
2233 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2234 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2235 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2236 out2, out3, out4, out5);
2237 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2238 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2239 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2240 out2, out3, out4, out5);
2241 SRARI_H4_SH(out2, out3, out4, out5, 6);
2242 SAT_SH4_SH(out2, out3, out4, out5, 7);
2243 tmp0 = PCKEV_XORI128_UB(out2, out3);
2244 tmp1 = PCKEV_XORI128_UB(out4, out5);
2245 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2246 dst += (4 * dst_stride);
2247 }
2248 }
2250 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2251 uint8_t *dst, int32_t dst_stride,
2252 const int8_t *filter, int32_t height)
2253 {
2254 uint32_t loop_cnt;
2255 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2256 v16i8 filt0, filt1, mask0, mask1;
2257 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2258 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2259 v16u8 out;
2261 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2262 src -= 1;
2264 /* rearranging filter */
2265 filt = LD_SH(filter);
2266 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2268 mask1 = mask0 + 2;
2270 for (loop_cnt = (height >> 2); loop_cnt--;) {
2271 LD_SB4(src, src_stride, src0, src2, src4, src6);
2272 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2273 src += (4 * src_stride);
2275 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2277 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2278 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2279 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2280 out0, out1, out2, out3);
2281 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2282 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2283 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2284 out0, out1, out2, out3);
2285 SRARI_H4_SH(out0, out1, out2, out3, 6);
2286 SAT_SH4_SH(out0, out1, out2, out3, 7);
2287 out = PCKEV_XORI128_UB(out0, out1);
2288 ST_UB(out, dst);
2289 dst += dst_stride;
2290 out = PCKEV_XORI128_UB(out2, out3);
2291 ST_UB(out, dst);
2292 dst += dst_stride;
2294 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2295 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2296 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2297 out4, out5, out6, out7);
2298 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2299 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2300 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2301 out4, out5, out6, out7);
2302 SRARI_H4_SH(out4, out5, out6, out7, 6);
2303 SAT_SH4_SH(out4, out5, out6, out7, 7);
2304 out = PCKEV_XORI128_UB(out4, out5);
2305 ST_UB(out, dst);
2306 dst += dst_stride;
2307 out = PCKEV_XORI128_UB(out6, out7);
2308 ST_UB(out, dst);
2309 dst += dst_stride;
2310 }
2311 }
2313 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2314 uint8_t *dst, int32_t dst_stride,
2315 const int8_t *filter, int32_t height)
2316 {
2317 uint8_t *dst1 = dst + 16;
2318 uint32_t loop_cnt;
2319 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2320 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2321 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2322 v8i16 filt, out0, out1, out2, out3;
2323 v16u8 tmp0, tmp1;
2325 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2326 src -= 1;
2328 /* rearranging filter */
2329 filt = LD_SH(filter);
2330 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2332 mask1 = mask0 + 2;
2333 mask00 = mask0 + 8;
2334 mask11 = mask0 + 10;
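/* mask00/mask11 straddle the two source vectors so columns 8..15 can be filtered */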
2336 for (loop_cnt = 8; loop_cnt--;) {
2337 LD_SB4(src, src_stride, src0, src2, src4, src6);
2338 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2339 src += (4 * src_stride);
2341 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2342 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2343 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2344 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2345 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2346 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2347 out0, out1, out2, out3);
2348 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2349 out0, out1, out2, out3);
2350 SRARI_H4_SH(out0, out1, out2, out3, 6);
2351 SAT_SH4_SH(out0, out1, out2, out3, 7);
2352 tmp0 = PCKEV_XORI128_UB(out0, out1);
2353 ST_UB(tmp0, dst);
2354 dst += dst_stride;
2355 tmp0 = PCKEV_XORI128_UB(out2, out3);
2356 ST_UB(tmp0, dst);
2357 dst += dst_stride;
2359 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2360 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2361 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2362 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2363 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2364 out0, out1, out2, out3);
2365 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2366 out0, out1, out2, out3);
2367 SRARI_H4_SH(out0, out1, out2, out3, 6);
2368 SAT_SH4_SH(out0, out1, out2, out3, 7);
2369 tmp0 = PCKEV_XORI128_UB(out0, out1);
2370 ST_UB(tmp0, dst);
2371 dst += dst_stride;
2372 tmp0 = PCKEV_XORI128_UB(out2, out3);
2373 ST_UB(tmp0, dst);
2374 dst += dst_stride;
2377 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2378 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2379 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2380 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2382 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2383 out0, out1, out2, out3);
2384 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2385 out0, out1, out2, out3);
2387 SRARI_H4_SH(out0, out1, out2, out3, 6);
2388 SAT_SH4_SH(out0, out1, out2, out3, 7);
2389 tmp0 = PCKEV_XORI128_UB(out0, out1);
2390 tmp1 = PCKEV_XORI128_UB(out2, out3);
2391 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
2392 dst1 += (4 * dst_stride);
2393 }
2394 }
2396 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2397 uint8_t *dst, int32_t dst_stride,
2398 const int8_t *filter, int32_t height)
2399 {
2400 uint32_t loop_cnt;
2401 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2402 v16i8 filt0, filt1, mask0, mask1;
2403 v16u8 out;
2404 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2405 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2407 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2408 src -= 1;
2410 /* rearranging filter */
2411 filt = LD_SH(filter);
2412 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2414 mask1 = mask0 + 2;
2416 for (loop_cnt = (height >> 1); loop_cnt--;) {
2417 src0 = LD_SB(src);
2418 src1 = LD_SB(src + 8);
2419 src2 = LD_SB(src + 16);
2420 src3 = LD_SB(src + 24);
2421 src += src_stride;
2422 src4 = LD_SB(src);
2423 src5 = LD_SB(src + 8);
2424 src6 = LD_SB(src + 16);
2425 src7 = LD_SB(src + 24);
2426 src += src_stride;
2428 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2430 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2431 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2432 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2433 out0, out1, out2, out3);
2434 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2435 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2436 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2437 out0, out1, out2, out3);
2439 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2440 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2441 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2442 out4, out5, out6, out7);
2443 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2444 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2445 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2446 out4, out5, out6, out7);
2447 SRARI_H4_SH(out0, out1, out2, out3, 6);
2448 SRARI_H4_SH(out4, out5, out6, out7, 6);
2449 SAT_SH4_SH(out0, out1, out2, out3, 7);
2450 SAT_SH4_SH(out4, out5, out6, out7, 7);
2451 out = PCKEV_XORI128_UB(out0, out1);
2452 ST_UB(out, dst);
2453 out = PCKEV_XORI128_UB(out2, out3);
2454 ST_UB(out, dst + 16);
2455 dst += dst_stride;
2456 out = PCKEV_XORI128_UB(out4, out5);
2457 ST_UB(out, dst);
2458 out = PCKEV_XORI128_UB(out6, out7);
2459 ST_UB(out, dst + 16);
2460 dst += dst_stride;
2461 }
2462 }
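/* 4-tap vertical filters: rows are byte-interleaved so each dot product covers two taps */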
2464 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2465 uint8_t *dst, int32_t dst_stride,
2466 const int8_t *filter)
2467 {
2468 v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2469 v16i8 src2110, src4332, filt0, filt1;
2470 v16u8 out;
2471 v8i16 filt, out10;
2473 src -= src_stride;
2475 filt = LD_SH(filter);
2476 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2478 LD_SB3(src, src_stride, src0, src1, src2);
2479 src += (3 * src_stride);
2481 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2482 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2483 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2484 LD_SB2(src, src_stride, src3, src4);
2485 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2486 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2487 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2488 out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2489 out10 = __msa_srari_h(out10, 6);
2490 out10 = __msa_sat_s_h(out10, 7);
2491 out = PCKEV_XORI128_UB(out10, out10);
2492 ST_W2(out, 0, 1, dst, dst_stride);
2493 }
2495 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2496 uint8_t *dst, int32_t dst_stride,
2497 const int8_t *filter, int32_t height)
2498 {
2499 uint32_t loop_cnt;
2500 v16i8 src0, src1, src2, src3, src4, src5;
2501 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2502 v16i8 src2110, src4332, filt0, filt1;
2503 v8i16 filt, out10, out32;
2504 v16u8 out;
2506 src -= src_stride;
2508 filt = LD_SH(filter);
2509 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2511 LD_SB3(src, src_stride, src0, src1, src2);
2512 src += (3 * src_stride);
2514 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2516 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2517 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2519 for (loop_cnt = (height >> 2); loop_cnt--;) {
2520 LD_SB3(src, src_stride, src3, src4, src5);
2521 src += (3 * src_stride);
2522 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2523 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2524 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2525 out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2527 src2 = LD_SB(src);
2528 src += (src_stride);
2529 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2530 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2531 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2532 out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2533 SRARI_H2_SH(out10, out32, 6);
2534 SAT_SH2_SH(out10, out32, 7);
2535 out = PCKEV_XORI128_UB(out10, out32);
2536 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
2537 dst += (4 * dst_stride);
2538 }
2539 }
2541 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2542 uint8_t *dst, int32_t dst_stride,
2543 const int8_t *filter, int32_t height)
2544 {
2545 if (2 == height) {
2546 common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2547 } else {
2548 common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
2549 height);
2550 }
2551 }
2553 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2554 uint8_t *dst, int32_t dst_stride,
2555 const int8_t *filter, int32_t height)
2556 {
2557 v16u8 out0, out1;
2558 v16i8 src0, src1, src2, src3, src4, src5, src6;
2559 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2560 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2562 src -= src_stride;
2564 filter_vec = LD_SH(filter);
2565 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2567 LD_SB3(src, src_stride, src0, src1, src2);
2568 src += (3 * src_stride);
2569 XORI_B3_128_SB(src0, src1, src2);
2570 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2572 LD_SB2(src, src_stride, src3, src4);
2573 src += (2 * src_stride);
2574 XORI_B2_128_SB(src3, src4);
2575 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2577 dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2578 dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2580 LD_SB2(src, src_stride, src5, src6);
2581 src += (2 * src_stride);
2582 XORI_B2_128_SB(src5, src6);
2583 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2585 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2586 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2588 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2589 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2590 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2591 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2592 ST_W2(out0, 0, 2, dst, dst_stride);
2593 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2594 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2595 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2596 dst += (4 * dst_stride);
2598 LD_SB2(src, src_stride, src3, src4);
2599 src += (2 * src_stride);
2600 XORI_B2_128_SB(src3, src4);
2601 ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2603 dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2604 dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2606 LD_SB2(src, src_stride, src5, src6);
2607 src += (2 * src_stride);
2608 XORI_B2_128_SB(src5, src6);
2609 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2611 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2612 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2614 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2615 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2616 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2617 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2618 ST_W2(out0, 0, 2, dst, dst_stride);
2619 ST_H2(out0, 2, 6, dst + 4, dst_stride);
2620 ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
2621 ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
2622 }
2624 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2625 uint8_t *dst, int32_t dst_stride,
2626 const int8_t *filter)
2627 {
2628 v16i8 src0, src1, src2, src3, src4;
2629 v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2630 v16u8 out;
2632 src -= src_stride;
2634 /* rearranging filter_y */
2635 filt = LD_SH(filter);
2636 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2638 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2639 XORI_B5_128_SB(src0, src1, src2, src3, src4);
2640 ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2641 tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2642 ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2643 tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2644 SRARI_H2_SH(tmp0, tmp1, 6);
2645 SAT_SH2_SH(tmp0, tmp1, 7);
2646 out = PCKEV_XORI128_UB(tmp0, tmp1);
2647 ST_D2(out, 0, 1, dst, dst_stride);
2648 }
2650 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2651 uint8_t *dst, int32_t dst_stride,
2652 const int8_t *filter)
2653 {
2654 uint32_t loop_cnt;
2655 uint64_t out0, out1, out2;
2656 v16i8 src0, src1, src2, src3, src4, src5;
2657 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2658 v8i16 filt, filt0, filt1;
2660 src -= src_stride;
2662 /* rearranging filter_y */
2663 filt = LD_SH(filter);
2664 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2666 LD_SB3(src, src_stride, src0, src1, src2);
2667 src += (3 * src_stride);
2669 XORI_B3_128_SB(src0, src1, src2);
2670 ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2672 for (loop_cnt = 2; loop_cnt--;) {
2673 LD_SB3(src, src_stride, src3, src4, src5);
2674 src += (3 * src_stride);
2676 XORI_B3_128_SB(src3, src4, src5);
2677 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2678 tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2679 tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2680 tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2681 SRARI_H2_SH(tmp0, tmp1, 6);
2682 tmp2 = __msa_srari_h(tmp2, 6);
2683 SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2684 PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2685 XORI_B2_128_SH(tmp0, tmp2);
2687 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2688 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2689 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2690 SD(out0, dst);
2691 dst += dst_stride;
2692 SD(out1, dst);
2693 dst += dst_stride;
2694 SD(out2, dst);
2695 dst += dst_stride;
2697 src2 = src5;
2698 vec0 = vec3;
2699 vec2 = vec4;
2700 }
2701 }
2703 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2704 uint8_t *dst, int32_t dst_stride,
2705 const int8_t *filter, int32_t height)
2706 {
2707 uint32_t loop_cnt;
2708 v16i8 src0, src1, src2, src7, src8, src9, src10;
2709 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2710 v16u8 tmp0, tmp1;
2711 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2713 src -= src_stride;
2715 filt = LD_SH(filter);
2716 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2718 LD_SB3(src, src_stride, src0, src1, src2);
2719 src += (3 * src_stride);
2721 XORI_B3_128_SB(src0, src1, src2);
2722 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2724 for (loop_cnt = (height >> 2); loop_cnt--;) {
2725 LD_SB4(src, src_stride, src7, src8, src9, src10);
2726 src += (4 * src_stride);
2728 XORI_B4_128_SB(src7, src8, src9, src10);
2729 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2730 src72_r, src87_r, src98_r, src109_r);
2731 out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2732 out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2733 out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2734 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2735 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2736 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2737 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2738 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2739 ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
2740 dst += (4 * dst_stride);
2742 src10_r = src98_r;
2743 src21_r = src109_r;
2744 src2 = src10;
2745 }
2746 }
2748 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2749 uint8_t *dst, int32_t dst_stride,
2750 const int8_t *filter, int32_t height)
2751 {
2752 if (2 == height) {
2753 common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2754 } else if (6 == height) {
2755 common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2756 } else {
2757 common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
2758 filter, height);
2759 }
2760 }
2762 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2763 uint8_t *dst, int32_t dst_stride,
2764 const int8_t *filter, int32_t height)
2765 {
2766 uint32_t loop_cnt;
2767 v16i8 src0, src1, src2, src3, src4, src5, src6;
2768 v16u8 out0, out1;
2769 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2770 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2771 v16i8 src2110, src4332, src6554;
2772 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2773 v8i16 filter_vec;
2775 src -= (1 * src_stride);
2777 filter_vec = LD_SH(filter);
2778 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2780 LD_SB3(src, src_stride, src0, src1, src2);
2781 src += (3 * src_stride);
2783 XORI_B3_128_SB(src0, src1, src2);
2784 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2785 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2786 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2788 for (loop_cnt = 4; loop_cnt--;) {
2789 LD_SB4(src, src_stride, src3, src4, src5, src6);
2790 src += (4 * src_stride);
2792 XORI_B4_128_SB(src3, src4, src5, src6);
2793 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2794 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2795 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2796 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2797 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2798 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
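/* the column 8..11 interleaves of two row pairs are packed into one vector each */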
2800 dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2801 dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2802 dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2803 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2804 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2805 dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2807 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2808 SRARI_H2_SH(dst0_l, dst1_l, 6);
2809 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2810 SAT_SH2_SH(dst0_l, dst1_l, 7);
2811 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2812 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2813 ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
2814 out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2815 ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
2816 dst += (4 * dst_stride);
2818 src2 = src6;
2819 src10_r = src54_r;
2820 src21_r = src65_r;
2821 src2110 = src6554;
2822 }
2823 }
2825 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2826 uint8_t *dst, int32_t dst_stride,
2827 const int8_t *filter, int32_t height)
2828 {
2829 uint32_t loop_cnt;
2830 v16i8 src0, src1, src2, src3, src4, src5, src6;
2831 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2832 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2833 v16u8 tmp0, tmp1, tmp2, tmp3;
2834 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2836 src -= src_stride;
2838 filt = LD_SH(filter);
2839 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2841 LD_SB3(src, src_stride, src0, src1, src2);
2842 src += (3 * src_stride);
2844 XORI_B3_128_SB(src0, src1, src2);
2845 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2846 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2848 for (loop_cnt = (height >> 2); loop_cnt--;) {
2849 LD_SB4(src, src_stride, src3, src4, src5, src6);
2850 src += (4 * src_stride);
2852 XORI_B4_128_SB(src3, src4, src5, src6);
2853 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2854 src32_r, src43_r, src54_r, src65_r);
2855 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2856 src32_l, src43_l, src54_l, src65_l);
2857 out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2858 out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2859 out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2860 out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2861 out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2862 out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2863 out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2864 out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2865 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2866 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2867 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2868 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2869 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2870 out3_r, tmp0, tmp1, tmp2, tmp3);
2871 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2872 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2873 dst += (4 * dst_stride);
2875 src10_r = src54_r;
2876 src21_r = src65_r;
2877 src10_l = src54_l;
2878 src21_l = src65_l;
2879 src2 = src6;
2880 }
2881 }
2883 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2884 uint8_t *dst, int32_t dst_stride,
2885 const int8_t *filter, int32_t height)
2886 {
2887 uint32_t loop_cnt;
2888 uint64_t out0, out1;
2889 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2890 v16i8 src11, filt0, filt1;
2891 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2892 v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2893 v16u8 out;
2894 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2896 src -= src_stride;
2898 filt = LD_SH(filter);
2899 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2902 LD_SB3(src, src_stride, src0, src1, src2);
2903 XORI_B3_128_SB(src0, src1, src2);
2904 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2905 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2908 LD_SB3(src + 16, src_stride, src6, src7, src8);
2909 src += (3 * src_stride);
2910 XORI_B3_128_SB(src6, src7, src8);
2911 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2913 for (loop_cnt = 8; loop_cnt--;) {
2915 LD_SB2(src, src_stride, src3, src4);
2916 XORI_B2_128_SB(src3, src4);
2917 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2918 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2921 LD_SB2(src + 16, src_stride, src9, src10);
2922 src += (2 * src_stride);
2923 XORI_B2_128_SB(src9, src10);
2924 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2927 out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2928 out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2929 out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2930 out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2933 out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2934 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2937 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2938 SRARI_H2_SH(out0_l, out1_l, 6);
2939 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2940 SAT_SH2_SH(out0_l, out1_l, 7);
2941 out = PCKEV_XORI128_UB(out0_r, out0_l);
2942 ST_UB(out, dst);
2943 PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2944 XORI_B2_128_SH(out2_r, out3_r);
2945 out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2946 out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2947 SD(out0, dst + 16);
2948 dst += dst_stride;
2949 out = PCKEV_XORI128_UB(out1_r, out1_l);
2950 ST_UB(out, dst);
2951 SD(out1, dst + 16);
2952 dst += dst_stride;
2955 LD_SB2(src, src_stride, src5, src2);
2956 XORI_B2_128_SB(src5, src2);
2957 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2958 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2961 LD_SB2(src + 16, src_stride, src11, src8);
2962 src += (2 * src_stride);
2963 XORI_B2_128_SB(src11, src8);
2964 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2967 out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2968 out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2969 out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2970 out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2973 out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2974 out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2977 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2978 SRARI_H2_SH(out0_l, out1_l, 6);
2979 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2980 SAT_SH2_SH(out0_l, out1_l, 7);
2981 out = PCKEV_XORI128_UB(out0_r, out0_l);
2982 ST_UB(out, dst);
2983 out = PCKEV_XORI128_UB(out2_r, out2_r);
2984 ST_D1(out, 0, dst + 16);
2985 dst += dst_stride;
2986 out = PCKEV_XORI128_UB(out1_r, out1_l);
2987 ST_UB(out, dst);
2988 out = PCKEV_XORI128_UB(out3_r, out3_r);
2989 ST_D1(out, 0, dst + 16);
2990 dst += dst_stride;
2991 }
2992 }
2994 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2995 uint8_t *dst, int32_t dst_stride,
2996 const int8_t *filter, int32_t height)
2997 {
2998 uint32_t loop_cnt;
2999 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3000 v16i8 src10_r, src32_r, src76_r, src98_r;
3001 v16i8 src21_r, src43_r, src87_r, src109_r;
3002 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3003 v16i8 src10_l, src32_l, src76_l, src98_l;
3004 v16i8 src21_l, src43_l, src87_l, src109_l;
3005 v16i8 filt0, filt1;
3006 v16u8 out;
3007 v8i16 filt;
3009 src -= src_stride;
3011 filt = LD_SH(filter);
3012 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3015 LD_SB3(src, src_stride, src0, src1, src2);
3016 XORI_B3_128_SB(src0, src1, src2);
3018 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3019 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3022 LD_SB3(src + 16, src_stride, src6, src7, src8);
3023 src += (3 * src_stride);
3025 XORI_B3_128_SB(src6, src7, src8);
3026 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3027 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3029 for (loop_cnt = (height >> 1); loop_cnt--;) {
3031 LD_SB2(src, src_stride, src3, src4);
3032 XORI_B2_128_SB(src3, src4);
3033 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3034 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3037 out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3038 out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3039 out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3040 out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3043 SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3044 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3045 out = PCKEV_XORI128_UB(out0_r, out0_l);
3046 ST_UB(out, dst);
3047 out = PCKEV_XORI128_UB(out1_r, out1_l);
3048 ST_UB(out, dst + dst_stride);
3050 src10_r = src32_r;
3051 src21_r = src43_r;
3052 src10_l = src32_l;
3053 src21_l = src43_l;
3054 src2 = src4;
3057 LD_SB2(src + 16, src_stride, src9, src10);
3058 src += (2 * src_stride);
3059 XORI_B2_128_SB(src9, src10);
3060 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3061 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3064 out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3065 out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3066 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3067 out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3070 SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3071 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3072 out = PCKEV_XORI128_UB(out2_r, out2_l);
3073 ST_UB(out, dst + 16);
3074 out = PCKEV_XORI128_UB(out3_r, out3_l);
3075 ST_UB(out, dst + 16 + dst_stride);
3077 dst += 2 * dst_stride;
3079 src76_r = src98_r;
3080 src87_r = src109_r;
3081 src76_l = src98_l;
3082 src87_l = src109_l;
3083 src8 = src10;
3084 }
3085 }
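/* 4-tap hv filters: horizontal pass kept at 16 bit, vertical pass at 32 bit, then >> 6 and rounding */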
3087 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3088 int32_t src_stride,
3089 uint8_t *dst,
3090 int32_t dst_stride,
3091 const int8_t *filter_x,
3092 const int8_t *filter_y)
3093 {
3094 v16u8 out;
3095 v16i8 src0, src1, src2, src3, src4;
3096 v8i16 filt0, filt1;
3097 v8i16 filt_h0, filt_h1;
3098 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3099 v16i8 mask1;
3100 v8i16 filter_vec, tmp;
3101 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3102 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
3103 v4i32 dst0, dst1;
3105 src -= (src_stride + 1);
3107 filter_vec = LD_SH(filter_x);
3108 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3110 filter_vec = LD_SH(filter_y);
3111 UNPCK_R_SB_SH(filter_vec, filter_vec);
3113 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3115 mask1 = mask0 + 2;
3117 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3118 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3120 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3121 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3122 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3124 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3125 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3126 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3128 ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3129 ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3131 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3132 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3133 dst0 >>= 6;
3134 dst1 >>= 6;
3135 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3136 tmp = __msa_srari_h(tmp, 6);
3137 tmp = __msa_sat_s_h(tmp, 7);
3138 out = PCKEV_XORI128_UB(tmp, tmp);
3139 ST_W2(out, 0, 1, dst, dst_stride);
3140 }
3142 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3143 int32_t src_stride,
3144 uint8_t *dst,
3145 int32_t dst_stride,
3146 const int8_t *filter_x,
3147 const int8_t *filter_y)
3148 {
3149 v16u8 out;
3150 v16i8 src0, src1, src2, src3, src4, src5, src6;
3151 v8i16 filt0, filt1;
3152 v8i16 filt_h0, filt_h1;
3153 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3154 v16i8 mask1;
3155 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3156 v8i16 filter_vec, tmp0, tmp1;
3157 v8i16 dst30, dst41, dst52, dst63;
3158 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3159 v4i32 dst0, dst1, dst2, dst3;
3161 src -= (src_stride + 1);
3163 filter_vec = LD_SH(filter_x);
3164 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3166 filter_vec = LD_SH(filter_y);
3167 UNPCK_R_SB_SH(filter_vec, filter_vec);
3169 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3171 mask1 = mask0 + 2;
3173 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3174 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3176 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3177 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3178 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3179 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3181 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3182 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3183 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3184 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3186 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3187 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3188 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3189 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3190 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3191 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3192 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
3193 SRA_4V(dst0, dst1, dst2, dst3, 6);
3194 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3195 SRARI_H2_SH(tmp0, tmp1, 6);
3196 SAT_SH2_SH(tmp0, tmp1, 7);
3197 out = PCKEV_XORI128_UB(tmp0, tmp1);
3198 ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
3199 }
3201 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3202 int32_t src_stride,
3203 uint8_t *dst,
3204 int32_t dst_stride,
3205 const int8_t *filter_x,
3206 const int8_t *filter_y,
3207 int32_t height)
3208 {
3209 uint32_t loop_cnt;
3210 v16u8 out0, out1;
3211 v16i8 src0, src1, src2, src3, src4, src5;
3212 v16i8 src6, src7, src8, src9, src10;
3213 v8i16 filt0, filt1;
3214 v8i16 filt_h0, filt_h1;
3215 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3216 v16i8 mask1;
3217 v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3218 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3219 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3220 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3221 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3222 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3223 v8i16 dst98_r, dst109_r;
3225 src -= (src_stride + 1);
3227 filter_vec = LD_SH(filter_x);
3228 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3230 filter_vec = LD_SH(filter_y);
3231 UNPCK_R_SB_SH(filter_vec, filter_vec);
3233 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3235 mask1 = mask0 + 2;
3237 LD_SB3(src, src_stride, src0, src1, src2);
3238 src += (3 * src_stride);
3240 XORI_B3_128_SB(src0, src1, src2);
3242 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3243 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3244 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3245 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3246 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3247 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
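/* dst22 duplicates row 2 so it can pair with row 3 (low half of dst73) in the next interleave */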
3249 for (loop_cnt = height >> 3; loop_cnt--;) {
3250 LD_SB8(src, src_stride,
3251 src3, src4, src5, src6, src7, src8, src9, src10);
3252 src += (8 * src_stride);
3254 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3256 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3257 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3258 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3259 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3261 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3262 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3263 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3264 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3266 dst32_r = __msa_ilvr_h(dst73, dst22);
3267 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3268 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3269 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3270 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3271 dst76_r = __msa_ilvr_h(dst22, dst106);
3273 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3274 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3275 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3276 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3277 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3278 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3279 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3280 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3281 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3282 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3283 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3284 dst5_r, dst4_r, dst7_r, dst6_r,
3285 tmp0, tmp1, tmp2, tmp3);
3286 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3287 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3288 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3289 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3290 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
3291 dst += (8 * dst_stride);
3293 dst10_r = dst98_r;
3294 dst21_r = dst109_r;
3295 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
3296 }
3297 }
3299 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3300 int32_t src_stride,
3301 uint8_t *dst,
3302 int32_t dst_stride,
3303 const int8_t *filter_x,
3304 const int8_t *filter_y,
3305 int32_t height)
3306 {
3307 if (2 == height) {
3308 hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3309 filter_x, filter_y);
3310 } else if (4 == height) {
3311 hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3312 filter_x, filter_y);
3313 } else if (0 == (height % 8)) {
3314 hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3315 filter_x, filter_y, height);
3316 }
3317 }
3319 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3320 int32_t src_stride,
3321 uint8_t *dst,
3322 int32_t dst_stride,
3323 const int8_t *filter_x,
3324 const int8_t *filter_y,
3325 int32_t height)
3326 {
3327 v16u8 out0, out1, out2;
3328 v16i8 src0, src1, src2, src3, src4, src5, src6;
3329 v16i8 src7, src8, src9, src10;
3330 v8i16 filt0, filt1;
3331 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3332 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3333 v16i8 mask1;
3334 v8i16 filt_h0, filt_h1, filter_vec;
3335 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3336 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3337 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3338 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3339 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3340 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3341 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3342 v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3343 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3345 src -= (src_stride + 1);
3347 filter_vec = LD_SH(filter_x);
3348 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3350 filter_vec = LD_SH(filter_y);
3351 UNPCK_R_SB_SH(filter_vec, filter_vec);
3353 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3355 mask1 = mask0 + 2;
3357 LD_SB3(src, src_stride, src0, src1, src2);
3358 src += (3 * src_stride);
3360 XORI_B3_128_SB(src0, src1, src2);
3362 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3363 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3364 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3366 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3367 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3368 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3370 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3371 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3373 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3374 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3376 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3377 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3378 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3379 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3381 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3382 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3383 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3384 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3386 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3387 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3388 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3389 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3391 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3392 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3393 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3394 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3396 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3397 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3398 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3399 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3400 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3401 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3402 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3403 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
3405 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3406 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3407 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
3409 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3410 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3411 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3412 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3413 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3414 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3415 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3416 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3417 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3418 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3419 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3420 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
3421 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3422 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3423 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3424 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3425 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3426 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3427 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3428 SRARI_H2_SH(tmp4, tmp5, 6);
3429 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3430 SAT_SH2_SH(tmp4, tmp5, 7);
3431 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3432 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3433 out2 = PCKEV_XORI128_UB(tmp4, tmp5);
3434 ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
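/* columns 0..3 of the 6-wide block went out as words above; columns 4..5 follow as halfwords */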
3435 ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
3436 }
3438 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3439 int32_t src_stride,
3440 uint8_t *dst,
3441 int32_t dst_stride,
3442 const int8_t *filter_x,
3443 const int8_t *filter_y)
3444 {
3445 v16u8 out;
3446 v16i8 src0, src1, src2, src3, src4;
3447 v8i16 filt0, filt1;
3448 v8i16 filt_h0, filt_h1, filter_vec;
3449 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3450 v16i8 mask1;
3451 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3452 v8i16 dst0, dst1, dst2, dst3, dst4;
3453 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3454 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3455 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3456 v8i16 out0_r, out1_r;
3458 src -= (src_stride + 1);
3460 filter_vec = LD_SH(filter_x);
3461 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3463 filter_vec = LD_SH(filter_y);
3464 UNPCK_R_SB_SH(filter_vec, filter_vec);
3466 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3468 mask1 = mask0 + 2;
3470 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3471 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3473 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3474 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3475 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3476 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3477 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3479 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3480 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3481 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3482 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3483 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3484 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3485 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3486 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3487 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3488 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3489 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3490 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3491 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3492 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3493 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3494 SRARI_H2_SH(out0_r, out1_r, 6);
3495 SAT_SH2_SH(out0_r, out1_r, 7);
3496 out = PCKEV_XORI128_UB(out0_r, out1_r);
3497 ST_D2(out, 0, 1, dst, dst_stride);
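
/* Width a multiple of 8, height exactly 4: each 8-column block needs
 * 7 input rows (4 output rows plus 3 rows of 4-tap vertical overlap)
 * and is processed independently, stepping 8 pixels at a time. */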
static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t width8mult)
{
    uint32_t cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    for (cnt = width8mult; cnt--;) {
        LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);

        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
        ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);

        src += 8;
        dst += 8;
    }
}
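
/* 8x6 case: all 9 input rows (6 output rows plus 3 rows of vertical
 * overlap) are filtered horizontally up front, then six vertical 4-tap
 * results are formed per interleaved half and written as six 8-byte
 * rows. */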
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y)
{
    v16u8 out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    src += (5 * src_stride);
    LD_SB4(src, src_stride, src5, src6, src7, src8);

    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    XORI_B4_128_SB(src5, src6, src7, src8);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);

    dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
    dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
    dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
    dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
    dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
    dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
    dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);

    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);

    SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
    SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
    SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
    SRARI_H2_SH(out4_r, out5_r, 6);
    SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
    SAT_SH2_SH(out4_r, out5_r, 7);
    out0 = PCKEV_XORI128_UB(out0_r, out1_r);
    out1 = PCKEV_XORI128_UB(out2_r, out3_r);
    out2 = PCKEV_XORI128_UB(out4_r, out5_r);

    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
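
/* General 8-column kernel: width is a multiple of 8 and height a
 * multiple of 4. The first 3 rows of each column block prime the
 * vertical filter window; the inner loop then emits 4 rows per
 * iteration, sliding the interleaved row pairs (dst10, dst21, dst2)
 * forward instead of refiltering them. */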
static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height,
                                           int32_t width8mult)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v8i16 filt_h0, filt_h1, filter_vec;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
    v16i8 mask1;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    for (cnt = width8mult; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

            dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
            dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
            dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
            dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
            ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);

            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

            SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            out0 = PCKEV_XORI128_UB(out0_r, out1_r);
            out1 = PCKEV_XORI128_UB(out2_r, out3_r);
            ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            dst10_r = dst54_r;
            dst10_l = dst54_l;
            dst21_r = dst65_r;
            dst21_l = dst65_l;
            dst2 = dst6;
        }

        src += 8;
        dst += 8;
    }
}
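
/* Width-8 dispatcher: specialized kernels for heights 2, 4 and 6,
 * generic multx4mult kernel for any other height that is a multiple
 * of 4. */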
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, 1);
    } else if (6 == height) {
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y);
    } else if (0 == (height % 4)) {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 1);
    }
}
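
/* Width 12 = one 8-column block (same scheme as the general kernel
 * above) plus a 4-column tail, where mask2/mask3 shuffle two rows into
 * one vector so 8 rows are produced per iteration. The fixed loop
 * counts (4 x 4 rows, then 2 x 8 rows) cover a height of 16. */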
static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 mask0, mask1, mask2, mask3;
    v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
    v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
    v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
    v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;

    src_tmp = src;
    dst_tmp = dst;

    LD_SB3(src_tmp, src_stride, src0, src1, src2);
    src_tmp += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
    dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);

    ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
    ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
        src_tmp += (4 * src_stride);
        XORI_B4_128_SB(src3, src4, src5, src6);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);

        dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
        ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
        ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
        ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);

        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);

        SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
                    dst3_r, tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
        dst_tmp += (4 * dst_stride);

        dst10_r = dst54_r;
        dst10_l = dst54_l;
        dst21_r = dst65_r;
        dst21_l = dst65_l;
        dsth2 = dsth6;
    }

    src += 8;
    dst += 8;

    mask2 = LD_SB(ff_hevc_mask_arr + 16);
    mask3 = mask2 + 2;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);
    XORI_B3_128_SB(src0, src1, src2);
    VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
    VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);

    dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);

    ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
    dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);
        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
        VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
        VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
        VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);

        dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
        dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
        dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
        dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);

        dst32_r = __msa_ilvr_h(dst73, dst22);
        ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
        ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
        ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
        dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
        dst76_r = __msa_ilvr_h(dst22, dst106);

        dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
        dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
        SRA_4V(dst0, dst1, dst2, dst3, 6);
        SRA_4V(dst4, dst5, dst6, dst7, 6);
        PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
                    tmp0, tmp1, tmp2, tmp3);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
        dst += (8 * dst_stride);

        dst10_r = dst98_r;
        dst21_r = dst109_r;
        dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
    }
}
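
/* Width 16: two 8-column blocks; height 4 goes to the multx4 kernel,
 * all other heights to the general multx4mult kernel. */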
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
                                   filter_y, 2);
    } else {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 2);
    }
}
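
/* Width 24: three 8-column blocks through the general kernel. */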
static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 3);
}
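
/* Width 32: four 8-column blocks through the general kernel. */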
static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 4);
}
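
/* Emits the uni copy (integer-pel) wrappers with the signature expected
 * by hevcdsp. As an illustration, UNI_MC_COPY(8) expands to a function
 * equivalent to:
 *
 *     void ff_hevc_put_hevc_uni_pel_pixels8_8_msa(uint8_t *dst,
 *                                                 ptrdiff_t dst_stride,
 *                                                 uint8_t *src,
 *                                                 ptrdiff_t src_stride,
 *                                                 int height, intptr_t mx,
 *                                                 intptr_t my, int width)
 *     {
 *         copy_width8_msa(src, src_stride, dst, dst_stride, height);
 *     }
 */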
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);

#undef UNI_MC_COPY
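
/* Emits the uni 1-D interpolation wrappers; FILT_DIR picks which
 * fractional offset (mx or my) indexes the filter table. For example,
 * UNI_MC(qpel, h, 4, 8, hz, mx) expands to a function that calls
 * common_hz_8t_4w_msa() with ff_hevc_qpel_filters[mx - 1]. */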
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       uint8_t *src,          \
                                                       ptrdiff_t src_stride,  \
                                                       int height,            \
                                                       intptr_t mx,           \
                                                       intptr_t my,           \
                                                       int width)             \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
                                                                              \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                            filter, height);                  \
}

UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC
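
/* Emits the uni 2-D (hv) interpolation wrappers; mx selects the
 * horizontal filter, my the vertical one. For example,
 * UNI_MC_HV(epel, 8, 4) expands to ff_hevc_put_hevc_uni_epel_hv8_8_msa(),
 * which forwards to hevc_hv_uni_4t_8w_msa() with
 * ff_hevc_epel_filters[mx - 1] and ff_hevc_epel_filters[my - 1]. */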
#define UNI_MC_HV(PEL, WIDTH, TAP)                                        \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,         \
                                                    ptrdiff_t dst_stride, \
                                                    uint8_t *src,         \
                                                    ptrdiff_t src_stride, \
                                                    int height,           \
                                                    intptr_t mx,          \
                                                    intptr_t my,          \
                                                    int width)            \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                        filter_x, filter_y, height);      \
}

UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);

#undef UNI_MC_HV