2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
25 static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
31 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
34 #define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
35 mask0, mask1, mask2, mask3, \
36 filt0, filt1, filt2, filt3, \
39 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
41 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
42 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
43 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
44 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
45 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m); \
46 DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1); \
47 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m); \
48 DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1); \
51 #define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
52 mask0, mask1, mask2, mask3, \
53 filt0, filt1, filt2, filt3, \
54 out0, out1, out2, out3) \
56 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m; \
58 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
59 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
60 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
61 out0, out1, out2, out3); \
62 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m); \
63 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m); \
64 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
65 out0, out1, out2, out3); \
66 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m); \
67 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m); \
68 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
69 out0, out1, out2, out3); \
70 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m); \
71 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m); \
72 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
73 out0, out1, out2, out3); \
76 #define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, \
77 mask0, mask1, filt0, filt1, \
80 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
82 VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m); \
83 DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1); \
84 VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m); \
85 DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1); \
88 #define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, \
89 mask0, mask1, filt0, filt1, \
90 out0, out1, out2, out3) \
92 v16i8 vec0_m, vec1_m, vec2_m, vec3_m; \
94 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m); \
95 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m); \
96 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
97 out0, out1, out2, out3); \
98 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m); \
99 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m); \
100 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
101 out0, out1, out2, out3); \
104 static void copy_width8_msa(uint8_t *src, int32_t src_stride,
105 uint8_t *dst, int32_t dst_stride,
109 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
112 LD2(src, src_stride, out0, out1);
116 } else if (6 == height) {
117 LD4(src, src_stride, out0, out1, out2, out3);
118 src += (4 * src_stride);
119 SD4(out0, out1, out2, out3, dst, dst_stride);
120 dst += (4 * dst_stride);
121 LD2(src, src_stride, out0, out1);
125 } else if (0 == (height % 8)) {
126 for (cnt = (height >> 3); cnt--;) {
127 LD4(src, src_stride, out0, out1, out2, out3);
128 src += (4 * src_stride);
129 LD4(src, src_stride, out4, out5, out6, out7);
130 src += (4 * src_stride);
131 SD4(out0, out1, out2, out3, dst, dst_stride);
132 dst += (4 * dst_stride);
133 SD4(out4, out5, out6, out7, dst, dst_stride);
134 dst += (4 * dst_stride);
136 } else if (0 == (height % 4)) {
137 for (cnt = (height >> 2); cnt--;) {
138 LD4(src, src_stride, out0, out1, out2, out3);
139 src += (4 * src_stride);
140 SD4(out0, out1, out2, out3, dst, dst_stride);
141 dst += (4 * dst_stride);
146 static void copy_width12_msa(uint8_t *src, int32_t src_stride,
147 uint8_t *dst, int32_t dst_stride,
150 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
152 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
153 src += (8 * src_stride);
154 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
155 dst += (8 * dst_stride);
156 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
157 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
160 static void copy_width16_msa(uint8_t *src, int32_t src_stride,
161 uint8_t *dst, int32_t dst_stride,
165 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
168 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
169 src += (8 * src_stride);
170 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
171 dst += (8 * dst_stride);
172 LD_UB4(src, src_stride, src0, src1, src2, src3);
173 src += (4 * src_stride);
174 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
175 dst += (4 * dst_stride);
176 } else if (0 == (height % 8)) {
177 for (cnt = (height >> 3); cnt--;) {
178 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
180 src += (8 * src_stride);
181 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
183 dst += (8 * dst_stride);
185 } else if (0 == (height % 4)) {
186 for (cnt = (height >> 2); cnt--;) {
187 LD_UB4(src, src_stride, src0, src1, src2, src3);
188 src += (4 * src_stride);
190 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
191 dst += (4 * dst_stride);
196 static void copy_width24_msa(uint8_t *src, int32_t src_stride,
197 uint8_t *dst, int32_t dst_stride,
201 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
202 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
204 for (cnt = 4; cnt--;) {
205 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
206 LD4(src + 16, src_stride, out0, out1, out2, out3);
207 src += (4 * src_stride);
208 LD4(src + 16, src_stride, out4, out5, out6, out7);
209 src += (4 * src_stride);
211 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
212 SD4(out0, out1, out2, out3, dst + 16, dst_stride);
213 dst += (4 * dst_stride);
214 SD4(out4, out5, out6, out7, dst + 16, dst_stride);
215 dst += (4 * dst_stride);
219 static void copy_width32_msa(uint8_t *src, int32_t src_stride,
220 uint8_t *dst, int32_t dst_stride,
224 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
226 for (cnt = (height >> 2); cnt--;) {
227 LD_UB4(src, src_stride, src0, src1, src2, src3);
228 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
229 src += (4 * src_stride);
230 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
231 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
232 dst += (4 * dst_stride);
236 static void copy_width48_msa(uint8_t *src, int32_t src_stride,
237 uint8_t *dst, int32_t dst_stride,
241 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
244 for (cnt = (height >> 2); cnt--;) {
245 LD_UB4(src, src_stride, src0, src1, src2, src3);
246 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
247 LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
248 src += (4 * src_stride);
250 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
251 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
252 ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
253 dst += (4 * dst_stride);
257 static void copy_width64_msa(uint8_t *src, int32_t src_stride,
258 uint8_t *dst, int32_t dst_stride,
262 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
263 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
265 for (cnt = (height >> 2); cnt--;) {
266 LD_UB4(src, 16, src0, src1, src2, src3);
268 LD_UB4(src, 16, src4, src5, src6, src7);
270 LD_UB4(src, 16, src8, src9, src10, src11);
272 LD_UB4(src, 16, src12, src13, src14, src15);
275 ST_UB4(src0, src1, src2, src3, dst, 16);
277 ST_UB4(src4, src5, src6, src7, dst, 16);
279 ST_UB4(src8, src9, src10, src11, dst, 16);
281 ST_UB4(src12, src13, src14, src15, dst, 16);
286 static const uint8_t mc_filt_mask_arr[16 * 3] = {
288 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
290 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
292 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
295 #define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1) \
299 tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0); \
300 tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1); \
305 static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
306 uint8_t *dst, int32_t dst_stride,
307 const int8_t *filter)
309 v16u8 mask0, mask1, mask2, mask3, out;
310 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
311 v8i16 filt, out0, out1;
313 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
316 /* rearranging filter */
317 filt = LD_SH(filter);
318 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
324 LD_SB4(src, src_stride, src0, src1, src2, src3);
325 XORI_B4_128_SB(src0, src1, src2, src3);
326 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
327 mask3, filt0, filt1, filt2, filt3, out0, out1);
328 SRARI_H2_SH(out0, out1, 6);
329 SAT_SH2_SH(out0, out1, 7);
330 out = PCKEV_XORI128_UB(out0, out1);
331 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
334 static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
335 uint8_t *dst, int32_t dst_stride,
336 const int8_t *filter)
338 v16i8 filt0, filt1, filt2, filt3;
339 v16i8 src0, src1, src2, src3;
340 v16u8 mask0, mask1, mask2, mask3, out;
341 v8i16 filt, out0, out1, out2, out3;
343 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
346 /* rearranging filter */
347 filt = LD_SH(filter);
348 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
354 LD_SB4(src, src_stride, src0, src1, src2, src3);
355 XORI_B4_128_SB(src0, src1, src2, src3);
356 src += (4 * src_stride);
357 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
358 mask3, filt0, filt1, filt2, filt3, out0, out1);
359 LD_SB4(src, src_stride, src0, src1, src2, src3);
360 XORI_B4_128_SB(src0, src1, src2, src3);
361 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
362 mask3, filt0, filt1, filt2, filt3, out2, out3);
363 SRARI_H4_SH(out0, out1, out2, out3, 6);
364 SAT_SH4_SH(out0, out1, out2, out3, 7);
365 out = PCKEV_XORI128_UB(out0, out1);
366 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
367 dst += (4 * dst_stride);
368 out = PCKEV_XORI128_UB(out2, out3);
369 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
372 static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
373 uint8_t *dst, int32_t dst_stride,
374 const int8_t *filter)
376 v16u8 mask0, mask1, mask2, mask3, out;
377 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
378 v8i16 filt, out0, out1, out2, out3;
380 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
383 /* rearranging filter */
384 filt = LD_SH(filter);
385 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
391 LD_SB4(src, src_stride, src0, src1, src2, src3);
392 XORI_B4_128_SB(src0, src1, src2, src3);
393 src += (4 * src_stride);
394 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
395 mask3, filt0, filt1, filt2, filt3, out0, out1);
396 LD_SB4(src, src_stride, src0, src1, src2, src3);
397 XORI_B4_128_SB(src0, src1, src2, src3);
398 src += (4 * src_stride);
399 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
400 mask3, filt0, filt1, filt2, filt3, out2, out3);
401 SRARI_H4_SH(out0, out1, out2, out3, 6);
402 SAT_SH4_SH(out0, out1, out2, out3, 7);
403 out = PCKEV_XORI128_UB(out0, out1);
404 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
405 dst += (4 * dst_stride);
406 out = PCKEV_XORI128_UB(out2, out3);
407 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
408 dst += (4 * dst_stride);
410 LD_SB4(src, src_stride, src0, src1, src2, src3);
411 XORI_B4_128_SB(src0, src1, src2, src3);
412 src += (4 * src_stride);
413 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
414 mask3, filt0, filt1, filt2, filt3, out0, out1);
415 LD_SB4(src, src_stride, src0, src1, src2, src3);
416 XORI_B4_128_SB(src0, src1, src2, src3);
417 src += (4 * src_stride);
418 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
419 mask3, filt0, filt1, filt2, filt3, out2, out3);
421 SRARI_H4_SH(out0, out1, out2, out3, 6);
422 SAT_SH4_SH(out0, out1, out2, out3, 7);
423 out = PCKEV_XORI128_UB(out0, out1);
424 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
425 dst += (4 * dst_stride);
426 out = PCKEV_XORI128_UB(out2, out3);
427 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
430 static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
431 uint8_t *dst, int32_t dst_stride,
432 const int8_t *filter, int32_t height)
435 common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
436 } else if (8 == height) {
437 common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
438 } else if (16 == height) {
439 common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
443 static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
444 uint8_t *dst, int32_t dst_stride,
445 const int8_t *filter, int32_t height)
448 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
449 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
450 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
451 v8i16 filt, out0, out1, out2, out3;
453 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
456 /* rearranging filter */
457 filt = LD_SH(filter);
458 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
464 for (loop_cnt = (height >> 2); loop_cnt--;) {
465 LD_SB4(src, src_stride, src0, src1, src2, src3);
466 XORI_B4_128_SB(src0, src1, src2, src3);
467 src += (4 * src_stride);
469 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
470 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
471 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
472 out0, out1, out2, out3);
473 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
474 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
475 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
476 out0, out1, out2, out3);
477 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
478 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
479 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
480 out0, out1, out2, out3);
481 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
482 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
483 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
484 out0, out1, out2, out3);
486 SRARI_H4_SH(out0, out1, out2, out3, 6);
487 SAT_SH4_SH(out0, out1, out2, out3, 7);
488 tmp0 = PCKEV_XORI128_UB(out0, out1);
489 tmp1 = PCKEV_XORI128_UB(out2, out3);
490 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
491 dst += (4 * dst_stride);
495 static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
496 uint8_t *dst, int32_t dst_stride,
497 const int8_t *filter, int32_t height)
500 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
501 v16u8 tmp0, tmp1, tmp2;
502 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
503 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
504 v16i8 filt0, filt1, filt2, filt3;
505 v8i16 filt, out0, out1, out2, out3, out4, out5;
507 mask00 = LD_UB(&ff_hevc_mask_arr[0]);
508 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
512 /* rearranging filter */
513 filt = LD_SH(filter);
514 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
523 for (loop_cnt = 4; loop_cnt--;) {
525 LD_SB4(src, src_stride, src0, src1, src2, src3);
527 LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
529 XORI_B4_128_SB(src0, src1, src2, src3);
530 XORI_B4_128_SB(src4, src5, src6, src7);
531 src += (4 * src_stride);
533 VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
534 VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
535 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
537 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
538 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
539 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
541 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
542 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
543 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
545 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
546 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
547 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
551 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
552 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
553 VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
554 DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
555 VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
556 DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
557 VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
558 DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);
560 SRARI_H4_SH(out0, out1, out2, out3, 6);
561 SRARI_H2_SH(out4, out5, 6);
562 SAT_SH4_SH(out0, out1, out2, out3, 7);
563 SAT_SH2_SH(out4, out5, 7);
564 tmp0 = PCKEV_XORI128_UB(out0, out1);
565 tmp1 = PCKEV_XORI128_UB(out2, out3);
566 tmp2 = PCKEV_XORI128_UB(out4, out5);
568 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
569 ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
570 dst += (4 * dst_stride);
574 static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
575 uint8_t *dst, int32_t dst_stride,
576 const int8_t *filter, int32_t height)
579 v16u8 mask0, mask1, mask2, mask3, out;
580 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
581 v16i8 filt0, filt1, filt2, filt3;
582 v8i16 filt, out0, out1, out2, out3;
584 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
587 /* rearranging filter */
588 filt = LD_SH(filter);
589 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
595 for (loop_cnt = (height >> 2); loop_cnt--;) {
596 LD_SB2(src, src_stride, src0, src2);
597 LD_SB2(src + 8, src_stride, src1, src3);
598 src += (2 * src_stride);
600 LD_SB2(src, src_stride, src4, src6);
601 LD_SB2(src + 8, src_stride, src5, src7);
602 src += (2 * src_stride);
604 XORI_B4_128_SB(src0, src1, src2, src3);
605 XORI_B4_128_SB(src4, src5, src6, src7);
606 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
607 mask3, filt0, filt1, filt2, filt3, out0,
609 SRARI_H4_SH(out0, out1, out2, out3, 6);
610 SAT_SH4_SH(out0, out1, out2, out3, 7);
611 out = PCKEV_XORI128_UB(out0, out1);
614 out = PCKEV_XORI128_UB(out2, out3);
618 HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
619 mask3, filt0, filt1, filt2, filt3, out0,
621 SRARI_H4_SH(out0, out1, out2, out3, 6);
622 SAT_SH4_SH(out0, out1, out2, out3, 7);
623 out = PCKEV_XORI128_UB(out0, out1);
626 out = PCKEV_XORI128_UB(out2, out3);
632 static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
633 uint8_t *dst, int32_t dst_stride,
634 const int8_t *filter, int32_t height)
637 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
638 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
639 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
641 v8i16 out0, out1, out2, out3, out8, out9, filt;
643 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
646 /* rearranging filter */
647 filt = LD_SH(filter);
648 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
658 for (loop_cnt = 16; loop_cnt--;) {
659 LD_SB2(src, src_stride, src0, src2);
660 LD_SB2(src + 16, src_stride, src1, src3);
661 XORI_B4_128_SB(src0, src1, src2, src3);
662 src += (2 * src_stride);
663 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
664 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
665 VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
666 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
668 DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
669 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
670 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
671 VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
672 DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
673 out0, out8, out2, out9);
674 DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
675 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
676 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
677 VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
678 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
679 out0, out8, out2, out9);
680 DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
681 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
682 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
683 VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
684 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
685 out0, out8, out2, out9);
686 DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
687 SRARI_H4_SH(out0, out8, out2, out9, 6);
688 SRARI_H2_SH(out1, out3, 6);
689 SAT_SH4_SH(out0, out8, out2, out9, 7);
690 SAT_SH2_SH(out1, out3, 7);
691 out = PCKEV_XORI128_UB(out8, out9);
692 ST8x2_UB(out, dst + 16, dst_stride);
693 out = PCKEV_XORI128_UB(out0, out1);
696 out = PCKEV_XORI128_UB(out2, out3);
702 static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
703 uint8_t *dst, int32_t dst_stride,
704 const int8_t *filter, int32_t height)
707 v16u8 mask0, mask1, mask2, mask3, out;
708 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
709 v16i8 filt0, filt1, filt2, filt3;
710 v8i16 filt, out0, out1, out2, out3;
712 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
715 /* rearranging filter */
716 filt = LD_SH(filter);
717 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
723 for (loop_cnt = (height >> 1); loop_cnt--;) {
725 src1 = LD_SB(src + 8);
726 src2 = LD_SB(src + 16);
727 src3 = LD_SB(src + 24);
729 XORI_B4_128_SB(src0, src1, src2, src3);
732 src5 = LD_SB(src + 8);
733 src6 = LD_SB(src + 16);
734 src7 = LD_SB(src + 24);
736 XORI_B4_128_SB(src4, src5, src6, src7);
738 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
739 mask3, filt0, filt1, filt2, filt3, out0,
741 SRARI_H4_SH(out0, out1, out2, out3, 6);
742 SAT_SH4_SH(out0, out1, out2, out3, 7);
744 out = PCKEV_XORI128_UB(out0, out1);
746 out = PCKEV_XORI128_UB(out2, out3);
747 ST_UB(out, dst + 16);
750 HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
751 mask3, filt0, filt1, filt2, filt3, out0,
753 SRARI_H4_SH(out0, out1, out2, out3, 6);
754 SAT_SH4_SH(out0, out1, out2, out3, 7);
755 out = PCKEV_XORI128_UB(out0, out1);
757 out = PCKEV_XORI128_UB(out2, out3);
758 ST_UB(out, dst + 16);
763 static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
764 uint8_t *dst, int32_t dst_stride,
765 const int8_t *filter, int32_t height)
768 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
770 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
771 v8i16 filt, out0, out1, out2, out3;
773 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
776 /* rearranging filter */
777 filt = LD_SH(filter);
778 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
788 for (loop_cnt = 64; loop_cnt--;) {
790 src1 = LD_SB(src + 8);
791 src2 = LD_SB(src + 16);
792 src3 = LD_SB(src + 32);
793 src4 = LD_SB(src + 40);
796 XORI_B4_128_SB(src0, src1, src2, src3);
797 src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
799 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
801 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
802 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
804 DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
805 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
806 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
808 DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
809 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
811 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
813 DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
814 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
816 SRARI_H2_SH(out0, out1, 6);
817 out3 = __msa_srari_h(out2, 6);
818 SAT_SH3_SH(out0, out1, out3, 7);
819 out = PCKEV_XORI128_UB(out0, out1);
822 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
824 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
825 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
827 DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
828 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
829 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
831 DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
832 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
833 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
835 DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
836 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
838 SRARI_H2_SH(out0, out1, 6);
839 out2 = __msa_srari_h(out2, 6);
840 SAT_SH3_SH(out0, out1, out2, 7);
841 out = PCKEV_XORI128_UB(out3, out0);
842 ST_UB(out, dst + 16);
843 out = PCKEV_XORI128_UB(out1, out2);
844 ST_UB(out, dst + 32);
849 static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
850 uint8_t *dst, int32_t dst_stride,
851 const int8_t *filter, int32_t height)
854 v16u8 mask0, mask1, mask2, mask3, out;
855 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
856 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
857 v16i8 filt0, filt1, filt2, filt3;
858 v8i16 res0, res1, res2, res3, filt;
860 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
863 /* rearranging filter */
864 filt = LD_SH(filter);
865 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
871 for (loop_cnt = height; loop_cnt--;) {
872 LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
875 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
877 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
878 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
879 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
881 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
882 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
883 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
885 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
886 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
887 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
889 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
890 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
891 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
894 SRARI_H4_SH(res0, res1, res2, res3, 6);
895 SAT_SH4_SH(res0, res1, res2, res3, 7);
896 out = PCKEV_XORI128_UB(res0, res1);
898 out = PCKEV_XORI128_UB(res2, res3);
899 ST_UB(out, dst + 16);
901 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
902 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
903 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
905 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
906 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
907 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
909 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
910 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
911 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
913 VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
914 VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
915 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
918 SRARI_H4_SH(res0, res1, res2, res3, 6);
919 SAT_SH4_SH(res0, res1, res2, res3, 7);
920 out = PCKEV_XORI128_UB(res0, res1);
921 ST_UB(out, dst + 32);
922 out = PCKEV_XORI128_UB(res2, res3);
923 ST_UB(out, dst + 48);
928 static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
929 uint8_t *dst, int32_t dst_stride,
930 const int8_t *filter, int32_t height)
934 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
935 v16i8 src11, src12, src13, src14;
936 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
937 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
938 v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
939 v16i8 src10998, filt0, filt1, filt2, filt3;
940 v8i16 filt, out10, out32, out54, out76;
942 src -= (3 * src_stride);
944 filt = LD_SH(filter);
945 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
947 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
948 src += (7 * src_stride);
950 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
952 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
953 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
955 XORI_B3_128_SB(src2110, src4332, src6554);
957 for (loop_cnt = (height >> 3); loop_cnt--;) {
958 LD_SB4(src, src_stride, src7, src8, src9, src10);
959 src += (4 * src_stride);
960 LD_SB4(src, src_stride, src11, src12, src13, src14);
961 src += (4 * src_stride);
963 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
964 src87_r, src98_r, src109_r);
965 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
966 src1110_r, src1211_r, src1312_r, src1413_r);
967 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
968 ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
969 src12111110, src14131312);
970 XORI_B2_128_SB(src8776, src10998);
971 XORI_B2_128_SB(src12111110, src14131312);
973 DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
974 DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
975 DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
976 DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
977 DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
978 DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
979 DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
980 DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
981 SRARI_H2_SH(out10, out32, 6);
982 SRARI_H2_SH(out54, out76, 6);
983 SAT_SH2_SH(out10, out32, 7);
984 SAT_SH2_SH(out54, out76, 7);
985 out0 = PCKEV_XORI128_UB(out10, out32);
986 out1 = PCKEV_XORI128_UB(out54, out76);
987 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
988 dst += (4 * dst_stride);
989 ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
990 dst += (4 * dst_stride);
993 src4332 = src12111110;
994 src6554 = src14131312;
999 static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
1000 uint8_t *dst, int32_t dst_stride,
1001 const int8_t *filter, int32_t height)
1004 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1005 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1006 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
1008 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
1010 src -= (3 * src_stride);
1012 filt = LD_SH(filter);
1013 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1015 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1016 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1017 src += (7 * src_stride);
1018 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1020 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1022 for (loop_cnt = (height >> 2); loop_cnt--;) {
1023 LD_SB4(src, src_stride, src7, src8, src9, src10);
1024 XORI_B4_128_SB(src7, src8, src9, src10);
1025 src += (4 * src_stride);
1027 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1028 src87_r, src98_r, src109_r);
1029 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1030 filt0, out0_r, out1_r, out2_r, out3_r);
1031 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1032 filt1, out0_r, out1_r, out2_r, out3_r);
1033 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1034 filt2, out0_r, out1_r, out2_r, out3_r);
1035 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1036 filt3, out0_r, out1_r, out2_r, out3_r);
1037 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1038 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1039 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
1040 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
1041 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1042 dst += (4 * dst_stride);
1054 static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1055 uint8_t *dst, int32_t dst_stride,
1056 const int8_t *filter, int32_t height)
1059 uint32_t out2, out3;
1060 uint64_t out0, out1;
1061 v16u8 tmp0, tmp1, tmp2, tmp3;
1062 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1063 v16i8 filt0, filt1, filt2, filt3;
1064 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1065 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1066 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1067 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1069 src -= (3 * src_stride);
1071 filt = LD_SH(filter);
1072 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1074 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1075 src += (7 * src_stride);
1077 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1079 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1081 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1082 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1084 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1086 for (loop_cnt = 4; loop_cnt--;) {
1087 LD_SB4(src, src_stride, src7, src8, src9, src10);
1088 XORI_B4_128_SB(src7, src8, src9, src10);
1089 src += (4 * src_stride);
1091 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1092 src87_r, src98_r, src109_r);
1093 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1094 src87_l, src98_l, src109_l);
1095 out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1096 filt1, filt2, filt3);
1097 out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1098 filt1, filt2, filt3);
1099 out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1100 filt1, filt2, filt3);
1101 out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1102 filt1, filt2, filt3);
1103 out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1104 filt1, filt2, filt3);
1105 out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1106 filt1, filt2, filt3);
1107 out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1108 filt1, filt2, filt3);
1109 out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1110 filt1, filt2, filt3);
1111 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1112 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1113 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1114 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1115 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1116 out3_r, tmp0, tmp1, tmp2, tmp3);
1117 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1119 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
1120 out1 = __msa_copy_u_d((v2i64) tmp1, 0);
1121 out2 = __msa_copy_u_w((v4i32) tmp0, 2);
1122 out3 = __msa_copy_u_w((v4i32) tmp1, 2);
1124 SW(out2, (dst + 8));
1127 SW(out3, (dst + 8));
1129 out0 = __msa_copy_u_d((v2i64) tmp2, 0);
1130 out1 = __msa_copy_u_d((v2i64) tmp3, 0);
1131 out2 = __msa_copy_u_w((v4i32) tmp2, 2);
1132 out3 = __msa_copy_u_w((v4i32) tmp3, 2);
1134 SW(out2, (dst + 8));
1137 SW(out3, (dst + 8));
1156 static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1157 uint8_t *dst, int32_t dst_stride,
1158 const int8_t *filter, int32_t height)
1161 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1162 v16i8 filt0, filt1, filt2, filt3;
1163 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1164 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1165 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1166 v16u8 tmp0, tmp1, tmp2, tmp3;
1167 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1169 src -= (3 * src_stride);
1171 filt = LD_SH(filter);
1172 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1174 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1175 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1176 src += (7 * src_stride);
1177 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1179 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1180 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1182 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1184 for (loop_cnt = (height >> 2); loop_cnt--;) {
1185 LD_SB4(src, src_stride, src7, src8, src9, src10);
1186 XORI_B4_128_SB(src7, src8, src9, src10);
1187 src += (4 * src_stride);
1189 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1190 src87_r, src98_r, src109_r);
1191 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1192 src87_l, src98_l, src109_l);
1193 out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1194 filt1, filt2, filt3);
1195 out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1196 filt1, filt2, filt3);
1197 out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1198 filt1, filt2, filt3);
1199 out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1200 filt1, filt2, filt3);
1201 out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1202 filt1, filt2, filt3);
1203 out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1204 filt1, filt2, filt3);
1205 out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1206 filt1, filt2, filt3);
1207 out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1208 filt1, filt2, filt3);
1209 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1210 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1211 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1212 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1213 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1214 out3_r, tmp0, tmp1, tmp2, tmp3);
1215 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1216 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1217 dst += (4 * dst_stride);
1235 static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
1236 uint8_t *dst, int32_t dst_stride,
1237 const int8_t *filter, int32_t height,
1242 uint32_t loop_cnt, cnt;
1243 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1244 v16i8 filt0, filt1, filt2, filt3;
1245 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1246 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1247 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1248 v16u8 tmp0, tmp1, tmp2, tmp3;
1249 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1251 src -= (3 * src_stride);
1253 filt = LD_SH(filter);
1254 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1256 for (cnt = (width >> 4); cnt--;) {
1260 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1261 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1262 src_tmp += (7 * src_stride);
1263 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1264 src32_r, src54_r, src21_r);
1265 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1266 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1267 src32_l, src54_l, src21_l);
1268 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1270 for (loop_cnt = (height >> 2); loop_cnt--;) {
1271 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1272 XORI_B4_128_SB(src7, src8, src9, src10);
1273 src_tmp += (4 * src_stride);
1274 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1275 src87_r, src98_r, src109_r);
1276 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1277 src87_l, src98_l, src109_l);
1278 out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
1279 filt0, filt1, filt2, filt3);
1280 out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
1281 filt0, filt1, filt2, filt3);
1282 out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
1283 filt0, filt1, filt2, filt3);
1284 out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
1285 filt0, filt1, filt2, filt3);
1286 out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
1287 filt0, filt1, filt2, filt3);
1288 out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
1289 filt0, filt1, filt2, filt3);
1290 out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
1291 filt0, filt1, filt2, filt3);
1292 out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
1293 filt0, filt1, filt2, filt3);
1294 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1295 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1296 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1297 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1298 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1299 out3_r, tmp0, tmp1, tmp2, tmp3);
1300 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1301 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1302 dst_tmp += (4 * dst_stride);
1324 static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1325 uint8_t *dst, int32_t dst_stride,
1326 const int8_t *filter, int32_t height)
1328 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1331 common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
1335 static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1336 uint8_t *dst, int32_t dst_stride,
1337 const int8_t *filter, int32_t height)
1339 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1343 static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1344 uint8_t *dst, int32_t dst_stride,
1345 const int8_t *filter, int32_t height)
1347 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1351 static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1352 uint8_t *dst, int32_t dst_stride,
1353 const int8_t *filter, int32_t height)
1355 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1359 static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
1363 const int8_t *filter_x,
1364 const int8_t *filter_y,
1369 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1370 v16i8 src9, src10, src11, src12, src13, src14;
1371 v8i16 filt0, filt1, filt2, filt3;
1372 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1373 v16i8 mask1, mask2, mask3;
1375 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1376 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1377 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1378 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1379 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1380 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1381 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
1383 src -= ((3 * src_stride) + 3);
1384 filter_vec = LD_SH(filter_x);
1385 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1387 filter_vec = LD_SH(filter_y);
1388 UNPCK_R_SB_SH(filter_vec, filter_vec);
1390 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1396 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1397 src += (7 * src_stride);
1398 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1400 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1401 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1402 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1403 vec8, vec9, vec10, vec11);
1404 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1405 vec12, vec13, vec14, vec15);
1407 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1409 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1411 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1413 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1416 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1417 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1418 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1420 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
1422 for (loop_cnt = height >> 3; loop_cnt--;) {
1423 LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1425 src += (8 * src_stride);
1426 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1428 VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1429 vec0, vec1, vec2, vec3);
1430 VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1431 vec4, vec5, vec6, vec7);
1432 VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1433 vec8, vec9, vec10, vec11);
1434 VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1435 vec12, vec13, vec14, vec15);
1437 dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1439 dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1441 dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1443 dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1446 dst76_r = __msa_ilvr_h(dst117, dst66);
1447 ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1448 ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1449 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1450 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1451 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1453 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1454 filt_h1, filt_h2, filt_h3);
1455 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1456 filt_h1, filt_h2, filt_h3);
1457 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1458 filt_h1, filt_h2, filt_h3);
1459 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1460 filt_h1, filt_h2, filt_h3);
1461 dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1462 filt_h1, filt_h2, filt_h3);
1463 dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1464 filt_h1, filt_h2, filt_h3);
1465 dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1466 filt_h1, filt_h2, filt_h3);
1467 dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1468 filt_h0, filt_h1, filt_h2, filt_h3);
1470 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1471 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1472 SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1473 SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1474 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1475 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1476 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1477 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1478 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1479 out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1480 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
1481 dst += (4 * dst_stride);
1482 ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
1483 dst += (4 * dst_stride);
1486 dst32_r = dst1110_r;
1487 dst54_r = dst1312_r;
1489 dst43_r = dst1211_r;
1490 dst65_r = dst1413_r;
1491 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
1495 static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
1499 const int8_t *filter_x,
1500 const int8_t *filter_y,
1501 int32_t height, int32_t width)
1503 uint32_t loop_cnt, cnt;
1507 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1508 v8i16 filt0, filt1, filt2, filt3;
1509 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1510 v16i8 mask1, mask2, mask3;
1512 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1513 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1514 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1515 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1516 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1517 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1518 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1519 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1520 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1522 src -= ((3 * src_stride) + 3);
1524 filter_vec = LD_SH(filter_x);
1525 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1527 filter_vec = LD_SH(filter_y);
1528 UNPCK_R_SB_SH(filter_vec, filter_vec);
1530 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1536 for (cnt = width >> 3; cnt--;) {
1540 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1541 src_tmp += (7 * src_stride);
1542 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1544 /* row 0 row 1 row 2 row 3 */
1545 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1546 vec0, vec1, vec2, vec3);
1547 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1548 vec4, vec5, vec6, vec7);
1549 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1550 vec8, vec9, vec10, vec11);
1551 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1552 vec12, vec13, vec14, vec15);
1553 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1555 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1557 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1559 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1562 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1563 vec0, vec1, vec2, vec3);
1564 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1565 vec4, vec5, vec6, vec7);
1566 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1567 vec8, vec9, vec10, vec11);
1568 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1570 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1572 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1575 for (loop_cnt = height >> 1; loop_cnt--;) {
1576 LD_SB2(src_tmp, src_stride, src7, src8);
1577 XORI_B2_128_SB(src7, src8);
1578 src_tmp += 2 * src_stride;
1580 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1581 dst10_r, dst32_r, dst54_r, dst21_r);
1582 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1583 dst10_l, dst32_l, dst54_l, dst21_l);
1584 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1585 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1587 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1588 vec0, vec1, vec2, vec3);
1589 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1592 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1593 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1594 filt_h0, filt_h1, filt_h2, filt_h3);
1595 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1596 filt_h0, filt_h1, filt_h2, filt_h3);
1600 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1601 vec0, vec1, vec2, vec3);
1602 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1605 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1606 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1607 filt_h0, filt_h1, filt_h2, filt_h3);
1608 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1609 filt_h0, filt_h1, filt_h2, filt_h3);
1612 SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1613 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1615 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1616 out = PCKEV_XORI128_UB(dst0, dst1);
1617 ST8x2_UB(out, dst_tmp, dst_stride);
1618 dst_tmp += (2 * dst_stride);
1634 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1638 const int8_t *filter_x,
1639 const int8_t *filter_y,
1642 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1643 filter_x, filter_y, height, 8);
1646 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1650 const int8_t *filter_x,
1651 const int8_t *filter_y,
1655 uint8_t *src_tmp, *dst_tmp;
1657 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1658 v16i8 src11, src12, src13, src14;
1659 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1660 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1661 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1662 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1663 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1664 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1665 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1666 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1667 v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1668 v8i16 dst1413_r, dst87_l, filter_vec;
1669 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1670 v4i32 dst0_l, dst1_l;
1672 src -= ((3 * src_stride) + 3);
1674 filter_vec = LD_SH(filter_x);
1675 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1677 filter_vec = LD_SH(filter_y);
1678 UNPCK_R_SB_SH(filter_vec, filter_vec);
1680 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1682 mask0 = LD_SB(ff_hevc_mask_arr);
1690 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1691 src_tmp += (7 * src_stride);
1692 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1694 /* row 0 row 1 row 2 row 3 */
1695 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1696 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1697 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1699 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1701 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1703 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1705 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1707 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1710 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1711 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1712 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1714 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1716 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1718 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src_tmp, src_stride, src7, src8);
        XORI_B2_128_SB(src7, src8);
        src_tmp += 2 * src_stride;

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
                   dst32_r, dst54_r, dst21_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
                   dst32_l, dst54_l, dst21_l);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_r >>= 6;
        dst0_l >>= 6;

        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_r >>= 6;
        dst1_l >>= 6;

        SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

        PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
        out0 = PCKEV_XORI128_UB(dst0, dst1);
        ST8x2_UB(out0, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        dst0 = dst2;
        dst1 = dst3;
        dst2 = dst4;
        dst3 = dst5;
        dst4 = dst6;
        dst5 = dst7;
        dst6 = dst8;
    }

    src += 8;
    dst += 8;

    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
                   vec11);
        VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
                   vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}
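
/* Remaining width-specific hv 8-tap uni entry points; every width here is
 * a multiple of 8, so they all delegate to the generic multi-column
 * routine. */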
static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 16);
}

static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 24);
}

static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 32);
}

static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 48);
}

static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 64);
}
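
/* common_hz_4t_*: 4-tap horizontal uni filters. Each variant handles one
 * block width, applies the filter with byte shuffles and dot products,
 * performs a rounding right shift by 6, saturates the 16-bit result and
 * stores unsigned 8-bit pixels (PCKEV_XORI128_UB). */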
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
    v16u8 out;
    v8i16 filt, res0;

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
    res0 = __msa_srari_h(res0, 6);
    res0 = __msa_sat_s_h(res0, 7);
    out = PCKEV_XORI128_UB(res0, res0);
    ST4x2_UB(out, dst, dst_stride);
}
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1;
    v16u8 out;

    mask0 = LD_SB(&ff_hevc_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
1996 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1997 uint8_t *dst, int32_t dst_stride,
1998 const int8_t *filter)
2000 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2002 v8i16 filt, out0, out1, out2, out3;
2004 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2007 /* rearranging filter */
2008 filt = LD_SH(filter);
2009 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2013 LD_SB4(src, src_stride, src0, src1, src2, src3);
2014 src += (4 * src_stride);
2016 XORI_B4_128_SB(src0, src1, src2, src3);
2017 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2018 filt0, filt1, out0, out1);
2019 LD_SB4(src, src_stride, src0, src1, src2, src3);
2020 XORI_B4_128_SB(src0, src1, src2, src3);
2021 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2022 filt0, filt1, out2, out3);
2023 SRARI_H4_SH(out0, out1, out2, out3, 6);
2024 SAT_SH4_SH(out0, out1, out2, out3, 7);
2025 out = PCKEV_XORI128_UB(out0, out1);
2026 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2027 dst += (4 * dst_stride);
2028 out = PCKEV_XORI128_UB(out2, out3);
2029 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2032 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2033 uint8_t *dst, int32_t dst_stride,
2034 const int8_t *filter)
2036 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2037 v16i8 filt0, filt1, mask0, mask1;
2039 v8i16 filt, out0, out1, out2, out3;
2041 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2044 /* rearranging filter */
2045 filt = LD_SH(filter);
2046 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2050 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2051 src += (8 * src_stride);
2052 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2053 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2054 filt0, filt1, out0, out1);
2055 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2056 filt0, filt1, out2, out3);
2057 SRARI_H4_SH(out0, out1, out2, out3, 6);
2058 SAT_SH4_SH(out0, out1, out2, out3, 7);
2059 out = PCKEV_XORI128_UB(out0, out1);
2060 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2061 dst += (4 * dst_stride);
2062 out = PCKEV_XORI128_UB(out2, out3);
2063 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2064 dst += (4 * dst_stride);
2066 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2067 src += (8 * src_stride);
2068 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2069 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2070 filt0, filt1, out0, out1);
2071 HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2072 filt0, filt1, out2, out3);
2073 SRARI_H4_SH(out0, out1, out2, out3, 6);
2074 SAT_SH4_SH(out0, out1, out2, out3, 7);
2075 out = PCKEV_XORI128_UB(out0, out1);
2076 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2077 dst += (4 * dst_stride);
2078 out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
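
/* Width-4 horizontal 4-tap dispatcher: picks the unrolled variant that
 * matches the block height. */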
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (4 == height) {
        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}
2097 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2098 uint8_t *dst, int32_t dst_stride,
2099 const int8_t *filter, int32_t height)
2101 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2103 v8i16 filt, out0, out1, out2, out3;
2105 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2108 /* rearranging filter */
2109 filt = LD_SH(filter);
2110 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2114 LD_SB4(src, src_stride, src0, src1, src2, src3);
2115 src += (4 * src_stride);
2117 XORI_B4_128_SB(src0, src1, src2, src3);
2118 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2119 filt1, out0, out1, out2, out3);
2120 SRARI_H4_SH(out0, out1, out2, out3, 6);
2121 SAT_SH4_SH(out0, out1, out2, out3, 7);
2122 out4 = PCKEV_XORI128_UB(out0, out1);
2123 out5 = PCKEV_XORI128_UB(out2, out3);
2124 ST6x4_UB(out4, out5, dst, dst_stride);
2125 dst += (4 * dst_stride);
2127 LD_SB4(src, src_stride, src0, src1, src2, src3);
2128 src += (4 * src_stride);
2130 XORI_B4_128_SB(src0, src1, src2, src3);
2131 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2132 filt1, out0, out1, out2, out3);
2133 SRARI_H4_SH(out0, out1, out2, out3, 6);
2134 SAT_SH4_SH(out0, out1, out2, out3, 7);
2135 out4 = PCKEV_XORI128_UB(out0, out1);
2136 out5 = PCKEV_XORI128_UB(out2, out3);
2137 ST6x4_UB(out4, out5, dst, dst_stride);
2138 dst += (4 * dst_stride);
2141 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2142 uint8_t *dst, int32_t dst_stride,
2143 const int8_t *filter, int32_t height)
2146 v16i8 src0, src1, filt0, filt1, mask0, mask1;
2148 v8i16 filt, vec0, vec1, vec2, vec3;
2150 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2153 filt = LD_SH(filter);
2154 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2158 for (loop_cnt = (height >> 1); loop_cnt--;) {
2159 LD_SB2(src, src_stride, src0, src1);
2160 src += (2 * src_stride);
2162 XORI_B2_128_SB(src0, src1);
2163 VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2164 DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2165 VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2166 DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2167 SRARI_H2_SH(vec0, vec1, 6);
2168 SAT_SH2_SH(vec0, vec1, 7);
2169 out = PCKEV_XORI128_UB(vec0, vec1);
2170 ST8x2_UB(out, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
2175 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2176 uint8_t *dst, int32_t dst_stride,
2177 const int8_t *filter, int32_t height)
2180 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2182 v8i16 filt, out0, out1, out2, out3;
2184 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2187 /* rearranging filter */
2188 filt = LD_SH(filter);
2189 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2193 for (loop_cnt = (height >> 2); loop_cnt--;) {
2194 LD_SB4(src, src_stride, src0, src1, src2, src3);
2195 src += (4 * src_stride);
2197 XORI_B4_128_SB(src0, src1, src2, src3);
2198 HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2199 filt1, out0, out1, out2, out3);
2200 SRARI_H4_SH(out0, out1, out2, out3, 6);
2201 SAT_SH4_SH(out0, out1, out2, out3, 7);
2202 tmp0 = PCKEV_XORI128_UB(out0, out1);
2203 tmp1 = PCKEV_XORI128_UB(out2, out3);
2204 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if ((2 == height) || (6 == height)) {
        common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    } else {
        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}
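
/* Width-12 horizontal 4-tap filter: the right 4 columns are filtered with
 * the paired masks from ff_hevc_mask_arr + 32 and stored at dst + 8, while
 * the left 8 columns use the regular 8-wide masks. */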
2222 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2223 uint8_t *dst, int32_t dst_stride,
2224 const int8_t *filter, int32_t height)
2227 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2228 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2231 v8i16 filt, out0, out1, out2, out3, out4, out5;
2233 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2234 mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2238 /* rearranging filter */
2239 filt = LD_SH(filter);
2240 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2245 for (loop_cnt = 4; loop_cnt--;) {
2246 LD_SB4(src, src_stride, src0, src1, src2, src3);
2247 src += (4 * src_stride);
2249 XORI_B4_128_SB(src0, src1, src2, src3);
2250 VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2251 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2252 VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2253 DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2254 SRARI_H2_SH(out0, out1, 6);
2255 SAT_SH2_SH(out0, out1, 7);
2256 tmp0 = PCKEV_XORI128_UB(out0, out1);
2257 ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
2259 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2260 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2261 DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2262 out2, out3, out4, out5);
2263 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2264 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2265 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2266 out2, out3, out4, out5);
2267 SRARI_H4_SH(out2, out3, out4, out5, 6);
2268 SAT_SH4_SH(out2, out3, out4, out5, 7);
2269 tmp0 = PCKEV_XORI128_UB(out2, out3);
2270 tmp1 = PCKEV_XORI128_UB(out4, out5);
2271 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2272 dst += (4 * dst_stride);
2276 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2277 uint8_t *dst, int32_t dst_stride,
2278 const int8_t *filter, int32_t height)
2281 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2282 v16i8 filt0, filt1, mask0, mask1;
2283 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2284 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2287 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2290 /* rearranging filter */
2291 filt = LD_SH(filter);
2292 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2296 for (loop_cnt = (height >> 2); loop_cnt--;) {
2297 LD_SB4(src, src_stride, src0, src2, src4, src6);
2298 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2299 src += (4 * src_stride);
2301 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2303 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2304 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2305 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2306 out0, out1, out2, out3);
2307 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2308 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2309 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2310 out0, out1, out2, out3);
2311 SRARI_H4_SH(out0, out1, out2, out3, 6);
2312 SAT_SH4_SH(out0, out1, out2, out3, 7);
2313 out = PCKEV_XORI128_UB(out0, out1);
2316 out = PCKEV_XORI128_UB(out2, out3);
2320 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2321 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2322 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2323 out4, out5, out6, out7);
2324 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2325 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2326 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2327 out4, out5, out6, out7);
2328 SRARI_H4_SH(out4, out5, out6, out7, 6);
2329 SAT_SH4_SH(out4, out5, out6, out7, 7);
2330 out = PCKEV_XORI128_UB(out4, out5);
2333 out = PCKEV_XORI128_UB(out6, out7);
2339 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2340 uint8_t *dst, int32_t dst_stride,
2341 const int8_t *filter, int32_t height)
2343 uint8_t *dst1 = dst + 16;
2345 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2346 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2347 v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2348 v8i16 filt, out0, out1, out2, out3;
2351 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2354 /* rearranging filter */
2355 filt = LD_SH(filter);
2356 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2360 mask11 = mask0 + 10;
2362 for (loop_cnt = 8; loop_cnt--;) {
2363 LD_SB4(src, src_stride, src0, src2, src4, src6);
2364 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2365 src += (4 * src_stride);
2367 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2368 VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2369 VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2370 VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2371 VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2372 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2373 out0, out1, out2, out3);
2374 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2375 out0, out1, out2, out3);
2376 SRARI_H4_SH(out0, out1, out2, out3, 6);
2377 SAT_SH4_SH(out0, out1, out2, out3, 7);
2378 tmp0 = PCKEV_XORI128_UB(out0, out1);
2381 tmp0 = PCKEV_XORI128_UB(out2, out3);
2385 VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2386 VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2387 VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2388 VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2389 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2390 out0, out1, out2, out3);
2391 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2392 out0, out1, out2, out3);
2393 SRARI_H4_SH(out0, out1, out2, out3, 6);
2394 SAT_SH4_SH(out0, out1, out2, out3, 7);
2395 tmp0 = PCKEV_XORI128_UB(out0, out1);
2398 tmp0 = PCKEV_XORI128_UB(out2, out3);
2403 VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2404 VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2405 VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2406 VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2408 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2409 out0, out1, out2, out3);
2410 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2411 out0, out1, out2, out3);
2413 SRARI_H4_SH(out0, out1, out2, out3, 6);
2414 SAT_SH4_SH(out0, out1, out2, out3, 7);
2415 tmp0 = PCKEV_XORI128_UB(out0, out1);
2416 tmp1 = PCKEV_XORI128_UB(out2, out3);
2417 ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2418 dst1 += (4 * dst_stride);
2422 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2423 uint8_t *dst, int32_t dst_stride,
2424 const int8_t *filter, int32_t height)
2427 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2428 v16i8 filt0, filt1, mask0, mask1;
2430 v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2431 v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2433 mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2436 /* rearranging filter */
2437 filt = LD_SH(filter);
2438 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2442 for (loop_cnt = (height >> 1); loop_cnt--;) {
2444 src1 = LD_SB(src + 8);
2445 src2 = LD_SB(src + 16);
2446 src3 = LD_SB(src + 24);
2449 src5 = LD_SB(src + 8);
2450 src6 = LD_SB(src + 16);
2451 src7 = LD_SB(src + 24);
2454 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2456 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2457 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2458 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2459 out0, out1, out2, out3);
2460 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2461 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2462 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2463 out0, out1, out2, out3);
2465 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2466 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2467 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2468 out4, out5, out6, out7);
2469 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2470 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2471 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2472 out4, out5, out6, out7);
2473 SRARI_H4_SH(out0, out1, out2, out3, 6);
2474 SRARI_H4_SH(out4, out5, out6, out7, 6);
2475 SAT_SH4_SH(out0, out1, out2, out3, 7);
2476 SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
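
/* common_vt_4t_*: 4-tap vertical uni filters. Rows are interleaved
 * byte-wise so that each dot product covers two filter taps; the results
 * are rounded (SRARI by 6), saturated and stored as unsigned 8-bit
 * pixels. */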
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10;
    v16u8 out;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
    out10 = __msa_srari_h(out10, 6);
    out10 = __msa_sat_s_h(out10, 7);
    out = PCKEV_XORI128_UB(out10, out10);
    ST4x2_UB(out, dst, dst_stride);
}
2521 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2522 uint8_t *dst, int32_t dst_stride,
2523 const int8_t *filter, int32_t height)
2526 v16i8 src0, src1, src2, src3, src4, src5;
2527 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2528 v16i8 src2110, src4332, filt0, filt1;
2529 v8i16 filt, out10, out32;
2534 filt = LD_SH(filter);
2535 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2537 LD_SB3(src, src_stride, src0, src1, src2);
2538 src += (3 * src_stride);
2540 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2542 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2543 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2545 for (loop_cnt = (height >> 2); loop_cnt--;) {
2546 LD_SB3(src, src_stride, src3, src4, src5);
2547 src += (3 * src_stride);
2548 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2549 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2550 src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2551 out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2554 src += (src_stride);
2555 ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2556 src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2557 src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2558 out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2559 SRARI_H2_SH(out10, out32, 6);
2560 SAT_SH2_SH(out10, out32, 7);
2561 out = PCKEV_XORI128_UB(out10, out32);
2562 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
                                     height);
    }
}
2579 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2580 uint8_t *dst, int32_t dst_stride,
2581 const int8_t *filter, int32_t height)
2584 v16i8 src0, src1, src2, src3, src4, src5, src6;
2585 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2586 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2590 filter_vec = LD_SH(filter);
2591 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2593 LD_SB3(src, src_stride, src0, src1, src2);
2594 src += (3 * src_stride);
2595 XORI_B3_128_SB(src0, src1, src2);
2596 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2598 LD_SB2(src, src_stride, src3, src4);
2599 src += (2 * src_stride);
2600 XORI_B2_128_SB(src3, src4);
2601 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2603 dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2604 dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2606 LD_SB2(src, src_stride, src5, src6);
2607 src += (2 * src_stride);
2608 XORI_B2_128_SB(src5, src6);
2609 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2611 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2612 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2614 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2615 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2616 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2617 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2618 ST6x4_UB(out0, out1, dst, dst_stride);
2619 dst += (4 * dst_stride);
2621 LD_SB2(src, src_stride, src3, src4);
2622 src += (2 * src_stride);
2623 XORI_B2_128_SB(src3, src4);
2624 ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2626 dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2627 dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2629 LD_SB2(src, src_stride, src5, src6);
2630 src += (2 * src_stride);
2631 XORI_B2_128_SB(src5, src6);
2632 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2634 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2635 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2637 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2638 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2639 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2640 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2641 ST6x4_UB(out0, out1, dst, dst_stride);
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
    v16u8 out;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
    tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
    tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST8x2_UB(out, dst, dst_stride);
}
2670 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2671 uint8_t *dst, int32_t dst_stride,
2672 const int8_t *filter)
2675 uint64_t out0, out1, out2;
2676 v16i8 src0, src1, src2, src3, src4, src5;
2677 v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2678 v8i16 filt, filt0, filt1;
2682 /* rearranging filter_y */
2683 filt = LD_SH(filter);
2684 SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2686 LD_SB3(src, src_stride, src0, src1, src2);
2687 src += (3 * src_stride);
2689 XORI_B3_128_SB(src0, src1, src2);
2690 ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2692 for (loop_cnt = 2; loop_cnt--;) {
2693 LD_SB3(src, src_stride, src3, src4, src5);
2694 src += (3 * src_stride);
2696 XORI_B3_128_SB(src3, src4, src5);
2697 ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2698 tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2699 tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2700 tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2701 SRARI_H2_SH(tmp0, tmp1, 6);
2702 tmp2 = __msa_srari_h(tmp2, 6);
2703 SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2704 PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2705 XORI_B2_128_SH(tmp0, tmp2);
2707 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2708 out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2709 out2 = __msa_copy_u_d((v2i64) tmp2, 0);
2723 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2724 uint8_t *dst, int32_t dst_stride,
2725 const int8_t *filter, int32_t height)
2728 v16i8 src0, src1, src2, src7, src8, src9, src10;
2729 v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2731 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2735 filt = LD_SH(filter);
2736 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2738 LD_SB3(src, src_stride, src0, src1, src2);
2739 src += (3 * src_stride);
2741 XORI_B3_128_SB(src0, src1, src2);
2742 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2744 for (loop_cnt = (height >> 2); loop_cnt--;) {
2745 LD_SB4(src, src_stride, src7, src8, src9, src10);
2746 src += (4 * src_stride);
2748 XORI_B4_128_SB(src7, src8, src9, src10);
2749 ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2750 src72_r, src87_r, src98_r, src109_r);
2751 out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2752 out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2753 out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2754 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2755 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2756 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2757 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2758 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2759 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src98_r;
        src21_r = src109_r;
        src2 = src10;
    }
}
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (6 == height) {
        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
                                 filter, height);
    }
}
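
/* Width-12 vertical 4-tap filter: the left 8 columns use the right-half
 * byte interleaves, while the right 4 columns are built from the left-half
 * interleaves packed into a single vector and stored at dst + 8. */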
2782 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2783 uint8_t *dst, int32_t dst_stride,
2784 const int8_t *filter, int32_t height)
2787 v16i8 src0, src1, src2, src3, src4, src5, src6;
2789 v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2790 v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2791 v16i8 src2110, src4332, src6554;
2792 v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
2795 src -= (1 * src_stride);
2797 filter_vec = LD_SH(filter);
2798 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2800 LD_SB3(src, src_stride, src0, src1, src2);
2801 src += (3 * src_stride);
2803 XORI_B3_128_SB(src0, src1, src2);
2804 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2805 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2806 src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2808 for (loop_cnt = 4; loop_cnt--;) {
2809 LD_SB4(src, src_stride, src3, src4, src5, src6);
2810 src += (4 * src_stride);
2812 XORI_B4_128_SB(src3, src4, src5, src6);
2813 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2814 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2815 src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2816 ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2817 ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2818 src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2820 dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2821 dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2822 dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2823 dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2824 dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2825 dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2827 SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2828 SRARI_H2_SH(dst0_l, dst1_l, 6);
2829 SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2830 SAT_SH2_SH(dst0_l, dst1_l, 7);
2831 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2832 out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2833 ST8x4_UB(out0, out1, dst, dst_stride);
2834 out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2835 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2836 dst += (4 * dst_stride);
2845 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2846 uint8_t *dst, int32_t dst_stride,
2847 const int8_t *filter, int32_t height)
2850 v16i8 src0, src1, src2, src3, src4, src5, src6;
2851 v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2852 v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2853 v16u8 tmp0, tmp1, tmp2, tmp3;
2854 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2858 filt = LD_SH(filter);
2859 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2861 LD_SB3(src, src_stride, src0, src1, src2);
2862 src += (3 * src_stride);
2864 XORI_B3_128_SB(src0, src1, src2);
2865 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2866 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2868 for (loop_cnt = (height >> 2); loop_cnt--;) {
2869 LD_SB4(src, src_stride, src3, src4, src5, src6);
2870 src += (4 * src_stride);
2872 XORI_B4_128_SB(src3, src4, src5, src6);
2873 ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2874 src32_r, src43_r, src54_r, src65_r);
2875 ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2876 src32_l, src43_l, src54_l, src65_l);
2877 out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2878 out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2879 out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2880 out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2881 out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2882 out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2883 out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2884 out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2885 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2886 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2887 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2888 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2889 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2890 out3_r, tmp0, tmp1, tmp2, tmp3);
2891 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2892 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2893 dst += (4 * dst_stride);
2903 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2904 uint8_t *dst, int32_t dst_stride,
2905 const int8_t *filter, int32_t height)
2908 uint64_t out0, out1;
2909 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2910 v16i8 src11, filt0, filt1;
2911 v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2912 v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2914 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2918 filt = LD_SH(filter);
2919 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2922 LD_SB3(src, src_stride, src0, src1, src2);
2923 XORI_B3_128_SB(src0, src1, src2);
2924 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2925 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2928 LD_SB3(src + 16, src_stride, src6, src7, src8);
2929 src += (3 * src_stride);
2930 XORI_B3_128_SB(src6, src7, src8);
2931 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2933 for (loop_cnt = 8; loop_cnt--;) {
2935 LD_SB2(src, src_stride, src3, src4);
2936 XORI_B2_128_SB(src3, src4);
2937 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2938 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2941 LD_SB2(src + 16, src_stride, src9, src10);
2942 src += (2 * src_stride);
2943 XORI_B2_128_SB(src9, src10);
2944 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2947 out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2948 out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2949 out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2950 out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2953 out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2954 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2957 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2958 SRARI_H2_SH(out0_l, out1_l, 6);
2959 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2960 SAT_SH2_SH(out0_l, out1_l, 7);
2961 out = PCKEV_XORI128_UB(out0_r, out0_l);
2963 PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2964 XORI_B2_128_SH(out2_r, out3_r);
2965 out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2966 out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2969 out = PCKEV_XORI128_UB(out1_r, out1_l);
2975 LD_SB2(src, src_stride, src5, src2);
2976 XORI_B2_128_SB(src5, src2);
2977 ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2978 ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2981 LD_SB2(src + 16, src_stride, src11, src8);
2982 src += (2 * src_stride);
2983 XORI_B2_128_SB(src11, src8);
2984 ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2987 out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2988 out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2989 out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2990 out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2993 out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2994 out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2997 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2998 SRARI_H2_SH(out0_l, out1_l, 6);
2999 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3000 SAT_SH2_SH(out0_l, out1_l, 7);
3001 out = PCKEV_XORI128_UB(out0_r, out0_l);
3003 out = PCKEV_XORI128_UB(out2_r, out2_r);
3004 ST8x1_UB(out, dst + 16);
3006 out = PCKEV_XORI128_UB(out1_r, out1_l);
3008 out = PCKEV_XORI128_UB(out3_r, out3_r);
3009 ST8x1_UB(out, dst + 16);
3014 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
3015 uint8_t *dst, int32_t dst_stride,
3016 const int8_t *filter, int32_t height)
3019 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3020 v16i8 src10_r, src32_r, src76_r, src98_r;
3021 v16i8 src21_r, src43_r, src87_r, src109_r;
3022 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3023 v16i8 src10_l, src32_l, src76_l, src98_l;
3024 v16i8 src21_l, src43_l, src87_l, src109_l;
3031 filt = LD_SH(filter);
3032 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
3035 LD_SB3(src, src_stride, src0, src1, src2);
3036 XORI_B3_128_SB(src0, src1, src2);
3038 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3039 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3042 LD_SB3(src + 16, src_stride, src6, src7, src8);
3043 src += (3 * src_stride);
3045 XORI_B3_128_SB(src6, src7, src8);
3046 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3047 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3049 for (loop_cnt = (height >> 1); loop_cnt--;) {
3051 LD_SB2(src, src_stride, src3, src4);
3052 XORI_B2_128_SB(src3, src4);
3053 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3054 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3057 out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3058 out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3059 out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3060 out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
3063 SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3064 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3065 out = PCKEV_XORI128_UB(out0_r, out0_l);
3067 out = PCKEV_XORI128_UB(out1_r, out1_l);
3068 ST_UB(out, dst + dst_stride);
3077 LD_SB2(src + 16, src_stride, src9, src10);
3078 src += (2 * src_stride);
3079 XORI_B2_128_SB(src9, src10);
3080 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3081 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3084 out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3085 out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3086 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3087 out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3090 SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3091 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3092 out = PCKEV_XORI128_UB(out2_r, out2_l);
3093 ST_UB(out, dst + 16);
3094 out = PCKEV_XORI128_UB(out3_r, out3_l);
3095 ST_UB(out, dst + 16 + dst_stride);
3097 dst += 2 * dst_stride;
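
/* hevc_hv_uni_4t_*: separable 4-tap horizontal + 4-tap vertical uni
 * filters. The horizontal pass produces 16-bit intermediates which the
 * vertical pass combines with 32-bit dot products before rounding and
 * clipping to unsigned 8-bit output. */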
3107 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3111 const int8_t *filter_x,
3112 const int8_t *filter_y,
3115 v16i8 src0, src1, src2, src3, src4;
3117 v4i32 filt_h0, filt_h1;
3118 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3120 v8i16 filter_vec, const_vec;
3121 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3122 v8i16 dst0, dst1, dst2, dst3, dst4;
3123 v4i32 dst0_r, dst1_r;
3124 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3126 src -= (src_stride + 1);
3128 filter_vec = LD_SH(filter_x);
3129 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3131 filter_vec = LD_SH(filter_y);
3132 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3133 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3135 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3139 const_vec = __msa_ldi_h(128);
3142 LD_SB3(src, src_stride, src0, src1, src2);
3143 src += (3 * src_stride);
3145 XORI_B3_128_SB(src0, src1, src2);
3147 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3148 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3149 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3152 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3154 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3156 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3158 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3159 LD_SB2(src, src_stride, src3, src4);
3160 XORI_B2_128_SB(src3, src4);
3163 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3165 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3167 dst32_r = __msa_ilvr_h(dst3, dst2);
3168 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3172 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3174 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3176 dst43_r = __msa_ilvr_h(dst4, dst3);
3177 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3180 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
3181 dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
3182 dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
3183 dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
3185 ST4x2_UB(dst0_r, dst, dst_stride);
3188 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3192 const int8_t *filter_x,
3193 const int8_t *filter_y,
3196 v16i8 src0, src1, src2, src3, src4, src5, src6;
3198 v4i32 filt_h0, filt_h1;
3199 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3201 v8i16 filter_vec, const_vec;
3202 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3203 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3204 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
3205 v8i16 out0_r, out1_r;
3206 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3208 src -= (src_stride + 1);
3210 filter_vec = LD_SH(filter_x);
3211 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3213 filter_vec = LD_SH(filter_y);
3214 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3215 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3217 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3221 const_vec = __msa_ldi_h(128);
3224 LD_SB3(src, src_stride, src0, src1, src2);
3225 src += (3 * src_stride);
3227 XORI_B3_128_SB(src0, src1, src2);
3229 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3230 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3231 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3234 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3236 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3238 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3240 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3241 LD_SB4(src, src_stride, src3, src4, src5, src6);
3242 XORI_B4_128_SB(src3, src4, src5, src6);
3245 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3247 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3249 dst32_r = __msa_ilvr_h(dst3, dst2);
3250 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3254 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3256 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3258 dst43_r = __msa_ilvr_h(dst4, dst3);
3259 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3263 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3265 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3267 dst10_r = __msa_ilvr_h(dst5, dst4);
3268 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3272 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3274 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3276 dst21_r = __msa_ilvr_h(dst2, dst5);
3277 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3280 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
3281 SRARI_H2_SH(out0_r, out1_r, 6);
3282 CLIP_SH2_0_255(out0_r, out1_r);
3283 out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);
    ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
}
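
/* Width-4 hv 4-tap filter for heights that are a multiple of 8: keeps a
 * running history of horizontal results and produces 8 output rows per
 * loop iteration. */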
3288 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3292 const int8_t *filter_x,
3293 const int8_t *filter_y,
3297 v16i8 src0, src1, src2, src3, src4, src5;
3298 v16i8 src6, src7, src8, src9, src10;
3300 v4i32 filt_h0, filt_h1;
3301 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3303 v8i16 filter_vec, const_vec;
3304 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3305 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3306 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3307 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3308 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3309 v8i16 out0_r, out1_r, out2_r, out3_r;
3311 src -= (src_stride + 1);
3313 filter_vec = LD_SH(filter_x);
3314 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3316 filter_vec = LD_SH(filter_y);
3317 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3318 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3320 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3324 const_vec = __msa_ldi_h(128);
3327 LD_SB3(src, src_stride, src0, src1, src2);
3328 src += (3 * src_stride);
3330 XORI_B3_128_SB(src0, src1, src2);
3332 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3333 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3334 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3337 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3339 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3341 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3343 ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3345 for (loop_cnt = height >> 3; loop_cnt--;) {
3346 LD_SB8(src, src_stride,
3347 src3, src4, src5, src6, src7, src8, src9, src10);
3348 src += (8 * src_stride);
3350 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3353 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3355 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3357 dst32_r = __msa_ilvr_h(dst3, dst2);
3358 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3362 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3364 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3366 dst43_r = __msa_ilvr_h(dst4, dst3);
3367 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3371 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3373 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3375 dst54_r = __msa_ilvr_h(dst5, dst4);
3376 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3380 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3382 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3384 dst65_r = __msa_ilvr_h(dst6, dst5);
3385 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3389 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3391 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3393 dst76_r = __msa_ilvr_h(dst7, dst6);
3394 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3398 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3400 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3402 dst87_r = __msa_ilvr_h(dst8, dst7);
3403 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3407 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3409 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
3411 dst10_r = __msa_ilvr_h(dst9, dst8);
3412 dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
3416 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3418 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3420 dst21_r = __msa_ilvr_h(dst2, dst9);
3421 dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
3424 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3425 dst5_r, dst4_r, dst7_r, dst6_r,
3426 out0_r, out1_r, out2_r, out3_r);
3428 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3429 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3431 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3432 ST4x8_UB(out0_r, out1_r, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (4 == height) {
        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (0 == (height % 8)) {
        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height);
    }
}
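
/* Width-6 hv 4-tap filter: a full 8-column block is filtered and only the
 * leftmost 6 pixels of each row are stored (ST6x4_UB). */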
3457 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3461 const int8_t *filter_x,
3462 const int8_t *filter_y,
3466 v16i8 src0, src1, src2, src3, src4, src5, src6;
3468 v4i32 filt_h0, filt_h1;
3469 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3471 v8i16 filter_vec, const_vec;
3472 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3473 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3474 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3475 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3476 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3477 v8i16 out0_r, out1_r, out2_r, out3_r;
3479 src -= (src_stride + 1);
3481 filter_vec = LD_SH(filter_x);
3482 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3484 filter_vec = LD_SH(filter_y);
3485 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3486 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3488 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3492 const_vec = __msa_ldi_h(128);
3495 LD_SB3(src, src_stride, src0, src1, src2);
3496 src += (3 * src_stride);
3498 XORI_B3_128_SB(src0, src1, src2);
3500 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3501 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3502 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3505 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3507 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3509 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3511 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3512 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3514 for (loop_cnt = height >> 2; loop_cnt--;) {
3515 LD_SB4(src, src_stride, src3, src4, src5, src6);
3516 src += (4 * src_stride);
3518 XORI_B4_128_SB(src3, src4, src5, src6);
3521 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3523 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3525 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3526 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3527 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3532 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3534 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3536 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3537 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3538 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3543 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3545 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3547 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3548 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3549 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3555 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3557 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3559 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3560 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3561 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
3566 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3567 dst2_l, dst2_r, dst3_l, dst3_r,
3568 out0_r, out1_r, out2_r, out3_r);
3570 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3571 CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
3573 PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
3574 ST6x4_UB(out0_r, out1_r, dst, dst_stride);
3575 dst += (4 * dst_stride);
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    /* sign-extend the vertical taps to 16 bits and splat them as 32-bit pairs */
    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    /* 128 << 6 restores the bias removed by XORI_B*_128 (filter taps sum to 64) */
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    CLIP_SH2_0_255(out0_r, out1_r);
    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);

    ST8x2_UB(out0_r, dst, dst_stride);
}
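
/* 4-tap hv uni-prediction of a single 8x6 block: same horizontal/vertical
 * pipeline as the 8x2 case above, fully unrolled over six output rows. */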
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    XORI_B2_128_SB(src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    XORI_B2_128_SB(src5, src6);

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst2_r >>= 6;
    dst2_l >>= 6;

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst3_r >>= 6;
    dst3_l >>= 6;

    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);

    XORI_B2_128_SB(src7, src8);

    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    dst7 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst4_r >>= 6;
    dst4_l >>= 6;

    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
    dst8 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);

    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    dst5_r >>= 6;
    dst5_l >>= 6;

    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
    SRARI_H2_SH(out4_r, out5_r, 6);
    CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
    CLIP_SH2_0_255(out4_r, out5_r);

    PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
    out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);

    ST8x4_UB(out0_r, out1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(out2_r, dst, dst_stride);
}
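
/* 4-tap hv uni-prediction for widths that are a multiple of 8: the outer loop
 * walks the block in 8-pixel wide columns, the inner loop emits four output
 * rows per iteration (height is expected to be a multiple of 4). */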
static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height,
                                       int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    /* process the block in 8-pixel wide columns */
    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            XORI_B4_128_SB(src3, src4, src5, src6);

            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            dst3 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            dst4 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst1_r >>= 6;
            dst1_l >>= 6;

            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            dst5 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
            dst2_r >>= 6;
            dst2_l >>= 6;

            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            dst2 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
            dst3_r >>= 6;
            dst3_l >>= 6;

            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

            PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
            ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }

        src += 8;
        dst += 8;
    }
}
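
/* Width-8 dispatcher: specialised 8x2 and 8x6 paths, generic path for any
 * height that is a multiple of 4. */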
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (0 == (height % 4)) {
        hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
    }
}
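
/* Width 12: one 8-pixel column through the generic path plus one 4-pixel
 * column through the 4w helper. */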
static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 8);
    hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
                          filter_x, filter_y, height);
}
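
/* Width 16: two 8-pixel columns. */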
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);
}
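
/* Width 24: three 8-pixel columns. */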
static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);
}
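
/* Width 32: four 8-pixel columns. */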
static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);
}
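
/* Glue macros below generate the ff_hevc_put_hevc_uni_* entry points expected
 * by hevcdsp and forward them to the width-specialised MSA helpers above. */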
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);

#undef UNI_MC_COPY
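
/* Uni-prediction with horizontal-only (hz/mx) or vertical-only (vt/my)
 * filtering; FILT_DIR selects which fractional offset indexes the filter
 * table. */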
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       uint8_t *src,          \
                                                       ptrdiff_t src_stride,  \
                                                       int height,            \
                                                       intptr_t mx,           \
                                                       intptr_t my,           \
                                                       int width)             \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
                                                                              \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                            filter, height);                  \
}

UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC
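
/* Uni-prediction with both horizontal and vertical filtering; mx and my each
 * index the epel (4-tap) or qpel (8-tap) filter table. */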
#define UNI_MC_HV(PEL, WIDTH, TAP)                                        \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,         \
                                                    ptrdiff_t dst_stride, \
                                                    uint8_t *src,         \
                                                    ptrdiff_t src_stride, \
                                                    int height,           \
                                                    intptr_t mx,          \
                                                    intptr_t my,          \
                                                    int width)            \
{                                                                         \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];             \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];             \
                                                                          \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                        filter_x, filter_y, height);      \
}

UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);