/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"
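
/* Shuffle control masks for VSHF.B. The first 16 bytes build the sliding
 * 8-tap windows used by the 8-wide (and wider) horizontal filters; the two
 * rows below them serve the 4-wide path, where index values of 16 and up
 * select bytes from the second source vector, so two picture rows can be
 * filtered from a single register pair. */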
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
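
/* The HORIZ_*TAP_*WID_4VECS_FILT macros below evaluate the horizontal FIR
 * filters with signed-byte dot products: each VSHF_B gathers the input
 * bytes for one pair of taps, DOTP_SB* starts the 16-bit accumulators and
 * DPADD_SB* folds in the remaining tap pairs. */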

#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                \
}
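
/* 8-wide variant of the macro above: each source vector is shuffled
 * against itself, so the four input rows produce four full v8i16 result
 * vectors instead of two. */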

#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, mask2, mask3,               \
                                   filt0, filt1, filt2, filt3,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);        \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3, \
                 out0, out1, out2, out3);                                    \
}
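
/* 4-tap counterparts of the 8-tap macros above: the same shuffle and
 * dot-product scheme with only two tap-pair accumulation steps. */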

#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}

#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                   \
                                   mask0, mask1, filt0, filt1,               \
                                   out0, out1, out2, out3)                   \
{                                                                            \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                    \
                                                                             \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);        \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,  \
                out0, out1, out2, out3);                                     \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);        \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);        \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1, \
                 out0, out1, out2, out3);                                    \
}
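
/* Plain block copies for the zero-offset (copy) MC cases. Width 8 moves
 * data through 64-bit general-purpose loads and stores; wider blocks use
 * 16-byte MSA vectors, with a 64-bit tail covering the extra 8 columns of
 * the 24-wide case. */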

static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}

static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}

static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}

static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};

#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
( {                                                             \
    v8i16 tmp0;                                                 \
                                                                \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
                                                                \
    tmp0;                                                       \
} )
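
/* Horizontal 8-tap (qpel) filters. Source pixels are XORed with 128 so the
 * unsigned bytes fit the signed dot-product instructions; the bias is
 * undone by PCKEV_XORI128_UB when packing back to bytes. SRARI_H* performs
 * HEVC's (x + 32) >> 6 rounding and SAT_SH* clamps to the signed 8-bit
 * range before the pack. */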

static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}

static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}

static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);

    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}

static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST8x2_UB(out, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}

static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}

static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}

static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);
        dst += dst_stride;
    }
}
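
/* Vertical 8-tap (qpel) filters. ILVR_B/ILVL_B interleave bytes of
 * vertically adjacent rows so a single signed dot product consumes two
 * taps at once; after each batch of output rows the interleaved history
 * slides down and only the newly needed rows are loaded. */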

static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v8i16 filt, out10, out32, out54, out76;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);
        LD_SB4(src, src_stride, src11, src12, src13, src14);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
                   src1110_r, src1211_r, src1312_r, src1413_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
                   src12111110, src14131312);
        XORI_B2_128_SB(src8776, src10998);
        XORI_B2_128_SB(src12111110, src14131312);

        DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
        DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
        DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
        DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
        DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
        DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
        DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
        DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
        SRARI_H2_SH(out10, out32, 6);
        SRARI_H2_SH(out54, out76, 6);
        SAT_SH2_SH(out10, out32, 7);
        SAT_SH2_SH(out54, out76, 7);
        out0 = PCKEV_XORI128_UB(out10, out32);
        out1 = PCKEV_XORI128_UB(out54, out76);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;
        src6 = src14;
    }
}
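
/* In the 4-wide vertical path above, ILVR_D* further packs the interleaved
 * columns of two consecutive row pairs into one 16-byte register (e.g.
 * src2110 holds the row pairs 1/0 and 2/1), so each dot product covers two
 * output rows at once. */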

static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
                    filt0, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
                     filt1, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
                     filt2, out0_r, out1_r, out2_r, out3_r);
        DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
                     filt3, out0_r, out1_r, out2_r, out3_r);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}

static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = 4; loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp1, 0);
        out2 = __msa_copy_u_w((v4i32) tmp0, 2);
        out3 = __msa_copy_u_w((v4i32) tmp1, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        out0 = __msa_copy_u_d((v2i64) tmp2, 0);
        out1 = __msa_copy_u_d((v2i64) tmp3, 0);
        out2 = __msa_copy_u_w((v4i32) tmp2, 2);
        out3 = __msa_copy_u_w((v4i32) tmp3, 2);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
                                   filt1, filt2, filt3);
        out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
                                   filt1, filt2, filt3);
        out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
                                   filt1, filt2, filt3);
        out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
                                   filt1, filt2, filt3);
        out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
                                   filt1, filt2, filt3);
        out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
                                   filt1, filt2, filt3);
        out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
                                   filt1, filt2, filt3);
        out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
                                   filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}

static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
                                       filt0, filt1, filt2, filt3);
            out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
                                       filt0, filt1, filt2, filt3);
            out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
                                       filt0, filt1, filt2, filt3);
            out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
                                       filt0, filt1, filt2, filt3);
            out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
                                       filt0, filt1, filt2, filt3);
            out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
                                       filt0, filt1, filt2, filt3);
            out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
                                       filt0, filt1, filt2, filt3);
            out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
                                       filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}

static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}

static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}

static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}

static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}
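
/* 2-D (horizontal, then vertical) 8-tap filters. The horizontal pass keeps
 * 16-bit intermediates; the vertical pass accumulates in 32 bits, drops the
 * intermediate precision with a plain >> 6, then applies the final rounding
 * shift before saturating and packing to bytes. */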

static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}

static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16u8 out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = LD_SB(ff_hevc_mask_arr);

    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);
        dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                 filt2, filt3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);
        dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                 filt3);
        dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                                 filt3);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src_tmp += 2 * src_stride;

            ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_r, dst32_r, dst54_r, dst21_r);
            ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                       dst10_l, dst32_l, dst54_l, dst21_l);
            ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
            ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
                                     filt2, filt3);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;
            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

            PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
            out = PCKEV_XORI128_UB(dst0, dst1);
            ST8x2_UB(out, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            dst0 = dst2;
            dst1 = dst3;
            dst2 = dst4;
            dst3 = dst5;
            dst4 = dst6;
            dst5 = dst7;
            dst6 = dst8;
        }

        src += 8;
        dst += 8;
    }
}

static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
}
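
/* The 12-wide case first handles the left 8 columns with the same scheme as
 * the loop above, then filters the remaining 4 columns using the
 * two-rows-per-vector shuffle masks (ff_hevc_mask_arr + 16). */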
static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    uint32_t loop_cnt;
    uint8_t *src_tmp, *dst_tmp;
    v16u8 out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, src12, src13, src14;
    v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
    v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
    v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
    v8i16 dst1413_r, dst87_l, filter_vec;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v4i32 dst0_l, dst1_l;

    src -= ((3 * src_stride) + 3);

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    UNPCK_R_SB_SH(filter_vec, filter_vec);

    SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask0 = LD_SB(ff_hevc_mask_arr);
    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    src_tmp = src;
    dst_tmp = dst;

    LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src_tmp += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    /* row 0 row 1 row 2 row 3 */
    VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
               vec15);
    dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);
    dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                             filt2, filt3);

    VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
               vec11);
    dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                             filt3);
    dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                             filt3);
    dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                             filt3);

    for (loop_cnt = 8; loop_cnt--;) {
        LD_SB2(src_tmp, src_stride, src7, src8);
        XORI_B2_128_SB(src7, src8);
        src_tmp += 2 * src_stride;

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
                   dst32_r, dst54_r, dst21_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
                   dst32_l, dst54_l, dst21_l);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst0_r >>= 6;
        dst0_l >>= 6;

        VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
                   vec3);
        dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                 filt3);

        ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst1_r >>= 6;
        dst1_l >>= 6;

        SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
        SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);

        PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
        out0 = PCKEV_XORI128_UB(dst0, dst1);
        ST8x2_UB(out0, dst_tmp, dst_stride);
        dst_tmp += (2 * dst_stride);

        dst0 = dst2;
        dst1 = dst3;
        dst2 = dst4;
        dst3 = dst5;
        dst4 = dst6;
        dst5 = dst7;
        dst6 = dst8;
    }
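    /* Note (added): the 8 leftmost columns of the 12-wide block are finished
     * above; the remaining 4 columns are filtered below.  mask4..mask7 come
     * from the second row of ff_hevc_mask_arr, whose indices 16..20 select
     * bytes from a second source register, so each VSHF_B4_SB gathers taps
     * for two rows at once and every vector lane stays occupied. */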
    src += 8;
    dst += 8;

    mask4 = LD_SB(ff_hevc_mask_arr + 16);
    mask5 = mask4 + 2;
    mask6 = mask4 + 4;
    mask7 = mask4 + 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
               vec11);
    VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
               vec15);

    dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                              filt3);
    dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                              filt3);
    dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
                              filt3);
    dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
                              filt3);

    ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
    ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
    ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);

    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
               src14);
        src += (8 * src_stride);
        XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);

        VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
                   vec3);
        VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
                   vec7);
        VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
                   vec11);
        VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
                   vec14, vec15);

        dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
                                   filt3);
        dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
                                   filt3);
        dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
                                   filt2, filt3);
        dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
                                    filt2, filt3);

        dst76_r = __msa_ilvr_h(dst117, dst66);
        ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
        ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
        ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
        dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
        dst1110_r = __msa_ilvr_h(dst117, dst1410);

        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
                                filt_h1, filt_h2, filt_h3);
        dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
        SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
        SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
        SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
        PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
        PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
        out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
        out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
        ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        dst10_r = dst98_r;
        dst32_r = dst1110_r;
        dst54_r = dst1312_r;
        dst21_r = dst109_r;
        dst43_r = dst1211_r;
        dst65_r = dst1413_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
    }
}
static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 16);
}

static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 24);
}

static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 32);
}

static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 48);
}

static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 64);
}
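/* Note (added): every uni hv width that is a multiple of 8 shares
 * hevc_hv_uni_8t_8multx2mult_msa; the width argument only changes how many
 * 8-column stripes (width >> 3) the kernel walks, two output rows per inner
 * iteration.  Only width 12 needs the dedicated 8-plus-4 routine above. */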
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
    v16u8 out;
    v8i16 filt, res0;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
    res0 = __msa_srari_h(res0, 6);
    res0 = __msa_sat_s_h(res0, 7);
    out = PCKEV_XORI128_UB(res0, res0);
    ST4x2_UB(out, dst, dst_stride);
}
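/* Note (added): scalar equivalent of the common_hz_4t_* kernels for one
 * pixel (illustrative names only):
 *
 *     int32_t t = 0;
 *     for (int j = 0; j < 4; j++)
 *         t += filter[j] * src[x + j - 1];
 *     dst[x] = clip_0_255((t + 32) >> 6);
 *
 * SRARI_H(.., 6) is the rounding shift, SAT_SH(.., 7) saturates to the
 * signed 8-bit range, and PCKEV_XORI128_UB packs the halfwords and removes
 * the XORI-128 input bias in one step.
 */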
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1;
    v16u8 out;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
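/* Note (added): in the 4-wide kernels each VSHF_B2_SB draws from a pair of
 * source rows (src0/src1, src2/src3), so one 16-byte vector carries the
 * taps of two output rows; a 16-row column is simply two unrolled 8-row
 * batches of the same four-vector pattern. */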
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (4 == height) {
        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}
static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out4, out5;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out4 = PCKEV_XORI128_UB(out0, out1);
        out5 = PCKEV_XORI128_UB(out2, out3);
        ST6x4_UB(out4, out5, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
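/* Note (added): width 6 reuses the 8-wide filter math; ST6x4_UB then stores
 * only the leftmost 6 bytes of each of the 4 rows (a word plus a halfword),
 * leaving the two padding lanes unwritten. */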
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, vec0, vec1, vec2, vec3;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    src -= 1;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        XORI_B2_128_SB(src0, src1);
        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
        SRARI_H2_SH(vec0, vec1, 6);
        SAT_SH2_SH(vec0, vec1, 7);
        out = PCKEV_XORI128_UB(vec0, vec1);
        ST8x2_UB(out, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if ((2 == height) || (6 == height)) {
        common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    } else {
        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}
static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    mask2 = LD_SB(&mc_filt_mask_arr[32]);

    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out2, out3, out4, out5);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
                     out2, out3, out4, out5);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        tmp1 = PCKEV_XORI128_UB(out4, out5);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
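/* Note (added): the 12-wide kernel splits each row into an 8-wide part
 * (mask0/mask1, one row per vector) and a 4-wide part (mask2/mask3, loaded
 * from mc_filt_mask_arr[32], which by the same layout as ff_hevc_mask_arr
 * gathers the rightmost 4 columns of two consecutive rows into a single
 * vector), so no lane is wasted on the odd tail. */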
static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 out;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
                                   filt1, out4, out5, out6, out7);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint8_t *dst1 = dst + 16;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
    v8i16 filt, out0, out1, out2, out3;
    v16u8 tmp0, tmp1;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask00 = mask0 + 8;
    mask11 = mask0 + 10;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        /* 8 width */
        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);

        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
        dst1 += (4 * dst_stride);
    }
}
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;

    mask0 = LD_SB(&mc_filt_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        src4 = LD_SB(src);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   filt0, filt1, out0, out1, out2, out3);
        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                                   filt0, filt1, out4, out5, out6, out7);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332, filt0, filt1;
    v16u8 out;
    v8i16 filt, out10;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
    out10 = __msa_srari_h(out10, 6);
    out10 = __msa_sat_s_h(out10, 7);
    out = PCKEV_XORI128_UB(out10, out10);
    ST4x2_UB(out, dst, dst_stride);
}
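/* Note (added): scalar equivalent of the common_vt_4t_* kernels for one
 * pixel (illustrative names only):
 *
 *     int32_t t = 0;
 *     for (int k = 0; k < 4; k++)
 *         t += filter[k] * src[(y + k - 1) * src_stride + x];
 *     dst[y * dst_stride + x] = clip_0_255((t + 32) >> 6);
 *
 * The ILVR_B interleaves build byte pairs of vertically adjacent rows so
 * that each DOTP/DPADD consumes two taps at a time.
 */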
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride,
                                         const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;
    v16u8 out;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);

        src2 = LD_SB(src);
        src += (src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
        SRARI_H2_SH(out10, out32, 6);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
                                     height);
    }
}
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
    v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, filt0, filt1;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_UB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
    vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
    vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src3, src0, src1, src2);
        src += (4 * src_stride);

        vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
        ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
        tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);

        vec0 = __msa_xori_b((v16u8) src0, 128);
        ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
        tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);

        vec1 = __msa_xori_b((v16u8) src1, 128);
        vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);

        vec2 = __msa_xori_b((v16u8) src2, 128);
        vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
        tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);

        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
    v16u8 out;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
    tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1);
    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
    tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1);
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST8x2_UB(out, dst, dst_stride);
}
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    uint32_t loop_cnt;
    uint64_t out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);

        XORI_B3_128_SB(src3, src4, src5);
        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
        SRARI_H2_SH(tmp0, tmp1, 6);
        tmp2 = __msa_srari_h(tmp2, 6);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
        XORI_B2_128_SH(tmp0, tmp2);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
        dst += dst_stride;
        SD(out2, dst);
        dst += dst_stride;

        src2 = src5;
        vec0 = vec3;
        vec2 = vec4;
    }
}
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src7, src8, src9, src10;
    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
                   src72_r, src87_r, src98_r, src109_r);
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src98_r;
        src21_r = src109_r;
        src2 = src10;
    }
}
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (6 == height) {
        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
                                 filter, height);
    }
}
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16u8 out0, out1;
    v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
    v4u32 mask = { 2, 6, 2, 6 };

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    src -= src_stride;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
        VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
        VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
        tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
        ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
                   src21, src43, src54, src65);
        tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
        ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
        tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
        tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SRARI_H2_SH(tmp4, tmp5, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH2_SH(tmp4, tmp5, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST8x4_UB(out0, out1, dst, dst_stride);
        out0 = PCKEV_XORI128_UB(tmp4, tmp5);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
        src1 = src5;
        src2 = src6;
        vec0 = vec4;
        vec1 = vec5;
    }
}
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src2 = src6;
    }
}
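/* Note (added): for 16 columns the rows are interleaved twice, ILVR_B for
 * the low 8 bytes and ILVL_B for the high 8, giving the _r/_l variable
 * pairs; both halves run the same two-tap DOTP/DPADD sequence and are
 * re-joined by PCKEV_B4_UB before the store. */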
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, filt0, filt1;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
    v16u8 out;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* 8 width */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16 width */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        /* 16 width */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);

        /* 8 width */
        out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
        XORI_B2_128_SH(out2_r, out3_r);
        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
        SD(out0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        SD(out1, dst + 16);
        dst += dst_stride;

        /* 16 width */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        /* 16 width */
        out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);

        /* 8 width */
        out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2_r, out2_r);
        ST8x1_UB(out, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out3_r, out3_r);
        ST8x1_UB(out, dst + 16);
        dst += dst_stride;
    }
}
static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *dst_tmp, *src_tmp;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt;
    v16i8 filt0, filt1;
    v16u8 out;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    for (cnt = (width >> 5); cnt--;) {
        dst_tmp = dst;
        src_tmp = src;

        /* first 16 columns */
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        XORI_B3_128_SB(src0, src1, src2);

        ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
        ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

        /* next 16 columns */
        LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src6, src7, src8);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            /* first 16 columns */
            LD_SB2(src_tmp, src_stride, src3, src4);
            XORI_B2_128_SB(src3, src4);
            ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
            ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

            out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
            out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
            out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
            out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);

            SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
            out = PCKEV_XORI128_UB(out0_r, out0_l);
            ST_UB(out, dst_tmp);
            out = PCKEV_XORI128_UB(out1_r, out1_l);
            ST_UB(out, dst_tmp + dst_stride);

            src10_r = src32_r;
            src21_r = src43_r;
            src10_l = src32_l;
            src21_l = src43_l;
            src2 = src4;

            /* next 16 columns */
            LD_SB2(src_tmp + 16, src_stride, src9, src10);
            src_tmp += (2 * src_stride);
            XORI_B2_128_SB(src9, src10);
            ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
            ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

            out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
            out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
            out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
            out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);

            SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
            SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
            out = PCKEV_XORI128_UB(out2_r, out2_l);
            ST_UB(out, dst_tmp + 16);
            out = PCKEV_XORI128_UB(out3_r, out3_l);
            ST_UB(out, dst_tmp + 16 + dst_stride);

            dst_tmp += 2 * dst_stride;

            src76_r = src98_r;
            src87_r = src109_r;
            src76_l = src98_l;
            src87_l = src109_l;
            src8 = src10;
        }

        src += 32;
        dst += 32;
    }
}
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
                              filter, height, 32);
}
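/* Note (added): common_vt_4t_32w_mult_msa handles each 32-column stripe as
 * two 16-column halves (the second at src + 16), each with its own _r/_l
 * interleave state, so the same routine could serve any width that is a
 * multiple of 32 via its width argument. */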
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);

    ST4x2_UB(dst0_r, dst, dst_stride);
}
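/* Note (added): the hv_uni_4t kernels seed each horizontal accumulator with
 * const_vec = 128 << 6.  The input bytes were XORed with 128 (made signed),
 * and the 4 HEVC chroma taps sum to 64, so the horizontal pass would be
 * offset by -128 * 64 = -(128 << 6); pre-loading const_vec cancels that
 * exactly, which is why the final path can use CLIP_SH_0_255 directly with
 * no XORI at store time. */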
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 out0_r, out1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    XORI_B4_128_SB(src3, src4, src5, src6);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    dst10_r = __msa_ilvr_h(dst5, dst4);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
    dst2_r >>= 6;

    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

    dst21_r = __msa_ilvr_h(dst2, dst5);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    dst3_r >>= 6;

    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    CLIP_SH2_0_255(out0_r, out1_r);
    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);

    ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
}
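/* Note (added): in the 4x4 variant the names dst10_r/dst21_r are recycled
 * for the row5/row4 and row6/row5 interleaves once their original contents
 * have been consumed, keeping the whole 4-tap history window inside the
 * same handful of registers. */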
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        dst32_r = __msa_ilvr_h(dst3, dst2);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_r >>= 6;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

        dst43_r = __msa_ilvr_h(dst4, dst3);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_r >>= 6;

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        dst54_r = __msa_ilvr_h(dst5, dst4);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_r >>= 6;

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

        dst65_r = __msa_ilvr_h(dst6, dst5);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_r >>= 6;

        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        dst76_r = __msa_ilvr_h(dst7, dst6);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst4_r >>= 6;

        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        dst8 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);

        dst87_r = __msa_ilvr_h(dst8, dst7);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst5_r >>= 6;

        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        dst9 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);

        dst10_r = __msa_ilvr_h(dst9, dst8);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
        dst6_r >>= 6;

        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        dst21_r = __msa_ilvr_h(dst2, dst9);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
        dst7_r >>= 6;

        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    out0_r, out1_r, out2_r, out3_r);

        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
        ST4x8_UB(out0_r, out1_r, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (4 == height) {
        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (0 == (height % 8)) {
        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height);
    }
}
static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);

        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst0_r >>= 6;
        dst0_l >>= 6;

        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst1_r >>= 6;
        dst1_l >>= 6;

        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
        dst2_r >>= 6;
        dst2_l >>= 6;

        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
        dst3_r >>= 6;
        dst3_l >>= 6;

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    out0_r, out1_r, out2_r, out3_r);

        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
        ST6x4_UB(out0_r, out1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    CLIP_SH2_0_255(out0_r, out1_r);
    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);

    ST8x2_UB(out0_r, dst, dst_stride);
}
3629 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3633 const int8_t *filter_x,
3634 const int8_t *filter_y,
3637 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3639 v4i32 filt_h0, filt_h1;
3640 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3642 v8i16 filter_vec, const_vec;
3643 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3644 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3645 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3646 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3647 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3648 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3649 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3650 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3651 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
3653 src -= (src_stride + 1);
3655 filter_vec = LD_SH(filter_x);
3656 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3658 filter_vec = LD_SH(filter_y);
3659 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3660 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3662 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3666 const_vec = __msa_ldi_h(128);
    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    XORI_B2_128_SB(src3, src4);

    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;
    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    XORI_B2_128_SB(src5, src6);

    /* row 5 */
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst2_r >>= 6;
    dst2_l >>= 6;

    /* row 6 */
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst3_r >>= 6;
    dst3_l >>= 6;
    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);

    XORI_B2_128_SB(src7, src8);

    /* row 7 */
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    dst7 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst4_r >>= 6;
    dst4_l >>= 6;

    /* row 8 */
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
    dst8 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);

    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    dst5_r >>= 6;
    dst5_l >>= 6;
    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
    SRARI_H2_SH(out4_r, out5_r, 6);
    CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
    CLIP_SH2_0_255(out4_r, out5_r);

    PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
    out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);

    ST8x4_UB(out0_r, out1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(out2_r, dst, dst_stride);
}
static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height,
                                       int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r, out2_r, out3_r;
    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;
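
    /* the block is filtered in 8-pixel-wide stripes; each stripe restarts
       from src/dst with its own src_tmp/dst_tmp pointers and keeps its own
       three-row vertical history */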
    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            XORI_B4_128_SB(src3, src4, src5, src6);

            /* row 3 */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            dst3 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst0_r >>= 6;
            dst0_l >>= 6;

            /* row 4 */
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            dst4 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst1_r >>= 6;
            dst1_l >>= 6;
            /* row 5 */
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            dst5 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
            dst2_r >>= 6;
            dst2_l >>= 6;

            /* row 6 */
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            dst2 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
            dst3_r >>= 6;
            dst3_l >>= 6;
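
            /* register reuse: dst10_* now hold rows 5|4, dst21_* rows 6|5,
               and dst2 the row-6 horizontal result, which is exactly the
               loop-carried state the next iteration consumes */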
            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

            PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
            ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }

        src += 8;
        dst += 8;
    }
}
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (0 == (height % 4)) {
        hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
    }
}
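
/* 12-wide blocks are split below: the left 8 columns go through the
   generic 8-column loop, the remaining 4 through the 4-wide path */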
static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 8);

    hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
                          filter_x, filter_y, height);
}
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);
}

static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);
}

static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);
}
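
/* thin wrappers with the ff_hevc_put_hevc_uni_* signature used by the HEVC
   DSP function-pointer tables; mx and my are the subpel phases, hence the
   [mx - 1] / [my - 1] filter indexing in the macros below */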
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);

#undef UNI_MC_COPY
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride,  \
                                                       uint8_t *src,          \
                                                       ptrdiff_t src_stride,  \
                                                       int height,            \
                                                       intptr_t mx,           \
                                                       intptr_t my,           \
                                                       int width)             \
{                                                                             \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];             \
                                                                              \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride, \
                                            filter, height);                  \
}
UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC
#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
                                                                           \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                        filter_x, filter_y, height);       \
}
UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);

#undef UNI_MC_HV