/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"
/* Shuffle-control byte indices for MSA VSHF.B.
 * Row 0: 8-wide filtering, adjacent byte pairs from one source register.
 * Rows 1-2: 4-wide filtering across two concatenated source registers
 * (indices >= 16 select bytes from the second register).
 * 64-byte alignment for efficient vector loads. */
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
/* 8-tap horizontal filter for two pairs of 4-wide rows: shuffles byte pairs
 * per mask, then accumulates four dot products into out0/out1 (v8i16). */
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                \
}
/* 8-tap horizontal filter for four 8-wide rows: per-register shuffles, then
 * dot products accumulated into out0..out3 (v8i16).
 * Note the filt2/filt1 application order is intentional to reuse vec0..3. */
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1, out2, out3)                 \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
                                                                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);      \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0,       \
                filt0, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);      \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2,      \
                 filt2, out0, out1, out2, out3);                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);      \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1,      \
                 filt1, out0, out1, out2, out3);                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3,      \
                 filt3, out0, out1, out2, out3);                           \
}
/* 4-tap horizontal filter for two pairs of 4-wide rows: two shuffle +
 * dot-product stages accumulated into out0/out1 (v8i16). */
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}
/* 4-tap horizontal filter for four 8-wide rows: two shuffle + dot-product
 * stages accumulated into out0..out3 (v8i16). */
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, filt0, filt1,             \
                                   out0, out1, out2, out3)                 \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                  \
                                                                           \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);      \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0,       \
                filt0, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);      \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1,      \
                 filt1, out0, out1, out2, out3);                           \
}
/* Copy an 8-byte-wide block of 'height' rows using 64-bit scalar
 * loads/stores. Supported heights: 2, 6, any multiple of 8, any
 * multiple of 4 (checked in that order). Other heights copy nothing. */
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        /* 4 rows in one batch, then the remaining 2 */
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        /* 8 rows per iteration */
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        /* 4 rows per iteration */
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
146 static void copy_width12_msa(uint8_t *src, int32_t src_stride,
147 uint8_t *dst, int32_t dst_stride,
150 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
152 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
153 src += (8 * src_stride);
154 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
155 dst += (8 * dst_stride);
156 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
157 ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
160 static void copy_width16_msa(uint8_t *src, int32_t src_stride,
161 uint8_t *dst, int32_t dst_stride,
165 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
168 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
169 src += (8 * src_stride);
170 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
171 dst += (8 * dst_stride);
172 LD_UB4(src, src_stride, src0, src1, src2, src3);
173 src += (4 * src_stride);
174 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
175 dst += (4 * dst_stride);
176 } else if (0 == (height % 8)) {
177 for (cnt = (height >> 3); cnt--;) {
178 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
180 src += (8 * src_stride);
181 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
183 dst += (8 * dst_stride);
185 } else if (0 == (height % 4)) {
186 for (cnt = (height >> 2); cnt--;) {
187 LD_UB4(src, src_stride, src0, src1, src2, src3);
188 src += (4 * src_stride);
190 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
191 dst += (4 * dst_stride);
196 static void copy_width24_msa(uint8_t *src, int32_t src_stride,
197 uint8_t *dst, int32_t dst_stride,
201 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
202 uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
204 for (cnt = 4; cnt--;) {
205 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
206 LD4(src + 16, src_stride, out0, out1, out2, out3);
207 src += (4 * src_stride);
208 LD4(src + 16, src_stride, out4, out5, out6, out7);
209 src += (4 * src_stride);
211 ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
212 SD4(out0, out1, out2, out3, dst + 16, dst_stride);
213 dst += (4 * dst_stride);
214 SD4(out4, out5, out6, out7, dst + 16, dst_stride);
215 dst += (4 * dst_stride);
219 static void copy_width32_msa(uint8_t *src, int32_t src_stride,
220 uint8_t *dst, int32_t dst_stride,
224 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
226 for (cnt = (height >> 2); cnt--;) {
227 LD_UB4(src, src_stride, src0, src1, src2, src3);
228 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
229 src += (4 * src_stride);
230 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
231 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
232 dst += (4 * dst_stride);
236 static void copy_width48_msa(uint8_t *src, int32_t src_stride,
237 uint8_t *dst, int32_t dst_stride,
241 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
244 for (cnt = (height >> 2); cnt--;) {
245 LD_UB4(src, src_stride, src0, src1, src2, src3);
246 LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
247 LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
248 src += (4 * src_stride);
250 ST_UB4(src0, src1, src2, src3, dst, dst_stride);
251 ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
252 ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
253 dst += (4 * dst_stride);
257 static void copy_width64_msa(uint8_t *src, int32_t src_stride,
258 uint8_t *dst, int32_t dst_stride,
262 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
263 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
265 for (cnt = (height >> 2); cnt--;) {
266 LD_UB4(src, 16, src0, src1, src2, src3);
268 LD_UB4(src, 16, src4, src5, src6, src7);
270 LD_UB4(src, 16, src8, src9, src10, src11);
272 LD_UB4(src, 16, src12, src13, src14, src15);
275 ST_UB4(src0, src1, src2, src3, dst, 16);
277 ST_UB4(src4, src5, src6, src7, dst, 16);
279 ST_UB4(src8, src9, src10, src11, dst, 16);
281 ST_UB4(src12, src13, src14, src15, dst, 16);
286 static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
287 uint8_t *dst, int32_t dst_stride,
288 const int8_t *filter)
290 v16u8 mask0, mask1, mask2, mask3, out;
291 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
292 v8i16 filt, out0, out1;
294 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
297 /* rearranging filter */
298 filt = LD_SH(filter);
299 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
305 LD_SB4(src, src_stride, src0, src1, src2, src3);
306 XORI_B4_128_SB(src0, src1, src2, src3);
307 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
308 mask3, filt0, filt1, filt2, filt3, out0, out1);
309 SRARI_H2_SH(out0, out1, 6);
310 SAT_SH2_SH(out0, out1, 7);
311 out = PCKEV_XORI128_UB(out0, out1);
312 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
315 static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
316 uint8_t *dst, int32_t dst_stride,
317 const int8_t *filter)
319 v16i8 filt0, filt1, filt2, filt3;
320 v16i8 src0, src1, src2, src3;
321 v16u8 mask0, mask1, mask2, mask3, out;
322 v8i16 filt, out0, out1, out2, out3;
324 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
327 /* rearranging filter */
328 filt = LD_SH(filter);
329 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
335 LD_SB4(src, src_stride, src0, src1, src2, src3);
336 XORI_B4_128_SB(src0, src1, src2, src3);
337 src += (4 * src_stride);
338 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
339 mask3, filt0, filt1, filt2, filt3, out0, out1);
340 LD_SB4(src, src_stride, src0, src1, src2, src3);
341 XORI_B4_128_SB(src0, src1, src2, src3);
342 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
343 mask3, filt0, filt1, filt2, filt3, out2, out3);
344 SRARI_H4_SH(out0, out1, out2, out3, 6);
345 SAT_SH4_SH(out0, out1, out2, out3, 7);
346 out = PCKEV_XORI128_UB(out0, out1);
347 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
348 dst += (4 * dst_stride);
349 out = PCKEV_XORI128_UB(out2, out3);
350 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
353 static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
354 uint8_t *dst, int32_t dst_stride,
355 const int8_t *filter)
357 v16u8 mask0, mask1, mask2, mask3, out;
358 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
359 v8i16 filt, out0, out1, out2, out3;
361 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
364 /* rearranging filter */
365 filt = LD_SH(filter);
366 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
372 LD_SB4(src, src_stride, src0, src1, src2, src3);
373 XORI_B4_128_SB(src0, src1, src2, src3);
374 src += (4 * src_stride);
375 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
376 mask3, filt0, filt1, filt2, filt3, out0, out1);
377 LD_SB4(src, src_stride, src0, src1, src2, src3);
378 XORI_B4_128_SB(src0, src1, src2, src3);
379 src += (4 * src_stride);
380 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
381 mask3, filt0, filt1, filt2, filt3, out2, out3);
382 SRARI_H4_SH(out0, out1, out2, out3, 6);
383 SAT_SH4_SH(out0, out1, out2, out3, 7);
384 out = PCKEV_XORI128_UB(out0, out1);
385 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
386 dst += (4 * dst_stride);
387 out = PCKEV_XORI128_UB(out2, out3);
388 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
389 dst += (4 * dst_stride);
391 LD_SB4(src, src_stride, src0, src1, src2, src3);
392 XORI_B4_128_SB(src0, src1, src2, src3);
393 src += (4 * src_stride);
394 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
395 mask3, filt0, filt1, filt2, filt3, out0, out1);
396 LD_SB4(src, src_stride, src0, src1, src2, src3);
397 XORI_B4_128_SB(src0, src1, src2, src3);
398 src += (4 * src_stride);
399 HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
400 mask3, filt0, filt1, filt2, filt3, out2, out3);
402 SRARI_H4_SH(out0, out1, out2, out3, 6);
403 SAT_SH4_SH(out0, out1, out2, out3, 7);
404 out = PCKEV_XORI128_UB(out0, out1);
405 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
406 dst += (4 * dst_stride);
407 out = PCKEV_XORI128_UB(out2, out3);
408 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* Dispatch 4-wide 8-tap horizontal interpolation by height (4, 8 or 16).
 * Other heights are not handled. */
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}
424 static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
425 uint8_t *dst, int32_t dst_stride,
426 const int8_t *filter, int32_t height)
429 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
430 v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
431 v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
432 v8i16 filt, out0, out1, out2, out3;
434 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
437 /* rearranging filter */
438 filt = LD_SH(filter);
439 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
445 for (loop_cnt = (height >> 2); loop_cnt--;) {
446 LD_SB4(src, src_stride, src0, src1, src2, src3);
447 XORI_B4_128_SB(src0, src1, src2, src3);
448 src += (4 * src_stride);
450 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
451 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
452 DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
453 out0, out1, out2, out3);
454 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
455 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
456 DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
457 out0, out1, out2, out3);
458 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
459 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
460 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
461 out0, out1, out2, out3);
462 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
463 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
464 DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
465 out0, out1, out2, out3);
467 SRARI_H4_SH(out0, out1, out2, out3, 6);
468 SAT_SH4_SH(out0, out1, out2, out3, 7);
469 tmp0 = PCKEV_XORI128_UB(out0, out1);
470 tmp1 = PCKEV_XORI128_UB(out2, out3);
471 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
472 dst += (4 * dst_stride);
476 static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
477 uint8_t *dst, int32_t dst_stride,
478 const int8_t *filter, int32_t height)
481 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
482 v16u8 tmp0, tmp1, tmp2;
483 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
484 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
485 v16i8 filt0, filt1, filt2, filt3;
486 v8i16 filt, out0, out1, out2, out3, out4, out5;
488 mask00 = LD_UB(&ff_hevc_mask_arr[0]);
489 mask0 = LD_UB(&ff_hevc_mask_arr[16]);
493 /* rearranging filter */
494 filt = LD_SH(filter);
495 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
504 for (loop_cnt = 4; loop_cnt--;) {
506 LD_SB4(src, src_stride, src0, src1, src2, src3);
508 LD_SB4(src + 8, src_stride, src4, src5, src6, src7);
510 XORI_B4_128_SB(src0, src1, src2, src3);
511 XORI_B4_128_SB(src4, src5, src6, src7);
512 src += (4 * src_stride);
514 VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
515 VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
516 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
518 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
519 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
520 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
522 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
523 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
524 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
526 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
527 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
528 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
532 VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
533 DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
534 VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
535 DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
536 VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
537 DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
538 VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
539 DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);
541 SRARI_H4_SH(out0, out1, out2, out3, 6);
542 SRARI_H2_SH(out4, out5, 6);
543 SAT_SH4_SH(out0, out1, out2, out3, 7);
544 SAT_SH2_SH(out4, out5, 7);
545 tmp0 = PCKEV_XORI128_UB(out0, out1);
546 tmp1 = PCKEV_XORI128_UB(out2, out3);
547 tmp2 = PCKEV_XORI128_UB(out4, out5);
549 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
550 ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
551 dst += (4 * dst_stride);
555 static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
556 uint8_t *dst, int32_t dst_stride,
557 const int8_t *filter, int32_t height)
560 v16u8 mask0, mask1, mask2, mask3, out;
561 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
562 v16i8 filt0, filt1, filt2, filt3;
563 v8i16 filt, out0, out1, out2, out3;
565 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
568 /* rearranging filter */
569 filt = LD_SH(filter);
570 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
576 for (loop_cnt = (height >> 2); loop_cnt--;) {
577 LD_SB2(src, src_stride, src0, src2);
578 LD_SB2(src + 8, src_stride, src1, src3);
579 src += (2 * src_stride);
581 LD_SB2(src, src_stride, src4, src6);
582 LD_SB2(src + 8, src_stride, src5, src7);
583 src += (2 * src_stride);
585 XORI_B4_128_SB(src0, src1, src2, src3);
586 XORI_B4_128_SB(src4, src5, src6, src7);
587 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
588 mask3, filt0, filt1, filt2, filt3, out0,
590 SRARI_H4_SH(out0, out1, out2, out3, 6);
591 SAT_SH4_SH(out0, out1, out2, out3, 7);
592 out = PCKEV_XORI128_UB(out0, out1);
595 out = PCKEV_XORI128_UB(out2, out3);
599 HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
600 mask3, filt0, filt1, filt2, filt3, out0,
602 SRARI_H4_SH(out0, out1, out2, out3, 6);
603 SAT_SH4_SH(out0, out1, out2, out3, 7);
604 out = PCKEV_XORI128_UB(out0, out1);
607 out = PCKEV_XORI128_UB(out2, out3);
613 static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
614 uint8_t *dst, int32_t dst_stride,
615 const int8_t *filter, int32_t height)
618 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
619 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
620 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
622 v8i16 out0, out1, out2, out3, out8, out9, filt;
624 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
627 /* rearranging filter */
628 filt = LD_SH(filter);
629 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
639 for (loop_cnt = 16; loop_cnt--;) {
640 LD_SB2(src, src_stride, src0, src2);
641 LD_SB2(src + 16, src_stride, src1, src3);
642 XORI_B4_128_SB(src0, src1, src2, src3);
643 src += (2 * src_stride);
644 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
645 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
646 VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
647 DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
649 DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
650 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
651 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
652 VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
653 DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
654 out0, out8, out2, out9);
655 DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
656 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
657 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
658 VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
659 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
660 out0, out8, out2, out9);
661 DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
662 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
663 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
664 VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
665 DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
666 out0, out8, out2, out9);
667 DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
668 SRARI_H4_SH(out0, out8, out2, out9, 6);
669 SRARI_H2_SH(out1, out3, 6);
670 SAT_SH4_SH(out0, out8, out2, out9, 7);
671 SAT_SH2_SH(out1, out3, 7);
672 out = PCKEV_XORI128_UB(out8, out9);
673 ST8x2_UB(out, dst + 16, dst_stride);
674 out = PCKEV_XORI128_UB(out0, out1);
677 out = PCKEV_XORI128_UB(out2, out3);
683 static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
684 uint8_t *dst, int32_t dst_stride,
685 const int8_t *filter, int32_t height)
688 v16u8 mask0, mask1, mask2, mask3, out;
689 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
690 v16i8 filt0, filt1, filt2, filt3;
691 v8i16 filt, out0, out1, out2, out3;
693 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
696 /* rearranging filter */
697 filt = LD_SH(filter);
698 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
704 for (loop_cnt = (height >> 1); loop_cnt--;) {
706 src1 = LD_SB(src + 8);
707 src2 = LD_SB(src + 16);
708 src3 = LD_SB(src + 24);
710 XORI_B4_128_SB(src0, src1, src2, src3);
713 src5 = LD_SB(src + 8);
714 src6 = LD_SB(src + 16);
715 src7 = LD_SB(src + 24);
717 XORI_B4_128_SB(src4, src5, src6, src7);
719 HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
720 mask3, filt0, filt1, filt2, filt3, out0,
722 SRARI_H4_SH(out0, out1, out2, out3, 6);
723 SAT_SH4_SH(out0, out1, out2, out3, 7);
725 out = PCKEV_XORI128_UB(out0, out1);
727 out = PCKEV_XORI128_UB(out2, out3);
728 ST_UB(out, dst + 16);
731 HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
732 mask3, filt0, filt1, filt2, filt3, out0,
734 SRARI_H4_SH(out0, out1, out2, out3, 6);
735 SAT_SH4_SH(out0, out1, out2, out3, 7);
736 out = PCKEV_XORI128_UB(out0, out1);
738 out = PCKEV_XORI128_UB(out2, out3);
739 ST_UB(out, dst + 16);
744 static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
745 uint8_t *dst, int32_t dst_stride,
746 const int8_t *filter, int32_t height)
749 v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
751 v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
752 v8i16 filt, out0, out1, out2, out3;
754 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
757 /* rearranging filter */
758 filt = LD_SH(filter);
759 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
769 for (loop_cnt = 64; loop_cnt--;) {
771 src1 = LD_SB(src + 8);
772 src2 = LD_SB(src + 16);
773 src3 = LD_SB(src + 32);
774 src4 = LD_SB(src + 40);
777 XORI_B4_128_SB(src0, src1, src2, src3);
778 src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);
780 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
782 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
783 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
785 DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
786 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
787 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
789 DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
790 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
792 VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
794 DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
795 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
797 SRARI_H2_SH(out0, out1, 6);
798 out3 = __msa_srari_h(out2, 6);
799 SAT_SH3_SH(out0, out1, out3, 7);
800 out = PCKEV_XORI128_UB(out0, out1);
803 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
805 DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
806 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
808 DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
809 out2 = __msa_dpadd_s_h(out2, vec2, filt1);
810 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
812 DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
813 out2 = __msa_dpadd_s_h(out2, vec2, filt2);
814 VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
816 DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
817 out2 = __msa_dpadd_s_h(out2, vec2, filt3);
819 SRARI_H2_SH(out0, out1, 6);
820 out2 = __msa_srari_h(out2, 6);
821 SAT_SH3_SH(out0, out1, out2, 7);
822 out = PCKEV_XORI128_UB(out3, out0);
823 ST_UB(out, dst + 16);
824 out = PCKEV_XORI128_UB(out1, out2);
825 ST_UB(out, dst + 32);
830 static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
831 uint8_t *dst, int32_t dst_stride,
832 const int8_t *filter, int32_t height)
835 v16u8 mask0, mask1, mask2, mask3, out;
836 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
837 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
838 v16i8 filt0, filt1, filt2, filt3;
839 v8i16 res0, res1, res2, res3, filt;
841 mask0 = LD_UB(&ff_hevc_mask_arr[0]);
844 /* rearranging filter */
845 filt = LD_SH(filter);
846 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
852 for (loop_cnt = height; loop_cnt--;) {
853 LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
856 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
858 VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
859 VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
860 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
862 VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
863 VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
864 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
866 VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
867 VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
868 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
870 VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
871 VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
872 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
875 SRARI_H4_SH(res0, res1, res2, res3, 6);
876 SAT_SH4_SH(res0, res1, res2, res3, 7);
877 out = PCKEV_XORI128_UB(res0, res1);
879 out = PCKEV_XORI128_UB(res2, res3);
880 ST_UB(out, dst + 16);
882 VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
883 VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
884 DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
886 VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
887 VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
888 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
890 VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
891 VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
892 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
894 VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
895 VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
896 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
899 SRARI_H4_SH(res0, res1, res2, res3, 6);
900 SAT_SH4_SH(res0, res1, res2, res3, 7);
901 out = PCKEV_XORI128_UB(res0, res1);
902 ST_UB(out, dst + 32);
903 out = PCKEV_XORI128_UB(res2, res3);
904 ST_UB(out, dst + 48);
909 static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
910 uint8_t *dst, int32_t dst_stride,
911 const int8_t *filter, int32_t height)
915 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
916 v16i8 src11, src12, src13, src14;
917 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
918 v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
919 v16i8 src1110_r, src1211_r, src1312_r, src1413_r, src12111110, src14131312;
920 v16i8 src10998, filt0, filt1, filt2, filt3;
921 v8i16 filt, out10, out32, out54, out76;
923 src -= (3 * src_stride);
925 filt = LD_SH(filter);
926 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
928 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
929 src += (7 * src_stride);
931 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
933 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
934 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
936 XORI_B3_128_SB(src2110, src4332, src6554);
938 for (loop_cnt = (height >> 3); loop_cnt--;) {
939 LD_SB4(src, src_stride, src7, src8, src9, src10);
940 src += (4 * src_stride);
941 LD_SB4(src, src_stride, src11, src12, src13, src14);
942 src += (4 * src_stride);
944 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
945 src87_r, src98_r, src109_r);
946 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
947 src1110_r, src1211_r, src1312_r, src1413_r);
948 ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
949 ILVR_D2_SB(src1211_r, src1110_r, src1413_r, src1312_r,
950 src12111110, src14131312);
951 XORI_B2_128_SB(src8776, src10998);
952 XORI_B2_128_SB(src12111110, src14131312);
954 DOTP_SB2_SH(src2110, src4332, filt0, filt0, out10, out32);
955 DOTP_SB2_SH(src6554, src8776, filt0, filt0, out54, out76);
956 DPADD_SB2_SH(src4332, src6554, filt1, filt1, out10, out32);
957 DPADD_SB2_SH(src8776, src10998, filt1, filt1, out54, out76);
958 DPADD_SB2_SH(src6554, src8776, filt2, filt2, out10, out32);
959 DPADD_SB2_SH(src10998, src12111110, filt2, filt2, out54, out76);
960 DPADD_SB2_SH(src8776, src10998, filt3, filt3, out10, out32);
961 DPADD_SB2_SH(src12111110, src14131312, filt3, filt3, out54, out76);
962 SRARI_H2_SH(out10, out32, 6);
963 SRARI_H2_SH(out54, out76, 6);
964 SAT_SH2_SH(out10, out32, 7);
965 SAT_SH2_SH(out54, out76, 7);
966 out0 = PCKEV_XORI128_UB(out10, out32);
967 out1 = PCKEV_XORI128_UB(out54, out76);
968 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
969 dst += (4 * dst_stride);
970 ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
971 dst += (4 * dst_stride);
974 src4332 = src12111110;
975 src6554 = src14131312;
/* Vertical 8-tap filter for an 8-pixel-wide column (MSA).
 * Preloads 7 rows, then each loop iteration loads 4 new rows, computes 4
 * filtered output rows via dot-product accumulation, rounds (>> 6 with
 * rounding), saturates to 7 bits and stores them as 8x4 bytes.
 * NOTE(review): this chunk lost several original lines during extraction
 * (loop_cnt/tmp0/tmp1 declarations, the sliding-window shift at the loop
 * tail, closing braces) -- verify against the complete file. */
980 static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
981 uint8_t *dst, int32_t dst_stride,
982 const int8_t *filter, int32_t height)
985 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
986 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
987 v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
989 v8i16 filt, out0_r, out1_r, out2_r, out3_r;
/* back up 3 rows: the 8-tap filter needs 3 rows of history above row 0 */
991 src -= (3 * src_stride);
/* splat the four filter-coefficient pairs into filt0..filt3 */
993 filt = LD_SH(filter);
994 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: load first 7 rows; XORI_*_128 flips bit 7 of every byte
 * (presumably re-biasing unsigned pixels for signed dot products -- confirm) */
996 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
997 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
998 src += (7 * src_stride);
/* interleave vertically adjacent rows to form the filter tap pairs */
999 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1001 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
/* main loop: 4 output rows per iteration */
1003 for (loop_cnt = (height >> 2); loop_cnt--;) {
1004 LD_SB4(src, src_stride, src7, src8, src9, src10);
1005 XORI_B4_128_SB(src7, src8, src9, src10);
1006 src += (4 * src_stride);
1008 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1009 src87_r, src98_r, src109_r);
/* accumulate the 8 taps as 4 dot-product-add stages (2 taps per stage) */
1010 DOTP_SB4_SH(src10_r, src21_r, src32_r, src43_r, filt0, filt0, filt0,
1011 filt0, out0_r, out1_r, out2_r, out3_r);
1012 DPADD_SB4_SH(src32_r, src43_r, src54_r, src65_r, filt1, filt1, filt1,
1013 filt1, out0_r, out1_r, out2_r, out3_r);
1014 DPADD_SB4_SH(src54_r, src65_r, src76_r, src87_r, filt2, filt2, filt2,
1015 filt2, out0_r, out1_r, out2_r, out3_r);
1016 DPADD_SB4_SH(src76_r, src87_r, src98_r, src109_r, filt3, filt3, filt3,
1017 filt3, out0_r, out1_r, out2_r, out3_r);
/* round, saturate, pack to bytes (PCKEV_XORI128 also undoes the ±128 bias) */
1018 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1019 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1020 tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
1021 tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
1022 ST8x4_UB(tmp0, tmp1, dst, dst_stride);
1023 dst += (4 * dst_stride);
/* Vertical 8-tap filter for a 12-pixel-wide column (MSA).
 * Processes the full 16-lane vectors (right/_r and left/_l interleaves)
 * and stores 8 bytes plus a 4-byte remainder per output row, 4 rows per
 * iteration; the loop runs a fixed 4 times (height presumably 16 here --
 * TODO confirm against callers).
 * NOTE(review): extraction dropped lines (loop_cnt declaration, the SD()
 * stores paired with the SW() stores, window-shift tail, braces) --
 * verify against the complete file. */
1035 static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1036 uint8_t *dst, int32_t dst_stride,
1037 const int8_t *filter, int32_t height)
1040 uint32_t out2, out3;
1041 uint64_t out0, out1;
1042 v16u8 tmp0, tmp1, tmp2, tmp3;
1043 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1044 v16i8 filt0, filt1, filt2, filt3;
1045 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1046 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1047 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1048 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* back up 3 rows of filter history */
1050 src -= (3 * src_stride);
1052 filt = LD_SH(filter);
1053 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 rows loaded, sign-bit-flipped, and interleaved (both halves) */
1055 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1056 src += (7 * src_stride);
1058 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1060 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1062 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1063 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1065 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1067 for (loop_cnt = 4; loop_cnt--;) {
1068 LD_SB4(src, src_stride, src7, src8, src9, src10);
1069 XORI_B4_128_SB(src7, src8, src9, src10);
1070 src += (4 * src_stride);
1072 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1073 src87_r, src98_r, src109_r);
1074 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1075 src87_l, src98_l, src109_l);
/* 8-tap vertical filter on right (low 8 lanes) and left (high 8 lanes) */
1076 out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1077 filt1, filt2, filt3);
1078 out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1079 filt1, filt2, filt3);
1080 out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1081 filt1, filt2, filt3);
1082 out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1083 filt1, filt2, filt3);
1084 out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1085 filt1, filt2, filt3);
1086 out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1087 filt1, filt2, filt3);
1088 out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1089 filt1, filt2, filt3);
1090 out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1091 filt1, filt2, filt3);
/* round, saturate, pack; XORI undoes the ±128 bias after packing */
1092 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1093 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1094 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1095 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1096 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1097 out3_r, tmp0, tmp1, tmp2, tmp3);
1098 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
/* 12-wide store: low 8 bytes (out0/out1, via missing SD lines) plus the
 * next word at dst+8 (out2/out3) */
1100 out0 = __msa_copy_u_d((v2i64) tmp0, 0);
1101 out1 = __msa_copy_u_d((v2i64) tmp1, 0);
1102 out2 = __msa_copy_u_w((v4i32) tmp0, 2);
1103 out3 = __msa_copy_u_w((v4i32) tmp1, 2);
1105 SW(out2, (dst + 8));
1108 SW(out3, (dst + 8));
1110 out0 = __msa_copy_u_d((v2i64) tmp2, 0);
1111 out1 = __msa_copy_u_d((v2i64) tmp3, 0);
1112 out2 = __msa_copy_u_w((v4i32) tmp2, 2);
1113 out3 = __msa_copy_u_w((v4i32) tmp3, 2);
1115 SW(out2, (dst + 8));
1118 SW(out3, (dst + 8));
/* Vertical 8-tap filter for a 16-pixel-wide column (MSA).
 * Same sliding-window scheme as the 8-wide variant, but filters both the
 * right (_r) and left (_l) byte-interleaves so each iteration produces four
 * full 16-byte output rows stored with ST_UB4.
 * NOTE(review): extraction dropped lines (loop_cnt declaration, the
 * src10_r = src54_r ... window-shift tail, closing braces) -- verify
 * against the complete file. */
1137 static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1138 uint8_t *dst, int32_t dst_stride,
1139 const int8_t *filter, int32_t height)
1142 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1143 v16i8 filt0, filt1, filt2, filt3;
1144 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1145 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1146 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1147 v16u8 tmp0, tmp1, tmp2, tmp3;
1148 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
/* 3 rows of history above the first output row */
1150 src -= (3 * src_stride);
1152 filt = LD_SH(filter);
1153 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 rows, sign-bit flip, right+left interleaves */
1155 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1156 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1157 src += (7 * src_stride);
1158 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
1160 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1161 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
1163 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* 4 output rows per iteration */
1165 for (loop_cnt = (height >> 2); loop_cnt--;) {
1166 LD_SB4(src, src_stride, src7, src8, src9, src10);
1167 XORI_B4_128_SB(src7, src8, src9, src10);
1168 src += (4 * src_stride);
1170 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1171 src87_r, src98_r, src109_r);
1172 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1173 src87_l, src98_l, src109_l);
1174 out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r, filt0,
1175 filt1, filt2, filt3);
1176 out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r, filt0,
1177 filt1, filt2, filt3);
1178 out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r, filt0,
1179 filt1, filt2, filt3);
1180 out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r, filt0,
1181 filt1, filt2, filt3);
1182 out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l, filt0,
1183 filt1, filt2, filt3);
1184 out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l, filt0,
1185 filt1, filt2, filt3);
1186 out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l, filt0,
1187 filt1, filt2, filt3);
1188 out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l, filt0,
1189 filt1, filt2, filt3);
/* round, saturate, pack to 16-byte rows, undo ±128 bias, store */
1190 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1191 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1192 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1193 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1194 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1195 out3_r, tmp0, tmp1, tmp2, tmp3);
1196 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1197 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
1198 dst += (4 * dst_stride);
/* Vertical 8-tap filter for widths that are multiples of 16 (MSA).
 * Outer loop walks 16-column strips (cnt = width >> 4); inner loop is the
 * same 4-rows-per-iteration pipeline as common_vt_8t_16w_msa, operating on
 * per-strip cursors src_tmp/dst_tmp.
 * NOTE(review): extraction dropped lines (the `width` parameter line,
 * src_tmp/dst_tmp declarations and their per-strip initialization, the
 * window-shift tail, src += 16 / dst += 16 strip advance, braces) --
 * verify against the complete file. */
1216 static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
1217 uint8_t *dst, int32_t dst_stride,
1218 const int8_t *filter, int32_t height,
1223 uint32_t loop_cnt, cnt;
1224 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1225 v16i8 filt0, filt1, filt2, filt3;
1226 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
1227 v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
1228 v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
1229 v16u8 tmp0, tmp1, tmp2, tmp3;
1230 v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
1232 src -= (3 * src_stride);
1234 filt = LD_SH(filter);
1235 SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* one pass per 16-column strip */
1237 for (cnt = (width >> 4); cnt--;) {
1241 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1242 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1243 src_tmp += (7 * src_stride);
1244 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
1245 src32_r, src54_r, src21_r);
1246 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1247 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
1248 src32_l, src54_l, src21_l);
1249 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* 4 output rows per iteration within the strip */
1251 for (loop_cnt = (height >> 2); loop_cnt--;) {
1252 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1253 XORI_B4_128_SB(src7, src8, src9, src10);
1254 src_tmp += (4 * src_stride);
1255 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
1256 src87_r, src98_r, src109_r);
1257 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
1258 src87_l, src98_l, src109_l);
1259 out0_r = HEVC_FILT_8TAP_SH(src10_r, src32_r, src54_r, src76_r,
1260 filt0, filt1, filt2, filt3);
1261 out1_r = HEVC_FILT_8TAP_SH(src21_r, src43_r, src65_r, src87_r,
1262 filt0, filt1, filt2, filt3);
1263 out2_r = HEVC_FILT_8TAP_SH(src32_r, src54_r, src76_r, src98_r,
1264 filt0, filt1, filt2, filt3);
1265 out3_r = HEVC_FILT_8TAP_SH(src43_r, src65_r, src87_r, src109_r,
1266 filt0, filt1, filt2, filt3);
1267 out0_l = HEVC_FILT_8TAP_SH(src10_l, src32_l, src54_l, src76_l,
1268 filt0, filt1, filt2, filt3);
1269 out1_l = HEVC_FILT_8TAP_SH(src21_l, src43_l, src65_l, src87_l,
1270 filt0, filt1, filt2, filt3);
1271 out2_l = HEVC_FILT_8TAP_SH(src32_l, src54_l, src76_l, src98_l,
1272 filt0, filt1, filt2, filt3);
1273 out3_l = HEVC_FILT_8TAP_SH(src43_l, src65_l, src87_l, src109_l,
1274 filt0, filt1, filt2, filt3);
1275 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
1276 SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
1277 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
1278 SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
1279 PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
1280 out3_r, tmp0, tmp1, tmp2, tmp3);
1281 XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
1282 ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
1283 dst_tmp += (4 * dst_stride);
/* 24-wide vertical 8-tap filter: delegates the left columns to the
 * 16-wide-multiple kernel and the remaining 8 columns (offset +16) to the
 * 8-wide kernel.
 * NOTE(review): the trailing argument lines (width/height values) and
 * braces are missing from this chunk -- verify against the complete file. */
1305 static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1306 uint8_t *dst, int32_t dst_stride,
1307 const int8_t *filter, int32_t height)
1309 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
1312 common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
/* 32-wide vertical 8-tap filter: thin wrapper over the 16-wide-multiple
 * kernel.
 * NOTE(review): the trailing width-argument line and braces are missing
 * from this chunk -- verify against the complete file. */
1316 static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1317 uint8_t *dst, int32_t dst_stride,
1318 const int8_t *filter, int32_t height)
1320 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
/* 48-wide vertical 8-tap filter: thin wrapper over the 16-wide-multiple
 * kernel.
 * NOTE(review): the trailing width-argument line and braces are missing
 * from this chunk -- verify against the complete file. */
1324 static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1325 uint8_t *dst, int32_t dst_stride,
1326 const int8_t *filter, int32_t height)
1328 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
/* 64-wide vertical 8-tap filter: thin wrapper over the 16-wide-multiple
 * kernel.
 * NOTE(review): the trailing width-argument line and braces are missing
 * from this chunk -- verify against the complete file. */
1332 static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1333 uint8_t *dst, int32_t dst_stride,
1334 const int8_t *filter, int32_t height)
1336 common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
/* 2-D (horizontal-then-vertical) 8-tap uni-prediction filter for 4-wide
 * blocks (MSA). Two 4-pixel rows are packed per vector (mask0 comes from
 * ff_hevc_mask_arr + 16, the dual-source shuffle set); the horizontal pass
 * uses filter_x, the vertical pass uses filter_y on the 16-bit intermediate
 * results. 8 output rows per loop iteration, stored as 4x4 twice.
 * NOTE(review): extraction dropped lines (loop_cnt/out0/out1/filter_vec
 * declarations, dst/dst_stride parameter lines, mask1..mask3 setup,
 * trailing ")" of several HEVC_FILT_8TAP_SH calls, part of the
 * window-shift tail, braces) -- verify against the complete file. */
1340 static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
1344 const int8_t *filter_x,
1345 const int8_t *filter_y,
1350 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1351 v16i8 src9, src10, src11, src12, src13, src14;
1352 v8i16 filt0, filt1, filt2, filt3;
1353 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1354 v16i8 mask1, mask2, mask3;
1356 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1357 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1358 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1359 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst1110_r, dst1312_r;
1360 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r, dst1211_r, dst1413_r;
1361 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1362 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
/* back up 3 rows and 3 columns of 8-tap history */
1364 src -= ((3 * src_stride) + 3);
1365 filter_vec = LD_SH(filter_x);
1366 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* vertical coefficients: sign-extend bytes to halfwords, splat as words */
1368 filter_vec = LD_SH(filter_y);
1369 UNPCK_R_SB_SH(filter_vec, filter_vec);
1371 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* horizontal prologue over the first 7 rows (rows paired: 0+3, 1+4, ...) */
1377 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1378 src += (7 * src_stride);
1379 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1381 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1382 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1383 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1384 vec8, vec9, vec10, vec11);
1385 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1386 vec12, vec13, vec14, vec15);
1388 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1390 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1392 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1394 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
/* build the vertical tap pairs from the packed row results */
1397 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1398 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1399 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
/* dst66 = row 6 replicated (high half of dst63) */
1401 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
/* main loop: 8 new rows in, 8 output rows out */
1403 for (loop_cnt = height >> 3; loop_cnt--;) {
1404 LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1406 src += (8 * src_stride);
1407 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1409 VSHF_B4_SB(src7, src11, mask0, mask1, mask2, mask3,
1410 vec0, vec1, vec2, vec3);
1411 VSHF_B4_SB(src8, src12, mask0, mask1, mask2, mask3,
1412 vec4, vec5, vec6, vec7);
1413 VSHF_B4_SB(src9, src13, mask0, mask1, mask2, mask3,
1414 vec8, vec9, vec10, vec11);
1415 VSHF_B4_SB(src10, src14, mask0, mask1, mask2, mask3,
1416 vec12, vec13, vec14, vec15);
1418 dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1420 dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1422 dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1424 dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
/* extend the vertical tap-pair chain with the 8 new filtered rows */
1427 dst76_r = __msa_ilvr_h(dst117, dst66);
1428 ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1429 ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1430 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1431 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1432 dst1110_r = __msa_ilvr_h(dst117, dst1410);
/* vertical 8-tap pass producing 8 output rows as 32-bit sums */
1434 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1435 filt_h1, filt_h2, filt_h3);
1436 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1437 filt_h1, filt_h2, filt_h3);
1438 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1439 filt_h1, filt_h2, filt_h3);
1440 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1441 filt_h1, filt_h2, filt_h3);
1442 dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1443 filt_h1, filt_h2, filt_h3);
1444 dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1445 filt_h1, filt_h2, filt_h3);
1446 dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1447 filt_h1, filt_h2, filt_h3);
1448 dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1449 filt_h0, filt_h1, filt_h2, filt_h3);
/* two shifts: >> 6 for the intermediate scale, then rounding >> 6 */
1451 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1452 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1453 SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1454 SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1455 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1456 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1457 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1458 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1459 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1460 out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1461 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
1462 dst += (4 * dst_stride);
1463 ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
1464 dst += (4 * dst_stride);
/* slide the vertical window for the next 8 rows */
1467 dst32_r = dst1110_r;
1468 dst54_r = dst1312_r;
1470 dst43_r = dst1211_r;
1471 dst65_r = dst1413_r;
1472 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
/* 2-D 8-tap uni-prediction filter for widths that are multiples of 8 (MSA).
 * Outer loop walks 8-column strips; per strip the horizontal pass filters
 * 7 prologue rows, then the inner loop produces 2 output rows per
 * iteration (horizontal filter on 2 new rows, vertical 8-tap on the 16-bit
 * intermediates, round/saturate/pack, 8x2 store).
 * NOTE(review): extraction dropped lines (dst/dst_stride parameter lines,
 * src_tmp/dst_tmp/out/filter_vec declarations and strip setup, trailing
 * ")" of several filter calls, the dst0..dst6 window-shift tail,
 * src += 8 / dst += 8 strip advance, braces) -- verify against the
 * complete file. */
1476 static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
1480 const int8_t *filter_x,
1481 const int8_t *filter_y,
1482 int32_t height, int32_t width)
1484 uint32_t loop_cnt, cnt;
1488 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1489 v8i16 filt0, filt1, filt2, filt3;
1490 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1491 v16i8 mask1, mask2, mask3;
1493 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1494 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1495 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1496 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
1497 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1498 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1499 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1500 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
/* single-source shuffle masks (ff_hevc_mask_arr base) */
1501 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
1503 src -= ((3 * src_stride) + 3);
1505 filter_vec = LD_SH(filter_x);
1506 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1508 filter_vec = LD_SH(filter_y);
1509 UNPCK_R_SB_SH(filter_vec, filter_vec);
1511 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
/* one pass per 8-column strip */
1517 for (cnt = width >> 3; cnt--;) {
1521 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1522 src_tmp += (7 * src_stride);
1523 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1525 /* row 0 row 1 row 2 row 3 */
1526 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1527 vec0, vec1, vec2, vec3);
1528 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1529 vec4, vec5, vec6, vec7);
1530 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1531 vec8, vec9, vec10, vec11);
1532 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1533 vec12, vec13, vec14, vec15);
1534 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1536 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1538 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1540 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
/* rows 4..6 of the horizontal prologue */
1543 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1544 vec0, vec1, vec2, vec3);
1545 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1546 vec4, vec5, vec6, vec7);
1547 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1548 vec8, vec9, vec10, vec11);
1549 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1551 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1553 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
/* inner loop: 2 output rows per iteration */
1556 for (loop_cnt = height >> 1; loop_cnt--;) {
1557 LD_SB2(src_tmp, src_stride, src7, src8);
1558 XORI_B2_128_SB(src7, src8);
1559 src_tmp += 2 * src_stride;
/* vertical tap pairs from the intermediate rows (right and left halves) */
1561 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1562 dst10_r, dst32_r, dst54_r, dst21_r);
1563 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1564 dst10_l, dst32_l, dst54_l, dst21_l);
1565 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1566 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
/* horizontal filter on new row 7, then vertical 8-tap for output row 0 */
1568 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1569 vec0, vec1, vec2, vec3);
1570 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1573 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1574 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1575 filt_h0, filt_h1, filt_h2, filt_h3);
1576 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1577 filt_h0, filt_h1, filt_h2, filt_h3);
/* same for new row 8 -> output row 1 */
1581 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1582 vec0, vec1, vec2, vec3);
1583 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1,
1586 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1587 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1588 filt_h0, filt_h1, filt_h2, filt_h3);
1589 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1590 filt_h0, filt_h1, filt_h2, filt_h3);
/* round, saturate, pack and store 8x2 bytes */
1593 SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1594 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1596 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1597 out = PCKEV_XORI128_UB(dst0, dst1);
1598 ST8x2_UB(out, dst_tmp, dst_stride);
1599 dst_tmp += (2 * dst_stride);
/* 8-wide 2-D 8-tap uni-prediction: thin wrapper over the 8-multiple kernel
 * with width = 8.
 * NOTE(review): the dst/dst_stride/height parameter lines and braces are
 * missing from this chunk -- verify against the complete file. */
1615 static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
1619 const int8_t *filter_x,
1620 const int8_t *filter_y,
1623 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1624 filter_x, filter_y, height, 8);
/* 2-D 8-tap uni-prediction filter for 12-wide blocks (MSA).
 * Two phases: first the left 8 columns are processed with the
 * single-source masks (mask0..mask3, ff_hevc_mask_arr base) and 8x2
 * stores, 2 rows per iteration; then the remaining 4 columns are
 * processed with the dual-source masks (mask4..mask7, base + 16), 8 rows
 * per iteration, 4x4 stores. Fixed trip counts (8 and 2) imply height is
 * 16 here -- TODO confirm against callers.
 * NOTE(review): extraction dropped lines (dst/dst_stride/height parameter
 * lines, loop_cnt/out0/out1 declarations, src_tmp/dst_tmp setup,
 * mask1..mask3 and mask5..mask7 setup, trailing ")" of several filter
 * calls, the dst0..dst6 shift tail of phase 1, src/dst advance to the
 * +8 column, braces) -- verify against the complete file. */
1627 static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
1631 const int8_t *filter_x,
1632 const int8_t *filter_y,
1636 uint8_t *src_tmp, *dst_tmp;
1638 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1639 v16i8 src11, src12, src13, src14;
1640 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1641 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1642 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1643 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1644 v8i16 dst30, dst41, dst52, dst63, dst66, dst117, dst128, dst139, dst1410;
1645 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1646 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst21_r, dst43_r, dst65_r;
1647 v8i16 dst10_l, dst32_l, dst54_l, dst76_l, dst21_l, dst43_l, dst65_l;
1648 v8i16 dst87_r, dst98_r, dst1110_r, dst1312_r, dst109_r, dst1211_r;
1649 v8i16 dst1413_r, dst87_l, filter_vec;
1650 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
1651 v4i32 dst0_l, dst1_l;
/* 3 rows and 3 columns of 8-tap history */
1653 src -= ((3 * src_stride) + 3);
1655 filter_vec = LD_SH(filter_x);
1656 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1658 filter_vec = LD_SH(filter_y);
1659 UNPCK_R_SB_SH(filter_vec, filter_vec);
1661 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1663 mask0 = LD_SB(ff_hevc_mask_arr);
/* ---- phase 1: left 8 columns ---- */
1671 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1672 src_tmp += (7 * src_stride);
1673 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1675 /* row 0 row 1 row 2 row 3 */
1676 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1677 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1678 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1680 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1682 dst0 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1684 dst1 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1686 dst2 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1688 dst3 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
/* rows 4..6 */
1691 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1692 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1693 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1695 dst4 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1697 dst5 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1699 dst6 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
/* 2 output rows per iteration, 8 iterations */
1702 for (loop_cnt = 8; loop_cnt--;) {
1703 LD_SB2(src_tmp, src_stride, src7, src8);
1704 XORI_B2_128_SB(src7, src8);
1705 src_tmp += 2 * src_stride;
1707 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_r,
1708 dst32_r, dst54_r, dst21_r);
1709 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1, dst10_l,
1710 dst32_l, dst54_l, dst21_l);
1711 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1712 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
1714 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1716 dst7 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1719 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1720 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1721 filt_h0, filt_h1, filt_h2, filt_h3);
1722 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1723 filt_h0, filt_h1, filt_h2, filt_h3);
1727 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1729 dst8 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1732 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1733 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1734 filt_h0, filt_h1, filt_h2, filt_h3);
1735 dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1736 filt_h0, filt_h1, filt_h2, filt_h3);
1739 SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
1740 SAT_SW4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 7);
1742 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
1743 out0 = PCKEV_XORI128_UB(dst0, dst1);
1744 ST8x2_UB(out0, dst_tmp, dst_stride);
1745 dst_tmp += (2 * dst_stride);
/* ---- phase 2: remaining 4 columns, dual-source masks ---- */
1759 mask4 = LD_SB(ff_hevc_mask_arr + 16);
1764 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1765 src += (7 * src_stride);
1766 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1768 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1769 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1770 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1772 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1775 dst30 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1777 dst41 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1779 dst52 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2,
1781 dst63 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2,
1784 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1785 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1786 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
1788 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
/* 8 output rows per iteration, 2 iterations */
1790 for (loop_cnt = 2; loop_cnt--;) {
1791 LD_SB8(src, src_stride, src7, src8, src9, src10, src11, src12, src13,
1793 src += (8 * src_stride);
1794 XORI_B8_128_SB(src7, src8, src9, src10, src11, src12, src13, src14);
1796 VSHF_B4_SB(src7, src11, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1798 VSHF_B4_SB(src8, src12, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1800 VSHF_B4_SB(src9, src13, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1802 VSHF_B4_SB(src10, src14, mask4, mask5, mask6, mask7, vec12, vec13,
1805 dst117 = HEVC_FILT_8TAP_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2,
1807 dst128 = HEVC_FILT_8TAP_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2,
1809 dst139 = HEVC_FILT_8TAP_SH(vec8, vec9, vec10, vec11, filt0, filt1,
1811 dst1410 = HEVC_FILT_8TAP_SH(vec12, vec13, vec14, vec15, filt0, filt1,
1814 dst76_r = __msa_ilvr_h(dst117, dst66);
1815 ILVRL_H2_SH(dst128, dst117, dst87_r, dst1211_r);
1816 ILVRL_H2_SH(dst139, dst128, dst98_r, dst1312_r);
1817 ILVRL_H2_SH(dst1410, dst139, dst109_r, dst1413_r);
1818 dst117 = (v8i16) __msa_splati_d((v2i64) dst117, 1);
1819 dst1110_r = __msa_ilvr_h(dst117, dst1410);
1821 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1822 filt_h1, filt_h2, filt_h3);
1823 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1824 filt_h1, filt_h2, filt_h3);
1825 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1826 filt_h1, filt_h2, filt_h3);
1827 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1828 filt_h1, filt_h2, filt_h3);
1829 dst4_r = HEVC_FILT_8TAP(dst54_r, dst76_r, dst98_r, dst1110_r, filt_h0,
1830 filt_h1, filt_h2, filt_h3);
1831 dst5_r = HEVC_FILT_8TAP(dst65_r, dst87_r, dst109_r, dst1211_r, filt_h0,
1832 filt_h1, filt_h2, filt_h3);
1833 dst6_r = HEVC_FILT_8TAP(dst76_r, dst98_r, dst1110_r, dst1312_r, filt_h0,
1834 filt_h1, filt_h2, filt_h3);
1835 dst7_r = HEVC_FILT_8TAP(dst87_r, dst109_r, dst1211_r, dst1413_r,
1836 filt_h0, filt_h1, filt_h2, filt_h3);
/* two shifts: >> 6 intermediate scale, then rounding >> 6 */
1838 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1839 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1840 SRARI_W4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1841 SRARI_W4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 6);
1842 SAT_SW4_SW(dst0_r, dst1_r, dst2_r, dst3_r, 7);
1843 SAT_SW4_SW(dst4_r, dst5_r, dst6_r, dst7_r, 7);
1844 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
1845 PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
1846 out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
1847 out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
1848 ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
1849 dst += (4 * dst_stride);
1850 ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
1851 dst += (4 * dst_stride);
/* slide the vertical window for the next 8 rows */
1854 dst32_r = dst1110_r;
1855 dst54_r = dst1312_r;
1857 dst43_r = dst1211_r;
1858 dst65_r = dst1413_r;
1859 dst66 = (v8i16) __msa_splati_d((v2i64) dst1410, 1);
/* 16-wide 2-D 8-tap uni-prediction: wrapper over the 8-multiple kernel
 * with width = 16.
 * NOTE(review): dst/dst_stride/height parameter lines and braces are
 * missing from this chunk -- verify against the complete file. */
1863 static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
1867 const int8_t *filter_x,
1868 const int8_t *filter_y,
1871 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1872 filter_x, filter_y, height, 16);
/* 24-wide 2-D 8-tap uni-prediction: wrapper over the 8-multiple kernel
 * with width = 24.
 * NOTE(review): dst/dst_stride/height parameter lines and braces are
 * missing from this chunk -- verify against the complete file. */
1875 static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
1879 const int8_t *filter_x,
1880 const int8_t *filter_y,
1883 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1884 filter_x, filter_y, height, 24);
/* 32-wide 2-D 8-tap uni-prediction: wrapper over the 8-multiple kernel
 * with width = 32.
 * NOTE(review): dst/dst_stride/height parameter lines and braces are
 * missing from this chunk -- verify against the complete file. */
1887 static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
1891 const int8_t *filter_x,
1892 const int8_t *filter_y,
1895 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1896 filter_x, filter_y, height, 32);
/* 48-wide 2-D 8-tap uni-prediction: wrapper over the 8-multiple kernel
 * with width = 48.
 * NOTE(review): dst/dst_stride/height parameter lines and braces are
 * missing from this chunk -- verify against the complete file. */
1899 static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
1903 const int8_t *filter_x,
1904 const int8_t *filter_y,
1907 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1908 filter_x, filter_y, height, 48);
/* 64-wide 2-D 8-tap uni-prediction: wrapper over the 8-multiple kernel
 * with width = 64.
 * NOTE(review): dst/dst_stride/height parameter lines and braces are
 * missing from this chunk -- verify against the complete file. */
1911 static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
1915 const int8_t *filter_x,
1916 const int8_t *filter_y,
1919 hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1920 filter_x, filter_y, height, 64);
/* Horizontal 4-tap filter, 4 pixels wide x 2 rows (MSA).
 * Both rows are packed into one vector by the dual-source shuffle
 * (mask0 = ff_hevc_mask_arr + 16), filtered in one HEVC_FILT_4TAP_SH,
 * rounded (>> 6), saturated and stored as 4x2 bytes.
 * NOTE(review): extraction dropped lines (filt/res0/out declarations,
 * mask1 setup, src -= 1 adjustment, braces) -- verify against the
 * complete file. */
1923 static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
1924 uint8_t *dst, int32_t dst_stride,
1925 const int8_t *filter)
1927 v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
1931 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1934 /* rearranging filter */
1935 filt = LD_SH(filter);
1936 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
/* load 2 rows, flip sign bit, shuffle into tap order, filter */
1940 LD_SB2(src, src_stride, src0, src1);
1941 XORI_B2_128_SB(src0, src1);
1942 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1943 res0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
/* round, saturate, pack (undoing the ±128 bias) and store 4x2 */
1944 res0 = __msa_srari_h(res0, 6);
1945 res0 = __msa_sat_s_h(res0, 7);
1946 out = PCKEV_XORI128_UB(res0, res0);
1947 ST4x2_UB(out, dst, dst_stride);
/* Horizontal 4-tap filter, 4 pixels wide x 4 rows (MSA).
 * Four rows are filtered in one HORIZ_4TAP_4WID_4VECS_FILT invocation
 * (two rows per vector), rounded (>> 6), saturated and stored 4x4.
 * NOTE(review): extraction dropped lines (out declaration, mask1 setup,
 * src -= 1 adjustment, braces) -- verify against the complete file. */
1950 static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
1951 uint8_t *dst, int32_t dst_stride,
1952 const int8_t *filter)
1954 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1955 v8i16 filt, out0, out1;
1958 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1961 /* rearranging filter */
1962 filt = LD_SH(filter);
1963 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
/* load 4 rows, flip sign bit, filter, round, saturate, pack, store */
1967 LD_SB4(src, src_stride, src0, src1, src2, src3);
1968 XORI_B4_128_SB(src0, src1, src2, src3);
1969 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1970 filt0, filt1, out0, out1);
1971 SRARI_H2_SH(out0, out1, 6);
1972 SAT_SH2_SH(out0, out1, 7);
1973 out = PCKEV_XORI128_UB(out0, out1);
1974 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal 4-tap filter, 4 pixels wide x 8 rows (MSA).
 * Runs the 4-row pipeline twice (rows 0-3 then 4-7), then rounds,
 * saturates and stores the two 4x4 groups.
 * NOTE(review): extraction dropped lines (out declaration, mask1 setup,
 * src -= 1 adjustment, braces) -- verify against the complete file. */
1977 static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
1978 uint8_t *dst, int32_t dst_stride,
1979 const int8_t *filter)
1981 v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
1983 v8i16 filt, out0, out1, out2, out3;
1985 mask0 = LD_SB(&ff_hevc_mask_arr[16]);
1988 /* rearranging filter */
1989 filt = LD_SH(filter);
1990 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
/* first 4 rows */
1994 LD_SB4(src, src_stride, src0, src1, src2, src3);
1995 src += (4 * src_stride);
1997 XORI_B4_128_SB(src0, src1, src2, src3);
1998 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
1999 filt0, filt1, out0, out1);
/* second 4 rows (src registers reused) */
2000 LD_SB4(src, src_stride, src0, src1, src2, src3);
2001 XORI_B4_128_SB(src0, src1, src2, src3);
2002 HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2003 filt0, filt1, out2, out3);
/* round, saturate, pack and store both 4x4 groups */
2004 SRARI_H4_SH(out0, out1, out2, out3, 6);
2005 SAT_SH4_SH(out0, out1, out2, out3, 7);
2006 out = PCKEV_XORI128_UB(out0, out1);
2007 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2008 dst += (4 * dst_stride);
2009 out = PCKEV_XORI128_UB(out2, out3);
2010 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal 4-tap filter, 4-pixel width, exactly 16 rows.
 * Two unrolled passes; each pass loads 8 rows, filters them as two
 * 4-row groups and stores four 4x4 blocks total. */
2013 static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
2014                                   uint8_t *dst, int32_t dst_stride,
2015                                   const int8_t *filter)
2017     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2018     v16i8 filt0, filt1, mask0, mask1;
2020     v8i16 filt, out0, out1, out2, out3;
2022     mask0 = LD_SB(&ff_hevc_mask_arr[16]);
2025     /* rearranging filter */
2026     filt = LD_SH(filter);
2027     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
     /* rows 0..7 */
2031     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2032     src += (8 * src_stride);
2033     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2034     HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2035                                filt0, filt1, out0, out1);
2036     HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2037                                filt0, filt1, out2, out3);
2038     SRARI_H4_SH(out0, out1, out2, out3, 6);
2039     SAT_SH4_SH(out0, out1, out2, out3, 7);
2040     out = PCKEV_XORI128_UB(out0, out1);
2041     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2042     dst += (4 * dst_stride);
2043     out = PCKEV_XORI128_UB(out2, out3);
2044     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2045     dst += (4 * dst_stride);
     /* rows 8..15 */
2047     LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
2048     src += (8 * src_stride);
2049     XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2050     HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
2051                                filt0, filt1, out0, out1);
2052     HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
2053                                filt0, filt1, out2, out3);
2054     SRARI_H4_SH(out0, out1, out2, out3, 6);
2055     SAT_SH4_SH(out0, out1, out2, out3, 7);
2056     out = PCKEV_XORI128_UB(out0, out1);
2057     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2058     dst += (4 * dst_stride);
2059     out = PCKEV_XORI128_UB(out2, out3);
2060     ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* Horizontal 4-tap filter, 4-pixel width: dispatch on height to the
 * fixed-height specializations (2/4/8/16 rows). */
2063 static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
2064                                 uint8_t *dst, int32_t dst_stride,
2065                                 const int8_t *filter, int32_t height)
2068         common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2069     } else if (4 == height) {
2070         common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
2071     } else if (8 == height) {
2072         common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
2073     } else if (16 == height) {
2074         common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
/* Horizontal 4-tap filter, 6-pixel width.
 * Uses the 8-wide filter macro (mask table at offset 0) and stores only
 * 6 of the 8 computed pixels per row via ST6x4_UB; two unrolled 4-row
 * passes are visible here. */
2078 static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
2079                                 uint8_t *dst, int32_t dst_stride,
2080                                 const int8_t *filter, int32_t height)
2082     v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2084     v8i16 filt, out0, out1, out2, out3;
2086     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2089     /* rearranging filter */
2090     filt = LD_SH(filter);
2091     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
     /* first 4 rows */
2095     LD_SB4(src, src_stride, src0, src1, src2, src3);
2096     src += (4 * src_stride);
2098     XORI_B4_128_SB(src0, src1, src2, src3);
2099     HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2100                                filt1, out0, out1, out2, out3);
2101     SRARI_H4_SH(out0, out1, out2, out3, 6);
2102     SAT_SH4_SH(out0, out1, out2, out3, 7);
2103     out4 = PCKEV_XORI128_UB(out0, out1);
2104     out5 = PCKEV_XORI128_UB(out2, out3);
2105     ST6x4_UB(out4, out5, dst, dst_stride);
2106     dst += (4 * dst_stride);
     /* next 4 rows */
2108     LD_SB4(src, src_stride, src0, src1, src2, src3);
2109     src += (4 * src_stride);
2111     XORI_B4_128_SB(src0, src1, src2, src3);
2112     HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2113                                filt1, out0, out1, out2, out3);
2114     SRARI_H4_SH(out0, out1, out2, out3, 6);
2115     SAT_SH4_SH(out0, out1, out2, out3, 7);
2116     out4 = PCKEV_XORI128_UB(out0, out1);
2117     out5 = PCKEV_XORI128_UB(out2, out3);
2118     ST6x4_UB(out4, out5, dst, dst_stride);
2119     dst += (4 * dst_stride);
/* Horizontal 4-tap filter, 8-pixel width, heights that are a multiple
 * of 2: loops two rows at a time, explicit shuffle + dot-product +
 * accumulate instead of the 4-row macro. */
2122 static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
2123                                      uint8_t *dst, int32_t dst_stride,
2124                                      const int8_t *filter, int32_t height)
2127     v16i8 src0, src1, filt0, filt1, mask0, mask1;
2129     v8i16 filt, vec0, vec1, vec2, vec3;
2131     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2134     filt = LD_SH(filter);
2135     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2139     for (loop_cnt = (height >> 1); loop_cnt--;) {
2140         LD_SB2(src, src_stride, src0, src1);
2141         src += (2 * src_stride);
2143         XORI_B2_128_SB(src0, src1);
         /* taps 0/1 via dot product, taps 2/3 accumulated on top */
2144         VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
2145         DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
2146         VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
2147         DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
2148         SRARI_H2_SH(vec0, vec1, 6);
2149         SAT_SH2_SH(vec0, vec1, 7);
2150         out = PCKEV_XORI128_UB(vec0, vec1);
2151         ST8x2_UB(out, dst, dst_stride);
2152         dst += (2 * dst_stride);
/* Horizontal 4-tap filter, 8-pixel width, heights that are a multiple
 * of 4: loops four rows at a time using the 8-wide 4-vector macro. */
2156 static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2157                                      uint8_t *dst, int32_t dst_stride,
2158                                      const int8_t *filter, int32_t height)
2161     v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
2163     v8i16 filt, out0, out1, out2, out3;
2165     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2168     /* rearranging filter */
2169     filt = LD_SH(filter);
2170     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2174     for (loop_cnt = (height >> 2); loop_cnt--;) {
2175         LD_SB4(src, src_stride, src0, src1, src2, src3);
2176         src += (4 * src_stride);
2178         XORI_B4_128_SB(src0, src1, src2, src3);
2179         HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
2180                                    filt1, out0, out1, out2, out3);
2181         SRARI_H4_SH(out0, out1, out2, out3, 6);
2182         SAT_SH4_SH(out0, out1, out2, out3, 7);
2183         tmp0 = PCKEV_XORI128_UB(out0, out1);
2184         tmp1 = PCKEV_XORI128_UB(out2, out3);
2185         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2186         dst += (4 * dst_stride);
/* Horizontal 4-tap filter, 8-pixel width: heights 2 and 6 go through
 * the 2-rows-per-iteration kernel, all other heights through the
 * 4-rows-per-iteration kernel. */
2190 static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
2191                                 uint8_t *dst, int32_t dst_stride,
2192                                 const int8_t *filter, int32_t height)
2194     if ((2 == height) || (6 == height)) {
2195         common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
2198         common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
/* Horizontal 4-tap filter, 12-pixel width.
 * Per 4-row iteration: the rightmost 4 columns are filtered first with
 * the cross-vector masks (table offset 32) and stored at dst + 8, then
 * the left 8 columns are filtered and stored at dst.
 * NOTE(review): loop_cnt is fixed at 4 (16 rows total) — height appears
 * assumed to be 16 here; confirm against callers. */
2203 static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
2204                                  uint8_t *dst, int32_t dst_stride,
2205                                  const int8_t *filter, int32_t height)
2208     v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
2209     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
2212     v8i16 filt, out0, out1, out2, out3, out4, out5;
2214     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2215     mask2 = LD_SB(&ff_hevc_mask_arr[32]);
2219     /* rearranging filter */
2220     filt = LD_SH(filter);
2221     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2226     for (loop_cnt = 4; loop_cnt--;) {
2227         LD_SB4(src, src_stride, src0, src1, src2, src3);
2228         src += (4 * src_stride);
2230         XORI_B4_128_SB(src0, src1, src2, src3);
         /* columns 8..11 (4-wide tail), stored at dst + 8 */
2231         VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
2232         DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
2233         VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
2234         DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
2235         SRARI_H2_SH(out0, out1, 6);
2236         SAT_SH2_SH(out0, out1, 7);
2237         tmp0 = PCKEV_XORI128_UB(out0, out1);
2238         ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
         /* columns 0..7 (8-wide body), stored at dst */
2240         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
2241         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
2242         DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
2243                     out2, out3, out4, out5);
2244         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
2245         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
2246         DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
2247                      out2, out3, out4, out5);
2248         SRARI_H4_SH(out2, out3, out4, out5, 6);
2249         SAT_SH4_SH(out2, out3, out4, out5, 7);
2250         tmp0 = PCKEV_XORI128_UB(out2, out3);
2251         tmp1 = PCKEV_XORI128_UB(out4, out5);
2252         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2253         dst += (4 * dst_stride);
/* Horizontal 4-tap filter, 16-pixel width, heights a multiple of 4.
 * Each row is split into two 8-byte halves (even-indexed vectors hold
 * bytes 0..15, odd-indexed hold bytes 8..23) so the 8-wide shuffle
 * masks cover the full 16 output pixels. */
2257 static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
2258                                  uint8_t *dst, int32_t dst_stride,
2259                                  const int8_t *filter, int32_t height)
2262     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2263     v16i8 filt0, filt1, mask0, mask1;
2264     v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2265     v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2268     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2271     /* rearranging filter */
2272     filt = LD_SH(filter);
2273     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2277     for (loop_cnt = (height >> 2); loop_cnt--;) {
         /* left and right halves of four rows */
2278         LD_SB4(src, src_stride, src0, src2, src4, src6);
2279         LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2280         src += (4 * src_stride);
2282         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
         /* rows 0 and 1 */
2284         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2285         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2286         DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2287                     out0, out1, out2, out3);
2288         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2289         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2290         DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2291                      out0, out1, out2, out3);
2292         SRARI_H4_SH(out0, out1, out2, out3, 6);
2293         SAT_SH4_SH(out0, out1, out2, out3, 7);
2294         out = PCKEV_XORI128_UB(out0, out1);
2297         out = PCKEV_XORI128_UB(out2, out3);
         /* rows 2 and 3 */
2301         VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2302         VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2303         DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2304                     out4, out5, out6, out7);
2305         VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2306         VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2307         DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2308                      out4, out5, out6, out7);
2309         SRARI_H4_SH(out4, out5, out6, out7, 6);
2310         SAT_SH4_SH(out4, out5, out6, out7, 7);
2311         out = PCKEV_XORI128_UB(out4, out5);
2314         out = PCKEV_XORI128_UB(out6, out7);
/* Horizontal 4-tap filter, 24-pixel width.
 * Per 4-row iteration: columns 0..15 are produced with cross-vector
 * masks (mask00/mask11 = mask0/mask1 shifted by 8/10 to read the
 * boundary between the two 16-byte source loads) and stored at dst;
 * columns 16..23 are filtered separately and stored at dst1 = dst + 16.
 * NOTE(review): loop_cnt is fixed at 8 (32 rows total) — height appears
 * assumed to be 32 here; confirm against callers. */
2320 static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
2321                                  uint8_t *dst, int32_t dst_stride,
2322                                  const int8_t *filter, int32_t height)
2324     uint8_t *dst1 = dst + 16;
2326     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2327     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
2328     v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
2329     v8i16 filt, out0, out1, out2, out3;
2332     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2335     /* rearranging filter */
2336     filt = LD_SH(filter);
2337     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
     /* mask11 selects across the src0/src1 16-byte boundary */
2341     mask11 = mask0 + 10;
2343     for (loop_cnt = 8; loop_cnt--;) {
2344         LD_SB4(src, src_stride, src0, src2, src4, src6);
2345         LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2346         src += (4 * src_stride);
2348         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
         /* columns 0..15, rows 0 and 1 */
2349         VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
2350         VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
2351         VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
2352         VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
2353         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2354                     out0, out1, out2, out3);
2355         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2356                      out0, out1, out2, out3);
2357         SRARI_H4_SH(out0, out1, out2, out3, 6);
2358         SAT_SH4_SH(out0, out1, out2, out3, 7);
2359         tmp0 = PCKEV_XORI128_UB(out0, out1);
2362         tmp0 = PCKEV_XORI128_UB(out2, out3);
         /* columns 0..15, rows 2 and 3 */
2366         VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
2367         VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
2368         VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
2369         VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
2370         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2371                     out0, out1, out2, out3);
2372         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2373                      out0, out1, out2, out3);
2374         SRARI_H4_SH(out0, out1, out2, out3, 6);
2375         SAT_SH4_SH(out0, out1, out2, out3, 7);
2376         tmp0 = PCKEV_XORI128_UB(out0, out1);
2379         tmp0 = PCKEV_XORI128_UB(out2, out3);
         /* columns 16..23 for all four rows, stored via dst1 */
2384         VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
2385         VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
2386         VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
2387         VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);
2389         DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
2390                     out0, out1, out2, out3);
2391         DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
2392                      out0, out1, out2, out3);
2394         SRARI_H4_SH(out0, out1, out2, out3, 6);
2395         SAT_SH4_SH(out0, out1, out2, out3, 7);
2396         tmp0 = PCKEV_XORI128_UB(out0, out1);
2397         tmp1 = PCKEV_XORI128_UB(out2, out3);
2398         ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
2399         dst1 += (4 * dst_stride);
/* Horizontal 4-tap filter, 32-pixel width, heights a multiple of 2.
 * Each of the two rows per iteration is loaded as four 16-byte vectors
 * at +0/+8/+16/+24 (the overlapping +8 loads give the 8-wide shuffle
 * masks access to the taps crossing each 8-pixel group). */
2403 static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
2404                                  uint8_t *dst, int32_t dst_stride,
2405                                  const int8_t *filter, int32_t height)
2408     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2409     v16i8 filt0, filt1, mask0, mask1;
2411     v16i8 vec0_m, vec1_m, vec2_m, vec3_m;
2412     v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
2414     mask0 = LD_SB(&ff_hevc_mask_arr[0]);
2417     /* rearranging filter */
2418     filt = LD_SH(filter);
2419     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2423     for (loop_cnt = (height >> 1); loop_cnt--;) {
         /* row 0: four overlapping 16-byte loads */
2425         src1 = LD_SB(src + 8);
2426         src2 = LD_SB(src + 16);
2427         src3 = LD_SB(src + 24);
         /* row 1: same layout */
2430         src5 = LD_SB(src + 8);
2431         src6 = LD_SB(src + 16);
2432         src7 = LD_SB(src + 24);
2435         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
         /* row 0, 32 output pixels in out0..out3 */
2437         VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
2438         VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
2439         DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2440                     out0, out1, out2, out3);
2441         VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);
2442         VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);
2443         DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2444                      out0, out1, out2, out3);
         /* row 1, 32 output pixels in out4..out7 */
2446         VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0_m, vec1_m);
2447         VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2_m, vec3_m);
2448         DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
2449                     out4, out5, out6, out7);
2450         VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec0_m, vec1_m);
2451         VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec2_m, vec3_m);
2452         DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1, filt1,
2453                      out4, out5, out6, out7);
2454         SRARI_H4_SH(out0, out1, out2, out3, 6);
2455         SRARI_H4_SH(out4, out5, out6, out7, 6);
2456         SAT_SH4_SH(out0, out1, out2, out3, 7);
2457         SAT_SH4_SH(out4, out5, out6, out7, 7);
2458         out = PCKEV_XORI128_UB(out0, out1);
2460         out = PCKEV_XORI128_UB(out2, out3);
2461         ST_UB(out, dst + 16);
2463         out = PCKEV_XORI128_UB(out4, out5);
2465         out = PCKEV_XORI128_UB(out6, out7);
2466         ST_UB(out, dst + 16);
/* Vertical 4-tap filter, 4-pixel width, exactly 2 rows.
 * Loads 3 context rows + 2 new rows, interleaves adjacent rows (ilvr)
 * so each halfword lane holds a vertical pixel pair, then applies the
 * 4-tap filter, rounds, saturates and stores 4x2. */
2471 static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
2472                                  uint8_t *dst, int32_t dst_stride,
2473                                  const int8_t *filter)
2475     v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
2476     v16i8 src2110, src4332, filt0, filt1;
2482     filt = LD_SH(filter);
2483     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2485     LD_SB3(src, src_stride, src0, src1, src2);
2486     src += (3 * src_stride);
     /* pack row pairs (1,0)|(2,1) into one vector for 4-wide filtering */
2488     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2489     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2490     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2491     LD_SB2(src, src_stride, src3, src4);
2492     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2493     src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2494     src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2495     out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2496     out10 = __msa_srari_h(out10, 6);
2497     out10 = __msa_sat_s_h(out10, 7);
2498     out = PCKEV_XORI128_UB(out10, out10);
2499     ST4x2_UB(out, dst, dst_stride);
/* Vertical 4-tap filter, 4-pixel width, heights a multiple of 4.
 * Keeps the interleaved row-pair vector src2110 live across iterations
 * as the sliding vertical context; each iteration consumes 4 new rows
 * and produces one 4x4 block. */
2502 static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
2503                                          uint8_t *dst, int32_t dst_stride,
2504                                          const int8_t *filter, int32_t height)
2507     v16i8 src0, src1, src2, src3, src4, src5;
2508     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2509     v16i8 src2110, src4332, filt0, filt1;
2510     v8i16 filt, out10, out32;
2515     filt = LD_SH(filter);
2516     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2518     LD_SB3(src, src_stride, src0, src1, src2);
2519     src += (3 * src_stride);
2521     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2523     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2524     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2526     for (loop_cnt = (height >> 2); loop_cnt--;) {
2527         LD_SB3(src, src_stride, src3, src4, src5);
2528         src += (3 * src_stride);
2529         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2530         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
2531         src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
2532         out10 = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
         /* src2 reloaded here becomes row 6 for the next iteration */
2535         src += (src_stride);
2536         ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
2537         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
2538         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2539         out32 = HEVC_FILT_4TAP_SH(src4332, src2110, filt0, filt1);
2540         SRARI_H2_SH(out10, out32, 6);
2541         SAT_SH2_SH(out10, out32, 7);
2542         out = PCKEV_XORI128_UB(out10, out32);
2543         ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
2544         dst += (4 * dst_stride);
/* Vertical 4-tap filter, 4-pixel width: dispatch between the 2-row
 * special case and the multiple-of-4-rows kernel. */
2548 static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
2549                                 uint8_t *dst, int32_t dst_stride,
2550                                 const int8_t *filter, int32_t height)
2553         common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2555         common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
/* Vertical 4-tap filter, 6-pixel width.
 * Two unrolled 4-row passes are visible; each computes four 8-wide
 * filtered rows from interleaved row pairs and stores only 6 pixels
 * per row via ST6x4_UB. The second pass re-seeds the row context from
 * src6 left over by the first pass. */
2560 static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
2561                                 uint8_t *dst, int32_t dst_stride,
2562                                 const int8_t *filter, int32_t height)
2565     v16i8 src0, src1, src2, src3, src4, src5, src6;
2566     v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2567     v8i16 dst0_r, dst1_r, dst2_r, dst3_r, filt0, filt1, filter_vec;
2571     filter_vec = LD_SH(filter);
2572     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2574     LD_SB3(src, src_stride, src0, src1, src2);
2575     src += (3 * src_stride);
2576     XORI_B3_128_SB(src0, src1, src2);
2577     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2579     LD_SB2(src, src_stride, src3, src4);
2580     src += (2 * src_stride);
2581     XORI_B2_128_SB(src3, src4);
2582     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2584     dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2585     dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2587     LD_SB2(src, src_stride, src5, src6);
2588     src += (2 * src_stride);
2589     XORI_B2_128_SB(src5, src6);
2590     ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2592     dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2593     dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2595     SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2596     SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2597     out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2598     out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2599     ST6x4_UB(out0, out1, dst, dst_stride);
2600     dst += (4 * dst_stride);
     /* second 4-row pass; src6 from above is the new top context row */
2602     LD_SB2(src, src_stride, src3, src4);
2603     src += (2 * src_stride);
2604     XORI_B2_128_SB(src3, src4);
2605     ILVR_B2_SB(src3, src6, src4, src3, src32_r, src43_r);
2607     dst0_r = HEVC_FILT_4TAP_SH(src54_r, src32_r, filt0, filt1);
2608     dst1_r = HEVC_FILT_4TAP_SH(src65_r, src43_r, filt0, filt1);
2610     LD_SB2(src, src_stride, src5, src6);
2611     src += (2 * src_stride);
2612     XORI_B2_128_SB(src5, src6);
2613     ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2615     dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2616     dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2618     SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2619     SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2620     out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2621     out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2622     ST6x4_UB(out0, out1, dst, dst_stride);
/* Vertical 4-tap filter, 8-pixel width, exactly 2 rows.
 * Loads 5 rows (3 context + 2 output), interleaves adjacent row pairs
 * and filters twice, then rounds/saturates and stores 8x2. */
2625 static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
2626                                  uint8_t *dst, int32_t dst_stride,
2627                                  const int8_t *filter)
2629     v16i8 src0, src1, src2, src3, src4;
2630     v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
2635     /* rearranging filter_y */
2636     filt = LD_SH(filter);
2637     SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2639     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2640     XORI_B5_128_SB(src0, src1, src2, src3, src4);
2641     ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
2642     tmp0 = HEVC_FILT_4TAP_SH(src01, src23, filt0, filt1);
2643     ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
2644     tmp1 = HEVC_FILT_4TAP_SH(src12, src34, filt0, filt1);
2645     SRARI_H2_SH(tmp0, tmp1, 6);
2646     SAT_SH2_SH(tmp0, tmp1, 7);
2647     out = PCKEV_XORI128_UB(tmp0, tmp1);
2648     ST8x2_UB(out, dst, dst_stride);
/* Vertical 4-tap filter, 8-pixel width, exactly 6 rows.
 * Two loop iterations of 3 rows each; results are packed and the 8-byte
 * row values extracted with copy_u_d for the (store code not visible
 * here) per-row stores. */
2651 static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
2652                                  uint8_t *dst, int32_t dst_stride,
2653                                  const int8_t *filter)
2656     uint64_t out0, out1, out2;
2657     v16i8 src0, src1, src2, src3, src4, src5;
2658     v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
2659     v8i16 filt, filt0, filt1;
2663     /* rearranging filter_y */
2664     filt = LD_SH(filter);
2665     SPLATI_H2_SH(filt, 0, 1, filt0, filt1);
2667     LD_SB3(src, src_stride, src0, src1, src2);
2668     src += (3 * src_stride);
2670     XORI_B3_128_SB(src0, src1, src2);
2671     ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);
2673     for (loop_cnt = 2; loop_cnt--;) {
2674         LD_SB3(src, src_stride, src3, src4, src5);
2675         src += (3 * src_stride);
2677         XORI_B3_128_SB(src3, src4, src5);
2678         ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
2679         tmp0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
2680         tmp1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
2681         tmp2 = HEVC_FILT_4TAP_SH(vec1, vec4, filt0, filt1);
2682         SRARI_H2_SH(tmp0, tmp1, 6);
2683         tmp2 = __msa_srari_h(tmp2, 6);
2684         SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
2685         PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
2686         XORI_B2_128_SH(tmp0, tmp2);
         /* extract three 8-byte rows for scalar stores */
2688         out0 = __msa_copy_u_d((v2i64) tmp0, 0);
2689         out1 = __msa_copy_u_d((v2i64) tmp0, 1);
2690         out2 = __msa_copy_u_d((v2i64) tmp2, 0);
/* Vertical 4-tap filter, 8-pixel width, heights a multiple of 4.
 * Maintains src10_r/src21_r as the sliding vertical context; each
 * iteration loads 4 new rows, filters 4 outputs and stores an 8x4
 * block. */
2704 static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
2705                                      uint8_t *dst, int32_t dst_stride,
2706                                      const int8_t *filter, int32_t height)
2709     v16i8 src0, src1, src2, src7, src8, src9, src10;
2710     v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
2712     v8i16 filt, out0_r, out1_r, out2_r, out3_r;
2716     filt = LD_SH(filter);
2717     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2719     LD_SB3(src, src_stride, src0, src1, src2);
2720     src += (3 * src_stride);
2722     XORI_B3_128_SB(src0, src1, src2);
2723     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2725     for (loop_cnt = (height >> 2); loop_cnt--;) {
2726         LD_SB4(src, src_stride, src7, src8, src9, src10);
2727         src += (4 * src_stride);
2729         XORI_B4_128_SB(src7, src8, src9, src10);
2730         ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
2731                    src72_r, src87_r, src98_r, src109_r);
2732         out0_r = HEVC_FILT_4TAP_SH(src10_r, src72_r, filt0, filt1);
2733         out1_r = HEVC_FILT_4TAP_SH(src21_r, src87_r, filt0, filt1);
2734         out2_r = HEVC_FILT_4TAP_SH(src72_r, src98_r, filt0, filt1);
2735         out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2736         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2737         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2738         tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
2739         tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
2740         ST8x4_UB(tmp0, tmp1, dst, dst_stride);
2741         dst += (4 * dst_stride);
/* Vertical 4-tap filter, 8-pixel width: dispatch on height to the
 * 2-row, 6-row, or multiple-of-4-rows kernels. */
2749 static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
2750                                 uint8_t *dst, int32_t dst_stride,
2751                                 const int8_t *filter, int32_t height)
2754         common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2755     } else if (6 == height) {
2756         common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2758         common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
/* Vertical 4-tap filter, 12-pixel width.
 * The left 8 columns use right-interleaved (ilvr) row pairs; the
 * remaining 4 columns use left-interleaved (ilvl) pairs packed two rows
 * per vector (src2110/src4332/src6554) and are stored at dst + 8.
 * NOTE(review): loop_cnt is fixed at 4 (16 rows total) — height appears
 * assumed to be 16 here; confirm against callers. */
2763 static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
2764                                  uint8_t *dst, int32_t dst_stride,
2765                                  const int8_t *filter, int32_t height)
2768     v16i8 src0, src1, src2, src3, src4, src5, src6;
2770     v16i8 src10_r, src32_r, src21_r, src43_r, src54_r, src65_r;
2771     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2772     v16i8 src2110, src4332, src6554;
2773     v8i16 dst0_r, dst1_r, dst2_r, dst3_r, dst0_l, dst1_l, filt0, filt1;
     /* back up one row so the 4-tap window is centred on the output row */
2776     src -= (1 * src_stride);
2778     filter_vec = LD_SH(filter);
2779     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2781     LD_SB3(src, src_stride, src0, src1, src2);
2782     src += (3 * src_stride);
2784     XORI_B3_128_SB(src0, src1, src2);
2785     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2786     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2787     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2789     for (loop_cnt = 4; loop_cnt--;) {
2790         LD_SB4(src, src_stride, src3, src4, src5, src6);
2791         src += (4 * src_stride);
2793         XORI_B4_128_SB(src3, src4, src5, src6);
2794         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2795         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2796         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2797         ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2798         ILVL_B2_SB(src5, src4, src6, src5, src54_l, src65_l);
2799         src6554 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2801         dst0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2802         dst1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2803         dst0_l = HEVC_FILT_4TAP_SH(src2110, src4332, filt0, filt1);
2804         dst2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2805         dst3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2806         dst1_l = HEVC_FILT_4TAP_SH(src4332, src6554, filt0, filt1);
2808         SRARI_H4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 6);
2809         SRARI_H2_SH(dst0_l, dst1_l, 6);
2810         SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
2811         SAT_SH2_SH(dst0_l, dst1_l, 7);
2812         out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
2813         out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
2814         ST8x4_UB(out0, out1, dst, dst_stride);
2815         out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
2816         ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
2817         dst += (4 * dst_stride);
/* Vertical 4-tap filter, 16-pixel width, heights a multiple of 4.
 * Each 16-wide row is processed as two 8-lane halves: ilvr covers the
 * left 8 columns (_r results) and ilvl the right 8 (_l results); the
 * halves are re-packed per row before the 16-byte stores. */
2826 static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
2827                                  uint8_t *dst, int32_t dst_stride,
2828                                  const int8_t *filter, int32_t height)
2831     v16i8 src0, src1, src2, src3, src4, src5, src6;
2832     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
2833     v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
2834     v16u8 tmp0, tmp1, tmp2, tmp3;
2835     v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
2839     filt = LD_SH(filter);
2840     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
2842     LD_SB3(src, src_stride, src0, src1, src2);
2843     src += (3 * src_stride);
2845     XORI_B3_128_SB(src0, src1, src2);
2846     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2847     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2849     for (loop_cnt = (height >> 2); loop_cnt--;) {
2850         LD_SB4(src, src_stride, src3, src4, src5, src6);
2851         src += (4 * src_stride);
2853         XORI_B4_128_SB(src3, src4, src5, src6);
2854         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2855                    src32_r, src43_r, src54_r, src65_r);
2856         ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2857                    src32_l, src43_l, src54_l, src65_l);
2858         out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2859         out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2860         out2_r = HEVC_FILT_4TAP_SH(src32_r, src54_r, filt0, filt1);
2861         out3_r = HEVC_FILT_4TAP_SH(src43_r, src65_r, filt0, filt1);
2862         out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2863         out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2864         out2_l = HEVC_FILT_4TAP_SH(src32_l, src54_l, filt0, filt1);
2865         out3_l = HEVC_FILT_4TAP_SH(src43_l, src65_l, filt0, filt1);
2866         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2867         SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
2868         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2869         SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
2870         PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
2871                     out3_r, tmp0, tmp1, tmp2, tmp3);
2872         XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
2873         ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
2874         dst += (4 * dst_stride);
/* Vertical 4-tap filter, 24-pixel width.
 * Processes the left 16 columns (ilvr/ilvl pairs of the src0..5 stream)
 * and the right 8 columns (separate src6..11 stream loaded at src + 16)
 * in the same loop, 2 rows per half-iteration; the right 8 pixels go
 * out via 64-bit extracts / ST8x1_UB at dst + 16.
 * NOTE(review): loop_cnt is fixed at 8 (32 rows total) — height appears
 * assumed to be 32 here; confirm against callers. */
2884 static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
2885                                  uint8_t *dst, int32_t dst_stride,
2886                                  const int8_t *filter, int32_t height)
2889     uint64_t out0, out1;
2890     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
2891     v16i8 src11, filt0, filt1;
2892     v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
2893     v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
2895     v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;
2899     filt = LD_SH(filter);
2900     SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
     /* context rows for the left 16 columns */
2903     LD_SB3(src, src_stride, src0, src1, src2);
2904     XORI_B3_128_SB(src0, src1, src2);
2905     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2906     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
     /* context rows for the right 8 columns */
2909     LD_SB3(src + 16, src_stride, src6, src7, src8);
2910     src += (3 * src_stride);
2911     XORI_B3_128_SB(src6, src7, src8);
2912     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2914     for (loop_cnt = 8; loop_cnt--;) {
2916         LD_SB2(src, src_stride, src3, src4);
2917         XORI_B2_128_SB(src3, src4);
2918         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2919         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2922         LD_SB2(src + 16, src_stride, src9, src10);
2923         src += (2 * src_stride);
2924         XORI_B2_128_SB(src9, src10);
2925         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2928         out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
2929         out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
2930         out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
2931         out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
2934         out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
2935         out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
2938         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2939         SRARI_H2_SH(out0_l, out1_l, 6);
2940         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2941         SAT_SH2_SH(out0_l, out1_l, 7);
2942         out = PCKEV_XORI128_UB(out0_r, out0_l);
2944         PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
2945         XORI_B2_128_SH(out2_r, out3_r);
2946         out0 = __msa_copy_u_d((v2i64) out2_r, 0);
2947         out1 = __msa_copy_u_d((v2i64) out3_r, 0);
2950         out = PCKEV_XORI128_UB(out1_r, out1_l);
         /* second pair of rows in this iteration */
2956         LD_SB2(src, src_stride, src5, src2);
2957         XORI_B2_128_SB(src5, src2);
2958         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2959         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2962         LD_SB2(src + 16, src_stride, src11, src8);
2963         src += (2 * src_stride);
2964         XORI_B2_128_SB(src11, src8);
2965         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2968         out0_r = HEVC_FILT_4TAP_SH(src32_r, src10_r, filt0, filt1);
2969         out0_l = HEVC_FILT_4TAP_SH(src32_l, src10_l, filt0, filt1);
2970         out1_r = HEVC_FILT_4TAP_SH(src43_r, src21_r, filt0, filt1);
2971         out1_l = HEVC_FILT_4TAP_SH(src43_l, src21_l, filt0, filt1);
2974         out2_r = HEVC_FILT_4TAP_SH(src98_r, src76_r, filt0, filt1);
2975         out3_r = HEVC_FILT_4TAP_SH(src109_r, src87_r, filt0, filt1);
2978         SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
2979         SRARI_H2_SH(out0_l, out1_l, 6);
2980         SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
2981         SAT_SH2_SH(out0_l, out1_l, 7);
2982         out = PCKEV_XORI128_UB(out0_r, out0_l);
2984         out = PCKEV_XORI128_UB(out2_r, out2_r);
2985         ST8x1_UB(out, dst + 16);
2987         out = PCKEV_XORI128_UB(out1_r, out1_l);
2989         out = PCKEV_XORI128_UB(out3_r, out3_r);
2990         ST8x1_UB(out, dst + 16);
2995 static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
2996 uint8_t *dst, int32_t dst_stride,
2997 const int8_t *filter, int32_t height)
/* Vertical 4-tap filter over a 32-pixel-wide block using MIPS MSA.
 * The 32-wide block is handled as two independent 16-wide halves
 * (src vs. src + 16), two output rows per loop iteration.
 * Results are rounded (>> 6 with rounding), saturated, and stored
 * as unsigned 8-bit pixels. */
3000 v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
3001 v16i8 src10_r, src32_r, src76_r, src98_r;
3002 v16i8 src21_r, src43_r, src87_r, src109_r;
3003 v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
3004 v16i8 src10_l, src32_l, src76_l, src98_l;
3005 v16i8 src21_l, src43_l, src87_l, src109_l;
/* Load the two vertical filter taps and splat each across a vector. */
3012 filt = LD_SH(filter);
3013 SPLATI_H2_SB(filt, 0, 1, filt0, filt1);
/* Prologue for the left 16 columns: 3 rows of vertical context. */
3016 LD_SB3(src, src_stride, src0, src1, src2);
/* XOR with 128 maps unsigned pixels into signed byte range for the
 * signed dot-product based filtering macros. */
3017 XORI_B3_128_SB(src0, src1, src2);
3019 ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3020 ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* Prologue for the right 16 columns (src + 16). */
3023 LD_SB3(src + 16, src_stride, src6, src7, src8);
3024 src += (3 * src_stride);
3026 XORI_B3_128_SB(src6, src7, src8);
3027 ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3028 ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
/* Main loop: two output rows per iteration. */
3030 for (loop_cnt = (height >> 1); loop_cnt--;) {
3032 LD_SB2(src, src_stride, src3, src4);
3033 XORI_B2_128_SB(src3, src4);
3034 ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3035 ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
/* 4-tap vertical filtering of the left 16 columns. */
3038 out0_r = HEVC_FILT_4TAP_SH(src10_r, src32_r, filt0, filt1);
3039 out0_l = HEVC_FILT_4TAP_SH(src10_l, src32_l, filt0, filt1);
3040 out1_r = HEVC_FILT_4TAP_SH(src21_r, src43_r, filt0, filt1);
3041 out1_l = HEVC_FILT_4TAP_SH(src21_l, src43_l, filt0, filt1);
/* Round (shift-right-arithmetic with rounding by 6), saturate,
 * pack back to unsigned bytes and store. */
3044 SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
3045 SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
3046 out = PCKEV_XORI128_UB(out0_r, out0_l);
3048 out = PCKEV_XORI128_UB(out1_r, out1_l);
3049 ST_UB(out, dst + dst_stride);
/* Same two rows for the right 16 columns (dst + 16). */
3058 LD_SB2(src + 16, src_stride, src9, src10);
3059 src += (2 * src_stride);
3060 XORI_B2_128_SB(src9, src10);
3061 ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3062 ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3065 out2_r = HEVC_FILT_4TAP_SH(src76_r, src98_r, filt0, filt1);
3066 out2_l = HEVC_FILT_4TAP_SH(src76_l, src98_l, filt0, filt1);
3067 out3_r = HEVC_FILT_4TAP_SH(src87_r, src109_r, filt0, filt1);
3068 out3_l = HEVC_FILT_4TAP_SH(src87_l, src109_l, filt0, filt1);
3071 SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
3072 SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
3073 out = PCKEV_XORI128_UB(out2_r, out2_l);
3074 ST_UB(out, dst + 16);
3075 out = PCKEV_XORI128_UB(out3_r, out3_l);
3076 ST_UB(out, dst + 16 + dst_stride);
3078 dst += 2 * dst_stride;
/* HEVC uni-prediction, combined horizontal+vertical 4-tap (epel) filter
 * for a 4x2 block (MSA).  filter_x / filter_y hold the 2-pair horizontal
 * and vertical taps.  Horizontal filtering produces 16-bit intermediates,
 * which are then vertically filtered, rounded (>> 6 with rounding),
 * saturated and stored as 8-bit pixels. */
3088 static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
3092 const int8_t *filter_x,
3093 const int8_t *filter_y)
3096 v16i8 src0, src1, src2, src3, src4;
3098 v8i16 filt_h0, filt_h1;
/* ff_hevc_mask_arr + 16: shuffle mask that pairs pixels from two
 * interleaved 4-wide rows for the horizontal dot products. */
3099 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3101 v8i16 filter_vec, tmp;
3102 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3103 v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
/* Step back one row and one column for the 4-tap (-1..+2) footprint. */
3106 src -= (src_stride + 1);
3108 filter_vec = LD_SH(filter_x);
3109 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3111 filter_vec = LD_SH(filter_y);
/* Sign-extend the 8-bit vertical taps to 16-bit before splatting. */
3112 UNPCK_R_SB_SH(filter_vec, filter_vec);
3114 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* 5 input rows are needed to produce 2 output rows with a 4-tap
 * vertical filter. */
3118 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3119 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Horizontal pass: rows are processed in pairs (0/2, 1/3, 2/4). */
3121 VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3122 VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3123 VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3125 dst20 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3126 dst31 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3127 dst42 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
/* Interleave intermediates into row-pair operands for the vertical pass. */
3129 ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3130 ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3132 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3133 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
/* Pack, round by 6 with rounding, saturate, convert back to unsigned. */
3136 tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3137 tmp = __msa_srari_h(tmp, 6);
3138 tmp = __msa_sat_s_h(tmp, 7);
3139 out = PCKEV_XORI128_UB(tmp, tmp);
3140 ST4x2_UB(out, dst, dst_stride);
/* HEVC uni-prediction, combined horizontal+vertical 4-tap filter for a
 * 4x4 block (MSA).  Same scheme as the 4x2 variant: horizontal 4-tap
 * pass to 16-bit intermediates, vertical 4-tap pass, round/saturate,
 * store 8-bit output. */
3143 static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
3147 const int8_t *filter_x,
3148 const int8_t *filter_y)
3151 v16i8 src0, src1, src2, src3, src4, src5, src6;
3153 v8i16 filt_h0, filt_h1;
3154 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3156 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3157 v8i16 filter_vec, tmp0, tmp1;
3158 v8i16 dst30, dst41, dst52, dst63;
3159 v8i16 dst10, dst32, dst54, dst21, dst43, dst65;
3160 v4i32 dst0, dst1, dst2, dst3;
/* 4-tap footprint starts one row/column before the block origin. */
3162 src -= (src_stride + 1);
3164 filter_vec = LD_SH(filter_x);
3165 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3167 filter_vec = LD_SH(filter_y);
3168 UNPCK_R_SB_SH(filter_vec, filter_vec);
3170 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* 7 input rows produce 4 output rows through the 4-tap vertical filter. */
3174 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3175 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* Horizontal pass on row pairs (0/3, 1/4, 2/5, 3/6). */
3177 VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3178 VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3179 VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3180 VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3182 dst30 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3183 dst41 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3184 dst52 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3185 dst63 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Interleave into consecutive-row operands for the vertical pass. */
3187 ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3188 ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3189 ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3190 dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3191 dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3192 dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3193 dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
/* Drop the intermediate precision (>> 6), then round/saturate/pack. */
3194 SRA_4V(dst0, dst1, dst2, dst3, 6);
3195 PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
3196 SRARI_H2_SH(tmp0, tmp1, 6);
3197 SAT_SH2_SH(tmp0, tmp1, 7);
3198 out = PCKEV_XORI128_UB(tmp0, tmp1);
3199 ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
/* HEVC uni-prediction, combined horizontal+vertical 4-tap filter for
 * 4-wide blocks whose height is a multiple of 8 (MSA).  Eight output
 * rows are produced per loop iteration; the last two horizontal
 * intermediates are carried across iterations (dst10_r/dst21_r/dst22). */
3202 static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
3206 const int8_t *filter_x,
3207 const int8_t *filter_y,
3212 v16i8 src0, src1, src2, src3, src4, src5;
3213 v16i8 src6, src7, src8, src9, src10;
3215 v8i16 filt_h0, filt_h1;
3216 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3218 v8i16 filter_vec, tmp0, tmp1, tmp2, tmp3;
3219 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3220 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3221 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3222 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3223 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3224 v8i16 dst98_r, dst109_r;
/* 4-tap footprint starts one row/column before the block origin. */
3226 src -= (src_stride + 1);
3228 filter_vec = LD_SH(filter_x);
3229 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3231 filter_vec = LD_SH(filter_y);
3232 UNPCK_R_SB_SH(filter_vec, filter_vec);
3234 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* Prologue: horizontally filter the first 3 context rows. */
3238 LD_SB3(src, src_stride, src0, src1, src2);
3239 src += (3 * src_stride);
3241 XORI_B3_128_SB(src0, src1, src2);
3243 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3244 VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3245 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3246 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3247 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
/* dst22 keeps row-2's intermediate (upper half of dst21) for reuse. */
3248 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3250 for (loop_cnt = height >> 3; loop_cnt--;) {
3251 LD_SB8(src, src_stride,
3252 src3, src4, src5, src6, src7, src8, src9, src10);
3253 src += (8 * src_stride);
3255 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
/* Horizontal pass on row pairs (3/7, 4/8, 5/9, 6/10). */
3257 VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3258 VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3259 VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3260 VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3262 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3263 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3264 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3265 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Build consecutive-row operand pairs for the vertical pass. */
3267 dst32_r = __msa_ilvr_h(dst73, dst22);
3268 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3269 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3270 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3271 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3272 dst76_r = __msa_ilvr_h(dst22, dst106);
/* Vertical 4-tap pass producing 8 output rows. */
3274 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3275 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3276 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3277 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3278 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3279 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3280 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3281 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
/* Scale down intermediates, then round/saturate/pack and store. */
3282 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3283 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3284 PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
3285 dst5_r, dst4_r, dst7_r, dst6_r,
3286 tmp0, tmp1, tmp2, tmp3);
3287 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3288 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3289 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3290 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3291 ST4x8_UB(out0, out1, dst, dst_stride);
3292 dst += (8 * dst_stride);
/* Carry the last intermediate row into the next iteration. */
3296 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Dispatcher for 4-wide HV uni-prediction 4-tap filtering: selects the
 * height-specialized implementation (2, 4, or any multiple of 8). */
3300 static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
3304 const int8_t *filter_x,
3305 const int8_t *filter_y,
/* height == 2 */
3309 hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
3310 filter_x, filter_y);
3311 } else if (4 == height) {
3312 hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
3313 filter_x, filter_y);
3314 } else if (0 == (height % 8)) {
3315 hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3316 filter_x, filter_y, height);
/* HEVC uni-prediction, combined horizontal+vertical 4-tap filter for a
 * 6-wide block (MSA).  The left 4 columns are handled through the _r
 * (right/low) interleaves, the remaining 2 columns through the packed
 * _l (left/high) halves; output is stored as 4-wide plus 2-wide stores.
 * Eight output rows are produced (ST4x8 + two ST2x4). */
3320 static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
3324 const int8_t *filter_x,
3325 const int8_t *filter_y,
3328 v16u8 out0, out1, out2;
3329 v16i8 src0, src1, src2, src3, src4, src5, src6;
3330 v16i8 src7, src8, src9, src10;
3332 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3333 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3335 v8i16 filt_h0, filt_h1, filter_vec;
3336 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3337 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3338 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3339 v4i32 dst4_r, dst5_r, dst6_r, dst7_r;
3340 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3341 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3342 v8i16 dst54_r, dst76_r, dst98_r, dst65_r, dst87_r, dst109_r;
3343 v8i16 dst98_l, dst65_l, dst54_l, dst76_l, dst87_l, dst109_l;
3344 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
/* 4-tap footprint starts one row/column before the block origin. */
3346 src -= (src_stride + 1);
3348 filter_vec = LD_SH(filter_x);
3349 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3351 filter_vec = LD_SH(filter_y);
3352 UNPCK_R_SB_SH(filter_vec, filter_vec);
3354 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* Prologue: 3 context rows through the horizontal pass. */
3358 LD_SB3(src, src_stride, src0, src1, src2);
3359 src += (3 * src_stride);
3361 XORI_B3_128_SB(src0, src1, src2);
3363 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3364 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3365 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3367 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3368 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3369 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3371 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3372 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
/* 8 more rows: horizontal pass for all of them up front. */
3374 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3375 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3377 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3378 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3379 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3380 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3382 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3383 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3384 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3385 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3387 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3388 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3389 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3390 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3392 dsth7 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3393 dsth8 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3394 dsth9 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3395 dsth10 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
/* Build consecutive-row operand pairs for the vertical pass. */
3397 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3398 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3399 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3400 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3401 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3402 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3403 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3404 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
/* Pack the high (columns 4..5) halves pairwise so the vertical pass can
 * process two output rows per vector. */
3406 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3407 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3408 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
/* Vertical 4-tap pass: 8 rows of the left 4 columns ... */
3410 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3411 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3412 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3413 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3414 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3415 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3416 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3417 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
/* ... and the remaining 2 columns, two rows per vector. */
3418 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3419 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3420 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3421 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
/* Scale down, round, saturate, pack to unsigned bytes. */
3422 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3423 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3424 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3425 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3426 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3427 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3428 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3429 SRARI_H2_SH(tmp4, tmp5, 6);
3430 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3,7);
3431 SAT_SH2_SH(tmp4, tmp5,7);
3432 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3433 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3434 out2 = PCKEV_XORI128_UB(tmp4, tmp5);
/* Store: 4-wide columns for 8 rows, then the 2 rightmost columns. */
3435 ST4x8_UB(out0, out1, dst, dst_stride);
3436 ST2x4_UB(out2, 0, dst + 4, dst_stride);
3437 dst += 4 * dst_stride;
3438 ST2x4_UB(out2, 4, dst + 4, dst_stride);
/* HEVC uni-prediction, combined horizontal+vertical 4-tap filter for an
 * 8x2 block (MSA).  Horizontal pass per row, vertical pass over the
 * interleaved intermediates, then round/saturate/store. */
3441 static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
3445 const int8_t *filter_x,
3446 const int8_t *filter_y)
3449 v16i8 src0, src1, src2, src3, src4;
3451 v8i16 filt_h0, filt_h1, filter_vec;
/* ff_hevc_mask_arr (offset 0): 8-wide horizontal shuffle mask. */
3452 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3454 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3455 v8i16 dst0, dst1, dst2, dst3, dst4;
3456 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3457 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3458 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3459 v8i16 out0_r, out1_r;
/* 4-tap footprint starts one row/column before the block origin. */
3461 src -= (src_stride + 1);
3463 filter_vec = LD_SH(filter_x);
3464 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3466 filter_vec = LD_SH(filter_y);
3467 UNPCK_R_SB_SH(filter_vec, filter_vec);
3469 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* 5 input rows yield 2 output rows through the 4-tap vertical filter. */
3473 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3474 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Horizontal pass, one row per shuffle pair. */
3476 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3477 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3478 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3479 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3480 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3482 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3483 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3484 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3485 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3486 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
/* Interleave consecutive rows (low and high halves) for vertical pass. */
3487 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3488 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3489 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3490 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3491 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3492 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3493 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3494 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* Scale down, round, saturate, pack and store both rows. */
3495 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3496 PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
3497 SRARI_H2_SH(out0_r, out1_r, 6);
3498 SAT_SH2_SH(out0_r, out1_r, 7);
3499 out = PCKEV_XORI128_UB(out0_r, out1_r);
3500 ST8x2_UB(out, dst, dst_stride);
/* HEVC uni-prediction, combined horizontal+vertical 4-tap filter for
 * blocks of width 8*width8mult and height 4 (MSA).  Each iteration of
 * the outer loop handles one 8-wide column strip. */
3503 static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
3507 const int8_t *filter_x,
3508 const int8_t *filter_y,
3513 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3514 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3515 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec;
3516 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, tmp0, tmp1, tmp2, tmp3;
3517 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3518 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3519 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
/* 4-tap footprint starts one row/column before the block origin. */
3521 src -= (src_stride + 1);
3523 filter_vec = LD_SH(filter_x);
3524 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3526 filter_vec = LD_SH(filter_y);
3527 UNPCK_R_SB_SH(filter_vec, filter_vec);
3529 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3531 mask0 = LD_SB(ff_hevc_mask_arr);
/* One iteration per 8-wide strip. */
3534 for (cnt = width8mult; cnt--;) {
/* 7 input rows yield 4 output rows through the 4-tap vertical filter. */
3535 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3537 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* Horizontal pass, context rows 0..2 first. */
3539 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3540 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3541 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3543 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3544 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3545 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3547 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3548 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* Horizontal pass, remaining rows 3..6. */
3550 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3551 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3552 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3553 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3555 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3556 dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3557 dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3558 dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3560 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3561 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3562 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3563 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
/* Vertical 4-tap pass, low and high halves of each output row. */
3565 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3566 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3567 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3568 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3569 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3570 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3571 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3572 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* Scale down, round, saturate, pack, store 4 rows of this strip. */
3574 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3575 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3577 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3578 dst3_r, tmp0, tmp1, tmp2, tmp3);
3579 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3580 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3581 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3582 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3583 ST8x4_UB(out0, out1, dst, dst_stride);
/* HEVC uni-prediction, combined horizontal+vertical 4-tap filter for an
 * 8x6 block (MSA).  All 9 needed input rows are horizontally filtered
 * up front; the vertical pass then produces 6 output rows, which are
 * rounded, saturated, and stored (4 rows + 2 rows). */
3588 static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
3592 const int8_t *filter_x,
3593 const int8_t *filter_y)
3595 v16u8 out0, out1, out2;
3596 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3598 v8i16 filt_h0, filt_h1, filter_vec;
3599 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3601 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3602 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3603 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3604 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3605 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3606 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3607 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3608 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3609 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3610 v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;
/* 4-tap footprint starts one row/column before the block origin. */
3612 src -= (src_stride + 1);
3614 filter_vec = LD_SH(filter_x);
3615 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3617 filter_vec = LD_SH(filter_y);
3618 UNPCK_R_SB_SH(filter_vec, filter_vec);
3620 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* 9 input rows (5 + 4) yield 6 output rows with a 4-tap vertical filter. */
3624 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3625 src += (5 * src_stride);
3626 LD_SB4(src, src_stride, src5, src6, src7, src8);
3628 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3629 XORI_B4_128_SB(src5, src6, src7, src8);
/* Horizontal pass for every input row. */
3631 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3632 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3633 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3634 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3635 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3636 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3637 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3638 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3639 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3641 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3642 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3643 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3644 dst3 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3645 dst4 = HEVC_FILT_4TAP_SH(vec8, vec9, filt0, filt1);
3646 dst5 = HEVC_FILT_4TAP_SH(vec10, vec11, filt0, filt1);
3647 dst6 = HEVC_FILT_4TAP_SH(vec12, vec13, filt0, filt1);
3648 dst7 = HEVC_FILT_4TAP_SH(vec14, vec15, filt0, filt1);
3649 dst8 = HEVC_FILT_4TAP_SH(vec16, vec17, filt0, filt1);
/* Interleave consecutive rows for the vertical pass. */
3651 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3652 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3653 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3654 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3655 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3656 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3657 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3658 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
/* Vertical 4-tap pass, 6 output rows (low and high halves each). */
3660 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3661 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3662 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3663 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3664 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3665 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3666 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3667 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3668 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3669 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3670 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3671 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
/* Scale down, round, saturate, pack to unsigned bytes. */
3673 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3674 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3675 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3676 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3677 dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
3678 PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
3679 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3680 SRARI_H2_SH(out4_r, out5_r, 6);
3681 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3682 SAT_SH2_SH(out4_r, out5_r, 7);
3683 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3684 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3685 out2 = PCKEV_XORI128_UB(out4_r, out5_r);
/* Store 4 rows, then the remaining 2. */
3687 ST8x4_UB(out0, out1, dst, dst_stride);
3688 dst += (4 * dst_stride);
3689 ST8x2_UB(out2, dst, dst_stride);
/* HEVC uni-prediction, combined horizontal+vertical 4-tap filter for
 * blocks of width 8*width8mult and height a multiple of 4 (MSA).
 * Outer loop walks 8-wide strips via src_tmp/dst_tmp; inner loop emits
 * 4 output rows per iteration, carrying the last two horizontal
 * intermediates (dst10/dst21 pairs and dst2) across iterations. */
3692 static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
3696 const int8_t *filter_x,
3697 const int8_t *filter_y,
3701 uint32_t loop_cnt, cnt;
3705 v16i8 src0, src1, src2, src3, src4, src5, src6;
3707 v8i16 filt_h0, filt_h1, filter_vec;
3708 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3710 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3711 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3712 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3713 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3714 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3715 v8i16 dst54_r, dst54_l, dst65_r, dst65_l, dst6;
3716 v8i16 out0_r, out1_r, out2_r, out3_r;
/* 4-tap footprint starts one row/column before the block origin. */
3718 src -= (src_stride + 1);
3720 filter_vec = LD_SH(filter_x);
3721 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3723 filter_vec = LD_SH(filter_y);
3724 UNPCK_R_SB_SH(filter_vec, filter_vec);
3726 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
/* One outer iteration per 8-wide strip. */
3730 for (cnt = width8mult; cnt--;) {
/* Prologue per strip: 3 context rows through the horizontal pass. */
3734 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3735 src_tmp += (3 * src_stride);
3737 XORI_B3_128_SB(src0, src1, src2);
3739 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3740 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3741 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3743 dst0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3744 dst1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3745 dst2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3747 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3748 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* Inner loop: 4 output rows per iteration. */
3750 for (loop_cnt = (height >> 2); loop_cnt--;) {
3751 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3752 src_tmp += (4 * src_stride);
3754 XORI_B4_128_SB(src3, src4, src5, src6);
/* Horizontal pass for the 4 new rows. */
3756 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3757 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3758 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3759 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3761 dst3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3762 dst4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3763 dst5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3764 dst6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3766 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3767 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3768 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3769 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
/* Vertical 4-tap pass, low and high halves of each output row. */
3771 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3772 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3773 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3774 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3775 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3776 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3777 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3778 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* Scale down, round, saturate, pack, store 4 rows of this strip. */
3780 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3781 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3783 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
3784 dst2_l, dst2_r, dst3_l, dst3_r,
3785 out0_r, out1_r, out2_r, out3_r);
3787 SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
3788 SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
3789 out0 = PCKEV_XORI128_UB(out0_r, out1_r);
3790 out1 = PCKEV_XORI128_UB(out2_r, out3_r);
3791 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
3792 dst_tmp += (4 * dst_stride);
/* Dispatcher for 8-wide HV uni-prediction 4-tap filtering: selects the
 * height-specialized implementation (2, 4, 6, or any multiple of 4). */
3806 static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
3810 const int8_t *filter_x,
3811 const int8_t *filter_y,
/* height == 2 */
3815 hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
3816 filter_x, filter_y);
3817 } else if (4 == height) {
/* width8mult = 1: a single 8-wide strip. */
3818 hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3819 filter_x, filter_y, 1);
3820 } else if (6 == height) {
3821 hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
3822 filter_x, filter_y);
3823 } else if (0 == (height % 4)) {
3824 hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3825 filter_x, filter_y, height, 1);
3829 static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
3833 const int8_t *filter_x,
3834 const int8_t *filter_y,
3838 uint8_t *src_tmp, *dst_tmp;
3840 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3841 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3842 v16i8 mask0, mask1, mask2, mask3;
3843 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, tmp0, tmp1, tmp2, tmp3;
3844 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6;
3845 v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3846 v8i16 dst76_r, dst98_r, dst87_r, dst109_r;
3847 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3848 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3849 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3850 v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3852 src -= (src_stride + 1);
3854 filter_vec = LD_SH(filter_x);
3855 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3857 filter_vec = LD_SH(filter_y);
3858 UNPCK_R_SB_SH(filter_vec, filter_vec);
3860 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3862 mask0 = LD_SB(ff_hevc_mask_arr);
3868 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3869 src_tmp += (3 * src_stride);
3871 XORI_B3_128_SB(src0, src1, src2);
3873 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3874 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3875 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3877 dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3878 dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3879 dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3881 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3882 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
3884 for (loop_cnt = 4; loop_cnt--;) {
3885 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3886 src_tmp += (4 * src_stride);
3887 XORI_B4_128_SB(src3, src4, src5, src6);
3889 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3890 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3891 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3892 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3894 dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3895 dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3896 dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3897 dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3899 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3900 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3901 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3902 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3904 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3905 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3906 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3907 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3908 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3909 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3910 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3911 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3913 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3914 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3916 PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
3917 dst3_r, tmp0, tmp1, tmp2, tmp3);
3918 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3919 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3920 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3921 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3922 ST8x4_UB(out0, out1, dst_tmp, dst_stride);
3923 dst_tmp += (4 * dst_stride);
3935 mask2 = LD_SB(ff_hevc_mask_arr + 16);
3938 LD_SB3(src, src_stride, src0, src1, src2);
3939 src += (3 * src_stride);
3940 XORI_B3_128_SB(src0, src1, src2);
3941 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
3942 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
3944 dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3945 dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3947 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
3948 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3950 for (loop_cnt = 2; loop_cnt--;) {
3951 LD_SB8(src, src_stride,
3952 src3, src4, src5, src6, src7, src8, src9, src10);
3953 src += (8 * src_stride);
3954 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3955 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
3956 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
3957 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
3958 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
3960 dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
3961 dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
3962 dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
3963 dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
3965 dst32_r = __msa_ilvr_h(dst73, dst22);
3966 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3967 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3968 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3969 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3970 dst76_r = __msa_ilvr_h(dst22, dst106);
3972 dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3973 dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3974 dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3975 dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3976 dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3977 dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3978 dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3979 dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3980 SRA_4V(dst0, dst1, dst2, dst3, 6);
3981 SRA_4V(dst4, dst5, dst6, dst7, 6);
3982 PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3983 tmp0, tmp1, tmp2, tmp3);
3984 SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
3985 SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
3986 out0 = PCKEV_XORI128_UB(tmp0, tmp1);
3987 out1 = PCKEV_XORI128_UB(tmp2, tmp3);
3988 ST4x8_UB(out0, out1, dst, dst_stride);
3989 dst += (8 * dst_stride);
3993 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* HEVC uni-prediction, 4-tap (epel) horizontal + vertical interpolation,
 * 16 columns wide.
 *
 * Dispatches to the generic 8-column-multiple HV helpers with a width
 * multiplier of 2 (2 * 8 = 16 columns).  A height of exactly 4 rows takes
 * the dedicated 4-row helper; all other heights use the row-looping
 * variant.
 *
 * src        - source pixels (8-bit)
 * src_stride - source line stride in bytes
 * dst        - destination pixels (8-bit)
 * dst_stride - destination line stride in bytes
 * filter_x   - 4-tap horizontal filter coefficients
 * filter_y   - 4-tap vertical filter coefficients
 * height     - number of output rows
 */
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    if (4 == height) {
        hevc_hv_uni_4t_8multx4_msa(src, src_stride, dst, dst_stride, filter_x,
                                   filter_y, 2);
    } else {
        hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height, 2);
    }
}
/* HEVC uni-prediction, 4-tap (epel) horizontal + vertical interpolation,
 * 24 columns wide: the generic 8-column-multiple helper with a width
 * multiplier of 3 (3 * 8 = 24 columns).  Parameters as in
 * hevc_hv_uni_4t_16w_msa. */
static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 3);
}
/* HEVC uni-prediction, 4-tap (epel) horizontal + vertical interpolation,
 * 32 columns wide: the generic 8-column-multiple helper with a width
 * multiplier of 4 (4 * 8 = 32 columns).  Parameters as in
 * hevc_hv_uni_4t_16w_msa. */
static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 4);
}
/* Emits ff_hevc_put_hevc_uni_pel_pixels<WIDTH>_8_msa(): uni-prediction
 * integer-pel "copy" MC (no filtering) that forwards to the MSA
 * copy_width<WIDTH>_msa() kernel.  mx, my and width are part of the common
 * ff_hevc_put_hevc_* prototype and are intentionally unused here. */
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}
/* Emits ff_hevc_put_hevc_uni_<PEL>_<DIR><WIDTH>_8_msa(): uni-prediction
 * one-dimensional MC.  PEL selects the filter table (qpel/epel), DIR is the
 * public name suffix (h/v), TAP the tap count (8/4), DIR1 the internal
 * kernel name part (hz/vt), and FILT_DIR picks which fractional offset
 * (mx or my) indexes the filter table.  Filter index is FILT_DIR - 1
 * because fractional position 0 needs no filtering and has no table row. */
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride, \
                                                       uint8_t *src,         \
                                                       ptrdiff_t src_stride, \
                                                       int height,           \
                                                       intptr_t mx,          \
                                                       intptr_t my,          \
                                                       int width)            \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
                                                                             \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,\
                                            filter, height);                 \
}
/* 8-tap qpel (luma), horizontal-only uni-prediction: one public entry
 * point per supported block width. */
UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

/* 8-tap qpel (luma), vertical-only uni-prediction. */
UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

/* 4-tap epel (chroma), horizontal-only uni-prediction. */
UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

/* 4-tap epel (chroma), vertical-only uni-prediction. */
UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);
/* Emits ff_hevc_put_hevc_uni_<PEL>_hv<WIDTH>_8_msa(): uni-prediction
 * two-dimensional (horizontal + vertical) MC.  Both filters come from the
 * same PEL table, indexed by the fractional offsets mx and my; index is
 * (offset - 1) because fractional position 0 has no table row. */
#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
                                                                           \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                        filter_x, filter_y, height);       \
}
/* 8-tap qpel (luma), combined horizontal + vertical uni-prediction: one
 * public entry point per supported block width. */
UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

/* 4-tap epel (chroma), combined horizontal + vertical uni-prediction. */
UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);