/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
#include "libavcodec/mips/hevc_macros_msa.h"
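/* VSHF.B control vectors: the first row gathers the eight overlapping byte
 * pairs used by the 8-wide and wider horizontal filters; the second and
 * third rows serve the 4-wide cases, where two 4-pixel rows share one
 * vector and indices 16..31 select bytes from the shuffle's second source
 * operand. */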
static const uint8_t ff_hevc_mask_arr[16 * 3] __attribute__((aligned(0x40))) = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
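/* Horizontal filter cores. Each VSHF_B2_SB gathers one tap pair per output
 * pixel, DOTP_SB2_SH/DOTP_SB4_SH start the signed-byte dot products and the
 * DPADD_* steps accumulate the remaining tap pairs into 16-bit sums. The
 * *_4WID_* variants filter two packed 4-pixel rows per vector, the *_8WID_*
 * variants one 8-pixel row per vector. */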
#define HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3,                 \
                                   mask0, mask1, mask2, mask3,             \
                                   filt0, filt1, filt2, filt3,             \
                                   out0, out1)                             \
{                                                                          \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;  \
                                                                           \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);      \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);                 \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);      \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec4_m, vec5_m);      \
    DPADD_SB2_SH(vec4_m, vec5_m, filt2, filt2, out0, out1);                \
    VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec6_m, vec7_m);      \
    DPADD_SB2_SH(vec6_m, vec7_m, filt3, filt3, out0, out1);                \
}
#define HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, mask2, mask3,              \
                                   filt0, filt1, filt2, filt3,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);       \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2,       \
                 filt2, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1,       \
                 filt1, out0, out1, out2, out3);                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);       \
    DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3,       \
                 filt3, out0, out1, out2, out3);                            \
}
#define HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3,             \
                                   mask0, mask1, filt0, filt1,         \
                                   out0, out1)                         \
{                                                                      \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                              \
                                                                       \
    VSHF_B2_SB(src0, src1, src2, src3, mask0, mask0, vec0_m, vec1_m);  \
    DOTP_SB2_SH(vec0_m, vec1_m, filt0, filt0, out0, out1);             \
    VSHF_B2_SB(src0, src1, src2, src3, mask1, mask1, vec2_m, vec3_m);  \
    DPADD_SB2_SH(vec2_m, vec3_m, filt1, filt1, out0, out1);            \
}
#define HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3,                  \
                                   mask0, mask1, filt0, filt1,              \
                                   out0, out1, out2, out3)                  \
{                                                                           \
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m;                                   \
                                                                            \
    VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);       \
    DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0, \
                out0, out1, out2, out3);                                    \
    VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec0_m, vec1_m);       \
    VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec2_m, vec3_m);       \
    DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt1, filt1, filt1,       \
                 filt1, out0, out1, out2, out3);                            \
}
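/* Plain block copies for the unfiltered (integer-pel) case. Heights of 2
 * and 6 and multiples of 4 and 8 are unrolled separately so that every
 * path issues full-width 64-bit or 128-bit loads and stores. */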
static void copy_width8_msa(uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
                            int32_t height)
{
    int32_t cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    if (2 == height) {
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (6 == height) {
        LD4(src, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
        LD2(src, src_stride, out0, out1);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            LD4(src, src_stride, out4, out5, out6, out7);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD4(src, src_stride, out0, out1, out2, out3);
            src += (4 * src_stride);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
static void copy_width12_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
    dst += (8 * dst_stride);
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    ST12x8_UB(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
}
static void copy_width16_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (12 == height) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        dst += (8 * dst_stride);
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        dst += (4 * dst_stride);
    } else if (0 == (height % 8)) {
        for (cnt = (height >> 3); cnt--;) {
            LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
                   src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst,
                   dst_stride);
            dst += (8 * dst_stride);
        }
    } else if (0 == (height % 4)) {
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);

            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
        }
    }
}
static void copy_width24_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;

    for (cnt = 4; cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD4(src + 16, src_stride, out0, out1, out2, out3);
        src += (4 * src_stride);
        LD4(src + 16, src_stride, out4, out5, out6, out7);
        src += (4 * src_stride);

        ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7, dst, dst_stride);
        SD4(out0, out1, out2, out3, dst + 16, dst_stride);
        dst += (4 * dst_stride);
        SD4(out4, out5, out6, out7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void copy_width32_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        src += (4 * src_stride);
        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void copy_width48_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        LD_UB4(src + 16, src_stride, src4, src5, src6, src7);
        LD_UB4(src + 32, src_stride, src8, src9, src10, src11);
        src += (4 * src_stride);

        ST_UB4(src0, src1, src2, src3, dst, dst_stride);
        ST_UB4(src4, src5, src6, src7, dst + 16, dst_stride);
        ST_UB4(src8, src9, src10, src11, dst + 32, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void copy_width64_msa(uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
                             int32_t height)
{
    int32_t cnt;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (cnt = (height >> 2); cnt--;) {
        LD_UB4(src, 16, src0, src1, src2, src3);
        src += src_stride;
        LD_UB4(src, 16, src4, src5, src6, src7);
        src += src_stride;
        LD_UB4(src, 16, src8, src9, src10, src11);
        src += src_stride;
        LD_UB4(src, 16, src12, src13, src14, src15);
        src += src_stride;

        ST_UB4(src0, src1, src2, src3, dst, 16);
        dst += dst_stride;
        ST_UB4(src4, src5, src6, src7, dst, 16);
        dst += dst_stride;
        ST_UB4(src8, src9, src10, src11, dst, 16);
        dst += dst_stride;
        ST_UB4(src12, src13, src14, src15, dst, 16);
        dst += dst_stride;
    }
}
static const uint8_t mc_filt_mask_arr[16 * 3] = {
    /* 8 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
    /* 4 width cases */
    0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20,
    /* 4 width cases */
    8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
};
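/* Statement-expression helpers returning one v8i16 of filtered samples:
 * four (8-tap) or two (4-tap) signed-byte dot products, accumulated with a
 * saturating add in the 8-tap case. */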
#define FILT_8TAP_DPADD_S_H(vec0, vec1, vec2, vec3,             \
                            filt0, filt1, filt2, filt3)         \
( {                                                             \
    v8i16 tmp0, tmp1;                                           \
                                                                \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
    tmp1 = __msa_dotp_s_h((v16i8) vec2, (v16i8) filt2);         \
    tmp1 = __msa_dpadd_s_h(tmp1, (v16i8) vec3, (v16i8) filt3);  \
    tmp0 = __msa_adds_s_h(tmp0, tmp1);                          \
                                                                \
    tmp0;                                                       \
} )
#define FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1)           \
( {                                                             \
    v8i16 tmp0;                                                 \
                                                                \
    tmp0 = __msa_dotp_s_h((v16i8) vec0, (v16i8) filt0);         \
    tmp0 = __msa_dpadd_s_h(tmp0, (v16i8) vec1, (v16i8) filt1);  \
                                                                \
    tmp0;                                                       \
} )
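/* Horizontal 8-tap filtering. Source bytes are XORed with 128 so unsigned
 * pixels can go through the signed DOTP/DPADD path; PCKEV_XORI128_UB flips
 * them back after rounding (SRARI by 6) and saturation to 8 bits. */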
static void common_hz_8t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src0, src1, src2, src3;
    v16u8 mask0, mask1, mask2, mask3, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    src += (4 * src_stride);
    HORIZ_8TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                               mask3, filt0, filt1, filt2, filt3, out2, out3);

    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (4 == height) {
        common_hz_8t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_8t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_8t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}
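/* Widths of 8 and above filter four rows per iteration; the 8-wide loop
 * keeps the HORIZ_8TAP_8WID_4VECS_FILT pattern expanded inline. */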
static void common_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, tmp0, tmp1;
    v16i8 vec0_m, vec1_m, vec2_m, vec3_m, vec4_m, vec5_m, vec6_m, vec7_m;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2_m, vec3_m);
        DOTP_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0_m, vec1_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2_m, vec3_m);
        DPADD_SB4_SH(vec0_m, vec1_m, vec2_m, vec3_m, filt2, filt2, filt2, filt2,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4_m, vec5_m);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6_m, vec7_m);
        DPADD_SB4_SH(vec4_m, vec5_m, vec6_m, vec7_m, filt3, filt3, filt3, filt3,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask00;
    v16u8 tmp0, tmp1, tmp2;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask00 = LD_UB(&ff_hevc_mask_arr[0]);
    mask0 = LD_UB(&ff_hevc_mask_arr[16]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask00 + 2;
    mask2 = mask00 + 4;
    mask3 = mask00 + 6;
    mask4 = mask0 + 2;
    mask5 = mask0 + 4;
    mask6 = mask0 + 6;

    for (loop_cnt = 4; loop_cnt--;) {
        /* 8 width */
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        /* 4 width */
        LD_SB4(src + 8, src_stride, src4, src5, src6, src7);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        src += (4 * src_stride);

        VSHF_B2_SB(src0, src0, src1, src1, mask00, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask00, mask00, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, out0,
                    out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, out0,
                     out1, out2, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, out0,
                     out1, out2, out3);

        VSHF_B2_SB(src4, src5, src6, src7, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask4, mask4, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask5, mask5, vec4, vec5);
        DPADD_SB2_SH(vec4, vec5, filt2, filt2, out4, out5);
        VSHF_B2_SB(src4, src5, src6, src7, mask6, mask6, vec6, vec7);
        DPADD_SB2_SH(vec6, vec7, filt3, filt3, out4, out5);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        tmp2 = PCKEV_XORI128_UB(out4, out5);

        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 8, src_stride, src1, src3);
        src += (2 * src_stride);

        LD_SB2(src, src_stride, src4, src6);
        LD_SB2(src + 8, src_stride, src5, src7);
        src += (2 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        XORI_B4_128_SB(src4, src5, src6, src7);
        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
static void common_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9, vec10;
    v16i8 vec11;
    v8i16 out0, out1, out2, out3, out8, out9, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 16; loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src2);
        LD_SB2(src + 16, src_stride, src1, src3);
        XORI_B4_128_SB(src0, src1, src2, src3);
        src += (2 * src_stride);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask4, mask4, vec1, vec3);
        DOTP_SB4_SH(vec0, vec8, vec2, vec9, filt0, filt0, filt0, filt0, out0,
                    out8, out2, out9);
        DOTP_SB2_SH(vec1, vec3, filt0, filt0, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec8);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec9);
        VSHF_B2_SB(src0, src1, src2, src3, mask6, mask6, vec1, vec3);
        DPADD_SB4_SH(vec0, vec8, vec2, vec9, filt2, filt2, filt2, filt2,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec1, vec3, filt2, filt2, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask5, mask5, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt1, filt1, filt1, filt1,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt1, filt1, out1, out3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec10);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask7, mask7, vec5, vec7);
        DPADD_SB4_SH(vec4, vec10, vec6, vec11, filt3, filt3, filt3, filt3,
                     out0, out8, out2, out9);
        DPADD_SB2_SH(vec5, vec7, filt3, filt3, out1, out3);
        SRARI_H4_SH(out0, out8, out2, out9, 6);
        SRARI_H2_SH(out1, out3, 6);
        SAT_SH4_SH(out0, out8, out2, out9, 7);
        SAT_SH2_SH(out1, out3, 7);
        out = PCKEV_XORI128_UB(out8, out9);
        ST8x2_UB(out, dst + 16, dst_stride);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
static void common_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src0, src1, src2, src3);

        src4 = LD_SB(src);
        src5 = LD_SB(src + 8);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        src += src_stride;
        XORI_B4_128_SB(src4, src5, src6, src7);

        HORIZ_8TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;

        HORIZ_8TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, mask2,
                                   mask3, filt0, filt1, filt2, filt3, out0,
                                   out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
static void common_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, filt2, filt3, vec0, vec1, vec2;
    v16i8 src4;
    v16u8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7, out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    for (loop_cnt = 64; loop_cnt--;) {
        src0 = LD_SB(src);
        src1 = LD_SB(src + 8);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 32);
        src4 = LD_SB(src + 40);
        src += src_stride;

        XORI_B4_128_SB(src0, src1, src2, src3);
        src4 = (v16i8) __msa_xori_b((v16u8) src4, 128);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask0, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask1, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask2, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);

        VSHF_B3_SB(src0, src0, src1, src1, src2, src2, mask3, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out3 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out3, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);

        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask4, mask0, mask0,
                   vec0, vec1, vec2);
        DOTP_SB3_SH(vec0, vec1, vec2, filt0, filt0, filt0, out0, out1, out2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask5, mask1, mask1,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt1, filt1, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt1);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask6, mask2, mask2,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt2, filt2, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt2);
        VSHF_B3_SB(src2, src3, src3, src3, src4, src4, mask7, mask3, mask3,
                   vec0, vec1, vec2);
        DPADD_SB2_SH(vec0, vec1, filt3, filt3, out0, out1);
        out2 = __msa_dpadd_s_h(out2, vec2, filt3);

        SRARI_H2_SH(out0, out1, 6);
        out2 = __msa_srari_h(out2, 6);
        SAT_SH3_SH(out0, out1, out2, 7);
        out = PCKEV_XORI128_UB(out3, out0);
        ST_UB(out, dst + 16);
        out = PCKEV_XORI128_UB(out1, out2);
        ST_UB(out, dst + 32);
        dst += dst_stride;
    }
}
static void common_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    int32_t loop_cnt;
    v16u8 mask0, mask1, mask2, mask3, out;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, filt2, filt3;
    v8i16 res0, res1, res2, res3, filt;

    mask0 = LD_UB(&ff_hevc_mask_arr[0]);
    src -= 3;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = height; loop_cnt--;) {
        LD_SB8(src, 8, src0, src1, src2, src3, src4, src5, src6, src7);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);

        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src2, src2, src3, src3, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src0, src0, src1, src1, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 16);

        VSHF_B2_SB(src4, src4, src5, src5, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask0, mask0, vec2, vec3);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0, res0,
                    res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask2, mask2, vec0, vec1);
        VSHF_B2_SB(src6, src6, src7, src7, mask2, mask2, vec2, vec3);
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt2, filt2, filt2, filt2, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask1, mask1, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1, res0,
                     res1, res2, res3);
        VSHF_B2_SB(src4, src4, src5, src5, mask3, mask3, vec4, vec5);
        VSHF_B2_SB(src6, src6, src7, src7, mask3, mask3, vec6, vec7);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt3, filt3, filt3, filt3, res0,
                     res1, res2, res3);

        SRARI_H4_SH(res0, res1, res2, res3, 6);
        SAT_SH4_SH(res0, res1, res2, res3, 7);
        out = PCKEV_XORI128_UB(res0, res1);
        ST_UB(out, dst + 32);
        out = PCKEV_XORI128_UB(res2, res3);
        ST_UB(out, dst + 48);

        dst += dst_stride;
    }
}
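/* Vertical 8-tap filtering. Consecutive rows are interleaved bytewise
 * (ILVR_B*/ILVL_B*) so that each interleaved vector holds two taps of
 * every column; at the end of an iteration the interleaved registers are
 * rotated so only the four new rows need to be loaded and interleaved. */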
static void common_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src2110, src4332, src6554, src8776;
    v16i8 src10998, filt0, filt1, filt2, filt3;
    v16u8 out;
    v8i16 filt, out10, out32;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r, src2110,
               src4332, src6554);
    XORI_B3_128_SB(src2110, src4332, src6554);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVR_D2_SB(src87_r, src76_r, src109_r, src98_r, src8776, src10998);
        XORI_B2_128_SB(src8776, src10998);
        out10 = FILT_8TAP_DPADD_S_H(src2110, src4332, src6554, src8776, filt0,
                                    filt1, filt2, filt3);
        out32 = FILT_8TAP_DPADD_S_H(src4332, src6554, src8776, src10998, filt0,
                                    filt1, filt2, filt3);
        SRARI_H2_SH(out10, out32, 6);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);

        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;
        src6 = src10;
    }
}
static void common_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, filt0, filt1, filt2, filt3;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src6 = src10;
    }
}
static void common_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint32_t out2, out3;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, res0, res1;
    v16i8 res2, vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v8i16 vec01, vec23, vec45, vec67, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1, filt2, filt3;
    v4i32 mask = { 2, 6, 2, 6 };

    src -= (3 * src_stride);

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H4_SH(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);
    VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
    VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        XORI_B2_128_SB(src7, src8);
        src += (2 * src_stride);

        ILVR_B4_SH(src1, src0, src3, src2, src5, src4, src7, src6,
                   vec01, vec23, vec45, vec67);
        tmp0 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);
        ILVR_B4_SH(src2, src1, src4, src3, src6, src5, src8, src7, vec01, vec23,
                   vec45, vec67);
        tmp1 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);

        VSHF_W2_SB(src6, src7, src7, src8, mask, mask, vec6, vec7);
        ILVR_B4_SH(vec1, vec0, vec3, vec2, vec5, vec4, vec7, vec6, vec01, vec23,
                   vec45, vec67);
        tmp2 = FILT_8TAP_DPADD_S_H(vec01, vec23, vec45, vec67, filt0, filt1,
                                   filt2, filt3);
        SRARI_H2_SH(tmp0, tmp1, 6);
        tmp2 = __msa_srari_h(tmp2, 6);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B3_SB(tmp0, tmp0, tmp1, tmp1, tmp2, tmp2, res0, res1, res2);
        XORI_B3_128_SB(res0, res1, res2);

        out0 = __msa_copy_u_d((v2i64) res0, 0);
        out1 = __msa_copy_u_d((v2i64) res1, 0);
        out2 = __msa_copy_u_w((v4i32) res2, 0);
        out3 = __msa_copy_u_w((v4i32) res2, 1);
        SD(out0, dst);
        SW(out2, (dst + 8));
        dst += dst_stride;
        SD(out1, dst);
        SW(out3, (dst + 8));
        dst += dst_stride;

        src0 = src2;
        src1 = src3;
        src2 = src4;
        src3 = src5;
        src4 = src6;
        src5 = src7;
        src6 = src8;
        vec0 = vec2;
        vec1 = vec3;
        vec2 = vec4;
        vec3 = vec5;
        vec4 = vec6;
        vec5 = vec7;
    }
}
static void common_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r, src32_r,
               src54_r, src21_r);
    ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
    ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l, src32_l,
               src54_l, src21_l);
    ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        XORI_B4_128_SB(src7, src8, src9, src10);
        src += (4 * src_stride);

        ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                   src87_r, src98_r, src109_r);
        ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                   src87_l, src98_l, src109_l);
        out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r, filt0,
                                     filt1, filt2, filt3);
        out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r, filt0,
                                     filt1, filt2, filt3);
        out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r, filt0,
                                     filt1, filt2, filt3);
        out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r, filt0,
                                     filt1, filt2, filt3);
        out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l, filt0,
                                     filt1, filt2, filt3);
        out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l, filt0,
                                     filt1, filt2, filt3);
        out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l, filt0,
                                     filt1, filt2, filt3);
        out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l, filt0,
                                     filt1, filt2, filt3);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;
        src10_l = src54_l;
        src32_l = src76_l;
        src54_l = src98_l;
        src21_l = src65_l;
        src43_l = src87_l;
        src65_l = src109_l;
        src6 = src10;
    }
}
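/* Vertical 8-tap loop for any multiple of 16 columns: the block is walked
 * in 16-pixel-wide tiles, re-running the 16-wide row pipeline per tile. */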
static void common_vt_8t_16w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    uint32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 filt0, filt1, filt2, filt3;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r, src21_r, src43_r;
    v16i8 src65_r, src87_r, src109_r, src10_l, src32_l, src54_l, src76_l;
    v16i8 src98_l, src21_l, src43_l, src65_l, src87_l, src109_l;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= (3 * src_stride);

    filt = LD_SH(filter);
    SPLATI_H4_SB(filt, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    for (cnt = (width >> 4); cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_r,
                   src32_r, src54_r, src21_r);
        ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
        ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1, src10_l,
                   src32_l, src54_l, src21_l);
        ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
            XORI_B4_128_SB(src7, src8, src9, src10);
            src_tmp += (4 * src_stride);
            ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_r,
                       src87_r, src98_r, src109_r);
            ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9, src76_l,
                       src87_l, src98_l, src109_l);
            out0_r = FILT_8TAP_DPADD_S_H(src10_r, src32_r, src54_r, src76_r,
                                         filt0, filt1, filt2, filt3);
            out1_r = FILT_8TAP_DPADD_S_H(src21_r, src43_r, src65_r, src87_r,
                                         filt0, filt1, filt2, filt3);
            out2_r = FILT_8TAP_DPADD_S_H(src32_r, src54_r, src76_r, src98_r,
                                         filt0, filt1, filt2, filt3);
            out3_r = FILT_8TAP_DPADD_S_H(src43_r, src65_r, src87_r, src109_r,
                                         filt0, filt1, filt2, filt3);
            out0_l = FILT_8TAP_DPADD_S_H(src10_l, src32_l, src54_l, src76_l,
                                         filt0, filt1, filt2, filt3);
            out1_l = FILT_8TAP_DPADD_S_H(src21_l, src43_l, src65_l, src87_l,
                                         filt0, filt1, filt2, filt3);
            out2_l = FILT_8TAP_DPADD_S_H(src32_l, src54_l, src76_l, src98_l,
                                         filt0, filt1, filt2, filt3);
            out3_l = FILT_8TAP_DPADD_S_H(src43_l, src65_l, src87_l, src109_l,
                                         filt0, filt1, filt2, filt3);
            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
            SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
            PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                        out3_r, tmp0, tmp1, tmp2, tmp3);
            XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
            ST_UB4(tmp0, tmp1, tmp2, tmp3, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;
            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;
            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}
static void common_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              16);

    common_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride, filter,
                        height);
}
static void common_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              32);
}
static void common_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              48);
}
static void common_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_8t_16w_mult_msa(src, src_stride, dst, dst_stride, filter, height,
                              64);
}
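/* 2-D (separable horizontal + vertical) 8-tap filtering. The horizontal
 * pass keeps 16-bit intermediates, biased by const_vec = 128 << 6 to undo
 * the XOR with 128; the vertical pass accumulates in 32 bits. First-stage
 * results are normalized with a plain >> 6 and the second stage rounds
 * with SRARI before clipping to 0..255. */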
static void hevc_hv_uni_8t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
    v8i16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };

    src -= ((3 * src_stride) + 3);
    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);
    XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

    VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
    VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
    VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
               vec8, vec9, vec10, vec11);
    VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
               vec12, vec13, vec14, vec15);

    dst30 = const_vec;
    DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                 dst30, dst30, dst30, dst30);
    dst41 = const_vec;
    DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                 dst41, dst41, dst41, dst41);
    dst52 = const_vec;
    DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                 dst52, dst52, dst52, dst52);
    dst63 = const_vec;
    DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                 dst63, dst63, dst63, dst63);

    ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
               dst10_r, dst21_r, dst32_r);
    dst43_r = __msa_ilvl_h(dst41, dst30);
    dst54_r = __msa_ilvl_h(dst52, dst41);
    dst65_r = __msa_ilvl_h(dst63, dst52);
    dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);

    for (loop_cnt = height >> 1; loop_cnt--;) {
        LD_SB2(src, src_stride, src7, src8);
        src += 2 * src_stride;
        XORI_B2_128_SB(src7, src8);

        VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        dst87 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst87, dst87, dst87, dst87);

        dst76_r = __msa_ilvr_h(dst87, dst66);
        dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);
        dst87_r = __msa_vshf_h(mask4, dst87, dst87);
        dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                filt_h0, filt_h1, filt_h2, filt_h3);

        dst0_r >>= 6;
        dst1_r >>= 6;
        SRARI_W2_SW(dst0_r, dst1_r, 6);
        dst0_r = CLIP_SW_0_255(dst0_r);
        dst1_r = CLIP_SW_0_255(dst1_r);

        HEVC_PCK_SW_SB2(dst1_r, dst0_r, dst0_r);
        ST4x2_UB(dst0_r, dst, dst_stride);
        dst += (2 * dst_stride);

        dst10_r = dst32_r;
        dst32_r = dst54_r;
        dst54_r = dst76_r;
        dst21_r = dst43_r;
        dst43_r = dst65_r;
        dst65_r = dst87_r;
        dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
    }
}
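/* Same 2-D filtering for any multiple of 8 columns: each 8-wide tile
 * produces two output rows per iteration, keeping the filtered seed rows
 * interleaved (dstNN_r/dstNN_l) across iterations. */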
static void hevc_hv_uni_8t_8multx2mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height, int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1, filt2, filt3;
    v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
    v16i8 mask1, mask2, mask3;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= ((3 * src_stride) + 3);
    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LD_SH(filter_x);
    SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);
        XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);

        /* row 0 row 1 row 2 row 3 */
        VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
                   vec12, vec13, vec14, vec15);
        dst0 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst0, dst0, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst1, dst1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst2, dst2, dst2, dst2);
        dst3 = const_vec;
        DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
                     dst3, dst3, dst3, dst3);

        VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
                   vec0, vec1, vec2, vec3);
        VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
                   vec4, vec5, vec6, vec7);
        VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
                   vec8, vec9, vec10, vec11);
        dst4 = const_vec;
        DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                     dst4, dst4, dst4, dst4);
        dst5 = const_vec;
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
                     dst5, dst5, dst5, dst5);
        dst6 = const_vec;
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
                     dst6, dst6, dst6, dst6);

        ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_r, dst32_r, dst54_r, dst21_r);
        ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
        ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
                   dst10_l, dst32_l, dst54_l, dst21_l);
        ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);

        for (loop_cnt = height >> 1; loop_cnt--;) {
            LD_SB2(src_tmp, src_stride, src7, src8);
            XORI_B2_128_SB(src7, src8);
            src_tmp += 2 * src_stride;

            VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst7 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst7, dst7, dst7, dst7);

            ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
            dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst0_r >>= 6;
            dst0_l >>= 6;

            VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
                       vec0, vec1, vec2, vec3);
            dst8 = const_vec;
            DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
                         dst8, dst8, dst8, dst8);

            ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
            dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
                                    filt_h0, filt_h1, filt_h2, filt_h3);
            dst1_r >>= 6;
            dst1_l >>= 6;
            SRARI_W4_SW(dst0_r, dst0_l, dst1_r, dst1_l, 6);
            dst0_r = CLIP_SW_0_255(dst0_r);
            dst0_l = CLIP_SW_0_255(dst0_l);
            dst1_r = CLIP_SW_0_255(dst1_r);
            dst1_l = CLIP_SW_0_255(dst1_l);

            HEVC_PCK_SW_SB4(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r);
            ST8x2_UB(dst0_r, dst_tmp, dst_stride);
            dst_tmp += (2 * dst_stride);

            dst10_r = dst32_r;
            dst32_r = dst54_r;
            dst54_r = dst76_r;
            dst10_l = dst32_l;
            dst32_l = dst54_l;
            dst54_l = dst76_l;
            dst21_r = dst43_r;
            dst43_r = dst65_r;
            dst65_r = dst87_r;
            dst21_l = dst43_l;
            dst43_l = dst65_l;
            dst65_l = dst87_l;
            dst6 = dst8;
        }

        src += 8;
        dst += 8;
    }
}
static void hevc_hv_uni_8t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
}
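/* The 12-wide case below is split into an 8-wide tile plus a 4-wide tile. */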
static void hevc_hv_uni_8t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);

    hevc_hv_uni_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
                          filter_x, filter_y, height);
}
static void hevc_hv_uni_8t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 16);
}
static void hevc_hv_uni_8t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 24);
}
static void hevc_hv_uni_8t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 32);
}
static void hevc_hv_uni_8t_48w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 48);
}
static void hevc_hv_uni_8t_64w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 64);
}
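/* 4-tap filtering (the HEVC chroma/EPEL filters): same structure as the
 * 8-tap paths but with only two tap pairs per output, using
 * mc_filt_mask_arr for the byte shuffles. */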
static void common_hz_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 filt0, filt1, src0, src1, mask0, mask1, vec0, vec1;
    v16u8 out;
    v8i16 filt, res0;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB2(src, src_stride, src0, src1);
    XORI_B2_128_SB(src0, src1);
    VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
    res0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
    res0 = __msa_srari_h(res0, 6);
    res0 = __msa_sat_s_h(res0, 7);
    out = PCKEV_XORI128_UB(res0, res0);
    ST4x2_UB(out, dst, dst_stride);
}
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1;
    v16u8 out;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    SRARI_H2_SH(out0, out1, 6);
    SAT_SH2_SH(out0, out1, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    LD_SB4(src, src_stride, src0, src1, src2, src3);
    XORI_B4_128_SB(src0, src1, src2, src3);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&mc_filt_mask_arr[16]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
    HORIZ_4TAP_4WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                               filt0, filt1, out0, out1);
    HORIZ_4TAP_4WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                               filt0, filt1, out2, out3);
    SRARI_H4_SH(out0, out1, out2, out3, 6);
    SAT_SH4_SH(out0, out1, out2, out3, 7);
    out = PCKEV_XORI128_UB(out0, out1);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
    dst += (4 * dst_stride);
    out = PCKEV_XORI128_UB(out2, out3);
    ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (4 == height) {
        common_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
    } else if (8 == height) {
        common_hz_4t_4x8_msa(src, src_stride, dst, dst_stride, filter);
    } else if (16 == height) {
        common_hz_4t_4x16_msa(src, src_stride, dst, dst_stride, filter);
    }
}
static void common_hz_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 out4, out5;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);

        out4 = PCKEV_XORI128_UB(out0, out1);
        out5 = PCKEV_XORI128_UB(out2, out3);
        ST6x4_UB(out4, out5, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
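/* 8-wide, two rows per iteration (used for heights 2 and 6). */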
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, vec0, vec1, vec2, vec3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        XORI_B2_128_SB(src0, src1);
        VSHF_B2_SH(src0, src0, src1, src1, mask0, mask0, vec0, vec1);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, vec0, vec1);
        VSHF_B2_SH(src0, src0, src1, src1, mask1, mask1, vec2, vec3);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, vec0, vec1);
        SRARI_H2_SH(vec0, vec1, 6);
        SAT_SH2_SH(vec0, vec1, 7);
        out = PCKEV_XORI128_UB(vec0, vec1);
        ST8x2_UB(out, dst, dst_stride);
        dst += (2 * dst_stride);
    }
}
static void common_hz_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_hz_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if ((2 == height) || (6 == height)) {
        common_hz_4t_8x2mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    } else {
        common_hz_4t_8x4mult_msa(src, src_stride, dst, dst_stride, filter,
                                 height);
    }
}
static void common_hz_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, filt0, filt1, mask0, mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
    v16i8 vec10, vec11;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0, out1, out2, out3, out4, out5;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    mask2 = LD_SB(&ff_hevc_mask_arr[32]);

    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask3 = mask2 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B4_128_SB(src0, src1, src2, src3);
        VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
        VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
        VSHF_B2_SB(src0, src1, src2, src3, mask2, mask2, vec0, vec1);
        DOTP_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
                    out2, out3, out4, out5);
        DOTP_SB2_SH(vec0, vec1, filt0, filt0, out0, out1);
        VSHF_B2_SB(src0, src0, src1, src1, mask1, mask1, vec8, vec9);
        VSHF_B2_SB(src2, src2, src3, src3, mask1, mask1, vec10, vec11);
        VSHF_B2_SB(src0, src1, src2, src3, mask3, mask3, vec2, vec3);
        DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt1, filt1, filt1, filt1,
                     out2, out3, out4, out5);
        DPADD_SB2_SH(vec2, vec3, filt1, filt1, out0, out1);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H2_SH(out4, out5, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH2_SH(out4, out5, 7);
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        tmp1 = PCKEV_XORI128_UB(out4, out5);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);
    }
}
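/* 16-wide: each row is loaded as two 8-pixel halves (src and src + 8) so
 * the 8-wide filtering macro can be reused on both halves. */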
static void common_hz_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 out;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1, filt0,
                                   filt1, out0, out1, out2, out3);
        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1, filt0,
                                   filt1, out4, out5, out6, out7);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst);
        dst += dst_stride;
    }
}
static void common_hz_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint8_t *dst1 = dst + 16;
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
    v16i8 filt0, filt1, mask0, mask1, mask00, mask11;
    v8i16 filt, out0, out1, out2, out3;
    v16u8 tmp0, tmp1;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;
    mask00 = mask0 + 8;
    mask11 = mask0 + 10;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src2, src4, src6);
        LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        VSHF_B2_SB(src0, src0, src0, src1, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src2, src2, src2, src3, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src0, src0, src0, src1, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src2, src2, src2, src3, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        VSHF_B2_SB(src4, src4, src4, src5, mask0, mask00, vec0, vec1);
        VSHF_B2_SB(src6, src6, src6, src7, mask0, mask00, vec2, vec3);
        VSHF_B2_SB(src4, src4, src4, src5, mask1, mask11, vec4, vec5);
        VSHF_B2_SB(src6, src6, src6, src7, mask1, mask11, vec6, vec7);
        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        ST_UB(tmp0, dst);
        dst += dst_stride;
        tmp0 = PCKEV_XORI128_UB(out2, out3);
        ST_UB(tmp0, dst);
        dst += dst_stride;

        /* 8 width */
        VSHF_B2_SB(src1, src1, src3, src3, mask0, mask0, vec0, vec1);
        VSHF_B2_SB(src5, src5, src7, src7, mask0, mask0, vec2, vec3);
        VSHF_B2_SB(src1, src1, src3, src3, mask1, mask1, vec4, vec5);
        VSHF_B2_SB(src5, src5, src7, src7, mask1, mask1, vec6, vec7);

        DOTP_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt0, filt0, filt0,
                    out0, out1, out2, out3);
        DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt1, filt1, filt1, filt1,
                     out0, out1, out2, out3);

        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        tmp0 = PCKEV_XORI128_UB(out0, out1);
        tmp1 = PCKEV_XORI128_UB(out2, out3);
        ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
        dst1 += (4 * dst_stride);
    }
}
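/* 32-wide: per row, three 16-byte loads plus SLDI_B2_SB to synthesize the
 * byte-shifted vectors that straddle the 16-byte load boundaries. */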
static void common_hz_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 filt0, filt1, mask0, mask1;
    v16u8 out;
    v8i16 filt, out0, out1, out2, out3, out4, out5, out6, out7;

    mask0 = LD_SB(&ff_hevc_mask_arr[0]);
    src -= 1;

    /* rearranging filter */
    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    mask1 = mask0 + 2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LD_SB(src);
        src2 = LD_SB(src + 16);
        src3 = LD_SB(src + 24);
        src += src_stride;
        src4 = LD_SB(src);
        src6 = LD_SB(src + 16);
        src7 = LD_SB(src + 24);
        SLDI_B2_SB(src2, src6, src0, src4, src1, src5, 8);
        src += src_stride;

        XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
        HORIZ_4TAP_8WID_4VECS_FILT(src0, src1, src2, src3, mask0, mask1,
                                   filt0, filt1, out0, out1, out2, out3);
        HORIZ_4TAP_8WID_4VECS_FILT(src4, src5, src6, src7, mask0, mask1,
                                   filt0, filt1, out4, out5, out6, out7);
        SRARI_H4_SH(out0, out1, out2, out3, 6);
        SRARI_H4_SH(out4, out5, out6, out7, 6);
        SAT_SH4_SH(out0, out1, out2, out3, 7);
        SAT_SH4_SH(out4, out5, out6, out7, 7);
        out = PCKEV_XORI128_UB(out0, out1);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2, out3);
        ST_UB(out, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out4, out5);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out6, out7);
        ST_UB(out, dst + 16);
        dst += dst_stride;
    }
}
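/* Vertical 4-tap (epel) filters: consecutive rows are interleaved with
 * ILVR_B/ILVL_B so FILT_4TAP_DPADD_S_H can form the column-wise dot
 * products on byte pairs. */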
static void common_vt_4t_4x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4, src10_r, src32_r, src21_r, src43_r;
    v16i8 src2110, src4332, filt0, filt1;
    v16u8 out;
    v8i16 filt, out10;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
    LD_SB2(src, src_stride, src3, src4);
    ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
    src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
    src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
    out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);
    out10 = __msa_srari_h(out10, 6);
    out10 = __msa_sat_s_h(out10, 7);
    out = PCKEV_XORI128_UB(out10, out10);
    ST4x2_UB(out, dst, dst_stride);
}
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride,
                                         const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
    v16i8 src2110, src4332, filt0, filt1;
    v8i16 filt, out10, out32;
    v16u8 out;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
    src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_r, (v2i64) src32_r);
        src4332 = (v16i8) __msa_xori_b((v16u8) src4332, 128);
        out10 = FILT_4TAP_DPADD_S_H(src2110, src4332, filt0, filt1);

        src2 = LD_SB(src);
        src += (src_stride);
        ILVR_B2_SB(src5, src4, src2, src5, src54_r, src65_r);
        src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_r, (v2i64) src54_r);
        src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
        out32 = FILT_4TAP_DPADD_S_H(src4332, src2110, filt0, filt1);
        SRARI_H2_SH(out10, out32, 6);
        SAT_SH2_SH(out10, out32, 7);
        out = PCKEV_XORI128_UB(out10, out32);
        ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_vt_4t_4w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_4x4multiple_msa(src, src_stride, dst, dst_stride, filter,
                                     height);
    }
}
static void common_vt_4t_6w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16u8 src0, src1, src2, src3, vec0, vec1, vec2, vec3, out0, out1;
    v8i16 vec01, vec12, vec23, vec30, tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, filt0, filt1;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_UB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    vec0 = (v16u8) __msa_xori_b((v16u8) src0, 128);
    vec1 = (v16u8) __msa_xori_b((v16u8) src1, 128);
    vec2 = (v16u8) __msa_xori_b((v16u8) src2, 128);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src3, src0, src1, src2);
        src += (4 * src_stride);

        vec3 = (v16u8) __msa_xori_b((v16u8) src3, 128);
        ILVR_B2_SH(vec1, vec0, vec3, vec2, vec01, vec23);
        tmp0 = FILT_4TAP_DPADD_S_H(vec01, vec23, filt0, filt1);

        vec0 = __msa_xori_b((v16u8) src0, 128);
        ILVR_B2_SH(vec2, vec1, vec0, vec3, vec12, vec30);
        tmp1 = FILT_4TAP_DPADD_S_H(vec12, vec30, filt0, filt1);

        vec1 = __msa_xori_b((v16u8) src1, 128);
        vec01 = (v8i16) __msa_ilvr_b((v16i8) vec1, (v16i8) vec0);
        tmp2 = FILT_4TAP_DPADD_S_H(vec23, vec01, filt0, filt1);

        vec2 = __msa_xori_b((v16u8) src2, 128);
        vec12 = (v8i16) __msa_ilvr_b((v16i8) vec2, (v16i8) vec1);
        tmp3 = FILT_4TAP_DPADD_S_H(vec30, vec12, filt0, filt1);

        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST6x4_UB(out0, out1, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 src01, src12, src23, src34, tmp0, tmp1, filt, filt0, filt1;
    v16u8 out;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
    XORI_B5_128_SB(src0, src1, src2, src3, src4);
    ILVR_B2_SH(src1, src0, src3, src2, src01, src23);
    tmp0 = FILT_4TAP_DPADD_S_H(src01, src23, filt0, filt1);
    ILVR_B2_SH(src2, src1, src4, src3, src12, src34);
    tmp1 = FILT_4TAP_DPADD_S_H(src12, src34, filt0, filt1);
    SRARI_H2_SH(tmp0, tmp1, 6);
    SAT_SH2_SH(tmp0, tmp1, 7);
    out = PCKEV_XORI128_UB(tmp0, tmp1);
    ST8x2_UB(out, dst, dst_stride);
}
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter)
{
    uint32_t loop_cnt;
    uint64_t out0, out1, out2;
    v16i8 src0, src1, src2, src3, src4, src5;
    v8i16 vec0, vec1, vec2, vec3, vec4, tmp0, tmp1, tmp2;
    v8i16 filt, filt0, filt1;

    src -= src_stride;

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SH(src1, src0, src2, src1, vec0, vec2);

    for (loop_cnt = 2; loop_cnt--;) {
        LD_SB3(src, src_stride, src3, src4, src5);
        src += (3 * src_stride);

        XORI_B3_128_SB(src3, src4, src5);
        ILVR_B3_SH(src3, src2, src4, src3, src5, src4, vec1, vec3, vec4);
        tmp0 = FILT_4TAP_DPADD_S_H(vec0, vec1, filt0, filt1);
        tmp1 = FILT_4TAP_DPADD_S_H(vec2, vec3, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(vec1, vec4, filt0, filt1);
        SRARI_H2_SH(tmp0, tmp1, 6);
        tmp2 = __msa_srari_h(tmp2, 6);
        SAT_SH3_SH(tmp0, tmp1, tmp2, 7);
        PCKEV_B2_SH(tmp1, tmp0, tmp2, tmp2, tmp0, tmp2);
        XORI_B2_128_SH(tmp0, tmp2);

        out0 = __msa_copy_u_d((v2i64) tmp0, 0);
        out1 = __msa_copy_u_d((v2i64) tmp0, 1);
        out2 = __msa_copy_u_d((v2i64) tmp2, 0);
        SD(out0, dst);
        dst += dst_stride;
        SD(out1, dst);
        dst += dst_stride;
        SD(out2, dst);
        dst += dst_stride;

        src2 = src5;
        vec0 = vec3;
        vec2 = vec4;
    }
}
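/* 8-wide vertical for heights that are multiples of 4; the copies at the
 * bottom of the loop carry the last two interleaved row pairs and the last
 * source row into the next iteration. */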
static void common_vt_4t_8x4mult_msa(uint8_t *src, int32_t src_stride,
                                     uint8_t *dst, int32_t dst_stride,
                                     const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src7, src8, src9, src10;
    v16i8 src10_r, src72_r, src98_r, src21_r, src87_r, src109_r, filt0, filt1;
    v16u8 tmp0, tmp1;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B4_128_SB(src7, src8, src9, src10);
        ILVR_B4_SB(src7, src2, src8, src7, src9, src8, src10, src9,
                   src72_r, src87_r, src98_r, src109_r);
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src72_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src87_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src72_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
        tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
        ST8x4_UB(tmp0, tmp1, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src98_r;
        src21_r = src109_r;
        src2 = src10;
    }
}
static void common_vt_4t_8w_msa(uint8_t *src, int32_t src_stride,
                                uint8_t *dst, int32_t dst_stride,
                                const int8_t *filter, int32_t height)
{
    if (2 == height) {
        common_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
    } else if (6 == height) {
        common_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
    } else {
        common_vt_4t_8x4mult_msa(src, src_stride, dst, dst_stride,
                                 filter, height);
    }
}
static void common_vt_4t_12w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v16u8 out0, out1;
    v8i16 src10, src21, src32, src43, src54, src65, src87, src109, src1211;
    v8i16 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, filt, filt0, filt1;
    v4u32 mask = { 2, 6, 2, 6 };

    /* rearranging filter_y */
    filt = LD_SH(filter);
    SPLATI_H2_SH(filt, 0, 1, filt0, filt1);

    src -= (src_stride);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    VSHF_W2_SB(src0, src1, src1, src2, mask, mask, vec0, vec1);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B2_SH(src1, src0, src3, src2, src10, src32);
        VSHF_W2_SB(src2, src3, src3, src4, mask, mask, vec2, vec3);
        VSHF_W2_SB(src4, src5, src5, src6, mask, mask, vec4, vec5);
        tmp0 = FILT_4TAP_DPADD_S_H(src10, src32, filt0, filt1);
        ILVR_B4_SH(src2, src1, src4, src3, src5, src4, src6, src5,
                   src21, src43, src54, src65);
        tmp1 = FILT_4TAP_DPADD_S_H(src21, src43, filt0, filt1);
        tmp2 = FILT_4TAP_DPADD_S_H(src32, src54, filt0, filt1);
        tmp3 = FILT_4TAP_DPADD_S_H(src43, src65, filt0, filt1);
        ILVR_B3_SH(vec1, vec0, vec3, vec2, vec5, vec4, src87, src109, src1211);
        tmp4 = FILT_4TAP_DPADD_S_H(src87, src109, filt0, filt1);
        tmp5 = FILT_4TAP_DPADD_S_H(src109, src1211, filt0, filt1);
        SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
        SRARI_H2_SH(tmp4, tmp5, 6);
        SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
        SAT_SH2_SH(tmp4, tmp5, 7);
        out0 = PCKEV_XORI128_UB(tmp0, tmp1);
        out1 = PCKEV_XORI128_UB(tmp2, tmp3);
        ST8x4_UB(out0, out1, dst, dst_stride);
        out0 = PCKEV_XORI128_UB(tmp4, tmp5);
        ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
        dst += (4 * dst_stride);

        src0 = src4;
        src1 = src5;
        src2 = src6;
        vec0 = vec4;
        vec1 = vec5;
    }
}
static void common_vt_4t_16w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r, src10_l;
    v16i8 src32_l, src54_l, src21_l, src43_l, src65_l, filt0, filt1;
    v16u8 tmp0, tmp1, tmp2, tmp3;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);
        ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_r, src43_r, src54_r, src65_r);
        ILVL_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
                   src32_l, src43_l, src54_l, src65_l);
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out2_r = FILT_4TAP_DPADD_S_H(src32_r, src54_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src43_r, src65_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);
        out2_l = FILT_4TAP_DPADD_S_H(src32_l, src54_l, filt0, filt1);
        out3_l = FILT_4TAP_DPADD_S_H(src43_l, src65_l, filt0, filt1);
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H4_SH(out0_l, out1_l, out2_l, out3_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH4_SH(out0_l, out1_l, out2_l, out3_l, 7);
        PCKEV_B4_UB(out0_l, out0_r, out1_l, out1_r, out2_l, out2_r, out3_l,
                    out3_r, tmp0, tmp1, tmp2, tmp3);
        XORI_B4_128_UB(tmp0, tmp1, tmp2, tmp3);
        ST_UB4(tmp0, tmp1, tmp2, tmp3, dst, dst_stride);
        dst += (4 * dst_stride);

        src10_r = src54_r;
        src21_r = src65_r;
        src10_l = src54_l;
        src21_l = src65_l;
        src2 = src6;
    }
}
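/* 24-wide vertical: processed as a 16-wide column (right and left
 * interleaves) plus an 8-wide column at dst + 16 (right interleaves only). */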
static void common_vt_4t_24w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src11, filt0, filt1;
    v16i8 src10_r, src32_r, src76_r, src98_r, src21_r, src43_r, src87_r;
    v16i8 src109_r, src10_l, src32_l, src21_l, src43_l;
    v16u8 out;
    v8i16 filt, out0_r, out1_r, out2_r, out3_r, out0_l, out1_l;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    /* 16 width */
    LD_SB3(src, src_stride, src0, src1, src2);
    XORI_B3_128_SB(src0, src1, src2);
    ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
    ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

    /* 8 width */
    LD_SB3(src + 16, src_stride, src6, src7, src8);
    src += (3 * src_stride);
    XORI_B3_128_SB(src6, src7, src8);
    ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        /* 16 width */
        LD_SB2(src, src_stride, src3, src4);
        XORI_B2_128_SB(src3, src4);
        ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
        ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src9, src10);
        src += (2 * src_stride);
        XORI_B2_128_SB(src9, src10);
        ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);

        /* 16 width */
        out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);

        /* 8 width */
        out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        PCKEV_B2_SH(out2_r, out2_r, out3_r, out3_r, out2_r, out3_r);
        XORI_B2_128_SH(out2_r, out3_r);
        out0 = __msa_copy_u_d((v2i64) out2_r, 0);
        out1 = __msa_copy_u_d((v2i64) out3_r, 0);
        SD(out0, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        SD(out1, dst + 16);
        dst += dst_stride;

        /* 16 width */
        LD_SB2(src, src_stride, src5, src2);
        XORI_B2_128_SB(src5, src2);
        ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
        ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);

        /* 8 width */
        LD_SB2(src + 16, src_stride, src11, src8);
        src += (2 * src_stride);
        XORI_B2_128_SB(src11, src8);
        ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);

        /* 16 width */
        out0_r = FILT_4TAP_DPADD_S_H(src32_r, src10_r, filt0, filt1);
        out0_l = FILT_4TAP_DPADD_S_H(src32_l, src10_l, filt0, filt1);
        out1_r = FILT_4TAP_DPADD_S_H(src43_r, src21_r, filt0, filt1);
        out1_l = FILT_4TAP_DPADD_S_H(src43_l, src21_l, filt0, filt1);

        /* 8 width */
        out2_r = FILT_4TAP_DPADD_S_H(src98_r, src76_r, filt0, filt1);
        out3_r = FILT_4TAP_DPADD_S_H(src109_r, src87_r, filt0, filt1);

        /* 16 + 8 width */
        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        SRARI_H2_SH(out0_l, out1_l, 6);
        SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
        SAT_SH2_SH(out0_l, out1_l, 7);
        out = PCKEV_XORI128_UB(out0_r, out0_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out2_r, out2_r);
        ST8x1_UB(out, dst + 16);
        dst += dst_stride;
        out = PCKEV_XORI128_UB(out1_r, out1_l);
        ST_UB(out, dst);
        out = PCKEV_XORI128_UB(out3_r, out3_r);
        ST8x1_UB(out, dst + 16);
        dst += dst_stride;
    }
}
static void common_vt_4t_32w_mult_msa(uint8_t *src, int32_t src_stride,
                                      uint8_t *dst, int32_t dst_stride,
                                      const int8_t *filter, int32_t height,
                                      int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *dst_tmp, *src_tmp;
    v16i8 src0, src1, src2, src3, src4, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src87_r, src109_r;
    v8i16 out0_r, out1_r, out2_r, out3_r, out0_l, out1_l, out2_l, out3_l;
    v16i8 src10_l, src32_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src87_l, src109_l;
    v8i16 filt;
    v16u8 out;
    v16i8 filt0, filt1;

    src -= src_stride;

    filt = LD_SH(filter);
    SPLATI_H2_SB(filt, 0, 1, filt0, filt1);

    for (cnt = (width >> 5); cnt--;) {
        dst_tmp = dst;
        src_tmp = src;

        /* 16 width */
        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        XORI_B3_128_SB(src0, src1, src2);

        ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
        ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);

        /* next 16 width */
        LD_SB3(src_tmp + 16, src_stride, src6, src7, src8);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src6, src7, src8);
        ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
        ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);

        for (loop_cnt = (height >> 1); loop_cnt--;) {
            /* 16 width */
            LD_SB2(src_tmp, src_stride, src3, src4);
            XORI_B2_128_SB(src3, src4);
            ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
            ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);

            /* 16 width */
            out0_r = FILT_4TAP_DPADD_S_H(src10_r, src32_r, filt0, filt1);
            out0_l = FILT_4TAP_DPADD_S_H(src10_l, src32_l, filt0, filt1);
            out1_r = FILT_4TAP_DPADD_S_H(src21_r, src43_r, filt0, filt1);
            out1_l = FILT_4TAP_DPADD_S_H(src21_l, src43_l, filt0, filt1);

            /* 16 width */
            SRARI_H4_SH(out0_r, out1_r, out0_l, out1_l, 6);
            SAT_SH4_SH(out0_r, out1_r, out0_l, out1_l, 7);
            out = PCKEV_XORI128_UB(out0_r, out0_l);
            ST_UB(out, dst_tmp);
            out = PCKEV_XORI128_UB(out1_r, out1_l);
            ST_UB(out, dst_tmp + dst_stride);

            src10_r = src32_r;
            src21_r = src43_r;
            src10_l = src32_l;
            src21_l = src43_l;
            src2 = src4;

            /* next 16 width */
            LD_SB2(src_tmp + 16, src_stride, src9, src10);
            src_tmp += (2 * src_stride);
            XORI_B2_128_SB(src9, src10);
            ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
            ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);

            /* next 16 width */
            out2_r = FILT_4TAP_DPADD_S_H(src76_r, src98_r, filt0, filt1);
            out2_l = FILT_4TAP_DPADD_S_H(src76_l, src98_l, filt0, filt1);
            out3_r = FILT_4TAP_DPADD_S_H(src87_r, src109_r, filt0, filt1);
            out3_l = FILT_4TAP_DPADD_S_H(src87_l, src109_l, filt0, filt1);

            /* next 16 width */
            SRARI_H4_SH(out2_r, out3_r, out2_l, out3_l, 6);
            SAT_SH4_SH(out2_r, out3_r, out2_l, out3_l, 7);
            out = PCKEV_XORI128_UB(out2_r, out2_l);
            ST_UB(out, dst_tmp + 16);
            out = PCKEV_XORI128_UB(out3_r, out3_l);
            ST_UB(out, dst_tmp + 16 + dst_stride);

            dst_tmp += 2 * dst_stride;

            src76_r = src98_r;
            src87_r = src109_r;
            src76_l = src98_l;
            src87_l = src109_l;
            src8 = src10;
        }

        src += 32;
        dst += 32;
    }
}
static void common_vt_4t_32w_msa(uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
                                 const int8_t *filter, int32_t height)
{
    common_vt_4t_32w_mult_msa(src, src_stride, dst, dst_stride,
                              filter, height, 32);
}
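/* HV (2D) epel filters: the horizontal 4-tap pass produces 16-bit
 * intermediates offset by const_vec (128 << 6, compensating the earlier
 * xori-by-128 of the source bytes); the vertical 4-tap pass widens to
 * 32 bits in HEVC_FILT_4TAP, then >>= 6, rounds (SRARI by 6) and clips
 * to 0..255. */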
static void hevc_hv_uni_4t_4x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
    dst0_r = (v4i32) __msa_srari_h((v8i16) dst0_r, 6);
    dst0_r = (v4i32) CLIP_SH_0_255(dst0_r);
    dst0_r = (v4i32) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);

    ST4x2_UB(dst0_r, dst, dst_stride);
}
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 out0_r, out1_r;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
    LD_SB4(src, src_stride, src3, src4, src5, src6);
    XORI_B4_128_SB(src3, src4, src5, src6);

    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    dst32_r = __msa_ilvr_h(dst3, dst2);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_r >>= 6;

    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    dst43_r = __msa_ilvr_h(dst4, dst3);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_r >>= 6;

    /* row 5 */
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    dst10_r = __msa_ilvr_h(dst5, dst4);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
    dst2_r >>= 6;

    /* row 6 */
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

    dst21_r = __msa_ilvr_h(dst2, dst5);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
    dst3_r >>= 6;

    PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    CLIP_SH2_0_255(out0_r, out1_r);
    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);

    ST4x4_UB(out0_r, out0_r, 0, 1, 2, 3, dst, dst_stride);
}
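/* 4-wide, heights that are multiples of 8: eight rows per iteration; the
 * filter history is carried in dst10_r/dst21_r and in dst2, which is
 * recycled to hold the last horizontally filtered row. */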
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
                                           int32_t src_stride,
                                           uint8_t *dst,
                                           int32_t dst_stride,
                                           const int8_t *filter_x,
                                           const int8_t *filter_y,
                                           int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5;
    v16i8 src6, src7, src8, src9, src10;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
    v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
    v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
    v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);

    for (loop_cnt = height >> 3; loop_cnt--;) {
        LD_SB8(src, src_stride,
               src3, src4, src5, src6, src7, src8, src9, src10);
        src += (8 * src_stride);

        XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);

        /* row 3 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        dst32_r = __msa_ilvr_h(dst3, dst2);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_r >>= 6;

        /* row 4 */
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

        dst43_r = __msa_ilvr_h(dst4, dst3);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_r >>= 6;

        /* row 5 */
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        dst54_r = __msa_ilvr_h(dst5, dst4);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
        dst2_r >>= 6;

        /* row 6 */
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst6 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

        dst65_r = __msa_ilvr_h(dst6, dst5);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
        dst3_r >>= 6;

        /* row 7 */
        VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
        dst7 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

        dst76_r = __msa_ilvr_h(dst7, dst6);
        dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
        dst4_r >>= 6;

        /* row 8 */
        VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
        dst8 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);

        dst87_r = __msa_ilvr_h(dst8, dst7);
        dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
        dst5_r >>= 6;

        /* row 9 */
        VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
        dst9 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);

        dst10_r = __msa_ilvr_h(dst9, dst8);
        dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
        dst6_r >>= 6;

        /* row 10 */
        VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        dst21_r = __msa_ilvr_h(dst2, dst9);
        dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
        dst7_r >>= 6;

        PCKEV_H4_SH(dst1_r, dst0_r, dst3_r, dst2_r,
                    dst5_r, dst4_r, dst7_r, dst6_r,
                    out0_r, out1_r, out2_r, out3_r);

        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
        ST4x8_UB(out0_r, out1_r, dst, dst_stride);
        dst += (8 * dst_stride);
    }
}
static void hevc_hv_uni_4t_4w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_4x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (4 == height) {
        hevc_hv_uni_4t_4x4_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (0 == (height % 8)) {
        hevc_hv_uni_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
                                       filter_x, filter_y, height);
    }
}
static void hevc_hv_uni_4t_6w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    for (loop_cnt = height >> 2; loop_cnt--;) {
        LD_SB4(src, src_stride, src3, src4, src5, src6);
        src += (4 * src_stride);

        XORI_B4_128_SB(src3, src4, src5, src6);

        /* row 3 */
        VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
        dst3 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

        ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
        dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
        dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
        dst0_r >>= 6;
        dst0_l >>= 6;

        /* row 4 */
        VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
        dst4 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

        ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
        dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
        dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
        dst1_r >>= 6;
        dst1_l >>= 6;

        /* row 5 */
        VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
        dst5 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

        ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
        dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
        dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
        dst2_r >>= 6;
        dst2_l >>= 6;

        /* row 6 */
        VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
        dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
        dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
        dst3_r >>= 6;
        dst3_l >>= 6;

        PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                    dst2_l, dst2_r, dst3_l, dst3_r,
                    out0_r, out1_r, out2_r, out3_r);

        SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
        CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

        PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
        ST6x4_UB(out0_r, out1_r, dst, dst_stride);
        dst += (4 * dst_stride);
    }
}
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    XORI_B2_128_SB(src3, src4);

    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, out0_r, out1_r);
    SRARI_H2_SH(out0_r, out1_r, 6);
    CLIP_SH2_0_255(out0_r, out1_r);
    out0_r = (v8i16) __msa_pckev_b((v16i8) out1_r, (v16i8) out0_r);

    ST8x2_UB(out0_r, dst, dst_stride);
}
static void hevc_hv_uni_4t_8x6_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
    v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
    v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
    v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
    v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
    v8i16 out0_r, out1_r, out2_r, out3_r, out4_r, out5_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    LD_SB3(src, src_stride, src0, src1, src2);
    src += (3 * src_stride);

    XORI_B3_128_SB(src0, src1, src2);

    VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
    VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
    VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

    dst0 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
    dst1 = const_vec;
    DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
    dst2 = const_vec;
    DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

    ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
    ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

    LD_SB2(src, src_stride, src3, src4);
    src += (2 * src_stride);

    XORI_B2_128_SB(src3, src4);

    /* row 3 */
    VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
    dst3 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

    ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
    dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
    dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
    dst0_r >>= 6;
    dst0_l >>= 6;

    /* row 4 */
    VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
    dst4 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

    ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
    dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
    dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
    dst1_r >>= 6;
    dst1_l >>= 6;

    LD_SB2(src, src_stride, src5, src6);
    src += (2 * src_stride);

    XORI_B2_128_SB(src5, src6);

    /* row 5 */
    VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
    dst5 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

    ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
    dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
    dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
    dst2_r >>= 6;
    dst2_l >>= 6;

    /* row 6 */
    VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
    dst6 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);

    ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
    dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
    dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
    dst3_r >>= 6;
    dst3_l >>= 6;

    LD_SB2(src, src_stride, src7, src8);
    src += (2 * src_stride);

    XORI_B2_128_SB(src7, src8);

    /* row 7 */
    VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
    dst7 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);

    ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
    dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
    dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
    dst4_r >>= 6;
    dst4_l >>= 6;

    /* row 8 */
    VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
    dst8 = const_vec;
    DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);

    ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
    dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
    dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
    dst5_r >>= 6;
    dst5_l >>= 6;

    PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                dst2_l, dst2_r, dst3_l, dst3_r, out0_r, out1_r, out2_r, out3_r);
    PCKEV_H2_SH(dst4_l, dst4_r, dst5_l, dst5_r, out4_r, out5_r);
    SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
    SRARI_H2_SH(out4_r, out5_r, 6);
    CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);
    CLIP_SH2_0_255(out4_r, out5_r);

    PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
    out2_r = (v8i16) __msa_pckev_b((v16i8) out5_r, (v16i8) out4_r);

    ST8x4_UB(out0_r, out1_r, dst, dst_stride);
    dst += (4 * dst_stride);
    ST8x2_UB(out2_r, dst, dst_stride);
}
static void hevc_hv_uni_4t_8w_mult_msa(uint8_t *src,
                                       int32_t src_stride,
                                       uint8_t *dst,
                                       int32_t dst_stride,
                                       const int8_t *filter_x,
                                       const int8_t *filter_y,
                                       int32_t height,
                                       int32_t width)
{
    uint32_t loop_cnt, cnt;
    uint8_t *src_tmp;
    uint8_t *dst_tmp;
    v16i8 src0, src1, src2, src3, src4, src5, src6;
    v8i16 filt0, filt1;
    v4i32 filt_h0, filt_h1;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
    v16i8 mask1;
    v8i16 filter_vec, const_vec;
    v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
    v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
    v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
    v8i16 out0_r, out1_r, out2_r, out3_r;

    src -= (src_stride + 1);

    filter_vec = LD_SH(filter_x);
    SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);

    filter_vec = LD_SH(filter_y);
    vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
    filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);

    SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);

    mask1 = mask0 + 2;

    const_vec = __msa_ldi_h(128);
    const_vec <<= 6;

    for (cnt = width >> 3; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LD_SB3(src_tmp, src_stride, src0, src1, src2);
        src_tmp += (3 * src_stride);

        XORI_B3_128_SB(src0, src1, src2);

        VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
        VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
        VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);

        dst0 = const_vec;
        DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
        dst1 = const_vec;
        DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
        dst2 = const_vec;
        DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);

        ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
        ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);

        for (loop_cnt = height >> 2; loop_cnt--;) {
            LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
            src_tmp += (4 * src_stride);

            XORI_B4_128_SB(src3, src4, src5, src6);

            /* row 3 */
            VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
            dst3 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);

            ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
            dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
            dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
            dst0_r >>= 6;
            dst0_l >>= 6;

            /* row 4 */
            VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
            dst4 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);

            ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
            dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
            dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
            dst1_r >>= 6;
            dst1_l >>= 6;

            /* row 5 */
            VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
            dst5 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);

            ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
            dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
            dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
            dst2_r >>= 6;
            dst2_l >>= 6;

            /* row 6 */
            VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
            dst2 = const_vec;
            DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);

            ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
            dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
            dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
            dst3_r >>= 6;
            dst3_l >>= 6;

            PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r,
                        dst2_l, dst2_r, dst3_l, dst3_r,
                        out0_r, out1_r, out2_r, out3_r);

            SRARI_H4_SH(out0_r, out1_r, out2_r, out3_r, 6);
            CLIP_SH4_0_255(out0_r, out1_r, out2_r, out3_r);

            PCKEV_B2_SH(out1_r, out0_r, out3_r, out2_r, out0_r, out1_r);
            ST8x4_UB(out0_r, out1_r, dst_tmp, dst_stride);
            dst_tmp += (4 * dst_stride);
        }

        src += 8;
        dst += 8;
    }
}
static void hevc_hv_uni_4t_8w_msa(uint8_t *src,
                                  int32_t src_stride,
                                  uint8_t *dst,
                                  int32_t dst_stride,
                                  const int8_t *filter_x,
                                  const int8_t *filter_y,
                                  int32_t height)
{
    if (2 == height) {
        hevc_hv_uni_4t_8x2_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_uni_4t_8x6_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height);
    } else if (0 == (height % 4)) {
        hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
    }
}
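/* 12-wide HV: the left 8 columns go through the 8-wide loop, the remaining
 * 4 columns through the 4-wide path. */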
static void hevc_hv_uni_4t_12w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 8);

    hevc_hv_uni_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
                          filter_x, filter_y, height);
}
static void hevc_hv_uni_4t_16w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);
}
static void hevc_hv_uni_4t_24w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);
}
static void hevc_hv_uni_4t_32w_msa(uint8_t *src,
                                   int32_t src_stride,
                                   uint8_t *dst,
                                   int32_t dst_stride,
                                   const int8_t *filter_x,
                                   const int8_t *filter_y,
                                   int32_t height)
{
    hevc_hv_uni_4t_8w_mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);
}
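/* Wrappers exposing the ff_hevc_put_hevc_* entry points used by the HEVC
 * DSP layer; mx/my select the fractional-pel filters and the width argument
 * is unused by these kernels. */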
#define UNI_MC_COPY(WIDTH)                                                 \
void ff_hevc_put_hevc_uni_pel_pixels##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    copy_width##WIDTH##_msa(src, src_stride, dst, dst_stride, height);     \
}

UNI_MC_COPY(8);
UNI_MC_COPY(12);
UNI_MC_COPY(16);
UNI_MC_COPY(24);
UNI_MC_COPY(32);
UNI_MC_COPY(48);
UNI_MC_COPY(64);

#undef UNI_MC_COPY
#define UNI_MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                          \
void ff_hevc_put_hevc_uni_##PEL##_##DIR##WIDTH##_8_msa(uint8_t *dst,          \
                                                       ptrdiff_t dst_stride, \
                                                       uint8_t *src,         \
                                                       ptrdiff_t src_stride, \
                                                       int height,           \
                                                       intptr_t mx,          \
                                                       intptr_t my,          \
                                                       int width)            \
{                                                                            \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];            \
                                                                             \
    common_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,            \
                                            dst_stride, filter, height);     \
}

UNI_MC(qpel, h, 4, 8, hz, mx);
UNI_MC(qpel, h, 8, 8, hz, mx);
UNI_MC(qpel, h, 12, 8, hz, mx);
UNI_MC(qpel, h, 16, 8, hz, mx);
UNI_MC(qpel, h, 24, 8, hz, mx);
UNI_MC(qpel, h, 32, 8, hz, mx);
UNI_MC(qpel, h, 48, 8, hz, mx);
UNI_MC(qpel, h, 64, 8, hz, mx);

UNI_MC(qpel, v, 4, 8, vt, my);
UNI_MC(qpel, v, 8, 8, vt, my);
UNI_MC(qpel, v, 12, 8, vt, my);
UNI_MC(qpel, v, 16, 8, vt, my);
UNI_MC(qpel, v, 24, 8, vt, my);
UNI_MC(qpel, v, 32, 8, vt, my);
UNI_MC(qpel, v, 48, 8, vt, my);
UNI_MC(qpel, v, 64, 8, vt, my);

UNI_MC(epel, h, 4, 4, hz, mx);
UNI_MC(epel, h, 6, 4, hz, mx);
UNI_MC(epel, h, 8, 4, hz, mx);
UNI_MC(epel, h, 12, 4, hz, mx);
UNI_MC(epel, h, 16, 4, hz, mx);
UNI_MC(epel, h, 24, 4, hz, mx);
UNI_MC(epel, h, 32, 4, hz, mx);

UNI_MC(epel, v, 4, 4, vt, my);
UNI_MC(epel, v, 6, 4, vt, my);
UNI_MC(epel, v, 8, 4, vt, my);
UNI_MC(epel, v, 12, 4, vt, my);
UNI_MC(epel, v, 16, 4, vt, my);
UNI_MC(epel, v, 24, 4, vt, my);
UNI_MC(epel, v, 32, 4, vt, my);

#undef UNI_MC
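/* Combined horizontal + vertical wrappers: both filter tables are indexed
 * by the fractional offsets mx and my. */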
#define UNI_MC_HV(PEL, WIDTH, TAP)                                         \
void ff_hevc_put_hevc_uni_##PEL##_hv##WIDTH##_8_msa(uint8_t *dst,          \
                                                    ptrdiff_t dst_stride,  \
                                                    uint8_t *src,          \
                                                    ptrdiff_t src_stride,  \
                                                    int height,            \
                                                    intptr_t mx,           \
                                                    intptr_t my,           \
                                                    int width)             \
{                                                                          \
    const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1];              \
    const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1];              \
                                                                           \
    hevc_hv_uni_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, dst_stride,  \
                                        filter_x, filter_y, height);       \
}

UNI_MC_HV(qpel, 4, 8);
UNI_MC_HV(qpel, 8, 8);
UNI_MC_HV(qpel, 12, 8);
UNI_MC_HV(qpel, 16, 8);
UNI_MC_HV(qpel, 24, 8);
UNI_MC_HV(qpel, 32, 8);
UNI_MC_HV(qpel, 48, 8);
UNI_MC_HV(qpel, 64, 8);

UNI_MC_HV(epel, 4, 4);
UNI_MC_HV(epel, 6, 4);
UNI_MC_HV(epel, 8, 4);
UNI_MC_HV(epel, 12, 4);
UNI_MC_HV(epel, 16, 4);
UNI_MC_HV(epel, 24, 4);
UNI_MC_HV(epel, 32, 4);

#undef UNI_MC_HV