/*
 * Copyright (c) 2015 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */

#include "libavutil/mips/generic_macros_msa.h"
#include "libavcodec/mips/hevcdsp_mips.h"
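
/* One 8-tap FIR step per halfword lane: four DPADD_S.H dot products of
 * signed byte pairs accumulate vec0..vec3 against the four splatted tap
 * pairs filt0..filt3 (a GCC statement expression yielding the v8i16 sum).
 * var_in seeds the accumulator with the (128 << 6) bias that undoes the
 * XORI-by-128 signed conversion of the source pixels. */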
#define HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,                        \
                               filt0, filt1, filt2, filt3, var_in)            \
( {                                                                           \
    v8i16 out;                                                                \
                                                                              \
    out = __msa_dpadd_s_h((v8i16) (var_in), (v16i8) (vec0), (v16i8) (filt0)); \
    out = __msa_dpadd_s_h(out, (v16i8) (vec1), (v16i8) (filt1));              \
    out = __msa_dpadd_s_h(out, (v16i8) (vec2), (v16i8) (filt2));              \
    out = __msa_dpadd_s_h(out, (v16i8) (vec3), (v16i8) (filt3));              \
    out;                                                                      \
} )
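
/* Horizontal 8-tap qpel interpolation, 4 columns wide.  mask0 packs two
 * rows into each shuffle result (VSHF indices 16 and up select bytes from
 * the second source register), so each v8i16 result holds two rows of four
 * coefficients and eight rows are filtered per loop iteration. */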
static void hevc_hz_8t_4w_msa(uint8_t * __restrict src, int32_t src_stride,
                              int16_t * __restrict dst, int32_t dst_stride,
                              const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v8u16 const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };

    src -= 3;

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LOAD_8VECS_SB(src, src_stride,
                      src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);

        XORI_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7,
                        src0, src1, src2, src3, src4, src5, src6, src7, 128);

        vec0 = __msa_vshf_b(mask0, src1, src0);
        vec1 = __msa_vshf_b(mask1, src1, src0);
        vec2 = __msa_vshf_b(mask2, src1, src0);
        vec3 = __msa_vshf_b(mask3, src1, src0);

        dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src3, src2);
        vec1 = __msa_vshf_b(mask1, src3, src2);
        vec2 = __msa_vshf_b(mask2, src3, src2);
        vec3 = __msa_vshf_b(mask3, src3, src2);

        dst1 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src5, src4);
        vec1 = __msa_vshf_b(mask1, src5, src4);
        vec2 = __msa_vshf_b(mask2, src5, src4);
        vec3 = __msa_vshf_b(mask3, src5, src4);

        dst2 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src7, src6);
        vec1 = __msa_vshf_b(mask1, src7, src6);
        vec2 = __msa_vshf_b(mask2, src7, src6);
        vec3 = __msa_vshf_b(mask3, src7, src6);

        dst3 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst0, 1);
        out2 = __msa_copy_u_d((v2i64) dst1, 0);
        out3 = __msa_copy_u_d((v2i64) dst1, 1);
        out4 = __msa_copy_u_d((v2i64) dst2, 0);
        out5 = __msa_copy_u_d((v2i64) dst2, 1);
        out6 = __msa_copy_u_d((v2i64) dst3, 0);
        out7 = __msa_copy_u_d((v2i64) dst3, 1);

        STORE_DWORD(dst, out0);
        dst += dst_stride;
        STORE_DWORD(dst, out1);
        dst += dst_stride;
        STORE_DWORD(dst, out2);
        dst += dst_stride;
        STORE_DWORD(dst, out3);
        dst += dst_stride;
        STORE_DWORD(dst, out4);
        dst += dst_stride;
        STORE_DWORD(dst, out5);
        dst += dst_stride;
        STORE_DWORD(dst, out6);
        dst += dst_stride;
        STORE_DWORD(dst, out7);
        dst += dst_stride;
    }
}
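
/* Horizontal 8-tap qpel interpolation, 8 columns wide: one register per
 * row, shuffled against itself; four rows per iteration, each stored with
 * a single STORE_SH. */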
static void hevc_hz_8t_8w_msa(uint8_t * __restrict src, int32_t src_stride,
                              int16_t * __restrict dst, int32_t dst_stride,
                              const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v8u16 const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= 3;

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LOAD_4VECS_SB(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);

        vec0 = __msa_vshf_b(mask0, src0, src0);
        vec1 = __msa_vshf_b(mask1, src0, src0);
        vec2 = __msa_vshf_b(mask2, src0, src0);
        vec3 = __msa_vshf_b(mask3, src0, src0);

        dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src1, src1);
        vec1 = __msa_vshf_b(mask1, src1, src1);
        vec2 = __msa_vshf_b(mask2, src1, src1);
        vec3 = __msa_vshf_b(mask3, src1, src1);

        dst1 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src2, src2);
        vec1 = __msa_vshf_b(mask1, src2, src2);
        vec2 = __msa_vshf_b(mask2, src2, src2);
        vec3 = __msa_vshf_b(mask3, src2, src2);

        dst2 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src3, src3);
        vec1 = __msa_vshf_b(mask1, src3, src3);
        vec2 = __msa_vshf_b(mask2, src3, src3);
        vec3 = __msa_vshf_b(mask3, src3, src3);

        dst3 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst0, dst);
        dst += dst_stride;
        STORE_SH(dst1, dst);
        dst += dst_stride;
        STORE_SH(dst2, dst);
        dst += dst_stride;
        STORE_SH(dst3, dst);
        dst += dst_stride;
    }
}
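
/* 12 columns = one 8-wide pass plus a 4-wide pass over columns 8..11. */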
static void hevc_hz_8t_12w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    hevc_hz_8t_8w_msa(src, src_stride, dst, dst_stride, filter, height);

    hevc_hz_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, filter, height);
}
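
/* 16 columns: each row is filtered as two independent 8-column halves,
 * loaded from src and src + 8. */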
static void hevc_hz_8t_16w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec;
    v8u16 const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= 3;

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LOAD_4VECS_SB(src, src_stride, src0, src2, src4, src6);
        LOAD_4VECS_SB(src + 8, src_stride, src1, src3, src5, src7);
        src += (4 * src_stride);

        XORI_B_8VECS_SB(src0, src1, src2, src3, src4, src5, src6, src7,
                        src0, src1, src2, src3, src4, src5, src6, src7, 128);

        vec0 = __msa_vshf_b(mask0, src0, src0);
        vec1 = __msa_vshf_b(mask1, src0, src0);
        vec2 = __msa_vshf_b(mask2, src0, src0);
        vec3 = __msa_vshf_b(mask3, src0, src0);

        dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src1, src1);
        vec1 = __msa_vshf_b(mask1, src1, src1);
        vec2 = __msa_vshf_b(mask2, src1, src1);
        vec3 = __msa_vshf_b(mask3, src1, src1);

        dst1 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src2, src2);
        vec1 = __msa_vshf_b(mask1, src2, src2);
        vec2 = __msa_vshf_b(mask2, src2, src2);
        vec3 = __msa_vshf_b(mask3, src2, src2);

        dst2 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src3, src3);
        vec1 = __msa_vshf_b(mask1, src3, src3);
        vec2 = __msa_vshf_b(mask2, src3, src3);
        vec3 = __msa_vshf_b(mask3, src3, src3);

        dst3 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src4, src4);
        vec1 = __msa_vshf_b(mask1, src4, src4);
        vec2 = __msa_vshf_b(mask2, src4, src4);
        vec3 = __msa_vshf_b(mask3, src4, src4);

        dst4 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src5, src5);
        vec1 = __msa_vshf_b(mask1, src5, src5);
        vec2 = __msa_vshf_b(mask2, src5, src5);
        vec3 = __msa_vshf_b(mask3, src5, src5);

        dst5 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src6, src6);
        vec1 = __msa_vshf_b(mask1, src6, src6);
        vec2 = __msa_vshf_b(mask2, src6, src6);
        vec3 = __msa_vshf_b(mask3, src6, src6);

        dst6 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src7, src7);
        vec1 = __msa_vshf_b(mask1, src7, src7);
        vec2 = __msa_vshf_b(mask2, src7, src7);
        vec3 = __msa_vshf_b(mask3, src7, src7);

        dst7 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst0, dst);
        STORE_SH(dst1, dst + 8);
        dst += dst_stride;
        STORE_SH(dst2, dst);
        STORE_SH(dst3, dst + 8);
        dst += dst_stride;
        STORE_SH(dst4, dst);
        STORE_SH(dst5, dst + 8);
        dst += dst_stride;
        STORE_SH(dst6, dst);
        STORE_SH(dst7, dst + 8);
        dst += dst_stride;
    }
}
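
/* 24 columns: mask4..mask7 straddle the boundary between a row's two
 * 16-byte loads, so the middle eight outputs draw bytes from both
 * registers via VSHF; two rows are produced per iteration. */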
static void hevc_hz_8t_24w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec;
    v8u16 const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= 3;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        src0 = LOAD_SB(src);
        src1 = LOAD_SB(src + 16);
        src += src_stride;
        src2 = LOAD_SB(src);
        src3 = LOAD_SB(src + 16);
        src += src_stride;

        XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);

        vec0 = __msa_vshf_b(mask0, src0, src0);
        vec1 = __msa_vshf_b(mask1, src0, src0);
        vec2 = __msa_vshf_b(mask2, src0, src0);
        vec3 = __msa_vshf_b(mask3, src0, src0);

        dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask4, src1, src0);
        vec1 = __msa_vshf_b(mask5, src1, src0);
        vec2 = __msa_vshf_b(mask6, src1, src0);
        vec3 = __msa_vshf_b(mask7, src1, src0);

        dst1 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src1, src1);
        vec1 = __msa_vshf_b(mask1, src1, src1);
        vec2 = __msa_vshf_b(mask2, src1, src1);
        vec3 = __msa_vshf_b(mask3, src1, src1);

        dst2 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src2, src2);
        vec1 = __msa_vshf_b(mask1, src2, src2);
        vec2 = __msa_vshf_b(mask2, src2, src2);
        vec3 = __msa_vshf_b(mask3, src2, src2);

        dst3 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask4, src3, src2);
        vec1 = __msa_vshf_b(mask5, src3, src2);
        vec2 = __msa_vshf_b(mask6, src3, src2);
        vec3 = __msa_vshf_b(mask7, src3, src2);

        dst4 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src3, src3);
        vec1 = __msa_vshf_b(mask1, src3, src3);
        vec2 = __msa_vshf_b(mask2, src3, src3);
        vec3 = __msa_vshf_b(mask3, src3, src3);

        dst5 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst0, dst);
        STORE_SH(dst1, dst + 8);
        STORE_SH(dst2, dst + 16);
        dst += dst_stride;
        STORE_SH(dst3, dst);
        STORE_SH(dst4, dst + 8);
        STORE_SH(dst5, dst + 16);
        dst += dst_stride;
    }
}
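
/* 32 columns per row as 8 + 8 + 8 + 8; the overlapped load at src + 24
 * lets the last eight columns reuse mask0..mask3 instead of needing
 * cross-register masks. */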
static void hevc_hz_8t_32w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3;
    v8i16 filter_vec;
    v8u16 const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= 3;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        src0 = LOAD_SB(src);
        src1 = LOAD_SB(src + 16);
        src2 = LOAD_SB(src + 24);
        src += src_stride;

        XORI_B_3VECS_SB(src0, src1, src2, src0, src1, src2, 128);

        vec0 = __msa_vshf_b(mask0, src0, src0);
        vec1 = __msa_vshf_b(mask1, src0, src0);
        vec2 = __msa_vshf_b(mask2, src0, src0);
        vec3 = __msa_vshf_b(mask3, src0, src0);

        dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask4, src1, src0);
        vec1 = __msa_vshf_b(mask5, src1, src0);
        vec2 = __msa_vshf_b(mask6, src1, src0);
        vec3 = __msa_vshf_b(mask7, src1, src0);

        dst1 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src1, src1);
        vec1 = __msa_vshf_b(mask1, src1, src1);
        vec2 = __msa_vshf_b(mask2, src1, src1);
        vec3 = __msa_vshf_b(mask3, src1, src1);

        dst2 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src2, src2);
        vec1 = __msa_vshf_b(mask1, src2, src2);
        vec2 = __msa_vshf_b(mask2, src2, src2);
        vec3 = __msa_vshf_b(mask3, src2, src2);

        dst3 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst0, dst);
        STORE_SH(dst1, dst + 8);
        STORE_SH(dst2, dst + 16);
        STORE_SH(dst3, dst + 24);
        dst += dst_stride;
    }
}
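
/* 48 columns per row from four loads (src, +16, +32, +40); the overlapped
 * load at src + 40 covers the final eight columns with mask0..mask3. */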
static void hevc_hz_8t_48w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
    v8i16 filter_vec;
    v8u16 const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= 3;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        src0 = LOAD_SB(src);
        src1 = LOAD_SB(src + 16);
        src2 = LOAD_SB(src + 32);
        src3 = LOAD_SB(src + 40);
        src += src_stride;

        XORI_B_4VECS_SB(src0, src1, src2, src3, src0, src1, src2, src3, 128);

        vec0 = __msa_vshf_b(mask0, src0, src0);
        vec1 = __msa_vshf_b(mask1, src0, src0);
        vec2 = __msa_vshf_b(mask2, src0, src0);
        vec3 = __msa_vshf_b(mask3, src0, src0);

        dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask4, src1, src0);
        vec1 = __msa_vshf_b(mask5, src1, src0);
        vec2 = __msa_vshf_b(mask6, src1, src0);
        vec3 = __msa_vshf_b(mask7, src1, src0);

        dst1 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src1, src1);
        vec1 = __msa_vshf_b(mask1, src1, src1);
        vec2 = __msa_vshf_b(mask2, src1, src1);
        vec3 = __msa_vshf_b(mask3, src1, src1);

        dst2 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask4, src2, src1);
        vec1 = __msa_vshf_b(mask5, src2, src1);
        vec2 = __msa_vshf_b(mask6, src2, src1);
        vec3 = __msa_vshf_b(mask7, src2, src1);

        dst3 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src2, src2);
        vec1 = __msa_vshf_b(mask1, src2, src2);
        vec2 = __msa_vshf_b(mask2, src2, src2);
        vec3 = __msa_vshf_b(mask3, src2, src2);

        dst4 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        vec0 = __msa_vshf_b(mask0, src3, src3);
        vec1 = __msa_vshf_b(mask1, src3, src3);
        vec2 = __msa_vshf_b(mask2, src3, src3);
        vec3 = __msa_vshf_b(mask3, src3, src3);

        dst5 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst0, dst);
        STORE_SH(dst1, dst + 8);
        STORE_SH(dst2, dst + 16);
        STORE_SH(dst3, dst + 24);
        STORE_SH(dst4, dst + 32);
        STORE_SH(dst5, dst + 40);
        dst += dst_stride;
    }
}
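
/* 64 columns per row; each dst vector is stored as soon as it is computed,
 * keeping fewer result registers live across the iteration. */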
static void hevc_hz_8t_64w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4;
    v8i16 filt0, filt1, filt2, filt3;
    v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
    v16i8 vec0, vec1, vec2, vec3;
    v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8i16 filter_vec;
    v8u16 const_vec;
    v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };

    src -= 3;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    mask1 = mask0 + 2;
    mask2 = mask0 + 4;
    mask3 = mask0 + 6;
    mask4 = mask0 + 8;
    mask5 = mask0 + 10;
    mask6 = mask0 + 12;
    mask7 = mask0 + 14;

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    for (loop_cnt = height; loop_cnt--;) {
        src0 = LOAD_SB(src);
        src1 = LOAD_SB(src + 16);
        src2 = LOAD_SB(src + 32);
        src3 = LOAD_SB(src + 48);
        src4 = LOAD_SB(src + 56);
        src += src_stride;

        XORI_B_5VECS_SB(src0, src1, src2, src3, src4,
                        src0, src1, src2, src3, src4, 128);

        vec0 = __msa_vshf_b(mask0, src0, src0);
        vec1 = __msa_vshf_b(mask1, src0, src0);
        vec2 = __msa_vshf_b(mask2, src0, src0);
        vec3 = __msa_vshf_b(mask3, src0, src0);

        dst0 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst0, dst);

        vec0 = __msa_vshf_b(mask4, src1, src0);
        vec1 = __msa_vshf_b(mask5, src1, src0);
        vec2 = __msa_vshf_b(mask6, src1, src0);
        vec3 = __msa_vshf_b(mask7, src1, src0);

        dst1 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst1, dst + 8);

        vec0 = __msa_vshf_b(mask0, src1, src1);
        vec1 = __msa_vshf_b(mask1, src1, src1);
        vec2 = __msa_vshf_b(mask2, src1, src1);
        vec3 = __msa_vshf_b(mask3, src1, src1);

        dst2 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst2, dst + 16);

        vec0 = __msa_vshf_b(mask4, src2, src1);
        vec1 = __msa_vshf_b(mask5, src2, src1);
        vec2 = __msa_vshf_b(mask6, src2, src1);
        vec3 = __msa_vshf_b(mask7, src2, src1);

        dst3 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst3, dst + 24);

        vec0 = __msa_vshf_b(mask0, src2, src2);
        vec1 = __msa_vshf_b(mask1, src2, src2);
        vec2 = __msa_vshf_b(mask2, src2, src2);
        vec3 = __msa_vshf_b(mask3, src2, src2);

        dst4 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst4, dst + 32);

        vec0 = __msa_vshf_b(mask4, src3, src2);
        vec1 = __msa_vshf_b(mask5, src3, src2);
        vec2 = __msa_vshf_b(mask6, src3, src2);
        vec3 = __msa_vshf_b(mask7, src3, src2);

        dst5 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst5, dst + 40);

        vec0 = __msa_vshf_b(mask0, src3, src3);
        vec1 = __msa_vshf_b(mask1, src3, src3);
        vec2 = __msa_vshf_b(mask2, src3, src3);
        vec3 = __msa_vshf_b(mask3, src3, src3);

        dst6 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst6, dst + 48);

        vec0 = __msa_vshf_b(mask0, src4, src4);
        vec1 = __msa_vshf_b(mask1, src4, src4);
        vec2 = __msa_vshf_b(mask2, src4, src4);
        vec3 = __msa_vshf_b(mask3, src4, src4);

        dst7 = HEVC_FILT_8TAP_DPADD_H(vec0, vec1, vec2, vec3,
                                      filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst7, dst + 56);

        dst += dst_stride;
    }
}
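
/* Vertical 8-tap qpel interpolation, 4 columns wide: ILVR_B interleaves
 * adjacent rows and ILVR_D packs two such pairs per register, so the same
 * pair-wise dot-product macro used horizontally walks down the columns.
 * Eight output rows are produced per iteration and the packed interleaves
 * of the last seven input rows are recycled across iterations. */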
static void hevc_vt_8t_4w_msa(uint8_t * __restrict src, int32_t src_stride,
                              int16_t * __restrict dst, int32_t dst_stride,
                              const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16i8 src9, src10, src11, src12, src13, src14;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v16i8 src12111110, src14131312;
    v8i16 dst10, dst32, dst54, dst76;
    v8i16 filter_vec;
    v8i16 filt0, filt1, filt2, filt3;
    v8u16 const_vec;

    src -= (3 * src_stride);

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    LOAD_7VECS_SB(src, src_stride,
                  src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                    src1, src3, src5, src2, src4, src6,
                    src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);

    ILVR_D_3VECS_SB(src2110, src21_r, src10_r, src4332, src43_r, src32_r,
                    src6554, src65_r, src54_r);

    XORI_B_3VECS_SB(src2110, src4332, src6554, src2110, src4332, src6554, 128);

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LOAD_8VECS_SB(src, src_stride,
                      src7, src8, src9, src10, src11, src12, src13, src14);
        src += (8 * src_stride);

        ILVR_B_8VECS_SB(src6, src7, src8, src9, src10, src11, src12, src13,
                        src7, src8, src9, src10, src11, src12, src13, src14,
                        src76_r, src87_r, src98_r, src109_r,
                        src1110_r, src1211_r, src1312_r, src1413_r);

        ILVR_D_4VECS_SB(src8776, src87_r, src76_r, src10998, src109_r, src98_r,
                        src12111110, src1211_r, src1110_r,
                        src14131312, src1413_r, src1312_r);

        XORI_B_4VECS_SB(src8776, src10998, src12111110, src14131312,
                        src8776, src10998, src12111110, src14131312, 128);

        dst10 = HEVC_FILT_8TAP_DPADD_H(src2110, src4332, src6554, src8776,
                                       filt0, filt1, filt2, filt3, const_vec);

        dst32 = HEVC_FILT_8TAP_DPADD_H(src4332, src6554, src8776, src10998,
                                       filt0, filt1, filt2, filt3, const_vec);

        dst54 = HEVC_FILT_8TAP_DPADD_H(src6554, src8776, src10998, src12111110,
                                       filt0, filt1, filt2, filt3, const_vec);

        dst76 = HEVC_FILT_8TAP_DPADD_H(src8776, src10998,
                                       src12111110, src14131312,
                                       filt0, filt1, filt2, filt3, const_vec);

        out0 = __msa_copy_u_d((v2i64) dst10, 0);
        out1 = __msa_copy_u_d((v2i64) dst10, 1);
        out2 = __msa_copy_u_d((v2i64) dst32, 0);
        out3 = __msa_copy_u_d((v2i64) dst32, 1);

        STORE_DWORD(dst, out0);
        dst += dst_stride;
        STORE_DWORD(dst, out1);
        dst += dst_stride;
        STORE_DWORD(dst, out2);
        dst += dst_stride;
        STORE_DWORD(dst, out3);
        dst += dst_stride;

        out0 = __msa_copy_u_d((v2i64) dst54, 0);
        out1 = __msa_copy_u_d((v2i64) dst54, 1);
        out2 = __msa_copy_u_d((v2i64) dst76, 0);
        out3 = __msa_copy_u_d((v2i64) dst76, 1);

        STORE_DWORD(dst, out0);
        dst += dst_stride;
        STORE_DWORD(dst, out1);
        dst += dst_stride;
        STORE_DWORD(dst, out2);
        dst += dst_stride;
        STORE_DWORD(dst, out3);
        dst += dst_stride;

        src2110 = src10998;
        src4332 = src12111110;
        src6554 = src14131312;

        src6 = src14;
    }
}
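
/* Vertical 8-tap qpel interpolation, 8 columns wide: the sliding window of
 * row interleaves is shifted down after every group of four output rows. */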
static void hevc_vt_8t_8w_msa(uint8_t * __restrict src, int32_t src_stride,
                              int16_t * __restrict dst, int32_t dst_stride,
                              const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v8i16 filter_vec;
    v8i16 filt0, filt1, filt2, filt3;
    v8u16 const_vec;

    src -= (3 * src_stride);

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    LOAD_7VECS_SB(src, src_stride,
                  src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
                    src0, src1, src2, src3, src4, src5, src6, 128);

    ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                    src1, src3, src5, src2, src4, src6,
                    src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);

        ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                        src76_r, src87_r, src98_r, src109_r);

        dst0_r = HEVC_FILT_8TAP_DPADD_H(src10_r, src32_r, src54_r, src76_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst1_r = HEVC_FILT_8TAP_DPADD_H(src21_r, src43_r, src65_r, src87_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst2_r = HEVC_FILT_8TAP_DPADD_H(src32_r, src54_r, src76_r, src98_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst3_r = HEVC_FILT_8TAP_DPADD_H(src43_r, src65_r, src87_r, src109_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        STORE_SH(dst0_r, dst);
        dst += dst_stride;
        STORE_SH(dst1_r, dst);
        dst += dst_stride;
        STORE_SH(dst2_r, dst);
        dst += dst_stride;
        STORE_SH(dst3_r, dst);
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src6 = src10;
    }
}
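
/* 12 columns: the low halves of the row interleaves (_r) yield columns
 * 0..7 as in the 8-wide case, while the high halves (_l), packed two pairs
 * per register with ILVR_D, yield columns 8..11. */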
static void hevc_vt_8t_12w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    uint32_t loop_cnt;
    uint64_t out0, out1, out2, out3;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v16i8 src2110, src4332, src6554, src8776, src10998;
    v8i16 dst0_l, dst1_l;
    v8i16 filter_vec;
    v8i16 filt0, filt1, filt2, filt3;
    v8u16 const_vec;

    src -= (3 * src_stride);

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    LOAD_7VECS_SB(src, src_stride,
                  src0, src1, src2, src3, src4, src5, src6);
    src += (7 * src_stride);

    XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
                    src0, src1, src2, src3, src4, src5, src6, 128);

    ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                    src1, src3, src5, src2, src4, src6,
                    src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);

    ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                    src1, src3, src5, src2, src4, src6,
                    src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);

    ILVR_D_3VECS_SB(src2110, src21_l, src10_l, src4332, src43_l, src32_l,
                    src6554, src65_l, src54_l);

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LOAD_4VECS_SB(src, src_stride, src7, src8, src9, src10);
        src += (4 * src_stride);

        XORI_B_4VECS_SB(src7, src8, src9, src10, src7, src8, src9, src10, 128);

        ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                        src76_r, src87_r, src98_r, src109_r);

        ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                        src76_l, src87_l, src98_l, src109_l);

        ILVR_D_2VECS_SB(src8776, src87_l, src76_l, src10998, src109_l, src98_l);

        dst0_r = HEVC_FILT_8TAP_DPADD_H(src10_r, src32_r, src54_r, src76_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst1_r = HEVC_FILT_8TAP_DPADD_H(src21_r, src43_r, src65_r, src87_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst2_r = HEVC_FILT_8TAP_DPADD_H(src32_r, src54_r, src76_r, src98_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst3_r = HEVC_FILT_8TAP_DPADD_H(src43_r, src65_r, src87_r, src109_r,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst0_l = HEVC_FILT_8TAP_DPADD_H(src2110, src4332, src6554, src8776,
                                        filt0, filt1, filt2, filt3, const_vec);

        dst1_l = HEVC_FILT_8TAP_DPADD_H(src4332, src6554, src8776, src10998,
                                        filt0, filt1, filt2, filt3, const_vec);

        out0 = __msa_copy_u_d((v2i64) dst0_l, 0);
        out1 = __msa_copy_u_d((v2i64) dst0_l, 1);
        out2 = __msa_copy_u_d((v2i64) dst1_l, 0);
        out3 = __msa_copy_u_d((v2i64) dst1_l, 1);

        STORE_SH(dst0_r, dst);
        STORE_DWORD(dst + 8, out0);
        dst += dst_stride;
        STORE_SH(dst1_r, dst);
        STORE_DWORD(dst + 8, out1);
        dst += dst_stride;
        STORE_SH(dst2_r, dst);
        STORE_DWORD(dst + 8, out2);
        dst += dst_stride;
        STORE_SH(dst3_r, dst);
        STORE_DWORD(dst + 8, out3);
        dst += dst_stride;

        src10_r = src54_r;
        src32_r = src76_r;
        src54_r = src98_r;
        src21_r = src65_r;
        src43_r = src87_r;
        src65_r = src109_r;

        src2110 = src6554;
        src4332 = src8776;
        src6554 = src10998;

        src6 = src10;
    }
}
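
/* Generic vertical kernel: walks the block in 16-column tiles, producing
 * four rows per inner iteration; the low (_r) and high (_l) interleaves
 * give the left and right eight columns of each tile. */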
static void hevc_vt_8t_16multx4mult_msa(uint8_t * __restrict src,
                                        int32_t src_stride,
                                        int16_t * __restrict dst,
                                        int32_t dst_stride,
                                        const int8_t * __restrict filter,
                                        int32_t height, int32_t width)
{
    uint8_t *src_tmp;
    int16_t *dst_tmp;
    int32_t loop_cnt, cnt;
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
    v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
    v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
    v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
    v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
    v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
    v8i16 filter_vec;
    v8i16 filt0, filt1, filt2, filt3;
    v8u16 const_vec;

    src -= (3 * src_stride);

    const_vec = (v8u16) __msa_ldi_h(128);
    const_vec <<= 6;

    filter_vec = LOAD_SH(filter);
    filt0 = __msa_splati_h(filter_vec, 0);
    filt1 = __msa_splati_h(filter_vec, 1);
    filt2 = __msa_splati_h(filter_vec, 2);
    filt3 = __msa_splati_h(filter_vec, 3);

    for (cnt = width >> 4; cnt--;) {
        src_tmp = src;
        dst_tmp = dst;

        LOAD_7VECS_SB(src_tmp, src_stride,
                      src0, src1, src2, src3, src4, src5, src6);
        src_tmp += (7 * src_stride);

        XORI_B_7VECS_SB(src0, src1, src2, src3, src4, src5, src6,
                        src0, src1, src2, src3, src4, src5, src6, 128);

        ILVR_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                        src1, src3, src5, src2, src4, src6,
                        src10_r, src32_r, src54_r, src21_r, src43_r, src65_r);

        ILVL_B_6VECS_SB(src0, src2, src4, src1, src3, src5,
                        src1, src3, src5, src2, src4, src6,
                        src10_l, src32_l, src54_l, src21_l, src43_l, src65_l);

        for (loop_cnt = (height >> 2); loop_cnt--;) {
            LOAD_4VECS_SB(src_tmp, src_stride, src7, src8, src9, src10);
            src_tmp += (4 * src_stride);

            XORI_B_4VECS_SB(src7, src8, src9, src10,
                            src7, src8, src9, src10, 128);

            ILVR_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                            src76_r, src87_r, src98_r, src109_r);

            ILVL_B_4VECS_SB(src6, src7, src8, src9, src7, src8, src9, src10,
                            src76_l, src87_l, src98_l, src109_l);

            dst0_r = HEVC_FILT_8TAP_DPADD_H(src10_r, src32_r, src54_r, src76_r,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            dst1_r = HEVC_FILT_8TAP_DPADD_H(src21_r, src43_r, src65_r, src87_r,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            dst2_r = HEVC_FILT_8TAP_DPADD_H(src32_r, src54_r, src76_r, src98_r,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            dst3_r = HEVC_FILT_8TAP_DPADD_H(src43_r, src65_r, src87_r,
                                            src109_r,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            dst0_l = HEVC_FILT_8TAP_DPADD_H(src10_l, src32_l, src54_l, src76_l,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            dst1_l = HEVC_FILT_8TAP_DPADD_H(src21_l, src43_l, src65_l, src87_l,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            dst2_l = HEVC_FILT_8TAP_DPADD_H(src32_l, src54_l, src76_l, src98_l,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            dst3_l = HEVC_FILT_8TAP_DPADD_H(src43_l, src65_l, src87_l,
                                            src109_l,
                                            filt0, filt1, filt2, filt3,
                                            const_vec);

            STORE_SH(dst0_r, dst_tmp);
            STORE_SH(dst0_l, dst_tmp + 8);
            dst_tmp += dst_stride;
            STORE_SH(dst1_r, dst_tmp);
            STORE_SH(dst1_l, dst_tmp + 8);
            dst_tmp += dst_stride;
            STORE_SH(dst2_r, dst_tmp);
            STORE_SH(dst2_l, dst_tmp + 8);
            dst_tmp += dst_stride;
            STORE_SH(dst3_r, dst_tmp);
            STORE_SH(dst3_l, dst_tmp + 8);
            dst_tmp += dst_stride;

            src10_r = src54_r;
            src32_r = src76_r;
            src54_r = src98_r;
            src21_r = src65_r;
            src43_r = src87_r;
            src65_r = src109_r;

            src10_l = src54_l;
            src32_l = src76_l;
            src54_l = src98_l;
            src21_l = src65_l;
            src43_l = src87_l;
            src65_l = src109_l;

            src6 = src10;
        }

        src += 16;
        dst += 16;
    }
}
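
/* Fixed-width vertical entry points built on the 16-column kernel; the
 * 24-wide case adds an 8-wide pass for the rightmost columns. */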
static void hevc_vt_8t_16w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 16);
}

static void hevc_vt_8t_24w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 16);

    hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
                      filter, height);
}

static void hevc_vt_8t_32w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 32);
}

static void hevc_vt_8t_48w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 48);
}

static void hevc_vt_8t_64w_msa(uint8_t * __restrict src, int32_t src_stride,
                               int16_t * __restrict dst, int32_t dst_stride,
                               const int8_t * __restrict filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 64);
}
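
/* Glue macro generating the ff_hevc_put_hevc_* entry points called through
 * the decoder's dsp table.  FILT_DIR selects mx or my as the 1-based index
 * into the qpel filter table (fraction 0 means no filtering in that
 * direction, hence the "- 1").  The parameter list follows FFmpeg's
 * put_hevc_qpel prototype. */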
#define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR)                        \
void ff_hevc_put_hevc_##PEL##_##DIR####WIDTH##_8_msa(int16_t *dst,      \
                                                     uint8_t *src,      \
                                                     ptrdiff_t src_stride, \
                                                     int height,        \
                                                     intptr_t mx,       \
                                                     intptr_t my,       \
                                                     int width)         \
{                                                                       \
    const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1];       \
                                                                        \
    hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst,         \
                                          MAX_PB_SIZE, filter, height); \
}

MC(qpel, h, 4, 8, hz, mx);
MC(qpel, h, 8, 8, hz, mx);
MC(qpel, h, 12, 8, hz, mx);
MC(qpel, h, 16, 8, hz, mx);
MC(qpel, h, 24, 8, hz, mx);
MC(qpel, h, 32, 8, hz, mx);
MC(qpel, h, 48, 8, hz, mx);
MC(qpel, h, 64, 8, hz, mx);

MC(qpel, v, 4, 8, vt, my);
MC(qpel, v, 8, 8, vt, my);
MC(qpel, v, 12, 8, vt, my);
MC(qpel, v, 16, 8, vt, my);
MC(qpel, v, 24, 8, vt, my);
MC(qpel, v, 32, 8, vt, my);
MC(qpel, v, 48, 8, vt, my);
MC(qpel, v, 64, 8, vt, my);