/*
 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Byte-shuffle mask table for MSA VSHF operations, 64-byte aligned.
 * Row 0 pairs adjacent bytes 0..8 within a single vector; row 1 pairs
 * bytes of two source vectors (indices >= 16 select from the second).
 * NOTE(review): this capture has original line numbers fused onto each
 * line and some lines elided; code tokens kept byte-identical. */
25 static const uint8_t ff_hevc_mask_arr[16 * 2] __attribute__((aligned(0x40))) = {
27 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8,
29 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
/* Copy a 4-pixel-wide block from 8-bit src into the 16-bit dst buffer:
 * widen each byte by interleaving with zero, then shift left by 6
 * (SLLI_4V(..., 6) visible below). Special-cases height 2 and 4, and
 * loops 8 rows at a time when height is a multiple of 8.
 * NOTE(review): several statements appear elided in this capture
 * (fused line numbers are non-contiguous). */
32 static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
33 int16_t *dst, int32_t dst_stride,
42 LD_SB2(src, src_stride, src0, src1);
/* Pack the two 4-byte rows into one vector, then zero-extend to 16 bit. */
44 src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
45 in0 = (v8i16) __msa_ilvr_b(zero, src0);
47 ST8x2_UB(in0, dst, 2 * dst_stride);
48 } else if (4 == height) {
49 v16i8 src0, src1, src2, src3;
52 LD_SB4(src, src_stride, src0, src1, src2, src3);
54 ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
55 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
58 ST8x4_UB(in0, in1, dst, 2 * dst_stride);
59 } else if (0 == height % 8) {
60 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
61 v8i16 in0, in1, in2, in3;
/* Main path: process 8 rows per iteration. */
64 for (loop_cnt = (height >> 3); loop_cnt--;) {
65 LD_SB8(src, src_stride,
66 src0, src1, src2, src3, src4, src5, src6, src7);
67 src += (8 * src_stride);
69 ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
70 src0, src1, src2, src3);
71 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
73 SLLI_4V(in0, in1, in2, in3, 6);
74 ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
75 dst += (8 * dst_stride);
/* Copy a 6-pixel-wide block from 8-bit src to 16-bit dst, 8 rows per
 * iteration: zero-interleave bytes to 16 bit, shift left by 6, and use
 * ST12x8_UB to store the 6-sample (12-byte) rows. */
80 static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
81 int16_t *dst, int32_t dst_stride,
86 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
87 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
89 for (loop_cnt = (height >> 3); loop_cnt--;) {
90 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
91 src += (8 * src_stride);
93 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
95 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
97 SLLI_4V(in0, in1, in2, in3, 6);
98 SLLI_4V(in4, in5, in6, in7, 6);
99 ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
100 dst += (8 * dst_stride);
/* Copy an 8-pixel-wide block from 8-bit src to 16-bit dst: widen via
 * zero-interleave, shift left by 6, store full 8-sample rows with
 * ST_SH*. Special-cases heights 2, 4 and 6; otherwise loops 8 rows at
 * a time when height is a multiple of 8. */
104 static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
105 int16_t *dst, int32_t dst_stride,
114 LD_SB2(src, src_stride, src0, src1);
116 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
119 ST_SH2(in0, in1, dst, dst_stride);
120 } else if (4 == height) {
121 v16i8 src0, src1, src2, src3;
122 v8i16 in0, in1, in2, in3;
124 LD_SB4(src, src_stride, src0, src1, src2, src3);
126 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
128 SLLI_4V(in0, in1, in2, in3, 6);
129 ST_SH4(in0, in1, in2, in3, dst, dst_stride);
130 } else if (6 == height) {
131 v16i8 src0, src1, src2, src3, src4, src5;
132 v8i16 in0, in1, in2, in3, in4, in5;
134 LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
136 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
138 ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
139 SLLI_4V(in0, in1, in2, in3, 6);
142 ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
143 } else if (0 == height % 8) {
145 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
146 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* Main path: 8 rows per iteration. */
148 for (loop_cnt = (height >> 3); loop_cnt--;) {
149 LD_SB8(src, src_stride,
150 src0, src1, src2, src3, src4, src5, src6, src7);
151 src += (8 * src_stride);
153 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
155 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
157 SLLI_4V(in0, in1, in2, in3, 6);
158 SLLI_4V(in4, in5, in6, in7, 6);
159 ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
160 dst += (8 * dst_stride);
/* Copy a 12-pixel-wide block: the left 8 samples are widened via
 * right-interleave (in*_r) and stored with ST_SH4; the remaining 4
 * samples come from the left halves (ILVL_W2_SB) and are stored at
 * dst + 8 with ST8x4_UB. Processes 8 rows per loop iteration in two
 * 4-row halves. */
165 static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
166 int16_t *dst, int32_t dst_stride,
171 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
172 v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;
174 for (loop_cnt = (height >> 3); loop_cnt--;) {
175 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
176 src += (8 * src_stride);
/* First 4 rows: widen left 8 samples, then gather the 4 rightmost. */
178 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
179 in0_r, in1_r, in2_r, in3_r);
180 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
181 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
182 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
185 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
186 ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
187 dst += (4 * dst_stride);
/* Second 4 rows: same pattern for src4..src7. */
189 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
190 in0_r, in1_r, in2_r, in3_r);
191 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
192 ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
193 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
196 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
197 ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
198 dst += (4 * dst_stride);
/* Copy a 16-pixel-wide block from 8-bit src to 16-bit dst. Each input
 * row splits into a right half (ILVR_B4_SH -> in*_r, stored at dst)
 * and a left half (ILVL_B4_SH -> in*_l, stored at dst + 8), both
 * shifted left by 6. Special-cases height 12; otherwise loops 8 rows
 * per iteration when height is a multiple of 8. */
202 static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
203 int16_t *dst, int32_t dst_stride,
209 v16i8 src0, src1, src2, src3;
210 v8i16 in0_r, in1_r, in2_r, in3_r;
211 v8i16 in0_l, in1_l, in2_l, in3_l;
213 LD_SB4(src, src_stride, src0, src1, src2, src3);
215 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
216 in0_r, in1_r, in2_r, in3_r);
217 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
218 in0_l, in1_l, in2_l, in3_l);
219 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
220 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
221 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
222 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
223 } else if (12 == height) {
224 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
225 v16i8 src8, src9, src10, src11;
226 v8i16 in0_r, in1_r, in2_r, in3_r;
227 v8i16 in0_l, in1_l, in2_l, in3_l;
229 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
230 src += (8 * src_stride);
231 LD_SB4(src, src_stride, src8, src9, src10, src11);
/* Rows 0-3. */
233 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
234 in0_r, in1_r, in2_r, in3_r);
235 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
236 in0_l, in1_l, in2_l, in3_l);
237 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
238 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
239 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
240 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
241 dst += (4 * dst_stride);
/* Rows 4-7. */
243 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
244 in0_r, in1_r, in2_r, in3_r);
245 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
246 in0_l, in1_l, in2_l, in3_l);
247 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
248 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
249 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
250 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
251 dst += (4 * dst_stride);
/* Rows 8-11. */
253 ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
254 in0_r, in1_r, in2_r, in3_r);
255 ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
256 in0_l, in1_l, in2_l, in3_l);
257 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
258 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
259 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
260 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
261 } else if (0 == (height % 8)) {
263 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
264 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
/* Main path: 8 rows per iteration, handled as two 4-row halves. */
266 for (loop_cnt = (height >> 3); loop_cnt--;) {
267 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
269 src += (8 * src_stride);
270 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
271 in1_r, in2_r, in3_r);
272 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
273 in1_l, in2_l, in3_l);
274 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
275 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
276 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
277 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
278 dst += (4 * dst_stride);
280 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
281 in1_r, in2_r, in3_r);
282 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
283 in1_l, in2_l, in3_l);
284 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
285 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
286 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
287 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
288 dst += (4 * dst_stride);
/* Copy a 24-pixel-wide block, 4 rows per iteration: the first 16
 * columns use both right and left interleaves (stored at dst and
 * dst + 8); the trailing 8 columns (loaded from src + 16) need only
 * the right interleave, stored at dst + 16. All values shifted << 6. */
293 static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
294 int16_t *dst, int32_t dst_stride,
299 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
300 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
302 for (loop_cnt = (height >> 2); loop_cnt--;) {
303 LD_SB4(src, src_stride, src0, src1, src2, src3);
304 LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
305 src += (4 * src_stride);
306 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
308 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
310 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
311 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
312 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
313 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
/* Columns 16-23. */
314 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
316 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
317 ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
318 dst += (4 * dst_stride);
/* Copy a 32-pixel-wide block, 4 rows per iteration. Each row is two
 * 16-byte loads (even-numbered src vectors hold columns 0-15, odd hold
 * 16-31); widened halves are stored contiguously with ST_SH4(..., 8).
 * All values shifted << 6. */
322 static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
323 int16_t *dst, int32_t dst_stride,
328 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
329 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
331 for (loop_cnt = (height >> 2); loop_cnt--;) {
332 LD_SB4(src, src_stride, src0, src2, src4, src6);
333 LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
334 src += (4 * src_stride);
336 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
338 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
340 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
341 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
342 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
344 ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
/* Rows 2-3 (src4..src7). */
347 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
349 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
351 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
352 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
353 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
355 ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
/* Copy a 48-pixel-wide block, 4 rows per iteration. Each row is three
 * 16-byte loads (LD_SB3 with stride 16); widened right/left halves are
 * stored contiguously in groups of six vectors (ST_SH6), all << 6. */
360 static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
361 int16_t *dst, int32_t dst_stride,
366 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
367 v16i8 src8, src9, src10, src11;
368 v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
369 v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
371 for (loop_cnt = (height >> 2); loop_cnt--;) {
372 LD_SB3(src, 16, src0, src1, src2);
374 LD_SB3(src, 16, src3, src4, src5);
376 LD_SB3(src, 16, src6, src7, src8);
378 LD_SB3(src, 16, src9, src10, src11);
/* Rows 0-1 (src0..src5). */
381 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
382 in0_r, in1_r, in2_r, in3_r);
383 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
384 in0_l, in1_l, in2_l, in3_l);
385 ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r);
386 ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l);
387 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
388 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
389 SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
390 ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
392 ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
/* Rows 2-3 (src6..src11). */
395 ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
396 in0_r, in1_r, in2_r, in3_r);
397 ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
398 in0_l, in1_l, in2_l, in3_l);
399 ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r);
400 ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l);
401 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
402 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
403 SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
404 ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
406 ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
/* Copy a 64-pixel-wide block, 2 rows per iteration. Each row is four
 * 16-byte loads; widened halves are stored as two ST_SH4 groups (the
 * second at dst + 32), all values shifted << 6. */
411 static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
412 int16_t *dst, int32_t dst_stride,
417 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
418 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
420 for (loop_cnt = (height >> 1); loop_cnt--;) {
421 LD_SB4(src, 16, src0, src1, src2, src3);
423 LD_SB4(src, 16, src4, src5, src6, src7);
/* Row 0 (src0..src3). */
426 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
427 in0_r, in1_r, in2_r, in3_r);
428 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
429 in0_l, in1_l, in2_l, in3_l);
430 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
431 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
432 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
433 ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
/* Row 1 (src4..src7). */
436 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
437 in0_r, in1_r, in2_r, in3_r);
438 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
439 in0_l, in1_l, in2_l, in3_l);
440 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
441 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
442 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
443 ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
/* Horizontal 8-tap filter, 4-pixel-wide block, 16-bit output.
 * Shuffles each pair of rows with mask0..mask3 (mask0's indices >= 16
 * select from the second row vector, so two 4-wide rows share one
 * filter pass), then accumulates the four taps pairwise via
 * DPADD_SB4_SH. Inputs are biased by XORI 128 before filtering.
 * Processes 8 rows per iteration. */
448 static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
449 int16_t *dst, int32_t dst_stride,
450 const int8_t *filter, int32_t height)
453 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
454 v8i16 filt0, filt1, filt2, filt3;
455 v16i8 mask1, mask2, mask3;
456 v16i8 vec0, vec1, vec2, vec3;
457 v8i16 dst0, dst1, dst2, dst3;
458 v8i16 filter_vec, const_vec;
459 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
462 const_vec = __msa_ldi_h(128);
/* Broadcast the four 16-bit tap pairs from the filter array. */
465 filter_vec = LD_SH(filter);
466 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
472 for (loop_cnt = (height >> 3); loop_cnt--;) {
473 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
474 src += (8 * src_stride);
475 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
477 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
478 vec0, vec1, vec2, vec3);
480 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
481 dst0, dst0, dst0, dst0);
482 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
483 vec0, vec1, vec2, vec3);
485 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
486 dst1, dst1, dst1, dst1);
487 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
488 vec0, vec1, vec2, vec3);
490 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
491 dst2, dst2, dst2, dst2);
492 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
493 vec0, vec1, vec2, vec3);
495 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
496 dst3, dst3, dst3, dst3);
498 ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
499 dst += (8 * dst_stride);
/* Horizontal 8-tap filter, 8-pixel-wide block, 16-bit output.
 * One VSHF/DPADD pass per row (mask0 indexes within a single vector
 * here), 4 rows per loop iteration; rows stored with ST_SH4. */
503 static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
504 int16_t *dst, int32_t dst_stride,
505 const int8_t *filter, int32_t height)
508 v16i8 src0, src1, src2, src3;
509 v8i16 filt0, filt1, filt2, filt3;
510 v16i8 mask1, mask2, mask3;
511 v16i8 vec0, vec1, vec2, vec3;
512 v8i16 dst0, dst1, dst2, dst3;
513 v8i16 filter_vec, const_vec;
514 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
517 const_vec = __msa_ldi_h(128);
520 filter_vec = LD_SH(filter);
521 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
527 for (loop_cnt = (height >> 2); loop_cnt--;) {
528 LD_SB4(src, src_stride, src0, src1, src2, src3);
529 src += (4 * src_stride);
530 XORI_B4_128_SB(src0, src1, src2, src3);
532 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
533 vec0, vec1, vec2, vec3);
535 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
536 dst0, dst0, dst0, dst0);
537 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
538 vec0, vec1, vec2, vec3);
540 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
541 dst1, dst1, dst1, dst1);
542 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
543 vec0, vec1, vec2, vec3);
545 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
546 dst2, dst2, dst2, dst2);
547 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
548 vec0, vec1, vec2, vec3);
550 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
551 dst3, dst3, dst3, dst3);
553 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
554 dst += (4 * dst_stride);
/* Horizontal 8-tap filter for 12-pixel-wide blocks: composed as an
 * 8-wide pass plus a 4-wide pass on columns 8-11 (src + 8, dst + 8). */
558 static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
559 int16_t *dst, int32_t dst_stride,
560 const int8_t *filter, int32_t height)
562 hevc_hz_8t_8w_msa(src, src_stride, dst, dst_stride, filter, height);
563 hevc_hz_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, filter, height);
/* Horizontal 8-tap filter, 16-pixel-wide block, 16-bit output.
 * Each row is loaded as two overlapping 16-byte vectors (even vectors
 * from src, odd from src + 8) so the shuffle masks can reach all taps;
 * one VSHF/DPADD pass per vector, 4 rows per loop iteration. */
566 static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
567 int16_t *dst, int32_t dst_stride,
568 const int8_t *filter, int32_t height)
571 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
572 v8i16 filt0, filt1, filt2, filt3;
573 v16i8 mask1, mask2, mask3;
574 v16i8 vec0, vec1, vec2, vec3;
575 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
576 v8i16 filter_vec, const_vec;
577 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
580 const_vec = __msa_ldi_h(128);
583 filter_vec = LD_SH(filter);
584 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
590 for (loop_cnt = (height >> 2); loop_cnt--;) {
591 LD_SB4(src, src_stride, src0, src2, src4, src6);
592 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
593 src += (4 * src_stride);
594 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
596 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
597 vec0, vec1, vec2, vec3);
599 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
600 dst0, dst0, dst0, dst0);
601 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
602 vec0, vec1, vec2, vec3);
604 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
605 dst1, dst1, dst1, dst1);
606 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
607 vec0, vec1, vec2, vec3);
609 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
610 dst2, dst2, dst2, dst2);
611 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
612 vec0, vec1, vec2, vec3);
614 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
615 dst3, dst3, dst3, dst3);
616 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
617 vec0, vec1, vec2, vec3);
619 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
620 dst4, dst4, dst4, dst4);
621 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
622 vec0, vec1, vec2, vec3);
624 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
625 dst5, dst5, dst5, dst5);
626 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
627 vec0, vec1, vec2, vec3);
629 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
630 dst6, dst6, dst6, dst6);
631 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
632 vec0, vec1, vec2, vec3);
634 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
635 dst7, dst7, dst7, dst7);
/* Even dst vectors are columns 0-7, odd are columns 8-15. */
637 ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
638 ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
639 dst += (4 * dst_stride);
/* Horizontal 8-tap filter, 24-pixel-wide block, 16-bit output.
 * Two rows per iteration: each row needs two 16-byte loads; the middle
 * 8 output samples straddle the load boundary, so they use the
 * two-vector masks mask4..mask7 (VSHF of src0/src1 and src2/src3). */
643 static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
644 int16_t *dst, int32_t dst_stride,
645 const int8_t *filter, int32_t height)
648 v16i8 src0, src1, src2, src3;
649 v8i16 filt0, filt1, filt2, filt3;
650 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
651 v16i8 vec0, vec1, vec2, vec3;
652 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
653 v8i16 filter_vec, const_vec;
654 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
657 filter_vec = LD_SH(filter);
658 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
668 const_vec = __msa_ldi_h(128);
671 for (loop_cnt = (height >> 1); loop_cnt--;) {
672 LD_SB2(src, 16, src0, src1);
674 LD_SB2(src, 16, src2, src3);
676 XORI_B4_128_SB(src0, src1, src2, src3);
/* Row 0: columns 0-7, 8-15 (cross-vector), 16-23. */
678 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
679 vec0, vec1, vec2, vec3);
681 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
682 dst0, dst0, dst0, dst0);
683 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
684 vec0, vec1, vec2, vec3);
686 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
687 dst1, dst1, dst1, dst1);
688 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
689 vec0, vec1, vec2, vec3);
691 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
692 dst2, dst2, dst2, dst2);
/* Row 1: same pattern on src2/src3. */
693 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
694 vec0, vec1, vec2, vec3);
696 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
697 dst3, dst3, dst3, dst3);
698 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
699 vec0, vec1, vec2, vec3);
701 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
702 dst4, dst4, dst4, dst4);
703 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
704 vec0, vec1, vec2, vec3);
706 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
707 dst5, dst5, dst5, dst5);
709 ST_SH2(dst0, dst1, dst, 8);
710 ST_SH(dst2, dst + 16);
712 ST_SH2(dst3, dst4, dst, 8);
713 ST_SH(dst5, dst + 16);
/* Horizontal 8-tap filter, 32-pixel-wide block, one row per iteration.
 * Loads two 16-byte vectors plus an extra vector at src + 24 for the
 * tail taps; the boundary-straddling outputs use mask4..mask7. */
718 static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
719 int16_t *dst, int32_t dst_stride,
720 const int8_t *filter, int32_t height)
723 v16i8 src0, src1, src2;
724 v8i16 filt0, filt1, filt2, filt3;
725 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
726 v16i8 vec0, vec1, vec2, vec3;
727 v8i16 dst0, dst1, dst2, dst3;
728 v8i16 filter_vec, const_vec;
729 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
732 filter_vec = LD_SH(filter);
733 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
743 const_vec = __msa_ldi_h(128);
746 for (loop_cnt = height; loop_cnt--;) {
747 LD_SB2(src, 16, src0, src1);
748 src2 = LD_SB(src + 24);
750 XORI_B3_128_SB(src0, src1, src2);
752 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
753 vec0, vec1, vec2, vec3);
755 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
756 dst0, dst0, dst0, dst0);
757 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
758 vec0, vec1, vec2, vec3);
760 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
761 dst1, dst1, dst1, dst1);
762 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
763 vec0, vec1, vec2, vec3);
765 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
766 dst2, dst2, dst2, dst2);
767 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
768 vec0, vec1, vec2, vec3);
770 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
771 dst3, dst3, dst3, dst3);
773 ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
/* Horizontal 8-tap filter, 48-pixel-wide block, one row per iteration.
 * Three 16-byte loads plus an extra vector at src + 40 for the tail;
 * outputs that straddle a load boundary use mask4..mask7 with the two
 * adjacent source vectors. Six output vectors stored with ST_SH6. */
778 static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
779 int16_t *dst, int32_t dst_stride,
780 const int8_t *filter, int32_t height)
783 v16i8 src0, src1, src2, src3;
784 v8i16 filt0, filt1, filt2, filt3;
785 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
786 v16i8 vec0, vec1, vec2, vec3;
787 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
788 v8i16 filter_vec, const_vec;
789 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
792 filter_vec = LD_SH(filter);
793 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
803 const_vec = __msa_ldi_h(128);
806 for (loop_cnt = height; loop_cnt--;) {
807 LD_SB3(src, 16, src0, src1, src2);
808 src3 = LD_SB(src + 40);
810 XORI_B4_128_SB(src0, src1, src2, src3);
812 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
813 vec0, vec1, vec2, vec3);
815 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
816 dst0, dst0, dst0, dst0);
817 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
818 vec0, vec1, vec2, vec3);
820 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
821 dst1, dst1, dst1, dst1);
822 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
823 vec0, vec1, vec2, vec3);
825 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
826 dst2, dst2, dst2, dst2);
827 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
828 vec0, vec1, vec2, vec3);
830 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
831 dst3, dst3, dst3, dst3);
832 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
833 vec0, vec1, vec2, vec3);
835 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
836 dst4, dst4, dst4, dst4);
837 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
838 vec0, vec1, vec2, vec3);
840 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
841 dst5, dst5, dst5, dst5);
843 ST_SH6(dst0, dst1, dst2, dst3, dst4, dst5, dst, 8);
/* Horizontal 8-tap filter, 64-pixel-wide block, one row per iteration.
 * Four 16-byte loads plus an extra vector at src + 56; each group of
 * 8 output samples is stored individually (ST_SH at dst + 8*k), with
 * boundary-straddling groups using the two-vector masks mask4..mask7. */
848 static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
849 int16_t *dst, int32_t dst_stride,
850 const int8_t *filter, int32_t height)
853 v16i8 src0, src1, src2, src3, src4;
854 v8i16 filt0, filt1, filt2, filt3;
855 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
856 v16i8 vec0, vec1, vec2, vec3;
857 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
858 v8i16 filter_vec, const_vec;
859 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
863 filter_vec = LD_SH(filter);
864 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
874 const_vec = __msa_ldi_h(128);
877 for (loop_cnt = height; loop_cnt--;) {
878 LD_SB4(src, 16, src0, src1, src2, src3);
879 src4 = LD_SB(src + 56);
881 XORI_B5_128_SB(src0, src1, src2, src3, src4);
883 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
884 vec0, vec1, vec2, vec3);
886 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
887 dst0, dst0, dst0, dst0);
890 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
891 vec0, vec1, vec2, vec3);
893 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
894 dst1, dst1, dst1, dst1);
895 ST_SH(dst1, dst + 8);
897 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
898 vec0, vec1, vec2, vec3);
900 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
901 dst2, dst2, dst2, dst2);
902 ST_SH(dst2, dst + 16);
904 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
905 vec0, vec1, vec2, vec3);
907 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
908 dst3, dst3, dst3, dst3);
909 ST_SH(dst3, dst + 24);
911 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
912 vec0, vec1, vec2, vec3);
914 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
915 dst4, dst4, dst4, dst4);
916 ST_SH(dst4, dst + 32);
918 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
919 vec0, vec1, vec2, vec3);
921 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
922 dst5, dst5, dst5, dst5);
923 ST_SH(dst5, dst + 40);
925 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
926 vec0, vec1, vec2, vec3);
928 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
929 dst6, dst6, dst6, dst6);
930 ST_SH(dst6, dst + 48);
932 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
933 vec0, vec1, vec2, vec3);
935 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
936 dst7, dst7, dst7, dst7);
937 ST_SH(dst7, dst + 56);
/* Vertical 8-tap filter, 4-pixel-wide block, 16-bit output.
 * src is rewound 3 rows so the 8-tap window is centered; adjacent-row
 * byte interleaves (srcNM_r) are packed two rows per vector (ILVR_D*)
 * and accumulated with DPADD_SB4_SH. Processes 8 rows per iteration,
 * carrying the last packed vectors across iterations. */
942 static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
943 int16_t *dst, int32_t dst_stride,
944 const int8_t *filter, int32_t height)
947 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
948 v16i8 src9, src10, src11, src12, src13, src14;
949 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
950 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
951 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
952 v16i8 src2110, src4332, src6554, src8776, src10998;
953 v16i8 src12111110, src14131312;
954 v8i16 dst10, dst32, dst54, dst76;
955 v8i16 filt0, filt1, filt2, filt3;
956 v8i16 filter_vec, const_vec;
/* Back up 3 rows: the filter window spans rows -3..+4. */
958 src -= (3 * src_stride);
960 const_vec = __msa_ldi_h(128);
963 filter_vec = LD_SH(filter);
964 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: load and pack the first 7 rows. */
966 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
967 src += (7 * src_stride);
968 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
969 src10_r, src32_r, src54_r, src21_r);
970 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
971 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
972 src2110, src4332, src6554);
973 XORI_B3_128_SB(src2110, src4332, src6554);
975 for (loop_cnt = (height >> 3); loop_cnt--;) {
976 LD_SB8(src, src_stride,
977 src7, src8, src9, src10, src11, src12, src13, src14);
978 src += (8 * src_stride);
980 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
981 src76_r, src87_r, src98_r, src109_r);
982 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
983 src1110_r, src1211_r, src1312_r, src1413_r);
984 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
985 src1211_r, src1110_r, src1413_r, src1312_r,
986 src8776, src10998, src12111110, src14131312);
987 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
990 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
991 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
993 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
994 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
996 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
997 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
999 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1000 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1002 ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
1003 dst += (8 * dst_stride);
/* Carry the newest packed rows into the next iteration's window. */
1006 src4332 = src12111110;
1007 src6554 = src14131312;
/* Vertical 8-tap filter, 8-pixel-wide block, 16-bit output.
 * src is rewound 3 rows; the prologue interleaves the first 7 rows
 * pairwise (srcNM_r), then each iteration filters 4 new rows with
 * DPADD_SB4_SH and stores them with ST_SH4. */
1012 static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
1013 int16_t *dst, int32_t dst_stride,
1014 const int8_t *filter, int32_t height)
1017 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1018 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1019 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1020 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1021 v8i16 filter_vec, const_vec;
1022 v8i16 filt0, filt1, filt2, filt3;
/* Back up 3 rows: the filter window spans rows -3..+4. */
1024 src -= (3 * src_stride);
1025 const_vec = __msa_ldi_h(128);
1028 filter_vec = LD_SH(filter);
1029 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1031 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1032 src += (7 * src_stride);
1033 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1034 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1035 src10_r, src32_r, src54_r, src21_r);
1036 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1038 for (loop_cnt = (height >> 2); loop_cnt--;) {
1039 LD_SB4(src, src_stride, src7, src8, src9, src10);
1040 src += (4 * src_stride);
1041 XORI_B4_128_SB(src7, src8, src9, src10);
1042 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1043 src76_r, src87_r, src98_r, src109_r);
1046 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1047 filt0, filt1, filt2, filt3,
1048 dst0_r, dst0_r, dst0_r, dst0_r);
1050 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1051 filt0, filt1, filt2, filt3,
1052 dst1_r, dst1_r, dst1_r, dst1_r);
1054 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1055 filt0, filt1, filt2, filt3,
1056 dst2_r, dst2_r, dst2_r, dst2_r);
1058 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1059 filt0, filt1, filt2, filt3,
1060 dst3_r, dst3_r, dst3_r, dst3_r);
1062 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1063 dst += (4 * dst_stride);
/*
 * 8-tap vertical HEVC luma MC, 12-column blocks: columns 0-7 are filtered
 * from the right-half interleaves, columns 8-11 from the left-half
 * interleaves packed pairwise (src2110 etc.) so two rows share one vector.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * accumulator initialisations, history rotation and braces are not shown.
 */
1075 static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1076 int16_t *dst, int32_t dst_stride,
1077 const int8_t *filter, int32_t height)
1080 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1081 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1082 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1083 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1084 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1085 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1086 v16i8 src2110, src4332, src6554, src8776, src10998;
1087 v8i16 dst0_l, dst1_l;
1088 v8i16 filter_vec, const_vec;
1089 v8i16 filt0, filt1, filt2, filt3;
/* rewind 3 rows for the 8-tap window */
1091 src -= (3 * src_stride);
1092 const_vec = __msa_ldi_h(128);
1095 filter_vec = LD_SH(filter);
1096 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 history rows, re-biased to signed */
1098 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1099 src += (7 * src_stride);
1100 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1101 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1102 src10_r, src32_r, src54_r, src21_r);
1103 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1104 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1105 src10_l, src32_l, src54_l, src21_l);
1106 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* pack two left-half interleaves per vector for the 4 extra columns */
1107 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1108 src2110, src4332, src6554);
/* main loop: 4 output rows per iteration */
1110 for (loop_cnt = (height >> 2); loop_cnt--;) {
1111 LD_SB4(src, src_stride, src7, src8, src9, src10);
1112 src += (4 * src_stride);
1113 XORI_B4_128_SB(src7, src8, src9, src10);
1114 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1115 src76_r, src87_r, src98_r, src109_r);
1116 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1117 src76_l, src87_l, src98_l, src109_l);
1118 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* columns 0-7: one accumulator per output row */
1121 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1122 filt0, filt1, filt2, filt3,
1123 dst0_r, dst0_r, dst0_r, dst0_r);
1125 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1126 filt0, filt1, filt2, filt3,
1127 dst1_r, dst1_r, dst1_r, dst1_r);
1129 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1130 filt0, filt1, filt2, filt3,
1131 dst2_r, dst2_r, dst2_r, dst2_r);
1133 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1134 filt0, filt1, filt2, filt3,
1135 dst3_r, dst3_r, dst3_r, dst3_r);
/* columns 8-11: two output rows per accumulator (packed vectors) */
1137 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1138 filt0, filt1, filt2, filt3,
1139 dst0_l, dst0_l, dst0_l, dst0_l);
1141 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1142 filt0, filt1, filt2, filt3,
1143 dst1_l, dst1_l, dst1_l, dst1_l);
/* 8 cols to dst, remaining 4 cols (8 bytes each) to dst + 8 */
1145 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1146 ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
1147 dst += (4 * dst_stride);
/*
 * 8-tap vertical HEVC luma MC for widths that are multiples of 16:
 * processes the frame in 16-column tiles (outer `cnt` loop over width),
 * 4 output rows per inner iteration, using right- and left-half byte
 * interleaves of adjacent rows.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * src_tmp/dst_tmp setup, accumulator init, history rotation and braces
 * are not shown here.
 */
1162 static void hevc_vt_8t_16multx4mult_msa(uint8_t *src,
1166 const int8_t *filter,
1172 int32_t loop_cnt, cnt;
1173 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1174 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1175 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1176 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1177 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1178 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1179 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
1180 v8i16 filter_vec, const_vec;
1181 v8i16 filt0, filt1, filt2, filt3;
/* rewind 3 rows for the 8-tap window */
1183 src -= (3 * src_stride);
1184 const_vec = __msa_ldi_h(128);
1187 filter_vec = LD_SH(filter);
1188 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* one 16-column tile per outer iteration */
1190 for (cnt = width >> 4; cnt--;) {
/* prologue per tile: 7 history rows, re-biased to signed */
1194 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1195 src_tmp += (7 * src_stride);
1196 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1197 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1198 src10_r, src32_r, src54_r, src21_r);
1199 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1200 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1201 src10_l, src32_l, src54_l, src21_l);
1202 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* inner loop: 4 output rows per iteration */
1204 for (loop_cnt = (height >> 2); loop_cnt--;) {
1205 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1206 src_tmp += (4 * src_stride);
1207 XORI_B4_128_SB(src7, src8, src9, src10);
1208 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1209 src76_r, src87_r, src98_r, src109_r);
1210 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1211 src76_l, src87_l, src98_l, src109_l);
/* right halves -> columns 0-7 */
1214 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1215 filt0, filt1, filt2, filt3,
1216 dst0_r, dst0_r, dst0_r, dst0_r);
1218 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1219 filt0, filt1, filt2, filt3,
1220 dst1_r, dst1_r, dst1_r, dst1_r);
1222 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1223 filt0, filt1, filt2, filt3,
1224 dst2_r, dst2_r, dst2_r, dst2_r);
1226 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1227 filt0, filt1, filt2, filt3,
1228 dst3_r, dst3_r, dst3_r, dst3_r);
/* left halves -> columns 8-15 */
1230 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1231 filt0, filt1, filt2, filt3,
1232 dst0_l, dst0_l, dst0_l, dst0_l);
1234 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1235 filt0, filt1, filt2, filt3,
1236 dst1_l, dst1_l, dst1_l, dst1_l);
1238 DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
1239 filt0, filt1, filt2, filt3,
1240 dst2_l, dst2_l, dst2_l, dst2_l);
1242 DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
1243 filt0, filt1, filt2, filt3,
1244 dst3_l, dst3_l, dst3_l, dst3_l);
1246 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
1247 ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
1248 dst_tmp += (4 * dst_stride);
/* 16-column vertical 8-tap MC: exactly one 16-wide tile of the generic
 * multiple-of-16 helper. */
static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 16);
}
/*
 * 24-column vertical 8-tap MC: split as 16 + 8 — one 16-wide tile via the
 * generic helper, then the 8-column routine on the remaining columns.
 * NOTE(review): braces and the tail of the second call are elided in this
 * paste (embedded line numbers jump).
 */
1278 static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1279 int16_t *dst, int32_t dst_stride,
1280 const int8_t *filter, int32_t height)
1282 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1283 filter, height, 16);
1284 hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
/* 32-column vertical 8-tap MC: two 16-wide tiles of the generic helper. */
static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 32);
}
/* 48-column vertical 8-tap MC: three 16-wide tiles of the generic helper. */
static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 48);
}
/* 64-column vertical 8-tap MC: four 16-wide tiles of the generic helper. */
static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
                               int16_t *dst, int32_t dst_stride,
                               const int8_t *filter, int32_t height)
{
    hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
                                filter, height, 64);
}
/*
 * 8-tap horizontal+vertical (2-D) HEVC luma MC, 4-column blocks.
 * Horizontal pass: VSHF gathers taps from two rows at once (4-wide mask
 * from ff_hevc_mask_arr + 16), DPADD produces 16-bit row results.
 * Vertical pass: HEVC_FILT_8TAP on interleaved row pairs, >> 6, pack,
 * store 4 rows of 4 x int16.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * accumulator init, history rotation and braces are not shown.
 */
1312 static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
1313 int16_t *dst, int32_t dst_stride,
1314 const int8_t *filter_x, const int8_t *filter_y,
/* dst is int16_t, so the byte stride used by ST8x4_UB is doubled */
1318 int32_t dst_stride_in_bytes = 2 * dst_stride;
1319 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1320 v8i16 filt0, filt1, filt2, filt3;
1321 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1322 v16i8 mask1, mask2, mask3;
1323 v8i16 filter_vec, const_vec;
1324 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1325 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1326 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1327 v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
1328 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
1329 v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
1330 v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
/* rewind 3 rows and 3 columns so both 8-tap windows are centred */
1332 src -= ((3 * src_stride) + 3);
1333 filter_vec = LD_SH(filter_x);
1334 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* vertical taps widened to 16-bit, then splatted per 32-bit pair */
1336 filter_vec = LD_SH(filter_y);
1337 UNPCK_R_SB_SH(filter_vec, filter_vec);
1339 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1345 const_vec = __msa_ldi_h(128);
/* prologue: 7 history rows, re-biased to signed */
1348 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1349 src += (7 * src_stride);
1350 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* horizontal filter on row pairs (0,3) (1,4) (2,5) (3,6) */
1352 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1353 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1354 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1355 vec8, vec9, vec10, vec11);
1356 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1357 vec12, vec13, vec14, vec15);
1359 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1360 dst30, dst30, dst30, dst30);
1362 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1363 dst41, dst41, dst41, dst41);
1365 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1366 dst52, dst52, dst52, dst52);
1368 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1369 dst63, dst63, dst63, dst63);
/* interleave consecutive row results for the vertical pass */
1371 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1372 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1373 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
/* dst66 = row 6 result, duplicated into both halves */
1374 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
/* main loop: 4 output rows per iteration */
1376 for (loop_cnt = height >> 2; loop_cnt--;) {
1377 LD_SB4(src, src_stride, src7, src8, src9, src10);
1378 src += (4 * src_stride);
1379 XORI_B4_128_SB(src7, src8, src9, src10);
1381 VSHF_B4_SB(src7, src9, mask0, mask1, mask2, mask3,
1382 vec0, vec1, vec2, vec3);
1383 VSHF_B4_SB(src8, src10, mask0, mask1, mask2, mask3,
1384 vec4, vec5, vec6, vec7);
1387 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1388 dst97, dst97, dst97, dst97);
1389 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1390 dst108, dst108, dst108, dst108);
1392 dst76_r = __msa_ilvr_h(dst97, dst66);
1393 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1394 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1395 dst98_r = __msa_ilvr_h(dst66, dst108);
/* vertical 8-tap on the interleaved 16-bit rows */
1397 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1398 filt_h0, filt_h1, filt_h2, filt_h3);
1399 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1400 filt_h0, filt_h1, filt_h2, filt_h3);
1401 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r,
1402 filt_h0, filt_h1, filt_h2, filt_h3);
1403 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r,
1404 filt_h0, filt_h1, filt_h2, filt_h3);
/* scale back from the horizontal-pass headroom and pack to int16 */
1405 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1406 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1407 ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
1408 dst += (4 * dst_stride);
/* carry the last horizontal result into the next iteration */
1416 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/*
 * 8-tap 2-D (horizontal then vertical) HEVC luma MC for widths that are
 * multiples of 8: outer loop over 8-column tiles, inner loop produces one
 * output row per iteration (dst7 recomputed, history rotated).
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * src_tmp/dst_tmp setup, mask1-3 setup, accumulator init, the >>6 shift
 * on dst0_r/dst0_l, history rotation and braces are not shown.
 */
1420 static void hevc_hv_8t_8multx1mult_msa(uint8_t *src,
1424 const int8_t *filter_x,
1425 const int8_t *filter_y,
1426 int32_t height, int32_t width)
1428 uint32_t loop_cnt, cnt;
1431 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1432 v8i16 filt0, filt1, filt2, filt3;
1433 v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
1434 v16i8 mask1, mask2, mask3;
1435 v8i16 filter_vec, const_vec;
1436 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1437 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1438 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1439 v4i32 dst0_r, dst0_l;
1440 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1441 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1442 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
/* rewind 3 rows and 3 columns for the two 8-tap windows */
1444 src -= ((3 * src_stride) + 3);
1445 filter_vec = LD_SH(filter_x);
1446 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1448 filter_vec = LD_SH(filter_y);
1449 UNPCK_R_SB_SH(filter_vec, filter_vec);
1451 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1457 const_vec = __msa_ldi_h(128);
/* one 8-column tile per outer iteration */
1460 for (cnt = width >> 3; cnt--;) {
1464 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1465 src_tmp += (7 * src_stride);
1466 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* horizontal pass over the 7 history rows */
1468 /* row 0 row 1 row 2 row 3 */
1469 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1470 vec0, vec1, vec2, vec3);
1471 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1472 vec4, vec5, vec6, vec7);
1473 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1474 vec8, vec9, vec10, vec11);
1475 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1476 vec12, vec13, vec14, vec15);
1478 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1479 dst0, dst0, dst0, dst0);
1481 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1482 dst1, dst1, dst1, dst1);
1484 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1485 dst2, dst2, dst2, dst2);
1487 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1488 dst3, dst3, dst3, dst3);
1490 /* row 4 row 5 row 6 */
1491 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1492 vec0, vec1, vec2, vec3);
1493 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1494 vec4, vec5, vec6, vec7);
1495 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1496 vec8, vec9, vec10, vec11);
1498 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1499 dst4, dst4, dst4, dst4);
1501 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1502 dst5, dst5, dst5, dst5);
1504 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1505 dst6, dst6, dst6, dst6);
/* one output row per iteration */
1507 for (loop_cnt = height; loop_cnt--;) {
1508 src7 = LD_SB(src_tmp);
1509 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1510 src_tmp += src_stride;
1512 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1513 vec0, vec1, vec2, vec3);
1515 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1516 dst7, dst7, dst7, dst7);
/* vertical 8-tap on interleaved 16-bit row pairs */
1518 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1519 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1520 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1521 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1522 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1523 filt_h0, filt_h1, filt_h2, filt_h3);
1524 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1525 filt_h0, filt_h1, filt_h2, filt_h3);
1529 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1530 ST_SW(dst0_r, dst_tmp);
1531 dst_tmp += dst_stride;
/* 8-column 2-D 8-tap MC: one 8-wide tile of the generic helper.
 * NOTE(review): the `height` parameter line and braces are elided in this
 * paste (embedded line numbers jump). */
1547 static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
1548 int16_t *dst, int32_t dst_stride,
1549 const int8_t *filter_x, const int8_t *filter_y,
1552 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1553 filter_x, filter_y, height, 8);
/*
 * 8-tap 2-D HEVC luma MC, 12-column blocks: columns 0-7 use the full
 * one-row-per-iteration path (mask0-3, like the 8-mult helper); columns
 * 8-11 use the two-rows-per-vector 4-column path with mask4-7 taken from
 * ff_hevc_mask_arr + 16.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * src_tmp/dst_tmp setup, mask setup, accumulator init, >>6 shift for the
 * 8-column path, history rotation and braces are not shown.
 */
1556 static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
1557 int16_t *dst, int32_t dst_stride,
1558 const int8_t *filter_x, const int8_t *filter_y,
/* dst is int16_t, so the byte stride used by ST8x4_UB is doubled */
1562 int32_t dst_stride_in_bytes = 2 * dst_stride;
1565 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1566 v16i8 mask0, mask1, mask2, mask3, mask4, mask5, mask6, mask7;
1567 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1568 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1569 v8i16 filt0, filt1, filt2, filt3, filt_h0, filt_h1, filt_h2, filt_h3;
1570 v8i16 filter_vec, const_vec;
1571 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1572 v8i16 dst30, dst41, dst52, dst63, dst66, dst97, dst108;
1573 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
1574 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
1575 v4i32 dst0_r, dst0_l, dst1_r, dst2_r, dst3_r;
/* rewind 3 rows and 3 columns for the two 8-tap windows */
1577 src -= ((3 * src_stride) + 3);
1578 filter_vec = LD_SH(filter_x);
1579 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
1581 filter_vec = LD_SH(filter_y);
1582 UNPCK_R_SB_SH(filter_vec, filter_vec);
1584 SPLATI_W4_SH(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1586 mask0 = LD_SB(ff_hevc_mask_arr);
1591 const_vec = __msa_ldi_h(128);
/* --- part 1: columns 0-7, one output row per iteration --- */
1597 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1598 src_tmp += (7 * src_stride);
1599 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1601 /* row 0 row 1 row 2 row 3 */
1602 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1603 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1604 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1606 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3, vec12, vec13, vec14,
1609 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst0, dst0,
1612 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst1, dst1,
1615 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst2,
1618 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst3,
1621 /* row 4 row 5 row 6 */
1622 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1623 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1624 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3, vec8, vec9, vec10,
1627 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst4, dst4,
1630 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst5, dst5,
1633 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst6,
1636 for (loop_cnt = height; loop_cnt--;) {
1637 src7 = LD_SB(src_tmp);
1638 src7 = (v16i8) __msa_xori_b((v16u8) src7, 128);
1639 src_tmp += src_stride;
1641 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3, vec0, vec1, vec2,
1644 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst7,
1647 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
1648 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
1649 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
1650 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1651 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1652 filt_h1, filt_h2, filt_h3);
1653 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l, filt_h0,
1654 filt_h1, filt_h2, filt_h3);
1658 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1659 ST_SW(dst0_r, dst_tmp);
1660 dst_tmp += dst_stride;
/* --- part 2: columns 8-11, 4-wide path with paired-row masks --- */
1674 mask4 = LD_SB(ff_hevc_mask_arr + 16);
1679 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1680 src += (7 * src_stride);
1681 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1683 VSHF_B4_SB(src0, src3, mask4, mask5, mask6, mask7, vec0, vec1, vec2, vec3);
1684 VSHF_B4_SB(src1, src4, mask4, mask5, mask6, mask7, vec4, vec5, vec6, vec7);
1685 VSHF_B4_SB(src2, src5, mask4, mask5, mask6, mask7, vec8, vec9, vec10,
1687 VSHF_B4_SB(src3, src6, mask4, mask5, mask6, mask7, vec12, vec13, vec14,
1690 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst30,
1691 dst30, dst30, dst30);
1693 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst41,
1694 dst41, dst41, dst41);
1696 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3, dst52,
1697 dst52, dst52, dst52);
1699 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3, dst63,
1700 dst63, dst63, dst63);
1702 ILVRL_H2_SH(dst41, dst30, dst10_r, dst43_r);
1703 ILVRL_H2_SH(dst52, dst41, dst21_r, dst54_r);
1704 ILVRL_H2_SH(dst63, dst52, dst32_r, dst65_r);
/* dst66 = row 6 result, duplicated into both halves */
1706 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
/* 4 output rows per iteration */
1708 for (loop_cnt = height >> 2; loop_cnt--;) {
1709 LD_SB4(src, src_stride, src7, src8, src9, src10);
1710 src += (4 * src_stride);
1711 XORI_B4_128_SB(src7, src8, src9, src10);
1713 VSHF_B4_SB(src7, src9, mask4, mask5, mask6, mask7, vec0, vec1, vec2,
1715 VSHF_B4_SB(src8, src10, mask4, mask5, mask6, mask7, vec4, vec5, vec6,
1719 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3, dst97,
1720 dst97, dst97, dst97);
1721 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3, dst108,
1722 dst108, dst108, dst108);
1724 dst76_r = __msa_ilvr_h(dst97, dst66);
1725 ILVRL_H2_SH(dst108, dst97, dst87_r, dst109_r);
1726 dst66 = (v8i16) __msa_splati_d((v2i64) dst97, 1);
1727 dst98_r = __msa_ilvr_h(dst66, dst108);
1729 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r, filt_h0,
1730 filt_h1, filt_h2, filt_h3);
1731 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r, filt_h0,
1732 filt_h1, filt_h2, filt_h3);
1733 dst2_r = HEVC_FILT_8TAP(dst32_r, dst54_r, dst76_r, dst98_r, filt_h0,
1734 filt_h1, filt_h2, filt_h3);
1735 dst3_r = HEVC_FILT_8TAP(dst43_r, dst65_r, dst87_r, dst109_r, filt_h0,
1736 filt_h1, filt_h2, filt_h3);
1737 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
1738 PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
1739 ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
1740 dst += (4 * dst_stride);
/* carry the last horizontal result into the next iteration */
1748 dst66 = (v8i16) __msa_splati_d((v2i64) dst108, 1);
/* 16-column 2-D 8-tap MC: two 8-wide tiles of the generic helper.
 * NOTE(review): the `height` parameter line and braces are elided here. */
1752 static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
1753 int16_t *dst, int32_t dst_stride,
1754 const int8_t *filter_x, const int8_t *filter_y,
1757 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1758 filter_x, filter_y, height, 16);
/* 24-column 2-D 8-tap MC: three 8-wide tiles of the generic helper.
 * NOTE(review): the `height` parameter line and braces are elided here. */
1761 static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
1762 int16_t *dst, int32_t dst_stride,
1763 const int8_t *filter_x, const int8_t *filter_y,
1766 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1767 filter_x, filter_y, height, 24);
/* 32-column 2-D 8-tap MC: four 8-wide tiles of the generic helper.
 * NOTE(review): the `height` parameter line and braces are elided here. */
1770 static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
1771 int16_t *dst, int32_t dst_stride,
1772 const int8_t *filter_x, const int8_t *filter_y,
1775 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1776 filter_x, filter_y, height, 32);
/* 48-column 2-D 8-tap MC: six 8-wide tiles of the generic helper.
 * NOTE(review): the `height` parameter line and braces are elided here. */
1779 static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
1780 int16_t *dst, int32_t dst_stride,
1781 const int8_t *filter_x, const int8_t *filter_y,
1784 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1785 filter_x, filter_y, height, 48);
/* 64-column 2-D 8-tap MC: eight 8-wide tiles of the generic helper.
 * NOTE(review): the `height` parameter line and braces are elided here. */
1788 static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
1789 int16_t *dst, int32_t dst_stride,
1790 const int8_t *filter_x, const int8_t *filter_y,
1793 hevc_hv_8t_8multx1mult_msa(src, src_stride, dst, dst_stride,
1794 filter_x, filter_y, height, 64);
/*
 * 4-tap horizontal HEVC chroma MC, 4 columns x 2 rows: the mask indexes
 * byte 16+ to pull the second row into the same shuffle, so one DPADD pair
 * produces both rows at once.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * several declarations, mask1 setup, dst0 init and braces are not shown.
 */
1797 static void hevc_hz_4t_4x2_msa(uint8_t *src,
1801 const int8_t *filter)
1805 v16i8 mask1, vec0, vec1;
1807 v8i16 filter_vec, const_vec;
1808 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1812 filter_vec = LD_SH(filter);
1813 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1817 const_vec = __msa_ldi_h(128);
/* load both rows, re-bias to signed, gather taps, dot-product */
1820 LD_SB2(src, src_stride, src0, src1);
1821 XORI_B2_128_SB(src0, src1);
1823 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1825 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
/* store 2 rows of 4 x int16 (8 bytes each) */
1827 ST8x2_UB(dst0, dst, 2 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma MC, 4 columns x 4 rows: two row pairs,
 * each pair filtered in one shuffle + dot-product (mask spans two rows).
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * mask1 setup, dst0/dst1 init and braces are not shown.
 */
1830 static void hevc_hz_4t_4x4_msa(uint8_t *src,
1834 const int8_t *filter)
1837 v16i8 src0, src1, src2, src3;
1838 v16i8 mask1, vec0, vec1;
1840 v8i16 filter_vec, const_vec;
1841 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1845 filter_vec = LD_SH(filter);
1846 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1850 const_vec = __msa_ldi_h(128);
1853 LD_SB4(src, src_stride, src0, src1, src2, src3);
1854 XORI_B4_128_SB(src0, src1, src2, src3);
/* rows 0+1 into dst0, rows 2+3 into dst1 */
1856 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1858 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1860 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1862 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
/* store 4 rows of 4 x int16 */
1864 ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma MC, 4 columns, heights that are multiples
 * of 8: 8 rows per iteration, processed as four row pairs.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * mask1 setup, dstN init and braces are not shown.
 */
1867 static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
1871 const int8_t *filter,
1876 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1877 v16i8 mask1, vec0, vec1;
1878 v8i16 dst0, dst1, dst2, dst3;
1879 v8i16 filter_vec, const_vec;
1880 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1884 filter_vec = LD_SH(filter);
1885 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1889 const_vec = __msa_ldi_h(128);
/* 8 rows per iteration, one dst vector per row pair */
1892 for (loop_cnt = (height >> 3); loop_cnt--;) {
1893 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1894 src += (8 * src_stride);
1896 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1898 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1900 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1901 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1903 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1904 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
1906 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
1907 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
1909 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
1911 ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
1912 dst += (8 * dst_stride);
/*
 * 4-column 4-tap horizontal MC dispatcher: picks the 2-row, 4-row or
 * 8-row-multiple specialisation based on `height`.
 * NOTE(review): parameter lines, the leading `if (2 == height)` and the
 * tail of the last call are elided in this paste (line numbers jump).
 */
1916 static void hevc_hz_4t_4w_msa(uint8_t *src,
1920 const int8_t *filter,
1924 hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
1925 } else if (4 == height) {
1926 hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1927 } else if (0 == height % 8) {
1928 hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
/*
 * 4-tap horizontal HEVC chroma MC, 6 columns: each row's 8 filtered
 * results are stored as 8 bytes (cols 0-3) plus a 4-byte word (cols 4-5).
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * mask1 setup, dstN init, the SD stores for dst_val0..3 and braces are
 * not shown.
 */
1933 static void hevc_hz_4t_6w_msa(uint8_t *src,
1937 const int8_t *filter,
1941 uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
1942 uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
1943 v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
1944 v16i8 src0, src1, src2, src3;
1945 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1948 v8i16 filter_vec, const_vec;
1952 filter_vec = LD_SH(filter);
1953 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1957 const_vec = __msa_ldi_h(128);
/* 4 rows per iteration */
1960 for (loop_cnt = (height >> 2); loop_cnt--;) {
1961 LD_SB4(src, src_stride, src0, src1, src2, src3);
1962 src += (4 * src_stride);
1964 XORI_B4_128_SB(src0, src1, src2, src3);
1966 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
1968 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1969 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
1971 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1972 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
1974 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
1975 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
1977 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* doubleword = columns 0-3 of each row */
1979 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1980 dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
1981 dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
1982 dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);
/* word (element 2) = columns 4-5 of each row */
1984 dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
1985 dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
1986 dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
1987 dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);
1990 SW(dst_val_int0, dst + 4);
1993 SW(dst_val_int1, dst + 4);
1996 SW(dst_val_int2, dst + 4);
1999 SW(dst_val_int3, dst + 4);
/*
 * 4-tap horizontal HEVC chroma MC, 8 columns, 2 rows per iteration.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * some declarations, mask1 setup, dst init and braces are not shown.
 */
2004 static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
2008 const int8_t *filter,
2012 v8i16 filt0, filt1, dst0, dst1;
2014 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2017 v8i16 filter_vec, const_vec;
2021 filter_vec = LD_SH(filter);
2022 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2026 const_vec = __msa_ldi_h(128);
/* 2 rows per iteration, one dst vector per row */
2029 for (loop_cnt = (height >> 1); loop_cnt--;) {
2030 LD_SB2(src, src_stride, src0, src1);
2031 src += (2 * src_stride);
2033 XORI_B2_128_SB(src0, src1);
2035 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2037 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2039 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2041 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2043 ST_SH2(dst0, dst1, dst, dst_stride);
2044 dst += (2 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma MC, 8 columns, 4 rows per iteration.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * mask1 setup, dstN init and braces are not shown.
 */
2048 static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
2052 const int8_t *filter,
2057 v16i8 src0, src1, src2, src3;
2058 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2061 v8i16 dst0, dst1, dst2, dst3;
2062 v8i16 filter_vec, const_vec;
2066 filter_vec = LD_SH(filter);
2067 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2071 const_vec = __msa_ldi_h(128);
/* 4 rows per iteration, one dst vector per row */
2074 for (loop_cnt = (height >> 2); loop_cnt--;) {
2075 LD_SB4(src, src_stride, src0, src1, src2, src3);
2076 src += (4 * src_stride);
2078 XORI_B4_128_SB(src0, src1, src2, src3);
2080 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2082 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2084 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2086 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2088 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2090 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2092 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2094 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2096 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2097 dst += (4 * dst_stride);
/*
 * 8-column 4-tap horizontal MC dispatcher: heights 2/6 use the 2-rows-
 * per-iteration variant, otherwise the 4-rows-per-iteration variant.
 * NOTE(review): parameter lines, the `else` branch head, call tails and
 * braces are elided in this paste (embedded line numbers jump).
 */
2101 static void hevc_hz_4t_8w_msa(uint8_t *src,
2105 const int8_t *filter,
2108 if (2 == height || 6 == height) {
2109 hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride,
2112 hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
/*
 * 4-tap horizontal HEVC chroma MC, 12 columns: columns 0-7 filtered per
 * row (mask0/mask1), columns 8-11 filtered two rows at a time with the
 * cross-register masks (mask2/mask3, indexing bytes 8.. and 24..).
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * mask1/mask3 setup, the mask2 initialiser's first line, dstN init and
 * braces are not shown.
 */
2117 static void hevc_hz_4t_12w_msa(uint8_t *src,
2121 const int8_t *filter,
2126 v16i8 src0, src1, src2, src3;
2129 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2130 v8i16 filter_vec, const_vec;
2132 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2134 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2139 filter_vec = LD_SH(filter);
2140 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2145 const_vec = __msa_ldi_h(128);
/* 4 rows per iteration */
2148 for (loop_cnt = (height >> 2); loop_cnt--;) {
2149 LD_SB4(src, src_stride, src0, src1, src2, src3);
2150 src += (4 * src_stride);
2151 XORI_B4_128_SB(src0, src1, src2, src3);
/* columns 0-7, one dst vector per row */
2153 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2155 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2156 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2158 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2159 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2161 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2162 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2164 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* columns 8-11, two rows per dst vector */
2165 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2167 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2168 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2170 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2172 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2173 ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
2174 dst += (4 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma MC, 16 columns: each row is loaded as two
 * overlapping 16-byte vectors (even index = cols 0.., odd = cols 8..),
 * filtered per half, 4 rows per iteration.
 * NOTE(review): paste has elided lines (embedded line numbers jump);
 * mask1 setup, dstN init and braces are not shown.
 */
2178 static void hevc_hz_4t_16w_msa(uint8_t *src,
2182 const int8_t *filter,
2186 v16i8 src0, src1, src2, src3;
2187 v16i8 src4, src5, src6, src7;
2189 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2191 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2193 v8i16 filter_vec, const_vec;
2197 filter_vec = LD_SH(filter);
2198 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2202 const_vec = __msa_ldi_h(128);
/* 4 rows per iteration: left halves at src, right halves at src + 8 */
2205 for (loop_cnt = (height >> 2); loop_cnt--;) {
2206 LD_SB4(src, src_stride, src0, src2, src4, src6);
2207 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2208 src += (4 * src_stride);
2210 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2212 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2214 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2216 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2218 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2220 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2222 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2224 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2226 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2228 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2230 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2232 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2234 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2236 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2238 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2240 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2242 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
/* even dst vectors -> columns 0-7, odd -> columns 8-15 */
2244 ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
2245 ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
2246 dst += (4 * dst_stride);
/* 4-tap horizontal filter, 24-pixel-wide block.
 * Columns 0..15 are produced into dst, columns 16..23 into
 * dst_tmp = dst + 16.  Per row, a 16-byte vector at src and another at
 * src + 16 are loaded; mask00/mask11 (offsets +8/+10 into the pair)
 * select the bytes straddling the two loads.  4 rows per iteration.
 * NOTE(review): dstN = const_vec initializers, mask00 setup and
 * dst += ... pointer advances are elided here; verify in full source. */
2250 static void hevc_hz_4t_24w_msa(uint8_t *src,
2254                                const int8_t *filter,
2258     int16_t *dst_tmp = dst + 16;
2259     v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2261     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2262     v16i8 mask1, mask00, mask11;
2264     v8i16 dst0, dst1, dst2, dst3;
2265     v8i16 filter_vec, const_vec;
2269     filter_vec = LD_SH(filter);
2270     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* mask11 selects bytes 10..18 of the concatenated (srcN, srcN+1) pair. */
2274     mask11 = mask0 + 10;
2276     const_vec = __msa_ldi_h(128);
2279     for (loop_cnt = (height >> 2); loop_cnt--;) {
/* srcN even = bytes 0..15 of a row, srcN odd = bytes 16..31. */
2281         LD_SB4(src, src_stride, src0, src2, src4, src6);
2282         LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2283         src += (4 * src_stride);
2285         XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* Rows 0 and 1, columns 0..15. */
2287         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2289         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2291         VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
2293         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2295         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2297         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2299         VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
2301         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2303         ST_SH2(dst0, dst1, dst, 8);
2305         ST_SH2(dst2, dst3, dst, 8);
/* Rows 2 and 3, columns 0..15. */
2308         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2310         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2312         VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
2314         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2316         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2318         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2320         VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
2322         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2324         ST_SH2(dst0, dst1, dst, 8);
2326         ST_SH2(dst2, dst3, dst, 8);
/* All 4 rows, columns 16..23 (right-hand 8-wide strip into dst_tmp). */
2330         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2332         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2334         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2336         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2338         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2340         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2342         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2344         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2346         ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
2347         dst_tmp += (4 * dst_stride);
/* 4-tap horizontal filter, 32-pixel-wide block, 2 rows per iteration
 * (the loop body filters one row, then the same sequence is repeated
 * for the next row).  Per row: src0 = bytes 0..15, src1 = bytes 16..31,
 * src2 = bytes 24..39 (extra tail for the shuffle overlap);
 * mask2/mask3 pick the bytes straddling src0/src1.
 * NOTE(review): mask1..mask3 setup, dstN = const_vec initializers and
 * src/dst pointer advances are elided here; confirm in full source. */
2351 static void hevc_hz_4t_32w_msa(uint8_t *src,
2355                                const int8_t *filter,
2359     v16i8 src0, src1, src2;
2361     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2362     v16i8 mask1, mask2, mask3;
2363     v8i16 dst0, dst1, dst2, dst3;
2365     v8i16 filter_vec, const_vec;
2369     filter_vec = LD_SH(filter);
2370     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2372     const_vec = __msa_ldi_h(128);
2379     for (loop_cnt = (height >> 1); loop_cnt--;) {
2380         LD_SB2(src, 16, src0, src1);
2381         src2 = LD_SB(src + 24);
2384         XORI_B3_128_SB(src0, src1, src2);
2386         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2388         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2390         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2392         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2394         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2396         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2398         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2400         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* 4 x 8 shorts = one full 32-wide output row. */
2402         ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
/* Second row of the pair: identical sequence. */
2405         LD_SB2(src, 16, src0, src1);
2406         src2 = LD_SB(src + 24);
2409         XORI_B3_128_SB(src0, src1, src2);
2411         VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2413         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2415         VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2417         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2419         VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2421         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2423         VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2425         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2427         ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
/* 4-tap vertical filter, 4 pixels wide, exactly 2 output rows.
 * Loads 5 source rows, interleaves adjacent rows (ILVR_B) and packs
 * two 4-wide column groups per vector (ILVR_D) so one dot-product pair
 * produces both output rows at once.
 * NOTE(review): dst10 = const_vec initializer is elided here. */
2432 static void hevc_vt_4t_4x2_msa(uint8_t *src,
2436                                const int8_t *filter)
2438     v16i8 src0, src1, src2, src3, src4;
2439     v16i8 src10_r, src32_r, src21_r, src43_r;
2440     v16i8 src2110, src4332;
2443     v8i16 filter_vec, const_vec;
2447     const_vec = __msa_ldi_h(128);
2450     filter_vec = LD_SH(filter);
2451     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2453     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2454     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2455                src10_r, src21_r, src32_r, src43_r);
2457     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
/* Un-sign after interleaving (cheaper: 2 XORs instead of 5). */
2458     XORI_B2_128_SB(src2110, src4332);
2460     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2462     ST8x2_UB(dst10, dst, 2 * dst_stride);
/* 4-tap vertical filter, 4 pixels wide, exactly 4 output rows.
 * Same packing trick as the 4x2 variant, extended to 7 input rows and
 * two dot-product pairs (dst10 for rows 0/1, dst32 for rows 2/3).
 * NOTE(review): dst10/dst32 = const_vec initializers are elided here. */
2465 static void hevc_vt_4t_4x4_msa(uint8_t *src,
2469                                const int8_t *filter,
2472     v16i8 src0, src1, src2, src3, src4, src5, src6;
2473     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2474     v16i8 src2110, src4332, src6554;
2477     v8i16 filter_vec, const_vec;
2481     const_vec = __msa_ldi_h(128);
2484     filter_vec = LD_SH(filter);
2485     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2487     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2488     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2489                src10_r, src21_r, src32_r, src43_r);
2490     ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
2491     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2492                src2110, src4332, src6554);
2493     XORI_B3_128_SB(src2110, src4332, src6554);
2495     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2497     DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2499     ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
/* 4-tap vertical filter, 4 pixels wide, height a multiple of 8.
 * Keeps a 3-row sliding window: src2110 carries the interleaved
 * previous two row pairs across iterations; each pass loads 8 new rows
 * (6 + 2) and emits 8 output rows.
 * NOTE(review): dstNN = const_vec initializers are elided here. */
2502 static void hevc_vt_4t_4x8multiple_msa(uint8_t *src,
2506                                        const int8_t *filter,
2510     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
2511     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
2512     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
2513     v16i8 src2110, src4332, src6554, src8776;
2514     v8i16 dst10, dst32, dst54, dst76;
2516     v8i16 filter_vec, const_vec;
2519     const_vec = __msa_ldi_h(128);
2522     filter_vec = LD_SH(filter);
2523     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue: prime the window with the first 3 rows. */
2525     LD_SB3(src, src_stride, src0, src1, src2);
2526     src += (3 * src_stride);
2528     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2529     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2530     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2532     for (loop_cnt = (height >> 3); loop_cnt--;) {
2533         LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
2534         src += (6 * src_stride);
2536         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2537                    src32_r, src43_r, src54_r, src65_r);
2538         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2539         ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
2540                    src4332, src6554, src8776);
2541         XORI_B3_128_SB(src4332, src6554, src8776);
2544         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2546         DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2548         DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
/* Last 2 rows of the group; src2 also re-seeds the window for the
 * next iteration. */
2550         LD_SB2(src, src_stride, src9, src2);
2551         src += (2 * src_stride);
2552         ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
2553         src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
2554         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2556         DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
2558         ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
2559         dst += (8 * dst_stride);
/* Dispatcher for the 4-wide vertical filter: picks the specialised
 * kernel for height 2, height 4, or any multiple of 8. */
2563 static void hevc_vt_4t_4w_msa(uint8_t *src,
2567                               const int8_t *filter,
2571         hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2572     } else if (4 == height) {
2573         hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
2574     } else if (0 == (height % 8)) {
2575         hevc_vt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
/* 4-tap vertical filter, 6 pixels wide, 4 rows per iteration.
 * The 8-wide SIMD result is split per row into a 64-bit store (first
 * 4 outputs) plus a 32-bit store at dst + 4 (outputs 4..5), so only
 * 6 int16 values are written per row.
 * NOTE(review): dstN_r = const_vec initializers, the SD stores and the
 * dst advances between SW stores are elided here. */
2580 static void hevc_vt_4t_6w_msa(uint8_t *src,
2584                               const int8_t *filter,
2588     uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
2589     uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
2590     v16i8 src0, src1, src2, src3, src4;
2591     v16i8 src10_r, src32_r, src21_r, src43_r;
2592     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2594     v8i16 filter_vec, const_vec;
2597     const_vec = __msa_ldi_h(128);
2600     filter_vec = LD_SH(filter);
2601     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2603     LD_SB3(src, src_stride, src0, src1, src2);
2604     src += (3 * src_stride);
2605     XORI_B3_128_SB(src0, src1, src2);
2606     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2608     for (loop_cnt = (height >> 2); loop_cnt--;) {
2609         LD_SB2(src, src_stride, src3, src4);
2610         src += (2 * src_stride);
2611         XORI_B2_128_SB(src3, src4);
2612         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2615         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2617         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2619         LD_SB2(src, src_stride, src1, src2);
2620         src += (2 * src_stride);
2621         XORI_B2_128_SB(src1, src2);
/* src10_r/src21_r are reused as the window for the next iteration. */
2622         ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2625         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2627         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* Extract 64-bit (first 4 shorts) and 32-bit (shorts 4..5) pieces. */
2629         dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
2630         dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
2631         dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
2632         dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
2634         dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
2635         dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
2636         dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
2637         dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
2640         SW(dst_val_int0, dst + 4);
2643         SW(dst_val_int1, dst + 4);
2646         SW(dst_val_int2, dst + 4);
2649         SW(dst_val_int3, dst + 4);
/* 4-tap vertical filter, 8 pixels wide, exactly 2 output rows.
 * Loads 5 rows, builds the four adjacent-row interleaves and runs one
 * dot-product pair per output row.
 * NOTE(review): dst0_r/dst1_r = const_vec initializers are elided. */
2654 static void hevc_vt_4t_8x2_msa(uint8_t *src,
2658                                const int8_t *filter)
2660     v16i8 src0, src1, src2, src3, src4;
2661     v16i8 src10_r, src32_r, src21_r, src43_r;
2662     v8i16 dst0_r, dst1_r;
2664     v8i16 filter_vec, const_vec;
2667     const_vec = __msa_ldi_h(128);
2670     filter_vec = LD_SH(filter);
2671     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2673     LD_SB3(src, src_stride, src0, src1, src2);
2674     src += (3 * src_stride);
2675     XORI_B3_128_SB(src0, src1, src2);
2676     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2678     LD_SB2(src, src_stride, src3, src4);
2679     XORI_B2_128_SB(src3, src4);
2680     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2682     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2684     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2686     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
/* 4-tap vertical filter, 8 pixels wide, exactly 6 output rows.
 * Unrolled as three 2-row passes over a sliding 3-row window; dst0_r
 * and dst1_r are reused for each pair of output rows.
 * NOTE(review): dst0_r/dst1_r = const_vec re-initializers before each
 * pass are elided in this excerpt. */
2689 static void hevc_vt_4t_8x6_msa(uint8_t *src,
2693                                const int8_t *filter)
2695     v16i8 src0, src1, src2, src3, src4;
2696     v16i8 src10_r, src32_r, src21_r, src43_r;
2697     v8i16 dst0_r, dst1_r;
2699     v8i16 filter_vec, const_vec;
2702     const_vec = __msa_ldi_h(128);
2705     filter_vec = LD_SH(filter);
2706     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2708     LD_SB3(src, src_stride, src0, src1, src2);
2709     src += (3 * src_stride);
2710     XORI_B3_128_SB(src0, src1, src2);
2711     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
/* Pass 1: rows 0..1. */
2713     LD_SB2(src, src_stride, src3, src4);
2714     src += (2 * src_stride);
2715     XORI_B2_128_SB(src3, src4);
2717     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2719     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2721     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2723     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2724     dst += (2 * dst_stride);
/* Pass 2: rows 2..3. */
2726     LD_SB2(src, src_stride, src1, src2);
2727     src += (2 * src_stride);
2728     XORI_B2_128_SB(src1, src2);
2730     ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2732     DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2734     DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2736     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2737     dst += (2 * dst_stride);
/* Pass 3: rows 4..5. */
2739     LD_SB2(src, src_stride, src3, src4);
2740     XORI_B2_128_SB(src3, src4);
2742     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2744     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2746     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2748     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
/* 4-tap vertical filter, 8 pixels wide, height a multiple of 4.
 * Sliding 3-row window; each loop iteration emits 4 rows in two 2-row
 * steps and leaves src2/src10_r/src21_r primed for the next iteration.
 * NOTE(review): dst0_r/dst1_r = const_vec re-initializers are elided. */
2751 static void hevc_vt_4t_8x4multiple_msa(uint8_t *src,
2755                                        const int8_t *filter,
2759     v16i8 src0, src1, src2, src3, src4, src5;
2760     v16i8 src10_r, src32_r, src21_r, src43_r;
2761     v8i16 dst0_r, dst1_r;
2763     v8i16 filter_vec, const_vec;
2766     const_vec = __msa_ldi_h(128);
2769     filter_vec = LD_SH(filter);
2770     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2772     LD_SB3(src, src_stride, src0, src1, src2);
2773     src += (3 * src_stride);
2774     XORI_B3_128_SB(src0, src1, src2);
2775     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2777     for (loop_cnt = (height >> 2); loop_cnt--;) {
2778         LD_SB2(src, src_stride, src3, src4);
2779         src += (2 * src_stride);
2780         XORI_B2_128_SB(src3, src4);
2781         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2783         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2785         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2787         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2788         dst += (2 * dst_stride);
/* src2 reloaded here becomes the window head for the next iteration. */
2790         LD_SB2(src, src_stride, src5, src2);
2791         src += (2 * src_stride);
2792         XORI_B2_128_SB(src5, src2);
2793         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2795         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2797         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2799         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2800         dst += (2 * dst_stride);
/* Dispatcher for the 8-wide vertical filter: height 2 and 6 have
 * dedicated kernels, everything else goes to the multiple-of-4 loop. */
2804 static void hevc_vt_4t_8w_msa(uint8_t *src,
2808                               const int8_t *filter,
2812         hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2813     } else if (6 == height) {
2814         hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2816         hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
/* 4-tap vertical filter, 12 pixels wide, 4 rows per iteration.
 * Columns 0..7 are handled with the right-half interleaves (…_r, full
 * 8-wide stores); columns 8..11 use the left-half interleaves (…_l)
 * packed two row-pairs per vector and stored 8 bytes at a time at
 * dst + 8.
 * NOTE(review): dstN = const_vec initializers and the XOR of src2110/
 * src4332 are elided in this excerpt. */
2821 static void hevc_vt_4t_12w_msa(uint8_t *src,
2825                                const int8_t *filter,
2829     v16i8 src0, src1, src2, src3, src4, src5;
2830     v16i8 src10_r, src32_r, src21_r, src43_r;
2831     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2832     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2833     v16i8 src2110, src4332;
2834     v8i16 dst0_l, dst1_l;
2836     v8i16 filter_vec, const_vec;
/* Centre the 4-tap window one row above the first output row. */
2838     src -= (1 * src_stride);
2839     const_vec = __msa_ldi_h(128);
2842     filter_vec = LD_SH(filter);
2843     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2845     LD_SB3(src, src_stride, src0, src1, src2);
2846     src += (3 * src_stride);
2847     XORI_B3_128_SB(src0, src1, src2);
2848     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2849     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2850     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2852     for (loop_cnt = (height >> 2); loop_cnt--;) {
2853         LD_SB2(src, src_stride, src3, src4);
2854         src += (2 * src_stride);
2855         XORI_B2_128_SB(src3, src4);
2856         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2857         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2858         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2860         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2862         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2864         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
2866         LD_SB2(src, src_stride, src5, src2);
2867         src += (2 * src_stride);
2868         XORI_B2_128_SB(src5, src2);
2869         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2870         ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
2871         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2873         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2875         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
2877         DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
/* 8-wide left part, then the 4-wide right strips at dst + 8. */
2879         ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
2880         ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
2881         dst += (4 * dst_stride);
/* 4-tap vertical filter, 16 pixels wide, 4 rows per iteration.
 * The 16 columns are split into right-half (…_r, columns 0..7) and
 * left-half (…_l, columns 8..15) interleaves; each output row is two
 * contiguous 8-short stores.
 * NOTE(review): dstN = const_vec initializers and the dst advances
 * between the ST_SH2 pairs are elided in this excerpt. */
2885 static void hevc_vt_4t_16w_msa(uint8_t *src,
2889                                const int8_t *filter,
2893     v16i8 src0, src1, src2, src3, src4, src5;
2894     v16i8 src10_r, src32_r, src21_r, src43_r;
2895     v16i8 src10_l, src32_l, src21_l, src43_l;
2896     v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
2898     v8i16 filter_vec, const_vec;
2901     const_vec = __msa_ldi_h(128);
2904     filter_vec = LD_SH(filter);
2905     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2907     LD_SB3(src, src_stride, src0, src1, src2);
2908     src += (3 * src_stride);
2909     XORI_B3_128_SB(src0, src1, src2);
2910     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2911     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2913     for (loop_cnt = (height >> 2); loop_cnt--;) {
2914         LD_SB2(src, src_stride, src3, src4);
2915         src += (2 * src_stride);
2916         XORI_B2_128_SB(src3, src4);
2917         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2918         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2920         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2922         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
2924         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2926         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
2927         ST_SH2(dst0_r, dst0_l, dst, 8);
2929         ST_SH2(dst1_r, dst1_l, dst, 8);
/* Second row pair; src2 reloaded as the next iteration's window head. */
2932         LD_SB2(src, src_stride, src5, src2);
2933         src += (2 * src_stride);
2934         XORI_B2_128_SB(src5, src2);
2935         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2936         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2938         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2940         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
2942         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2944         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
2945         ST_SH2(dst0_r, dst0_l, dst, 8);
2947         ST_SH2(dst1_r, dst1_l, dst, 8);
/* 4-tap vertical filter, 24 pixels wide, 4 rows per iteration.
 * Columns 0..15 use the 16-wide scheme (…_r/…_l interleaves of
 * src0..src5); columns 16..23 are a second 8-wide stream (src6..src11,
 * …76_r/…87_r interleaves) stored at dst + 16.
 * NOTE(review): dstN = const_vec initializers and the dst advances
 * between store groups are elided in this excerpt. */
2952 static void hevc_vt_4t_24w_msa(uint8_t *src,
2956                                const int8_t *filter,
2960     v16i8 src0, src1, src2, src3, src4, src5;
2961     v16i8 src6, src7, src8, src9, src10, src11;
2962     v16i8 src10_r, src32_r, src76_r, src98_r;
2963     v16i8 src21_r, src43_r, src87_r, src109_r;
2964     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2965     v16i8 src10_l, src32_l, src21_l, src43_l;
2966     v8i16 dst0_l, dst1_l;
2968     v8i16 filter_vec, const_vec;
2971     const_vec = __msa_ldi_h(128);
2974     filter_vec = LD_SH(filter);
2975     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime the left 16-wide window... */
2977     LD_SB3(src, src_stride, src0, src1, src2);
2978     XORI_B3_128_SB(src0, src1, src2);
2979     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2980     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* ...and the right 8-wide window (columns 16..23). */
2982     LD_SB3(src + 16, src_stride, src6, src7, src8);
2983     src += (3 * src_stride);
2984     XORI_B3_128_SB(src6, src7, src8);
2985     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2987     for (loop_cnt = (height >> 2); loop_cnt--;) {
2988         LD_SB2(src, src_stride, src3, src4);
2989         XORI_B2_128_SB(src3, src4);
2990         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2991         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2993         LD_SB2(src + 16, src_stride, src9, src10);
2994         src += (2 * src_stride);
2995         XORI_B2_128_SB(src9, src10);
2996         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2999         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3001         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3003         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3005         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3007         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3009         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3011         ST_SH2(dst0_r, dst0_l, dst, 8);
3012         ST_SH(dst2_r, dst + 16);
3014         ST_SH2(dst1_r, dst1_l, dst, 8);
3015         ST_SH(dst3_r, dst + 16);
/* Second row pair; src2/src8 reloaded as the next window heads. */
3018         LD_SB2(src, src_stride, src5, src2);
3019         XORI_B2_128_SB(src5, src2);
3020         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3021         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3023         LD_SB2(src + 16, src_stride, src11, src8);
3024         src += (2 * src_stride);
3025         XORI_B2_128_SB(src11, src8);
3026         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3029         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3031         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3033         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3035         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3037         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3039         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3041         ST_SH2(dst0_r, dst0_l, dst, 8);
3042         ST_SH(dst2_r, dst + 16);
3044         ST_SH2(dst1_r, dst1_l, dst, 8);
3045         ST_SH(dst3_r, dst + 16);
/* 4-tap vertical filter, 32 pixels wide, 4 rows per iteration.
 * Two independent 16-wide streams: src0..src5 cover columns 0..15,
 * src6..src11 cover columns 16..31, each with right-half (…_r) and
 * left-half (…_l) interleaves, stored as four 8-short vectors per row.
 * NOTE(review): dstN = const_vec initializers and the dst advances
 * between ST_SH4 groups are elided in this excerpt. */
3050 static void hevc_vt_4t_32w_msa(uint8_t *src,
3054                                const int8_t *filter,
3058     v16i8 src0, src1, src2, src3, src4, src5;
3059     v16i8 src6, src7, src8, src9, src10, src11;
3060     v16i8 src10_r, src32_r, src76_r, src98_r;
3061     v16i8 src21_r, src43_r, src87_r, src109_r;
3062     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
3063     v16i8 src10_l, src32_l, src76_l, src98_l;
3064     v16i8 src21_l, src43_l, src87_l, src109_l;
3065     v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
3067     v8i16 filter_vec, const_vec;
3070     const_vec = __msa_ldi_h(128);
3073     filter_vec = LD_SH(filter);
3074     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime the two 16-wide windows (columns 0..15 and 16..31). */
3076     LD_SB3(src, src_stride, src0, src1, src2);
3077     XORI_B3_128_SB(src0, src1, src2);
3078     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
3079     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
3081     LD_SB3(src + 16, src_stride, src6, src7, src8);
3082     src += (3 * src_stride);
3083     XORI_B3_128_SB(src6, src7, src8);
3084     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
3085     ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
3087     for (loop_cnt = (height >> 2); loop_cnt--;) {
3088         LD_SB2(src, src_stride, src3, src4);
3089         XORI_B2_128_SB(src3, src4);
3090         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
3091         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
3093         LD_SB2(src + 16, src_stride, src9, src10);
3094         src += (2 * src_stride);
3095         XORI_B2_128_SB(src9, src10);
3096         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
3097         ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
3100         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
3102         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
3104         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
3106         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
3108         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
3110         DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
3112         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
3114         DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
/* One full 32-wide row per ST_SH4 (4 x 8 shorts). */
3116         ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
3118         ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
/* Second row pair; src2/src8 reloaded as the next window heads. */
3121         LD_SB2(src, src_stride, src5, src2);
3122         XORI_B2_128_SB(src5, src2);
3123         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3124         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3126         LD_SB2(src + 16, src_stride, src11, src8);
3127         src += (2 * src_stride);
3128         XORI_B2_128_SB(src11, src8);
3129         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3130         ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);
3133         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3135         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3137         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3139         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3141         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3143         DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
3145         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3147         DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);
3149         ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
3151         ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
/* 2-D (horizontal + vertical) 4-tap filter, 4 pixels wide, 2 rows.
 * Horizontal pass: two 4-wide rows per shuffle (mask0 from
 * ff_hevc_mask_arr + 16 pairs rows), producing dst20/dst31/dst42.
 * Vertical pass: 32-bit HEVC_FILT_4TAP on the interleaved columns,
 * then pack back to 16-bit and store 8 bytes per row.
 * NOTE(review): dstNN = const_vec initializers, mask1 setup and the
 * intermediate shift of dst0/dst1 are elided in this excerpt. */
3156 static void hevc_hv_4t_4x2_msa(uint8_t *src,
3160                                const int8_t *filter_x,
3161                                const int8_t *filter_y)
3163     int32_t dst_stride_in_bytes = 2 * dst_stride;
3164     v16i8 src0, src1, src2, src3, src4;
3166     v8i16 filt_h0, filt_h1;
3167     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3169     v8i16 filter_vec, const_vec;
3170     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3171     v8i16 dst20, dst31, dst42, dst10, dst32, dst21, dst43;
/* Window starts one row above and one column left of the output. */
3174     src -= (src_stride + 1);
3175     filter_vec = LD_SH(filter_x);
3176     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Vertical taps are widened to 16-bit and splatted as 32-bit pairs. */
3178     filter_vec = LD_SH(filter_y);
3179     UNPCK_R_SB_SH(filter_vec, filter_vec);
3181     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3185     const_vec = __msa_ldi_h(128);
3188     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3189     XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Each shuffle pairs two source rows: (0,2), (1,3), (2,4). */
3190     VSHF_B2_SB(src0, src2, src0, src2, mask0, mask1, vec0, vec1);
3191     VSHF_B2_SB(src1, src3, src1, src3, mask0, mask1, vec2, vec3);
3192     VSHF_B2_SB(src2, src4, src2, src4, mask0, mask1, vec4, vec5);
3197     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst20, dst20);
3198     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst31, dst31);
3199     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst42, dst42);
3200     ILVRL_H2_SH(dst31, dst20, dst10, dst32);
3201     ILVRL_H2_SH(dst42, dst31, dst21, dst43);
3203     dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3204     dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3207     dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
3208     ST8x2_UB(dst0, dst, dst_stride_in_bytes);
/* 2-D (horizontal + vertical) 4-tap filter, 4 pixels wide, 4 rows.
 * Extends the 4x2 scheme to 7 input rows: horizontal pass pairs rows
 * (0,3)(1,4)(2,5)(3,6) into dst30/dst41/dst52/dst63, then four
 * vertical HEVC_FILT_4TAP evaluations, >> 6, pack and store.
 * NOTE(review): dstNN = const_vec initializers and mask1 setup are
 * elided in this excerpt. */
3211 static void hevc_hv_4t_4x4_msa(uint8_t *src,
3215                                const int8_t *filter_x,
3216                                const int8_t *filter_y)
3218     int32_t dst_stride_in_bytes = 2 * dst_stride;
3219     v16i8 src0, src1, src2, src3, src4, src5, src6;
3221     v8i16 filt_h0, filt_h1;
3222     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3224     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3225     v8i16 filter_vec, const_vec;
3226     v8i16 dst30, dst41, dst52, dst63, dst10, dst32, dst54, dst21, dst43, dst65;
3227     v4i32 dst0, dst1, dst2, dst3;
3229     src -= (src_stride + 1);
3231     filter_vec = LD_SH(filter_x);
3232     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3234     filter_vec = LD_SH(filter_y);
3235     UNPCK_R_SB_SH(filter_vec, filter_vec);
3237     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3241     const_vec = __msa_ldi_h(128);
3244     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3245     XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
3247     VSHF_B2_SB(src0, src3, src0, src3, mask0, mask1, vec0, vec1);
3248     VSHF_B2_SB(src1, src4, src1, src4, mask0, mask1, vec2, vec3);
3249     VSHF_B2_SB(src2, src5, src2, src5, mask0, mask1, vec4, vec5);
3250     VSHF_B2_SB(src3, src6, src3, src6, mask0, mask1, vec6, vec7);
3256     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst30, dst30);
3257     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst41, dst41);
3258     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst52, dst52);
3259     DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst63, dst63);
/* Transpose row-pair results into column-interleaved vertical inputs. */
3261     ILVRL_H2_SH(dst41, dst30, dst10, dst43);
3262     ILVRL_H2_SH(dst52, dst41, dst21, dst54);
3263     ILVRL_H2_SH(dst63, dst52, dst32, dst65);
3265     dst0 = HEVC_FILT_4TAP(dst10, dst32, filt_h0, filt_h1);
3266     dst1 = HEVC_FILT_4TAP(dst21, dst43, filt_h0, filt_h1);
3267     dst2 = HEVC_FILT_4TAP(dst32, dst54, filt_h0, filt_h1);
3268     dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
/* Drop the vertical-pass headroom before packing to 16 bit. */
3269     SRA_4V(dst0, dst1, dst2, dst3, 6);
3270     PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2);
3271     ST8x4_UB(dst0, dst2, dst, dst_stride_in_bytes);
/* 2-D (horizontal + vertical) 4-tap filter, 4 pixels wide, height a
 * multiple of 8.  The horizontal pass keeps a 2-result sliding window
 * (dst10_r/dst21_r plus dst22 carry) across iterations; each loop
 * filters 8 new rows horizontally (paired (3,7)(4,8)(5,9)(6,10)),
 * rebuilds the vertical interleaves, runs 8 HEVC_FILT_4TAP columns,
 * shifts, packs and stores 8 output rows.
 * NOTE(review): dstNN = const_vec initializers, mask1 setup and the
 * window-rotation lines at loop end are elided in this excerpt. */
3275 static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
3279                                        const int8_t *filter_x,
3280                                        const int8_t *filter_y,
3284     v16i8 src0, src1, src2, src3, src4, src5, src6;
3285     v16i8 src7, src8, src9, src10;
3287     v8i16 filt_h0, filt_h1;
3288     v16i8 mask0 = LD_SB(ff_hevc_mask_arr + 16);
3290     v8i16 filter_vec, const_vec;
3291     v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3292     v8i16 dst10, dst21, dst22, dst73, dst84, dst95, dst106;
3293     v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r;
3294     v8i16 dst21_r, dst43_r, dst65_r, dst87_r, dst109_r;
3295     v4i32 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
3297     src -= (src_stride + 1);
3298     filter_vec = LD_SH(filter_x);
3299     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3301     filter_vec = LD_SH(filter_y);
3302     UNPCK_R_SB_SH(filter_vec, filter_vec);
3304     SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3308     const_vec = __msa_ldi_h(128);
/* Prologue: horizontal results for the first 3 rows. */
3311     LD_SB3(src, src_stride, src0, src1, src2);
3312     src += (3 * src_stride);
3313     XORI_B3_128_SB(src0, src1, src2);
3314     VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
3315     VSHF_B2_SB(src1, src2, src1, src2, mask0, mask1, vec2, vec3);
3317     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
3319     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
3320     ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
/* dst22 = row-2 horizontal result, carried into the next group. */
3321     dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
3323     for (loop_cnt = height >> 3; loop_cnt--;) {
3324         LD_SB8(src, src_stride,
3325                src3, src4, src5, src6, src7, src8, src9, src10);
3326         src += (8 * src_stride);
3327         XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3329         VSHF_B2_SB(src3, src7, src3, src7, mask0, mask1, vec0, vec1);
3330         VSHF_B2_SB(src4, src8, src4, src8, mask0, mask1, vec2, vec3);
3331         VSHF_B2_SB(src5, src9, src5, src9, mask0, mask1, vec4, vec5);
3332         VSHF_B2_SB(src6, src10, src6, src10, mask0, mask1, vec6, vec7);
3338         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
3339         DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
3340         DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
3341         DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);
3343         dst32_r = __msa_ilvr_h(dst73, dst22);
3344         ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
3345         ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
3346         ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
3347         dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
3348         dst76_r = __msa_ilvr_h(dst22, dst106);
3350         dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3351         dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3352         dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3353         dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3354         dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3355         dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3356         dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3357         dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
3358         SRA_4V(dst0, dst1, dst2, dst3, 6);
3359         SRA_4V(dst4, dst5, dst6, dst7, 6);
3360         PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
3361                     dst0, dst1, dst2, dst3);
3362         ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
3363         dst += (8 * dst_stride);
/* Carry the last horizontal result into the next 8-row group. */
3367         dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* Dispatcher for the 4-wide 2-D filter: picks the specialised kernel
 * for height 2, height 4, or any multiple of 8. */
3371 static void hevc_hv_4t_4w_msa(uint8_t *src,
3375                               const int8_t *filter_x,
3376                               const int8_t *filter_y,
3380         hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
3381                            filter_x, filter_y);
3382     } else if (4 == height) {
3383         hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
3384                            filter_x, filter_y);
3385     } else if (0 == (height % 8)) {
3386         hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3387                                    filter_x, filter_y, height);
/* HEVC 4-tap horizontal+vertical interpolation, width 6, fixed 8 rows.
 * Horizontal pass: VSHF_B2 gathers tap neighborhoods, DPADD_SB2 applies
 * filt0/filt1.  Vertical pass: HEVC_FILT_4TAP on interleaved row pairs.
 * Results are >> 6 and stored as 16-bit: 8 values per row (left 8 via
 * ST8x4, extra 2 columns via ST4x4 at dst + 4).
 * NOTE(review): dst_stride is in int16_t units; dst_stride_in_bytes
 * doubles it for the byte-addressed store macros. */
3391 static void hevc_hv_4t_6w_msa(uint8_t *src,
3395 const int8_t *filter_x,
3396 const int8_t *filter_y,
3399 int32_t dst_stride_in_bytes = 2 * dst_stride;
3400 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3402 v8i16 filt_h0, filt_h1;
3403 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3405 v8i16 filter_vec, const_vec;
3406 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3407 v8i16 dsth0, dsth1, dsth2, dsth3, dsth4, dsth5, dsth6, dsth7, dsth8, dsth9;
3408 v8i16 dsth10, tmp0, tmp1, tmp2, tmp3, tmp4, tmp5;
3409 v8i16 dst10_r, dst32_r, dst54_r, dst76_r, dst98_r, dst21_r, dst43_r;
3410 v8i16 dst65_r, dst87_r, dst109_r, dst10_l, dst32_l, dst54_l, dst76_l;
3411 v8i16 dst98_l, dst21_l, dst43_l, dst65_l, dst87_l, dst109_l;
3412 v8i16 dst1021_l, dst3243_l, dst5465_l, dst7687_l, dst98109_l;
3413 v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3414 v4i32 dst0_l, dst1_l, dst2_l, dst3_l;
/* Step back one row and one column so the 4-tap windows are centered. */
3416 src -= (src_stride + 1);
3417 filter_vec = LD_SH(filter_x);
3418 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3420 filter_vec = LD_SH(filter_y);
3421 UNPCK_R_SB_SH(filter_vec, filter_vec);
3423 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3427 const_vec = __msa_ldi_h(128);
/* First 3 rows provide the initial vertical filter context. */
3430 LD_SB3(src, src_stride, src0, src1, src2);
3431 src += (3 * src_stride);
3432 XORI_B3_128_SB(src0, src1, src2);
3434 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3435 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3436 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3441 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth0, dsth0);
3442 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth1, dsth1);
3443 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth2, dsth2);
3445 ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
3446 ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
/* Horizontal pass for the remaining 8 rows of the 6x8 tile. */
3448 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9, src10);
3449 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
3451 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3452 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3453 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3454 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3460 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth3, dsth3);
3461 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth4, dsth4);
3462 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth5, dsth5);
3463 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth6, dsth6);
3465 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3466 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec2, vec3);
3467 VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec4, vec5);
3468 VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec6, vec7);
3474 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dsth7, dsth7);
3475 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dsth8, dsth8);
3476 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dsth9, dsth9);
3477 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dsth10, dsth10);
/* Interleave consecutive rows so HEVC_FILT_4TAP sees row pairs. */
3479 ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
3480 ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
3481 ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
3482 ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
3483 ILVRL_H2_SH(dsth7, dsth6, dst76_r, dst76_l);
3484 ILVRL_H2_SH(dsth8, dsth7, dst87_r, dst87_l);
3485 ILVRL_H2_SH(dsth9, dsth8, dst98_r, dst98_l);
3486 ILVRL_H2_SH(dsth10, dsth9, dst109_r, dst109_l);
/* Pack the left halves pairwise: columns 4..5 of two rows per vector. */
3488 PCKEV_D2_SH(dst21_l, dst10_l, dst43_l, dst32_l, dst1021_l, dst3243_l);
3489 PCKEV_D2_SH(dst65_l, dst54_l, dst87_l, dst76_l, dst5465_l, dst7687_l);
3490 dst98109_l = (v8i16) __msa_pckev_d((v2i64) dst109_l, (v2i64) dst98_l);
/* Vertical 4-tap: 8 output rows for the left 4 columns ... */
3492 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3493 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3494 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3495 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3496 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3497 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3498 dst6_r = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
3499 dst7_r = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
/* ... and for the 2 rightmost columns (two rows per vector). */
3500 dst0_l = HEVC_FILT_4TAP(dst1021_l, dst3243_l, filt_h0, filt_h1);
3501 dst1_l = HEVC_FILT_4TAP(dst3243_l, dst5465_l, filt_h0, filt_h1);
3502 dst2_l = HEVC_FILT_4TAP(dst5465_l, dst7687_l, filt_h0, filt_h1);
3503 dst3_l = HEVC_FILT_4TAP(dst7687_l, dst98109_l, filt_h0, filt_h1);
/* Scale back to the 16-bit intermediate range and store. */
3504 SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
3505 SRA_4V(dst4_r, dst5_r, dst6_r, dst7_r, 6);
3506 SRA_4V(dst0_l, dst1_l, dst2_l, dst3_l, 6);
3507 PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
3508 PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
3509 PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
3510 ST8x4_UB(tmp0, tmp1, dst, dst_stride_in_bytes);
3511 ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
3512 dst += 4 * dst_stride;
3513 ST8x4_UB(tmp2, tmp3, dst, dst_stride_in_bytes);
3514 ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
/* HEVC 4-tap horizontal+vertical interpolation, 8x2 block.
 * Loads the 5 source rows needed (2 output rows + 3 rows of 4-tap
 * context), runs the horizontal pass on all of them, then the vertical
 * pass, and stores two rows of 16-bit intermediates (>> 6). */
3517 static void hevc_hv_4t_8x2_msa(uint8_t *src,
3521 const int8_t *filter_x,
3522 const int8_t *filter_y)
3524 v16i8 src0, src1, src2, src3, src4;
3526 v8i16 filt_h0, filt_h1;
3527 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3529 v8i16 filter_vec, const_vec;
3530 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3531 v8i16 dst0, dst1, dst2, dst3, dst4;
3532 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3533 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3534 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* Center the 4-tap window: back one row and one column. */
3536 src -= (src_stride + 1);
3538 filter_vec = LD_SH(filter_x);
3539 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3541 filter_vec = LD_SH(filter_y);
3542 UNPCK_R_SB_SH(filter_vec, filter_vec);
3544 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3548 const_vec = __msa_ldi_h(128);
3551 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3552 XORI_B5_128_SB(src0, src1, src2, src3, src4);
/* Horizontal pass: gather taps per row, multiply-accumulate. */
3554 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3555 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3556 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3557 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3558 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3561 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3563 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3565 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3567 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
3569 DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
/* Vertical pass over interleaved row pairs, then >> 6, pack, store. */
3571 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3572 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3573 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3574 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3575 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3576 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3577 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3578 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3579 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3580 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3581 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
/* HEVC 4-tap horizontal+vertical interpolation for (8*width8mult)x4
 * blocks: processes one 8x4 column strip per loop iteration.
 * NOTE(review): the per-iteration advance of src/dst by 8 columns is in
 * elided lines here — confirm against the full source. */
3584 static void hevc_hv_4t_8multx4_msa(uint8_t *src, int32_t src_stride,
3585 int16_t *dst, int32_t dst_stride,
3586 const int8_t *filter_x,
3587 const int8_t *filter_y, int32_t width8mult)
3590 v16i8 src0, src1, src2, src3, src4, src5, src6, mask0, mask1;
3591 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3592 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
3593 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
3594 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3595 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3596 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3598 src -= (src_stride + 1);
3600 filter_vec = LD_SH(filter_x);
3601 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3603 filter_vec = LD_SH(filter_y);
3604 UNPCK_R_SB_SH(filter_vec, filter_vec);
3606 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3608 mask0 = LD_SB(ff_hevc_mask_arr);
3611 const_vec = __msa_ldi_h(128);
/* One 8-wide strip per iteration: 7 rows = 4 outputs + 3 context. */
3614 for (cnt = width8mult; cnt--;) {
3615 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
3617 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
/* Horizontal pass, first 3 (context) rows. */
3619 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3620 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3621 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3626 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3627 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3628 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3630 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3631 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* Horizontal pass, 4 output rows. */
3633 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3634 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3635 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3636 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3641 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3642 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
3643 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
3644 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
3645 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3646 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3647 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3648 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
/* Vertical pass, then >> 6, pack to 16-bit, store 4 rows. */
3649 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3650 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3651 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3652 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3654 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3655 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3656 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3657 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3658 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3659 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3660 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3661 PCKEV_H2_SW(dst2_l, dst2_r, dst3_l, dst3_r, dst2_r, dst3_r);
3663 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
/* HEVC 4-tap horizontal+vertical interpolation, 8x6 block (fully
 * unrolled): 9 source rows = 6 output rows + 3 rows of vertical
 * context.  Stores 16-bit intermediates (>> 6), two rows at a time. */
3668 static void hevc_hv_4t_8x6_msa(uint8_t *src,
3672 const int8_t *filter_x,
3673 const int8_t *filter_y)
3675 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3677 v8i16 filt_h0, filt_h1;
3678 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3680 v8i16 filter_vec, const_vec;
3681 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7, vec8, vec9;
3682 v16i8 vec10, vec11, vec12, vec13, vec14, vec15, vec16, vec17;
3683 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3684 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3685 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3686 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3687 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3688 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3689 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
3691 src -= (src_stride + 1);
3693 filter_vec = LD_SH(filter_x);
3694 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3696 filter_vec = LD_SH(filter_y);
3697 UNPCK_R_SB_SH(filter_vec, filter_vec);
3699 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3703 const_vec = __msa_ldi_h(128);
3706 LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
3707 src += (5 * src_stride);
3708 LD_SB4(src, src_stride, src5, src6, src7, src8);
3710 XORI_B5_128_SB(src0, src1, src2, src3, src4);
3711 XORI_B4_128_SB(src5, src6, src7, src8);
/* Horizontal pass for all 9 rows. */
3713 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3714 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3715 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3716 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec6, vec7);
3717 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec8, vec9);
3718 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec10, vec11);
3719 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec12, vec13);
3720 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec14, vec15);
3721 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec16, vec17);
3724 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3726 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3728 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3730 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst3, dst3);
3732 DPADD_SB2_SH(vec8, vec9, filt0, filt1, dst4, dst4);
3734 DPADD_SB2_SH(vec10, vec11, filt0, filt1, dst5, dst5);
3736 DPADD_SB2_SH(vec12, vec13, filt0, filt1, dst6, dst6);
3738 DPADD_SB2_SH(vec14, vec15, filt0, filt1, dst7, dst7);
3740 DPADD_SB2_SH(vec16, vec17, filt0, filt1, dst8, dst8);
/* Interleave consecutive rows for the vertical 4-tap. */
3742 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3743 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3744 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3745 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3746 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3747 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3748 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3749 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3751 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3752 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3753 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3754 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3755 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3756 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3757 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3758 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3759 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3760 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3761 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3762 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
/* >> 6, pack to 16-bit and store the 6 output rows. */
3764 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3765 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3766 SRA_4V(dst4_r, dst4_l, dst5_r, dst5_l, 6);
3768 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3769 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
3770 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
3772 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3773 dst += (2 * dst_stride);
3774 ST_SW2(dst2_r, dst3_r, dst, dst_stride);
3775 dst += (2 * dst_stride);
3776 ST_SW2(dst4_r, dst5_r, dst, dst_stride);
/* HEVC 4-tap horizontal+vertical interpolation for blocks of width
 * 8*width8mult and height a multiple of 4.  Outer loop walks 8-wide
 * column strips (src_tmp/dst_tmp); inner loop produces 4 rows per
 * iteration, reusing the last two interleaved rows as context.
 * NOTE(review): the strip-advance of src/dst and the context rollover
 * (dst10_r = dst54_r; ...) sit in lines elided from this view. */
3779 static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
3783 const int8_t *filter_x,
3784 const int8_t *filter_y,
3788 uint32_t loop_cnt, cnt;
3791 v16i8 src0, src1, src2, src3, src4, src5, src6;
3793 v8i16 filt_h0, filt_h1;
3794 v16i8 mask0 = LD_SB(ff_hevc_mask_arr);
3796 v8i16 filter_vec, const_vec;
3797 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3798 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6;
3799 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3800 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3801 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3803 src -= (src_stride + 1);
3805 filter_vec = LD_SH(filter_x);
3806 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3808 filter_vec = LD_SH(filter_y);
3809 UNPCK_R_SB_SH(filter_vec, filter_vec);
3811 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3815 const_vec = __msa_ldi_h(128);
/* One 8-wide column strip per outer iteration. */
3818 for (cnt = width8mult; cnt--;) {
/* Prime the vertical filter with 3 context rows. */
3822 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3823 src_tmp += (3 * src_stride);
3825 XORI_B3_128_SB(src0, src1, src2);
3827 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3828 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3829 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3832 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3834 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3836 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3838 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3839 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* 4 output rows per inner iteration. */
3841 for (loop_cnt = height >> 2; loop_cnt--;) {
3842 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3843 src_tmp += (4 * src_stride);
3844 XORI_B4_128_SB(src3, src4, src5, src6);
3846 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3847 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3848 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3849 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3855 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3856 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
3857 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
3858 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
3860 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3861 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3862 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3863 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3865 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3866 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3867 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3868 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3869 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3870 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3871 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3872 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
3874 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
3875 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
3877 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3878 dst2_l, dst2_r, dst3_l, dst3_r,
3879 dst0_r, dst1_r, dst2_r, dst3_r);
3881 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
3882 dst_tmp += (4 * dst_stride);
/* HEVC 4-tap horizontal+vertical interpolation, width 8.
 * Dispatches on height: 2, 4, 6 go to the unrolled kernels; any other
 * multiple of 4 goes to the generic strip kernel with width8mult = 1. */
3896 static void hevc_hv_4t_8w_msa(uint8_t *src,
3900 const int8_t *filter_x,
3901 const int8_t *filter_y,
3906 hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
3907 filter_x, filter_y);
3908 } else if (4 == height) {
3909 hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
3910 filter_x, filter_y, 1);
3911 } else if (6 == height) {
3912 hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
3913 filter_x, filter_y);
3914 } else if (0 == (height % 4)) {
3915 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
3916 filter_x, filter_y, height, 1);
/* HEVC 4-tap horizontal+vertical interpolation, width 12.
 * Handled as an 8-wide strip (first loop, full-precision left/right
 * halves) plus a 4-wide strip (second loop, right halves only, two
 * 4-wide columns packed per vector via mask2/mask3).
 * NOTE(review): mask1/mask3 setup and the strip-advance of src/dst are
 * in lines elided from this view. */
3920 static void hevc_hv_4t_12w_msa(uint8_t *src,
3924 const int8_t *filter_x,
3925 const int8_t *filter_y,
3931 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
3932 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
3933 v16i8 mask0, mask1, mask2, mask3;
3934 v8i16 filt0, filt1, filt_h0, filt_h1, filter_vec, const_vec;
3935 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst10, dst21, dst22, dst73;
3936 v8i16 dst84, dst95, dst106, dst76_r, dst98_r, dst87_r, dst109_r;
3937 v8i16 dst10_r, dst32_r, dst54_r, dst21_r, dst43_r, dst65_r;
3938 v8i16 dst10_l, dst32_l, dst54_l, dst21_l, dst43_l, dst65_l;
3939 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3940 v4i32 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
3942 src -= (src_stride + 1);
3944 filter_vec = LD_SH(filter_x);
3945 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
3947 filter_vec = LD_SH(filter_y);
3948 UNPCK_R_SB_SH(filter_vec, filter_vec);
3950 SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
3952 mask0 = LD_SB(ff_hevc_mask_arr);
3955 const_vec = __msa_ldi_h(128);
/* --- Left 8 columns: same scheme as the 8-wide kernels. --- */
3961 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3962 src_tmp += (3 * src_stride);
3964 XORI_B3_128_SB(src0, src1, src2);
3966 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3967 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3968 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3973 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3974 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3975 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3977 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3978 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* 4 rows per iteration, 4 iterations = 16 rows of the 8-wide part. */
3980 for (loop_cnt = 4; loop_cnt--;) {
3981 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3982 src_tmp += (4 * src_stride);
3983 XORI_B4_128_SB(src3, src4, src5, src6);
3985 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3986 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
3987 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
3988 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
3994 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3995 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst4, dst4);
3996 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst5, dst5);
3997 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst6, dst6);
3999 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
4000 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
4001 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
4002 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
4004 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4005 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
4006 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4007 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
4008 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4009 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
4010 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4011 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
4013 SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
4014 SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
4015 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
4016 dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
4017 ST_SW4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
4018 dst_tmp += (4 * dst_stride);
/* --- Right 4 columns: mask2/mask3 pack two rows per vector. --- */
4030 mask2 = LD_SB(ff_hevc_mask_arr + 16);
4033 LD_SB3(src, src_stride, src0, src1, src2);
4034 src += (3 * src_stride);
4035 XORI_B3_128_SB(src0, src1, src2);
4036 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
4037 VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
4040 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst10, dst10);
4041 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst21, dst21);
4042 ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
/* dst22 = horizontally-filtered row 2 (upper half of dst21). */
4043 dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
/* 8 rows per iteration, 2 iterations = 16 rows of the 4-wide part. */
4045 for (loop_cnt = 2; loop_cnt--;) {
4046 LD_SB8(src, src_stride, src3, src4, src5, src6, src7, src8, src9,
4048 src += (8 * src_stride);
4049 XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
/* Rows i and i+4 share one vector (dst73 = rows 3 and 7, etc.). */
4050 VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
4051 VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
4052 VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
4053 VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
4059 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst73, dst73);
4060 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst84, dst84);
4061 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst95, dst95);
4062 DPADD_SB2_SH(vec6, vec7, filt0, filt1, dst106, dst106);
4064 dst32_r = __msa_ilvr_h(dst73, dst22);
4065 ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
4066 ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
4067 ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
4068 dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
4069 dst76_r = __msa_ilvr_h(dst22, dst106);
4071 tmp0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
4072 tmp1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
4073 tmp2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
4074 tmp3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
4075 tmp4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
4076 tmp5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
4077 tmp6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
4078 tmp7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
4080 SRA_4V(tmp0, tmp1, tmp2, tmp3, 6);
4081 SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
4082 PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
4084 ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, 2 * dst_stride);
4085 dst += (8 * dst_stride);
/* Roll the vertical context forward for the next 8 rows. */
4089 dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
/* HEVC 4-tap horizontal+vertical interpolation, width 16: two 8-wide
 * strips (width8mult = 2); height 4 uses the unrolled 8multx4 kernel. */
4093 static void hevc_hv_4t_16w_msa(uint8_t *src,
4097 const int8_t *filter_x,
4098 const int8_t *filter_y,
4102 hevc_hv_4t_8multx4_msa(src, src_stride, dst, dst_stride,
4103 filter_x, filter_y, 2);
4105 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4106 filter_x, filter_y, height, 2);
/* HEVC 4-tap horizontal+vertical interpolation, width 24: three 8-wide
 * strips via the generic kernel. */
4110 static void hevc_hv_4t_24w_msa(uint8_t *src,
4114 const int8_t *filter_x,
4115 const int8_t *filter_y,
4118 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4119 filter_x, filter_y, height, 3);
/* HEVC 4-tap horizontal+vertical interpolation, width 32: four 8-wide
 * strips via the generic kernel. */
4122 static void hevc_hv_4t_32w_msa(uint8_t *src,
4126 const int8_t *filter_x,
4127 const int8_t *filter_y,
4130 hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
4131 filter_x, filter_y, height, 4);
/* Generates the public ff_hevc_put_hevc_pel_pixelsWxH entry points:
 * plain copy of 8-bit pixels to the 16-bit intermediate buffer
 * (MAX_PB_SIZE stride), delegating to hevc_copy_<W>w_msa.
 * (Comment lines cannot be placed inside the backslash continuation.) */
4134 #define MC_COPY(WIDTH) \
4135 void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, \
4137 ptrdiff_t src_stride, \
4143 hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height); \
/* Generates one-direction (h or v) public entry points.  FILT_DIR picks
 * the mx/my fraction used to index ff_hevc_<PEL>_filters (fraction 0
 * means full-pel, hence the "- 1"); DIR1 selects the hz/vt kernel
 * family and TAP the 8-tap (qpel) or 4-tap (epel) variant. */
4158 #define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
4159 void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst, \
4161 ptrdiff_t src_stride, \
4167 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
4169 hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
4170 MAX_PB_SIZE, filter, height); \
/* 8-tap luma (qpel), horizontal, widths 4..64. */
4173 MC(qpel, h, 4, 8, hz, mx);
4174 MC(qpel, h, 8, 8, hz, mx);
4175 MC(qpel, h, 12, 8, hz, mx);
4176 MC(qpel, h, 16, 8, hz, mx);
4177 MC(qpel, h, 24, 8, hz, mx);
4178 MC(qpel, h, 32, 8, hz, mx);
4179 MC(qpel, h, 48, 8, hz, mx);
4180 MC(qpel, h, 64, 8, hz, mx);
/* 8-tap luma (qpel), vertical, widths 4..64. */
4182 MC(qpel, v, 4, 8, vt, my);
4183 MC(qpel, v, 8, 8, vt, my);
4184 MC(qpel, v, 12, 8, vt, my);
4185 MC(qpel, v, 16, 8, vt, my);
4186 MC(qpel, v, 24, 8, vt, my);
4187 MC(qpel, v, 32, 8, vt, my);
4188 MC(qpel, v, 48, 8, vt, my);
4189 MC(qpel, v, 64, 8, vt, my);
/* 4-tap chroma (epel), horizontal, widths 4..32. */
4191 MC(epel, h, 4, 4, hz, mx);
4192 MC(epel, h, 6, 4, hz, mx);
4193 MC(epel, h, 8, 4, hz, mx);
4194 MC(epel, h, 12, 4, hz, mx);
4195 MC(epel, h, 16, 4, hz, mx);
4196 MC(epel, h, 24, 4, hz, mx);
4197 MC(epel, h, 32, 4, hz, mx);
/* 4-tap chroma (epel), vertical, widths 4..32. */
4199 MC(epel, v, 4, 4, vt, my);
4200 MC(epel, v, 6, 4, vt, my);
4201 MC(epel, v, 8, 4, vt, my);
4202 MC(epel, v, 12, 4, vt, my);
4203 MC(epel, v, 16, 4, vt, my);
4204 MC(epel, v, 24, 4, vt, my);
4205 MC(epel, v, 32, 4, vt, my);
4209 #define MC_HV(PEL, WIDTH, TAP) \
4210 void ff_hevc_put_hevc_##PEL##_hv##WIDTH##_8_msa(int16_t *dst, \
4212 ptrdiff_t src_stride, \
4218 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4219 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4221 hevc_hv_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, \
4222 filter_x, filter_y, height); \