2 * Copyright (c) 2015 - 2017 Manojkumar Bhosale (Manojkumar.Bhosale@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hevcdsp_mips.h"
23 #include "libavcodec/mips/hevc_macros_msa.h"
/* Copy a 4-pel-wide block of 8-bit source pixels to a 16-bit destination,
 * zero-extending each byte and scaling by << 6 (HEVC intermediate-sample
 * format).  Specialised paths for height 2, 4 and multiples of 8.
 * NOTE(review): this extract is missing interior lines (unbalanced braces,
 * stray embedded line numbers) — comments describe the visible code only. */
25 static void hevc_copy_4w_msa(uint8_t *src, int32_t src_stride,
26 int16_t *dst, int32_t dst_stride,
/* height == 2: load two rows and pack them into a single vector. */
35 LD_SB2(src, src_stride, src0, src1);
/* Interleave the two 4-byte rows into the low half of one register. */
37 src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
/* Zero-extend bytes to halfwords (zero presumably declared above — not visible). */
38 in0 = (v8i16) __msa_ilvr_b(zero, src0);
40 ST8x2_UB(in0, dst, 2 * dst_stride);
41 } else if (4 == height) {
42 v16i8 src0, src1, src2, src3;
/* height == 4: four rows packed pairwise, then widened. */
45 LD_SB4(src, src_stride, src0, src1, src2, src3);
47 ILVR_W2_SB(src1, src0, src3, src2, src0, src1);
48 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
51 ST8x4_UB(in0, in1, dst, 2 * dst_stride);
52 } else if (0 == height % 8) {
53 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
54 v8i16 in0, in1, in2, in3;
/* Main loop: 8 rows per iteration. */
57 for (loop_cnt = (height >> 3); loop_cnt--;) {
58 LD_SB8(src, src_stride,
59 src0, src1, src2, src3, src4, src5, src6, src7);
60 src += (8 * src_stride);
62 ILVR_W4_SB(src1, src0, src3, src2, src5, src4, src7, src6,
63 src0, src1, src2, src3);
64 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
/* Scale to HEVC intermediate precision: in{0..3} <<= 6. */
66 SLLI_4V(in0, in1, in2, in3, 6);
67 ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
68 dst += (8 * dst_stride);
/* Copy a 6-pel-wide block of 8-bit pixels to 16-bit dst, << 6 scaled.
 * Processes 8 rows per loop iteration; ST12x8_UB presumably stores the
 * 6 halfwords (12 bytes) per row — see generic_macros_msa.h.
 * NOTE(review): extract is missing interior lines; comments cover the
 * visible statements only. */
73 static void hevc_copy_6w_msa(uint8_t *src, int32_t src_stride,
74 int16_t *dst, int32_t dst_stride,
79 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
80 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
82 for (loop_cnt = (height >> 3); loop_cnt--;) {
83 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
84 src += (8 * src_stride);
/* Zero-extend each row's bytes to halfwords. */
86 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
88 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
/* Scale all 8 rows by << 6. */
90 SLLI_4V(in0, in1, in2, in3, 6);
91 SLLI_4V(in4, in5, in6, in7, 6);
92 ST12x8_UB(in0, in1, in2, in3, in4, in5, in6, in7, dst, 2 * dst_stride);
93 dst += (8 * dst_stride);
/* Copy an 8-pel-wide block of 8-bit pixels to 16-bit dst, << 6 scaled.
 * Specialised paths for heights 2, 4, 6 and multiples of 8.
 * NOTE(review): extract is missing interior lines (braces, declarations,
 * some shift lines); comments describe the visible code only. */
97 static void hevc_copy_8w_msa(uint8_t *src, int32_t src_stride,
98 int16_t *dst, int32_t dst_stride,
/* height == 2 path. */
107 LD_SB2(src, src_stride, src0, src1);
109 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
112 ST_SH2(in0, in1, dst, dst_stride);
113 } else if (4 == height) {
114 v16i8 src0, src1, src2, src3;
115 v8i16 in0, in1, in2, in3;
117 LD_SB4(src, src_stride, src0, src1, src2, src3);
119 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
121 SLLI_4V(in0, in1, in2, in3, 6);
122 ST_SH4(in0, in1, in2, in3, dst, dst_stride);
123 } else if (6 == height) {
124 v16i8 src0, src1, src2, src3, src4, src5;
125 v8i16 in0, in1, in2, in3, in4, in5;
127 LD_SB6(src, src_stride, src0, src1, src2, src3, src4, src5);
129 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
131 ILVR_B2_SH(zero, src4, zero, src5, in4, in5);
/* Shift for rows 4/5 is among the missing lines — TODO confirm against upstream. */
132 SLLI_4V(in0, in1, in2, in3, 6);
135 ST_SH6(in0, in1, in2, in3, in4, in5, dst, dst_stride);
136 } else if (0 == height % 8) {
138 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
139 v8i16 in0, in1, in2, in3, in4, in5, in6, in7;
/* Main loop: 8 rows per iteration. */
141 for (loop_cnt = (height >> 3); loop_cnt--;) {
142 LD_SB8(src, src_stride,
143 src0, src1, src2, src3, src4, src5, src6, src7);
144 src += (8 * src_stride);
146 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
148 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
150 SLLI_4V(in0, in1, in2, in3, 6);
151 SLLI_4V(in4, in5, in6, in7, 6);
152 ST_SH8(in0, in1, in2, in3, in4, in5, in6, in7, dst, dst_stride);
153 dst += (8 * dst_stride);
/* Copy a 12-pel-wide block to 16-bit dst, << 6 scaled: the left 8 pels
 * go out as full v8i16 stores, the right 4 pels via 8-byte stores at
 * dst + 8.  8 rows per loop iteration, split into two 4-row halves.
 * NOTE(review): extract is missing interior lines; comments describe
 * the visible code only. */
158 static void hevc_copy_12w_msa(uint8_t *src, int32_t src_stride,
159 int16_t *dst, int32_t dst_stride,
164 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
165 v8i16 in0, in1, in0_r, in1_r, in2_r, in3_r;
167 for (loop_cnt = (height >> 3); loop_cnt--;) {
168 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
169 src += (8 * src_stride);
/* First 4 rows: widen left 8 pels (right interleave with zero). */
171 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
172 in0_r, in1_r, in2_r, in3_r);
173 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
/* Gather the upper 4-pel remainders of row pairs, then widen. */
174 ILVL_W2_SB(src1, src0, src3, src2, src0, src1);
175 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
178 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
179 ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
180 dst += (4 * dst_stride);
/* Second 4 rows: same pattern for src4..src7. */
182 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
183 in0_r, in1_r, in2_r, in3_r);
184 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
185 ILVL_W2_SB(src5, src4, src7, src6, src0, src1);
186 ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
189 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
190 ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
191 dst += (4 * dst_stride);
/* Generic copy for widths that are multiples of 16: outer loop walks
 * 16-pel columns, inner loop converts 8 rows at a time to the << 6
 * 16-bit intermediate format.  Each 16-byte source row is split with
 * right/left byte interleaves into two v8i16 halves.
 * NOTE(review): extract is missing interior lines (parameter list tail,
 * src_tmp/dst_tmp setup); comments describe the visible code only. */
195 static void hevc_copy_16multx8mult_msa(uint8_t *src,
204 uint32_t loop_cnt, cnt;
206 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
207 v8i16 in0_r, in1_r, in2_r, in3_r;
208 v8i16 in0_l, in1_l, in2_l, in3_l;
/* One outer iteration per 16-pel column of the block. */
210 for (cnt = (width >> 4); cnt--;) {
214 for (loop_cnt = (height >> 3); loop_cnt--;) {
215 LD_SB8(src_tmp, src_stride,
216 src0, src1, src2, src3, src4, src5, src6, src7);
217 src_tmp += (8 * src_stride);
/* Rows 0-3: widen low/high 8 bytes separately, scale, store. */
219 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
220 in0_r, in1_r, in2_r, in3_r);
221 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
222 in0_l, in1_l, in2_l, in3_l);
223 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
224 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
225 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
226 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
227 dst_tmp += (4 * dst_stride);
/* Rows 4-7: identical treatment. */
229 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
230 in0_r, in1_r, in2_r, in3_r);
231 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
232 in0_l, in1_l, in2_l, in3_l);
233 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
234 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
235 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst_tmp, dst_stride);
236 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst_tmp + 8), dst_stride);
237 dst_tmp += (4 * dst_stride);
/* Copy a 16-pel-wide block to 16-bit dst, << 6 scaled.  Dedicated paths
 * for height 4, height 12 and multiples of 8; each 16-byte row is split
 * by right/left interleave-with-zero into two v8i16 halves stored at
 * dst and dst + 8.
 * NOTE(review): extract is missing interior lines (braces, height test);
 * comments describe the visible code only. */
245 static void hevc_copy_16w_msa(uint8_t *src, int32_t src_stride,
246 int16_t *dst, int32_t dst_stride,
/* height == 4 path. */
252 v16i8 src0, src1, src2, src3;
253 v8i16 in0_r, in1_r, in2_r, in3_r;
254 v8i16 in0_l, in1_l, in2_l, in3_l;
256 LD_SB4(src, src_stride, src0, src1, src2, src3);
258 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
259 in0_r, in1_r, in2_r, in3_r);
260 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
261 in0_l, in1_l, in2_l, in3_l);
262 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
263 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
264 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
265 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
266 } else if (12 == height) {
267 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
268 v16i8 src8, src9, src10, src11;
269 v8i16 in0_r, in1_r, in2_r, in3_r;
270 v8i16 in0_l, in1_l, in2_l, in3_l;
/* height == 12: 8 rows + 4 rows loaded up front, emitted in 4-row groups. */
272 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
273 src += (8 * src_stride);
274 LD_SB4(src, src_stride, src8, src9, src10, src11);
276 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
277 in0_r, in1_r, in2_r, in3_r);
278 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
279 in0_l, in1_l, in2_l, in3_l);
280 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
281 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
282 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
283 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
284 dst += (4 * dst_stride);
286 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
287 in0_r, in1_r, in2_r, in3_r);
288 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
289 in0_l, in1_l, in2_l, in3_l);
290 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
291 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
292 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
293 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
294 dst += (4 * dst_stride);
296 ILVR_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
297 in0_r, in1_r, in2_r, in3_r);
298 ILVL_B4_SH(zero, src8, zero, src9, zero, src10, zero, src11,
299 in0_l, in1_l, in2_l, in3_l);
300 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
301 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
302 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
303 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
304 } else if (0 == (height % 8)) {
306 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
307 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
/* Generic multiple-of-8 height loop: 8 rows per iteration. */
309 for (loop_cnt = (height >> 3); loop_cnt--;) {
310 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6,
312 src += (8 * src_stride);
313 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r,
314 in1_r, in2_r, in3_r);
315 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l,
316 in1_l, in2_l, in3_l);
317 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
318 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
319 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
320 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
321 dst += (4 * dst_stride);
323 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r,
324 in1_r, in2_r, in3_r);
325 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l,
326 in1_l, in2_l, in3_l);
327 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
328 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
329 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
330 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
331 dst += (4 * dst_stride);
/* Copy a 24-pel-wide block to 16-bit dst, << 6 scaled, 4 rows per
 * iteration.  Left 16 pels use both interleave halves; the extra 8
 * pels (src + 16) only need the right (low) half.
 * NOTE(review): extract is missing interior lines; comments describe
 * the visible code only. */
336 static void hevc_copy_24w_msa(uint8_t *src, int32_t src_stride,
337 int16_t *dst, int32_t dst_stride,
342 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
343 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
345 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* src0..3: leftmost 16 pels of 4 rows; src4..7: remaining 8 pels. */
346 LD_SB4(src, src_stride, src0, src1, src2, src3);
347 LD_SB4((src + 16), src_stride, src4, src5, src6, src7);
348 src += (4 * src_stride);
349 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
351 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
353 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
354 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
355 ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
356 ST_SH4(in0_l, in1_l, in2_l, in3_l, (dst + 8), dst_stride);
/* Tail 8 pels of each row. */
357 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
359 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
360 ST_SH4(in0_r, in1_r, in2_r, in3_r, (dst + 16), dst_stride);
361 dst += (4 * dst_stride);
/* Copy a 32-pel-wide block to 16-bit dst, << 6 scaled, 4 rows per
 * iteration.  Even-numbered src vectors hold the left 16 pels of a
 * row, odd-numbered the right 16; output rows are written as four
 * consecutive v8i16 stores 8 halfwords apart.
 * NOTE(review): extract is missing interior lines (dst advances between
 * the ST_SH4 pairs are not visible); comments cover visible code only. */
365 static void hevc_copy_32w_msa(uint8_t *src, int32_t src_stride,
366 int16_t *dst, int32_t dst_stride,
371 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
372 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
374 for (loop_cnt = (height >> 2); loop_cnt--;) {
375 LD_SB4(src, src_stride, src0, src2, src4, src6);
376 LD_SB4((src + 16), src_stride, src1, src3, src5, src7);
377 src += (4 * src_stride);
/* Rows 0-1 (src0..3): widen, scale, store 32 halfwords per row. */
379 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_r, in1_r,
381 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3, in0_l, in1_l,
383 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
384 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
385 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
387 ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
/* Rows 2-3 (src4..7): same treatment. */
390 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_r, in1_r,
392 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7, in0_l, in1_l,
394 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
395 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
396 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
398 ST_SH4(in2_r, in2_l, in3_r, in3_l, dst, 8);
/* Copy a 48-pel-wide block to 16-bit dst, << 6 scaled, 4 rows per
 * iteration.  Each row is three 16-byte loads; rows are converted in
 * two 6-vector batches and written with ST_SH6.
 * NOTE(review): extract is missing interior lines (src/dst advances
 * between loads/stores are not visible); comments cover visible code
 * only. */
403 static void hevc_copy_48w_msa(uint8_t *src, int32_t src_stride,
404 int16_t *dst, int32_t dst_stride,
409 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
410 v16i8 src8, src9, src10, src11;
411 v8i16 in0_r, in1_r, in2_r, in3_r, in4_r, in5_r;
412 v8i16 in0_l, in1_l, in2_l, in3_l, in4_l, in5_l;
414 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Four rows, three 16-byte vectors each. */
415 LD_SB3(src, 16, src0, src1, src2);
417 LD_SB3(src, 16, src3, src4, src5);
419 LD_SB3(src, 16, src6, src7, src8);
421 LD_SB3(src, 16, src9, src10, src11);
/* Rows 0-1: widen to 12 halfword vectors, scale, store. */
424 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
425 in0_r, in1_r, in2_r, in3_r);
426 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
427 in0_l, in1_l, in2_l, in3_l);
428 ILVR_B2_SH(zero, src4, zero, src5, in4_r, in5_r);
429 ILVL_B2_SH(zero, src4, zero, src5, in4_l, in5_l);
430 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
431 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
432 SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
433 ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
435 ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
/* Rows 2-3: same treatment for src6..src11. */
438 ILVR_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
439 in0_r, in1_r, in2_r, in3_r);
440 ILVL_B4_SH(zero, src6, zero, src7, zero, src8, zero, src9,
441 in0_l, in1_l, in2_l, in3_l);
442 ILVR_B2_SH(zero, src10, zero, src11, in4_r, in5_r);
443 ILVL_B2_SH(zero, src10, zero, src11, in4_l, in5_l);
444 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
445 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
446 SLLI_4V(in4_r, in5_r, in4_l, in5_l, 6);
447 ST_SH6(in0_r, in0_l, in1_r, in1_l, in2_r, in2_l, dst, 8);
449 ST_SH6(in3_r, in3_l, in4_r, in4_l, in5_r, in5_l, dst, 8);
/* Copy a 64-pel-wide block to 16-bit dst, << 6 scaled, 2 rows per
 * iteration.  Each row is four 16-byte loads widened into eight v8i16
 * vectors.
 * NOTE(review): extract is missing interior lines (src/dst advances
 * between the two row halves are not visible); comments cover visible
 * code only. */
454 static void hevc_copy_64w_msa(uint8_t *src, int32_t src_stride,
455 int16_t *dst, int32_t dst_stride,
460 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
461 v8i16 in0_r, in1_r, in2_r, in3_r, in0_l, in1_l, in2_l, in3_l;
463 for (loop_cnt = (height >> 1); loop_cnt--;) {
464 LD_SB4(src, 16, src0, src1, src2, src3);
466 LD_SB4(src, 16, src4, src5, src6, src7);
/* Row 0: widen, scale, store 64 halfwords. */
469 ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
470 in0_r, in1_r, in2_r, in3_r);
471 ILVL_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
472 in0_l, in1_l, in2_l, in3_l);
473 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
474 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
475 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
476 ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
/* Row 1: same treatment for src4..src7. */
479 ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
480 in0_r, in1_r, in2_r, in3_r);
481 ILVL_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
482 in0_l, in1_l, in2_l, in3_l);
483 SLLI_4V(in0_r, in1_r, in2_r, in3_r, 6);
484 SLLI_4V(in0_l, in1_l, in2_l, in3_l, 6);
485 ST_SH4(in0_r, in0_l, in1_r, in1_l, dst, 8);
486 ST_SH4(in2_r, in2_l, in3_r, in3_l, (dst + 32), 8);
/* Horizontal 8-tap luma filter, 4-pel width: filters 8 rows per
 * iteration (two rows packed per source vector via the cross-vector
 * shuffle mask) into 16-bit intermediates.  Source pels are XORed with
 * 128 so the unsigned bytes can feed signed dot products; const_vec
 * presumably compensates for that bias — the shift/assignment lines
 * are among the missing lines, so confirm against upstream.
 * NOTE(review): extract is missing interior lines (mask1..3 setup,
 * dstN = const_vec inits); comments describe visible code only. */
491 static void hevc_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
492 int16_t *dst, int32_t dst_stride,
493 const int8_t *filter, int32_t height)
496 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
497 v8i16 filt0, filt1, filt2, filt3;
498 v16i8 mask1, mask2, mask3;
499 v16i8 vec0, vec1, vec2, vec3;
500 v8i16 dst0, dst1, dst2, dst3;
501 v8i16 filter_vec, const_vec;
/* mask picks bytes from BOTH input vectors (indices >= 16 address the
 * second operand), so each VSHF pairs two 4-pel rows. */
502 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
505 const_vec = __msa_ldi_h(128);
/* Broadcast the four 16-bit tap pairs out of the filter vector. */
508 filter_vec = LD_SH(filter);
509 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
515 for (loop_cnt = (height >> 3); loop_cnt--;) {
516 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
517 src += (8 * src_stride);
/* Flip sign bit: unsigned pel -> signed operand for DPADD_SB. */
518 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
520 VSHF_B4_SB(src0, src1, mask0, mask1, mask2, mask3,
521 vec0, vec1, vec2, vec3);
523 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
524 dst0, dst0, dst0, dst0);
525 VSHF_B4_SB(src2, src3, mask0, mask1, mask2, mask3,
526 vec0, vec1, vec2, vec3);
528 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
529 dst1, dst1, dst1, dst1);
530 VSHF_B4_SB(src4, src5, mask0, mask1, mask2, mask3,
531 vec0, vec1, vec2, vec3);
533 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
534 dst2, dst2, dst2, dst2);
535 VSHF_B4_SB(src6, src7, mask0, mask1, mask2, mask3,
536 vec0, vec1, vec2, vec3);
538 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
539 dst3, dst3, dst3, dst3);
541 ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
542 dst += (8 * dst_stride);
/* Horizontal 8-tap luma filter, 8-pel width: 4 rows per iteration,
 * one source vector per row, 16-bit intermediate output.  Pels are
 * biased by XOR 128 for the signed dot products; const_vec presumably
 * compensates (its use is among the missing lines — confirm upstream).
 * NOTE(review): extract is missing interior lines (mask1..3 setup,
 * dstN = const_vec inits); comments describe visible code only. */
546 static void hevc_hz_8t_8w_msa(uint8_t *src, int32_t src_stride,
547 int16_t *dst, int32_t dst_stride,
548 const int8_t *filter, int32_t height)
551 v16i8 src0, src1, src2, src3;
552 v8i16 filt0, filt1, filt2, filt3;
553 v16i8 mask1, mask2, mask3;
554 v16i8 vec0, vec1, vec2, vec3;
555 v8i16 dst0, dst1, dst2, dst3;
556 v8i16 filter_vec, const_vec;
/* Sliding byte-pair pattern for the first filter tap pair. */
557 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
560 const_vec = __msa_ldi_h(128);
563 filter_vec = LD_SH(filter);
564 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
570 for (loop_cnt = (height >> 2); loop_cnt--;) {
571 LD_SB4(src, src_stride, src0, src1, src2, src3);
572 src += (4 * src_stride);
/* Flip sign bit for signed dot product. */
573 XORI_B4_128_SB(src0, src1, src2, src3);
575 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
576 vec0, vec1, vec2, vec3);
578 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
579 dst0, dst0, dst0, dst0);
580 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
581 vec0, vec1, vec2, vec3);
583 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
584 dst1, dst1, dst1, dst1);
585 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
586 vec0, vec1, vec2, vec3);
588 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
589 dst2, dst2, dst2, dst2);
590 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
591 vec0, vec1, vec2, vec3);
593 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
594 dst3, dst3, dst3, dst3);
596 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
597 dst += (4 * dst_stride);
601 static void hevc_hz_8t_12w_msa(uint8_t *src, int32_t src_stride,
602 int16_t *dst, int32_t dst_stride,
603 const int8_t *filter, int32_t height)
605 hevc_hz_8t_8w_msa(src, src_stride, dst, dst_stride, filter, height);
606 hevc_hz_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride, filter, height);
/* Horizontal 8-tap luma filter, 16-pel width: 4 rows per iteration.
 * Each row needs two source vectors (offset 0 and +8) so the sliding
 * 8-tap window covers all 16 output pels; even dstN hold the left 8
 * results, odd dstN the right 8.
 * NOTE(review): extract is missing interior lines (mask1..3 setup,
 * dstN = const_vec inits); comments describe visible code only. */
609 static void hevc_hz_8t_16w_msa(uint8_t *src, int32_t src_stride,
610 int16_t *dst, int32_t dst_stride,
611 const int8_t *filter, int32_t height)
614 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
615 v8i16 filt0, filt1, filt2, filt3;
616 v16i8 mask1, mask2, mask3;
617 v16i8 vec0, vec1, vec2, vec3;
618 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
619 v8i16 filter_vec, const_vec;
620 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
623 const_vec = __msa_ldi_h(128);
626 filter_vec = LD_SH(filter);
627 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
633 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* Even src: left half of each row; odd src: row + 8 for the right half. */
634 LD_SB4(src, src_stride, src0, src2, src4, src6);
635 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
636 src += (4 * src_stride);
637 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
639 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
640 vec0, vec1, vec2, vec3);
642 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
643 dst0, dst0, dst0, dst0);
644 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
645 vec0, vec1, vec2, vec3);
647 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
648 dst1, dst1, dst1, dst1);
649 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
650 vec0, vec1, vec2, vec3);
652 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
653 dst2, dst2, dst2, dst2);
654 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
655 vec0, vec1, vec2, vec3);
657 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
658 dst3, dst3, dst3, dst3);
659 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
660 vec0, vec1, vec2, vec3);
662 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
663 dst4, dst4, dst4, dst4);
664 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
665 vec0, vec1, vec2, vec3);
667 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
668 dst5, dst5, dst5, dst5);
669 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
670 vec0, vec1, vec2, vec3);
672 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
673 dst6, dst6, dst6, dst6);
674 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
675 vec0, vec1, vec2, vec3);
677 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
678 dst7, dst7, dst7, dst7);
/* Left halves at dst, right halves at dst + 8. */
680 ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
681 ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
682 dst += (4 * dst_stride);
/* Horizontal 8-tap luma filter, 24-pel width: 2 rows per iteration.
 * mask4..7 (setup lines missing — presumably mask0..3 + 8, addressing
 * the second VSHF operand) produce the middle 8 outputs that straddle
 * the two 16-byte loads of a row.
 * NOTE(review): extract is missing interior lines (mask setup, src/dst
 * advances, dstN = const_vec inits); comments cover visible code only. */
686 static void hevc_hz_8t_24w_msa(uint8_t *src, int32_t src_stride,
687 int16_t *dst, int32_t dst_stride,
688 const int8_t *filter, int32_t height)
691 v16i8 src0, src1, src2, src3;
692 v8i16 filt0, filt1, filt2, filt3;
693 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
694 v16i8 vec0, vec1, vec2, vec3;
695 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
696 v8i16 filter_vec, const_vec;
697 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
700 filter_vec = LD_SH(filter);
701 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
711 const_vec = __msa_ldi_h(128);
714 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* Two rows, two 16-byte vectors each. */
715 LD_SB2(src, 16, src0, src1);
717 LD_SB2(src, 16, src2, src3);
719 XORI_B4_128_SB(src0, src1, src2, src3);
/* Row 0: pels 0-7. */
721 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
722 vec0, vec1, vec2, vec3);
724 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
725 dst0, dst0, dst0, dst0);
/* Row 0: pels 8-15, window spans src0|src1. */
726 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
727 vec0, vec1, vec2, vec3);
729 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
730 dst1, dst1, dst1, dst1);
/* Row 0: pels 16-23. */
731 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
732 vec0, vec1, vec2, vec3);
734 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
735 dst2, dst2, dst2, dst2);
/* Row 1: same three segments from src2/src3. */
736 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
737 vec0, vec1, vec2, vec3);
739 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
740 dst3, dst3, dst3, dst3);
741 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
742 vec0, vec1, vec2, vec3);
744 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
745 dst4, dst4, dst4, dst4);
746 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
747 vec0, vec1, vec2, vec3);
749 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
750 dst5, dst5, dst5, dst5);
752 ST_SH2(dst0, dst1, dst, 8);
753 ST_SH(dst2, dst + 16);
755 ST_SH2(dst3, dst4, dst, 8);
756 ST_SH(dst5, dst + 16);
/* Horizontal 8-tap luma filter, 32-pel width: one row per iteration.
 * src2 is loaded at +24 so the last 8-pel segment has its full 7-pel
 * right context in one vector; mask4..7 (setup missing) handle the
 * segment straddling src0|src1.
 * NOTE(review): extract is missing interior lines (mask setup, src/dst
 * advances, dstN = const_vec inits); comments cover visible code only. */
761 static void hevc_hz_8t_32w_msa(uint8_t *src, int32_t src_stride,
762 int16_t *dst, int32_t dst_stride,
763 const int8_t *filter, int32_t height)
766 v16i8 src0, src1, src2;
767 v8i16 filt0, filt1, filt2, filt3;
768 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
769 v16i8 vec0, vec1, vec2, vec3;
770 v8i16 dst0, dst1, dst2, dst3;
771 v8i16 filter_vec, const_vec;
772 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
775 filter_vec = LD_SH(filter);
776 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
786 const_vec = __msa_ldi_h(128);
789 for (loop_cnt = height; loop_cnt--;) {
790 LD_SB2(src, 16, src0, src1);
/* Overlapping load at +24 covers output pels 24-31. */
791 src2 = LD_SB(src + 24);
793 XORI_B3_128_SB(src0, src1, src2);
795 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
796 vec0, vec1, vec2, vec3);
798 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
799 dst0, dst0, dst0, dst0);
800 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
801 vec0, vec1, vec2, vec3);
803 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
804 dst1, dst1, dst1, dst1);
805 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
806 vec0, vec1, vec2, vec3);
808 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
809 dst2, dst2, dst2, dst2);
810 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
811 vec0, vec1, vec2, vec3);
813 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
814 dst3, dst3, dst3, dst3);
816 ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
/* Horizontal 8-tap luma filter, 48-pel width: one row per iteration,
 * six 8-pel output segments.  src3 is loaded at +40 so the last
 * segment has full right context; mask4..7 (setup missing) handle the
 * segments straddling adjacent 16-byte loads.
 * NOTE(review): extract is missing interior lines (mask setup, src/dst
 * advances, dstN = const_vec inits); comments cover visible code only. */
821 static void hevc_hz_8t_48w_msa(uint8_t *src, int32_t src_stride,
822 int16_t *dst, int32_t dst_stride,
823 const int8_t *filter, int32_t height)
826 v16i8 src0, src1, src2, src3;
827 v8i16 filt0, filt1, filt2, filt3;
828 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
829 v16i8 vec0, vec1, vec2, vec3;
830 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
831 v8i16 filter_vec, const_vec;
832 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
835 filter_vec = LD_SH(filter);
836 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
846 const_vec = __msa_ldi_h(128);
849 for (loop_cnt = height; loop_cnt--;) {
850 LD_SB3(src, 16, src0, src1, src2);
/* Overlapping load at +40 covers output pels 40-47. */
851 src3 = LD_SB(src + 40);
853 XORI_B4_128_SB(src0, src1, src2, src3);
855 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
856 vec0, vec1, vec2, vec3);
858 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
859 dst0, dst0, dst0, dst0);
860 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
861 vec0, vec1, vec2, vec3);
863 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
864 dst1, dst1, dst1, dst1);
865 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
866 vec0, vec1, vec2, vec3);
868 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
869 dst2, dst2, dst2, dst2);
870 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
871 vec0, vec1, vec2, vec3);
873 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
874 dst3, dst3, dst3, dst3);
875 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
876 vec0, vec1, vec2, vec3);
878 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
879 dst4, dst4, dst4, dst4);
880 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
881 vec0, vec1, vec2, vec3);
883 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
884 dst5, dst5, dst5, dst5);
886 ST_SH6(dst0, dst1, dst2, dst3, dst4, dst5, dst, 8);
/* Horizontal 8-tap luma filter, 64-pel width: one row per iteration,
 * eight 8-pel segments, each stored as soon as it is computed.  src4
 * is loaded at +56 for the last segment's right context; mask4..7
 * (setup missing) handle segments straddling adjacent loads.
 * NOTE(review): extract is missing interior lines (mask setup, src/dst
 * advances, dstN = const_vec inits, ST_SH(dst0, dst)); comments cover
 * visible code only. */
891 static void hevc_hz_8t_64w_msa(uint8_t *src, int32_t src_stride,
892 int16_t *dst, int32_t dst_stride,
893 const int8_t *filter, int32_t height)
896 v16i8 src0, src1, src2, src3, src4;
897 v8i16 filt0, filt1, filt2, filt3;
898 v16i8 mask1, mask2, mask3, mask4, mask5, mask6, mask7;
899 v16i8 vec0, vec1, vec2, vec3;
900 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
901 v8i16 filter_vec, const_vec;
902 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
906 filter_vec = LD_SH(filter);
907 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
917 const_vec = __msa_ldi_h(128);
920 for (loop_cnt = height; loop_cnt--;) {
921 LD_SB4(src, 16, src0, src1, src2, src3);
/* Overlapping load at +56 covers output pels 56-63. */
922 src4 = LD_SB(src + 56);
924 XORI_B5_128_SB(src0, src1, src2, src3, src4);
926 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
927 vec0, vec1, vec2, vec3);
929 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
930 dst0, dst0, dst0, dst0);
933 VSHF_B4_SB(src0, src1, mask4, mask5, mask6, mask7,
934 vec0, vec1, vec2, vec3);
936 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
937 dst1, dst1, dst1, dst1);
938 ST_SH(dst1, dst + 8);
940 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
941 vec0, vec1, vec2, vec3);
943 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
944 dst2, dst2, dst2, dst2);
945 ST_SH(dst2, dst + 16);
947 VSHF_B4_SB(src1, src2, mask4, mask5, mask6, mask7,
948 vec0, vec1, vec2, vec3);
950 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
951 dst3, dst3, dst3, dst3);
952 ST_SH(dst3, dst + 24);
954 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
955 vec0, vec1, vec2, vec3);
957 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
958 dst4, dst4, dst4, dst4);
959 ST_SH(dst4, dst + 32);
961 VSHF_B4_SB(src2, src3, mask4, mask5, mask6, mask7,
962 vec0, vec1, vec2, vec3);
964 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
965 dst5, dst5, dst5, dst5);
966 ST_SH(dst5, dst + 40);
968 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
969 vec0, vec1, vec2, vec3);
971 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
972 dst6, dst6, dst6, dst6);
973 ST_SH(dst6, dst + 48);
975 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
976 vec0, vec1, vec2, vec3);
978 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
979 dst7, dst7, dst7, dst7);
980 ST_SH(dst7, dst + 56);
/* Vertical 8-tap luma filter, 4-pel width: src is rewound 3 rows so the
 * 8-tap window is centred on the output row.  Adjacent rows are
 * interleaved byte-wise and packed two-rows-per-vector (srcNNNN names),
 * so each dot product filters two output rows at once; 8 output rows
 * per loop iteration.  Tail assignments carry the sliding window into
 * the next iteration.
 * NOTE(review): extract is missing interior lines (dstNN = const_vec
 * inits, src2110 update); comments describe visible code only. */
985 static void hevc_vt_8t_4w_msa(uint8_t *src, int32_t src_stride,
986 int16_t *dst, int32_t dst_stride,
987 const int8_t *filter, int32_t height)
990 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
991 v16i8 src9, src10, src11, src12, src13, src14;
992 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
993 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
994 v16i8 src1110_r, src1211_r, src1312_r, src1413_r;
995 v16i8 src2110, src4332, src6554, src8776, src10998;
996 v16i8 src12111110, src14131312;
997 v8i16 dst10, dst32, dst54, dst76;
998 v8i16 filt0, filt1, filt2, filt3;
999 v8i16 filter_vec, const_vec;
/* Back up 3 rows: filter taps span rows -3 .. +4 of the output row. */
1001 src -= (3 * src_stride);
1003 const_vec = __msa_ldi_h(128);
1006 filter_vec = LD_SH(filter);
1007 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: 7 context rows, interleaved and paired into d-words. */
1009 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1010 src += (7 * src_stride);
1011 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1012 src10_r, src32_r, src54_r, src21_r);
1013 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1014 ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
1015 src2110, src4332, src6554);
/* Sign-bias for the signed dot products. */
1016 XORI_B3_128_SB(src2110, src4332, src6554);
1018 for (loop_cnt = (height >> 3); loop_cnt--;) {
1019 LD_SB8(src, src_stride,
1020 src7, src8, src9, src10, src11, src12, src13, src14);
1021 src += (8 * src_stride);
1023 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1024 src76_r, src87_r, src98_r, src109_r);
1025 ILVR_B4_SB(src11, src10, src12, src11, src13, src12, src14, src13,
1026 src1110_r, src1211_r, src1312_r, src1413_r);
1027 ILVR_D4_SB(src87_r, src76_r, src109_r, src98_r,
1028 src1211_r, src1110_r, src1413_r, src1312_r,
1029 src8776, src10998, src12111110, src14131312);
1030 XORI_B4_128_SB(src8776, src10998, src12111110, src14131312);
/* Each DPADD filters two output rows through the 4 tap pairs. */
1033 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1034 filt0, filt1, filt2, filt3, dst10, dst10, dst10, dst10);
1036 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1037 filt0, filt1, filt2, filt3, dst32, dst32, dst32, dst32);
1039 DPADD_SB4_SH(src6554, src8776, src10998, src12111110,
1040 filt0, filt1, filt2, filt3, dst54, dst54, dst54, dst54);
1042 DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
1043 filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
1045 ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
1046 dst += (8 * dst_stride);
/* Slide the window: last three pair-vectors seed the next iteration. */
1049 src4332 = src12111110;
1050 src6554 = src14131312;
/* Vertical 8-tap luma filter, 8-pel width: src rewound 3 rows for the
 * centred 8-tap window; 4 output rows per iteration.  Adjacent source
 * rows are byte-interleaved (srcNM_r) so DPADD can apply the 4 tap
 * pairs per output row.
 * NOTE(review): extract is missing interior lines (dstN_r = const_vec
 * inits and the end-of-loop sliding-window reassignments); comments
 * describe visible code only. */
1055 static void hevc_vt_8t_8w_msa(uint8_t *src, int32_t src_stride,
1056 int16_t *dst, int32_t dst_stride,
1057 const int8_t *filter, int32_t height)
1060 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1061 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1062 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1063 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1064 v8i16 filter_vec, const_vec;
1065 v8i16 filt0, filt1, filt2, filt3;
/* Back up 3 rows: taps span rows -3 .. +4 of the output row. */
1067 src -= (3 * src_stride);
1068 const_vec = __msa_ldi_h(128);
1071 filter_vec = LD_SH(filter);
1072 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* Prologue: 7 context rows, sign-biased and pair-interleaved. */
1074 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1075 src += (7 * src_stride);
1076 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1077 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1078 src10_r, src32_r, src54_r, src21_r);
1079 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1081 for (loop_cnt = (height >> 2); loop_cnt--;) {
1082 LD_SB4(src, src_stride, src7, src8, src9, src10);
1083 src += (4 * src_stride);
1084 XORI_B4_128_SB(src7, src8, src9, src10);
1085 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1086 src76_r, src87_r, src98_r, src109_r);
/* One DPADD chain per output row, consuming 4 row-pair vectors. */
1089 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1090 filt0, filt1, filt2, filt3,
1091 dst0_r, dst0_r, dst0_r, dst0_r);
1093 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1094 filt0, filt1, filt2, filt3,
1095 dst1_r, dst1_r, dst1_r, dst1_r);
1097 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1098 filt0, filt1, filt2, filt3,
1099 dst2_r, dst2_r, dst2_r, dst2_r);
1101 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1102 filt0, filt1, filt2, filt3,
1103 dst3_r, dst3_r, dst3_r, dst3_r);
1105 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1106 dst += (4 * dst_stride);
/*
 * 8-tap vertical HEVC luma interpolation, 12-pixel-wide column.
 * The left 8 columns are filtered from the right-interleaved (_r)
 * vectors and stored with ST_SH4; the remaining 4 columns come from the
 * left-interleaved (_l) halves packed pairwise into srcNNNN vectors and
 * are stored 8 bytes at a time via ST8x4_UB at dst + 8.
 * NOTE(review): gappy extraction -- accumulator initialisations and the
 * end-of-loop register rotation are not visible here; confirm upstream.
 */
1118 static void hevc_vt_8t_12w_msa(uint8_t *src, int32_t src_stride,
1119 int16_t *dst, int32_t dst_stride,
1120 const int8_t *filter, int32_t height)
1123 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1124 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1125 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1126 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1127 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1128 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1129 v16i8 src2110, src4332, src6554, src8776, src10998;
1130 v8i16 dst0_l, dst1_l;
1131 v8i16 filter_vec, const_vec;
1132 v8i16 filt0, filt1, filt2, filt3;
1134 src -= (3 * src_stride);
1135 const_vec = __msa_ldi_h(128);
1138 filter_vec = LD_SH(filter);
1139 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* prologue: 7 context rows, interleaved both right (cols 0-7) and
 * left (cols 8-11, packed two rows per vector) */
1141 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1142 src += (7 * src_stride);
1143 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1144 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1145 src10_r, src32_r, src54_r, src21_r);
1146 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1147 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1148 src10_l, src32_l, src54_l, src21_l);
1149 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
1150 ILVR_D3_SB(src21_l, src10_l, src43_l, src32_l, src65_l, src54_l,
1151 src2110, src4332, src6554);
/* main loop: 4 output rows per iteration */
1153 for (loop_cnt = (height >> 2); loop_cnt--;) {
1154 LD_SB4(src, src_stride, src7, src8, src9, src10);
1155 src += (4 * src_stride);
1156 XORI_B4_128_SB(src7, src8, src9, src10);
1157 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1158 src76_r, src87_r, src98_r, src109_r);
1159 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1160 src76_l, src87_l, src98_l, src109_l);
1161 ILVR_D2_SB(src87_l, src76_l, src109_l, src98_l, src8776, src10998);
/* left 8 columns, rows 0..3 */
1164 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1165 filt0, filt1, filt2, filt3,
1166 dst0_r, dst0_r, dst0_r, dst0_r);
1168 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1169 filt0, filt1, filt2, filt3,
1170 dst1_r, dst1_r, dst1_r, dst1_r);
1172 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1173 filt0, filt1, filt2, filt3,
1174 dst2_r, dst2_r, dst2_r, dst2_r);
1176 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1177 filt0, filt1, filt2, filt3,
1178 dst3_r, dst3_r, dst3_r, dst3_r);
/* rightmost 4 columns, two rows per vector */
1180 DPADD_SB4_SH(src2110, src4332, src6554, src8776,
1181 filt0, filt1, filt2, filt3,
1182 dst0_l, dst0_l, dst0_l, dst0_l);
1184 DPADD_SB4_SH(src4332, src6554, src8776, src10998,
1185 filt0, filt1, filt2, filt3,
1186 dst1_l, dst1_l, dst1_l, dst1_l);
1188 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
1189 ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
1190 dst += (4 * dst_stride);
/*
 * 8-tap vertical HEVC luma interpolation for widths that are a multiple
 * of 16.  Outer loop walks 16-column stripes (cnt = width >> 4) using
 * src_tmp/dst_tmp cursors; inner loop produces 4 output rows per pass,
 * filtering the low 8 columns from the _r interleaves and the high 8
 * columns from the _l interleaves.
 * NOTE(review): gappy extraction -- the src_tmp/dst_tmp setup per
 * stripe, accumulator initialisations and register-rotation tail are
 * not visible here; confirm against upstream before editing.
 */
1205 static void hevc_vt_8t_16multx4mult_msa(uint8_t *src,
1209 const int8_t *filter,
1215 int32_t loop_cnt, cnt;
1216 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
1217 v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
1218 v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
1219 v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
1220 v16i8 src10_l, src32_l, src54_l, src76_l, src98_l;
1221 v16i8 src21_l, src43_l, src65_l, src87_l, src109_l;
1222 v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
1223 v8i16 filter_vec, const_vec;
1224 v8i16 filt0, filt1, filt2, filt3;
1226 src -= (3 * src_stride);
1227 const_vec = __msa_ldi_h(128);
1230 filter_vec = LD_SH(filter);
1231 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* one pass per 16-column stripe */
1233 for (cnt = width >> 4; cnt--;) {
1237 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1238 src_tmp += (7 * src_stride);
1239 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1240 ILVR_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1241 src10_r, src32_r, src54_r, src21_r);
1242 ILVR_B2_SB(src4, src3, src6, src5, src43_r, src65_r);
1243 ILVL_B4_SB(src1, src0, src3, src2, src5, src4, src2, src1,
1244 src10_l, src32_l, src54_l, src21_l);
1245 ILVL_B2_SB(src4, src3, src6, src5, src43_l, src65_l);
/* 4 output rows per iteration */
1247 for (loop_cnt = (height >> 2); loop_cnt--;) {
1248 LD_SB4(src_tmp, src_stride, src7, src8, src9, src10);
1249 src_tmp += (4 * src_stride);
1250 XORI_B4_128_SB(src7, src8, src9, src10);
1251 ILVR_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1252 src76_r, src87_r, src98_r, src109_r);
1253 ILVL_B4_SB(src7, src6, src8, src7, src9, src8, src10, src9,
1254 src76_l, src87_l, src98_l, src109_l);
/* low 8 columns */
1257 DPADD_SB4_SH(src10_r, src32_r, src54_r, src76_r,
1258 filt0, filt1, filt2, filt3,
1259 dst0_r, dst0_r, dst0_r, dst0_r);
1261 DPADD_SB4_SH(src21_r, src43_r, src65_r, src87_r,
1262 filt0, filt1, filt2, filt3,
1263 dst1_r, dst1_r, dst1_r, dst1_r);
1265 DPADD_SB4_SH(src32_r, src54_r, src76_r, src98_r,
1266 filt0, filt1, filt2, filt3,
1267 dst2_r, dst2_r, dst2_r, dst2_r);
1269 DPADD_SB4_SH(src43_r, src65_r, src87_r, src109_r,
1270 filt0, filt1, filt2, filt3,
1271 dst3_r, dst3_r, dst3_r, dst3_r);
/* high 8 columns */
1273 DPADD_SB4_SH(src10_l, src32_l, src54_l, src76_l,
1274 filt0, filt1, filt2, filt3,
1275 dst0_l, dst0_l, dst0_l, dst0_l);
1277 DPADD_SB4_SH(src21_l, src43_l, src65_l, src87_l,
1278 filt0, filt1, filt2, filt3,
1279 dst1_l, dst1_l, dst1_l, dst1_l);
1281 DPADD_SB4_SH(src32_l, src54_l, src76_l, src98_l,
1282 filt0, filt1, filt2, filt3,
1283 dst2_l, dst2_l, dst2_l, dst2_l);
1285 DPADD_SB4_SH(src43_l, src65_l, src87_l, src109_l,
1286 filt0, filt1, filt2, filt3,
1287 dst3_l, dst3_l, dst3_l, dst3_l);
1289 ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst_tmp, dst_stride);
1290 ST_SH4(dst0_l, dst1_l, dst2_l, dst3_l, dst_tmp + 8, dst_stride);
1291 dst_tmp += (4 * dst_stride);
/* 8-tap vertical filter, 16 columns: one pass of the generic
 * 16-multiple implementation with width fixed to 16. */
1313 static void hevc_vt_8t_16w_msa(uint8_t *src, int32_t src_stride,
1314 int16_t *dst, int32_t dst_stride,
1315 const int8_t *filter, int32_t height)
1317 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1318 filter, height, 16);
/* 8-tap vertical filter, 24 columns: 16-wide generic pass for columns
 * 0-15, then the 8-wide kernel for columns 16-23. */
1321 static void hevc_vt_8t_24w_msa(uint8_t *src, int32_t src_stride,
1322 int16_t *dst, int32_t dst_stride,
1323 const int8_t *filter, int32_t height)
1325 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1326 filter, height, 16);
1327 hevc_vt_8t_8w_msa(src + 16, src_stride, dst + 16, dst_stride,
/* 8-tap vertical filter, 32 columns: generic 16-multiple kernel. */
1331 static void hevc_vt_8t_32w_msa(uint8_t *src, int32_t src_stride,
1332 int16_t *dst, int32_t dst_stride,
1333 const int8_t *filter, int32_t height)
1335 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1336 filter, height, 32);
/* 8-tap vertical filter, 48 columns: generic 16-multiple kernel. */
1339 static void hevc_vt_8t_48w_msa(uint8_t *src, int32_t src_stride,
1340 int16_t *dst, int32_t dst_stride,
1341 const int8_t *filter, int32_t height)
1343 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1344 filter, height, 48);
/* 8-tap vertical filter, 64 columns: generic 16-multiple kernel. */
1347 static void hevc_vt_8t_64w_msa(uint8_t *src, int32_t src_stride,
1348 int16_t *dst, int32_t dst_stride,
1349 const int8_t *filter, int32_t height)
1351 hevc_vt_8t_16multx4mult_msa(src, src_stride, dst, dst_stride,
1352 filter, height, 64);
/*
 * 8-tap horizontal + 8-tap vertical (2-D) HEVC luma interpolation,
 * 4-pixel-wide column.  Horizontal taps come from filter_x (16-bit
 * splats); vertical taps from filter_y, sign-extended to 32-bit lanes.
 * Two source rows are packed per horizontal pass (src0/src3, src1/src4,
 * ...), and the inner loop emits 2 output rows per iteration, combining
 * the 16-bit horizontal results with HEVC_FILT_8TAP in 32-bit precision.
 * NOTE(review): gappy extraction -- e.g. mask1..3 derivation from mask0
 * and the post-loop dstNN_r rotation are not visible here; confirm
 * against upstream before editing.
 */
1355 static void hevc_hv_8t_4w_msa(uint8_t *src, int32_t src_stride,
1356 int16_t *dst, int32_t dst_stride,
1357 const int8_t *filter_x, const int8_t *filter_y,
1361 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1362 v8i16 filt0, filt1, filt2, filt3;
1363 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1364 v16i8 mask1, mask2, mask3;
1365 v8i16 filter_vec, const_vec;
1366 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1367 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1368 v8i16 dst30, dst41, dst52, dst63, dst66, dst87;
1369 v4i32 dst0_r, dst1_r;
1370 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1371 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1373 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20
1375 v8u16 mask4 = { 0, 4, 1, 5, 2, 6, 3, 7 };
/* rewind 3 rows up and 3 columns left for the 2-D 8-tap support */
1377 src -= ((3 * src_stride) + 3);
1378 filter_vec = LD_SH(filter_x);
1379 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend the vertical taps to 16-bit pairs before splatting
 * them into 32-bit lanes */
1381 filter_vec = LD_SH(filter_y);
1382 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1383 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1385 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1391 const_vec = __msa_ldi_h(128);
/* prologue: horizontal filter over the 7 context rows, two rows
 * packed per destination vector (dst30 = rows 3&0, etc.) */
1394 LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
1395 src += (7 * src_stride);
1396 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1398 VSHF_B4_SB(src0, src3, mask0, mask1, mask2, mask3, vec0, vec1, vec2, vec3);
1399 VSHF_B4_SB(src1, src4, mask0, mask1, mask2, mask3, vec4, vec5, vec6, vec7);
1400 VSHF_B4_SB(src2, src5, mask0, mask1, mask2, mask3,
1401 vec8, vec9, vec10, vec11);
1402 VSHF_B4_SB(src3, src6, mask0, mask1, mask2, mask3,
1403 vec12, vec13, vec14, vec15);
1405 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1406 dst30, dst30, dst30, dst30);
1408 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1409 dst41, dst41, dst41, dst41);
1411 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1412 dst52, dst52, dst52, dst52);
1414 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1415 dst63, dst63, dst63, dst63);
/* re-interleave horizontal results into row-pair order for the
 * vertical pass */
1417 ILVR_H3_SH(dst41, dst30, dst52, dst41, dst63, dst52,
1418 dst10_r, dst21_r, dst32_r);
1419 dst43_r = __msa_ilvl_h(dst41, dst30);
1420 dst54_r = __msa_ilvl_h(dst52, dst41);
1421 dst65_r = __msa_ilvl_h(dst63, dst52);
1422 dst66 = (v8i16) __msa_splati_d((v2i64) dst63, 1);
/* main loop: 2 output rows per iteration */
1424 for (loop_cnt = height >> 1; loop_cnt--;) {
1425 LD_SB2(src, src_stride, src7, src8);
1426 src += (2 * src_stride);
1427 XORI_B2_128_SB(src7, src8);
1429 VSHF_B4_SB(src7, src8, mask0, mask1, mask2, mask3,
1430 vec0, vec1, vec2, vec3);
1432 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1433 dst87, dst87, dst87, dst87);
1434 dst76_r = __msa_ilvr_h(dst87, dst66);
1435 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1436 filt_h0, filt_h1, filt_h2, filt_h3);
1437 dst87_r = __msa_vshf_h((v8i16) mask4, dst87, dst87);
1438 dst1_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1439 filt_h0, filt_h1, filt_h2, filt_h3);
1443 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
1444 ST8x2_UB(dst0_r, dst, (2 * dst_stride));
1445 dst += (2 * dst_stride);
/* carry row 8 forward as the new "row 6" for the next iteration */
1453 dst66 = (v8i16) __msa_splati_d((v2i64) dst87, 1);
/*
 * 8-tap horizontal + 8-tap vertical (2-D) HEVC luma interpolation for
 * widths that are a multiple of 8.  Outer loop walks 8-column stripes;
 * per stripe, the 7 context rows are horizontally filtered into 16-bit
 * intermediates (dst0..dst6), then the inner loop emits 2 output rows
 * per iteration by applying HEVC_FILT_8TAP vertically on the right/left
 * halves of the interleaved intermediates.
 * NOTE(review): gappy extraction -- mask1..3 derivation, src_tmp/dst_tmp
 * stripe setup, accumulator initialisations and the end-of-loop
 * intermediate rotation are not visible here; confirm upstream.
 */
1457 static void hevc_hv_8t_8multx2mult_msa(uint8_t *src,
1461 const int8_t *filter_x,
1462 const int8_t *filter_y,
1463 int32_t height, int32_t width)
1465 uint32_t loop_cnt, cnt;
1468 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
1469 v8i16 filt0, filt1, filt2, filt3;
1470 v4i32 filt_h0, filt_h1, filt_h2, filt_h3;
1471 v16i8 mask1, mask2, mask3;
1472 v8i16 filter_vec, const_vec;
1473 v16i8 vec0, vec1, vec2, vec3, vec4, vec5, vec6, vec7;
1474 v16i8 vec8, vec9, vec10, vec11, vec12, vec13, vec14, vec15;
1475 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
1476 v4i32 dst0_r, dst0_l;
1477 v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
1478 v8i16 dst10_l, dst32_l, dst54_l, dst76_l;
1479 v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
1480 v8i16 dst21_l, dst43_l, dst65_l, dst87_l;
1481 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
/* rewind 3 rows up and 3 columns left for the 2-D 8-tap support */
1483 src -= ((3 * src_stride) + 3);
1484 filter_vec = LD_SH(filter_x);
1485 SPLATI_H4_SH(filter_vec, 0, 1, 2, 3, filt0, filt1, filt2, filt3);
/* sign-extend vertical taps and splat into 32-bit lanes */
1487 filter_vec = LD_SH(filter_y);
1488 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
1489 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
1491 SPLATI_W4_SW(filter_vec, filt_h0, filt_h1, filt_h2, filt_h3);
1497 const_vec = __msa_ldi_h(128);
/* one pass per 8-column stripe */
1500 for (cnt = width >> 3; cnt--;) {
1504 LD_SB7(src_tmp, src_stride, src0, src1, src2, src3, src4, src5, src6);
1505 src_tmp += (7 * src_stride);
1506 XORI_B7_128_SB(src0, src1, src2, src3, src4, src5, src6);
1508 /* row 0 row 1 row 2 row 3 */
1509 VSHF_B4_SB(src0, src0, mask0, mask1, mask2, mask3,
1510 vec0, vec1, vec2, vec3);
1511 VSHF_B4_SB(src1, src1, mask0, mask1, mask2, mask3,
1512 vec4, vec5, vec6, vec7);
1513 VSHF_B4_SB(src2, src2, mask0, mask1, mask2, mask3,
1514 vec8, vec9, vec10, vec11);
1515 VSHF_B4_SB(src3, src3, mask0, mask1, mask2, mask3,
1516 vec12, vec13, vec14, vec15);
1518 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1519 dst0, dst0, dst0, dst0);
1521 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1522 dst1, dst1, dst1, dst1);
1524 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1525 dst2, dst2, dst2, dst2);
1527 DPADD_SB4_SH(vec12, vec13, vec14, vec15, filt0, filt1, filt2, filt3,
1528 dst3, dst3, dst3, dst3);
1530 /* row 4 row 5 row 6 */
1531 VSHF_B4_SB(src4, src4, mask0, mask1, mask2, mask3,
1532 vec0, vec1, vec2, vec3);
1533 VSHF_B4_SB(src5, src5, mask0, mask1, mask2, mask3,
1534 vec4, vec5, vec6, vec7);
1535 VSHF_B4_SB(src6, src6, mask0, mask1, mask2, mask3,
1536 vec8, vec9, vec10, vec11);
1538 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1539 dst4, dst4, dst4, dst4);
1541 DPADD_SB4_SH(vec4, vec5, vec6, vec7, filt0, filt1, filt2, filt3,
1542 dst5, dst5, dst5, dst5);
1544 DPADD_SB4_SH(vec8, vec9, vec10, vec11, filt0, filt1, filt2, filt3,
1545 dst6, dst6, dst6, dst6);
/* interleave horizontal results (right/left halves) for the
 * vertical pass */
1547 ILVR_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1548 dst10_r, dst32_r, dst54_r, dst21_r);
1549 ILVR_H2_SH(dst4, dst3, dst6, dst5, dst43_r, dst65_r);
1550 ILVL_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst2, dst1,
1551 dst10_l, dst32_l, dst54_l, dst21_l);
1552 ILVL_H2_SH(dst4, dst3, dst6, dst5, dst43_l, dst65_l);
/* inner loop: 2 output rows per iteration */
1554 for (loop_cnt = height >> 1; loop_cnt--;) {
1555 LD_SB2(src_tmp, src_stride, src7, src8);
1556 XORI_B2_128_SB(src7, src8);
1557 src_tmp += 2 * src_stride;
1559 VSHF_B4_SB(src7, src7, mask0, mask1, mask2, mask3,
1560 vec0, vec1, vec2, vec3);
1562 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1563 dst7, dst7, dst7, dst7);
1565 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
1566 dst0_r = HEVC_FILT_8TAP(dst10_r, dst32_r, dst54_r, dst76_r,
1567 filt_h0, filt_h1, filt_h2, filt_h3);
1568 dst0_l = HEVC_FILT_8TAP(dst10_l, dst32_l, dst54_l, dst76_l,
1569 filt_h0, filt_h1, filt_h2, filt_h3);
1573 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1574 ST_SW(dst0_r, dst_tmp);
1575 dst_tmp += dst_stride;
/* second output row of the pair */
1577 VSHF_B4_SB(src8, src8, mask0, mask1, mask2, mask3,
1578 vec0, vec1, vec2, vec3);
1580 DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
1581 dst8, dst8, dst8, dst8);
1583 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
1585 dst0_r = HEVC_FILT_8TAP(dst21_r, dst43_r, dst65_r, dst87_r,
1586 filt_h0, filt_h1, filt_h2, filt_h3);
1587 dst0_l = HEVC_FILT_8TAP(dst21_l, dst43_l, dst65_l, dst87_l,
1588 filt_h0, filt_h1, filt_h2, filt_h3);
1592 dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
1593 ST_SW(dst0_r, dst_tmp);
1594 dst_tmp += dst_stride;
/* 2-D 8-tap filter, 8 columns: generic 8-multiple kernel. */
1615 static void hevc_hv_8t_8w_msa(uint8_t *src, int32_t src_stride,
1616 int16_t *dst, int32_t dst_stride,
1617 const int8_t *filter_x, const int8_t *filter_y,
1620 hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1621 filter_x, filter_y, height, 8);
/* 2-D 8-tap filter, 12 columns: 8-wide generic pass for columns 0-7,
 * then the 4-wide kernel for columns 8-11. */
1624 static void hevc_hv_8t_12w_msa(uint8_t *src, int32_t src_stride,
1625 int16_t *dst, int32_t dst_stride,
1626 const int8_t *filter_x, const int8_t *filter_y,
1629 hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1630 filter_x, filter_y, height, 8);
1632 hevc_hv_8t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
1633 filter_x, filter_y, height);
/* 2-D 8-tap filter, 16 columns: generic 8-multiple kernel. */
1636 static void hevc_hv_8t_16w_msa(uint8_t *src, int32_t src_stride,
1637 int16_t *dst, int32_t dst_stride,
1638 const int8_t *filter_x, const int8_t *filter_y,
1641 hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1642 filter_x, filter_y, height, 16);
/* 2-D 8-tap filter, 24 columns: generic 8-multiple kernel. */
1645 static void hevc_hv_8t_24w_msa(uint8_t *src, int32_t src_stride,
1646 int16_t *dst, int32_t dst_stride,
1647 const int8_t *filter_x, const int8_t *filter_y,
1650 hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1651 filter_x, filter_y, height, 24);
/* 2-D 8-tap filter, 32 columns: generic 8-multiple kernel. */
1654 static void hevc_hv_8t_32w_msa(uint8_t *src, int32_t src_stride,
1655 int16_t *dst, int32_t dst_stride,
1656 const int8_t *filter_x, const int8_t *filter_y,
1659 hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1660 filter_x, filter_y, height, 32);
/* 2-D 8-tap filter, 48 columns: generic 8-multiple kernel. */
1663 static void hevc_hv_8t_48w_msa(uint8_t *src, int32_t src_stride,
1664 int16_t *dst, int32_t dst_stride,
1665 const int8_t *filter_x, const int8_t *filter_y,
1668 hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1669 filter_x, filter_y, height, 48);
/* 2-D 8-tap filter, 64 columns: generic 8-multiple kernel. */
1672 static void hevc_hv_8t_64w_msa(uint8_t *src, int32_t src_stride,
1673 int16_t *dst, int32_t dst_stride,
1674 const int8_t *filter_x, const int8_t *filter_y,
1677 hevc_hv_8t_8multx2mult_msa(src, src_stride, dst, dst_stride,
1678 filter_x, filter_y, height, 64);
/*
 * 4-tap horizontal HEVC chroma interpolation, 4x2 block.  mask0's
 * indices >= 16 select from the second operand of VSHF_B2_SB, so the
 * two source rows are processed together in one shuffle/dot-product.
 * NOTE(review): gappy extraction -- e.g. the dst0 = const_vec
 * initialisation and mask1 derivation are not visible here.
 */
1681 static void hevc_hz_4t_4x2_msa(uint8_t *src,
1685 const int8_t *filter)
1689 v16i8 mask1, vec0, vec1;
1691 v8i16 filter_vec, const_vec;
1692 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1696 filter_vec = LD_SH(filter);
1697 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1701 const_vec = __msa_ldi_h(128);
1704 LD_SB2(src, src_stride, src0, src1);
1705 XORI_B2_128_SB(src0, src1);
1707 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1709 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1711 ST8x2_UB(dst0, dst, 2 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma interpolation, 4x4 block.  Rows are
 * paired (0&1 -> dst0, 2&3 -> dst1) via the cross-register shuffle
 * mask, then stored 8 bytes per row-pair with ST8x4_UB.
 */
1714 static void hevc_hz_4t_4x4_msa(uint8_t *src,
1718 const int8_t *filter)
1721 v16i8 src0, src1, src2, src3;
1722 v16i8 mask1, vec0, vec1;
1724 v8i16 filter_vec, const_vec;
1725 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1729 filter_vec = LD_SH(filter);
1730 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1734 const_vec = __msa_ldi_h(128);
1737 LD_SB4(src, src_stride, src0, src1, src2, src3);
1738 XORI_B4_128_SB(src0, src1, src2, src3);
1740 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1742 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1744 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1746 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1748 ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma interpolation, 4-wide, height a multiple
 * of 8.  Each loop iteration filters 8 rows (two rows per shuffle /
 * dot-product, four accumulators) and stores them with ST8x8_UB.
 * NOTE(review): gappy extraction -- accumulator initialisations to
 * const_vec are not visible here.
 */
1751 static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
1755 const int8_t *filter,
1760 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
1761 v16i8 mask1, vec0, vec1;
1762 v8i16 dst0, dst1, dst2, dst3;
1763 v8i16 filter_vec, const_vec;
1764 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 16, 17, 17, 18, 18, 19, 19, 20 };
1768 filter_vec = LD_SH(filter);
1769 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1773 const_vec = __msa_ldi_h(128);
1776 for (loop_cnt = (height >> 3); loop_cnt--;) {
1777 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1778 src += (8 * src_stride);
1780 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
1782 VSHF_B2_SB(src0, src1, src0, src1, mask0, mask1, vec0, vec1);
1784 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1785 VSHF_B2_SB(src2, src3, src2, src3, mask0, mask1, vec0, vec1);
1787 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1788 VSHF_B2_SB(src4, src5, src4, src5, mask0, mask1, vec0, vec1);
1790 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
1791 VSHF_B2_SB(src6, src7, src6, src7, mask0, mask1, vec0, vec1);
1793 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
1795 ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
1796 dst += (8 * dst_stride);
/* Dispatcher for the 4-wide horizontal 4-tap filter: picks the 4x2,
 * 4x4 or 4x8-multiple specialisation based on height. */
1800 static void hevc_hz_4t_4w_msa(uint8_t *src,
1804 const int8_t *filter,
1808 hevc_hz_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
1809 } else if (4 == height) {
1810 hevc_hz_4t_4x4_msa(src, src_stride, dst, dst_stride, filter);
1811 } else if (0 == height % 8) {
1812 hevc_hz_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
/*
 * 4-tap horizontal HEVC chroma interpolation, 6-pixel-wide column.
 * Each of the 4 rows per iteration is filtered as a full 8-wide vector;
 * only 6 of the 8 results are kept: the low 4 int16 values are stored
 * via a 64-bit copy (SD, not visible in this extraction) and values
 * 4..5 via a 32-bit copy (word index 2 of the v4i32 view) at dst + 4.
 * NOTE(review): gappy extraction -- mask1 derivation, accumulator
 * initialisations and the SD()/dst advance lines are not visible here.
 */
1817 static void hevc_hz_4t_6w_msa(uint8_t *src,
1821 const int8_t *filter,
1825 uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
1826 uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
1827 v8i16 filt0, filt1, dst0, dst1, dst2, dst3;
1828 v16i8 src0, src1, src2, src3;
1829 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1832 v8i16 filter_vec, const_vec;
1836 filter_vec = LD_SH(filter);
1837 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1841 const_vec = __msa_ldi_h(128);
1844 for (loop_cnt = (height >> 2); loop_cnt--;) {
1845 LD_SB4(src, src_stride, src0, src1, src2, src3);
1846 src += (4 * src_stride);
1848 XORI_B4_128_SB(src0, src1, src2, src3);
1850 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
1852 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1853 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
1855 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1856 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
1858 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
1859 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
1861 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* split each 8x16-bit result into a 64-bit part (cols 0-3) and a
 * 32-bit part (cols 4-5) for the 6-wide store */
1863 dst_val0 = __msa_copy_u_d((v2i64) dst0, 0);
1864 dst_val1 = __msa_copy_u_d((v2i64) dst1, 0);
1865 dst_val2 = __msa_copy_u_d((v2i64) dst2, 0);
1866 dst_val3 = __msa_copy_u_d((v2i64) dst3, 0);
1868 dst_val_int0 = __msa_copy_u_w((v4i32) dst0, 2);
1869 dst_val_int1 = __msa_copy_u_w((v4i32) dst1, 2);
1870 dst_val_int2 = __msa_copy_u_w((v4i32) dst2, 2);
1871 dst_val_int3 = __msa_copy_u_w((v4i32) dst3, 2);
1874 SW(dst_val_int0, dst + 4);
1877 SW(dst_val_int1, dst + 4);
1880 SW(dst_val_int2, dst + 4);
1883 SW(dst_val_int3, dst + 4);
/*
 * 4-tap horizontal HEVC chroma interpolation, 8-wide, 2 rows per loop
 * iteration (used for heights that are a multiple of 2 but not 4).
 */
1888 static void hevc_hz_4t_8x2multiple_msa(uint8_t *src,
1892 const int8_t *filter,
1896 v8i16 filt0, filt1, dst0, dst1;
1898 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1901 v8i16 filter_vec, const_vec;
1905 filter_vec = LD_SH(filter);
1906 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1910 const_vec = __msa_ldi_h(128);
1913 for (loop_cnt = (height >> 1); loop_cnt--;) {
1914 LD_SB2(src, src_stride, src0, src1);
1915 src += (2 * src_stride);
1917 XORI_B2_128_SB(src0, src1);
1919 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
1921 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1923 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
1925 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1927 ST_SH2(dst0, dst1, dst, dst_stride);
1928 dst += (2 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma interpolation, 8-wide, 4 rows per loop
 * iteration (for heights that are a multiple of 4).
 */
1932 static void hevc_hz_4t_8x4multiple_msa(uint8_t *src,
1936 const int8_t *filter,
1941 v16i8 src0, src1, src2, src3;
1942 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
1945 v8i16 dst0, dst1, dst2, dst3;
1946 v8i16 filter_vec, const_vec;
1950 filter_vec = LD_SH(filter);
1951 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
1955 const_vec = __msa_ldi_h(128);
1958 for (loop_cnt = (height >> 2); loop_cnt--;) {
1959 LD_SB4(src, src_stride, src0, src1, src2, src3);
1960 src += (4 * src_stride);
1962 XORI_B4_128_SB(src0, src1, src2, src3);
1964 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
1966 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
1968 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
1970 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
1972 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
1974 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
1976 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
1978 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
1980 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
1981 dst += (4 * dst_stride);
/* Dispatcher for the 8-wide horizontal 4-tap filter: 2-row kernel for
 * heights 2 and 6, 4-row kernel otherwise. */
1985 static void hevc_hz_4t_8w_msa(uint8_t *src,
1989 const int8_t *filter,
1992 if (2 == height || 6 == height) {
1993 hevc_hz_4t_8x2multiple_msa(src, src_stride, dst, dst_stride,
1996 hevc_hz_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
/*
 * 4-tap horizontal HEVC chroma interpolation, 12-pixel-wide column.
 * Columns 0-7 are filtered per-row (dst0..dst3, ST_SH4); columns 8-11
 * are filtered two rows at a time via the cross-register masks
 * mask2/mask3 (indices >= 16 select the second source row) and stored
 * with ST8x4_UB at dst + 8.
 */
2001 static void hevc_hz_4t_12w_msa(uint8_t *src,
2005 const int8_t *filter,
2010 v16i8 src0, src1, src2, src3;
2013 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
2014 v8i16 filter_vec, const_vec;
2016 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2018 8, 9, 9, 10, 10, 11, 11, 12, 24, 25, 25, 26, 26, 27, 27, 28
2023 filter_vec = LD_SH(filter);
2024 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2029 const_vec = __msa_ldi_h(128);
2032 for (loop_cnt = (height >> 2); loop_cnt--;) {
2033 LD_SB4(src, src_stride, src0, src1, src2, src3);
2034 src += (4 * src_stride);
2035 XORI_B4_128_SB(src0, src1, src2, src3);
/* columns 0-7, one row per accumulator */
2037 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2039 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2040 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2042 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2043 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2045 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2046 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2048 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
/* columns 8-11, two rows per accumulator */
2049 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2051 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2052 VSHF_B2_SB(src2, src3, src2, src3, mask2, mask3, vec0, vec1);
2054 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2056 ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
2057 ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
2058 dst += (4 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma interpolation, 16-pixel-wide column.
 * Each row is loaded as two overlapping 16-byte vectors (offset 0 and
 * offset 8) so the 4-tap support is available for all 16 columns;
 * even-numbered dstN hold columns 0-7, odd-numbered dstN columns 8-15.
 */
2062 static void hevc_hz_4t_16w_msa(uint8_t *src,
2066 const int8_t *filter,
2070 v16i8 src0, src1, src2, src3;
2071 v16i8 src4, src5, src6, src7;
2073 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2075 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
2077 v8i16 filter_vec, const_vec;
2081 filter_vec = LD_SH(filter);
2082 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2086 const_vec = __msa_ldi_h(128);
2089 for (loop_cnt = (height >> 2); loop_cnt--;) {
/* even srcN: columns 0-15 low half; odd srcN: offset-8 load for
 * the high half of each of the same 4 rows */
2090 LD_SB4(src, src_stride, src0, src2, src4, src6);
2091 LD_SB4(src + 8, src_stride, src1, src3, src5, src7);
2092 src += (4 * src_stride);
2094 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
2096 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2098 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2100 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2102 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2104 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2106 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2108 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2110 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2112 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2114 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
2116 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2118 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
2120 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2122 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
2124 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2126 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
2128 ST_SH4(dst0, dst2, dst4, dst6, dst, dst_stride);
2129 ST_SH4(dst1, dst3, dst5, dst7, dst + 8, dst_stride);
2130 dst += (4 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma interpolation, 24-pixel-wide column.
 * Even srcN vectors hold bytes 0-15 of a row, odd srcN bytes 16-31.
 * Columns 0-15 are produced with mask0/mask1 plus the cross-register
 * masks mask00/mask11 (mask0 + 8 / + 10 per the visible mask11 line)
 * that straddle the two loads; columns 16-23 are filtered separately
 * and written through dst_tmp (dst + 16).
 * NOTE(review): gappy extraction -- mask1/mask00 setup, accumulator
 * initialisations and the dst advances between stores are not visible.
 */
2134 static void hevc_hz_4t_24w_msa(uint8_t *src,
2138 const int8_t *filter,
2142 int16_t *dst_tmp = dst + 16;
2143 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
2145 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2146 v16i8 mask1, mask00, mask11;
2148 v8i16 dst0, dst1, dst2, dst3;
2149 v8i16 filter_vec, const_vec;
2153 filter_vec = LD_SH(filter);
2154 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2158 mask11 = mask0 + 10;
2160 const_vec = __msa_ldi_h(128);
2163 for (loop_cnt = (height >> 2); loop_cnt--;) {
2165 LD_SB4(src, src_stride, src0, src2, src4, src6);
2166 LD_SB4(src + 16, src_stride, src1, src3, src5, src7);
2167 src += (4 * src_stride);
2169 XORI_B8_128_SB(src0, src1, src2, src3, src4, src5, src6, src7);
/* rows 0-1, columns 0-15 */
2171 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2173 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2175 VSHF_B2_SB(src0, src1, src0, src1, mask00, mask11, vec0, vec1);
2177 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2179 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2181 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2183 VSHF_B2_SB(src2, src3, src2, src3, mask00, mask11, vec0, vec1);
2185 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2187 ST_SH2(dst0, dst1, dst, 8);
2189 ST_SH2(dst2, dst3, dst, 8);
/* rows 2-3, columns 0-15 */
2192 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
2194 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2196 VSHF_B2_SB(src4, src5, src4, src5, mask00, mask11, vec0, vec1);
2198 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2200 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
2202 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2204 VSHF_B2_SB(src6, src7, src6, src7, mask00, mask11, vec0, vec1);
2206 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2208 ST_SH2(dst0, dst1, dst, 8);
2210 ST_SH2(dst2, dst3, dst, 8);
/* columns 16-23 for all four rows */
2214 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2216 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2218 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
2220 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2222 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
2224 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2226 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
2228 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2230 ST_SH4(dst0, dst1, dst2, dst3, dst_tmp, dst_stride);
2231 dst_tmp += (4 * dst_stride);
/*
 * 4-tap horizontal HEVC chroma interpolation, 32-pixel-wide column,
 * 2 rows per loop iteration.  Per row: src0 holds bytes 0-15, src1
 * bytes 16-31, and src2 (loaded at offset 24) supplies the tail bytes
 * needed by the filter support of the last 8 columns; mask2/mask3
 * straddle the src0/src1 boundary for columns 8-15.
 * NOTE(review): gappy extraction -- mask1..3 setup, accumulator
 * initialisations and the src/dst advances between the two unrolled
 * halves are not visible here.
 */
2235 static void hevc_hz_4t_32w_msa(uint8_t *src,
2239 const int8_t *filter,
2243 v16i8 src0, src1, src2;
2245 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
2246 v16i8 mask1, mask2, mask3;
2247 v8i16 dst0, dst1, dst2, dst3;
2249 v8i16 filter_vec, const_vec;
2253 filter_vec = LD_SH(filter);
2254 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2256 const_vec = __msa_ldi_h(128);
2263 for (loop_cnt = (height >> 1); loop_cnt--;) {
/* first row of the pair */
2264 LD_SB2(src, 16, src0, src1);
2265 src2 = LD_SB(src + 24);
2268 XORI_B3_128_SB(src0, src1, src2);
2270 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2272 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2274 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2276 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2278 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2280 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2282 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2284 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2286 ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
/* second row of the pair (same sequence) */
2289 LD_SB2(src, 16, src0, src1);
2290 src2 = LD_SB(src + 24);
2293 XORI_B3_128_SB(src0, src1, src2);
2295 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
2297 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
2299 VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
2301 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
2303 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec0, vec1);
2305 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
2307 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec0, vec1);
2309 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
2311 ST_SH4(dst0, dst1, dst2, dst3, dst, 8);
/*
 * Vertical HEVC 4-tap filter, fixed 4x2 block, MSA SIMD.
 * Interleaves four 4-pixel rows right-aligned, packs two row pairs per
 * vector, then dot-products against taps filt0/filt1.
 * NOTE(review): lines appear elided by extraction (missing params, braces,
 * dst10 = const_vec init, SLLI shift); comments reflect visible lines only.
 */
2316 static void hevc_vt_4t_4x2_msa(uint8_t *src,
2320                                const int8_t *filter)
2322     v16i8 src0, src1, src2, src3, src4;
2323     v16i8 src10_r, src32_r, src21_r, src43_r;
2324     v16i8 src2110, src4332;
2327     v8i16 filter_vec, const_vec;
2331     const_vec = __msa_ldi_h(128);
/* Splat filter taps 0 and 1 for the vertical 4-tap dot product. */
2334     filter_vec = LD_SH(filter);
2335     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Need rows 0..4 to produce 2 output rows with a 4-tap filter. */
2337     LD_SB5(src, src_stride, src0, src1, src2, src3, src4);
2338     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2339                src10_r, src21_r, src32_r, src43_r);
/* Pack two interleaved row pairs into each 128-bit vector. */
2341     ILVR_D2_SB(src21_r, src10_r, src43_r, src32_r, src2110, src4332);
2342     XORI_B2_128_SB(src2110, src4332);
2344     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2346     ST8x2_UB(dst10, dst, 2 * dst_stride);
/*
 * Vertical HEVC 4-tap filter, fixed 4x4 block, MSA SIMD.
 * Same scheme as the 4x2 variant but consumes 7 source rows to emit 4.
 * NOTE(review): chunk appears elided by extraction (params, braces and
 * dst10/dst32 = const_vec resets missing); comments cover visible lines.
 */
2349 static void hevc_vt_4t_4x4_msa(uint8_t *src,
2353                                const int8_t *filter,
2356     v16i8 src0, src1, src2, src3, src4, src5, src6;
2357     v16i8 src10_r, src32_r, src54_r, src21_r, src43_r, src65_r;
2358     v16i8 src2110, src4332, src6554;
2361     v8i16 filter_vec, const_vec;
2365     const_vec = __msa_ldi_h(128);
2368     filter_vec = LD_SH(filter);
2369     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* 7 input rows -> 4 output rows for a 4-tap vertical filter. */
2371     LD_SB7(src, src_stride, src0, src1, src2, src3, src4, src5, src6);
2372     ILVR_B4_SB(src1, src0, src2, src1, src3, src2, src4, src3,
2373                src10_r, src21_r, src32_r, src43_r);
2374     ILVR_B2_SB(src5, src4, src6, src5, src54_r, src65_r);
/* Pair up interleaved rows two-per-vector for the dot products. */
2375     ILVR_D3_SB(src21_r, src10_r, src43_r, src32_r, src65_r, src54_r,
2376                src2110, src4332, src6554);
2377     XORI_B3_128_SB(src2110, src4332, src6554);
2379     DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2381     DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2383     ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
/*
 * Vertical HEVC 4-tap filter, width 4, any height that is a multiple of 8.
 * Keeps a 3-row sliding window (src2110 carries the last interleaved pair)
 * across loop iterations; emits 8 rows per iteration.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstNN = const_vec resets missing); comments cover visible lines only.
 */
2386 static void hevc_vt_4t_4x8multiple_msa(uint8_t *src,
2390                                        const int8_t *filter,
2394     v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
2395     v16i8 src10_r, src32_r, src54_r, src76_r, src98_r;
2396     v16i8 src21_r, src43_r, src65_r, src87_r, src109_r;
2397     v16i8 src2110, src4332, src6554, src8776;
2398     v8i16 dst10, dst32, dst54, dst76;
2400     v8i16 filter_vec, const_vec;
2403     const_vec = __msa_ldi_h(128);
2406     filter_vec = LD_SH(filter);
2407     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prologue: prime the sliding window with the first 3 rows. */
2409     LD_SB3(src, src_stride, src0, src1, src2);
2410     src += (3 * src_stride);
2412     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2413     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_r, (v2i64) src10_r);
2414     src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2416     for (loop_cnt = (height >> 3); loop_cnt--;) {
2417         LD_SB6(src, src_stride, src3, src4, src5, src6, src7, src8);
2418         src += (6 * src_stride);
2420         ILVR_B4_SB(src3, src2, src4, src3, src5, src4, src6, src5,
2421                    src32_r, src43_r, src54_r, src65_r);
2422         ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2423         ILVR_D3_SB(src43_r, src32_r, src65_r, src54_r, src87_r, src76_r,
2424                    src4332, src6554, src8776);
2425         XORI_B3_128_SB(src4332, src6554, src8776);
2428         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
2430         DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
2432         DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
/* Last two rows of the 8; src2 is reused as the window carry so the
   next iteration starts from the correct row history. */
2434         LD_SB2(src, src_stride, src9, src2);
2435         src += (2 * src_stride);
2436         ILVR_B2_SB(src9, src8, src2, src9, src98_r, src109_r);
2437         src2110 = (v16i8) __msa_ilvr_d((v2i64) src109_r, (v2i64) src98_r);
2438         src2110 = (v16i8) __msa_xori_b((v16u8) src2110, 128);
2440         DPADD_SB2_SH(src8776, src2110, filt0, filt1, dst76, dst76);
2442         ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
2443         dst += (8 * dst_stride);
/*
 * Width-4 vertical 4-tap dispatcher: picks the fixed-size kernel for
 * heights 2 and 4, otherwise the multiple-of-8 loop kernel.
 * NOTE(review): leading "if (2 == height)" and trailing argument lines
 * appear elided by extraction.
 */
2447 static void hevc_vt_4t_4w_msa(uint8_t *src,
2451                               const int8_t *filter,
2455         hevc_vt_4t_4x2_msa(src, src_stride, dst, dst_stride, filter);
2456     } else if (4 == height) {
2457         hevc_vt_4t_4x4_msa(src, src_stride, dst, dst_stride, filter, height);
2458     } else if (0 == (height % 8)) {
2459         hevc_vt_4t_4x8multiple_msa(src, src_stride, dst, dst_stride,
/*
 * Vertical HEVC 4-tap filter, width 6, MSA SIMD.
 * Each output row is stored as a 64-bit piece (first 4 samples) plus a
 * 32-bit piece at dst+4 (last 2 samples), extracted via __msa_copy.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN_r = const_vec resets, the SD stores pairing each SW); comments
 * reflect only the visible lines.
 */
2464 static void hevc_vt_4t_6w_msa(uint8_t *src,
2468                               const int8_t *filter,
2472     uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
2473     uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
2474     v16i8 src0, src1, src2, src3, src4;
2475     v16i8 src10_r, src32_r, src21_r, src43_r;
2476     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2478     v8i16 filter_vec, const_vec;
2481     const_vec = __msa_ldi_h(128);
2484     filter_vec = LD_SH(filter);
2485     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime the 3-row history window. */
2487     LD_SB3(src, src_stride, src0, src1, src2);
2488     src += (3 * src_stride);
2489     XORI_B3_128_SB(src0, src1, src2);
2490     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
/* Four output rows per iteration. */
2492     for (loop_cnt = (height >> 2); loop_cnt--;) {
2493         LD_SB2(src, src_stride, src3, src4);
2494         src += (2 * src_stride);
2495         XORI_B2_128_SB(src3, src4);
2496         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2499         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2501         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* src1/src2 are reloaded with new rows; src2 carries into next iter. */
2503         LD_SB2(src, src_stride, src1, src2);
2504         src += (2 * src_stride);
2505         XORI_B2_128_SB(src1, src2);
2506         ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2509         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2511         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
/* Scalarize: 4 samples via 64-bit copy ... */
2513         dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
2514         dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
2515         dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
2516         dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
/* ... plus samples 4..5 via 32-bit copy of word lane 2. */
2518         dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
2519         dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
2520         dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
2521         dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
/* The matching SD(dst_valN, dst) stores and dst advances appear elided. */
2524         SW(dst_val_int0, dst + 4);
2527         SW(dst_val_int1, dst + 4);
2530         SW(dst_val_int2, dst + 4);
2533         SW(dst_val_int3, dst + 4);
/*
 * Vertical HEVC 4-tap filter, fixed 8x2 block, MSA SIMD.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN_r = const_vec resets missing); comments cover visible lines only.
 */
2538 static void hevc_vt_4t_8x2_msa(uint8_t *src,
2542                                const int8_t *filter)
2544     v16i8 src0, src1, src2, src3, src4;
2545     v16i8 src10_r, src32_r, src21_r, src43_r;
2546     v8i16 dst0_r, dst1_r;
2548     v8i16 filter_vec, const_vec;
2551     const_vec = __msa_ldi_h(128);
2554     filter_vec = LD_SH(filter);
2555     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Rows 0..2 prime the window, rows 3..4 complete the two 4-tap sums. */
2557     LD_SB3(src, src_stride, src0, src1, src2);
2558     src += (3 * src_stride);
2559     XORI_B3_128_SB(src0, src1, src2);
2560     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2562     LD_SB2(src, src_stride, src3, src4);
2563     XORI_B2_128_SB(src3, src4);
2564     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2566     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2568     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2570     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
/*
 * Vertical HEVC 4-tap filter, fixed 8x6 block, MSA SIMD.
 * Three unrolled passes of 2 rows each, reusing interleaved row pairs
 * (src32_r/src43_r alternate roles with src10_r/src21_r) between passes.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN_r = const_vec resets missing); comments cover visible lines only.
 */
2573 static void hevc_vt_4t_8x6_msa(uint8_t *src,
2577                                const int8_t *filter)
2579     v16i8 src0, src1, src2, src3, src4;
2580     v16i8 src10_r, src32_r, src21_r, src43_r;
2581     v8i16 dst0_r, dst1_r;
2583     v8i16 filter_vec, const_vec;
2586     const_vec = __msa_ldi_h(128);
2589     filter_vec = LD_SH(filter);
2590     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime the 3-row window. */
2592     LD_SB3(src, src_stride, src0, src1, src2);
2593     src += (3 * src_stride);
2594     XORI_B3_128_SB(src0, src1, src2);
2595     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
/* Pass 1: rows 0..1. */
2597     LD_SB2(src, src_stride, src3, src4);
2598     src += (2 * src_stride);
2599     XORI_B2_128_SB(src3, src4);
2601     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2603     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2605     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2607     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2608     dst += (2 * dst_stride);
/* Pass 2: rows 2..3 (src1/src2 reloaded with fresh rows). */
2610     LD_SB2(src, src_stride, src1, src2);
2611     src += (2 * src_stride);
2612     XORI_B2_128_SB(src1, src2);
2614     ILVR_B2_SB(src1, src4, src2, src1, src10_r, src21_r);
2616     DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2618     DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2620     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2621     dst += (2 * dst_stride);
/* Pass 3: rows 4..5. */
2623     LD_SB2(src, src_stride, src3, src4);
2624     XORI_B2_128_SB(src3, src4);
2626     ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2628     DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2630     DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2632     ST_SH2(dst0_r, dst1_r, dst, dst_stride);
/*
 * Vertical HEVC 4-tap filter, width 8, height a multiple of 4, MSA SIMD.
 * Loop body emits 4 rows in two 2-row halves, rotating the interleaved
 * row-pair registers so only 2 new source rows are loaded per half.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN_r = const_vec resets missing); comments cover visible lines only.
 */
2635 static void hevc_vt_4t_8x4multiple_msa(uint8_t *src,
2639                                        const int8_t *filter,
2643     v16i8 src0, src1, src2, src3, src4, src5;
2644     v16i8 src10_r, src32_r, src21_r, src43_r;
2645     v8i16 dst0_r, dst1_r;
2647     v8i16 filter_vec, const_vec;
2650     const_vec = __msa_ldi_h(128);
2653     filter_vec = LD_SH(filter);
2654     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime the 3-row history window. */
2656     LD_SB3(src, src_stride, src0, src1, src2);
2657     src += (3 * src_stride);
2658     XORI_B3_128_SB(src0, src1, src2);
2659     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2661     for (loop_cnt = (height >> 2); loop_cnt--;) {
2662         LD_SB2(src, src_stride, src3, src4);
2663         src += (2 * src_stride);
2664         XORI_B2_128_SB(src3, src4);
2665         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2667         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2669         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2671         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2672         dst += (2 * dst_stride);
/* Second half; src2 is refilled so the next iteration's window is valid. */
2674         LD_SB2(src, src_stride, src5, src2);
2675         src += (2 * src_stride);
2676         XORI_B2_128_SB(src5, src2);
2677         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2679         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2681         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2683         ST_SH2(dst0_r, dst1_r, dst, dst_stride);
2684         dst += (2 * dst_stride);
/*
 * Width-8 vertical 4-tap dispatcher: fixed kernels for heights 2 and 6,
 * the multiple-of-4 loop kernel otherwise.
 * NOTE(review): leading "if (2 == height)", the else branch line and
 * trailing arguments appear elided by extraction.
 */
2688 static void hevc_vt_4t_8w_msa(uint8_t *src,
2692                               const int8_t *filter,
2696         hevc_vt_4t_8x2_msa(src, src_stride, dst, dst_stride, filter);
2697     } else if (6 == height) {
2698         hevc_vt_4t_8x6_msa(src, src_stride, dst, dst_stride, filter);
2700         hevc_vt_4t_8x4multiple_msa(src, src_stride, dst, dst_stride,
/*
 * Vertical HEVC 4-tap filter, width 12, MSA SIMD.
 * Left 8 columns use the right-interleaved (_r) path; the extra 4 columns
 * use left-interleaves (_l) packed pairwise into src2110/src4332 and are
 * stored at dst + 8.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dst accumulator resets missing); comments cover visible lines only.
 */
2705 static void hevc_vt_4t_12w_msa(uint8_t *src,
2709                                const int8_t *filter,
2713     v16i8 src0, src1, src2, src3, src4, src5;
2714     v16i8 src10_r, src32_r, src21_r, src43_r;
2715     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2716     v16i8 src10_l, src32_l, src54_l, src21_l, src43_l, src65_l;
2717     v16i8 src2110, src4332;
2718     v8i16 dst0_l, dst1_l;
2720     v8i16 filter_vec, const_vec;
/* Center the 4-tap window one row above the first output row. */
2722     src -= (1 * src_stride);
2723     const_vec = __msa_ldi_h(128);
2726     filter_vec = LD_SH(filter);
2727     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
2729     LD_SB3(src, src_stride, src0, src1, src2);
2730     src += (3 * src_stride);
2731     XORI_B3_128_SB(src0, src1, src2);
2732     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2733     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2734     src2110 = (v16i8) __msa_ilvr_d((v2i64) src21_l, (v2i64) src10_l);
2736     for (loop_cnt = (height >> 2); loop_cnt--;) {
2737         LD_SB2(src, src_stride, src3, src4);
2738         src += (2 * src_stride);
2739         XORI_B2_128_SB(src3, src4);
2740         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2741         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2742         src4332 = (v16i8) __msa_ilvr_d((v2i64) src43_l, (v2i64) src32_l);
2744         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2746         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
/* Extra right-hand 4 columns for two rows in one dot product. */
2748         DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst0_l, dst0_l);
2750         LD_SB2(src, src_stride, src5, src2);
2751         src += (2 * src_stride);
2752         XORI_B2_128_SB(src5, src2);
2753         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2754         ILVL_B2_SB(src5, src4, src2, src5, src54_l, src65_l);
2755         src2110 = (v16i8) __msa_ilvr_d((v2i64) src65_l, (v2i64) src54_l);
2757         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst2_r, dst2_r);
2759         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst3_r, dst3_r);
2761         DPADD_SB2_SH(src4332, src2110, filt0, filt1, dst1_l, dst1_l);
/* Store 8-wide part, then the 4-wide tail at column offset 8. */
2763         ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
2764         ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
2765         dst += (4 * dst_stride);
/*
 * Vertical HEVC 4-tap filter, width 16, MSA SIMD.
 * Processes the low 8 columns with right-interleaves (_r) and the high 8
 * with left-interleaves (_l); 4 output rows per loop iteration.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dst accumulator resets, dst += advances after ST_SH2); comments cover
 * visible lines only.
 */
2769 static void hevc_vt_4t_16w_msa(uint8_t *src,
2773                                const int8_t *filter,
2777     v16i8 src0, src1, src2, src3, src4, src5;
2778     v16i8 src10_r, src32_r, src21_r, src43_r;
2779     v16i8 src10_l, src32_l, src21_l, src43_l;
2780     v8i16 dst0_r, dst1_r, dst0_l, dst1_l;
2782     v8i16 filter_vec, const_vec;
2785     const_vec = __msa_ldi_h(128);
2788     filter_vec = LD_SH(filter);
2789     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime the 3-row window for both 8-column halves. */
2791     LD_SB3(src, src_stride, src0, src1, src2);
2792     src += (3 * src_stride);
2793     XORI_B3_128_SB(src0, src1, src2);
2794     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2795     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2797     for (loop_cnt = (height >> 2); loop_cnt--;) {
2798         LD_SB2(src, src_stride, src3, src4);
2799         src += (2 * src_stride);
2800         XORI_B2_128_SB(src3, src4);
2801         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2802         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2804         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2806         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
2808         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2810         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
2811         ST_SH2(dst0_r, dst0_l, dst, 8);
2813         ST_SH2(dst1_r, dst1_l, dst, 8);
/* Second 2-row half; src2 refilled to carry the window forward. */
2816         LD_SB2(src, src_stride, src5, src2);
2817         src += (2 * src_stride);
2818         XORI_B2_128_SB(src5, src2);
2819         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2820         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2822         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2824         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
2826         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2828         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
2829         ST_SH2(dst0_r, dst0_l, dst, 8);
2831         ST_SH2(dst1_r, dst1_l, dst, 8);
/*
 * Vertical HEVC 4-tap filter, width 24, MSA SIMD.
 * Columns 0..15 use the 16-wide _r/_l scheme; columns 16..23 use a second
 * right-interleave pipeline (src76_r/src87_r...) stored at dst + 16.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dst accumulator resets, dst advances); comments cover visible lines only.
 */
2836 static void hevc_vt_4t_24w_msa(uint8_t *src,
2840                                const int8_t *filter,
2844     v16i8 src0, src1, src2, src3, src4, src5;
2845     v16i8 src6, src7, src8, src9, src10, src11;
2846     v16i8 src10_r, src32_r, src76_r, src98_r;
2847     v16i8 src21_r, src43_r, src87_r, src109_r;
2848     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2849     v16i8 src10_l, src32_l, src21_l, src43_l;
2850     v8i16 dst0_l, dst1_l;
2852     v8i16 filter_vec, const_vec;
2855     const_vec = __msa_ldi_h(128);
2858     filter_vec = LD_SH(filter);
2859     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime the window for the first 16 columns... */
2861     LD_SB3(src, src_stride, src0, src1, src2);
2862     XORI_B3_128_SB(src0, src1, src2);
2863     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2864     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
/* ...and for the 8-column tail at byte offset 16. */
2866     LD_SB3(src + 16, src_stride, src6, src7, src8);
2867     src += (3 * src_stride);
2868     XORI_B3_128_SB(src6, src7, src8);
2869     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2871     for (loop_cnt = (height >> 2); loop_cnt--;) {
2872         LD_SB2(src, src_stride, src3, src4);
2873         XORI_B2_128_SB(src3, src4);
2874         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2875         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2877         LD_SB2(src + 16, src_stride, src9, src10);
2878         src += (2 * src_stride);
2879         XORI_B2_128_SB(src9, src10);
2880         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2883         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2885         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
2887         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2889         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* Tail columns 16..23 for the same two rows. */
2891         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
2893         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
2895         ST_SH2(dst0_r, dst0_l, dst, 8);
2896         ST_SH(dst2_r, dst + 16);
2898         ST_SH2(dst1_r, dst1_l, dst, 8);
2899         ST_SH(dst3_r, dst + 16);
/* Second 2-row half; src2/src8 refilled as window carries. */
2902         LD_SB2(src, src_stride, src5, src2);
2903         XORI_B2_128_SB(src5, src2);
2904         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
2905         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
2907         LD_SB2(src + 16, src_stride, src11, src8);
2908         src += (2 * src_stride);
2909         XORI_B2_128_SB(src11, src8);
2910         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
2913         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
2915         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
2917         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
2919         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
2921         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
2923         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
2925         ST_SH2(dst0_r, dst0_l, dst, 8);
2926         ST_SH(dst2_r, dst + 16);
2928         ST_SH2(dst1_r, dst1_l, dst, 8);
2929         ST_SH(dst3_r, dst + 16);
/*
 * Vertical HEVC 4-tap filter, width 32, MSA SIMD.
 * Two full 16-wide pipelines: columns 0..15 (src0..src5, _r/_l pairs) and
 * columns 16..31 (src6..src11, src76/src87/src98/src109 pairs), 4 output
 * rows per loop iteration.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dst accumulator resets, dst advances); comments cover visible lines only.
 */
2934 static void hevc_vt_4t_32w_msa(uint8_t *src,
2938                                const int8_t *filter,
2942     v16i8 src0, src1, src2, src3, src4, src5;
2943     v16i8 src6, src7, src8, src9, src10, src11;
2944     v16i8 src10_r, src32_r, src76_r, src98_r;
2945     v16i8 src21_r, src43_r, src87_r, src109_r;
2946     v8i16 dst0_r, dst1_r, dst2_r, dst3_r;
2947     v16i8 src10_l, src32_l, src76_l, src98_l;
2948     v16i8 src21_l, src43_l, src87_l, src109_l;
2949     v8i16 dst0_l, dst1_l, dst2_l, dst3_l;
2951     v8i16 filter_vec, const_vec;
2954     const_vec = __msa_ldi_h(128);
2957     filter_vec = LD_SH(filter);
2958     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Prime both 16-column pipelines with 3 rows each. */
2960     LD_SB3(src, src_stride, src0, src1, src2);
2961     XORI_B3_128_SB(src0, src1, src2);
2962     ILVR_B2_SB(src1, src0, src2, src1, src10_r, src21_r);
2963     ILVL_B2_SB(src1, src0, src2, src1, src10_l, src21_l);
2965     LD_SB3(src + 16, src_stride, src6, src7, src8);
2966     src += (3 * src_stride);
2967     XORI_B3_128_SB(src6, src7, src8);
2968     ILVR_B2_SB(src7, src6, src8, src7, src76_r, src87_r);
2969     ILVL_B2_SB(src7, src6, src8, src7, src76_l, src87_l);
2971     for (loop_cnt = (height >> 2); loop_cnt--;) {
2972         LD_SB2(src, src_stride, src3, src4);
2973         XORI_B2_128_SB(src3, src4);
2974         ILVR_B2_SB(src3, src2, src4, src3, src32_r, src43_r);
2975         ILVL_B2_SB(src3, src2, src4, src3, src32_l, src43_l);
2977         LD_SB2(src + 16, src_stride, src9, src10);
2978         src += (2 * src_stride);
2979         XORI_B2_128_SB(src9, src10);
2980         ILVR_B2_SB(src9, src8, src10, src9, src98_r, src109_r);
2981         ILVL_B2_SB(src9, src8, src10, src9, src98_l, src109_l);
2984         DPADD_SB2_SH(src10_r, src32_r, filt0, filt1, dst0_r, dst0_r);
2986         DPADD_SB2_SH(src10_l, src32_l, filt0, filt1, dst0_l, dst0_l);
2988         DPADD_SB2_SH(src21_r, src43_r, filt0, filt1, dst1_r, dst1_r);
2990         DPADD_SB2_SH(src21_l, src43_l, filt0, filt1, dst1_l, dst1_l);
/* Right half (columns 16..31) for the same two rows. */
2992         DPADD_SB2_SH(src76_r, src98_r, filt0, filt1, dst2_r, dst2_r);
2994         DPADD_SB2_SH(src76_l, src98_l, filt0, filt1, dst2_l, dst2_l);
2996         DPADD_SB2_SH(src87_r, src109_r, filt0, filt1, dst3_r, dst3_r);
2998         DPADD_SB2_SH(src87_l, src109_l, filt0, filt1, dst3_l, dst3_l);
3000         ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
3002         ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
/* Second 2-row half; src2/src8 refilled as window carries. */
3005         LD_SB2(src, src_stride, src5, src2);
3006         XORI_B2_128_SB(src5, src2);
3007         ILVR_B2_SB(src5, src4, src2, src5, src10_r, src21_r);
3008         ILVL_B2_SB(src5, src4, src2, src5, src10_l, src21_l);
3010         LD_SB2(src + 16, src_stride, src11, src8);
3011         src += (2 * src_stride);
3012         XORI_B2_128_SB(src11, src8);
3013         ILVR_B2_SB(src11, src10, src8, src11, src76_r, src87_r);
3014         ILVL_B2_SB(src11, src10, src8, src11, src76_l, src87_l);
3017         DPADD_SB2_SH(src32_r, src10_r, filt0, filt1, dst0_r, dst0_r);
3019         DPADD_SB2_SH(src32_l, src10_l, filt0, filt1, dst0_l, dst0_l);
3021         DPADD_SB2_SH(src43_r, src21_r, filt0, filt1, dst1_r, dst1_r);
3023         DPADD_SB2_SH(src43_l, src21_l, filt0, filt1, dst1_l, dst1_l);
3025         DPADD_SB2_SH(src98_r, src76_r, filt0, filt1, dst2_r, dst2_r);
3027         DPADD_SB2_SH(src98_l, src76_l, filt0, filt1, dst2_l, dst2_l);
3029         DPADD_SB2_SH(src109_r, src87_r, filt0, filt1, dst3_r, dst3_r);
3031         DPADD_SB2_SH(src109_l, src87_l, filt0, filt1, dst3_l, dst3_l);
3033         ST_SH4(dst0_r, dst0_l, dst2_r, dst2_l, dst, 8);
3035         ST_SH4(dst1_r, dst1_l, dst3_r, dst3_l, dst, 8);
/*
 * 2-D (horizontal then vertical) HEVC 4-tap filter, fixed 4x2 block.
 * Stage 1 filters rows horizontally into 16-bit dstN; stage 2 interleaves
 * consecutive row results and applies HEVC_FILT_4TAP vertically in 32 bits.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN = const_vec resets, mask1 setup, >>6 shifts); comments reflect
 * visible lines only.
 */
3040 static void hevc_hv_4t_4x2_msa(uint8_t *src,
3044                                const int8_t *filter_x,
3045                                const int8_t *filter_y)
3047     v16i8 src0, src1, src2, src3, src4;
3049     v4i32 filt_h0, filt_h1;
3050     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3052     v8i16 filter_vec, const_vec;
3053     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3054     v8i16 dst0, dst1, dst2, dst3, dst4;
3055     v4i32 dst0_r, dst1_r;
3056     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
/* Window is centered one row up and one column left of the output. */
3058     src -= (src_stride + 1);
3059     filter_vec = LD_SH(filter_x);
3060     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend the 8-bit y taps to 16 bits before splatting as words. */
3062     filter_vec = LD_SH(filter_y);
3063     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3064     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3066     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3070     const_vec = __msa_ldi_h(128);
/* Horizontal pass over the 3 priming rows. */
3073     LD_SB3(src, src_stride, src0, src1, src2);
3074     src += (3 * src_stride);
3075     XORI_B3_128_SB(src0, src1, src2);
3077     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3078     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3079     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3082     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3084     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3086     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3087     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* Two more rows complete the vertical window for 2 outputs. */
3089     LD_SB2(src, src_stride, src3, src4);
3090     XORI_B2_128_SB(src3, src4);
3092     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3094     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3096     dst32_r = __msa_ilvr_h(dst3, dst2);
3097     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3100     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3102     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3104     dst43_r = __msa_ilvr_h(dst4, dst3);
3105     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
/* Pack the two 32-bit row results back to 16 bits and store. */
3108     dst0_r = (v4i32) __msa_pckev_h((v8i16) dst1_r, (v8i16) dst0_r);
3109     ST8x2_UB(dst0_r, dst, 2 * dst_stride);
/*
 * 2-D HEVC 4-tap filter, fixed 4x4 block, MSA SIMD.
 * Same two-stage scheme as the 4x2 variant, extended to 4 output rows
 * (7 source rows total); intermediate ilvr registers are recycled
 * (dst10_r/dst21_r reused for rows 5..6).
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN = const_vec resets, >>6 shifts); comments reflect visible lines.
 */
3112 static void hevc_hv_4t_4x4_msa(uint8_t *src,
3116                                const int8_t *filter_x,
3117                                const int8_t *filter_y)
3119     v16i8 src0, src1, src2, src3, src4, src5, src6;
3121     v4i32 filt_h0, filt_h1;
3122     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3124     v8i16 filter_vec, const_vec;
3125     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3126     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3127     v4i32 dst0_r, dst1_r, dst2_r, dst3_r;
3128     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3130     src -= (src_stride + 1);
3132     filter_vec = LD_SH(filter_x);
3133     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend 8-bit y taps to 16 bits, then splat word pairs. */
3135     filter_vec = LD_SH(filter_y);
3136     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3137     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3139     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3143     const_vec = __msa_ldi_h(128);
/* Horizontal pass over the 3 priming rows. */
3146     LD_SB3(src, src_stride, src0, src1, src2);
3147     src += (3 * src_stride);
3149     XORI_B3_128_SB(src0, src1, src2);
3151     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3152     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3153     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3156     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3158     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3160     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3162     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
/* Rows 3..6: filter horizontally then combine vertically one by one. */
3164     LD_SB4(src, src_stride, src3, src4, src5, src6);
3165     XORI_B4_128_SB(src3, src4, src5, src6);
3167     VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3169     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3171     dst32_r = __msa_ilvr_h(dst3, dst2);
3172     dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3175     VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3177     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3179     dst43_r = __msa_ilvr_h(dst4, dst3);
3180     dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3183     VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3185     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
/* dst10_r/dst21_r registers recycled for the newest row pairs. */
3187     dst10_r = __msa_ilvr_h(dst5, dst4);
3188     dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3191     VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3193     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3195     dst21_r = __msa_ilvr_h(dst2, dst5);
3196     dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
/* Pack four 32-bit row results down to 16 bits and store 4x4. */
3199     PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
3200     ST8x4_UB(dst0_r, dst1_r, dst, 2 * dst_stride);
/*
 * 2-D HEVC 4-tap filter, width 4, height a multiple of 8, MSA SIMD.
 * Pipelined: the horizontal pass primes 3 rows, then each loop iteration
 * filters 8 new rows horizontally and combines them vertically with
 * HEVC_FILT_4TAP, carrying dst2/dst10_r/dst21_r across iterations.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN = const_vec resets, >>6 shifts); comments reflect visible lines.
 */
3204 static void hevc_hv_4t_4multx8mult_msa(uint8_t *src,
3208                                        const int8_t *filter_x,
3209                                        const int8_t *filter_y,
3213     v16i8 src0, src1, src2, src3, src4, src5, src6;
3214     v16i8 src7, src8, src9, src10;
3216     v4i32 filt_h0, filt_h1;
3217     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3219     v8i16 filter_vec, const_vec;
3220     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3221     v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8, dst9;
3222     v4i32 dst0_r, dst1_r, dst2_r, dst3_r, dst4_r, dst5_r, dst6_r, dst7_r;
3223     v8i16 dst10_r, dst32_r, dst54_r, dst76_r;
3224     v8i16 dst21_r, dst43_r, dst65_r, dst87_r;
3226     src -= (src_stride + 1);
3227     filter_vec = LD_SH(filter_x);
3228     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend 8-bit y taps to 16 bits, then splat word pairs. */
3230     filter_vec = LD_SH(filter_y);
3231     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3232     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3234     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3238     const_vec = __msa_ldi_h(128);
/* Horizontal pass over the 3 priming rows. */
3241     LD_SB3(src, src_stride, src0, src1, src2);
3242     src += (3 * src_stride);
3243     XORI_B3_128_SB(src0, src1, src2);
3245     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3246     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3247     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3250     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3252     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3254     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3256     ILVR_H2_SH(dst1, dst0, dst2, dst1, dst10_r, dst21_r);
3258     for (loop_cnt = height >> 3; loop_cnt--;) {
3259         LD_SB8(src, src_stride,
3260                src3, src4, src5, src6, src7, src8, src9, src10);
3261         src += (8 * src_stride);
3262         XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
/* One horizontal filter + one vertical combine per new row. */
3264         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3266         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3268         dst32_r = __msa_ilvr_h(dst3, dst2);
3269         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3272         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3274         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3276         dst43_r = __msa_ilvr_h(dst4, dst3);
3277         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3280         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3282         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3284         dst54_r = __msa_ilvr_h(dst5, dst4);
3285         dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3288         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3290         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3292         dst65_r = __msa_ilvr_h(dst6, dst5);
3293         dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3296         VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3298         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3300         dst76_r = __msa_ilvr_h(dst7, dst6);
3301         dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3304         VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3306         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3308         dst87_r = __msa_ilvr_h(dst8, dst7);
3309         dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3312         VSHF_B2_SB(src9, src9, src9, src9, mask0, mask1, vec0, vec1);
3314         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst9, dst9);
/* dst10_r/dst21_r/dst2 recycled as the carry into the next iteration. */
3316         dst10_r = __msa_ilvr_h(dst9, dst8);
3317         dst6_r = HEVC_FILT_4TAP(dst76_r, dst10_r, filt_h0, filt_h1);
3320         VSHF_B2_SB(src10, src10, src10, src10, mask0, mask1, vec0, vec1);
3322         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3324         dst21_r = __msa_ilvr_h(dst2, dst9);
3325         dst7_r = HEVC_FILT_4TAP(dst87_r, dst21_r, filt_h0, filt_h1);
/* Pack eight 32-bit results down to 16 bits and store 4x8. */
3328         PCKEV_H4_SW(dst1_r, dst0_r, dst3_r, dst2_r,
3329                     dst5_r, dst4_r, dst7_r, dst6_r,
3330                     dst0_r, dst1_r, dst2_r, dst3_r);
3331         ST8x8_UB(dst0_r, dst1_r, dst2_r, dst3_r, dst, 2 * dst_stride);
3332         dst += (8 * dst_stride);
/*
 * Width-4 2-D (hv) 4-tap dispatcher: fixed kernels for heights 2 and 4,
 * the multiple-of-8 loop kernel otherwise.
 * NOTE(review): the leading "if (2 == height)" line appears elided by
 * extraction.
 */
3336 static void hevc_hv_4t_4w_msa(uint8_t *src,
3340                               const int8_t *filter_x,
3341                               const int8_t *filter_y,
3345         hevc_hv_4t_4x2_msa(src, src_stride, dst, dst_stride,
3346                            filter_x, filter_y);
3347     } else if (4 == height) {
3348         hevc_hv_4t_4x4_msa(src, src_stride, dst, dst_stride,
3349                            filter_x, filter_y);
3350     } else if (0 == (height % 8)) {
3351         hevc_hv_4t_4multx8mult_msa(src, src_stride, dst, dst_stride,
3352                                    filter_x, filter_y, height);
/*
 * 2-D HEVC 4-tap filter, width 6, MSA SIMD.
 * Full-width (8-lane) horizontal pass, vertical pass split into _r/_l
 * 32-bit halves; after packing, each row is stored as 64 bits at dst and
 * 32 bits at dst + 4 to cover exactly 6 samples.
 * NOTE(review): chunk appears elided by extraction (params, braces,
 * dstN = const_vec resets, >>6 shifts, the SD stores pairing each SW);
 * comments reflect visible lines only.
 */
3356 static void hevc_hv_4t_6w_msa(uint8_t *src,
3360                               const int8_t *filter_x,
3361                               const int8_t *filter_y,
3365     uint64_t dst_val0, dst_val1, dst_val2, dst_val3;
3366     uint32_t dst_val_int0, dst_val_int1, dst_val_int2, dst_val_int3;
3367     v16i8 src0, src1, src2, src3, src4, src5, src6;
3369     v4i32 filt_h0, filt_h1;
3370     v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3372     v8i16 filter_vec, const_vec;
3373     v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3374     v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3375     v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3376     v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3377     v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
3379     src -= (src_stride + 1);
3380     filter_vec = LD_SH(filter_x);
3381     SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* Sign-extend 8-bit y taps to 16 bits, then splat word pairs. */
3383     filter_vec = LD_SH(filter_y);
3384     vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3385     filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3387     SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
3391     const_vec = __msa_ldi_h(128);
/* Horizontal pass over the 3 priming rows. */
3394     LD_SB3(src, src_stride, src0, src1, src2);
3395     src += (3 * src_stride);
3396     XORI_B3_128_SB(src0, src1, src2);
3398     VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3399     VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3400     VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3403     DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3405     DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3407     DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
/* Keep both low and high interleaves so 8 lanes survive the 32-bit
   vertical pass. */
3409     ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3410     ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
3412     for (loop_cnt = height >> 2; loop_cnt--;) {
3413         LD_SB4(src, src_stride, src3, src4, src5, src6);
3414         src += (4 * src_stride);
3415         XORI_B4_128_SB(src3, src4, src5, src6);
3417         VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3419         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3421         ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3422         dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3423         dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3427         VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3429         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3431         ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3432         dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3433         dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
3437         VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3439         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
/* dst10/dst21 interleave registers recycled as the loop carry. */
3441         ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3442         dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3443         dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
3447         VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3449         DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3451         ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3452         dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3453         dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
/* Pack 32-bit halves back to 16 bits per row. */
3457         PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3458                     dst2_l, dst2_r, dst3_l, dst3_r,
3459                     dst0_r, dst1_r, dst2_r, dst3_r);
/* Scalarize rows: 4 samples via 64-bit copy, samples 4..5 via word 2. */
3461         dst_val0 = __msa_copy_u_d((v2i64) dst0_r, 0);
3462         dst_val1 = __msa_copy_u_d((v2i64) dst1_r, 0);
3463         dst_val2 = __msa_copy_u_d((v2i64) dst2_r, 0);
3464         dst_val3 = __msa_copy_u_d((v2i64) dst3_r, 0);
3466         dst_val_int0 = __msa_copy_u_w((v4i32) dst0_r, 2);
3467         dst_val_int1 = __msa_copy_u_w((v4i32) dst1_r, 2);
3468         dst_val_int2 = __msa_copy_u_w((v4i32) dst2_r, 2);
3469         dst_val_int3 = __msa_copy_u_w((v4i32) dst3_r, 2);
/* The matching SD(dst_valN, dst) stores and dst advances appear elided. */
3472         SW(dst_val_int0, dst + 4);
3475         SW(dst_val_int1, dst + 4);
3478         SW(dst_val_int2, dst + 4);
3481         SW(dst_val_int3, dst + 4);
/* hevc_hv_4t_8x2_msa - HEVC epel 2-D (horizontal + vertical) 4-tap
 * interpolation for an 8x2 block; 8-bit source, 16-bit intermediate output.
 * Rows are filtered horizontally with filt0/filt1 into 16-bit values, then
 * pairs of filtered rows feed HEVC_FILT_4TAP vertically (filt_h0/filt_h1).
 * NOTE(review): this extract appears lossy -- several declaration and
 * parameter lines (e.g. src_stride/dst/dst_stride/height, mask1 setup,
 * dst0..dst2 initialization from const_vec) are not visible; verify
 * against the upstream file before relying on line-level details.
 */
3487 static void hevc_hv_4t_8x2_msa(uint8_t *src,
3491 const int8_t *filter_x,
3492 const int8_t *filter_y,
3495 v16i8 src0, src1, src2, src3, src4;
3497 v4i32 filt_h0, filt_h1;
3498 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3500 v8i16 filter_vec, const_vec;
3501 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3502 v8i16 dst0, dst1, dst2, dst3, dst4;
3503 v4i32 dst0_r, dst0_l, dst1_r, dst1_l;
3504 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3505 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* back up one row and one column so the 4-tap window covers the block */
3507 src -= (src_stride + 1);
/* broadcast the two horizontal filter tap pairs */
3509 filter_vec = LD_SH(filter_x);
3510 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend the vertical taps to 16 bit, then broadcast as 32-bit pairs */
3512 filter_vec = LD_SH(filter_y);
3513 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3514 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3516 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
/* bias vector; compensates the XORI_B*_128 sign flip of the source bytes */
3520 const_vec = __msa_ldi_h(128);
/* prologue: horizontally filter the first three rows */
3523 LD_SB3(src, src_stride, src0, src1, src2);
3524 src += (3 * src_stride);
3525 XORI_B3_128_SB(src0, src1, src2);
3527 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3528 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3529 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3532 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3534 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3536 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
/* interleave adjacent filtered rows (right/left halves) for vertical MACs */
3538 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3539 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* load rows 3 and 4 */
3541 LD_SB2(src, src_stride, src3, src4);
3542 src += (2 * src_stride);
3543 XORI_B2_128_SB(src3, src4);
/* output row 0: needs filtered rows 0..3 */
3545 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3547 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3549 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3550 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3551 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
/* output row 1: needs filtered rows 1..4 */
3555 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3557 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3559 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3560 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3561 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* pack the 32-bit vertical results to 16 bit and store both rows */
3565 PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
3566 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
/* hevc_hv_4t_8x6_msa - HEVC epel 2-D 4-tap interpolation for an 8x6 block.
 * Fully unrolled: horizontally filters 9 source rows (6 output rows + 3
 * prologue rows for the vertical 4-tap window) and produces 6 vertical
 * results, storing two 16-bit output rows per ST_SW2.
 * NOTE(review): extract appears lossy -- parameter lines, mask1 setup and
 * the dst0..dst8 = const_vec initializations are not visible here.
 */
3569 static void hevc_hv_4t_8x6_msa(uint8_t *src,
3573 const int8_t *filter_x,
3574 const int8_t *filter_y,
3577 v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
3579 v4i32 filt_h0, filt_h1;
3580 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3582 v8i16 filter_vec, const_vec;
3583 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3584 v8i16 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst8;
3585 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3586 v4i32 dst4_r, dst4_l, dst5_r, dst5_l;
3587 v8i16 dst10_r, dst32_r, dst10_l, dst32_l;
3588 v8i16 dst21_r, dst43_r, dst21_l, dst43_l;
3589 v8i16 dst54_r, dst54_l, dst65_r, dst65_l;
3590 v8i16 dst76_r, dst76_l, dst87_r, dst87_l;
/* back up one row and one column for the 4-tap window */
3592 src -= (src_stride + 1);
/* broadcast horizontal tap pairs */
3594 filter_vec = LD_SH(filter_x);
3595 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend vertical taps to 16 bit and broadcast */
3597 filter_vec = LD_SH(filter_y);
3598 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3599 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3601 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
/* bias compensating the XORI_B*_128 sign flip */
3605 const_vec = __msa_ldi_h(128);
/* prologue: horizontal filter of the first three rows */
3608 LD_SB3(src, src_stride, src0, src1, src2);
3609 src += (3 * src_stride);
3611 XORI_B3_128_SB(src0, src1, src2);
3613 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3614 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3615 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3618 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3620 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3622 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3624 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3625 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* rows 3-4 -> vertical output rows 0 and 1 */
3627 LD_SB2(src, src_stride, src3, src4);
3628 src += (2 * src_stride);
3630 XORI_B2_128_SB(src3, src4);
3633 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3635 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3637 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3638 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3640 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
3646 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3648 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3650 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3651 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3652 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* rows 5-6 -> vertical output rows 2 and 3 */
3656 LD_SB2(src, src_stride, src5, src6);
3657 src += (2 * src_stride);
3659 XORI_B2_128_SB(src5, src6);
3662 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3664 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3666 ILVRL_H2_SH(dst5, dst4, dst54_r, dst54_l);
3667 dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
3668 dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
3673 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3675 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst6, dst6);
3677 ILVRL_H2_SH(dst6, dst5, dst65_r, dst65_l);
3678 dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
3679 dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
/* rows 7-8 -> vertical output rows 4 and 5 */
3683 LD_SB2(src, src_stride, src7, src8);
3685 XORI_B2_128_SB(src7, src8);
3688 VSHF_B2_SB(src7, src7, src7, src7, mask0, mask1, vec0, vec1);
3690 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst7, dst7);
3692 ILVRL_H2_SH(dst7, dst6, dst76_r, dst76_l);
3693 dst4_r = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
3694 dst4_l = HEVC_FILT_4TAP(dst54_l, dst76_l, filt_h0, filt_h1);
3699 VSHF_B2_SB(src8, src8, src8, src8, mask0, mask1, vec0, vec1);
3701 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst8, dst8);
3703 ILVRL_H2_SH(dst8, dst7, dst87_r, dst87_l);
3704 dst5_r = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
3705 dst5_l = HEVC_FILT_4TAP(dst65_l, dst87_l, filt_h0, filt_h1);
/* pack all six 32-bit row results to 16 bit and store two rows at a time */
3709 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3710 dst2_l, dst2_r, dst3_l, dst3_r, dst0_r, dst1_r, dst2_r, dst3_r);
3711 PCKEV_H2_SW(dst4_l, dst4_r, dst5_l, dst5_r, dst4_r, dst5_r);
3713 ST_SW2(dst0_r, dst1_r, dst, dst_stride);
3714 dst += (2 * dst_stride);
3715 ST_SW2(dst2_r, dst3_r, dst, dst_stride);
3716 dst += (2 * dst_stride);
3717 ST_SW2(dst4_r, dst5_r, dst, dst_stride);
/* hevc_hv_4t_8multx4mult_msa - generic HEVC epel 2-D 4-tap interpolation
 * for widths that are multiples of 8 and heights that are multiples of 4.
 * Outer loop walks 8-column stripes (cnt = width >> 3); inner loop emits
 * 4 output rows per iteration, carrying the vertical-filter state
 * (dst2, dst10_*, dst21_*) across iterations so only 4 new rows are
 * loaded each pass. Register names are recycled (e.g. dst2/dst10/dst21
 * are reused for newer rows near the loop bottom) -- statement order is
 * load-bearing here.
 * NOTE(review): extract appears lossy -- parameter lines, mask1 setup,
 * the dst* = const_vec initializations, and the loop/stripe epilogue
 * (src/dst advance by 8, closing braces) are not visible.
 */
3720 static void hevc_hv_4t_8multx4mult_msa(uint8_t *src,
3724 const int8_t *filter_x,
3725 const int8_t *filter_y,
3729 uint32_t loop_cnt, cnt;
3732 v16i8 src0, src1, src2, src3, src4, src5, src6;
3734 v4i32 filt_h0, filt_h1;
3735 v16i8 mask0 = { 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 };
3737 v8i16 filter_vec, const_vec;
3738 v16i8 vec0, vec1, vec2, vec3, vec4, vec5;
3739 v8i16 dst0, dst1, dst2, dst3, dst4, dst5;
3740 v4i32 dst0_r, dst0_l, dst1_r, dst1_l, dst2_r, dst2_l, dst3_r, dst3_l;
3741 v8i16 dst10_r, dst32_r, dst21_r, dst43_r;
3742 v8i16 dst10_l, dst32_l, dst21_l, dst43_l;
/* back up one row and one column for the 4-tap window */
3744 src -= (src_stride + 1);
/* broadcast horizontal tap pairs */
3746 filter_vec = LD_SH(filter_x);
3747 SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
/* sign-extend vertical taps to 16 bit and broadcast */
3749 filter_vec = LD_SH(filter_y);
3750 vec0 = __msa_clti_s_b((v16i8) filter_vec, 0);
3751 filter_vec = (v8i16) __msa_ilvr_b(vec0, (v16i8) filter_vec);
3753 SPLATI_W2_SW(filter_vec, 0, filt_h0, filt_h1);
/* bias compensating the XORI_B*_128 sign flip */
3757 const_vec = __msa_ldi_h(128);
/* one iteration per 8-column stripe */
3760 for (cnt = width >> 3; cnt--;) {
/* stripe prologue: horizontal filter of the first three rows */
3764 LD_SB3(src_tmp, src_stride, src0, src1, src2);
3765 src_tmp += (3 * src_stride);
3767 XORI_B3_128_SB(src0, src1, src2);
3769 VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
3770 VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
3771 VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
3774 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
3776 DPADD_SB2_SH(vec2, vec3, filt0, filt1, dst1, dst1);
3778 DPADD_SB2_SH(vec4, vec5, filt0, filt1, dst2, dst2);
3780 ILVRL_H2_SH(dst1, dst0, dst10_r, dst10_l);
3781 ILVRL_H2_SH(dst2, dst1, dst21_r, dst21_l);
/* steady state: 4 source rows in, 4 output rows out per iteration */
3783 for (loop_cnt = height >> 2; loop_cnt--;) {
3784 LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
3785 src_tmp += (4 * src_stride);
3786 XORI_B4_128_SB(src3, src4, src5, src6);
/* output row 0 */
3788 VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
3790 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
3792 ILVRL_H2_SH(dst3, dst2, dst32_r, dst32_l);
3793 dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
3794 dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
/* output row 1 */
3800 VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec0, vec1);
3802 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst4, dst4);
3804 ILVRL_H2_SH(dst4, dst3, dst43_r, dst43_l);
3805 dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
3806 dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
/* output row 2 -- dst10_* is recycled to hold the row5/row4 interleave */
3811 VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec0, vec1);
3813 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
3815 ILVRL_H2_SH(dst5, dst4, dst10_r, dst10_l);
3816 dst2_r = HEVC_FILT_4TAP(dst32_r, dst10_r, filt_h0, filt_h1);
3817 dst2_l = HEVC_FILT_4TAP(dst32_l, dst10_l, filt_h0, filt_h1);
/* output row 3 -- dst2 and dst21_* are recycled for the newest rows,
 * leaving state positioned for the next loop iteration */
3823 VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec0, vec1);
3825 DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst2, dst2);
3827 ILVRL_H2_SH(dst2, dst5, dst21_r, dst21_l);
3828 dst3_r = HEVC_FILT_4TAP(dst43_r, dst21_r, filt_h0, filt_h1);
3829 dst3_l = HEVC_FILT_4TAP(dst43_l, dst21_l, filt_h0, filt_h1);
/* pack to 16 bit and store the four rows */
3834 PCKEV_H4_SW(dst0_l, dst0_r, dst1_l, dst1_r,
3835 dst2_l, dst2_r, dst3_l, dst3_r,
3836 dst0_r, dst1_r, dst2_r, dst3_r);
3838 ST_SW2(dst0_r, dst1_r, dst_tmp, dst_stride);
3839 dst_tmp += (2 * dst_stride);
3840 ST_SW2(dst2_r, dst3_r, dst_tmp, dst_stride);
3841 dst_tmp += (2 * dst_stride);
/* hevc_hv_4t_8w_msa - 8-column HEVC epel 2-D interpolation dispatcher.
 * Routes to the fixed-height kernels (8x2, 8x6) when one matches,
 * otherwise to the generic multiple-of-4-rows kernel with width 8.
 * (Signature reconstructed to match the sibling kernels and the MC_HV
 * call site: src, src_stride, dst, dst_stride, filter_x, filter_y, height.)
 */
static void hevc_hv_4t_8w_msa(uint8_t *src,
                              int32_t src_stride,
                              int16_t *dst,
                              int32_t dst_stride,
                              const int8_t *filter_x,
                              const int8_t *filter_y,
                              int32_t height)
{
    if (2 == height) {
        hevc_hv_4t_8x2_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, height);
    } else if (6 == height) {
        hevc_hv_4t_8x6_msa(src, src_stride, dst, dst_stride,
                           filter_x, filter_y, height);
    } else if (0 == (height % 4)) {
        hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                                   filter_x, filter_y, height, 8);
    }
}
/* hevc_hv_4t_12w_msa - 12-column HEVC epel 2-D interpolation.
 * Splits 12 = 8 + 4: the left 8 columns go through the generic 8-column
 * kernel, the remaining 4 columns through the 4-column kernel (src/dst
 * offset by 8). (Signature reconstructed to match sibling kernels and
 * the MC_HV call site.)
 */
static void hevc_hv_4t_12w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 8);

    hevc_hv_4t_4w_msa(src + 8, src_stride, dst + 8, dst_stride,
                      filter_x, filter_y, height);
}
/* hevc_hv_4t_16w_msa - 16-column HEVC epel 2-D interpolation; thin
 * wrapper over the generic 8-column-stripe kernel with width = 16.
 * (Signature reconstructed to match sibling kernels and the MC_HV
 * call site.)
 */
static void hevc_hv_4t_16w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 16);
}
/* hevc_hv_4t_24w_msa - 24-column HEVC epel 2-D interpolation; thin
 * wrapper over the generic 8-column-stripe kernel with width = 24.
 * (Signature reconstructed to match sibling kernels and the MC_HV
 * call site.)
 */
static void hevc_hv_4t_24w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 24);
}
/* hevc_hv_4t_32w_msa - 32-column HEVC epel 2-D interpolation; thin
 * wrapper over the generic 8-column-stripe kernel with width = 32.
 * (Signature reconstructed to match sibling kernels and the MC_HV
 * call site.)
 */
static void hevc_hv_4t_32w_msa(uint8_t *src,
                               int32_t src_stride,
                               int16_t *dst,
                               int32_t dst_stride,
                               const int8_t *filter_x,
                               const int8_t *filter_y,
                               int32_t height)
{
    hevc_hv_4t_8multx4mult_msa(src, src_stride, dst, dst_stride,
                               filter_x, filter_y, height, 32);
}
/* MC_COPY(WIDTH): emits the public entry point
 * ff_hevc_put_hevc_pel_pixels<WIDTH>_8_msa, which forwards to the
 * matching hevc_copy_<WIDTH>w_msa kernel with MAX_PB_SIZE as the
 * destination stride. NOTE(review): the macro tail and its MC_COPY(n)
 * instantiations are not visible in this extract.
 */
3922 #define MC_COPY(WIDTH) \
3923 void ff_hevc_put_hevc_pel_pixels##WIDTH##_8_msa(int16_t *dst, \
3925 ptrdiff_t src_stride, \
3931 hevc_copy_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, height); \
/* MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR): emits a 1-D interpolation
 * entry point ff_hevc_put_hevc_<PEL>_<DIR><WIDTH>_8_msa. It picks the
 * filter from ff_hevc_<PEL>_filters using the fractional position
 * (FILT_DIR is mx for horizontal, my for vertical; positions are
 * 1-based, hence the - 1) and forwards to the
 * hevc_<DIR1>_<TAP>t_<WIDTH>w_msa kernel with MAX_PB_SIZE dst stride.
 * NOTE(review): the macro tail is not visible in this extract.
 */
3946 #define MC(PEL, DIR, WIDTH, TAP, DIR1, FILT_DIR) \
3947 void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst, \
3949 ptrdiff_t src_stride, \
3955 const int8_t *filter = ff_hevc_##PEL##_filters[FILT_DIR - 1]; \
3957 hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, \
3958 MAX_PB_SIZE, filter, height); \
/* qpel horizontal: 8-tap luma filters, widths 4..64 */
3961 MC(qpel, h, 4, 8, hz, mx);
3962 MC(qpel, h, 8, 8, hz, mx);
3963 MC(qpel, h, 12, 8, hz, mx);
3964 MC(qpel, h, 16, 8, hz, mx);
3965 MC(qpel, h, 24, 8, hz, mx);
3966 MC(qpel, h, 32, 8, hz, mx);
3967 MC(qpel, h, 48, 8, hz, mx);
3968 MC(qpel, h, 64, 8, hz, mx);
/* qpel vertical: 8-tap luma filters, widths 4..64 */
3970 MC(qpel, v, 4, 8, vt, my);
3971 MC(qpel, v, 8, 8, vt, my);
3972 MC(qpel, v, 12, 8, vt, my);
3973 MC(qpel, v, 16, 8, vt, my);
3974 MC(qpel, v, 24, 8, vt, my);
3975 MC(qpel, v, 32, 8, vt, my);
3976 MC(qpel, v, 48, 8, vt, my);
3977 MC(qpel, v, 64, 8, vt, my);
/* epel horizontal: 4-tap chroma filters, widths 4..32 */
3979 MC(epel, h, 4, 4, hz, mx);
3980 MC(epel, h, 6, 4, hz, mx);
3981 MC(epel, h, 8, 4, hz, mx);
3982 MC(epel, h, 12, 4, hz, mx);
3983 MC(epel, h, 16, 4, hz, mx);
3984 MC(epel, h, 24, 4, hz, mx);
3985 MC(epel, h, 32, 4, hz, mx);
/* epel vertical: 4-tap chroma filters, widths 4..32 */
3987 MC(epel, v, 4, 4, vt, my);
3988 MC(epel, v, 6, 4, vt, my);
3989 MC(epel, v, 8, 4, vt, my);
3990 MC(epel, v, 12, 4, vt, my);
3991 MC(epel, v, 16, 4, vt, my);
3992 MC(epel, v, 24, 4, vt, my);
3993 MC(epel, v, 32, 4, vt, my);
/* MC_HV(PEL, DIR, WIDTH, TAP, DIR1): emits a 2-D (horizontal+vertical)
 * interpolation entry point ff_hevc_put_hevc_<PEL>_<DIR><WIDTH>_8_msa.
 * Looks up both filters from ff_hevc_<PEL>_filters using the 1-based
 * fractional positions mx and my, then forwards to the
 * hevc_<DIR1>_<TAP>t_<WIDTH>w_msa kernel with MAX_PB_SIZE dst stride.
 * NOTE(review): the macro tail is not visible in this extract.
 */
3997 #define MC_HV(PEL, DIR, WIDTH, TAP, DIR1) \
3998 void ff_hevc_put_hevc_##PEL##_##DIR##WIDTH##_8_msa(int16_t *dst, \
4000 ptrdiff_t src_stride, \
4006 const int8_t *filter_x = ff_hevc_##PEL##_filters[mx - 1]; \
4007 const int8_t *filter_y = ff_hevc_##PEL##_filters[my - 1]; \
4009 hevc_##DIR1##_##TAP##t_##WIDTH##w_msa(src, src_stride, dst, MAX_PB_SIZE, \
4010 filter_x, filter_y, height); \
/* qpel 2-D (hv): 8-tap luma filters, widths 4..64 */
4013 MC_HV(qpel, hv, 4, 8, hv);
4014 MC_HV(qpel, hv, 8, 8, hv);
4015 MC_HV(qpel, hv, 12, 8, hv);
4016 MC_HV(qpel, hv, 16, 8, hv);
4017 MC_HV(qpel, hv, 24, 8, hv);
4018 MC_HV(qpel, hv, 32, 8, hv);
4019 MC_HV(qpel, hv, 48, 8, hv);
4020 MC_HV(qpel, hv, 64, 8, hv);
/* epel 2-D (hv): 4-tap chroma filters, widths 4..32 */
4022 MC_HV(epel, hv, 4, 4, hv);
4023 MC_HV(epel, hv, 6, 4, hv);
4024 MC_HV(epel, hv, 8, 4, hv);
4025 MC_HV(epel, hv, 12, 4, hv);
4026 MC_HV(epel, hv, 16, 4, hv);
4027 MC_HV(epel, hv, 24, 4, hv);
4028 MC_HV(epel, hv, 32, 4, hv);