2 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hpeldsp_mips.h"
/* Pack the even-indexed bytes of in0/in1 into one 16-byte vector, average it
 * with dst (__msa_aver_u_b) and store the result at pdst.
 * NOTE(review): the macro's enclosing { } wrapper and the declaration of
 * tmp_m are missing from this extract -- restore from upstream. */
#define PCKEV_AVG_ST_UB(in0, in1, dst, pdst)                  \
    tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1);  \
    tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst);               \
    ST_UB(tmp_m, (pdst));                                     \
/* Pack even bytes of four vector pairs and store the four resulting vectors
 * to pdst with the given stride.
 * NOTE(review): the enclosing { } wrapper lines are missing in this extract. */
#define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride)  \
    v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                                   \
    uint8_t *pdst_m = (uint8_t *) (pdst);                                   \
    PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7,                     \
                tmp0_m, tmp1_m, tmp2_m, tmp3_m);                            \
    ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride);                 \
/* Pack even bytes of in1..in4 into two vectors, pack the low doublewords of
 * dst0..dst3 likewise, average pairwise and store four 8-byte rows.
 * NOTE(review): the continuation line of the parameter list (pdst, stride)
 * and the enclosing { } wrapper are missing in this extract. */
#define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3,  \
    v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m;                               \
    uint8_t *pdst_m = (uint8_t *) (pdst);                               \
    PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m);                    \
    PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m);                \
    AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m);        \
    ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride);                  \
/* Horizontal 1/2-pel bilinear, 4-pixel columns: each output byte is the
 * rounding average of a pixel and its right neighbour (SLDI by 1 + AVER),
 * two rows per iteration.
 * NOTE(review): the height parameter line, local declarations, the SW()
 * stores of out0/out1 and the closing braces are missing in this extract. */
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        /* shift left by one byte to align each pixel with its right neighbour */
        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
/* Horizontal 1/2-pel bilinear, 8-pixel columns, four rows per iteration;
 * averaging and the 8x4 store are done by AVER_ST8x4_UB.
 * NOTE(review): the height parameter line, opening brace, loop_cnt/zeros
 * declarations and closing braces are missing in this extract. */
static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);
        AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                      src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
/* Horizontal 1/2-pel bilinear, 16-pixel columns: loads rows at src and
 * src + 1 and averages them, eight rows per iteration.
 * NOTE(review): the height parameter line, opening brace, loop_cnt
 * declaration, the (dst, dst_stride) continuation lines of the
 * AVER_ST16x4_UB calls and closing braces are missing in this extract. */
static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 src8, src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
        dst += (4 * dst_stride);

        AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
        dst += (4 * dst_stride);
/* Horizontal 1/2-pel bilinear without rounding (AVE, truncating average),
 * fixed 8x8 block.
 * NOTE(review): the opening brace, zeros declaration and closing brace are
 * missing in this extract. */
static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
    v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;

    LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B4_SB(zeros, src4, zeros, src5, zeros, src6, zeros, src7, 1,
               src4_sld1, src5_sld1, src6_sld1, src7_sld1);

    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
    dst += (4 * dst_stride);
    AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
                 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
/* Horizontal 1/2-pel bilinear without rounding, fixed 8x4 block
 * (named 4x8 following the file's convention).
 * NOTE(review): the opening brace, zeros declaration and closing brace are
 * missing in this extract. */
static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
                 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
/* Horizontal 1/2-pel bilinear without rounding, fixed 16x16 block; averages
 * rows at src with rows at src + 1 via AVE_ST16x4_UB.
 * NOTE(review): the src_stride parameter line, opening brace, several
 * AVE_ST16x4_UB continuation lines and the closing brace are missing in
 * this extract. */
static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
    dst += (4 * dst_stride);

    /* reload the next four rows while the previous results are stored */
    LD_UB4(src, src_stride, src0, src1, src2, src3);
    LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
    dst += (4 * dst_stride);

    LD_UB4(src, src_stride, src4, src5, src6, src7);
    LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
    src += (4 * src_stride);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
/* Horizontal 1/2-pel bilinear without rounding, fixed 16x8 block
 * (named 8x16 following the file's convention).
 * NOTE(review): the src_stride parameter line, opening brace,
 * AVE_ST16x4_UB continuation lines and closing brace are missing in this
 * extract. */
static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);

    AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
/* Horizontal 1/2-pel bilinear, 4-pixel columns, result averaged with the
 * existing dst pixels (avg_pixels variant); two rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, local declarations
 * (including tmp0/tmp1/zeros), the `dst0 = LW(dst);' load, the SW() stores
 * and the closing braces are missing in this extract. */
static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              uint8_t *dst, int32_t dst_stride,
    uint32_t dst0, dst1, out0, out1;
    v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src0, src1);
        src += (2 * src_stride);

        SLDI_B2_UB(zeros, src0, zeros, src1, 1, src0_sld1, src1_sld1);
        dst1 = LW(dst + dst_stride);
        /* move the two 32-bit dst words into vector lanes for the average */
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);

        AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
/* Horizontal 1/2-pel bilinear, 8-pixel columns, averaged into dst
 * (avg_pixels variant) via AVER_DST_ST8x4_UB, four rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, opening brace,
 * loop_cnt/zeros declarations and closing braces are missing in this
 * extract. */
static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);

        SLDI_B4_SB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
                   src0_sld1, src1_sld1, src2_sld1, src3_sld1);

        AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
                          src3, src3_sld1, dst, dst_stride);
        dst += (4 * dst_stride);
/* Horizontal 1/2-pel bilinear, 16-pixel columns, averaged into dst
 * (avg_pixels variant), eight rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, opening brace,
 * loop_cnt declaration, AVER_DST_ST16x4_UB continuation lines and closing
 * braces are missing in this extract. */
static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src8, src9, src10, src11, src12, src13, src14, src15);
        src += (8 * src_stride);

        AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
        dst += (4 * dst_stride);
        AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
        dst += (4 * dst_stride);
/* Vertical 1/2-pel bilinear, 4-pixel columns: each output row is the
 * rounding average of a row and the row below it; two rows per iteration.
 * NOTE(review): the height parameter line, local declarations, the prologue
 * load of src0, the SW() stores, the `src0 = src2;' rotation and the
 * closing braces are missing in this extract. */
static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, res0, res1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
/* Vertical 1/2-pel bilinear, 8-pixel columns, four rows per iteration.
 * NOTE(review): the height parameter line, opening brace, prologue load of
 * src0, AVER_ST8x4_UB continuation line, row rotation and closing braces
 * are missing in this extract. */
static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
        dst += (4 * dst_stride);
/* Vertical 1/2-pel bilinear, 16-pixel columns, eight rows per iteration.
 * NOTE(review): the height parameter line, opening brace, prologue load of
 * src0, AVER_ST16x4_UB continuation lines, row rotation and closing braces
 * are missing in this extract. */
static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);

        AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
        dst += (4 * dst_stride);
        AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
        dst += (4 * dst_stride);
/* Vertical 1/2-pel bilinear without rounding (AVE), fixed 8x8 block.
 * NOTE(review): the opening brace, the load of src8 (the ninth row needed
 * by the last average), AVE_ST8x4_UB continuation lines and the closing
 * brace are missing in this extract. */
static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
    dst += (4 * dst_stride);

    AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
/* Vertical 1/2-pel bilinear without rounding, fixed 8x4 block: loads five
 * rows and averages each with the next.
 * NOTE(review): the opening brace, the (dst, dst_stride) continuation line
 * of AVE_ST8x4_UB and the closing brace are missing in this extract. */
static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4;

    LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
    AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
/* Vertical 1/2-pel bilinear without rounding, fixed 16x16 block: loads 17
 * rows and averages each with the next.
 * NOTE(review): the src_stride parameter line, opening brace, the load of
 * src16 (the 17th row), several AVE_ST16x4_UB continuation lines and the
 * closing brace are missing in this extract. */
static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src9, src10, src11, src12, src13, src14, src15, src16;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);
    LD_UB8(src, src_stride,
           src8, src9, src10, src11, src12, src13, src14, src15);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src12, src13, src13, src14,
                  src14, src15, src15, src16, dst, dst_stride);
/* Vertical 1/2-pel bilinear without rounding, fixed 16x8 block.
 * NOTE(review): the src_stride parameter line, opening brace, the load of
 * src8, AVE_ST16x4_UB continuation lines and the closing brace are missing
 * in this extract. */
static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
    dst += (4 * dst_stride);
    AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
/* Vertical 1/2-pel bilinear, 4-pixel columns, averaged into dst
 * (avg_pixels variant); two rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, declarations of
 * res0/res1/tmp0/tmp1, the prologue load of src0, the `dst0 = LW(dst);'
 * load, the SW() stores, row rotation and closing braces are missing in
 * this extract. */
static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              uint8_t *dst, int32_t dst_stride,
    uint32_t out0, out1, dst0, dst1;
    v16u8 src0, src1, src2;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_UB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        dst1 = LW(dst + dst_stride);
        tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
        tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
        AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
        AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
/* Vertical 1/2-pel bilinear, 8-pixel columns, averaged into dst
 * (avg_pixels variant), four rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, opening brace, the
 * prologue load of src0, the AVER_DST_ST8x4_UB continuation line, row
 * rotation and closing braces are missing in this extract. */
static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_UB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
        dst += (4 * dst_stride);
/* Vertical 1/2-pel bilinear, 16-pixel columns, averaged into dst
 * (avg_pixels variant), eight rows per iteration: average adjacent rows,
 * then average with the existing dst rows, then store.
 * NOTE(review): the src_stride/height parameter lines, opening brace, the
 * prologue load of src0, the `src0 = src8;' rotation and closing braces
 * are missing in this extract. */
static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
        src += (8 * src_stride);
        /* vertical averages of adjacent source rows */
        AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
                    res0, res1, res2, res3);
        AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
                    res4, res5, res6, res7);

        /* then average with the existing destination rows */
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
                    res0, res1, res2, res3);
        AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
                    res4, res5, res6, res7);
        ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
        dst += (8 * dst_stride);
/* Horizontal+vertical 1/2-pel bilinear, 4-pixel columns: horizontal pair
 * sums via HADD on interleaved (pixel, right-neighbour) bytes, vertical sum
 * of adjacent rows, then rounded shift by 2 (SRARI), i.e. a rounded 2x2
 * average. Two rows per iteration.
 * NOTE(review): the height parameter line, local declarations (loop_cnt,
 * res0/res1, zeros), the prologue load of src0, the SW() stores, row
 * rotation and closing braces are missing in this extract. */
static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r, res;
    v8u16 add0, add1, add2, sum0, sum1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
                   src0_r, src1_r, src2_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
        res0 = __msa_copy_u_w((v4i32) res, 0);
        res1 = __msa_copy_u_w((v4i32) res, 2);
/* Horizontal+vertical 1/2-pel bilinear, 8-pixel columns: rounded 2x2
 * average via HADD + ADD4 + SRARI(2); four rows per iteration.
 * NOTE(review): the height parameter line, local declarations, the
 * prologue load of src0, the `src1_r, src2_r);' continuation of
 * ILVR_B3_UB, row rotation and closing braces are missing in this
 * extract. */
static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
                                 uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
        ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
        dst += (4 * dst_stride);
/* Horizontal+vertical 1/2-pel bilinear, 16-pixel columns: interleaves each
 * row with its right-shifted copy (low/right and high/left halves),
 * horizontal pair sums via HADD, vertical sums of adjacent rows via ADD4,
 * rounded shift by 2, pack and store; eight rows per iteration.
 * NOTE(review): the height parameter line, opening brace, loop_cnt
 * declaration, the `src8 = LD_UB(src);' load and closing braces are
 * missing in this extract. */
static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src17 = LD_UB(src + 1);
        ILVRL_B2_UH(src9, src0, src0_r, src0_l);
        ILVRL_B2_UH(src10, src1, src1_r, src1_l);
        ILVRL_B2_UH(src11, src2, src2_r, src2_l);
        ILVRL_B2_UH(src12, src3, src3_r, src3_l);
        ILVRL_B2_UH(src13, src4, src4_r, src4_l);
        ILVRL_B2_UH(src14, src5, src5_r, src5_l);
        ILVRL_B2_UH(src15, src6, src6_r, src6_l);
        ILVRL_B2_UH(src16, src7, src7_r, src7_l);
        ILVRL_B2_UH(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
        HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
        HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
        HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
        HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
        HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
        ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
             sum0_r, sum1_r, sum2_r, sum3_r);
        ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
             sum4_r, sum5_r, sum6_r, sum7_r);
        ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
             sum0_l, sum1_l, sum2_l, sum3_l);
        ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
             sum4_l, sum5_l, sum6_l, sum7_l);
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
                     sum3_l, sum3_r, dst, dst_stride);
        dst += (4 * dst_stride);
        PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
                     sum7_l, sum7_r, dst, dst_stride);
        dst += (4 * dst_stride);
/* Horizontal+vertical 1/2-pel bilinear without rounding, fixed 8x8 block:
 * 2x2 sums plus 1, then a plain (non-rounding) shift by 2 via SRA_4V.
 * NOTE(review): the opening brace, the `src8 = LD_UB(src);' load, the
 * out0/out1 and zeros declarations, the `src5_r, src6_r);' continuation of
 * ILVR_B3_UH and the closing brace are missing in this extract. */
static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
    v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
    v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r;
    v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
    v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    src += (8 * src_stride);

    SLDI_B4_UB(zeros, src0, zeros, src1, zeros, src2, zeros, src3, 1,
               src0_sld1, src1_sld1, src2_sld1, src3_sld1);
    SLDI_B3_UB(zeros, src4, zeros, src5, zeros, src6, 1, src4_sld1,
               src5_sld1, src6_sld1);
    SLDI_B2_UB(zeros, src7, zeros, src8, 1, src7_sld1, src8_sld1);
    ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
               src3, src0_r, src1_r, src2_r, src3_r);
    ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
    ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
    HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);

    /* +1 then >> 2: truncating "no round" 2x2 average */
    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;
    sum4 = add4 + add5 + 1;
    sum5 = add5 + add6 + 1;
    sum6 = add6 + add7 + 1;
    sum7 = add7 + add8 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    SRA_4V(sum4, sum5, sum6, sum7, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
    PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
/* Horizontal+vertical 1/2-pel bilinear without rounding, fixed 8x4 block.
 * NOTE(review): the opening brace, the `src4 = LD_SB(src);' load, the
 * out0/out1 and zeros declarations, the `src1_r, src2_r);' continuation of
 * ILVR_B3_UH and the closing brace are missing in this extract. */
static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
                                         uint8_t *dst, int32_t dst_stride)
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    LD_SB4(src, src_stride, src0, src1, src2, src3);
    src += (4 * src_stride);

    SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
               src1_sld1, src2_sld1);
    SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
    ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
    ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
    HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
    HADD_UB2_UH(src3_r, src4_r, add3, add4);

    /* +1 then >> 2: truncating "no round" 2x2 average */
    sum0 = add0 + add1 + 1;
    sum1 = add1 + add2 + 1;
    sum2 = add2 + add3 + 1;
    sum3 = add3 + add4 + 1;

    SRA_4V(sum0, sum1, sum2, sum3, 2);
    PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
    ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
/* Horizontal+vertical 1/2-pel bilinear without rounding, fixed 16x16 block,
 * processed as two 16x8 halves; stores of the first half's upper rows are
 * interleaved with the second half's loads.
 * NOTE(review): the src_stride parameter line, opening brace, both
 * `src8 = LD_UB(src);' loads and the closing brace are missing in this
 * extract. */
static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
                                           uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);

    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    /* +1 then >> 2: truncating "no round" 2x2 average */
    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);

    /* second 16x8 half: reload before the remaining stores of the first */
    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);

    src17 = LD_UB(src + 1);

    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
    dst += (4 * dst_stride);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
/* Horizontal+vertical 1/2-pel bilinear without rounding, fixed 16x8 block
 * (named 8x16 following the file's convention).
 * NOTE(review): the src_stride parameter line, opening brace, the
 * `src8 = LD_UB(src);' load and the closing brace are missing in this
 * extract. */
static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
                                          uint8_t *dst, int32_t dst_stride)
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
    v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
    v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v8u16 src7_l, src8_l;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;

    LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
    LD_UB8((src + 1), src_stride,
           src9, src10, src11, src12, src13, src14, src15, src16);
    src += (8 * src_stride);

    src17 = LD_UB(src + 1);

    ILVRL_B2_UH(src9, src0, src0_r, src0_l);
    ILVRL_B2_UH(src10, src1, src1_r, src1_l);
    ILVRL_B2_UH(src11, src2, src2_r, src2_l);
    ILVRL_B2_UH(src12, src3, src3_r, src3_l);
    ILVRL_B2_UH(src13, src4, src4_r, src4_l);
    ILVRL_B2_UH(src14, src5, src5_r, src5_l);
    ILVRL_B2_UH(src15, src6, src6_r, src6_l);
    ILVRL_B2_UH(src16, src7, src7_r, src7_l);
    ILVRL_B2_UH(src17, src8, src8_r, src8_l);

    HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
    HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
    HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
    HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
    HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
    HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);

    /* +1 then >> 2: truncating "no round" 2x2 average */
    sum0_r = src0_r + src1_r + 1;
    sum1_r = src1_r + src2_r + 1;
    sum2_r = src2_r + src3_r + 1;
    sum3_r = src3_r + src4_r + 1;
    sum4_r = src4_r + src5_r + 1;
    sum5_r = src5_r + src6_r + 1;
    sum6_r = src6_r + src7_r + 1;
    sum7_r = src7_r + src8_r + 1;
    sum0_l = src0_l + src1_l + 1;
    sum1_l = src1_l + src2_l + 1;
    sum2_l = src2_l + src3_l + 1;
    sum3_l = src3_l + src4_l + 1;
    sum4_l = src4_l + src5_l + 1;
    sum5_l = src5_l + src6_l + 1;
    sum6_l = src6_l + src7_l + 1;
    sum7_l = src7_l + src8_l + 1;

    SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
    SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
    SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
    SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
    PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
                 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
    dst += (4 * dst_stride);
    PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
                 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
/* Horizontal+vertical 1/2-pel bilinear, 4-pixel columns, averaged into dst
 * (avg_pixels variant); two rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, loop_cnt/out0/out1
 * and zeros declarations, the prologue load of src0, the `src1_r,
 * src2_r);' continuation of ILVR_B3_UB, the SW() stores, row rotation and
 * closing braces are missing in this extract. */
static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
                                              uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
    v16u8 src0_r, src1_r, src2_r;
    v8u16 add0, add1, add2, sum0, sum1;
    v16u8 dst0, dst1, res0, res1;

    for (loop_cnt = (height >> 1); loop_cnt--;) {
        LD_SB2(src, src_stride, src1, src2);
        src += (2 * src_stride);

        LD_UB2(dst, dst_stride, dst0, dst1);
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        ADD2(add0, add1, add1, add2, sum0, sum1);
        SRARI_H2_UH(sum0, sum1, 2);
        PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
        /* average the interpolated rows with the existing dst rows */
        AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);

        out0 = __msa_copy_u_w((v4i32) res0, 0);
        out1 = __msa_copy_u_w((v4i32) res1, 0);
/* Horizontal+vertical 1/2-pel bilinear, 8-pixel columns, averaged into dst
 * (avg_pixels variant) via PCKEV_AVG_ST8x4_UB; four rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, loop_cnt and zeros
 * declarations, the prologue load of src0, the `src1_r, src2_r);'
 * continuation of ILVR_B3_UB, row rotation and closing braces are missing
 * in this extract. */
static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
                                              uint8_t *dst, int32_t dst_stride,
    v16i8 src0, src1, src2, src3, src4;
    v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
    v16u8 dst0, dst1, dst2, dst3;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
    v8u16 add0, add1, add2, add3, add4;
    v8u16 sum0, sum1, sum2, sum3;

    for (loop_cnt = (height >> 2); loop_cnt--;) {
        LD_SB4(src, src_stride, src1, src2, src3, src4);
        src += (4 * src_stride);

        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        SLDI_B3_SB(zeros, src0, zeros, src1, zeros, src2, 1, src0_sld1,
                   src1_sld1, src2_sld1);
        SLDI_B2_SB(zeros, src3, zeros, src4, 1, src3_sld1, src4_sld1);
        ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
        ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB2_UH(src3_r, src4_r, add3, add4);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
             sum0, sum1, sum2, sum3);
        SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
        PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
                           sum2, dst2, sum3, dst3, dst, dst_stride);
        dst += (4 * dst_stride);
/* Horizontal+vertical 1/2-pel bilinear, 16-pixel columns, averaged into
 * dst (avg_pixels variant) via PCKEV_AVG_ST_UB; eight rows per iteration.
 * NOTE(review): the src_stride/height parameter lines, loop_cnt
 * declaration, the `src8 = LD_UB(src);' load, the ADD4 continuation lines,
 * the `dst += dst_stride;' lines between the eight stores and the closing
 * braces are missing in this extract. */
static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
                                               uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
    v16u8 src11, src12, src13, src14, src15, src16, src17;
    v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
    v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
    v16u8 src7_l, src8_l;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
    v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
    v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
    v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;

    for (loop_cnt = (height >> 3); loop_cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        LD_UB8((src + 1), src_stride,
               src9, src10, src11, src12, src13, src14, src15, src16);
        src += (8 * src_stride);

        src17 = LD_UB(src + 1);

        ILVRL_B2_UB(src9, src0, src0_r, src0_l);
        ILVRL_B2_UB(src10, src1, src1_r, src1_l);
        ILVRL_B2_UB(src11, src2, src2_r, src2_l);
        ILVRL_B2_UB(src12, src3, src3_r, src3_l);
        ILVRL_B2_UB(src13, src4, src4_r, src4_l);
        ILVRL_B2_UB(src14, src5, src5_r, src5_l);
        ILVRL_B2_UB(src15, src6, src6_r, src6_l);
        ILVRL_B2_UB(src16, src7, src7_r, src7_l);
        ILVRL_B2_UB(src17, src8, src8_r, src8_l);
        HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
        HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
        HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
        HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
        HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
        HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
        ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
        ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
        SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
        SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
        SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
        SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
        PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
        PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
        PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
        PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
        PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
        PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
        PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
/* Copy an 8-byte-wide block of 'height' rows from src to dst.
 * Height is dispatched on its largest divisor (12 / 8 / 4 / 2) so rows can
 * be moved in the biggest possible batches; each vector load's low
 * doubleword carries the 8 output bytes, stored with SD4/SD.
 * NOTE(review): the extraction is missing a few lines here (the 'height'
 * parameter, opening brace, 'cnt' declaration and the final 2-row store /
 * closing braces) -- verify against the upstream file. */
static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
    uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        /* 12 rows per iteration: 8 rows, then 4 rows. */
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            /* Extract the low 8 bytes of each loaded row. */
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
            /* Remaining 4 of the 12 rows. */
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 8) {
        /* 8 rows per iteration. */
        for (cnt = height >> 3; cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            out4 = __msa_copy_u_d((v2i64) src4, 0);
            out5 = __msa_copy_u_d((v2i64) src5, 0);
            out6 = __msa_copy_u_d((v2i64) src6, 0);
            out7 = __msa_copy_u_d((v2i64) src7, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
            SD4(out4, out5, out6, out7, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 4) {
        /* 4 rows per iteration. */
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
            out2 = __msa_copy_u_d((v2i64) src2, 0);
            out3 = __msa_copy_u_d((v2i64) src3, 0);
            SD4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 2) {
        /* 2 rows per iteration (store lines lost in this extraction). */
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            out0 = __msa_copy_u_d((v2i64) src0, 0);
            out1 = __msa_copy_u_d((v2i64) src1, 0);
/* Copy a block whose width is a multiple of 16 and whose height is a
 * multiple of 8: the outer loop walks 16-byte column stripes, the inner
 * loop copies 8 full rows of one stripe per iteration.
 * NOTE(review): the extraction is missing the opening brace, the dst_tmp
 * declaration, the src_tmp/dst_tmp initialisation and the trailing
 * "src += 16; dst += 16;" / closing braces -- verify upstream. */
static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
                                  uint8_t *dst, int32_t dst_stride,
                                  int32_t height, int32_t width)
    int32_t cnt, loop_cnt;
    const uint8_t *src_tmp;
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    /* One iteration per 16-byte-wide column stripe. */
    for (cnt = (width >> 4); cnt--;) {
        /* 8 rows of this stripe per iteration. */
        for (loop_cnt = (height >> 3); loop_cnt--;) {
            LD_UB8(src_tmp, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src_tmp += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
                   dst_tmp, dst_stride);
            dst_tmp += (8 * dst_stride);
/* Copy a 16-byte-wide block of 'height' rows, dispatching on
 * height % 12 / % 8 / % 4.  The % 8 case is delegated to
 * copy_16multx8mult_msa with width = 16.
 * NOTE(review): the extraction is missing the 'height' parameter line,
 * opening brace, 'cnt' declaration, one ST_UB8 continuation line and the
 * closing braces -- verify against the upstream file. */
static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
                             uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;

    if (0 == height % 12) {
        /* 12 rows per iteration: 8 rows, then 4 rows. */
        for (cnt = (height / 12); cnt--;) {
            LD_UB8(src, src_stride,
                   src0, src1, src2, src3, src4, src5, src6, src7);
            src += (8 * src_stride);
            ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
            dst += (8 * dst_stride);
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == height % 8) {
        copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
    } else if (0 == height % 4) {
        /* 4 rows per iteration. */
        for (cnt = (height >> 2); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            ST_UB4(src0, src1, src2, src3, dst, dst_stride);
            dst += (4 * dst_stride);
/* Rounding-average a 4-byte-wide source block into dst
 * (dst = aver(src, dst)), dispatching on height % 4 / % 2.
 * Only the low 32-bit word of each averaged vector is stored.
 * NOTE(review): the extraction is missing the 'height' parameter line,
 * opening brace, 'cnt' declaration and the final SW2 store / closing
 * braces -- verify against the upstream file. */
static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
    uint32_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    if (0 == (height % 4)) {
        /* 4 rows per iteration. */
        for (cnt = (height / 4); cnt--;) {
            LD_UB4(src, src_stride, src0, src1, src2, src3);
            src += (4 * src_stride);
            LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
            /* Unsigned rounding average of src and dst rows. */
            AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                        dst0, dst1, dst2, dst3);
            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
            out2 = __msa_copy_u_w((v4i32) dst2, 0);
            out3 = __msa_copy_u_w((v4i32) dst3, 0);
            SW4(out0, out1, out2, out3, dst, dst_stride);
            dst += (4 * dst_stride);
    } else if (0 == (height % 2)) {
        /* 2 rows per iteration (store lines lost in this extraction). */
        for (cnt = (height / 2); cnt--;) {
            LD_UB2(src, src_stride, src0, src1);
            src += (2 * src_stride);
            LD_UB2(dst, dst_stride, dst0, dst1);
            AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
            out0 = __msa_copy_u_w((v4i32) dst0, 0);
            out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* Rounding-average an 8-byte-wide source block into dst
 * (dst = aver(src, dst)), 4 rows per loop iteration; height must be a
 * multiple of 4.  The low doubleword of each averaged vector is stored.
 * NOTE(review): the extraction is missing the 'height' parameter line,
 * opening brace, 'cnt' declaration and closing braces -- verify upstream. */
static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
                           uint8_t *dst, int32_t dst_stride,
    uint64_t out0, out1, out2, out3;
    v16u8 src0, src1, src2, src3;
    v16u8 dst0, dst1, dst2, dst3;

    for (cnt = (height / 4); cnt--;) {
        LD_UB4(src, src_stride, src0, src1, src2, src3);
        src += (4 * src_stride);
        LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
        /* Unsigned rounding average of src and dst rows. */
        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        out0 = __msa_copy_u_d((v2i64) dst0, 0);
        out1 = __msa_copy_u_d((v2i64) dst1, 0);
        out2 = __msa_copy_u_d((v2i64) dst2, 0);
        out3 = __msa_copy_u_d((v2i64) dst3, 0);
        SD4(out0, out1, out2, out3, dst, dst_stride);
        dst += (4 * dst_stride);
/* Rounding-average a 16-byte-wide source block into dst
 * (dst = aver(src, dst)), 8 full rows per loop iteration; height must be
 * a multiple of 8.
 * NOTE(review): the extraction is missing the 'height' parameter line,
 * opening brace, 'cnt' declaration and closing braces -- verify upstream. */
static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
                            uint8_t *dst, int32_t dst_stride,
    v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
    v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;

    for (cnt = (height / 8); cnt--;) {
        LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
        src += (8 * src_stride);
        LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
        /* Unsigned rounding average of src and dst rows. */
        AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
                    dst0, dst1, dst2, dst3);
        AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
                    dst4, dst5, dst6, dst7);
        ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
        dst += (8 * dst_stride);
/* hpeldsp op: put_pixels16 -- plain 16-wide copy of h rows, no interpolation. */
void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    copy_width16_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels16_x2 -- 16-wide horizontal bilinear (rounding). */
void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels16_y2 -- 16-wide vertical bilinear (rounding). */
void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels16_xy2 -- 16-wide 2-D bilinear (rounding). */
void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels8 -- plain 8-wide copy of h rows. */
void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    copy_width8_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels8_x2 -- 8-wide horizontal bilinear (rounding). */
void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels8_y2 -- 8-wide vertical bilinear (rounding). */
void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels8_xy2 -- 8-wide 2-D bilinear (rounding). */
void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels4_x2 -- 4-wide horizontal bilinear (rounding). */
void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels4_y2 -- 4-wide vertical bilinear (rounding). */
void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_pixels4_xy2 -- 4-wide 2-D bilinear (rounding). */
void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: put_no_rnd_pixels16_x2 -- horizontal bilinear, truncating
 * ("no rounding") variant; dispatches on full (16) vs half (8) height.
 * NOTE(review): the leading "if (h == 16) {" and closing braces were
 * missing from the reviewed extraction and have been restored from the
 * visible else-branch -- confirm against upstream. */
void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}
/* hpeldsp op: put_no_rnd_pixels16_y2 -- vertical bilinear, truncating
 * ("no rounding") variant; dispatches on full (16) vs half (8) height.
 * NOTE(review): the leading "if (h == 16) {" and closing braces were
 * restored from the visible else-branch -- confirm against upstream. */
void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}
/* hpeldsp op: put_no_rnd_pixels16_xy2 -- 2-D bilinear, truncating
 * ("no rounding") variant; dispatches on full (16) vs half (8) height.
 * NOTE(review): the leading "if (h == 16) {" and closing braces were
 * restored from the visible else-branch -- confirm against upstream. */
void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
                                    const uint8_t *pixels,
                                    ptrdiff_t line_size, int h)
{
    if (h == 16) {
        common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
    } else if (h == 8) {
        common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
    }
}
/* hpeldsp op: put_no_rnd_pixels8_x2 -- horizontal bilinear, truncating
 * ("no rounding") variant; dispatches on full (8) vs half (4) height.
 * NOTE(review): the leading "if (h == 8) {" and closing braces were
 * restored from the visible else-branch -- confirm against upstream. */
void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}
/* hpeldsp op: put_no_rnd_pixels8_y2 -- vertical bilinear, truncating
 * ("no rounding") variant; dispatches on full (8) vs half (4) height.
 * NOTE(review): the leading "if (h == 8) {" and closing braces were
 * restored from the visible else-branch -- confirm against upstream. */
void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                                  ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}
/* hpeldsp op: put_no_rnd_pixels8_xy2 -- 2-D bilinear, truncating
 * ("no rounding") variant; dispatches on full (8) vs half (4) height.
 * NOTE(review): the leading "if (h == 8) {" and closing braces were
 * restored from the visible else-branch -- confirm against upstream. */
void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                                   ptrdiff_t line_size, int h)
{
    if (h == 8) {
        common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
    } else if (h == 4) {
        common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
    }
}
/* hpeldsp op: avg_pixels16 -- 16-wide copy averaged into existing dst. */
void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
                         ptrdiff_t line_size, int h)
{
    avg_width16_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels16_x2 -- horizontal bilinear averaged into dst. */
void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels16_y2 -- vertical bilinear averaged into dst. */
void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels16_xy2 -- 2-D bilinear averaged into dst. */
void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels8 -- 8-wide copy averaged into existing dst. */
void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width8_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels8_x2 -- horizontal bilinear averaged into dst. */
void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels8_y2 -- vertical bilinear averaged into dst. */
void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels8_xy2 -- 2-D bilinear averaged into dst. */
void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
                            ptrdiff_t line_size, int h)
{
    common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels4 -- 4-wide copy averaged into existing dst. */
void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
                        ptrdiff_t line_size, int h)
{
    avg_width4_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels4_x2 -- horizontal bilinear averaged into dst. */
void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}
/* hpeldsp op: avg_pixels4_y2 -- vertical bilinear averaged into dst. */
void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h)
{
    common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
}
1512 void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
1513 ptrdiff_t line_size, int h)
1515 common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);