/*
 * Copyright (c) 2015 Parag Salasakar (Parag.Salasakar@imgtec.com)
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
21 #include "libavutil/mips/generic_macros_msa.h"
22 #include "libavcodec/mips/hpeldsp_mips.h"
24 #define PCKEV_AVG_ST_UB(in0, in1, dst, pdst) \
28 tmp_m = (v16u8) __msa_pckev_b((v16i8) in0, (v16i8) in1); \
29 tmp_m = __msa_aver_u_b(tmp_m, (v16u8) dst); \
30 ST_UB(tmp_m, (pdst)); \
33 #define PCKEV_ST_SB4(in0, in1, in2, in3, in4, in5, in6, in7, pdst, stride) \
35 v16i8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
36 uint8_t *pdst_m = (uint8_t *) (pdst); \
38 PCKEV_B4_SB(in0, in1, in2, in3, in4, in5, in6, in7, \
39 tmp0_m, tmp1_m, tmp2_m, tmp3_m); \
40 ST_SB4(tmp0_m, tmp1_m, tmp2_m, tmp3_m, pdst_m, stride); \
43 #define PCKEV_AVG_ST8x4_UB(in1, dst0, in2, dst1, in3, dst2, in4, dst3, \
46 v16u8 tmp0_m, tmp1_m, tmp2_m, tmp3_m; \
47 uint8_t *pdst_m = (uint8_t *) (pdst); \
49 PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
50 PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
51 AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
52 ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
55 static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
56 uint8_t *dst, int32_t dst_stride,
61 v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
63 for (loop_cnt = (height >> 1); loop_cnt--;) {
64 LD_UB2(src, src_stride, src0, src1);
65 src += (2 * src_stride);
67 SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
68 AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
70 out0 = __msa_copy_u_w((v4i32) res0, 0);
71 out1 = __msa_copy_u_w((v4i32) res1, 0);
79 static void common_hz_bil_8w_msa(const uint8_t *src, int32_t src_stride,
80 uint8_t *dst, int32_t dst_stride,
84 v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
86 for (loop_cnt = (height >> 2); loop_cnt--;) {
87 LD_SB4(src, src_stride, src0, src1, src2, src3);
88 src += (4 * src_stride);
90 SLDI_B4_0_SB(src0, src1, src2, src3,
91 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
92 AVER_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
93 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
94 dst += (4 * dst_stride);
98 static void common_hz_bil_16w_msa(const uint8_t *src, int32_t src_stride,
99 uint8_t *dst, int32_t dst_stride,
103 v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
104 v16u8 src8, src9, src10, src11, src12, src13, src14, src15;
106 for (loop_cnt = (height >> 3); loop_cnt--;) {
107 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
108 LD_UB8((src + 1), src_stride,
109 src8, src9, src10, src11, src12, src13, src14, src15);
110 src += (8 * src_stride);
112 AVER_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
114 dst += (4 * dst_stride);
116 AVER_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
118 dst += (4 * dst_stride);
122 static void common_hz_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
123 uint8_t *dst, int32_t dst_stride)
125 v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
126 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
127 v16i8 src4_sld1, src5_sld1, src6_sld1, src7_sld1;
129 LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
130 src += (8 * src_stride);
132 SLDI_B4_0_SB(src0, src1, src2, src3,
133 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
134 SLDI_B4_0_SB(src4, src5, src6, src7,
135 src4_sld1, src5_sld1, src6_sld1, src7_sld1, 1);
137 AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
138 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
139 dst += (4 * dst_stride);
140 AVE_ST8x4_UB(src4, src4_sld1, src5, src5_sld1,
141 src6, src6_sld1, src7, src7_sld1, dst, dst_stride);
144 static void common_hz_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
145 uint8_t *dst, int32_t dst_stride)
147 v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
149 LD_SB4(src, src_stride, src0, src1, src2, src3);
150 SLDI_B4_0_SB(src0, src1, src2, src3,
151 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
152 AVE_ST8x4_UB(src0, src0_sld1, src1, src1_sld1,
153 src2, src2_sld1, src3, src3_sld1, dst, dst_stride);
156 static void common_hz_bil_no_rnd_16x16_msa(const uint8_t *src,
158 uint8_t *dst, int32_t dst_stride)
160 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
161 v16u8 src9, src10, src11, src12, src13, src14, src15;
163 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
164 LD_UB8((src + 1), src_stride,
165 src8, src9, src10, src11, src12, src13, src14, src15);
166 src += (8 * src_stride);
168 AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
170 dst += (4 * dst_stride);
172 LD_UB4(src, src_stride, src0, src1, src2, src3);
173 LD_UB4((src + 1), src_stride, src8, src9, src10, src11);
174 src += (4 * src_stride);
176 AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
178 dst += (4 * dst_stride);
180 LD_UB4(src, src_stride, src4, src5, src6, src7);
181 LD_UB4((src + 1), src_stride, src12, src13, src14, src15);
182 src += (4 * src_stride);
184 AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
186 dst += (4 * dst_stride);
187 AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
191 static void common_hz_bil_no_rnd_8x16_msa(const uint8_t *src,
193 uint8_t *dst, int32_t dst_stride)
195 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
196 v16u8 src9, src10, src11, src12, src13, src14, src15;
198 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
199 LD_UB8((src + 1), src_stride,
200 src8, src9, src10, src11, src12, src13, src14, src15);
202 AVE_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
204 dst += (4 * dst_stride);
205 AVE_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
209 static void common_hz_bil_and_aver_dst_4w_msa(const uint8_t *src,
211 uint8_t *dst, int32_t dst_stride,
215 uint32_t dst0, dst1, out0, out1;
216 v16u8 src0, src1, src0_sld1, src1_sld1, res0, res1;
220 for (loop_cnt = (height >> 1); loop_cnt--;) {
221 LD_UB2(src, src_stride, src0, src1);
222 src += (2 * src_stride);
224 SLDI_B2_0_UB(src0, src1, src0_sld1, src1_sld1, 1);
227 dst1 = LW(dst + dst_stride);
228 tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
229 tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
231 AVER_UB2_UB(src0_sld1, src0, src1_sld1, src1, res0, res1);
232 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
234 out0 = __msa_copy_u_w((v4i32) res0, 0);
235 out1 = __msa_copy_u_w((v4i32) res1, 0);
243 static void common_hz_bil_and_aver_dst_8w_msa(const uint8_t *src,
245 uint8_t *dst, int32_t dst_stride,
249 v16i8 src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1, src3_sld1;
251 for (loop_cnt = (height >> 2); loop_cnt--;) {
252 LD_SB4(src, src_stride, src0, src1, src2, src3);
253 src += (4 * src_stride);
255 SLDI_B4_0_SB(src0, src1, src2, src3,
256 src0_sld1, src1_sld1, src2_sld1, src3_sld1, 1);
258 AVER_DST_ST8x4_UB(src0, src0_sld1, src1, src1_sld1, src2, src2_sld1,
259 src3, src3_sld1, dst, dst_stride);
260 dst += (4 * dst_stride);
264 static void common_hz_bil_and_aver_dst_16w_msa(const uint8_t *src,
266 uint8_t *dst, int32_t dst_stride,
270 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
271 v16u8 src9, src10, src11, src12, src13, src14, src15;
273 for (loop_cnt = (height >> 3); loop_cnt--;) {
274 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
275 LD_UB8((src + 1), src_stride,
276 src8, src9, src10, src11, src12, src13, src14, src15);
277 src += (8 * src_stride);
279 AVER_DST_ST16x4_UB(src0, src8, src1, src9, src2, src10, src3, src11,
281 dst += (4 * dst_stride);
282 AVER_DST_ST16x4_UB(src4, src12, src5, src13, src6, src14, src7, src15,
284 dst += (4 * dst_stride);
288 static void common_vt_bil_4w_msa(const uint8_t *src, int32_t src_stride,
289 uint8_t *dst, int32_t dst_stride,
294 v16u8 src0, src1, src2, res0, res1;
299 for (loop_cnt = (height >> 1); loop_cnt--;) {
300 LD_UB2(src, src_stride, src1, src2);
301 src += (2 * src_stride);
303 AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
305 out0 = __msa_copy_u_w((v4i32) res0, 0);
306 out1 = __msa_copy_u_w((v4i32) res1, 0);
316 static void common_vt_bil_8w_msa(const uint8_t *src, int32_t src_stride,
317 uint8_t *dst, int32_t dst_stride,
321 v16u8 src0, src1, src2, src3, src4;
326 for (loop_cnt = (height >> 2); loop_cnt--;) {
327 LD_UB4(src, src_stride, src1, src2, src3, src4);
328 src += (4 * src_stride);
330 AVER_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
332 dst += (4 * dst_stride);
338 static void common_vt_bil_16w_msa(const uint8_t *src, int32_t src_stride,
339 uint8_t *dst, int32_t dst_stride,
343 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
348 for (loop_cnt = (height >> 3); loop_cnt--;) {
349 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
350 src += (8 * src_stride);
352 AVER_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
354 dst += (4 * dst_stride);
355 AVER_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
357 dst += (4 * dst_stride);
363 static void common_vt_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
364 uint8_t *dst, int32_t dst_stride)
366 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
368 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
369 src += (8 * src_stride);
372 AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
374 dst += (4 * dst_stride);
376 AVE_ST8x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
380 static void common_vt_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
381 uint8_t *dst, int32_t dst_stride)
383 v16u8 src0, src1, src2, src3, src4;
385 LD_UB5(src, src_stride, src0, src1, src2, src3, src4);
386 AVE_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
390 static void common_vt_bil_no_rnd_16x16_msa(const uint8_t *src,
392 uint8_t *dst, int32_t dst_stride)
394 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
395 v16u8 src9, src10, src11, src12, src13, src14, src15, src16;
397 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
398 src += (8 * src_stride);
399 LD_UB8(src, src_stride,
400 src8, src9, src10, src11, src12, src13, src14, src15);
401 src += (8 * src_stride);
404 AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
406 dst += (4 * dst_stride);
407 AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
409 dst += (4 * dst_stride);
410 AVE_ST16x4_UB(src8, src9, src9, src10, src10, src11, src11, src12,
412 dst += (4 * dst_stride);
413 AVE_ST16x4_UB(src12, src13, src13, src14,
414 src14, src15, src15, src16, dst, dst_stride);
417 static void common_vt_bil_no_rnd_8x16_msa(const uint8_t *src,
419 uint8_t *dst, int32_t dst_stride)
421 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
423 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
424 src += (8 * src_stride);
427 AVE_ST16x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
429 dst += (4 * dst_stride);
430 AVE_ST16x4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
434 static void common_vt_bil_and_aver_dst_4w_msa(const uint8_t *src,
436 uint8_t *dst, int32_t dst_stride,
440 uint32_t out0, out1, dst0, dst1;
441 v16u8 src0, src1, src2;
449 for (loop_cnt = (height >> 1); loop_cnt--;) {
450 LD_UB2(src, src_stride, src1, src2);
451 src += (2 * src_stride);
453 dst1 = LW(dst + dst_stride);
454 tmp0 = (v16u8) __msa_insert_w((v4i32) tmp0, 0, dst0);
455 tmp1 = (v16u8) __msa_insert_w((v4i32) tmp1, 0, dst1);
456 AVER_UB2_UB(src0, src1, src1, src2, res0, res1);
457 AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
458 out0 = __msa_copy_u_w((v4i32) res0, 0);
459 out1 = __msa_copy_u_w((v4i32) res1, 0);
468 static void common_vt_bil_and_aver_dst_8w_msa(const uint8_t *src,
470 uint8_t *dst, int32_t dst_stride,
474 v16u8 src0, src1, src2, src3, src4;
479 for (loop_cnt = (height >> 2); loop_cnt--;) {
480 LD_UB4(src, src_stride, src1, src2, src3, src4);
481 src += (4 * src_stride);
483 AVER_DST_ST8x4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
485 dst += (4 * dst_stride);
490 static void common_vt_bil_and_aver_dst_16w_msa(const uint8_t *src,
492 uint8_t *dst, int32_t dst_stride,
496 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
497 v16u8 res0, res1, res2, res3, res4, res5, res6, res7;
498 v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
503 for (loop_cnt = (height >> 3); loop_cnt--;) {
504 LD_UB8(src, src_stride, src1, src2, src3, src4, src5, src6, src7, src8);
505 src += (8 * src_stride);
506 AVER_UB4_UB(src0, src1, src1, src2, src2, src3, src3, src4,
507 res0, res1, res2, res3);
508 AVER_UB4_UB(src4, src5, src5, src6, src6, src7, src7, src8,
509 res4, res5, res6, res7);
511 LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
512 AVER_UB4_UB(dst0, res0, dst1, res1, dst2, res2, dst3, res3,
513 res0, res1, res2, res3);
514 AVER_UB4_UB(dst4, res4, dst5, res5, dst6, res6, dst7, res7,
515 res4, res5, res6, res7);
516 ST_UB8(res0, res1, res2, res3, res4, res5, res6, res7, dst, dst_stride);
517 dst += (8 * dst_stride);
523 static void common_hv_bil_4w_msa(const uint8_t *src, int32_t src_stride,
524 uint8_t *dst, int32_t dst_stride,
529 v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
530 v16u8 src0_r, src1_r, src2_r, res;
531 v8u16 add0, add1, add2, sum0, sum1;
536 for (loop_cnt = (height >> 1); loop_cnt--;) {
537 LD_SB2(src, src_stride, src1, src2);
538 src += (2 * src_stride);
540 SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
541 ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2,
542 src0_r, src1_r, src2_r);
543 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
544 ADD2(add0, add1, add1, add2, sum0, sum1);
545 SRARI_H2_UH(sum0, sum1, 2);
546 res = (v16u8) __msa_pckev_b((v16i8) sum1, (v16i8) sum0);
547 res0 = __msa_copy_u_w((v4i32) res, 0);
548 res1 = __msa_copy_u_w((v4i32) res, 2);
558 static void common_hv_bil_8w_msa(const uint8_t *src, int32_t src_stride,
559 uint8_t *dst, int32_t dst_stride,
563 v16i8 src0, src1, src2, src3, src4;
564 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
565 v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
566 v8u16 add0, add1, add2, add3, add4;
567 v8u16 sum0, sum1, sum2, sum3;
572 for (loop_cnt = (height >> 2); loop_cnt--;) {
573 LD_SB4(src, src_stride, src1, src2, src3, src4);
574 src += (4 * src_stride);
576 SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
577 SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
578 ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
580 ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
581 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
582 HADD_UB2_UH(src3_r, src4_r, add3, add4);
583 ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
584 sum0, sum1, sum2, sum3);
585 SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
586 PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
587 ST8x4_UB(src0, src1, dst, dst_stride);
588 dst += (4 * dst_stride);
593 static void common_hv_bil_16w_msa(const uint8_t *src, int32_t src_stride,
594 uint8_t *dst, int32_t dst_stride,
598 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
599 v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
600 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
601 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
602 v8u16 src7_l, src8_l;
603 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
604 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
606 for (loop_cnt = (height >> 3); loop_cnt--;) {
607 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
608 LD_UB8((src + 1), src_stride,
609 src9, src10, src11, src12, src13, src14, src15, src16);
610 src += (8 * src_stride);
613 src17 = LD_UB(src + 1);
615 ILVRL_B2_UH(src9, src0, src0_r, src0_l);
616 ILVRL_B2_UH(src10, src1, src1_r, src1_l);
617 ILVRL_B2_UH(src11, src2, src2_r, src2_l);
618 ILVRL_B2_UH(src12, src3, src3_r, src3_l);
619 ILVRL_B2_UH(src13, src4, src4_r, src4_l);
620 ILVRL_B2_UH(src14, src5, src5_r, src5_l);
621 ILVRL_B2_UH(src15, src6, src6_r, src6_l);
622 ILVRL_B2_UH(src16, src7, src7_r, src7_l);
623 ILVRL_B2_UH(src17, src8, src8_r, src8_l);
624 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
625 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
626 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
627 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
628 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
629 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
630 ADD4(src0_r, src1_r, src1_r, src2_r, src2_r, src3_r, src3_r, src4_r,
631 sum0_r, sum1_r, sum2_r, sum3_r);
632 ADD4(src4_r, src5_r, src5_r, src6_r, src6_r, src7_r, src7_r, src8_r,
633 sum4_r, sum5_r, sum6_r, sum7_r);
634 ADD4(src0_l, src1_l, src1_l, src2_l, src2_l, src3_l, src3_l, src4_l,
635 sum0_l, sum1_l, sum2_l, sum3_l);
636 ADD4(src4_l, src5_l, src5_l, src6_l, src6_l, src7_l, src7_l, src8_l,
637 sum4_l, sum5_l, sum6_l, sum7_l);
638 SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
639 SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
640 SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
641 SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
642 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r, sum2_l, sum2_r,
643 sum3_l, sum3_r, dst, dst_stride);
644 dst += (4 * dst_stride);
645 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r, sum6_l, sum6_r,
646 sum7_l, sum7_r, dst, dst_stride);
647 dst += (4 * dst_stride);
651 static void common_hv_bil_no_rnd_8x8_msa(const uint8_t *src, int32_t src_stride,
652 uint8_t *dst, int32_t dst_stride)
654 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8;
655 v16u8 src0_sld1, src1_sld1, src2_sld1, src3_sld1;
656 v16u8 src4_sld1, src5_sld1, src6_sld1, src7_sld1, src8_sld1;
657 v8u16 src0_r, src1_r, src2_r, src3_r;
658 v8u16 src4_r, src5_r, src6_r, src7_r, src8_r;
659 v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
660 v8u16 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
663 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
664 src += (8 * src_stride);
667 SLDI_B4_0_UB(src0, src1, src2, src3, src0_sld1, src1_sld1, src2_sld1,
669 SLDI_B3_0_UB(src4, src5, src6, src4_sld1, src5_sld1, src6_sld1, 1);
670 SLDI_B2_0_UB(src7, src8, src7_sld1, src8_sld1, 1);
671 ILVR_B4_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src3_sld1,
672 src3, src0_r, src1_r, src2_r, src3_r);
673 ILVR_B3_UH(src4_sld1, src4, src5_sld1, src5, src6_sld1, src6, src4_r,
675 ILVR_B2_UH(src7_sld1, src7, src8_sld1, src8, src7_r, src8_r);
676 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
677 HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
678 HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
680 sum0 = add0 + add1 + 1;
681 sum1 = add1 + add2 + 1;
682 sum2 = add2 + add3 + 1;
683 sum3 = add3 + add4 + 1;
684 sum4 = add4 + add5 + 1;
685 sum5 = add5 + add6 + 1;
686 sum6 = add6 + add7 + 1;
687 sum7 = add7 + add8 + 1;
689 SRA_4V(sum0, sum1, sum2, sum3, 2);
690 SRA_4V(sum4, sum5, sum6, sum7, 2);
691 PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
692 ST8x4_UB(out0, out1, dst, dst_stride);
693 PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
694 ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
697 static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
698 uint8_t *dst, int32_t dst_stride)
700 v16i8 src0, src1, src2, src3, src4;
701 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
702 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r;
703 v8u16 add0, add1, add2, add3, add4;
704 v8u16 sum0, sum1, sum2, sum3;
707 LD_SB4(src, src_stride, src0, src1, src2, src3);
708 src += (4 * src_stride);
711 SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
712 SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
713 ILVR_B3_UH(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
715 ILVR_B2_UH(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
716 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
717 HADD_UB2_UH(src3_r, src4_r, add3, add4);
719 sum0 = add0 + add1 + 1;
720 sum1 = add1 + add2 + 1;
721 sum2 = add2 + add3 + 1;
722 sum3 = add3 + add4 + 1;
724 SRA_4V(sum0, sum1, sum2, sum3, 2);
725 PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
726 ST8x4_UB(out0, out1, dst, dst_stride);
729 static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
731 uint8_t *dst, int32_t dst_stride)
733 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
734 v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
735 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
736 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
737 v8u16 src7_l, src8_l;
738 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
739 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
741 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
742 LD_UB8((src + 1), src_stride,
743 src9, src10, src11, src12, src13, src14, src15, src16);
744 src += (8 * src_stride);
746 src17 = LD_UB(src + 1);
748 ILVRL_B2_UH(src9, src0, src0_r, src0_l);
749 ILVRL_B2_UH(src10, src1, src1_r, src1_l);
750 ILVRL_B2_UH(src11, src2, src2_r, src2_l);
751 ILVRL_B2_UH(src12, src3, src3_r, src3_l);
752 ILVRL_B2_UH(src13, src4, src4_r, src4_l);
753 ILVRL_B2_UH(src14, src5, src5_r, src5_l);
754 ILVRL_B2_UH(src15, src6, src6_r, src6_l);
755 ILVRL_B2_UH(src16, src7, src7_r, src7_l);
756 ILVRL_B2_UH(src17, src8, src8_r, src8_l);
758 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
759 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
760 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
761 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
762 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
763 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
765 sum0_r = src0_r + src1_r + 1;
766 sum1_r = src1_r + src2_r + 1;
767 sum2_r = src2_r + src3_r + 1;
768 sum3_r = src3_r + src4_r + 1;
769 sum4_r = src4_r + src5_r + 1;
770 sum5_r = src5_r + src6_r + 1;
771 sum6_r = src6_r + src7_r + 1;
772 sum7_r = src7_r + src8_r + 1;
773 sum0_l = src0_l + src1_l + 1;
774 sum1_l = src1_l + src2_l + 1;
775 sum2_l = src2_l + src3_l + 1;
776 sum3_l = src3_l + src4_l + 1;
777 sum4_l = src4_l + src5_l + 1;
778 sum5_l = src5_l + src6_l + 1;
779 sum6_l = src6_l + src7_l + 1;
780 sum7_l = src7_l + src8_l + 1;
782 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
783 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
784 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
785 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
786 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
787 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
788 dst += (4 * dst_stride);
790 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
791 LD_UB8((src + 1), src_stride,
792 src9, src10, src11, src12, src13, src14, src15, src16);
793 src += (8 * src_stride);
795 src17 = LD_UB(src + 1);
797 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
798 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
799 dst += (4 * dst_stride);
801 ILVRL_B2_UH(src9, src0, src0_r, src0_l);
802 ILVRL_B2_UH(src10, src1, src1_r, src1_l);
803 ILVRL_B2_UH(src11, src2, src2_r, src2_l);
804 ILVRL_B2_UH(src12, src3, src3_r, src3_l);
805 ILVRL_B2_UH(src13, src4, src4_r, src4_l);
806 ILVRL_B2_UH(src14, src5, src5_r, src5_l);
807 ILVRL_B2_UH(src15, src6, src6_r, src6_l);
808 ILVRL_B2_UH(src16, src7, src7_r, src7_l);
809 ILVRL_B2_UH(src17, src8, src8_r, src8_l);
811 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
812 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
813 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
814 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
815 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
816 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
818 sum0_r = src0_r + src1_r + 1;
819 sum1_r = src1_r + src2_r + 1;
820 sum2_r = src2_r + src3_r + 1;
821 sum3_r = src3_r + src4_r + 1;
822 sum4_r = src4_r + src5_r + 1;
823 sum5_r = src5_r + src6_r + 1;
824 sum6_r = src6_r + src7_r + 1;
825 sum7_r = src7_r + src8_r + 1;
826 sum0_l = src0_l + src1_l + 1;
827 sum1_l = src1_l + src2_l + 1;
828 sum2_l = src2_l + src3_l + 1;
829 sum3_l = src3_l + src4_l + 1;
830 sum4_l = src4_l + src5_l + 1;
831 sum5_l = src5_l + src6_l + 1;
832 sum6_l = src6_l + src7_l + 1;
833 sum7_l = src7_l + src8_l + 1;
835 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
836 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
837 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
838 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
839 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
840 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
841 dst += (4 * dst_stride);
842 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
843 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
846 static void common_hv_bil_no_rnd_8x16_msa(const uint8_t *src,
848 uint8_t *dst, int32_t dst_stride)
850 v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9;
851 v16u8 src10, src11, src12, src13, src14, src15, src16, src17;
852 v8u16 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
853 v8u16 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
854 v8u16 src7_l, src8_l;
855 v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
856 v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
858 LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
859 LD_UB8((src + 1), src_stride,
860 src9, src10, src11, src12, src13, src14, src15, src16);
861 src += (8 * src_stride);
863 src17 = LD_UB(src + 1);
865 ILVRL_B2_UH(src9, src0, src0_r, src0_l);
866 ILVRL_B2_UH(src10, src1, src1_r, src1_l);
867 ILVRL_B2_UH(src11, src2, src2_r, src2_l);
868 ILVRL_B2_UH(src12, src3, src3_r, src3_l);
869 ILVRL_B2_UH(src13, src4, src4_r, src4_l);
870 ILVRL_B2_UH(src14, src5, src5_r, src5_l);
871 ILVRL_B2_UH(src15, src6, src6_r, src6_l);
872 ILVRL_B2_UH(src16, src7, src7_r, src7_l);
873 ILVRL_B2_UH(src17, src8, src8_r, src8_l);
875 HADD_UB3_UH(src0_r, src1_r, src2_r, src0_r, src1_r, src2_r);
876 HADD_UB3_UH(src3_r, src4_r, src5_r, src3_r, src4_r, src5_r);
877 HADD_UB3_UH(src6_r, src7_r, src8_r, src6_r, src7_r, src8_r);
878 HADD_UB3_UH(src0_l, src1_l, src2_l, src0_l, src1_l, src2_l);
879 HADD_UB3_UH(src3_l, src4_l, src5_l, src3_l, src4_l, src5_l);
880 HADD_UB3_UH(src6_l, src7_l, src8_l, src6_l, src7_l, src8_l);
882 sum0_r = src0_r + src1_r + 1;
883 sum1_r = src1_r + src2_r + 1;
884 sum2_r = src2_r + src3_r + 1;
885 sum3_r = src3_r + src4_r + 1;
886 sum4_r = src4_r + src5_r + 1;
887 sum5_r = src5_r + src6_r + 1;
888 sum6_r = src6_r + src7_r + 1;
889 sum7_r = src7_r + src8_r + 1;
890 sum0_l = src0_l + src1_l + 1;
891 sum1_l = src1_l + src2_l + 1;
892 sum2_l = src2_l + src3_l + 1;
893 sum3_l = src3_l + src4_l + 1;
894 sum4_l = src4_l + src5_l + 1;
895 sum5_l = src5_l + src6_l + 1;
896 sum6_l = src6_l + src7_l + 1;
897 sum7_l = src7_l + src8_l + 1;
899 SRA_4V(sum0_r, sum1_r, sum2_r, sum3_r, 2);
900 SRA_4V(sum4_r, sum5_r, sum6_r, sum7_r, 2);
901 SRA_4V(sum0_l, sum1_l, sum2_l, sum3_l, 2);
902 SRA_4V(sum4_l, sum5_l, sum6_l, sum7_l, 2);
903 PCKEV_ST_SB4(sum0_l, sum0_r, sum1_l, sum1_r,
904 sum2_l, sum2_r, sum3_l, sum3_r, dst, dst_stride);
905 dst += (4 * dst_stride);
906 PCKEV_ST_SB4(sum4_l, sum4_r, sum5_l, sum5_r,
907 sum6_l, sum6_r, sum7_l, sum7_r, dst, dst_stride);
910 static void common_hv_bil_and_aver_dst_4w_msa(const uint8_t *src,
912 uint8_t *dst, int32_t dst_stride,
917 v16i8 src0, src1, src2, src0_sld1, src1_sld1, src2_sld1;
918 v16u8 src0_r, src1_r, src2_r;
919 v8u16 add0, add1, add2, sum0, sum1;
920 v16u8 dst0, dst1, res0, res1;
925 for (loop_cnt = (height >> 1); loop_cnt--;) {
926 LD_SB2(src, src_stride, src1, src2);
927 src += (2 * src_stride);
929 LD_UB2(dst, dst_stride, dst0, dst1);
930 SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
931 ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
933 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
934 ADD2(add0, add1, add1, add2, sum0, sum1);
935 SRARI_H2_UH(sum0, sum1, 2);
936 PCKEV_B2_UB(sum0, sum0, sum1, sum1, res0, res1);
937 AVER_UB2_UB(dst0, res0, dst1, res1, res0, res1);
939 out0 = __msa_copy_u_w((v4i32) res0, 0);
940 out1 = __msa_copy_u_w((v4i32) res1, 0);
950 static void common_hv_bil_and_aver_dst_8w_msa(const uint8_t *src,
952 uint8_t *dst, int32_t dst_stride,
956 v16i8 src0, src1, src2, src3, src4;
957 v16i8 src0_sld1, src1_sld1, src2_sld1, src3_sld1, src4_sld1;
958 v16u8 dst0, dst1, dst2, dst3;
959 v16u8 src0_r, src1_r, src2_r, src3_r, src4_r;
960 v8u16 add0, add1, add2, add3, add4;
961 v8u16 sum0, sum1, sum2, sum3;
966 for (loop_cnt = (height >> 2); loop_cnt--;) {
967 LD_SB4(src, src_stride, src1, src2, src3, src4);
968 src += (4 * src_stride);
970 LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
971 SLDI_B3_0_SB(src0, src1, src2, src0_sld1, src1_sld1, src2_sld1, 1);
972 SLDI_B2_0_SB(src3, src4, src3_sld1, src4_sld1, 1);
973 ILVR_B3_UB(src0_sld1, src0, src1_sld1, src1, src2_sld1, src2, src0_r,
975 ILVR_B2_UB(src3_sld1, src3, src4_sld1, src4, src3_r, src4_r);
976 HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
977 HADD_UB2_UH(src3_r, src4_r, add3, add4);
978 ADD4(add0, add1, add1, add2, add2, add3, add3, add4,
979 sum0, sum1, sum2, sum3);
980 SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
981 PCKEV_AVG_ST8x4_UB(sum0, dst0, sum1, dst1,
982 sum2, dst2, sum3, dst3, dst, dst_stride);
983 dst += (4 * dst_stride);
/* 16-pixel-wide horizontal+vertical (xy2 half-pel) bilinear interpolation,
 * averaged into dst.  Each output pixel combines a 2x2 source neighbourhood
 * (rounded via SRARI by 2 bits, i.e. presumably (a+b+c+d+2)>>2 — macro defined
 * in generic_macros_msa.h, TODO confirm) and is then averaged with the
 * existing dst pixel (aver_u_b inside PCKEV_AVG_ST_UB).
 * Processes 8 rows per iteration; height is assumed to be a multiple of 8.
 * NOTE(review): some signature/continuation lines are elided in this excerpt. */
988 static void common_hv_bil_and_aver_dst_16w_msa(const uint8_t *src,
990                                                uint8_t *dst, int32_t dst_stride,
994     v16u8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
995     v16u8 src11, src12, src13, src14, src15, src16, src17;
996     v16u8 src0_r, src1_r, src2_r, src3_r, src4_r, src5_r, src6_r, src7_r;
997     v16u8 src8_r, src0_l, src1_l, src2_l, src3_l, src4_l, src5_l, src6_l;
998     v16u8 src7_l, src8_l;
999     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1000     v8u16 sum0_r, sum1_r, sum2_r, sum3_r, sum4_r, sum5_r, sum6_r, sum7_r;
1001     v8u16 sum0_l, sum1_l, sum2_l, sum3_l, sum4_l, sum5_l, sum6_l, sum7_l;
1002     v8u16 add0, add1, add2, add3, add4, add5, add6, add7, add8;
1004     for (loop_cnt = (height >> 3); loop_cnt--;) {
/* Load 8 rows at src and the same 8 rows shifted one pixel right (src + 1). */
1005         LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1006         LD_UB8((src + 1), src_stride,
1007                src9, src10, src11, src12, src13, src14, src15, src16);
1008         src += (8 * src_stride);
/* 9th row pair provides the bottom edge of the last vertical average.
 * NOTE(review): the matching src8 load line is elided in this excerpt. */
1011         src17 = LD_UB(src + 1);
/* Interleave each row with its right-shifted copy: _r = low 8 pixel pairs,
 * _l = high 8 pixel pairs of the 16-wide row. */
1013         ILVRL_B2_UB(src9, src0, src0_r, src0_l);
1014         ILVRL_B2_UB(src10, src1, src1_r, src1_l);
1015         ILVRL_B2_UB(src11, src2, src2_r, src2_l);
1016         ILVRL_B2_UB(src12, src3, src3_r, src3_l);
1017         ILVRL_B2_UB(src13, src4, src4_r, src4_l);
1018         ILVRL_B2_UB(src14, src5, src5_r, src5_l);
1019         ILVRL_B2_UB(src15, src6, src6_r, src6_l);
1020         ILVRL_B2_UB(src16, src7, src7_r, src7_l);
1021         ILVRL_B2_UB(src17, src8, src8_r, src8_l);
/* Horizontal add of interleaved byte pairs -> per-pixel horizontal sums
 * as halfwords (HADD_UB*_UH from generic_macros_msa.h). */
1022         HADD_UB3_UH(src0_r, src1_r, src2_r, add0, add1, add2);
1023         HADD_UB3_UH(src3_r, src4_r, src5_r, add3, add4, add5);
1024         HADD_UB3_UH(src6_r, src7_r, src8_r, add6, add7, add8);
/* Vertical add of adjacent rows -> full 2x2 sums.
 * NOTE(review): the sumN_r/sumN_l continuation lines are elided here. */
1025         ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_r, sum1_r,
1027         ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_r, sum5_r,
1029         HADD_UB3_UH(src0_l, src1_l, src2_l, add0, add1, add2);
1030         HADD_UB3_UH(src3_l, src4_l, src5_l, add3, add4, add5);
1031         HADD_UB3_UH(src6_l, src7_l, src8_l, add6, add7, add8);
1032         ADD4(add0, add1, add1, add2, add2, add3, add3, add4, sum0_l, sum1_l,
1034         ADD4(add4, add5, add5, add6, add6, add7, add7, add8, sum4_l, sum5_l,
/* Round each 2x2 sum: shift-right-arithmetic-rounded-immediate by 2. */
1036         SRARI_H4_UH(sum0_r, sum1_r, sum2_r, sum3_r, 2);
1037         SRARI_H4_UH(sum4_r, sum5_r, sum6_r, sum7_r, 2);
1038         SRARI_H4_UH(sum0_l, sum1_l, sum2_l, sum3_l, 2);
1039         SRARI_H4_UH(sum4_l, sum5_l, sum6_l, sum7_l, 2);
/* Load 8 destination rows, pack interpolated halfwords back to bytes,
 * average with dst, and store (dst advance lines elided in excerpt). */
1040         LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1041         PCKEV_AVG_ST_UB(sum0_l, sum0_r, dst0, dst);
1043         PCKEV_AVG_ST_UB(sum1_l, sum1_r, dst1, dst);
1045         PCKEV_AVG_ST_UB(sum2_l, sum2_r, dst2, dst);
1047         PCKEV_AVG_ST_UB(sum3_l, sum3_r, dst3, dst);
1049         PCKEV_AVG_ST_UB(sum4_l, sum4_r, dst4, dst);
1051         PCKEV_AVG_ST_UB(sum5_l, sum5_r, dst5, dst);
1053         PCKEV_AVG_ST_UB(sum6_l, sum6_r, dst6, dst);
1055         PCKEV_AVG_ST_UB(sum7_l, sum7_r, dst7, dst);
/* Plain copy of an 8-pixel-wide block, src -> dst.  Rows are loaded as MSA
 * vectors, the low 64 bits extracted with __msa_copy_u_d, and stored with
 * scalar 64-bit stores (SD4).  Specialized unrolled paths for height
 * divisible by 12, 8, 4, and 2, in that order of preference. */
1060 static void copy_width8_msa(const uint8_t *src, int32_t src_stride,
1061                             uint8_t *dst, int32_t dst_stride,
1065     uint64_t out0, out1, out2, out3, out4, out5, out6, out7;
1066     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1068     if (0 == height % 12) {
/* 12-row path: 8 rows per vector batch plus a 4-row tail each iteration. */
1069         for (cnt = (height / 12); cnt--;) {
1070             LD_UB8(src, src_stride,
1071                    src0, src1, src2, src3, src4, src5, src6, src7);
1072             src += (8 * src_stride);
/* Extract the low doubleword (the 8 meaningful pixels) of each row. */
1074             out0 = __msa_copy_u_d((v2i64) src0, 0);
1075             out1 = __msa_copy_u_d((v2i64) src1, 0);
1076             out2 = __msa_copy_u_d((v2i64) src2, 0);
1077             out3 = __msa_copy_u_d((v2i64) src3, 0);
1078             out4 = __msa_copy_u_d((v2i64) src4, 0);
1079             out5 = __msa_copy_u_d((v2i64) src5, 0);
1080             out6 = __msa_copy_u_d((v2i64) src6, 0);
1081             out7 = __msa_copy_u_d((v2i64) src7, 0);
1083             SD4(out0, out1, out2, out3, dst, dst_stride);
1084             dst += (4 * dst_stride);
1085             SD4(out4, out5, out6, out7, dst, dst_stride);
1086             dst += (4 * dst_stride);
/* Remaining 4 rows of the 12-row group. */
1088             LD_UB4(src, src_stride, src0, src1, src2, src3);
1089             src += (4 * src_stride);
1091             out0 = __msa_copy_u_d((v2i64) src0, 0);
1092             out1 = __msa_copy_u_d((v2i64) src1, 0);
1093             out2 = __msa_copy_u_d((v2i64) src2, 0);
1094             out3 = __msa_copy_u_d((v2i64) src3, 0);
1096             SD4(out0, out1, out2, out3, dst, dst_stride);
1097             dst += (4 * dst_stride);
1099     } else if (0 == height % 8) {
/* 8-row path. */
1100         for (cnt = height >> 3; cnt--;) {
1101             LD_UB8(src, src_stride,
1102                    src0, src1, src2, src3, src4, src5, src6, src7);
1103             src += (8 * src_stride);
1105             out0 = __msa_copy_u_d((v2i64) src0, 0);
1106             out1 = __msa_copy_u_d((v2i64) src1, 0);
1107             out2 = __msa_copy_u_d((v2i64) src2, 0);
1108             out3 = __msa_copy_u_d((v2i64) src3, 0);
1109             out4 = __msa_copy_u_d((v2i64) src4, 0);
1110             out5 = __msa_copy_u_d((v2i64) src5, 0);
1111             out6 = __msa_copy_u_d((v2i64) src6, 0);
1112             out7 = __msa_copy_u_d((v2i64) src7, 0);
1114             SD4(out0, out1, out2, out3, dst, dst_stride);
1115             dst += (4 * dst_stride);
1116             SD4(out4, out5, out6, out7, dst, dst_stride);
1117             dst += (4 * dst_stride);
1119     } else if (0 == height % 4) {
/* 4-row path. */
1120         for (cnt = (height / 4); cnt--;) {
1121             LD_UB4(src, src_stride, src0, src1, src2, src3);
1122             src += (4 * src_stride);
1123             out0 = __msa_copy_u_d((v2i64) src0, 0);
1124             out1 = __msa_copy_u_d((v2i64) src1, 0);
1125             out2 = __msa_copy_u_d((v2i64) src2, 0);
1126             out3 = __msa_copy_u_d((v2i64) src3, 0);
1128             SD4(out0, out1, out2, out3, dst, dst_stride);
1129             dst += (4 * dst_stride);
1131     } else if (0 == height % 2) {
/* 2-row path.  NOTE(review): the trailing SD store / dst-advance lines of
 * this branch are elided in this excerpt. */
1132         for (cnt = (height / 2); cnt--;) {
1133             LD_UB2(src, src_stride, src0, src1);
1134             src += (2 * src_stride);
1135             out0 = __msa_copy_u_d((v2i64) src0, 0);
1136             out1 = __msa_copy_u_d((v2i64) src1, 0);
/* Plain copy for blocks whose width is a multiple of 16 and whose height is a
 * multiple of 8.  Iterates over 16-pixel columns; for each column, copies 8
 * full vector rows per inner iteration.  NOTE(review): the per-column
 * src_tmp/dst_tmp setup and column-advance lines are elided in this excerpt. */
1146 static void copy_16multx8mult_msa(const uint8_t *src, int32_t src_stride,
1147                                   uint8_t *dst, int32_t dst_stride,
1148                                   int32_t height, int32_t width)
1150     int32_t cnt, loop_cnt;
1151     const uint8_t *src_tmp;
1153     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1155     for (cnt = (width >> 4); cnt--;) {
1159         for (loop_cnt = (height >> 3); loop_cnt--;) {
1160             LD_UB8(src_tmp, src_stride,
1161                    src0, src1, src2, src3, src4, src5, src6, src7);
1162             src_tmp += (8 * src_stride);
1164             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
1165                    dst_tmp, dst_stride);
1166             dst_tmp += (8 * dst_stride);
/* Plain copy of a 16-pixel-wide block, src -> dst, one full MSA vector per
 * row.  Unrolled paths for height divisible by 12, 8 (delegates to
 * copy_16multx8mult_msa), and 4. */
1174 static void copy_width16_msa(const uint8_t *src, int32_t src_stride,
1175                              uint8_t *dst, int32_t dst_stride,
1179     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1181     if (0 == height % 12) {
/* 12-row path: 8 rows then a 4-row tail per iteration. */
1182         for (cnt = (height / 12); cnt--;) {
1183             LD_UB8(src, src_stride,
1184                    src0, src1, src2, src3, src4, src5, src6, src7);
1185             src += (8 * src_stride);
1186             ST_UB8(src0, src1, src2, src3, src4, src5, src6, src7,
1188             dst += (8 * dst_stride);
1190             LD_UB4(src, src_stride, src0, src1, src2, src3);
1191             src += (4 * src_stride);
1192             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
1193             dst += (4 * dst_stride);
1195     } else if (0 == height % 8) {
1196         copy_16multx8mult_msa(src, src_stride, dst, dst_stride, height, 16);
1197     } else if (0 == height % 4) {
/* 4-row path. */
1198         for (cnt = (height >> 2); cnt--;) {
1199             LD_UB4(src, src_stride, src0, src1, src2, src3);
1200             src += (4 * src_stride);
1202             ST_UB4(src0, src1, src2, src3, dst, dst_stride);
1203             dst += (4 * dst_stride);
/* 4-pixel-wide averaging store: dst = rounded average of src and dst
 * (AVER_UB* wraps the MSA aver_u.b rounded-average instruction — defined in
 * generic_macros_msa.h).  Results are written with scalar 32-bit stores
 * (SW4), so only 4 pixels per row are touched.  Unrolled paths for height
 * divisible by 4 and by 2. */
1208 static void avg_width4_msa(const uint8_t *src, int32_t src_stride,
1209                            uint8_t *dst, int32_t dst_stride,
1213     uint32_t out0, out1, out2, out3;
1214     v16u8 src0, src1, src2, src3;
1215     v16u8 dst0, dst1, dst2, dst3;
1217     if (0 == (height % 4)) {
1218         for (cnt = (height / 4); cnt--;) {
1219             LD_UB4(src, src_stride, src0, src1, src2, src3);
1220             src += (4 * src_stride);
1222             LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1224             AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1225                         dst0, dst1, dst2, dst3);
/* Extract the low word (4 meaningful pixels) of each averaged row. */
1227             out0 = __msa_copy_u_w((v4i32) dst0, 0);
1228             out1 = __msa_copy_u_w((v4i32) dst1, 0);
1229             out2 = __msa_copy_u_w((v4i32) dst2, 0);
1230             out3 = __msa_copy_u_w((v4i32) dst3, 0);
1231             SW4(out0, out1, out2, out3, dst, dst_stride);
1232             dst += (4 * dst_stride);
1234     } else if (0 == (height % 2)) {
/* 2-row path.  NOTE(review): the trailing SW store / dst-advance lines of
 * this branch are elided in this excerpt. */
1235         for (cnt = (height / 2); cnt--;) {
1236             LD_UB2(src, src_stride, src0, src1);
1237             src += (2 * src_stride);
1239             LD_UB2(dst, dst_stride, dst0, dst1);
1241             AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
1243             out0 = __msa_copy_u_w((v4i32) dst0, 0);
1244             out1 = __msa_copy_u_w((v4i32) dst1, 0);
/* 8-pixel-wide averaging store: dst = rounded average of src and dst.
 * Processes 4 rows per iteration (height presumably a multiple of 4 —
 * TODO confirm against callers); writes via scalar 64-bit stores (SD4) so
 * only 8 pixels per row are touched. */
1253 static void avg_width8_msa(const uint8_t *src, int32_t src_stride,
1254                            uint8_t *dst, int32_t dst_stride,
1258     uint64_t out0, out1, out2, out3;
1259     v16u8 src0, src1, src2, src3;
1260     v16u8 dst0, dst1, dst2, dst3;
1262     for (cnt = (height / 4); cnt--;) {
1263         LD_UB4(src, src_stride, src0, src1, src2, src3);
1264         src += (4 * src_stride);
1265         LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
1267         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1268                     dst0, dst1, dst2, dst3);
/* Extract the low doubleword (8 meaningful pixels) of each averaged row. */
1270         out0 = __msa_copy_u_d((v2i64) dst0, 0);
1271         out1 = __msa_copy_u_d((v2i64) dst1, 0);
1272         out2 = __msa_copy_u_d((v2i64) dst2, 0);
1273         out3 = __msa_copy_u_d((v2i64) dst3, 0);
1274         SD4(out0, out1, out2, out3, dst, dst_stride);
1275         dst += (4 * dst_stride);
/* 16-pixel-wide averaging store: dst = rounded average of src and dst, one
 * full MSA vector per row.  Processes 8 rows per iteration (height
 * presumably a multiple of 8 — TODO confirm against callers). */
1279 static void avg_width16_msa(const uint8_t *src, int32_t src_stride,
1280                             uint8_t *dst, int32_t dst_stride,
1284     v16u8 src0, src1, src2, src3, src4, src5, src6, src7;
1285     v16u8 dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7;
1287     for (cnt = (height / 8); cnt--;) {
1288         LD_UB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
1289         src += (8 * src_stride);
1290         LD_UB8(dst, dst_stride, dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7);
1292         AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3,
1293                     dst0, dst1, dst2, dst3);
1294         AVER_UB4_UB(src4, dst4, src5, dst5, src6, dst6, src7, dst7,
1295                     dst4, dst5, dst6, dst7);
1296         ST_UB8(dst0, dst1, dst2, dst3, dst4, dst5, dst6, dst7, dst, dst_stride);
1297         dst += (8 * dst_stride);
/* hpeldsp put_pixels 16xh: plain copy, block = pixels. */
1301 void ff_put_pixels16_msa(uint8_t *block, const uint8_t *pixels,
1302                          ptrdiff_t line_size, int h)
1304     copy_width16_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 16xh, horizontal (x2) half-pel bilinear interpolation. */
1307 void ff_put_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
1308                             ptrdiff_t line_size, int h)
1310     common_hz_bil_16w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 16xh, vertical (y2) half-pel bilinear interpolation. */
1313 void ff_put_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
1314                             ptrdiff_t line_size, int h)
1316     common_vt_bil_16w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 16xh, diagonal (xy2) half-pel bilinear interpolation. */
1319 void ff_put_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
1320                              ptrdiff_t line_size, int h)
1322     common_hv_bil_16w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put_pixels 8xh: plain copy, block = pixels. */
1325 void ff_put_pixels8_msa(uint8_t *block, const uint8_t *pixels,
1326                         ptrdiff_t line_size, int h)
1328     copy_width8_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 8xh, horizontal (x2) half-pel bilinear interpolation. */
1331 void ff_put_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
1332                            ptrdiff_t line_size, int h)
1334     common_hz_bil_8w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 8xh, vertical (y2) half-pel bilinear interpolation. */
1337 void ff_put_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
1338                            ptrdiff_t line_size, int h)
1340     common_vt_bil_8w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 8xh, diagonal (xy2) half-pel bilinear interpolation. */
1343 void ff_put_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
1344                             ptrdiff_t line_size, int h)
1346     common_hv_bil_8w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 4xh, horizontal (x2) half-pel bilinear interpolation. */
1349 void ff_put_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
1350                            ptrdiff_t line_size, int h)
1352     common_hz_bil_4w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 4xh, vertical (y2) half-pel bilinear interpolation. */
1355 void ff_put_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
1356                            ptrdiff_t line_size, int h)
1358     common_vt_bil_4w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put 4xh, diagonal (xy2) half-pel bilinear interpolation. */
1361 void ff_put_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
1362                             ptrdiff_t line_size, int h)
1364     common_hv_bil_4w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp put_no_rnd 16xh, horizontal (x2) half-pel (truncating average).
 * Only h == 16 and h == 8 are dispatched; other heights presumably do
 * nothing (the `if (h == 16) {` line is elided in this excerpt — verify). */
1367 void ff_put_no_rnd_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
1368                                    ptrdiff_t line_size, int h)
1371         common_hz_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
1372     } else if (h == 8) {
1373         common_hz_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
/* hpeldsp put_no_rnd 16xh, vertical (y2) half-pel (truncating average).
 * Dispatches only h == 16 and h == 8 (leading `if` line elided here). */
1377 void ff_put_no_rnd_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
1378                                    ptrdiff_t line_size, int h)
1381         common_vt_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
1382     } else if (h == 8) {
1383         common_vt_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
/* hpeldsp put_no_rnd 16xh, diagonal (xy2) half-pel (truncating average).
 * Dispatches only h == 16 and h == 8 (leading `if` line elided here). */
1387 void ff_put_no_rnd_pixels16_xy2_msa(uint8_t *block,
1388                                     const uint8_t *pixels,
1389                                     ptrdiff_t line_size, int h)
1392         common_hv_bil_no_rnd_16x16_msa(pixels, line_size, block, line_size);
1393     } else if (h == 8) {
1394         common_hv_bil_no_rnd_8x16_msa(pixels, line_size, block, line_size);
/* hpeldsp put_no_rnd 8xh, horizontal (x2) half-pel (truncating average).
 * Dispatches only h == 8 and h == 4 (leading `if` line elided here). */
1398 void ff_put_no_rnd_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
1399                                   ptrdiff_t line_size, int h)
1402         common_hz_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
1403     } else if (h == 4) {
1404         common_hz_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
/* hpeldsp put_no_rnd 8xh, vertical (y2) half-pel (truncating average).
 * Dispatches only h == 8 and h == 4 (leading `if` line elided here). */
1408 void ff_put_no_rnd_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
1409                                   ptrdiff_t line_size, int h)
1412         common_vt_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
1413     } else if (h == 4) {
1414         common_vt_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
/* hpeldsp put_no_rnd 8xh, diagonal (xy2) half-pel (truncating average).
 * Dispatches only h == 8 and h == 4 (leading `if` line elided here). */
1418 void ff_put_no_rnd_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
1419                                    ptrdiff_t line_size, int h)
1422         common_hv_bil_no_rnd_8x8_msa(pixels, line_size, block, line_size);
1423     } else if (h == 4) {
1424         common_hv_bil_no_rnd_4x8_msa(pixels, line_size, block, line_size);
/* hpeldsp avg_pixels 16xh: block = rounded average of pixels and block. */
1428 void ff_avg_pixels16_msa(uint8_t *block, const uint8_t *pixels,
1429                          ptrdiff_t line_size, int h)
1431     avg_width16_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 16xh, horizontal (x2) half-pel, averaged into block. */
1434 void ff_avg_pixels16_x2_msa(uint8_t *block, const uint8_t *pixels,
1435                             ptrdiff_t line_size, int h)
1437     common_hz_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 16xh, vertical (y2) half-pel, averaged into block. */
1440 void ff_avg_pixels16_y2_msa(uint8_t *block, const uint8_t *pixels,
1441                             ptrdiff_t line_size, int h)
1443     common_vt_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 16xh, diagonal (xy2) half-pel, averaged into block. */
1446 void ff_avg_pixels16_xy2_msa(uint8_t *block, const uint8_t *pixels,
1447                              ptrdiff_t line_size, int h)
1449     common_hv_bil_and_aver_dst_16w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg_pixels 8xh: block = rounded average of pixels and block. */
1452 void ff_avg_pixels8_msa(uint8_t *block, const uint8_t *pixels,
1453                         ptrdiff_t line_size, int h)
1455     avg_width8_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 8xh, horizontal (x2) half-pel, averaged into block. */
1458 void ff_avg_pixels8_x2_msa(uint8_t *block, const uint8_t *pixels,
1459                            ptrdiff_t line_size, int h)
1461     common_hz_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 8xh, vertical (y2) half-pel, averaged into block. */
1464 void ff_avg_pixels8_y2_msa(uint8_t *block, const uint8_t *pixels,
1465                            ptrdiff_t line_size, int h)
1467     common_vt_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 8xh, diagonal (xy2) half-pel, averaged into block. */
1470 void ff_avg_pixels8_xy2_msa(uint8_t *block, const uint8_t *pixels,
1471                             ptrdiff_t line_size, int h)
1473     common_hv_bil_and_aver_dst_8w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg_pixels 4xh: block = rounded average of pixels and block. */
1476 void ff_avg_pixels4_msa(uint8_t *block, const uint8_t *pixels,
1477                         ptrdiff_t line_size, int h)
1479     avg_width4_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 4xh, horizontal (x2) half-pel, averaged into block. */
1482 void ff_avg_pixels4_x2_msa(uint8_t *block, const uint8_t *pixels,
1483                            ptrdiff_t line_size, int h)
1485     common_hz_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 4xh, vertical (y2) half-pel, averaged into block. */
1488 void ff_avg_pixels4_y2_msa(uint8_t *block, const uint8_t *pixels,
1489                            ptrdiff_t line_size, int h)
1491     common_vt_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);
/* hpeldsp avg 4xh, diagonal (xy2) half-pel, averaged into block. */
1494 void ff_avg_pixels4_xy2_msa(uint8_t *block, const uint8_t *pixels,
1495                             ptrdiff_t line_size, int h)
1497     common_hv_bil_and_aver_dst_4w_msa(pixels, line_size, block, line_size, h);