X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fmips%2Fqpeldsp_msa.c;h=c7675f112ec0f418066358fdb69b382a83a5ab78;hb=682b0e42e4a09d2d745293385a557fb5f316cb95;hp=4710b3f7329225d653b1ec428e714a83ec21648d;hpb=f05f210526a3dc2d9fa6b1c228e3907ebd1d43c6;p=ffmpeg diff --git a/libavcodec/mips/qpeldsp_msa.c b/libavcodec/mips/qpeldsp_msa.c index 4710b3f7329..c7675f112ec 100644 --- a/libavcodec/mips/qpeldsp_msa.c +++ b/libavcodec/mips/qpeldsp_msa.c @@ -96,7 +96,7 @@ DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \ res0_r = (v8i16) (sum0_r - sum3_r); \ res0_r = __msa_srari_h(res0_r, 5); \ - res0_r = CLIP_SH_0_255(res0_r); \ + CLIP_SH_0_255(res0_r); \ out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ \ out; \ @@ -118,7 +118,7 @@ res0_r = (v8i16) (sum0_r - sum3_r); \ res0_r += 15; \ res0_r >>= 5; \ - res0_r = CLIP_SH_0_255(res0_r); \ + CLIP_SH_0_255(res0_r); \ out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \ \ out; \ @@ -334,7 +334,7 @@ static void horiz_mc_qpel_aver_src0_8width_msa(const uint8_t *src, inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -409,7 +409,7 @@ static void horiz_mc_qpel_8width_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -480,12 +480,12 @@ static void horiz_mc_qpel_aver_src1_8width_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1, + inp0, inp1, inp2, inp3); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -564,7 +564,7 @@ static void horiz_mc_qpel_no_rnd_aver_src0_8width_msa(const uint8_t *src, inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); res0 = __msa_ave_u_b(inp0, res0); res1 = __msa_ave_u_b(inp2, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -639,7 +639,7 @@ static void horiz_mc_qpel_no_rnd_8width_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -710,13 +710,13 @@ static void horiz_mc_qpel_no_rnd_aver_src1_8width_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1, + inp0, inp1, inp2, inp3); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); res0 = __msa_ave_u_b(inp0, res0); res1 = __msa_ave_u_b(inp2, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -799,7 +799,7 @@ static void horiz_mc_qpel_avg_dst_aver_src0_8width_msa(const uint8_t *src, dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -876,7 +876,7 @@ static void horiz_mc_qpel_avg_dst_8width_msa(const uint8_t *src, dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -948,15 +948,15 @@ static void horiz_mc_qpel_avg_dst_aver_src1_8width_msa(const uint8_t *src, mask0, mask1, mask2, mask3, const20, const6, const3); LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1, + inp0, inp1, inp2, inp3); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -1031,8 +1031,7 @@ static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src, tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); inp8 = LD_UB(src); res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, @@ -1048,8 +1047,7 @@ static void vert_mc_qpel_aver_src0_8x8_msa(const uint8_t *src, tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5); tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); } static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src, @@ -1220,8 +1218,7 @@ static void vert_mc_qpel_8x8_msa(const uint8_t *src, inp3, inp2, inp1, inp0, inp4, inp5, inp6, inp7, const20, const6, const3); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); inp8 = LD_UB(src); res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, @@ -1234,8 +1231,7 @@ static void vert_mc_qpel_8x8_msa(const uint8_t *src, inp7, inp6, inp5, inp4, inp8, inp8, inp7, inp6, const20, const6, const3); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); } static void vert_mc_qpel_16x16_msa(const uint8_t *src, @@ -1405,8 +1401,7 @@ static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src, tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2); tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); inp8 = LD_UB(src); res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1, @@ -1422,7 +1417,7 @@ static void vert_mc_qpel_aver_src1_8x8_msa(const uint8_t *src, tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6); tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); } static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src, @@ -1607,8 +1602,7 @@ static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src, tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); res0 = __msa_ave_u_b(res0, tmp0); res1 = __msa_ave_u_b(res1, tmp1); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); inp8 = LD_UB(src); res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, @@ -1625,8 +1619,7 @@ static void vert_mc_qpel_no_rnd_aver_src0_8x8_msa(const uint8_t *src, tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7); res0 = __msa_ave_u_b(res0, tmp0); res1 = __msa_ave_u_b(res1, tmp1); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); } static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src, @@ -1806,8 +1799,7 @@ static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, inp3, inp2, inp1, inp0, inp4, inp5, inp6, inp7, const20, const6, const3); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); inp8 = LD_UB(src); res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, @@ -1820,8 +1812,7 @@ static void vert_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, inp7, inp6, inp5, inp4, inp8, inp8, inp7, inp6, const20, const6, const3); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); } static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, @@ -1988,8 +1979,7 @@ static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src, tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4); res0 = __msa_ave_u_b(res0, tmp0); res1 = __msa_ave_u_b(res1, tmp1); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); inp8 = LD_UB(src); res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1, @@ -2006,7 +1996,7 @@ static void vert_mc_qpel_no_rnd_aver_src1_8x8_msa(const uint8_t *src, tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8); res0 = __msa_ave_u_b(res0, tmp0); res1 = __msa_ave_u_b(res1, tmp1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); } static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src, @@ -2195,7 +2185,7 @@ static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src, dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); inp8 = LD_UB(src); @@ -2217,7 +2207,7 @@ static void vert_mc_qpel_avg_dst_aver_src0_8x8_msa(const uint8_t *src, dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src, @@ -2384,7 +2374,7 @@ static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); inp8 = LD_UB(src); @@ -2402,8 +2392,7 @@ static void vert_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1); dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, @@ -2566,7 +2555,7 @@ static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src, dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); inp8 = LD_UB(src); @@ -2587,7 +2576,7 @@ static void vert_mc_qpel_avg_dst_aver_src1_8x8_msa(const uint8_t *src, dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3); AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1); AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src, @@ -2832,7 +2821,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -2860,7 +2849,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); @@ -2870,12 +2859,12 @@ static void hv_mc_qpel_no_rnd_aver_hv_src00_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); res1 = __msa_ave_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src, @@ -2977,7 +2966,7 @@ static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -2999,7 +2988,7 @@ static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src, res1 = __msa_ave_u_b(avg1, res1); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, @@ -3009,7 +2998,7 @@ static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, @@ -3019,7 +3008,7 @@ static void hv_mc_qpel_no_rnd_aver_v_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); res1 = __msa_ave_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src, @@ -3105,7 +3094,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz0 = __msa_ave_u_b(inp0, res0); @@ -3115,7 +3104,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz2 = __msa_ave_u_b(inp2, res1); @@ -3125,7 +3114,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz4 = __msa_ave_u_b(inp0, res0); @@ -3137,7 +3126,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -3145,7 +3134,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz6 = __msa_ave_u_b(inp2, res1); @@ -3163,7 +3152,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2); res1 = __msa_ave_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, @@ -3173,7 +3162,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, @@ -3183,7 +3172,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src10_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); res1 = __msa_ave_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src, @@ -3246,7 +3235,7 @@ static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src, LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, @@ -3270,18 +3259,15 @@ static void hv_mc_qpel_no_rnd_aver_h_src0_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); - dst += 2 * dst_stride; - - ST8x2_UB(res0, dst, dst_stride); - dst += (2 * dst_stride); + ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, horiz8, horiz8, horiz7, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src, @@ -3337,7 +3323,7 @@ static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, const20, const6, const3); LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, @@ -3358,7 +3344,7 @@ static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; @@ -3367,9 +3353,7 @@ static void hv_mc_qpel_no_rnd_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src, @@ -3405,7 +3389,7 @@ static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz0 = __msa_ave_u_b(inp0, res0); @@ -3415,7 +3399,7 @@ static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz2 = __msa_ave_u_b(inp2, res1); @@ -3425,7 +3409,7 @@ static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz4 = __msa_ave_u_b(inp0, res0); @@ -3437,13 +3421,13 @@ static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz6 = __msa_ave_u_b(inp2, res1); @@ -3464,7 +3448,7 @@ static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, @@ -3472,9 +3456,7 @@ static void hv_mc_qpel_no_rnd_aver_h_src1_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src, @@ -3536,7 +3518,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -3564,7 +3546,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); @@ -3575,12 +3557,12 @@ static void hv_mc_qpel_no_rnd_aver_hv_src01_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); res1 = __msa_ave_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src, @@ -3638,7 +3620,7 @@ static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src, res0 = __msa_ave_u_b(avg0, res0); LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, @@ -3656,7 +3638,7 @@ static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src, horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, @@ -3671,12 +3653,9 @@ static void hv_mc_qpel_no_rnd_aver_v_src1_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); res1 = __msa_ave_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src, @@ -3712,7 +3691,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz0 = __msa_ave_u_b(inp0, res0); @@ -3722,7 +3701,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz2 = __msa_ave_u_b(inp2, res1); @@ -3733,7 +3712,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz4 = __msa_ave_u_b(inp0, res0); horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); @@ -3744,7 +3723,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1); res0 = __msa_ave_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -3752,7 +3731,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz6 = __msa_ave_u_b(inp2, res1); @@ -3764,7 +3743,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3); res1 = __msa_ave_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -3787,7 +3766,7 @@ static void hv_mc_qpel_no_rnd_aver_hv_src11_8x8_msa(const uint8_t *src, res0 = __msa_ave_u_b(avg0, res0); avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); res1 = __msa_ave_u_b(avg1, res1); - ST8x4_UB(res0, res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src, @@ -3893,7 +3872,7 @@ static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -3915,7 +3894,7 @@ static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, const20, const6, const3); horiz8 = __msa_aver_u_b(inp0, res0); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, @@ -3930,11 +3909,9 @@ static void hv_mc_qpel_aver_hv_src00_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src, @@ -4034,7 +4011,7 @@ static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -4059,7 +4036,7 @@ static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); @@ -4069,11 +4046,9 @@ static void hv_mc_qpel_aver_v_src0_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src, @@ -4159,12 +4134,12 @@ static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, const20, const6, const3); res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz0 = __msa_aver_u_b(inp0, res0); horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz2 = __msa_aver_u_b(inp2, res1); @@ -4175,12 +4150,12 @@ static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, const20, const6, const3); res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz4 = __msa_aver_u_b(inp0, res0); horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz6 = __msa_aver_u_b(inp2, res1); @@ -4197,7 +4172,7 @@ static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, horiz3, horiz2, horiz1, horiz0, horiz4, horiz5, horiz6, horiz7, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; inp0 = LD_UB(src); @@ -4212,7 +4187,7 @@ static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4); @@ -4222,12 +4197,9 @@ static void hv_mc_qpel_aver_hv_src10_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src, @@ -4285,7 +4257,7 @@ static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src, horiz1, horiz0, horiz0, horiz1, horiz2, horiz3, horiz4, horiz5, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -4309,7 +4281,7 @@ static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, @@ -4317,9 +4289,7 @@ static void hv_mc_qpel_aver_h_src0_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_16x16_msa(const uint8_t *src, @@ -4371,7 +4341,7 @@ static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride, horiz1, horiz0, horiz0, horiz1, horiz2, horiz3, horiz4, horiz5, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -4389,7 +4359,7 @@ static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride, horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, @@ -4402,9 +4372,7 @@ static void hv_mc_qpel_8x8_msa(const uint8_t *src, int32_t src_stride, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src, @@ -4442,12 +4410,12 @@ static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz0 = __msa_aver_u_b(inp0, res0); horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz2 = __msa_aver_u_b(inp2, res1); @@ -4458,12 +4426,12 @@ static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz4 = __msa_aver_u_b(inp0, res0); horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz6 = __msa_aver_u_b(inp2, res1); @@ -4478,31 +4446,25 @@ static void hv_mc_qpel_aver_h_src1_8x8_msa(const uint8_t *src, horiz1, horiz0, horiz0, horiz1, horiz2, horiz3, horiz4, horiz5, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += (2 * dst_stride); - res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0, horiz3, horiz4, horiz5, horiz6, horiz3, horiz2, horiz1, horiz0, horiz4, horiz5, horiz6, horiz7, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); - dst += (2 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); + dst += (4 * dst_stride); res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1, horiz5, horiz6, horiz7, horiz8, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += (2 * dst_stride); - res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3, horiz7, horiz8, horiz8, horiz7, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src, @@ -4561,7 +4523,7 @@ static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -4587,7 +4549,7 @@ static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6); @@ -4597,13 +4559,9 @@ static void hv_mc_qpel_aver_hv_src01_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); - dst += (2 * dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src, @@ -4660,7 +4618,7 @@ static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -4685,7 +4643,7 @@ static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6); res0 = __msa_aver_u_b(avg0, res0); @@ -4695,11 +4653,9 @@ static void hv_mc_qpel_aver_v_src1_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src, @@ -4734,14 +4690,14 @@ static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz0 = __msa_aver_u_b(inp0, res0); horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1); res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz2 = __msa_aver_u_b(inp2, res1); @@ -4750,7 +4706,7 @@ static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, src += (2 * src_stride); res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1); horiz4 = __msa_aver_u_b(inp0, res0); @@ -4764,12 +4720,12 @@ static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += 2 * dst_stride; res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3); horiz6 = __msa_aver_u_b(inp2, res1); @@ -4791,7 +4747,7 @@ static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, horiz5, horiz4, horiz3, horiz2, horiz6, horiz7, horiz8, horiz8, const20, const6, const3); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += 2 * dst_stride; avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5); @@ -4801,12 +4757,9 @@ static void hv_mc_qpel_aver_hv_src11_8x8_msa(const uint8_t *src, horiz7, horiz6, horiz5, horiz4, horiz8, horiz8, horiz7, horiz6, const20, const6, const3); - ST8x2_UB(res0, dst, dst_stride); - dst += 2 * dst_stride; - avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src, @@ -4869,7 +4822,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -4889,7 +4842,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -4906,7 +4859,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -4919,7 +4872,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src00_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src, @@ -4979,7 +4932,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -4998,7 +4951,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -5015,7 +4968,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5028,7 +4981,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src0_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src, @@ -5067,7 +5020,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); horiz0 = __msa_aver_u_b(inp0, res0); @@ -5076,7 +5029,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, const20, const6, const3); LD_UB2(src, src_stride, inp0, inp1); src += (2 * src_stride); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); horiz2 = __msa_aver_u_b(inp2, res1); @@ -5084,7 +5037,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); horiz4 = __msa_aver_u_b(inp0, res0); @@ -5099,7 +5052,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -5107,7 +5060,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); horiz6 = __msa_aver_u_b(inp2, res1); @@ -5122,7 +5075,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -5140,7 +5093,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5153,7 +5106,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src10_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src, @@ -5214,7 +5167,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -5232,7 +5185,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -5247,7 +5200,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5258,8 +5211,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src0_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); - dst += (2 * dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride, @@ -5324,7 +5276,7 @@ static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5335,7 +5287,7 @@ static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5346,7 +5298,7 @@ static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5357,7 +5309,7 @@ static void hv_mc_qpel_avg_dst_8x8_msa(const uint8_t *src, int32_t src_stride, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src, @@ -5395,7 +5347,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); horiz0 = __msa_aver_u_b(inp0, res0); @@ -5404,7 +5356,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); LD_UB2(src, src_stride, inp0, inp1); src += (2 * src_stride); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); horiz2 = __msa_aver_u_b(inp2, res1); @@ -5412,7 +5364,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); horiz4 = __msa_aver_u_b(inp0, res0); @@ -5425,7 +5377,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); @@ -5433,7 +5385,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); horiz6 = __msa_aver_u_b(inp2, res1); @@ -5446,7 +5398,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -5462,7 +5414,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5473,7 +5425,7 @@ static void hv_mc_qpel_avg_dst_aver_h_src1_8x8_msa(const uint8_t *src, const20, const6, const3); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src, @@ -5537,7 +5489,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5557,7 +5509,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -5579,7 +5531,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src, LD_UB2(dst, dst_stride, dst0, dst1); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); @@ -5587,7 +5539,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src01_8x8_msa(const uint8_t *src, LD_UB2(dst, dst_stride, dst0, dst1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src, @@ -5647,7 +5599,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5666,7 +5618,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -5686,7 +5638,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src, LD_UB2(dst, dst_stride, dst0, dst1); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7); @@ -5694,7 +5646,7 @@ static void hv_mc_qpel_avg_dst_aver_v_src1_8x8_msa(const uint8_t *src, LD_UB2(dst, dst_stride, dst0, dst1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src, @@ -5732,7 +5684,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, const20, const6, const3); LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); horiz0 = __msa_aver_u_b(inp0, res0); @@ -5741,14 +5693,14 @@ static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, const20, const6, const3); LD_UB2(src, src_stride, inp0, inp1); src += (2 * src_stride); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); horiz2 = __msa_aver_u_b(inp2, res1); horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1); res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1); + SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1); inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0); horiz4 = __msa_aver_u_b(inp0, res0); @@ -5762,14 +5714,14 @@ static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(src, src_stride, inp2, inp3); src += (2 * src_stride); res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3, const20, const6, const3); - SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1); + SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3); inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2); horiz6 = __msa_aver_u_b(inp2, res1); @@ -5783,7 +5735,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); dst += (2 * dst_stride); inp0 = LD_UB(src); @@ -5800,7 +5752,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, res0 = __msa_aver_u_b(avg0, res0); avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res0 = __msa_aver_u_b(avg0, res0); - ST8x2_UB(res0, dst, dst_stride); + ST_D2(res0, 0, 1, dst, dst_stride); dst += (2 * dst_stride); LD_UB2(dst, dst_stride, dst0, dst1); @@ -5812,7 +5764,7 @@ static void hv_mc_qpel_avg_dst_aver_hv_src11_8x8_msa(const uint8_t *src, res1 = __msa_aver_u_b(avg1, res1); avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0); res1 = __msa_aver_u_b(avg1, res1); - ST8x2_UB(res1, dst, dst_stride); + ST_D2(res1, 0, 1, dst, dst_stride); } static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,