DPADD_UB2_UH(sum2_r, sum1_r, coef2, coef1, sum0_r, sum3_r); \
res0_r = (v8i16) (sum0_r - sum3_r); \
res0_r = __msa_srari_h(res0_r, 5); \
- res0_r = CLIP_SH_0_255(res0_r); \
+ CLIP_SH_0_255(res0_r); \
out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
\
out; \
res0_r = (v8i16) (sum0_r - sum3_r); \
res0_r += 15; \
res0_r >>= 5; \
- res0_r = CLIP_SH_0_255(res0_r); \
+ CLIP_SH_0_255(res0_r); \
out = (v16u8) __msa_pckev_b((v16i8) res0_r, (v16i8) res0_r); \
\
out; \
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
mask0, mask1, mask2, mask3,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
+ inp0, inp1, inp2, inp3);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
res0 = __msa_ave_u_b(inp0, res0);
res1 = __msa_ave_u_b(inp2, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
+ inp0, inp1, inp2, inp3);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
res0 = __msa_ave_u_b(inp0, res0);
res1 = __msa_ave_u_b(inp2, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
mask0, mask1, mask2, mask3,
const20, const6, const3);
LD_UB4(dst, dst_stride, dst0, dst1, dst2, dst3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B4_UB(inp0, inp0, inp1, inp1, inp2, inp2, inp3, inp3, 1,
+ inp0, inp1, inp2, inp3);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
inp3, inp2, inp1, inp0,
inp4, inp5, inp6, inp7,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
inp7, inp6, inp5, inp4,
inp8, inp8, inp7, inp6,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_16x16_msa(const uint8_t *src,
tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
inp3, inp2, inp1, inp0,
inp4, inp5, inp6, inp7,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
inp7, inp6, inp5, inp4,
inp8, inp8, inp7, inp6,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
inp8 = LD_UB(src);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
inp8 = LD_UB(src);
dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
inp8 = LD_UB(src);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_ave_u_b(avg1, res1);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz0 = __msa_ave_u_b(inp0, res0);
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz2 = __msa_ave_u_b(inp2, res1);
res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz4 = __msa_ave_u_b(inp0, res0);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz6 = __msa_ave_u_b(inp2, res1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
- dst += 2 * dst_stride;
-
- ST8x2_UB(res0, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz8, horiz8, horiz7,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
const20, const6, const3);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz0 = __msa_ave_u_b(inp0, res0);
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz2 = __msa_ave_u_b(inp2, res1);
res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz4 = __msa_ave_u_b(inp0, res0);
const20, const6, const3);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz6 = __msa_ave_u_b(inp2, res1);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
res0 = __msa_ave_u_b(avg0, res0);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
res0 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp0, inp1, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz0 = __msa_ave_u_b(inp0, res0);
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz2 = __msa_ave_u_b(inp2, res1);
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz4 = __msa_ave_u_b(inp0, res0);
horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz6 = __msa_ave_u_b(inp2, res1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_ave_u_b(avg0, res0);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
const20, const6, const3);
horiz8 = __msa_aver_u_b(inp0, res0);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz0 = __msa_aver_u_b(inp0, res0);
horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz2 = __msa_aver_u_b(inp2, res1);
const20, const6, const3);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz4 = __msa_aver_u_b(inp0, res0);
horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz6 = __msa_aver_u_b(inp2, res1);
horiz3, horiz2, horiz1, horiz0,
horiz4, horiz5, horiz6, horiz7,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
inp0 = LD_UB(src);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
horiz1, horiz0, horiz0, horiz1,
horiz2, horiz3, horiz4, horiz5,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_16x16_msa(const uint8_t *src,
horiz1, horiz0, horiz0, horiz1,
horiz2, horiz3, horiz4, horiz5,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
mask0, mask1, mask2, mask3,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz0 = __msa_aver_u_b(inp0, res0);
horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz2 = __msa_aver_u_b(inp2, res1);
const20, const6, const3);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz4 = __msa_aver_u_b(inp0, res0);
horiz5 = (v16u8) __msa_splati_d((v2i64) horiz4, 1);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz6 = __msa_aver_u_b(inp2, res1);
horiz1, horiz0, horiz0, horiz1,
horiz2, horiz3, horiz4, horiz5,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += (2 * dst_stride);
-
res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
horiz3, horiz4, horiz5, horiz6,
horiz3, horiz2, horiz1, horiz0,
horiz4, horiz5, horiz6, horiz7,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz5, horiz6, horiz7, horiz8,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += (2 * dst_stride);
-
res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz8, horiz8, horiz7,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
res0 = __msa_aver_u_b(avg0, res0);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1,
mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz0 = __msa_aver_u_b(inp0, res0);
horiz1 = (v16u8) __msa_splati_d((v2i64) horiz0, 1);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz2 = __msa_aver_u_b(inp2, res1);
src += (2 * src_stride);
res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
horiz4 = __msa_aver_u_b(inp0, res0);
res0 = __msa_aver_u_b(avg0, res0);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
horiz6 = __msa_aver_u_b(inp2, res1);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
horiz0 = __msa_aver_u_b(inp0, res0);
const20, const6, const3);
LD_UB2(src, src_stride, inp0, inp1);
src += (2 * src_stride);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
horiz2 = __msa_aver_u_b(inp2, res1);
res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
horiz4 = __msa_aver_u_b(inp0, res0);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
horiz6 = __msa_aver_u_b(inp2, res1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
horiz0 = __msa_aver_u_b(inp0, res0);
const20, const6, const3);
LD_UB2(src, src_stride, inp0, inp1);
src += (2 * src_stride);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
horiz2 = __msa_aver_u_b(inp2, res1);
res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
horiz4 = __msa_aver_u_b(inp0, res0);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
horiz6 = __msa_aver_u_b(inp2, res1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
LD_UB2(dst, dst_stride, dst0, dst1);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
LD_UB2(dst, dst_stride, dst0, dst1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
LD_UB2(dst, dst_stride, dst0, dst1);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
LD_UB2(dst, dst_stride, dst0, dst1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
const20, const6, const3);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
horiz0 = __msa_aver_u_b(inp0, res0);
const20, const6, const3);
LD_UB2(src, src_stride, inp0, inp1);
src += (2 * src_stride);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
horiz2 = __msa_aver_u_b(inp2, res1);
horiz3 = (v16u8) __msa_splati_d((v2i64) horiz2, 1);
res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp0, inp1, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp0, inp1, inp0, inp1, inp0, inp1, 1);
+ SLDI_B2_UB(inp0, inp0, inp1, inp1, 1, inp0, inp1);
inp0 = (v16u8) __msa_ilvr_d((v2i64) inp1, (v2i64) inp0);
horiz4 = __msa_aver_u_b(inp0, res0);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
const20, const6, const3);
- SLDI_B2_UB(inp2, inp3, inp2, inp3, inp2, inp3, 1);
+ SLDI_B2_UB(inp2, inp2, inp3, inp3, 1, inp2, inp3);
inp2 = (v16u8) __msa_ilvr_d((v2i64) inp3, (v2i64) inp2);
horiz6 = __msa_aver_u_b(inp2, res1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,