Replace STnxm_UB and LDnxm_SH with new macros ST_{H/W/D}{1/2/4/8}.
The old macros are difficult to use because they don't follow the same parameter passing rules.
The changes are as follows:
1. remove LD4x4_SH.
2. replace ST2x4_UB with ST_H4.
3. replace ST4x2_UB with ST_W2.
4. replace ST4x4_UB with ST_W4.
5. replace ST4x8_UB with ST_W8.
6. replace ST6x4_UB with ST_W2 and ST_H2.
7. replace ST8x1_UB with ST_D1.
8. replace ST8x2_UB with ST_D2.
9. replace ST8x4_UB with ST_D4.
10. replace ST8x8_UB with ST_D8.
11. replace ST12x4_UB with ST_D4 and ST_W4.
Examples of new macro: ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride)
ST_H4 stores four half-word elements from vector 'in' to pdst with stride.
About the macro name:
1) 'ST' means store operation.
2) 'H/W/D' means type of vector element is 'half-word/word/double-word'.
3) Number '1/2/4/8' means how many elements will be stored.
About the macro parameter:
1) 'in0, in1...' 128-bits vector.
2) 'idx0, idx1...' elements index.
3) 'pdst' destination pointer to store to.
4) 'stride' stride of each store operation.
Signed-off-by: Michael Niedermayer <michael@niedermayer.cc>
ILVR_B2_SH(in3, in0, in1, in2, temp0, temp1);
in0 = (v16u8) __msa_ilvr_h(temp1, temp0);
in3 = (v16u8) __msa_ilvl_h(temp1, temp0);
- ST4x4_UB(in0, in0, 0, 1, 2, 3, src, stride);
- src += 4 * stride;
- ST4x4_UB(in3, in3, 0, 1, 2, 3, src, stride);
- src += 4 * stride;
+ ST_W8(in0, in3, 0, 1, 2, 3, 0, 1, 2, 3, src, stride);
}
static void h263_v_loop_filter_msa(uint8_t *src, int32_t stride, int32_t qscale)
res_r = __msa_sat_u_h(res_r, 7);
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
- ST2x4_UB(res, 0, dst, stride);
+ ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res_r = __msa_sat_u_h(res_r, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
- ST4x2_UB(res, dst, stride);
+ ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_hz_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H2_UH(res0_r, res1_r, 6);
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
- ST4x8_UB(out0, out1, dst, stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SAT_UH4_UH(res4, res5, res6, res7, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_nonmult_msa(uint8_t *src, uint8_t *dst,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
}
res0 = __msa_sat_u_h(res0, 7);
res0 = (v8u16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
- ST8x1_UB(res0, dst);
+ ST_D1(res0, 0, dst);
dst += stride;
}
}
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
- ST2x4_UB(res, 0, dst, stride);
+ ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res_r = __msa_sat_u_h(res_r, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
- ST4x2_UB(res, dst, stride);
+ ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_vt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H2_UH(res0_r, res1_r, 6);
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
- ST4x8_UB(out0, out1, dst, stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res0, res1, res2, res3, 6);
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res = (v8i16) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
- ST2x4_UB(res, 0, dst, stride);
+ ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_2w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
res_vt0 = __msa_sat_u_h(res_vt0, 7);
res = (v4i32) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
- ST4x2_UB(res, dst, stride);
+ ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_hv_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H2_UH(res_vt0, res_vt1, 6);
SAT_UH2_UH(res_vt0, res_vt1, 7);
PCKEV_B2_SW(res_vt0, res_vt0, res_vt1, res_vt1, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, stride);
+ ST_W2(res0, 0, 1, dst, stride);
+ ST_W2(res1, 0, 1, dst + 2 * stride, stride);
}
static void avc_chroma_hv_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
- ST4x8_UB(res0, res1, dst, stride);
+ ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_4w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRARI_H4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 6);
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SAT_UH4_UH(res_vt4, res_vt5, res_vt6, res_vt7, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_8w_msa(uint8_t *src, uint8_t *dst, int32_t stride,
dst0 = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst0 = __msa_aver_u_b(dst0, dst_data);
- ST2x4_UB(dst0, 0, dst, stride);
+ ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
res = __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
dst_data = __msa_aver_u_b((v16u8) res, dst_data);
- ST4x2_UB(dst_data, dst, stride);
+ ST_W2(dst_data, 0, 1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
out = __msa_aver_u_b(out, dst_data);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
- ST4x8_UB(out0, out1, dst, stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hz_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
res = (v8i16) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = (v8i16) __msa_aver_u_b((v16u8) res, dst_data);
- ST2x4_UB(res, 0, dst, stride);
+ ST_H4(res, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
res = (v16u8) __msa_pckev_b((v16i8) res_r, (v16i8) res_r);
res = __msa_aver_u_b(res, dst_data);
- ST4x2_UB(res, dst, stride);
+ ST_W2(res, 0, 1, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH2_UH(res0_r, res1_r, 7);
out = (v16u8) __msa_pckev_b((v16i8) res1_r, (v16i8) res0_r);
out = __msa_aver_u_b(out, dst0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
- ST4x8_UB(out0, out1, dst, stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res0, res1, res2, res3, 7);
PCKEV_B2_UB(res1, res0, res3, res2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
PCKEV_B2_UB(res5, res4, res7, res6, out2, out3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_vt_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
res = __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
- ST2x4_UB(dst0, 0, dst, stride);
+ ST_H4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_2w_msa(uint8_t *src, uint8_t *dst,
dst0 = (v16u8) __msa_pckev_b((v16i8) res_vt0, (v16i8) res_vt0);
dst0 = __msa_aver_u_b(dst0, dst_data);
- ST4x2_UB(dst0, dst, stride);
+ ST_W2(dst0, 0, 1, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_4x4_msa(uint8_t *src, uint8_t *dst,
SAT_UH2_UH(res_vt0, res_vt1, 7);
out = (v16u8) __msa_pckev_b((v16i8) res_vt1, (v16i8) res_vt0);
out = __msa_aver_u_b(out, dst_data);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_4x8_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, res0, res1);
AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
- ST4x8_UB(res0, res1, dst, stride);
+ ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_4w_msa(uint8_t *src, uint8_t *dst,
SAT_UH4_UH(res_vt0, res_vt1, res_vt2, res_vt3, 7);
PCKEV_B2_UB(res_vt1, res_vt0, res_vt3, res_vt2, out0, out1);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_8x8_msa(uint8_t *src, uint8_t *dst,
PCKEV_B2_UB(res_vt5, res_vt4, res_vt7, res_vt6, out2, out3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
AVER_UB2_UB(out2, dst2, out3, dst3, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_chroma_hv_and_aver_dst_8w_msa(uint8_t *src, uint8_t *dst,
LW4(dst + 4 * stride, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
- ST4x8_UB(dst0, dst1, dst, stride);
+ ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
} else if (4 == height) {
LW4(src, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b(src0, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
} else if (2 == height) {
LW2(src, stride, tp0, tp1);
INSERT_W2_UB(tp0, tp1, src0);
LW2(dst, stride, tp0, tp1);
INSERT_W2_UB(tp0, tp1, dst0);
dst0 = __msa_aver_u_b(src0, dst0);
- ST4x2_UB(dst0, dst, stride);
+ ST_W2(dst0, 0, 1, dst, stride);
}
}
INSERT_D2_UB(tp6, tp7, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
} else if (4 == height) {
LD4(src, stride, tp0, tp1, tp2, tp3);
INSERT_D2_UB(tp0, tp1, src0);
INSERT_D2_UB(tp0, tp1, dst0);
INSERT_D2_UB(tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
}
tmp0 = __msa_srlr_h(tmp0, denom);
tmp0 = (v8i16) __msa_sat_u_h((v8u16) tmp0, 7);
src0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
- ST4x2_UB(src0, data, stride);
+ ST_W2(src0, 0, 1, data, stride);
}
static void avc_wgt_4x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
tmp1 = __msa_srlr_h(tmp1, denom);
SAT_UH2_SH(tmp0, tmp1, 7);
src0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(src0, src0, 0, 1, 2, 3, data, stride);
+ ST_W4(src0, 0, 1, 2, 3, data, stride);
}
static void avc_wgt_4x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- ST4x8_UB(src0, src1, data, stride);
+ ST_W8(src0, src1, 0, 1, 2, 3, 0, 1, 2, 3, data, stride);
}
static void avc_wgt_8x4_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SRLR_H4_SH(tmp0, tmp1, tmp2, tmp3, denom);
SAT_UH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src0, src1);
- ST8x4_UB(src0, src1, data, stride);
+ ST_D4(src0, src1, 0, 1, 0, 1, data, stride);
}
static void avc_wgt_8x8_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
src2, src3);
- ST8x8_UB(src0, src1, src2, src3, data, stride);
+ ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
}
static void avc_wgt_8x16_msa(uint8_t *data, int32_t stride, int32_t log2_denom,
SAT_UH8_SH(tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, 7);
PCKEV_B4_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, src0, src1,
src2, src3);
- ST8x8_UB(src0, src1, src2, src3, data, stride);
+ ST_D8(src0, src1, src2, src3, 0, 1, 0, 1, 0, 1, 0, 1, data, stride);
data += 8 * stride;
}
}
tmp0 = __msa_maxi_s_h(tmp0, 0);
tmp0 = __msa_min_s_h(max255, tmp0);
dst0 = (v16u8) __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
- ST4x2_UB(dst0, dst, stride);
+ ST_W2(dst0, 0, 1, dst, stride);
}
static void avc_biwgt_4x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
tmp1 >>= denom;
CLIP_SH2_0_255(tmp0, tmp1);
dst0 = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_biwgt_4x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
- ST4x8_UB(dst0, dst1, dst, stride);
+ ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
}
static void avc_biwgt_8x4_msa(uint8_t *src, uint8_t *dst, int32_t stride,
SRA_4V(tmp0, tmp1, tmp2, tmp3, denom);
CLIP_SH4_0_255(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
static void avc_biwgt_8x8_msa(uint8_t *src, uint8_t *dst, int32_t stride,
CLIP_SH4_0_255(tmp4, tmp5, tmp6, tmp7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
PCKEV_B2_UB(tmp5, tmp4, tmp7, tmp6, dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
static void avc_biwgt_8x16_msa(uint8_t *src, uint8_t *dst, int32_t stride,
CLIP_SH4_0_255(temp4, temp5, temp6, temp7);
PCKEV_B4_UB(temp1, temp0, temp3, temp2, temp5, temp4, temp7, temp6,
dst0, dst1, dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
dst += 8 * stride;
}
}
ILVRL_H2_SH(tp3, tp2, tmp6, tmp7);
src = data - 3;
- ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, src, img_width);
- ST2x4_UB(tmp2, 0, src + 4, img_width);
+ ST_W4(tmp3, 0, 1, 2, 3, src, img_width);
+ ST_H4(tmp2, 0, 1, 2, 3, src + 4, img_width);
src += 4 * img_width;
- ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, src, img_width);
- ST2x4_UB(tmp2, 4, src + 4, img_width);
+ ST_W4(tmp4, 0, 1, 2, 3, src, img_width);
+ ST_H4(tmp2, 4, 5, 6, 7, src + 4, img_width);
src += 4 * img_width;
- ST4x4_UB(tmp6, tmp6, 0, 1, 2, 3, src, img_width);
- ST2x4_UB(tmp5, 0, src + 4, img_width);
+ ST_W4(tmp6, 0, 1, 2, 3, src, img_width);
+ ST_H4(tmp5, 0, 1, 2, 3, src + 4, img_width);
src += 4 * img_width;
- ST4x4_UB(tmp7, tmp7, 0, 1, 2, 3, src, img_width);
- ST2x4_UB(tmp5, 4, src + 4, img_width);
+ ST_W4(tmp7, 0, 1, 2, 3, src, img_width);
+ ST_H4(tmp5, 4, 5, 6, 7, src + 4, img_width);
}
}
}
tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_or_p0_org, (v16i8) p0_or_q0_org);
data_cb_or_cr -= 1;
- ST2x4_UB(tmp1, 0, data_cb_or_cr, img_width);
+ ST_H4(tmp1, 0, 1, 2, 3, data_cb_or_cr, img_width);
data_cb_or_cr += 4 * img_width;
- ST2x4_UB(tmp1, 4, data_cb_or_cr, img_width);
+ ST_H4(tmp1, 4, 5, 6, 7, data_cb_or_cr, img_width);
}
}
q0_org = __msa_bmnz_v(q0_org, q0, is_less_than);
tmp1 = (v8i16) __msa_ilvr_b((v16i8) q0_org, (v16i8) p0_org);
src = data - 1;
- ST2x4_UB(tmp1, 0, src, img_width);
+ ST_H4(tmp1, 0, 1, 2, 3, src, img_width);
src += 4 * img_width;
- ST2x4_UB(tmp1, 4, src, img_width);
+ ST_H4(tmp1, 4, 5, 6, 7, src, img_width);
}
}
}
}
AVC_LPF_H_CHROMA_422(src, stride, tc_val, alpha, beta, res);
- ST2x4_UB(res, 0, (src - 1), stride);
+ ST_H4(res, 0, 1, 2, 3, (src - 1), stride);
src += (4 * stride);
}
}
CLIP_SH4_0_255(res4, res5, res6, res7);
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
dst0, dst1, dst2, dst3);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x4_UB(dst2, dst3, dst, dst_stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
}
static void avc_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
CLIP_SH4_0_255(dst4_r, dst5_r, dst6_r, dst7_r);
PCKEV_B4_SB(dst1_r, dst0_r, dst3_r, dst2_r, dst5_r, dst4_r, dst7_r, dst6_r,
dst0, dst1, dst2, dst3);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x4_UB(dst2, dst3, dst, dst_stride);
+    ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
}
void ff_h264_idct_add_msa(uint8_t *dst, int16_t *src, int32_t dst_stride)
ADD2(pred_r, input_dc, pred_l, input_dc, pred_r, pred_l);
CLIP_SH2_0_255(pred_r, pred_l);
out = __msa_pckev_b((v16i8) pred_l, (v16i8) pred_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
void ff_h264_idct8_dc_addblk_msa(uint8_t *dst, int16_t *src,
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
static void avc_luma_hv_qrt_8x8_msa(const uint8_t *src_x, const uint8_t *src_y,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(out0, out1, dst, stride);
- dst += (4 * stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
static void avc_luma_hv_qrt_16x16_msa(const uint8_t *src_x,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
src_vt0 = src_vt4;
res = PCKEV_XORI128_UB(res0, res1);
dst0 = __msa_aver_u_b(res, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
static void avc_luma_hv_qrt_and_aver_dst_8x8_msa(const uint8_t *src_x,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src_y, stride, src_vt9, src_vt10, src_vt11, src_vt12);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
- dst += (4 * stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
static void avc_luma_hv_qrt_and_aver_dst_16x16_msa(const uint8_t *src_x,
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
src_vt0 = src_vt4;
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0, dst1,
dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc00_msa(uint8_t *dst, const uint8_t *src,
dst0 = __msa_aver_u_b(src0, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
tmp2 = __msa_aver_s_b(tmp2, src4);
tmp3 = __msa_aver_s_b(tmp3, src5);
XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
- ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
+ ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
tmp2 = __msa_aver_s_b(tmp2, src4);
tmp3 = __msa_aver_s_b(tmp3, src5);
XORI_B4_128_SB(tmp0, tmp1, tmp2, tmp3);
- ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, stride);
+ ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
res = __msa_aver_s_b(res, src0);
res = (v16i8) __msa_xori_b((v16u8) res, 128);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
src0 = (v16i8) __msa_insve_d((v2i64) src0, 1, (v2i64) src1);
res = __msa_aver_s_b(res, src0);
res = (v16i8) __msa_xori_b((v16u8) res, 128);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
out1 = PCKEV_XORI128_UB(res2, res3);
out2 = PCKEV_XORI128_UB(res4, res5);
out3 = PCKEV_XORI128_UB(res6, res7);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(res0, res1, 5);
SAT_SH2_SH(res0, res1, 7);
out = PCKEV_XORI128_UB(res0, res1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
out2 = __msa_aver_s_b(out2, tmp2);
out3 = __msa_aver_s_b(out3, tmp3);
XORI_B4_128_SB(out0, out1, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
out2 = __msa_aver_s_b(out2, tmp2);
out3 = __msa_aver_s_b(out3, tmp3);
XORI_B4_128_SB(out0, out1, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
src54_r = (v16i8) __msa_insve_w((v4i32) src4, 1, (v4i32) src5);
src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
out = __msa_aver_u_b(out, (v16u8) src32_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
src54_r = (v16i8) __msa_insve_w((v4i32) src5, 1, (v4i32) src6);
src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
out = __msa_aver_u_b(out, (v16u8) src32_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
dst1 = __msa_aver_s_h(dst1, hz_out4);
res = PCKEV_XORI128_UB(dst0, dst1);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
dst1 = __msa_aver_s_h(dst1, hz_out1);
res = PCKEV_XORI128_UB(dst0, dst1);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
out1 = PCKEV_XORI128_UB(out2_r, out3_r);
out2 = PCKEV_XORI128_UB(out4_r, out5_r);
out3 = PCKEV_XORI128_UB(out6_r, out7_r);
- ST8x8_UB(out0, out1, out2, out3, dst, stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
SRARI_H2_SH(out10, out32, 5);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
dst0 = __msa_aver_s_h(dst2, dst0);
dst1 = __msa_aver_s_h(dst3, dst1);
out = PCKEV_XORI128_UB(dst0, dst1);
- ST8x2_UB(out, dst, stride);
+ ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
dst0 = __msa_aver_s_h(dst2, dst0);
dst1 = __msa_aver_s_h(dst3, dst1);
out = PCKEV_XORI128_UB(dst0, dst1);
- ST8x2_UB(out, dst, stride);
+ ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_put_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src0, src1, src2, src3);
dst3 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
out0 = PCKEV_XORI128_UB(dst0, dst1);
out1 = PCKEV_XORI128_UB(dst2, dst3);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
}
void ff_put_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
filt2);
dst1 = __msa_pckev_h((v8i16) tmp1, (v8i16) tmp0);
res = PCKEV_XORI128_UB(dst0, dst1);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc10_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp2, tp3, dst3);
AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel8_mc30_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp2, tp3, dst3);
AVER_UB2_UB(tmp0, dst0, tmp1, dst1, dst0, dst1);
AVER_UB2_UB(tmp2, dst2, tmp3, dst3, dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc10_msa(uint8_t *dst, const uint8_t *src,
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc30_msa(uint8_t *dst, const uint8_t *src,
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b((v16u8) res, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc20_msa(uint8_t *dst, const uint8_t *src,
INSERT_D2_UB(tp2, tp3, out7);
AVER_UB2_UB(out0, out2, out1, out3, out0, out1);
AVER_UB2_UB(out4, out6, out5, out7, out4, out5);
- ST8x8_UB(out0, out1, out4, out5, dst, stride);
+ ST_D8(out0, out1, out4, out5, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc20_msa(uint8_t *dst, const uint8_t *src,
LW4(dst, stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
res = __msa_aver_u_b(res, dst0);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc01_msa(uint8_t *dst, const uint8_t *src,
XORI_B4_128_SB(out0, out1, out2, out3);
AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel8_mc03_msa(uint8_t *dst, const uint8_t *src,
XORI_B4_128_SB(out0, out1, out2, out3);
AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc01_msa(uint8_t *dst, const uint8_t *src,
res = PCKEV_XORI128_UB(out10, out32);
res = __msa_aver_u_b(res, (v16u8) src32_r);
dst0 = __msa_aver_u_b(res, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc03_msa(uint8_t *dst, const uint8_t *src,
src32_r = (v16i8) __msa_insve_d((v2i64) src32_r, 1, (v2i64) src54_r);
res = __msa_aver_u_b(res, (v16u8) src32_r);
dst0 = __msa_aver_u_b(res, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc11_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
dst0 = __msa_aver_u_b(out0, dst0);
- ST8x2_UB(dst0, dst, stride);
+ ST_D2(dst0, 0, 1, dst, stride);
dst += (2 * stride);
LD_SB2(src, stride, src7, src8);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
dst1 = __msa_aver_u_b(out1, dst1);
- ST8x2_UB(dst1, dst, stride);
+ ST_D2(dst1, 0, 1, dst, stride);
dst += (2 * stride);
hz_out0 = hz_out4;
INSERT_D2_UB(tp0, tp1, dst0);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
dst0 = __msa_aver_u_b(out0, dst0);
- ST8x2_UB(dst0, dst, stride);
+ ST_D2(dst0, 0, 1, dst, stride);
dst += (2 * stride);
LD_SB2(src, stride, src7, src8);
INSERT_D2_UB(tp2, tp3, dst1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
dst1 = __msa_aver_u_b(out1, dst1);
- ST8x2_UB(dst1, dst, stride);
+ ST_D2(dst1, 0, 1, dst, stride);
dst += (2 * stride);
hz_out0 = hz_out4;
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel8_mc23_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src9, src10, src11, src12);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc21_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
res = PCKEV_XORI128_UB(dst0, dst1);
res = __msa_aver_u_b(res, out);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc23_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, out);
res = PCKEV_XORI128_UB(dst0, dst1);
res = __msa_aver_u_b(res, out);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc02_msa(uint8_t *dst, const uint8_t *src,
out3 = PCKEV_XORI128_UB(out6_r, out7_r);
AVER_UB4_UB(out0, dst0, out1, dst1, out2, dst2, out3, dst3, dst0, dst1,
dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc02_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
res = PCKEV_XORI128_UB(out10, out32);
dst0 = __msa_aver_u_b(res, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc12_msa(uint8_t *dst, const uint8_t *src,
tmp1 = __msa_aver_s_h(tmp3, tmp1);
out = PCKEV_XORI128_UB(tmp0, tmp1);
out = __msa_aver_u_b(out, dst0);
- ST8x2_UB(out, dst, stride);
+ ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
tmp1 = __msa_aver_s_h(tmp3, tmp1);
out = PCKEV_XORI128_UB(tmp0, tmp1);
out = __msa_aver_u_b(out, dst0);
- ST8x2_UB(out, dst, stride);
+ ST_D2(out, 0, 1, dst, stride);
dst += (2 * stride);
src0 = src2;
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
out = __msa_aver_u_b(out, dstv);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel4_mc32_msa(uint8_t *dst, const uint8_t *src,
PCKEV_H2_SH(hz_res1, hz_res0, hz_res3, hz_res2, dst0, dst2);
out = PCKEV_XORI128_UB(dst0, dst2);
out = __msa_aver_u_b(out, dstv);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, stride);
+ ST_W4(out, 0, 1, 2, 3, dst, stride);
}
void ff_avg_h264_qpel16_mc22_msa(uint8_t *dst, const uint8_t *src,
out0 = PCKEV_XORI128_UB(res0, res1);
out1 = PCKEV_XORI128_UB(res2, res3);
AVER_UB2_UB(out0, dst0, out1, dst1, out0, out1);
- ST8x4_UB(out0, out1, dst, stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
hz_out0 = hz_out4;
out0 = PCKEV_XORI128_UB(res0, res1);
out1 = PCKEV_XORI128_UB(res2, res3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
LD_SB4(src, stride, src0, src1, src2, src3);
out0 = PCKEV_XORI128_UB(res0, res1);
out1 = PCKEV_XORI128_UB(res2, res3);
AVER_UB2_UB(out0, dst0, out1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, stride);
}
void ff_avg_h264_qpel4_mc22_msa(uint8_t *dst, const uint8_t *src,
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
res = PCKEV_XORI128_UB(res0, res1);
res = __msa_aver_u_b(res, dst0);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, stride);
+ ST_W4(res, 0, 1, 2, 3, dst, stride);
}
ADD2(dst_r0, in0, dst_l0, in1, dst_r0, dst_l0);
CLIP_SH2_0_255(dst_r0, dst_l0);
dst_vec = (v4i32) __msa_pckev_b((v16i8) dst_l0, (v16i8) dst_r0);
- ST4x4_UB(dst_vec, dst_vec, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst_vec, 0, 1, 2, 3, dst, stride);
}
static void hevc_addblk_8x8_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
dst_r0, dst_l0, dst_r1, dst_l1);
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
- ST8x4_UB(dst_r0, dst_r1, dst, stride);
- dst += (4 * stride);
+ ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst, stride);
LD4(temp_dst, stride, dst0, dst1, dst2, dst3);
INSERT_D2_SD(dst0, dst1, dst_vec0);
dst_r0, dst_l0, dst_r1, dst_l1);
CLIP_SH4_0_255(dst_r0, dst_l0, dst_r1, dst_l1);
PCKEV_B2_SH(dst_l0, dst_r0, dst_l1, dst_r1, dst_r0, dst_r1);
- ST8x4_UB(dst_r0, dst_r1, dst, stride);
+ ST_D4(dst_r0, dst_r1, 0, 1, 0, 1, dst + 4 * stride, stride);
}
static void hevc_addblk_16x16_msa(int16_t *coeffs, uint8_t *dst, int32_t stride)
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
- ST8x4_UB(dst0, dst1, p2, stride);
- p2 += (4 * stride);
- SD(dst_val0, p2);
- p2 += stride;
- SD(dst_val1, p2);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+ SD(dst_val0, p2 + 4 * stride);
+ SD(dst_val1, p2 + 5 * stride);
/* strong filter ends */
} else if (flag0 == flag1) { /* weak only */
/* weak filter */
dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
p2 += stride;
- ST8x4_UB(dst0, dst1, p2, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
/* weak filter ends */
} else { /* strong + weak */
/* strong filter */
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
- ST8x4_UB(dst0, dst1, p2, stride);
- p2 += (4 * stride);
- SD(dst_val0, p2);
- p2 += stride;
- SD(dst_val1, p2);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+ SD(dst_val0, p2 + 4 * stride);
+ SD(dst_val1, p2 + 5 * stride);
}
}
}
temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
- ST8x2_UB(temp0, p0_ptr, stride);
+ ST_D2(temp0, 0, 1, p0_ptr, stride);
}
}
temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
src += 1;
- ST2x4_UB(temp0, 0, src, stride);
- src += (4 * stride);
- ST2x4_UB(temp0, 4, src, stride);
+ ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
}
}
LD_UB4(src, src_stride, src0, src1, src2, src3);
/* store results */
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* store results */
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
XORI_B2_128_SB(dst0, dst1);
/* store results */
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += dst_stride << 2;
}
XORI_B2_128_SB(dst0, dst1);
/* store results */
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
- ST4x2_UB(dst0, dst, dst_stride);
+ ST_W2(dst0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
SLLI_2V(dst0, dst1, 6);
HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
} else if (0 == height % 8) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2,
dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST4x8_UB(dst0, dst1, dst, dst_stride);
+ ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
7, dst4, dst5, dst6, dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST6x4_UB(out0, out1, dst, dst_stride);
+ ST_W2(out0, 0, 2, dst, dst_stride);
+ ST_H2(out0, 2, 6, dst + 4, dst_stride);
+ ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
- ST6x4_UB(out2, out3, dst, dst_stride);
+ ST_W2(out2, 0, 2, dst, dst_stride);
+ ST_H2(out2, 2, 6, dst + 4, dst_stride);
+ ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
SLLI_2V(dst0, dst1, 6);
HEVC_BI_RND_CLIP2_MAX_SATU(in0, in1, dst0, dst1, 7, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(out0, dst, dst_stride);
+ ST_D2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
INSERT_D2_SB(tp0, tp1, src0);
HEVC_BI_RND_CLIP4_MAX_SATU(in0, in1, in2, in3, dst0, dst1, dst2, dst3,
7, dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
} else if (6 == height) {
LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
src0_ptr += 4 * src_stride;
7, dst0, dst1, dst2, dst3);
HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
} else if (0 == height % 8) {
uint32_t loop_cnt;
dst7, 7, dst4, dst5, dst6, dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
7, dst0, dst1, dst2, dst3);
HEVC_BI_RND_CLIP2_MAX_SATU(in4, in5, dst4, dst5, 7, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST12x4_UB(out0, out1, out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
ST_UB4(out0, out1, out3, out4, dst, dst_stride);
- ST8x4_UB(out2, out5, dst + 16, dst_stride);
+ ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
}
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST4x8_UB(dst0, dst1, dst, dst_stride);
+ ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
dst10, dst32, dst54, dst76);
PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
- ST4x8_UB(dst10, dst54, dst, dst_stride);
+ ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
- ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+ ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
- ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+ ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
SRARI_H2_SH(out0, out1, 7);
CLIP_SH2_0_255_MAX_SATU(out0, out1);
out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
tmp = __msa_srari_h(tmp, 7);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
- ST8x1_UB(out, dst_tmp);
+ ST_D1(out, 0, dst_tmp);
dst_tmp += dst_stride;
dst0 = dst1;
tmp = __msa_srari_h(tmp, 7);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
- ST8x1_UB(out, dst_tmp);
+ ST_D1(out, 0, dst_tmp);
dst_tmp += dst_stride;
dst0 = dst1;
SRARI_H2_SH(out0, out1, 7);
CLIP_SH2_0_255_MAX_SATU(out0, out1);
out = (v16u8) __msa_pckev_b((v16i8) out1, (v16i8) out0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
tmp0 = CLIP_SH_0_255(tmp0);
dst0 = __msa_pckev_b((v16i8) tmp0, (v16i8) tmp0);
- ST4x2_UB(dst0, dst, dst_stride);
+ ST_W2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_bi_4t_4x4_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP2(in0, in1, tmp0, tmp1, 7, tmp0, tmp1);
dst0 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hz_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3, 7, tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, dst0, dst1);
- ST4x8_UB(dst0, dst1, dst, dst_stride);
+ ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST6x4_UB(dst0, dst1, dst, dst_stride);
+ ST_W2(dst0, 0, 2, dst, dst_stride);
+ ST_H2(dst0, 2, 6, dst + 4, dst_stride);
+ ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
HEVC_BI_RND_CLIP2(in0, in1, dst0, dst1, 7, dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(dst0, dst, dst_stride);
+ ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_bi_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(dst2, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(dst2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hz_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst2 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
- ST12x4_UB(dst0, dst1, dst2, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(dst2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
dst0, dst1, dst2, dst3, 7, dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst_tmp, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
}
}
dst10 = CLIP_SH_0_255(dst10);
dst10 = (v8i16) __msa_pckev_b((v16i8) dst10, (v16i8) dst10);
- ST4x2_UB(dst10, dst, dst_stride);
+ ST_W2(dst10, 0, 1, dst, dst_stride);
}
static void hevc_vt_bi_4t_4x4_msa(uint8_t *src0_ptr,
HEVC_BI_RND_CLIP2(in0, in1, dst10, dst32, 7, dst10, dst32);
dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
- ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_vt_bi_4t_4x8multiple_msa(uint8_t *src0_ptr,
dst10, dst32, dst54, dst76);
PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst54);
- ST4x8_UB(dst10, dst54, dst, dst_stride);
+ ST_W8(dst10, dst54, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
- ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+ ST_W2(dst0_r, 0, 2, dst, dst_stride);
+ ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
+ ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
LD_SH4(src1_ptr, src2_stride, in0, in1, in2, in3);
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
- ST6x4_UB(dst0_r, dst1_r, dst, dst_stride);
+ ST_W2(dst0_r, 0, 2, dst, dst_stride);
+ ST_H2(dst0_r, 2, 6, dst + 4, dst_stride);
+ ST_W2(dst1_r, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(dst1_r, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
HEVC_BI_RND_CLIP2(in0, in1, dst0_r, dst1_r, 7, dst0_r, dst1_r);
dst0_r = (v8i16) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
- ST8x2_UB(dst0_r, dst, dst_stride);
+ ST_D2(dst0_r, 0, 1, dst, dst_stride);
}
static void hevc_vt_bi_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
dst2_r = (v8i16) __msa_pckev_b((v16i8) dst5_r, (v16i8) dst4_r);
- ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(dst2_r, dst, dst_stride);
+ ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(dst2_r, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_vt_bi_4t_8x4multiple_msa(uint8_t *src0_ptr,
dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
- ST8x4_UB(dst0_r, dst1_r, dst, dst_stride);
+ ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B2_SH(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
dst0_l = (v8i16) __msa_pckev_b((v16i8) dst1_l, (v16i8) dst0_l);
- ST12x4_UB(dst0_r, dst1_r, dst0_l, dst, dst_stride);
+ ST_D4(dst0_r, dst1_r, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(dst0_l, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
ST_SH2(dst0_r, dst1_r, dst, dst_stride);
- ST8x2_UB(dst2_r, dst + 16, dst_stride);
+ ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
/* 16width */
PCKEV_B2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
dst2_r = (v8i16) __msa_pckev_b((v16i8) dst3_r, (v16i8) dst2_r);
ST_SH2(dst0_r, dst1_r, dst, dst_stride);
- ST8x2_UB(dst2_r, dst + 16, dst_stride);
+ ST_D2(dst2_r, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
tmp = __msa_srari_h(tmp, 7);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_bi_4t_4x4_msa(uint8_t *src0_ptr,
SRARI_H2_SH(tmp0, tmp1, 7);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_bi_4t_4multx8mult_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
LW4(src1_ptr + 4, src2_stride, tpw0, tpw1, tpw2, tpw3);
src1_ptr += (4 * src2_stride);
SRARI_H2_SH(tmp4, tmp5, 7);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
- ST2x4_UB(out2, 0, dst + 4, dst_stride);
- dst += 4 * dst_stride;
- ST2x4_UB(out2, 4, dst + 4, dst_stride);
+ ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_bi_4t_8x2_msa(uint8_t *src0_ptr,
SRARI_H2_SH(tmp0, tmp1, 7);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_bi_4t_8multx4_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_bi_4t_8multx4mult_msa(uint8_t *src0_ptr,
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 7);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
dst0 = (v8i16) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
- ST4x2_UB(out0, dst, dst_stride);
+ ST_W2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
HEVC_BIW_RND_CLIP2_MAX_SATU(dst0, dst1, in0, in1, weight_vec, rnd_vec,
offset_vec, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
} else if (0 == height % 8) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LW4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
in3, weight_vec, rnd_vec, offset_vec,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
weight_vec, rnd_vec, offset_vec,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST6x4_UB(out0, out1, dst, dst_stride);
+ ST_W2(out0, 0, 2, dst, dst_stride);
+ ST_H2(out0, 2, 6, dst + 4, dst_stride);
+ ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(out0, dst, dst_stride);
+ ST_D2(out0, 0, 1, dst, dst_stride);
} else if (6 == height) {
LD4(src0_ptr, src_stride, tp0, tp1, tp2, tp3);
src0_ptr += 4 * src_stride;
HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
offset_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
} else if (0 == height % 4) {
uint32_t loop_cnt;
in3, weight_vec, rnd_vec, offset_vec,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
HEVC_BIW_RND_CLIP2_MAX_SATU(dst4, dst5, in4, in5, weight_vec, rnd_vec,
offset_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST12x4_UB(out0, out1, out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
ST_UB4(out0, out1, out3, out4, dst, dst_stride);
- ST8x4_UB(out2, out5, dst + 16, dst_stride);
+ ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
}
out0, out1);
out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
out0, out1, out2, out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
weight_vec, rnd_vec, offset_vec, out0, out1, out2,
out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
LD_SB4(src0_ptr + 8, src_stride, src0, src1, src2, src3);
src0_ptr += (4 * src_stride);
HEVC_BIW_RND_CLIP2(dst0, dst1, in0, in1, weight_vec, rnd_vec,
offset_vec, out0, out1);
out0 = (v8i16) __msa_pckev_b((v16i8) out1, (v16i8) out0);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+ ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
out0, out1, out2, out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
out0, out1, out2, out3);
PCKEV_B2_SH(out1, out0, out3, out2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
dst2_r = (v4i32) __msa_pckev_h((v8i16) dst2_l, (v8i16) dst2_r);
out2 = CLIP_SH_0_255(dst2_r);
PCKEV_B2_SH(out1, out0, out2, out2, out0, out2);
- ST8x2_UB(out0, dst, dst_stride);
- ST4x2_UB(out2, dst + 8, dst_stride);
+ ST_D2(out0, 0, 1, dst, dst_stride);
+ ST_W2(out2, 0, 1, dst + 8, dst_stride);
dst += (2 * dst_stride);
src10_r = src32_r;
CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
CLIP_SW4_0_255_MAX_SATU(dst0_l, dst0_r, dst1_l, dst1_r);
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST8x2_UB(out, dst_tmp, dst_stride);
+ ST_D2(out, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst0 = dst2;
CLIP_SW4_0_255_MAX_SATU(dst1, dst0, dst3, dst2);
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST8x2_UB(out, dst_tmp, dst_stride);
+ ST_D2(out, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dsth0 = dsth2;
CLIP_SW4_0_255_MAX_SATU(dst0, dst1, dst2, dst3);
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10 = dst54;
dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
out0 = CLIP_SH_0_255(dst0_r);
out0 = (v8i16) __msa_pckev_b((v16i8) out0, (v16i8) out0);
- ST4x2_UB(out0, dst, dst_stride);
+ ST_W2(out0, 0, 1, dst, dst_stride);
}
static void hevc_hz_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hz_biwgt_4t_4x8multiple_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST4x8_UB(dst0, dst1, dst, dst_stride);
+ ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST6x4_UB(dst0, dst1, dst, dst_stride);
+ ST_W2(dst0, 0, 2, dst, dst_stride);
+ ST_H2(dst0, 2, 6, dst + 4, dst_stride);
+ ST_W2(dst1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(dst1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(dst0, dst, dst_stride);
+ ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(dst3, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(dst3, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hz_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
dst0, dst1, dst2, dst3);
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B2_SH(dst1, dst0, dst3, dst2, dst0, dst1);
dst3 = (v8i16) __msa_pckev_b((v16i8) dst5, (v16i8) dst4);
- ST12x4_UB(dst0, dst1, dst3, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(dst3, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
dst0, dst1);
dst0 = (v8i16) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(dst0, (dst + 16), dst_stride);
+ ST_D2(dst0, 0, 1, (dst + 16), dst_stride);
dst += (2 * dst_stride);
}
}
dst10_r = (v4i32) __msa_pckev_h((v8i16) dst10_l, (v8i16) dst10_r);
out = CLIP_SH_0_255(dst10_r);
out = (v8i16) __msa_pckev_b((v16i8) out, (v16i8) out);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_vt_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
dst10, dst32);
dst10 = (v8i16) __msa_pckev_b((v16i8) dst32, (v16i8) dst10);
- ST4x4_UB(dst10, dst10, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst10, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
dst10, dst32, dst54, dst76);
PCKEV_B2_SH(dst32, dst10, dst76, dst54, dst10, dst32);
- ST4x8_UB(dst10, dst32, dst, dst_stride);
+ ST_W8(dst10, dst32, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- ST6x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_W2(tmp0, 0, 2, dst, dst_stride);
+ ST_H2(tmp0, 2, 6, dst + 4, dst_stride);
+ ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(tmp1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
tmp0, tmp1);
tmp0 = (v8i16) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST8x2_UB(tmp0, dst, dst_stride);
+ ST_D2(tmp0, 0, 1, dst, dst_stride);
}
static void hevc_vt_biwgt_4t_8x6_msa(uint8_t *src0_ptr,
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
tmp3 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(tmp3, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(tmp3, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_vt_biwgt_4t_8x4multiple_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B2_SH(tmp1, tmp0, tmp3, tmp2, tmp0, tmp1);
tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
- ST12x4_UB(tmp0, tmp1, tmp2, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
/* 8width */
tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
ST_SH2(tmp0, tmp1, dst, dst_stride);
- ST8x2_UB(tmp2, dst + 16, dst_stride);
+ ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
/* 16width */
/* 8width */
tmp2 = (v8i16) __msa_pckev_b((v16i8) tmp3, (v16i8) tmp2);
ST_SH2(tmp0, tmp1, dst, dst_stride);
- ST8x2_UB(tmp2, dst + 16, dst_stride);
+ ST_D2(tmp2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
tmp = __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_biwgt_4t_4x4_msa(uint8_t *src0_ptr,
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_biwgt_4t_4multx8mult_msa(uint8_t *src0_ptr,
tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
PCKEV_H2_SW(dst1_l, dst0_l, dst3_l, dst2_l, dst4, dst5);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
- ST2x4_UB(out2, 0, dst + 4, dst_stride);
- dst += 4 * dst_stride;
- ST2x4_UB(out2, 4, dst + 4, dst_stride);
+ ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_biwgt_4t_8x2_msa(uint8_t *src0_ptr,
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_biwgt_4t_8multx4_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
PCKEV_H2_SH(dst1, dst0, dst3, dst2, tmp4, tmp5);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
out2 = (v16u8) __msa_pckev_b((v16i8) tmp5, (v16i8) tmp4);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_biwgt_4t_8multx4mult_msa(uint8_t *src0_ptr,
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
tmp0, tmp1, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
SRARI_H2_SH(out0, out1, 6);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_8t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
LD_SB4(src, src_stride, src0, src1, src2, src3);
XORI_B4_128_SB(src0, src1, src2, src3);
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_8t_4w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
tmp1 = PCKEV_XORI128_UB(out2, out3);
tmp2 = PCKEV_XORI128_UB(out4, out5);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
- ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(tmp2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
SAT_SH4_SH(out0, out8, out2, out9, 7);
SAT_SH2_SH(out1, out3, 7);
out = PCKEV_XORI128_UB(out8, out9);
- ST8x2_UB(out, dst + 16, dst_stride);
+ ST_D2(out, 0, 1, dst + 16, dst_stride);
out = PCKEV_XORI128_UB(out0, out1);
ST_UB(out, dst);
dst += dst_stride;
SAT_SH2_SH(out54, out76, 7);
out0 = PCKEV_XORI128_UB(out10, out32);
out1 = PCKEV_XORI128_UB(out54, out76);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+ dst += (8 * dst_stride);
src2110 = src10998;
src4332 = src12111110;
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+ dst += (8 * dst_stride);
dst10_r = dst98_r;
dst32_r = dst1110_r;
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
out = PCKEV_XORI128_UB(dst0, dst1);
- ST8x2_UB(out, dst_tmp, dst_stride);
+ ST_D2(out, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst0 = dst2;
PCKEV_H2_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst0, dst1);
out0 = PCKEV_XORI128_UB(dst0, dst1);
- ST8x2_UB(out0, dst_tmp, dst_stride);
+ ST_D2(out0, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst0 = dst2;
PCKEV_H2_SW(dst5_r, dst4_r, dst7_r, dst6_r, dst4_r, dst5_r);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst4_r, dst5_r);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(out1, out1, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+ dst += (8 * dst_stride);
dst10_r = dst98_r;
dst32_r = dst1110_r;
res0 = __msa_srari_h(res0, 6);
res0 = __msa_sat_s_h(res0, 7);
out = PCKEV_XORI128_UB(res0, res0);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 6);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
src += (8 * src_stride);
SRARI_H4_SH(out0, out1, out2, out3, 6);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_4t_4w_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
out4 = PCKEV_XORI128_UB(out0, out1);
out5 = PCKEV_XORI128_UB(out2, out3);
- ST6x4_UB(out4, out5, dst, dst_stride);
+ ST_W2(out4, 0, 2, dst, dst_stride);
+ ST_H2(out4, 2, 6, dst + 4, dst_stride);
+ ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
LD_SB4(src, src_stride, src0, src1, src2, src3);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out4 = PCKEV_XORI128_UB(out0, out1);
out5 = PCKEV_XORI128_UB(out2, out3);
- ST6x4_UB(out4, out5, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W2(out4, 0, 2, dst, dst_stride);
+ ST_H2(out4, 2, 6, dst + 4, dst_stride);
+ ST_W2(out5, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out5, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void common_hz_4t_8x2mult_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(vec0, vec1, 6);
SAT_SH2_SH(vec0, vec1, 7);
out = PCKEV_XORI128_UB(vec0, vec1);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
}
}
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SRARI_H2_SH(out0, out1, 6);
SAT_SH2_SH(out0, out1, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(tmp0, tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
+ ST_W4(tmp0, 0, 1, 2, 3, dst + 8, dst_stride);
VSHF_B2_SB(src0, src0, src1, src1, mask0, mask0, vec4, vec5);
VSHF_B2_SB(src2, src2, src3, src3, mask0, mask0, vec6, vec7);
SAT_SH4_SH(out2, out3, out4, out5, 7);
tmp0 = PCKEV_XORI128_UB(out2, out3);
tmp1 = PCKEV_XORI128_UB(out4, out5);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst1, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst1, dst_stride);
dst1 += (4 * dst_stride);
}
}
out10 = __msa_srari_h(out10, 6);
out10 = __msa_sat_s_h(out10, 7);
out = PCKEV_XORI128_UB(out10, out10);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void common_vt_4t_4x4multiple_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out10, out32, 6);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
- ST6x4_UB(out0, out1, dst, dst_stride);
+ ST_W2(out0, 0, 2, dst, dst_stride);
+ ST_H2(out0, 2, 6, dst + 4, dst_stride);
+ ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
LD_SB2(src, src_stride, src3, src4);
SAT_SH4_SH(dst0_r, dst1_r, dst2_r, dst3_r, 7);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
- ST6x4_UB(out0, out1, dst, dst_stride);
+ ST_W2(out0, 0, 2, dst, dst_stride);
+ ST_H2(out0, 2, 6, dst + 4, dst_stride);
+ ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void common_vt_4t_8x2_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_SH(tmp0, tmp1, 6);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
}
static void common_vt_4t_8x6_msa(uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src98_r;
SAT_SH2_SH(dst0_l, dst1_l, 7);
out0 = PCKEV_XORI128_UB(dst0_r, dst1_r);
out1 = PCKEV_XORI128_UB(dst2_r, dst3_r);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
out0 = PCKEV_XORI128_UB(dst0_l, dst1_l);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst + 8, dst_stride);
+ ST_W4(out0, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
out = PCKEV_XORI128_UB(out0_r, out0_l);
ST_UB(out, dst);
out = PCKEV_XORI128_UB(out2_r, out2_r);
- ST8x1_UB(out, dst + 16);
+ ST_D1(out, 0, dst + 16);
dst += dst_stride;
out = PCKEV_XORI128_UB(out1_r, out1_l);
ST_UB(out, dst);
out = PCKEV_XORI128_UB(out3_r, out3_r);
- ST8x1_UB(out, dst + 16);
+ ST_D1(out, 0, dst + 16);
dst += dst_stride;
}
}
tmp = __msa_srari_h(tmp, 6);
tmp = __msa_sat_s_h(tmp, 7);
out = PCKEV_XORI128_UB(tmp, tmp);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uni_4t_4x4_msa(uint8_t *src,
SRARI_H2_SH(tmp0, tmp1, 6);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_uni_4t_4multx8mult_msa(uint8_t *src,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
out2 = PCKEV_XORI128_UB(tmp4, tmp5);
- ST4x8_UB(out0, out1, dst, dst_stride);
- ST2x4_UB(out2, 0, dst + 4, dst_stride);
- dst += 4 * dst_stride;
- ST2x4_UB(out2, 4, dst + 4, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+ ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_uni_4t_8x2_msa(uint8_t *src,
SRARI_H2_SH(out0_r, out1_r, 6);
SAT_SH2_SH(out0_r, out1_r, 7);
out = PCKEV_XORI128_UB(out0_r, out1_r);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uni_4t_8multx4_msa(uint8_t *src,
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
out1 = PCKEV_XORI128_UB(out2_r, out3_r);
out2 = PCKEV_XORI128_UB(out4_r, out5_r);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_uni_4t_8multx4mult_msa(uint8_t *src,
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
out0 = PCKEV_XORI128_UB(out0_r, out1_r);
out1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
dst0 += offset_vec;
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out0 = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
- ST4x2_UB(out0, dst, dst_stride);
+ ST_W2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LW4(src, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_SB(tp0, tp1, tp2, tp3, src0);
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
rnd_vec, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST4x4_UB(out0, out0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out0, 0, 1, 2, 3, dst, dst_stride);
} else if (0 == (height % 8)) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LW4(src, src_stride, tp0, tp1, tp2, tp3);
offset_vec, rnd_vec, dst0, dst1,
dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += 8 * dst_stride;
}
}
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST6x4_UB(out0, out1, dst, dst_stride);
+ ST_W2(out0, 0, 2, dst, dst_stride);
+ ST_H2(out0, 2, 6, dst + 4, dst_stride);
+ ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
- ST6x4_UB(out2, out3, dst, dst_stride);
+ ST_W2(out2, 0, 2, dst, dst_stride);
+ ST_H2(out2, 2, 6, dst + 4, dst_stride);
+ ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
}
}
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst0, dst1, weight_vec, offset_vec,
rnd_vec, dst0, dst1);
out0 = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(out0, dst, dst_stride);
+ ST_D2(out0, 0, 1, dst, dst_stride);
} else if (4 == height) {
LD4(src, src_stride, tp0, tp1, tp2, tp3);
INSERT_D2_SB(tp0, tp1, src0);
offset_vec, rnd_vec, dst0, dst1, dst2,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
} else if (6 == height) {
LD4(src, src_stride, tp0, tp1, tp2, tp3);
src += 4 * src_stride;
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
} else if (0 == height % 8) {
for (loop_cnt = (height >> 3); loop_cnt--;) {
LD4(src, src_stride, tp0, tp1, tp2, tp3);
dst6, dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x4_UB(out2, out3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1,
+ dst, dst_stride);
+ dst += (8 * dst_stride);
}
}
}
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST12x4_UB(out0, out1, out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
ST_UB4(out0, out1, out3, out4, dst, dst_stride);
- ST8x4_UB(out2, out5, dst + 16, dst_stride);
+ ST_D4(out2, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
}
}
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B3_UB(dst1, dst0, dst4, dst3, dst5, dst2, out0, out1, out2);
ST_UB2(out0, out1, dst, dst_stride);
- ST8x2_UB(out2, dst + 16, dst_stride);
+ ST_D2(out2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- ST4x4_UB(out2, out2, 0, 1, 2, 3, dst + 8, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
PCKEV_H2_SW(dst0_l, dst0_r, dst1_l, dst1_r, dst0_r, dst1_r);
dst0_r = (v4i32) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
- ST8x2_UB(dst0_r, dst_tmp, dst_stride);
+ ST_D2(dst0_r, 0, 1, dst_tmp, dst_stride);
dst_tmp += (2 * dst_stride);
dst10_r = dst32_r;
CLIP_SW2_0_255_MAX_SATU(dst0_r, dst0_l);
dst0_r = (v4i32) __msa_pckev_h((v8i16) dst0_l, (v8i16) dst0_r);
out = (v16u8) __msa_pckev_b((v16i8) dst0_r, (v16i8) dst0_r);
- ST8x1_UB(out, dst_tmp);
+ ST_D1(out, 0, dst_tmp);
dst_tmp += dst_stride;
dst0 = dst1;
CLIP_SW4_0_255_MAX_SATU(dst0_r, dst1_r, dst2_r, dst3_r);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst1_r);
out = (v16u8) __msa_pckev_b((v16i8) dst1_r, (v16i8) dst0_r);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
dst0 = __msa_adds_s_h(dst0, offset_vec);
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
}
}
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST6x4_UB(out0, out1, dst, dst_stride);
+ ST_W2(out0, 0, 2, dst, dst_stride);
+ ST_H2(out0, 2, 6, dst + 4, dst_stride);
+ ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
- ST6x4_UB(out2, out3, dst, dst_stride);
+ ST_W2(out2, 0, 2, dst, dst_stride);
+ ST_H2(out2, 2, 6, dst + 4, dst_stride);
+ ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x2_msa(uint8_t *src,
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x4_msa(uint8_t *src,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x6_msa(uint8_t *src,
dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hz_uniwgt_4t_8x8multiple_msa(uint8_t *src,
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST12x4_UB(out0, out1, out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
ST_UB2(out0, out1, dst, dst_stride);
- ST8x2_UB(out2, dst + 16, dst_stride);
+ ST_D2(out2, 0, 1, dst + 16, dst_stride);
dst += (2 * dst_stride);
}
}
dst0 = __msa_adds_s_h(dst0, offset_vec);
dst0 = CLIP_SH_0_255_MAX_SATU(dst0);
out = (v16u8) __msa_pckev_b((v16i8) dst0, (v16i8) dst0);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_4x4_msa(uint8_t *src,
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_4x8multiple_msa(uint8_t *src,
dst0, dst1, dst2, dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
src2 = src10;
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST6x4_UB(out0, out1, dst, dst_stride);
+ ST_W2(out0, 0, 2, dst, dst_stride);
+ ST_H2(out0, 2, 6, dst + 4, dst_stride);
+ ST_W2(out1, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out1, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
dst += (4 * dst_stride);
- ST6x4_UB(out2, out3, dst, dst_stride);
+ ST_W2(out2, 0, 2, dst, dst_stride);
+ ST_H2(out2, 2, 6, dst + 4, dst_stride);
+ ST_W2(out3, 0, 2, dst + 2 * dst_stride, dst_stride);
+ ST_H2(out3, 2, 6, dst + 2 * dst_stride + 4, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x2_msa(uint8_t *src,
dst0, dst1);
out = (v16u8) __msa_pckev_b((v16i8) dst1, (v16i8) dst0);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x4_msa(uint8_t *src,
offset_vec, rnd_vec, dst0, dst1, dst2,
dst3);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x6_msa(uint8_t *src,
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec, rnd_vec,
dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_vt_uniwgt_4t_8x8mult_msa(uint8_t *src,
dst7);
PCKEV_B2_UB(dst1, dst0, dst3, dst2, out0, out1);
PCKEV_B2_UB(dst5, dst4, dst7, dst6, out2, out3);
- ST8x8_UB(out0, out1, out2, out3, dst, dst_stride);
+ ST_D8(out0, out1, out2, out3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
src2 = src10;
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst4, dst5, weight_vec, offset_vec,
rnd_vec, dst4, dst5);
PCKEV_B3_UB(dst1, dst0, dst3, dst2, dst5, dst4, out0, out1, out2);
- ST12x4_UB(out0, out1, out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out2, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
ILVRL_B2_SB(src7, src6, src76_r, src76_l);
HEVC_UNIW_RND_CLIP2_MAX_SATU_H(dst10, dst11, weight_vec, offset_vec,
rnd_vec, dst10, dst11);
PCKEV_B3_UB(dst7, dst6, dst9, dst8, dst11, dst10, out3, out4, out5);
- ST12x4_UB(out3, out4, out5, dst, dst_stride);
+ ST_D4(out3, out4, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(out5, 0, 1, 2, 3, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src10;
out2, out3);
PCKEV_B2_UB(dst9, dst8, dst11, dst10, out4, out5);
ST_UB4(out0, out1, out2, out3, dst, dst_stride);
- ST8x4_UB(out4, out5, dst + 16, dst_stride);
+ ST_D4(out4, out5, 0, 1, 0, 1, dst + 16, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
tmp += offset_vec;
tmp = CLIP_SH_0_255_MAX_SATU(tmp);
out = (v16u8) __msa_pckev_b((v16i8) tmp, (v16i8) tmp);
- ST4x2_UB(out, dst, dst_stride);
+ ST_W2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uniwgt_4t_4x4_msa(uint8_t *src,
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_hv_uniwgt_4t_4multx8mult_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
- ST4x8_UB(out0, out1, dst, dst_stride);
- ST2x4_UB(out2, 0, dst + 4, dst_stride);
- dst += 4 * dst_stride;
- ST2x4_UB(out2, 4, dst + 4, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
+ ST_H8(out2, 0, 1, 2, 3, 4, 5, 6, 7, dst + 4, dst_stride);
}
static void hevc_hv_uniwgt_4t_8x2_msa(uint8_t *src,
ADD2(tmp0, offset_vec, tmp1, offset_vec, tmp0, tmp1);
CLIP_SH2_0_255_MAX_SATU(tmp0, tmp1);
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST8x2_UB(out, dst, dst_stride);
+ ST_D2(out, 0, 1, dst, dst_stride);
}
static void hevc_hv_uniwgt_4t_8multx4_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += 8;
}
}
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
CLIP_SH2_0_255_MAX_SATU(tmp4, tmp5);
PCKEV_B3_UB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, out0, out1, out2);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST8x2_UB(out2, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
+ ST_D2(out2, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void hevc_hv_uniwgt_4t_8multx4mult_msa(uint8_t *src,
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst_tmp, dst_stride);
dst_tmp += (4 * dst_stride);
dst10_r = dst54_r;
ADD2(tmp2, offset_vec, tmp3, offset_vec, tmp2, tmp3);
CLIP_SH4_0_255_MAX_SATU(tmp0, tmp1, tmp2, tmp3);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST4x8_UB(out0, out1, dst, dst_stride);
+ ST_W8(out0, out1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
src0 = (v16i8) __msa_ilvr_w((v4i32) src1, (v4i32) src0);
in0 = (v8i16) __msa_ilvr_b(zero, src0);
in0 <<= 6;
- ST8x2_UB(in0, dst, 2 * dst_stride);
+ ST_D2(in0, 0, 1, dst, dst_stride);
} else if (4 == height) {
v16i8 src0, src1, src2, src3;
v8i16 in0, in1;
ILVR_B2_SH(zero, src0, zero, src1, in0, in1);
in0 <<= 6;
in1 <<= 6;
- ST8x4_UB(in0, in1, dst, 2 * dst_stride);
+ ST_D4(in0, in1, 0, 1, 0, 1, dst, dst_stride);
} else if (0 == height % 8) {
v16i8 src0, src1, src2, src3, src4, src5, src6, src7;
v8i16 in0, in1, in2, in3;
ILVR_B4_SH(zero, src0, zero, src1, zero, src2, zero, src3,
in0, in1, in2, in3);
SLLI_4V(in0, in1, in2, in3, 6);
- ST8x8_UB(in0, in1, in2, in3, dst, 2 * dst_stride);
+ ST_D8(in0, in1, in2, in3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
in0 <<= 6;
in1 <<= 6;
ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
- ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+ ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
ILVR_B4_SH(zero, src4, zero, src5, zero, src6, zero, src7,
in0 <<= 6;
in1 <<= 6;
ST_SH4(in0_r, in1_r, in2_r, in3_r, dst, dst_stride);
- ST8x4_UB(in0, in1, dst + 8, 2 * dst_stride);
+ ST_D4(in0, in1, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
DPADD_SB4_SH(vec0, vec1, vec2, vec3, filt0, filt1, filt2, filt3,
dst3, dst3, dst3, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
DPADD_SB4_SH(src8776, src10998, src12111110, src14131312,
filt0, filt1, filt2, filt3, dst76, dst76, dst76, dst76);
- ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+ ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
src2110 = src10998;
dst1_l, dst1_l, dst1_l, dst1_l);
ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
- ST8x4_UB(dst0_l, dst1_l, dst + 8, 2 * dst_stride);
+ ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
int32_t height)
{
uint32_t loop_cnt;
- int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v8i16 filt0, filt1, filt2, filt3;
v8i16 filt_h0, filt_h1, filt_h2, filt_h3;
filt_h0, filt_h1, filt_h2, filt_h3);
SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
- ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
+ ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
int32_t height)
{
uint32_t loop_cnt;
- int32_t dst_stride_in_bytes = 2 * dst_stride;
uint8_t *src_tmp;
int16_t *dst_tmp;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
filt_h1, filt_h2, filt_h3);
SRA_4V(dst0_r, dst1_r, dst2_r, dst3_r, 6);
PCKEV_H2_SW(dst1_r, dst0_r, dst3_r, dst2_r, dst0_r, dst2_r);
- ST8x4_UB(dst0_r, dst2_r, dst, dst_stride_in_bytes);
+ ST_D4(dst0_r, dst2_r, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
dst10_r = dst54_r;
dst0 = const_vec;
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst0, dst0);
- ST8x2_UB(dst0, dst, 2 * dst_stride);
+ ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hz_4t_4x4_msa(uint8_t *src,
dst1 = const_vec;
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst1, dst1);
- ST8x4_UB(dst0, dst1, dst, 2 * dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_hz_4t_4x8multiple_msa(uint8_t *src,
dst3 = const_vec;
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst3, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
}
}
DPADD_SB2_SH(vec0, vec1, filt0, filt1, dst5, dst5);
ST_SH4(dst0, dst1, dst2, dst3, dst, dst_stride);
- ST8x4_UB(dst4, dst5, dst + 8, 2 * dst_stride);
+ ST_D4(dst4, dst5, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
}
}
dst10 = const_vec;
DPADD_SB2_SH(src2110, src4332, filt0, filt1, dst10, dst10);
- ST8x2_UB(dst10, dst, 2 * dst_stride);
+ ST_D2(dst10, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4x4_msa(uint8_t *src,
dst32 = const_vec;
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
- ST8x4_UB(dst10, dst32, dst, 2 * dst_stride);
+ ST_D4(dst10, dst32, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4x8_msa(uint8_t *src,
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
- ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
- dst += (8 * dst_stride);
+ ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4x16_msa(uint8_t *src, int32_t src_stride,
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
- ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
+ ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
src2 = src10;
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst32, dst32);
DPADD_SB2_SH(src6554, src8776, filt0, filt1, dst54, dst54);
DPADD_SB2_SH(src8776, src10998, filt0, filt1, dst76, dst76);
- ST8x8_UB(dst10, dst32, dst54, dst76, dst, 2 * dst_stride);
- dst += (8 * dst_stride);
+ ST_D8(dst10, dst32, dst54, dst76, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_vt_4t_4w_msa(uint8_t *src,
DPADD_SB2_SH(src4332, src6554, filt0, filt1, dst1_l, dst1_l);
ST_SH4(dst0_r, dst1_r, dst2_r, dst3_r, dst, dst_stride);
- ST8x4_UB(dst0_l, dst1_l, dst + 8, (2 * dst_stride));
+ ST_D4(dst0_l, dst1_l, 0, 1, 0, 1, dst + 8, dst_stride);
dst += (4 * dst_stride);
src2 = src6;
const int8_t *filter_x,
const int8_t *filter_y)
{
- int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4;
v8i16 filt0, filt1;
v8i16 filt_h0, filt_h1;
dst0 >>= 6;
dst1 >>= 6;
dst0 = (v4i32) __msa_pckev_h((v8i16) dst1, (v8i16) dst0);
- ST8x2_UB(dst0, dst, dst_stride_in_bytes);
+ ST_D2(dst0, 0, 1, dst, dst_stride);
}
static void hevc_hv_4t_4x4_msa(uint8_t *src,
const int8_t *filter_x,
const int8_t *filter_y)
{
- int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4, src5, src6;
v8i16 filt0, filt1;
v8i16 filt_h0, filt_h1;
dst3 = HEVC_FILT_4TAP(dst43, dst65, filt_h0, filt_h1);
SRA_4V(dst0, dst1, dst2, dst3, 6);
PCKEV_H2_SW(dst1, dst0, dst3, dst2, dst0, dst2);
- ST8x4_UB(dst0, dst2, dst, dst_stride_in_bytes);
+ ST_D4(dst0, dst2, 0, 1, 0, 1, dst, dst_stride);
}
SRA_4V(dst4, dst5, dst6, dst7, 6);
PCKEV_H4_SW(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
dst0, dst1, dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, 2 * dst_stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
const int8_t *filter_y,
int32_t height)
{
- int32_t dst_stride_in_bytes = 2 * dst_stride;
v16i8 src0, src1, src2, src3, src4, src5, src6, src7, src8, src9, src10;
v8i16 filt0, filt1;
v8i16 filt_h0, filt_h1;
PCKEV_H2_SH(dst1_r, dst0_r, dst3_r, dst2_r, tmp0, tmp1);
PCKEV_H2_SH(dst5_r, dst4_r, dst7_r, dst6_r, tmp2, tmp3);
PCKEV_H2_SH(dst1_l, dst0_l, dst3_l, dst2_l, tmp4, tmp5);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride_in_bytes);
- ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(tmp4, 0, 1, 2, 3, dst + 4, dst_stride);
dst += 4 * dst_stride;
- ST8x4_UB(tmp2, tmp3, dst, dst_stride_in_bytes);
- ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, dst + 4, dst_stride_in_bytes);
+ ST_D4(tmp2, tmp3, 0, 1, 0, 1, dst, dst_stride);
+ ST_W4(tmp5, 0, 1, 2, 3, dst + 4, dst_stride);
}
static void hevc_hv_4t_8x2_msa(uint8_t *src,
SRA_4V(tmp4, tmp5, tmp6, tmp7, 6);
PCKEV_H4_SW(tmp1, tmp0, tmp3, tmp2, tmp5, tmp4, tmp7, tmp6, tmp0, tmp1,
tmp2, tmp3);
- ST8x8_UB(tmp0, tmp1, tmp2, tmp3, dst, 2 * dst_stride);
+ ST_D8(tmp0, tmp1, tmp2, tmp3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += (8 * dst_stride);
dst10_r = dst98_r;
PCKEV_D2_SH(res1, res0, res3, res2, res0, res1);
SRARI_H2_SH(res0, res1, 3);
src_vec0 = __msa_pckev_b((v16i8) res1, (v16i8) res0);
- ST4x4_UB(src_vec0, src_vec0, 0, 1, 2, 3, dst, stride);
+ ST_W4(src_vec0, 0, 1, 2, 3, dst, stride);
}
static void hevc_intra_pred_plane_8x8_msa(const uint8_t *src_top,
PCKEV_B4_SB(res1, res0, res3, res2, res5, res4, res7, res6,
src_vec0, src_vec1, src_vec2, src_vec3);
- ST8x8_UB(src_vec0, src_vec1, src_vec2, src_vec3, dst, stride);
+ ST_D8(src_vec0, src_vec1, src_vec2, src_vec3, 0, 1, 0, 1,
+ 0, 1, 0, 1, dst, stride);
}
static void hevc_intra_pred_plane_16x16_msa(const uint8_t *src_top,
SRARI_H2_SH(diff1, diff3, 5);
dst_val0 = __msa_pckev_b((v16i8) diff3, (v16i8) diff1);
- ST4x4_UB(dst_val0, dst_val0, 0, 1, 2, 3, dst, stride);
+ ST_W4(dst_val0, 0, 1, 2, 3, dst, stride);
}
static void hevc_intra_pred_angular_upper_8width_msa(const uint8_t *src_top,
SRARI_H4_SH(diff1, diff3, diff5, diff7, 5);
PCKEV_B2_UB(diff3, diff1, diff7, diff5, dst_val0, dst_val1);
- ST8x4_UB(dst_val0, dst_val1, dst, stride);
+ ST_D4(dst_val0, dst_val1, 0, 1, 0, 1, dst, stride);
dst += (4 * stride);
}
}
dst_val0 = __msa_pckev_b((v16i8) diff2, (v16i8) diff2);
dst_val1 = __msa_pckod_b((v16i8) diff2, (v16i8) diff2);
- ST4x2_UB(dst_val0, dst, stride);
- dst += (2 * stride);
- ST4x2_UB(dst_val1, dst, stride);
+ ST_W2(dst_val0, 0, 1, dst, stride);
+ ST_W2(dst_val1, 0, 1, dst + 2 * stride, stride);
}
static void hevc_intra_pred_angular_lower_8width_msa(const uint8_t *src_top,
dst_val0, dst_val1, dst_val2, dst_val3);
ILVR_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff0, diff1);
ILVRL_H2_SH(diff1, diff0, diff3, diff4);
- ST4x8_UB(diff3, diff4, dst_org, stride);
+ ST_W8(diff3, diff4, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst += 4;
}
}
ILVL_B2_SH(dst_val1, dst_val0, dst_val3, dst_val2, diff2, diff3);
ILVRL_H2_SH(diff1, diff0, diff4, diff5);
ILVRL_H2_SH(diff3, diff2, diff6, diff7);
- ST4x8_UB(diff4, diff5, dst_org, stride);
+ ST_W8(diff4, diff5, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst_org += (8 * stride);
- ST4x8_UB(diff6, diff7, dst_org, stride);
+ ST_W8(diff6, diff7, 0, 1, 2, 3, 0, 1, 2, 3, dst_org, stride);
dst += 4;
}
}
ILVRL_B2_SH(dst_val2, dst_val0, diff0, diff1);
ILVRL_B2_SH(dst_val3, dst_val1, diff2, diff3);
- ST2x4_UB(diff0, 0, dst_org, stride);
- dst_org += (4 * stride);
- ST2x4_UB(diff0, 4, dst_org, stride);
- dst_org += (4 * stride);
- ST2x4_UB(diff1, 0, dst_org, stride);
- dst_org += (4 * stride);
- ST2x4_UB(diff1, 4, dst_org, stride);
- dst_org += (4 * stride);
-
- ST2x4_UB(diff2, 0, dst_org, stride);
- dst_org += (4 * stride);
- ST2x4_UB(diff2, 4, dst_org, stride);
- dst_org += (4 * stride);
- ST2x4_UB(diff3, 0, dst_org, stride);
- dst_org += (4 * stride);
- ST2x4_UB(diff3, 4, dst_org, stride);
- dst_org += (4 * stride);
+ ST_H8(diff0, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
+ dst_org += (8 * stride);
+ ST_H8(diff1, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
+ dst_org += (8 * stride);
+ ST_H8(diff2, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
+ dst_org += (8 * stride);
+ ST_H8(diff3, 0, 1, 2, 3, 4, 5, 6, 7, dst_org, stride);
+ dst_org += (8 * stride);
dst += 2;
}
PCKEV_B2_UB(in2, in1, in4, in3, tmp0_m, tmp1_m); \
PCKEV_D2_UB(dst1, dst0, dst3, dst2, tmp2_m, tmp3_m); \
AVER_UB2_UB(tmp0_m, tmp2_m, tmp1_m, tmp3_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
}
static void common_hz_bil_4w_msa(const uint8_t *src, int32_t src_stride,
sum0, sum1, sum2, sum3);
SRARI_H4_UH(sum0, sum1, sum2, sum3, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, src0, src1);
- ST8x4_UB(src0, src1, dst, dst_stride);
+ ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src0 = src4;
}
SRA_4V(sum0, sum1, sum2, sum3, 2);
SRA_4V(sum4, sum5, sum6, sum7, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
PCKEV_B2_SB(sum5, sum4, sum7, sum6, out0, out1);
- ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void common_hv_bil_no_rnd_4x8_msa(const uint8_t *src, int32_t src_stride,
SRA_4V(sum0, sum1, sum2, sum3, 2);
PCKEV_B2_SB(sum1, sum0, sum3, sum2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_bil_no_rnd_16x16_msa(const uint8_t *src,
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3,
mask0, mask1, mask2, mask3,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
inp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
res0 = __msa_ave_u_b(inp0, res0);
res1 = __msa_ave_u_b(inp2, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
inp2 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
res0 = __msa_ave_u_b(inp0, res0);
res1 = __msa_ave_u_b(inp2, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(inp0, res0, inp2, res1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
tmp0 = (v16u8) __msa_insve_d((v2i64) inp0, 1, (v2i64) inp1);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp0 = (v16u8) __msa_insve_d((v2i64) inp4, 1, (v2i64) inp5);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_aver_src0_16x16_msa(const uint8_t *src,
inp3, inp2, inp1, inp0,
inp4, inp5, inp6, inp7,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
inp7, inp6, inp5, inp4,
inp8, inp8, inp7, inp6,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_16x16_msa(const uint8_t *src,
tmp0 = (v16u8) __msa_insve_d((v2i64) inp1, 1, (v2i64) inp2);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp0 = (v16u8) __msa_insve_d((v2i64) inp5, 1, (v2i64) inp6);
tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_aver_src1_16x16_msa(const uint8_t *src,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp2, 1, (v2i64) inp3);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp6, 1, (v2i64) inp7);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_no_rnd_aver_src0_16x16_msa(const uint8_t *src,
inp3, inp2, inp1, inp0,
inp4, inp5, inp6, inp7,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
inp7, inp6, inp5, inp4,
inp8, inp8, inp7, inp6,
const20, const6, const3);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp3, 1, (v2i64) inp4);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
inp8 = LD_UB(src);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(inp4, inp3, inp2, inp1,
tmp1 = (v16u8) __msa_insve_d((v2i64) inp7, 1, (v2i64) inp8);
res0 = __msa_ave_u_b(res0, tmp0);
res1 = __msa_ave_u_b(res1, tmp1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
static void vert_mc_qpel_no_rnd_aver_src1_16x16_msa(const uint8_t *src,
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
inp8 = LD_UB(src);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void vert_mc_qpel_avg_dst_aver_src0_16x16_msa(const uint8_t *src,
dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
inp8 = LD_UB(src);
dst0 = (v16u8) __msa_insve_d((v2i64) dst0, 1, (v2i64) dst1);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void vert_mc_qpel_avg_dst_16x16_msa(const uint8_t *src,
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
inp8 = LD_UB(src);
dst2 = (v16u8) __msa_insve_d((v2i64) dst2, 1, (v2i64) dst3);
AVER_UB2_UB(res0, tmp0, res1, tmp1, res0, res1);
AVER_UB2_UB(dst0, res0, dst2, res1, res0, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void vert_mc_qpel_avg_dst_aver_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_horiz_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_ave_u_b(avg1, res1);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_horiz_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz3, (v2i64) horiz2);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_h_src0_16x16_msa(const uint8_t *src,
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
- dst += 2 * dst_stride;
-
- ST8x2_UB(res0, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D4(res1, res0, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz8, horiz8, horiz7,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_16x16_msa(const uint8_t *src,
const20, const6, const3);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_h_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_hv_src01_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_v_src1_16x16_msa(const uint8_t *src,
res0 = __msa_ave_u_b(avg0, res0);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
horiz6 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE(inp2, inp3, mask0, mask1,
horiz8 = APPLY_HORIZ_QPEL_NO_ROUND_FILTER_8BYTE_1ROW(inp0, mask0, mask1,
mask2, mask3, const20,
const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res0 = APPLY_VERT_QPEL_NO_ROUND_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_no_rnd_aver_hv_src11_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz2, (v2i64) horiz1);
res0 = __msa_ave_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz4, (v2i64) horiz3);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_ave_u_b(avg0, res0);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_ave_u_b(avg1, res1);
- ST8x4_UB(res0, res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_horiz_src0_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res0 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0, mask0, mask1, mask2, mask3,
const20, const6, const3);
horiz8 = __msa_aver_u_b(inp0, res0);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_horiz_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz1, (v2i64) horiz0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_horiz_src1_16x16_msa(const uint8_t *src,
horiz3, horiz2, horiz1, horiz0,
horiz4, horiz5, horiz6, horiz7,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
inp0 = LD_UB(src);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz5, (v2i64) horiz4);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz7, (v2i64) horiz6);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_h_src0_16x16_msa(const uint8_t *src,
horiz1, horiz0, horiz0, horiz1,
horiz2, horiz3, horiz4, horiz5,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_16x16_msa(const uint8_t *src,
horiz1, horiz0, horiz0, horiz1,
horiz2, horiz3, horiz4, horiz5,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz8 = APPLY_HORIZ_QPEL_FILTER_8BYTE_1ROW(inp0,
mask0, mask1, mask2, mask3,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_h_src1_16x16_msa(const uint8_t *src,
horiz1, horiz0, horiz0, horiz1,
horiz2, horiz3, horiz4, horiz5,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += (2 * dst_stride);
-
res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz2, horiz1, horiz0, horiz0,
horiz3, horiz4, horiz5, horiz6,
horiz3, horiz2, horiz1, horiz0,
horiz4, horiz5, horiz6, horiz7,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ dst += (4 * dst_stride);
res0 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz4, horiz3, horiz2, horiz1,
horiz5, horiz6, horiz7, horiz8,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += (2 * dst_stride);
-
res1 = APPLY_VERT_QPEL_FILTER_8BYTE(horiz6, horiz5, horiz4, horiz3,
horiz7, horiz8, horiz8, horiz7,
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_hv_src01_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_v_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_insve_d((v2i64) horiz1, 1, (v2i64) horiz2);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_insve_d((v2i64) horiz5, 1, (v2i64) horiz6);
res0 = __msa_aver_u_b(avg0, res0);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
avg1 = (v16u8) __msa_insve_d((v2i64) horiz7, 1, (v2i64) horiz8);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_aver_hv_src11_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
LD_UB2(src, src_stride, inp2, inp3);
src += (2 * src_stride);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
res1 = APPLY_HORIZ_QPEL_FILTER_8BYTE(inp2, inp3, mask0, mask1, mask2, mask3,
horiz5, horiz4, horiz3, horiz2,
horiz6, horiz7, horiz8, horiz8,
const20, const6, const3);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += 2 * dst_stride;
avg0 = (v16u8) __msa_ilvr_d((v2i64) horiz6, (v2i64) horiz5);
horiz7, horiz6, horiz5, horiz4,
horiz8, horiz8, horiz7, horiz6,
const20, const6, const3);
- ST8x2_UB(res0, dst, dst_stride);
- dst += 2 * dst_stride;
-
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D4(res0, res1, 0, 1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src00_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_v_src0_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src10_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_h_src0_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
- dst += (2 * dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_16x16_msa(const uint8_t *src, int32_t src_stride,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_h_src1_16x16_msa(const uint8_t *src,
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
const20, const6, const3);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
const20, const6, const3);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src01_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
LD_UB2(dst, dst_stride, dst0, dst1);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
LD_UB2(dst, dst_stride, dst0, dst1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_v_src1_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
LD_UB2(dst, dst_stride, dst0, dst1);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
avg1 = (v16u8) __msa_ilvr_d((v2i64) horiz8, (v2i64) horiz7);
LD_UB2(dst, dst_stride, dst0, dst1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void hv_mc_qpel_avg_dst_aver_hv_src11_16x16_msa(const uint8_t *src,
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(src, src_stride, inp2, inp3);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
inp0 = LD_UB(src);
res0 = __msa_aver_u_b(avg0, res0);
avg0 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res0 = __msa_aver_u_b(avg0, res0);
- ST8x2_UB(res0, dst, dst_stride);
+ ST_D2(res0, 0, 1, dst, dst_stride);
dst += (2 * dst_stride);
LD_UB2(dst, dst_stride, dst0, dst1);
res1 = __msa_aver_u_b(avg1, res1);
avg1 = (v16u8) __msa_ilvr_d((v2i64) dst1, (v2i64) dst0);
res1 = __msa_aver_u_b(avg1, res1);
- ST8x2_UB(res1, dst, dst_stride);
+ ST_D2(res1, 0, 1, dst, dst_stride);
}
static void copy_8x8_msa(const uint8_t *src, int32_t src_stride,
v4i32 cnst8w = {8, 8, 8, 8};
v4i32 cnst2048w = {2048, 2048, 2048, 2048};
v4i32 cnst128w = {128, 128, 128, 128};
- int nstride = stride;
/* Extended input data */
LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
/* Final sequence of operations over-write original dst */
- ST8x1_UB(d0, dst);
- ST8x1_UB(d1, dst + nstride);
- nstride += stride;
- ST8x1_UB(d2, dst + nstride);
- nstride += stride;
- ST8x1_UB(d3, dst + nstride);
- nstride += stride;
- ST8x1_UB(d4, dst + nstride);
- nstride += stride;
- ST8x1_UB(d5, dst + nstride);
- nstride += stride;
- ST8x1_UB(d6, dst + nstride);
- nstride += stride;
- ST8x1_UB(d7, dst + nstride);
+ ST_D1(d0, 0, dst);
+ ST_D1(d1, 0, dst + stride);
+ ST_D1(d2, 0, dst + 2 * stride);
+ ST_D1(d3, 0, dst + 3 * stride);
+ ST_D1(d4, 0, dst + 4 * stride);
+ ST_D1(d5, 0, dst + 5 * stride);
+ ST_D1(d6, 0, dst + 6 * stride);
+ ST_D1(d7, 0, dst + 7 * stride);
}
void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
v16i8 zero = {0};
- int nstride = line_size;
LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
/* Final sequence of operations over-write original dst */
- ST8x1_UB(d0, dest);
- ST8x1_UB(d1, dest + nstride);
- nstride += line_size;
- ST8x1_UB(d2, dest + nstride);
- nstride += line_size;
- ST8x1_UB(d3, dest + nstride);
- nstride += line_size;
- ST8x1_UB(d4, dest + nstride);
- nstride += line_size;
- ST8x1_UB(d5, dest + nstride);
- nstride += line_size;
- ST8x1_UB(d6, dest + nstride);
- nstride += line_size;
- ST8x1_UB(d7, dest + nstride);
+ ST_D1(d0, 0, dest);
+ ST_D1(d1, 0, dest + line_size);
+ ST_D1(d2, 0, dest + 2 * line_size);
+ ST_D1(d3, 0, dest + 3 * line_size);
+ ST_D1(d4, 0, dest + 4 * line_size);
+ ST_D1(d5, 0, dest + 5 * line_size);
+ ST_D1(d6, 0, dest + 6 * line_size);
+ ST_D1(d7, 0, dest + 7 * line_size);
block[0] = 0;
}
VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
/* Final move to first_pixel */
- ST8x1_UB(d1, first_pixel + nstride);
- ST8x1_UB(d2, first_pixel);
+ ST_D1(d1, 0, first_pixel + nstride);
+ ST_D1(d2, 0, first_pixel);
}
void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
g1 = CLIP_SW_0_255(g1);
VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
/* Final move to first_pixel */
- ST2x4_UB(d1, 0, first_pixel - 1, stride);
- ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride);
+ ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
+ ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
}
void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
t3 = t3 + (v4u32)f2;
- ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride);
- ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride);
- ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride);
- ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride);
+ ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
+ ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
} else {
int i;
res2 = CLIP_SW_0_255(res2);
res3 = CLIP_SW_0_255(res3);
VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
- ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+ ST_W2(dest0, 0, 1, dst, stride);
+ ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
memset(input, 0, 4 * 4 * sizeof(*input));
}
ADD4(res0, vec, res1, vec, res2, vec, res3, vec, res0, res1, res2, res3);
CLIP_SH4_0_255(res0, res1, res2, res3);
VSHF_B2_SB(res0, res1, res2, res3, mask, mask, dest0, dest1);
- ST4x4_UB(dest0, dest1, 0, 1, 0, 1, dst, stride);
+ ST_W2(dest0, 0, 1, dst, stride);
+ ST_W2(dest1, 0, 1, dst + 2 * stride, stride);
in_dc[0] = 0;
}
ILVRL_B2_SH(q0, p0, tmp1, tmp0);
src -= 1;
- ST2x4_UB(tmp1, 0, src, pitch);
- src += 4 * pitch;
- ST2x4_UB(tmp1, 4, src, pitch);
- src += 4 * pitch;
- ST2x4_UB(tmp0, 0, src, pitch);
- src += 4 * pitch;
- ST2x4_UB(tmp0, 4, src, pitch);
- src += 4 * pitch;
+ ST_H8(tmp1, 0, 1, 2, 3, 4, 5, 6, 7, src, pitch);
+ ST_H8(tmp0, 0, 1, 2, 3, 4, 5, 6, 7, src + 8 * pitch, pitch);
}
void ff_vp8_v_loop_filter8uv_inner_msa(uint8_t *src_u, uint8_t *src_v,
ptrdiff_t pitch, int b_limit_in,
int limit_in, int thresh_in)
{
- uint8_t *temp_src_u, *temp_src_v;
v16u8 p3, p2, p1, p0, q3, q2, q1, q0;
v16u8 mask, hev, flat, thresh, limit, b_limit;
v16u8 row0, row1, row2, row3, row4, row5, row6, row7, row8;
tmp1 = (v4i32) __msa_ilvl_b((v16i8) q1, (v16i8) q0);
ILVRL_H2_SW(tmp1, tmp0, tmp4, tmp5);
- temp_src_u = src_u - 2;
- ST4x4_UB(tmp2, tmp2, 0, 1, 2, 3, temp_src_u, pitch);
- temp_src_u += 4 * pitch;
- ST4x4_UB(tmp3, tmp3, 0, 1, 2, 3, temp_src_u, pitch);
-
- temp_src_v = src_v - 2;
- ST4x4_UB(tmp4, tmp4, 0, 1, 2, 3, temp_src_v, pitch);
- temp_src_v += 4 * pitch;
- ST4x4_UB(tmp5, tmp5, 0, 1, 2, 3, temp_src_v, pitch);
+ ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src_u - 2, pitch);
+ ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src_v - 2, pitch);
}
void ff_vp8_v_loop_filter16_inner_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(tmp1, tmp0, tmp4, tmp5);
src -= 2;
- ST4x8_UB(tmp2, tmp3, src, pitch);
- src += (8 * pitch);
- ST4x8_UB(tmp4, tmp5, src, pitch);
+ ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
}
SRARI_H2_SH(out0, out1, 7);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
for (loop_cnt = (height >> 2) - 1; loop_cnt--;) {
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SRARI_H2_SH(out10, out32, 7);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src76_r;
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out3 = hz_out7;
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(vec0, vec1, dst, dst_stride);
+ ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
hz_out4 = hz_out8;
SRARI_H2_SH(out0, out1, 7);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7);
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SRARI_H2_SH(out10, out32, 7);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src98_r;
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out1 = hz_out5;
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
vec0 = vec4;
SAT_SH2_SH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
XORI_B2_128_UB(res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
dst += (4 * dst_stride);
hz_out1 = hz_out5;
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
out0 = PCKEV_XORI128_UB(tmp0, tmp1);
out1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out3 = hz_out7;
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(vec0, vec1, dst, dst_stride);
+ ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
hz_out4 = hz_out8;
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, 7);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
+ ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
+ ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
- ST8x4_UB(src0, src1, dst, dst_stride);
+ ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
if (16 == height) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
}
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
}
static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+ ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
src0 = src8;
}
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride,
SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
+ ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
+ ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride,
SRARI_H2_UH(tmp3, tmp4, 7);
SAT_UH2_UH(tmp3, tmp4, 7);
PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
}
}
res0_m, res1_m, res2_m, res3_m); \
CLIP_SH4_0_255(res0_m, res1_m, res2_m, res3_m); \
PCKEV_B2_SB(res1_m, res0_m, res3_m, res2_m, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, dst_m, dst_stride); \
+ ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, dst_m, dst_stride); \
}
#define VP9_IDCT4x4(in0, in1, in2, in3, out0, out1, out2, out3) \
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
- LD4x4_SH(input, in0, in1, in2, in3);
+ in0 = LD_SH(input);
+ in2 = LD_SH(input + 8);
+ in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
+ in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* rows */
VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
- LD4x4_SH(input, in0, in1, in2, in3);
+ in0 = LD_SH(input);
+ in2 = LD_SH(input + 8);
+ in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
+ in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* rows */
VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
- LD4x4_SH(input, in0, in1, in2, in3);
+ in0 = LD_SH(input);
+ in2 = LD_SH(input + 8);
+ in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
+ in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* cols */
VP9_IADST4x4(in0, in1, in2, in3, in0, in1, in2, in3);
v8i16 zero = { 0 };
/* load vector elements of 4x4 block */
- LD4x4_SH(input, in0, in1, in2, in3);
+ in0 = LD_SH(input);
+ in2 = LD_SH(input + 8);
+ in1 = (v8i16) __msa_ilvl_d((v2i64) in0, (v2i64) in0);
+ in3 = (v8i16) __msa_ilvl_d((v2i64) in2, (v2i64) in2);
ST_SH2(zero, zero, input, 8);
/* cols */
VP9_IDCT4x4(in0, in1, in2, in3, in0, in1, in2, in3);
res0 += out0;
res0 = CLIP_SH_0_255(res0);
res0 = (v8i16) __msa_pckev_b((v16i8) res0, (v16i8) res0);
- ST8x1_UB(res0, dst);
+ ST_D1(res0, 0, dst);
res7 = (v8i16) __msa_ilvr_b((v16i8) zero, (v16i8) dst7);
res7 += out7;
res7 = CLIP_SH_0_255(res7);
res7 = (v8i16) __msa_pckev_b((v16i8) res7, (v16i8) res7);
- ST8x1_UB(res7, dst + 7 * dst_stride);
+ ST_D1(res7, 0, dst + 7 * dst_stride);
cnst1 = __msa_fill_h(cospi_24_64);
cnst0 = __msa_fill_h(cospi_8_64);
ADD2(res1, out1, res6, out6, res1, res6);
CLIP_SH2_0_255(res1, res6);
PCKEV_B2_SH(res1, res1, res6, res6, res1, res6);
- ST8x1_UB(res1, dst + dst_stride);
- ST8x1_UB(res6, dst + 6 * dst_stride);
+ ST_D1(res1, 0, dst + dst_stride);
+ ST_D1(res6, 0, dst + 6 * dst_stride);
cnst0 = __msa_fill_h(cospi_16_64);
cnst1 = -cnst0;
ADD2(res3, out3, res4, out4, res3, res4);
CLIP_SH2_0_255(res3, res4);
PCKEV_B2_SH(res3, res3, res4, res4, res3, res4);
- ST8x1_UB(res3, dst + 3 * dst_stride);
- ST8x1_UB(res4, dst + 4 * dst_stride);
+ ST_D1(res3, 0, dst + 3 * dst_stride);
+ ST_D1(res4, 0, dst + 4 * dst_stride);
out2 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst0);
out5 = VP9_DOT_SHIFT_RIGHT_PCK_H(temp2, temp3, cnst1);
ADD2(res2, out2, res5, out5, res2, res5);
CLIP_SH2_0_255(res2, res5);
PCKEV_B2_SH(res2, res2, res5, res5, res2, res5);
- ST8x1_UB(res2, dst + 2 * dst_stride);
- ST8x1_UB(res5, dst + 5 * dst_stride);
+ ST_D1(res2, 0, dst + 2 * dst_stride);
+ ST_D1(res5, 0, dst + 5 * dst_stride);
}
static void vp9_iadst_idct_8x8_add_msa(int16_t *input, uint8_t *dst,
ADD2(res0, out0, res1, out1, res0, res1);
CLIP_SH2_0_255(res0, res1);
PCKEV_B2_SH(res0, res0, res1, res1, res0, res1);
- ST8x1_UB(res0, dst);
- ST8x1_UB(res1, dst + 15 * dst_stride);
+ ST_D1(res0, 0, dst);
+ ST_D1(res1, 0, dst + 15 * dst_stride);
k0 = VP9_SET_COSPI_PAIR(cospi_12_64, cospi_20_64);
k1 = VP9_SET_COSPI_PAIR(-cospi_20_64, cospi_12_64);
ADD2(res8, out8, res9, out9, res8, res9);
CLIP_SH2_0_255(res8, res9);
PCKEV_B2_SH(res8, res8, res9, res9, res8, res9);
- ST8x1_UB(res8, dst + dst_stride);
- ST8x1_UB(res9, dst + 14 * dst_stride);
+ ST_D1(res8, 0, dst + dst_stride);
+ ST_D1(res9, 0, dst + 14 * dst_stride);
k0 = VP9_SET_COSPI_PAIR(cospi_8_64, cospi_24_64);
k1 = VP9_SET_COSPI_PAIR(cospi_24_64, -cospi_8_64);
ADD2(res4, out4, res5, out5, res4, res5);
CLIP_SH2_0_255(res4, res5);
PCKEV_B2_SH(res4, res4, res5, res5, res4, res5);
- ST8x1_UB(res4, dst + 3 * dst_stride);
- ST8x1_UB(res5, dst + 12 * dst_stride);
+ ST_D1(res4, 0, dst + 3 * dst_stride);
+ ST_D1(res5, 0, dst + 12 * dst_stride);
VP9_MADD_BF(h1, h3, h5, h7, k0, k1, k2, k0, out12, out14, out13, out15);
out13 = -out13;
ADD2(res12, out12, res13, out13, res12, res13);
CLIP_SH2_0_255(res12, res13);
PCKEV_B2_SH(res12, res12, res13, res13, res12, res13);
- ST8x1_UB(res12, dst + 2 * dst_stride);
- ST8x1_UB(res13, dst + 13 * dst_stride);
+ ST_D1(res12, 0, dst + 2 * dst_stride);
+ ST_D1(res13, 0, dst + 13 * dst_stride);
k0 = VP9_SET_COSPI_PAIR(cospi_16_64, cospi_16_64);
k3 = VP9_SET_COSPI_PAIR(-cospi_16_64, cospi_16_64);
ADD2(res6, out6, res7, out7, res6, res7);
CLIP_SH2_0_255(res6, res7);
PCKEV_B2_SH(res6, res6, res7, res7, res6, res7);
- ST8x1_UB(res6, dst + 4 * dst_stride);
- ST8x1_UB(res7, dst + 11 * dst_stride);
+ ST_D1(res6, 0, dst + 4 * dst_stride);
+ ST_D1(res7, 0, dst + 11 * dst_stride);
VP9_MADD_SHORT(out10, out11, k0, k3, out10, out11);
SRARI_H2_SH(out10, out11, 6);
ADD2(res10, out10, res11, out11, res10, res11);
CLIP_SH2_0_255(res10, res11);
PCKEV_B2_SH(res10, res10, res11, res11, res10, res11);
- ST8x1_UB(res10, dst + 6 * dst_stride);
- ST8x1_UB(res11, dst + 9 * dst_stride);
+ ST_D1(res10, 0, dst + 6 * dst_stride);
+ ST_D1(res11, 0, dst + 9 * dst_stride);
k1 = VP9_SET_COSPI_PAIR(-cospi_16_64, -cospi_16_64);
k2 = VP9_SET_COSPI_PAIR(cospi_16_64, -cospi_16_64);
ADD2(res2, out2, res3, out3, res2, res3);
CLIP_SH2_0_255(res2, res3);
PCKEV_B2_SH(res2, res2, res3, res3, res2, res3);
- ST8x1_UB(res2, dst + 7 * dst_stride);
- ST8x1_UB(res3, dst + 8 * dst_stride);
+ ST_D1(res2, 0, dst + 7 * dst_stride);
+ ST_D1(res3, 0, dst + 8 * dst_stride);
VP9_MADD_SHORT(out14, out15, k1, k2, out14, out15);
SRARI_H2_SH(out14, out15, 6);
ADD2(res14, out14, res15, out15, res14, res15);
CLIP_SH2_0_255(res14, res15);
PCKEV_B2_SH(res14, res14, res15, res15, res14, res15);
- ST8x1_UB(res14, dst + 5 * dst_stride);
- ST8x1_UB(res15, dst + 10 * dst_stride);
+ ST_D1(res14, 0, dst + 5 * dst_stride);
+ ST_D1(res15, 0, dst + 10 * dst_stride);
}
static void vp9_iadst16x16_colcol_addblk_msa(int16_t *input, uint8_t *dst,
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
- ST4x4_UB(tmp0, tmp1, 0, 2, 0, 2, dst, dst_stride);
+ ST_W2(tmp0, 0, 2, dst, dst_stride);
+ ST_W2(tmp1, 0, 2, dst + 2 * dst_stride, dst_stride);
}
void ff_tm_8x8_msa(uint8_t *dst, ptrdiff_t dst_stride,
IPRED_SUBS_UH2_UH(src_top_left, src_top_left, vec2, vec3);
SAT_UH4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, tmp0, tmp1);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
}
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
src -= 2;
- ST4x8_UB(tmp2, tmp3, src, pitch);
- src += (8 * pitch);
- ST4x8_UB(tmp4, tmp5, src, pitch);
+ ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
}
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
src -= 3;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 0, src + 4, pitch);
+ ST_W4(vec2, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 4, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
}
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
p0_l);
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
if (__msa_test_bz_v(flat)) {
ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
return 1;
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
return 1;
} else {
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
- ST8x1_UB(p6, src);
+ ST_D1(p6, 0, src);
src += 16;
/* p5 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
- ST8x1_UB(p5, src);
+ ST_D1(p5, 0, src);
src += 16;
/* p4 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
- ST8x1_UB(p4, src);
+ ST_D1(p4, 0, src);
src += 16;
/* p3 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
- ST8x1_UB(p3, src);
+ ST_D1(p3, 0, src);
src += 16;
/* p2 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* p1 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* p0 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q0 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q1 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q2 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q3 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
- ST8x1_UB(q3, src);
+ ST_D1(q3, 0, src);
src += 16;
/* q4 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
- ST8x1_UB(q4, src);
+ ST_D1(q4, 0, src);
src += 16;
/* q5 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
- ST8x1_UB(q5, src);
+ ST_D1(q5, 0, src);
src += 16;
/* q6 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
- ST8x1_UB(q6, src);
+ ST_D1(q6, 0, src);
return 0;
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src_org -= 2;
- ST4x8_UB(vec2, vec3, src_org, pitch);
- src_org += 8 * pitch;
- ST4x8_UB(vec4, vec5, src_org, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
return 1;
} else {
ILVRL_B2_SH(q2, q1, vec2, vec5);
src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
return 1;
} else {
\
PCKEV_B2_UB(in1, in0, in3, in2, tmp0_m, tmp1_m); \
AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
}
static void common_hz_8t_4x4_msa(const uint8_t *src, int32_t src_stride,
SRARI_H2_SH(out0, out1, 7);
SAT_SH2_SH(out0, out1, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_4x8_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_SH(out0, out1, out2, out3, 7);
SAT_SH4_SH(out0, out1, out2, out3, 7);
out = PCKEV_XORI128_UB(out0, out1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
out = PCKEV_XORI128_UB(out2, out3);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
}
static void common_hz_8t_4w_msa(const uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hz_8t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
SAT_SH4_SH(out0, out1, out2, out3, 7);
tmp0 = PCKEV_XORI128_UB(out0, out1);
tmp1 = PCKEV_XORI128_UB(out2, out3);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
}
}
SRARI_H2_SH(out10, out32, 7);
SAT_SH2_SH(out10, out32, 7);
out = PCKEV_XORI128_UB(out10, out32);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7);
tmp0 = PCKEV_XORI128_UB(out0_r, out1_r);
tmp1 = PCKEV_XORI128_UB(out2_r, out3_r);
- ST8x4_UB(tmp0, tmp1, dst, dst_stride);
+ ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
src10_r = src54_r;
SRARI_H2_SH(tmp0, tmp1, 7);
SAT_SH2_SH(tmp0, tmp1, 7);
out = PCKEV_XORI128_UB(tmp0, tmp1);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out5 = hz_out9;
SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
vec0 = PCKEV_XORI128_UB(tmp0, tmp1);
vec1 = PCKEV_XORI128_UB(tmp2, tmp3);
- ST8x4_UB(vec0, vec1, dst, dst_stride);
+ ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride);
dst += (4 * dst_stride);
hz_out6 = hz_out10;
SAT_SH2_SH(res0, res1, 7);
res = PCKEV_XORI128_UB(res0, res1);
res = (v16u8) __msa_aver_u_b(res, dst0);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_and_aver_dst_4x8_msa(const uint8_t *src,
ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
XORI_B2_128_UB(res0, res2);
AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
- ST4x8_UB(res0, res2, dst, dst_stride);
+ ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_8t_and_aver_dst_4w_msa(const uint8_t *src,
out = PCKEV_XORI128_UB(out10, out32);
out = __msa_aver_u_b(out, dst0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
src2110 = src6554;
SAT_SH2_SH(res0, res1, 7);
res = PCKEV_XORI128_UB(res0, res1);
res = (v16u8) __msa_aver_u_b(res, dst0);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
hz_out5 = hz_out9;
DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3);
SRARI_H2_UH(vec2, vec3, 7);
PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hz_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
+ ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
+ ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1);
- ST8x4_UB(src0, src1, dst, dst_stride);
+ ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hz_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
if (16 == height) {
LD_SB4(src, src_stride, src0, src1, src2, src3);
src += (4 * src_stride);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1);
VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3);
vec0, vec1, vec2, vec3);
SRARI_H4_UH(vec0, vec1, vec2, vec3, 7);
PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1);
- ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
}
}
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride);
}
static void common_vt_2t_4x8_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
- ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride);
- ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride);
+ ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_put_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_vt_2t_8x8mult_msa(const uint8_t *src, int32_t src_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0,
tmp0, tmp1, tmp2, tmp3);
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
src0 = src8;
}
SRARI_H2_UH(tmp0, tmp1, 7);
SAT_UH2_UH(tmp0, tmp1, 7);
PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
}
static void common_hv_2ht_2vt_4x8_msa(const uint8_t *src, int32_t src_stride,
SAT_UH4_UH(vec4, vec5, vec6, vec7, 7);
PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7,
res0, res1, res2, res3);
- ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride);
- dst += (4 * dst_stride);
- ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride);
+ ST_W2(res0, 0, 1, dst, dst_stride);
+ ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride);
+ ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride);
+ ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride);
}
void ff_put_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7);
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
}
static void common_hv_2ht_2vt_8x8mult_msa(const uint8_t *src, int32_t src_stride,
SRARI_H2_UH(tmp3, tmp4, 7);
SAT_UH2_UH(tmp3, tmp4, 7);
PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride);
hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7);
vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0);
SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7);
SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7);
PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1);
- ST8x4_UB(out0, out1, dst, dst_stride);
- dst += (4 * dst_stride);
+ ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride);
+ dst += (8 * dst_stride);
}
}
res = (v16u8) __msa_pckev_b((v16i8) vec3, (v16i8) vec2);
res = (v16u8) __msa_aver_u_b(res, dst0);
- ST4x4_UB(res, res, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(res, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hz_2t_and_aver_dst_4x8_msa(const uint8_t *src,
res2, res3);
ILVR_D2_UB(res1, res0, res3, res2, res0, res2);
AVER_UB2_UB(res0, dst0, res2, dst1, res0, res2);
- ST4x8_UB(res0, res2, dst, dst_stride);
+ ST_W8(res0, res2, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_avg_bilin_4h_msa(uint8_t *dst, ptrdiff_t dst_stride,
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
out = __msa_aver_u_b(out, dst0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_vt_2t_and_aver_dst_4x8_msa(const uint8_t *src,
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, src2110, src4332);
AVER_UB2_UB(src2110, dst0, src4332, dst1, src2110, src4332);
- ST4x8_UB(src2110, src4332, dst, dst_stride);
+ ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_avg_bilin_4v_msa(uint8_t *dst, ptrdiff_t dst_stride,
out = (v16u8) __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0);
out = __msa_aver_u_b(out, dst0);
- ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(out, 0, 1, 2, 3, dst, dst_stride);
}
static void common_hv_2ht_2vt_and_aver_dst_4x8_msa(const uint8_t *src,
SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7);
PCKEV_B2_UB(tmp1, tmp0, tmp3, tmp2, res0, res1);
AVER_UB2_UB(res0, dst0, res1, dst1, res0, res1);
- ST4x8_UB(res0, res1, dst, dst_stride);
+ ST_W8(res0, res1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
}
void ff_avg_bilin_4hv_msa(uint8_t *dst, ptrdiff_t dst_stride,
LW4(dst + 4 * dst_stride, dst_stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
- ST4x8_UB(dst0, dst1, dst, dst_stride);
+ ST_W8(dst0, dst1, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride);
} else if (4 == height) {
LW4(src, src_stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, src0);
LW4(dst, dst_stride, tp0, tp1, tp2, tp3);
INSERT_W4_UB(tp0, tp1, tp2, tp3, dst0);
dst0 = __msa_aver_u_b(src0, dst0);
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
}
INSERT_D2_UB(tp6, tp7, dst3);
AVER_UB4_UB(src0, dst0, src1, dst1, src2, dst2, src3, dst3, dst0,
dst1, dst2, dst3);
- ST8x8_UB(dst0, dst1, dst2, dst3, dst, dst_stride);
+ ST_D8(dst0, dst1, dst2, dst3, 0, 1, 0, 1, 0, 1, 0, 1, dst, dst_stride);
dst += 8 * dst_stride;
}
} else if (4 == height) {
INSERT_D2_UB(tp0, tp1, dst0);
INSERT_D2_UB(tp2, tp3, dst1);
AVER_UB2_UB(src0, dst0, src1, dst1, dst0, dst1);
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
}
}
#define LD_SH16(...) LD_V16(v8i16, __VA_ARGS__)
-/* Description : Load as 4x4 block of signed halfword elements from 1D source
- data into 4 vectors (Each vector with 4 signed halfwords)
- Arguments : Inputs - psrc
- Outputs - out0, out1, out2, out3
-*/
-#define LD4x4_SH(psrc, out0, out1, out2, out3) \
-{ \
- out0 = LD_SH(psrc); \
- out2 = LD_SH(psrc + 8); \
- out1 = (v8i16) __msa_ilvl_d((v2i64) out0, (v2i64) out0); \
- out3 = (v8i16) __msa_ilvl_d((v2i64) out2, (v2i64) out2); \
-}
-
/* Description : Store vectors with stride
Arguments : Inputs - in0, in1, stride
Outputs - pdst (destination pointer to store to)
#define ST_SH8(...) ST_V8(v8i16, __VA_ARGS__)
#define ST_SW8(...) ST_V8(v4i32, __VA_ARGS__)
-/* Description : Store as 2x4 byte block to destination memory from input vector
- Arguments : Inputs - in, stidx, pdst, stride
- Return Type - unsigned byte
- Details : Index stidx halfword element from 'in' vector is copied and
- stored on first line
- Index stidx+1 halfword element from 'in' vector is copied and
- stored on second line
- Index stidx+2 halfword element from 'in' vector is copied and
- stored on third line
- Index stidx+3 halfword element from 'in' vector is copied and
- stored on fourth line
-*/
-#define ST2x4_UB(in, stidx, pdst, stride) \
-{ \
- uint16_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_2x4_m = (uint8_t *) (pdst); \
- \
- out0_m = __msa_copy_u_h((v8i16) in, (stidx)); \
- out1_m = __msa_copy_u_h((v8i16) in, (stidx + 1)); \
- out2_m = __msa_copy_u_h((v8i16) in, (stidx + 2)); \
- out3_m = __msa_copy_u_h((v8i16) in, (stidx + 3)); \
- \
- SH(out0_m, pblk_2x4_m); \
- SH(out1_m, pblk_2x4_m + stride); \
- SH(out2_m, pblk_2x4_m + 2 * stride); \
- SH(out3_m, pblk_2x4_m + 3 * stride); \
-}
-
-/* Description : Store as 4x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Return Type - unsigned byte
- Details : Index 0 word element from input vector is copied and stored
- on first line
- Index 1 word element from input vector is copied and stored
- on second line
-*/
-#define ST4x2_UB(in, pdst, stride) \
-{ \
- uint32_t out0_m, out1_m; \
- uint8_t *pblk_4x2_m = (uint8_t *) (pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32) in, 0); \
- out1_m = __msa_copy_u_w((v4i32) in, 1); \
- \
- SW(out0_m, pblk_4x2_m); \
- SW(out1_m, pblk_4x2_m + stride); \
-}
-
-/* Description : Store as 4x4 byte block to destination memory from input vector
- Arguments : Inputs - in0, in1, pdst, stride
- Return Type - unsigned byte
- Details : Idx0 word element from input vector 'in0' is copied and stored
- on first line
- Idx1 word element from input vector 'in0' is copied and stored
- on second line
- Idx2 word element from input vector 'in1' is copied and stored
- on third line
- Idx3 word element from input vector 'in1' is copied and stored
- on fourth line
-*/
-#define ST4x4_UB(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
-{ \
- uint32_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_4x4_m = (uint8_t *) (pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32) in0, idx0); \
- out1_m = __msa_copy_u_w((v4i32) in0, idx1); \
- out2_m = __msa_copy_u_w((v4i32) in1, idx2); \
- out3_m = __msa_copy_u_w((v4i32) in1, idx3); \
- \
- SW4(out0_m, out1_m, out2_m, out3_m, pblk_4x4_m, stride); \
+/* Description : Store half word elements of vector with stride
+ * Arguments : Inputs - in (source vector), idx0, idx1 ... (element indices)
+ *                    - pdst (destination pointer to store to)
+ *                    - stride (byte offset between consecutive stores)
+ * Details : Stores half word 'idx0' from 'in' to (pdst)
+ *           Stores half word 'idx1' from 'in' to (pdst + stride)
+ *           Similar for other elements
+ */
+#define ST_H1(in, idx, pdst) \
+{ \
+ uint16_t out0_m; \
+ out0_m = __msa_copy_u_h((v8i16) in, idx); \
+ SH(out0_m, (pdst)); \
}
-#define ST4x8_UB(in0, in1, pdst, stride) \
-{ \
- uint8_t *pblk_4x8 = (uint8_t *) (pdst); \
- \
- ST4x4_UB(in0, in0, 0, 1, 2, 3, pblk_4x8, stride); \
- ST4x4_UB(in1, in1, 0, 1, 2, 3, pblk_4x8 + 4 * stride, stride); \
+#define ST_H2(in, idx0, idx1, pdst, stride) \
+{ \
+ uint16_t out0_m, out1_m; \
+ out0_m = __msa_copy_u_h((v8i16) in, idx0); \
+ out1_m = __msa_copy_u_h((v8i16) in, idx1); \
+ SH(out0_m, (pdst)); \
+ SH(out1_m, (pdst) + stride); \
+}
+#define ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
+{ \
+ uint16_t out0_m, out1_m, out2_m, out3_m; \
+ out0_m = __msa_copy_u_h((v8i16) in, idx0); \
+ out1_m = __msa_copy_u_h((v8i16) in, idx1); \
+ out2_m = __msa_copy_u_h((v8i16) in, idx2); \
+ out3_m = __msa_copy_u_h((v8i16) in, idx3); \
+ SH(out0_m, (pdst)); \
+ SH(out1_m, (pdst) + stride); \
+ SH(out2_m, (pdst) + 2 * stride); \
+ SH(out3_m, (pdst) + 3 * stride); \
+}
+#define ST_H8(in, idx0, idx1, idx2, idx3, idx4, idx5, \
+ idx6, idx7, pdst, stride) \
+{ \
+ ST_H4(in, idx0, idx1, idx2, idx3, pdst, stride) \
+ ST_H4(in, idx4, idx5, idx6, idx7, (pdst) + 4*stride, stride) \
}
-/* Description : Store as 6x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Return Type - unsigned byte
- Details : Index 0 word element from input vector 'in0' is copied and
- stored on first line followed by index 2 halfword element
- Index 2 word element from input vector 'in0' is copied and
- stored on second line followed by index 2 halfword element
- Index 0 word element from input vector 'in1' is copied and
- stored on third line followed by index 2 halfword element
- Index 2 word element from input vector 'in1' is copied and
- stored on fourth line followed by index 2 halfword element
-*/
-#define ST6x4_UB(in0, in1, pdst, stride) \
+/* Description : Store word elements of vector with stride
+ * Arguments : Inputs - in (source vector), idx0, idx1 ... (element indices)
+ *                    - pdst (destination pointer to store to)
+ *                    - stride (byte offset between consecutive stores)
+ * Details : Stores word 'idx0' from 'in' to (pdst)
+ *           Stores word 'idx1' from 'in' to (pdst + stride)
+ *           Similar for other elements
+ */
+#define ST_W1(in, idx, pdst) \
+{ \
+ uint32_t out0_m; \
+ out0_m = __msa_copy_u_w((v4i32) in, idx); \
+ SW(out0_m, (pdst)); \
+}
+#define ST_W2(in, idx0, idx1, pdst, stride) \
+{ \
+ uint32_t out0_m, out1_m; \
+ out0_m = __msa_copy_u_w((v4i32) in, idx0); \
+ out1_m = __msa_copy_u_w((v4i32) in, idx1); \
+ SW(out0_m, (pdst)); \
+ SW(out1_m, (pdst) + stride); \
+}
+#define ST_W4(in, idx0, idx1, idx2, idx3, pdst, stride) \
+{ \
+ uint32_t out0_m, out1_m, out2_m, out3_m; \
+ out0_m = __msa_copy_u_w((v4i32) in, idx0); \
+ out1_m = __msa_copy_u_w((v4i32) in, idx1); \
+ out2_m = __msa_copy_u_w((v4i32) in, idx2); \
+ out3_m = __msa_copy_u_w((v4i32) in, idx3); \
+ SW(out0_m, (pdst)); \
+ SW(out1_m, (pdst) + stride); \
+ SW(out2_m, (pdst) + 2*stride); \
+ SW(out3_m, (pdst) + 3*stride); \
+}
+#define ST_W8(in0, in1, idx0, idx1, idx2, idx3,                     \
+              idx4, idx5, idx6, idx7, pdst, stride)                 \
+{ /* store 8 word elements: rows 0-3 from in0, rows 4-7 from in1 */ \
+    ST_W4(in0, idx0, idx1, idx2, idx3, (pdst), stride)              \
+    ST_W4(in1, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
+}
+
+/* Description : Store double word elements of vector with stride
+ * Arguments : Inputs - in (source vector), idx0, idx1 ... (element indices)
+ *                    - pdst (destination pointer to store to)
+ *                    - stride (byte offset between consecutive stores)
+ * Details : Stores double word 'idx0' from 'in' to (pdst)
+ *           Stores double word 'idx1' from 'in' to (pdst + stride)
+ *           Similar for other elements
+ */
+#define ST_D1(in, idx, pdst) \
{ \
- uint32_t out0_m, out1_m, out2_m, out3_m; \
- uint16_t out4_m, out5_m, out6_m, out7_m; \
- uint8_t *pblk_6x4_m = (uint8_t *) (pdst); \
- \
- out0_m = __msa_copy_u_w((v4i32) in0, 0); \
- out1_m = __msa_copy_u_w((v4i32) in0, 2); \
- out2_m = __msa_copy_u_w((v4i32) in1, 0); \
- out3_m = __msa_copy_u_w((v4i32) in1, 2); \
- \
- out4_m = __msa_copy_u_h((v8i16) in0, 2); \
- out5_m = __msa_copy_u_h((v8i16) in0, 6); \
- out6_m = __msa_copy_u_h((v8i16) in1, 2); \
- out7_m = __msa_copy_u_h((v8i16) in1, 6); \
- \
- SW(out0_m, pblk_6x4_m); \
- SH(out4_m, (pblk_6x4_m + 4)); \
- pblk_6x4_m += stride; \
- SW(out1_m, pblk_6x4_m); \
- SH(out5_m, (pblk_6x4_m + 4)); \
- pblk_6x4_m += stride; \
- SW(out2_m, pblk_6x4_m); \
- SH(out6_m, (pblk_6x4_m + 4)); \
- pblk_6x4_m += stride; \
- SW(out3_m, pblk_6x4_m); \
- SH(out7_m, (pblk_6x4_m + 4)); \
-}
-
-/* Description : Store as 8x1 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst
- Details : Index 0 double word element from input vector 'in' is copied
- and stored to destination memory at (pdst)
-*/
-#define ST8x1_UB(in, pdst) \
-{ \
- uint64_t out0_m; \
- out0_m = __msa_copy_u_d((v2i64) in, 0); \
- SD(out0_m, pdst); \
-}
-
-/* Description : Store as 8x2 byte block to destination memory from input vector
- Arguments : Inputs - in, pdst, stride
- Details : Index 0 double word element from input vector 'in' is copied
- and stored to destination memory at (pdst)
- Index 1 double word element from input vector 'in' is copied
- and stored to destination memory at (pdst + stride)
-*/
-#define ST8x2_UB(in, pdst, stride) \
+ uint64_t out0_m; \
+ out0_m = __msa_copy_u_d((v2i64) in, idx); \
+ SD(out0_m, (pdst)); \
+}
+#define ST_D2(in, idx0, idx1, pdst, stride) \
{ \
uint64_t out0_m, out1_m; \
- uint8_t *pblk_8x2_m = (uint8_t *) (pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64) in, 0); \
- out1_m = __msa_copy_u_d((v2i64) in, 1); \
- \
- SD(out0_m, pblk_8x2_m); \
- SD(out1_m, pblk_8x2_m + stride); \
-}
-
-/* Description : Store as 8x4 byte block to destination memory from input
- vectors
- Arguments : Inputs - in0, in1, pdst, stride
- Details : Index 0 double word element from input vector 'in0' is copied
- and stored to destination memory at (pblk_8x4_m)
- Index 1 double word element from input vector 'in0' is copied
- and stored to destination memory at (pblk_8x4_m + stride)
- Index 0 double word element from input vector 'in1' is copied
- and stored to destination memory at (pblk_8x4_m + 2 * stride)
- Index 1 double word element from input vector 'in1' is copied
- and stored to destination memory at (pblk_8x4_m + 3 * stride)
-*/
-#define ST8x4_UB(in0, in1, pdst, stride) \
-{ \
- uint64_t out0_m, out1_m, out2_m, out3_m; \
- uint8_t *pblk_8x4_m = (uint8_t *) (pdst); \
- \
- out0_m = __msa_copy_u_d((v2i64) in0, 0); \
- out1_m = __msa_copy_u_d((v2i64) in0, 1); \
- out2_m = __msa_copy_u_d((v2i64) in1, 0); \
- out3_m = __msa_copy_u_d((v2i64) in1, 1); \
- \
- SD4(out0_m, out1_m, out2_m, out3_m, pblk_8x4_m, stride); \
+ out0_m = __msa_copy_u_d((v2i64) in, idx0); \
+ out1_m = __msa_copy_u_d((v2i64) in, idx1); \
+ SD(out0_m, (pdst)); \
+ SD(out1_m, (pdst) + stride); \
}
-#define ST8x8_UB(in0, in1, in2, in3, pdst, stride) \
-{ \
- uint8_t *pblk_8x8_m = (uint8_t *) (pdst); \
- \
- ST8x4_UB(in0, in1, pblk_8x8_m, stride); \
- ST8x4_UB(in2, in3, pblk_8x8_m + 4 * stride, stride); \
-}
-#define ST12x4_UB(in0, in1, in2, pdst, stride) \
+#define ST_D4(in0, in1, idx0, idx1, idx2, idx3, pdst, stride) \
{ \
- uint8_t *pblk_12x4_m = (uint8_t *) (pdst); \
- \
- /* left 8x4 */ \
- ST8x4_UB(in0, in1, pblk_12x4_m, stride); \
- /* right 4x4 */ \
- ST4x4_UB(in2, in2, 0, 1, 2, 3, pblk_12x4_m + 8, stride); \
+ uint64_t out0_m, out1_m, out2_m, out3_m; \
+ out0_m = __msa_copy_u_d((v2i64) in0, idx0); \
+ out1_m = __msa_copy_u_d((v2i64) in0, idx1); \
+ out2_m = __msa_copy_u_d((v2i64) in1, idx2); \
+ out3_m = __msa_copy_u_d((v2i64) in1, idx3); \
+ SD(out0_m, (pdst)); \
+ SD(out1_m, (pdst) + stride); \
+ SD(out2_m, (pdst) + 2 * stride); \
+ SD(out3_m, (pdst) + 3 * stride); \
+}
+#define ST_D8(in0, in1, in2, in3, idx0, idx1, idx2, idx3,               \
+              idx4, idx5, idx6, idx7, pdst, stride)                     \
+{ /* store 8 double words: rows 0-3 from in0/in1, rows 4-7 from in2/in3 */ \
+    ST_D4(in0, in1, idx0, idx1, idx2, idx3, (pdst), stride)             \
+    ST_D4(in2, in3, idx4, idx5, idx6, idx7, (pdst) + 4 * stride, stride) \
+}
/* Description : Store as 12x8 byte block to destination memory from
tmp0_m = PCKEV_XORI128_UB(in0, in1); \
tmp1_m = PCKEV_XORI128_UB(in2, in3); \
AVER_UB2_UB(tmp0_m, dst0, tmp1_m, dst1, tmp0_m, tmp1_m); \
- ST8x4_UB(tmp0_m, tmp1_m, pdst_m, stride); \
+ ST_D4(tmp0_m, tmp1_m, 0, 1, 0, 1, pdst_m, stride); \
}
/* Description : Pack even byte elements, extract 0 & 2 index words from pair