X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fmips%2Fhevc_lpf_sao_msa.c;h=26663dd89bc30ec5b4d146b66d25cfe5a7b20ca0;hb=d07534b5f5f20b4f780f5b0284aca6354da00695;hp=5b5537a26450382ce7f2e1d2e178e1225c291920;hpb=50ae1f7e0ff1fa00236622415039f7e28d919a25;p=ffmpeg diff --git a/libavcodec/mips/hevc_lpf_sao_msa.c b/libavcodec/mips/hevc_lpf_sao_msa.c index 5b5537a2645..26663dd89bc 100644 --- a/libavcodec/mips/hevc_lpf_sao_msa.c +++ b/libavcodec/mips/hevc_lpf_sao_msa.c @@ -140,19 +140,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst0 = (v16u8) (temp2 + (v8i16) p2_src); temp1 = temp0 + p2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - p1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst1 = (v16u8) (temp2 + (v8i16) p1_src); temp1 = (temp0 << 1) + p2_src + q1_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst2 = (v16u8) (temp2 + (v8i16) p0_src); dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); @@ -165,19 +165,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst5 = (v16u8) (temp2 + (v8i16) q2_src); temp1 = temp0 + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - q1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst4 = (v16u8) (temp2 + (v8i16) q1_src); temp1 = (temp0 << 1) + p1_src + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst3 = (v16u8) (temp2 + (v8i16) q0_src); dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -199,11 +199,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, dst_val0 = __msa_copy_u_d((v2i64) dst2, 0); dst_val1 = __msa_copy_u_d((v2i64) dst2, 1); - ST8x4_UB(dst0, dst1, p2, stride); - p2 += (4 * stride); - SD(dst_val0, p2); - p2 += stride; - SD(dst_val1, p2); + ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride); + SD(dst_val0, p2 + 4 * stride); + SD(dst_val1, p2 + 5 * stride); /* strong filter ends */ } else if (flag0 == flag1) { /* weak only */ /* weak filter */ @@ -220,15 +218,15 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); abs_delta0 = (v8u16) abs_delta0 < temp1; - delta0 = CLIP_SH(delta0, tc_neg, tc_pos); + CLIP_SH(delta0, tc_neg, tc_pos); - temp0 = (v8u16) (delta0 + p0_src); - temp0 = (v8u16) CLIP_SH_0_255(temp0); - temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + temp2 = (v8i16) (delta0 + p0_src); + CLIP_SH_0_255(temp2); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); temp2 = (v8i16) (q0_src - delta0); - temp2 = CLIP_SH_0_255(temp2); + CLIP_SH_0_255(temp2); temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -254,9 +252,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, delta1 -= (v8i16) p1_src; delta1 += delta0; delta1 >>= 1; - delta1 = CLIP_SH(delta1, tc_neg, tc_pos); + CLIP_SH(delta1, tc_neg, tc_pos); delta1 = (v8i16) p1_src + (v8i16) delta1; - delta1 = CLIP_SH_0_255(delta1); + CLIP_SH_0_255(delta1); delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); @@ -264,9 +262,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, delta2 = delta2 - (v8i16) q1_src; delta2 = delta2 - delta0; delta2 = delta2 >> 1; - delta2 = CLIP_SH(delta2, tc_neg, tc_pos); + CLIP_SH(delta2, tc_neg, tc_pos); delta2 = (v8i16) q1_src + (v8i16) delta2; - delta2 = CLIP_SH_0_255(delta2); + CLIP_SH_0_255(delta2); delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, (v16u8) q_is_pcm_vec); @@ -288,7 +286,7 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3); p2 += stride; - ST8x4_UB(dst0, dst1, p2, stride); + ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride); /* weak filter ends */ } else { /* strong + weak */ /* strong filter */ @@ -300,19 +298,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst0 = (v16u8) (temp2 + (v8i16) p2_src); temp1 = temp0 + p2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - p1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst1 = (v16u8) (temp2 + (v8i16) p1_src); temp1 = (temp0 << 1) + p2_src + q1_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst2 = (v16u8) (temp2 + (v8i16) p0_src); dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); @@ -325,19 +323,19 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst5 = (v16u8) (temp2 + (v8i16) q2_src); temp1 = temp0 + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - q1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst4 = (v16u8) (temp2 + (v8i16) q1_src); temp1 = (temp0 << 1) + p1_src + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst3 = (v16u8) (temp2 + (v8i16) q0_src); dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -364,15 +362,15 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); abs_delta0 = (v8u16) abs_delta0 < temp1; - delta0 = CLIP_SH(delta0, tc_neg, tc_pos); + CLIP_SH(delta0, tc_neg, tc_pos); - temp0 = (v8u16) (delta0 + p0_src); - temp0 = (v8u16) CLIP_SH_0_255(temp0); - temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + temp2 = (v8i16) (delta0 + p0_src); + CLIP_SH_0_255(temp2); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); temp2 = (v8i16) (q0_src - delta0); - temp2 = CLIP_SH_0_255(temp2); + CLIP_SH_0_255(temp2); temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -396,9 +394,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, delta1 -= (v8i16) p1_src; delta1 += delta0; delta1 >>= 1; - delta1 = CLIP_SH(delta1, tc_neg, tc_pos); + CLIP_SH(delta1, tc_neg, tc_pos); delta1 = (v8i16) p1_src + (v8i16) delta1; - delta1 = CLIP_SH_0_255(delta1); + CLIP_SH_0_255(delta1); delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); @@ -406,9 +404,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, delta2 = delta2 - (v8i16) q1_src; delta2 = delta2 - delta0; delta2 = delta2 >> 1; - delta2 = CLIP_SH(delta2, tc_neg, tc_pos); + CLIP_SH(delta2, tc_neg, tc_pos); delta2 = (v8i16) q1_src + (v8i16) delta2; - delta2 = CLIP_SH_0_255(delta2); + CLIP_SH_0_255(delta2); delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, (v16u8) q_is_pcm_vec); @@ -442,11 +440,9 @@ static void hevc_loopfilter_luma_hor_msa(uint8_t *src, int32_t stride, dst_val0 = __msa_copy_u_d((v2i64) dst2, 0); dst_val1 = __msa_copy_u_d((v2i64) dst2, 1); - ST8x4_UB(dst0, dst1, p2, stride); - p2 += (4 * stride); - SD(dst_val0, p2); - p2 += stride; - SD(dst_val1, p2); + ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride); + SD(dst_val0, p2 + 4 * stride); + SD(dst_val1, p2 + 5 * stride); } } } @@ -565,19 +561,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst0 = (v16u8) (temp2 + (v8i16) p2_src); temp1 = temp0 + p2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - p1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst1 = (v16u8) (temp2 + (v8i16) p1_src); temp1 = (temp0 << 1) + p2_src + q1_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst2 = (v16u8) (temp2 + (v8i16) p0_src); dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); @@ -589,19 +585,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst5 = (v16u8) (temp2 + (v8i16) q2_src); temp1 = temp0 + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - q1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst4 = (v16u8) (temp2 + (v8i16) q1_src); temp1 = (temp0 << 1) + p1_src + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst3 = (v16u8) (temp2 + (v8i16) q0_src); dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -624,14 +620,14 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); abs_delta0 = (v8u16) abs_delta0 < temp1; - delta0 = CLIP_SH(delta0, tc_neg, tc_pos); - temp0 = (v8u16) (delta0 + p0_src); - temp0 = (v8u16) CLIP_SH_0_255(temp0); - temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + CLIP_SH(delta0, tc_neg, tc_pos); + temp2 = (v8i16) (delta0 + p0_src); + CLIP_SH_0_255(temp2); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); temp2 = (v8i16) (q0_src - delta0); - temp2 = CLIP_SH_0_255(temp2); + CLIP_SH_0_255(temp2); temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -653,9 +649,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, delta1 -= (v8i16) p1_src; delta1 += delta0; delta1 >>= 1; - delta1 = CLIP_SH(delta1, tc_neg, tc_pos); + CLIP_SH(delta1, tc_neg, tc_pos); delta1 = (v8i16) p1_src + (v8i16) delta1; - delta1 = CLIP_SH_0_255(delta1); + CLIP_SH_0_255(delta1); delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); @@ -663,9 +659,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, delta2 = delta2 - (v8i16) q1_src; delta2 = delta2 - delta0; delta2 = delta2 >> 1; - delta2 = CLIP_SH(delta2, tc_neg, tc_pos); + CLIP_SH(delta2, tc_neg, tc_pos); delta2 = (v8i16) q1_src + (v8i16) delta2; - delta2 = CLIP_SH_0_255(delta2); + CLIP_SH_0_255(delta2); delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, (v16u8) q_is_pcm_vec); @@ -730,19 +726,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst0 = (v16u8) (temp2 + (v8i16) p2_src); temp1 = temp0 + p2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - p1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst1 = (v16u8) (temp2 + (v8i16) p1_src); temp1 = (temp0 << 1) + p2_src + q1_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - p0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst2 = (v16u8) (temp2 + (v8i16) p0_src); dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec); @@ -754,19 +750,19 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q2_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst5 = (v16u8) (temp2 + (v8i16) q2_src); temp1 = temp0 + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2); temp2 = (v8i16) (temp1 - q1_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst4 = (v16u8) (temp2 + (v8i16) q1_src); temp1 = (temp0 << 1) + p1_src + q2_src; temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3); temp2 = (v8i16) (temp1 - q0_src); - temp2 = CLIP_SH(temp2, tc_neg, tc_pos); + CLIP_SH(temp2, tc_neg, tc_pos); dst3 = (v16u8) (temp2 + (v8i16) q0_src); dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -789,15 +785,15 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero); abs_delta0 = (v8u16) abs_delta0 < temp1; - delta0 = CLIP_SH(delta0, tc_neg, tc_pos); + CLIP_SH(delta0, tc_neg, tc_pos); - temp0 = (v8u16) (delta0 + p0_src); - temp0 = (v8u16) CLIP_SH_0_255(temp0); - temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src, + temp2 = (v8i16) (delta0 + p0_src); + CLIP_SH_0_255(temp2); + temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src, (v16u8) p_is_pcm_vec); temp2 = (v8i16) (q0_src - delta0); - temp2 = CLIP_SH_0_255(temp2); + CLIP_SH_0_255(temp2); temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src, (v16u8) q_is_pcm_vec); @@ -819,9 +815,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, delta1 -= (v8i16) p1_src; delta1 += delta0; delta1 >>= 1; - delta1 = CLIP_SH(delta1, tc_neg, tc_pos); + CLIP_SH(delta1, tc_neg, tc_pos); delta1 = (v8i16) p1_src + (v8i16) delta1; - delta1 = CLIP_SH_0_255(delta1); + CLIP_SH_0_255(delta1); delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src, (v16u8) p_is_pcm_vec); @@ -829,9 +825,9 @@ static void hevc_loopfilter_luma_ver_msa(uint8_t *src, int32_t stride, delta2 = delta2 - (v8i16) q1_src; delta2 = delta2 - delta0; delta2 = delta2 >> 1; - delta2 = CLIP_SH(delta2, tc_neg, tc_pos); + CLIP_SH(delta2, tc_neg, tc_pos); delta2 = (v8i16) q1_src + (v8i16) delta2; - delta2 = CLIP_SH_0_255(delta2); + CLIP_SH_0_255(delta2); delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src, (v16u8) q_is_pcm_vec); delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src, @@ -959,15 +955,15 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride, temp0 <<= 2; temp0 += temp1; delta = __msa_srari_h((v8i16) temp0, 3); - delta = CLIP_SH(delta, tc_neg, tc_pos); + CLIP_SH(delta, tc_neg, tc_pos); temp0 = (v8i16) ((v8i16) p0 + delta); - temp0 = CLIP_SH_0_255(temp0); + CLIP_SH_0_255(temp0); temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, (v16u8) p_is_pcm_vec); temp1 = (v8i16) ((v8i16) q0 - delta); - temp1 = CLIP_SH_0_255(temp1); + CLIP_SH_0_255(temp1); temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, (v16u8) q_is_pcm_vec); @@ -976,7 +972,7 @@ static void hevc_loopfilter_chroma_hor_msa(uint8_t *src, int32_t stride, temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos); temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0); - ST8x2_UB(temp0, p0_ptr, stride); + ST_D2(temp0, 0, 1, p0_ptr, stride); } } @@ -1018,15 +1014,15 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride, temp0 <<= 2; temp0 += temp1; delta = __msa_srari_h((v8i16) temp0, 3); - delta = CLIP_SH(delta, tc_neg, tc_pos); + CLIP_SH(delta, tc_neg, tc_pos); temp0 = (v8i16) ((v8i16) p0 + delta); - temp0 = CLIP_SH_0_255(temp0); + CLIP_SH_0_255(temp0); temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0, (v16u8) p_is_pcm_vec); temp1 = (v8i16) ((v8i16) q0 - delta); - temp1 = CLIP_SH_0_255(temp1); + CLIP_SH_0_255(temp1); temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0, (v16u8) q_is_pcm_vec); @@ -1037,9 +1033,7 @@ static void hevc_loopfilter_chroma_ver_msa(uint8_t *src, int32_t stride, temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0); src += 1; - ST2x4_UB(temp0, 0, src, stride); - src += (4 * stride); - ST2x4_UB(temp0, 4, src, stride); + ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride); } } @@ -1087,7 +1081,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride, LD_UB4(src, src_stride, src0, src1, src2, src3); /* store results */ - ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); + ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); } @@ -1102,7 +1096,7 @@ static void hevc_sao_band_filter_4width_msa(uint8_t *dst, int32_t dst_stride, dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128); /* store results */ - ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride); + ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride); } static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride, @@ -1153,7 +1147,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride, XORI_B2_128_SB(dst0, dst1); /* store results */ - ST8x4_UB(dst0, dst1, dst, dst_stride); + ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); dst += dst_stride << 2; } @@ -1173,7 +1167,7 @@ static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride, XORI_B2_128_SB(dst0, dst1); /* store results */ - ST8x4_UB(dst0, dst1, dst, dst_stride); + ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride); } static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst, @@ -1363,6 +1357,7 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst, v16u8 cmp_minus10, diff_minus10, diff_minus11; v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11; v16i8 offset, sao_offset = LD_SB(sao_offset_val); + v16i8 zeros = { 0 }; sao_offset = __msa_pckev_b(sao_offset, sao_offset); src -= 1; @@ -1373,8 +1368,8 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst, for (height -= 2; height; height -= 2) { src += (src_stride << 1); - SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1); - SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11); PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10, src_plus10); @@ -1410,8 +1405,8 @@ static void hevc_sao_edge_filter_0degree_8width_msa(uint8_t *dst, dst += dst_stride; } - SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1); - SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11); PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10, src_plus10); @@ -1479,14 +1474,12 @@ static void hevc_sao_edge_filter_0degree_16multiple_msa(uint8_t *dst, dst_ptr = dst + v_cnt; LD_UB4(src_minus1, src_stride, src10, src11, src12, src13); - SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0, - src_zero1, 1); - SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2, - src_zero3, 1); - SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10, - src_plus11, 2); - SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12, - src_plus13, 2); + SLDI_B4_SB(src10, src_minus10, src11, src_minus11, + src12, src_minus12, src13, src_minus13, 1, + src_zero0, src_zero1, src_zero2, src_zero3); + SLDI_B4_SB(src10, src_minus10, src11, src_minus11, + src12, src_minus12, src13, src_minus13, 2, + src_plus10, src_plus11, src_plus12, src_plus13); cmp_minus10 = ((v16u8) src_zero0 == src_minus10); cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10); @@ -1886,6 +1879,7 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst, v16u8 src_minus11, src10, src11; v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0; v8i16 offset_mask0, offset_mask1; + v16i8 zeros = { 0 }; sao_offset = __msa_pckev_b(sao_offset, sao_offset); @@ -1898,8 +1892,8 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst, for (height -= 2; height; height -= 2) { src_orig += (src_stride << 1); - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1); ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10, src_minus11); @@ -1944,8 +1938,8 @@ static void hevc_sao_edge_filter_45degree_4width_msa(uint8_t *dst, dst += dst_stride; } - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1); ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10, src_minus11); @@ -1998,6 +1992,7 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst, v16u8 src_minus10, src10, src_minus11, src11; v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0; v8i16 offset_mask0, offset_mask1; + v16i8 zeros = { 0 }; sao_offset = __msa_pckev_b(sao_offset, sao_offset); src_orig = src - 1; @@ -2009,8 +2004,8 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst, for (height -= 2; height; height -= 2) { src_orig += (src_stride << 1); - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11); ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10, src_minus11); @@ -2054,8 +2049,8 @@ static void hevc_sao_edge_filter_45degree_8width_msa(uint8_t *dst, dst += dst_stride; } - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11); ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10, src_minus11); ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, @@ -2136,12 +2131,11 @@ static void hevc_sao_edge_filter_45degree_16multiple_msa(uint8_t *dst, src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2)); src_orig += 16; - SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0, - src_zero1, 1); - SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2, - src_zero3, 1); - SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10, - src_plus11, 2); + SLDI_B4_SB(src10, src_minus11, src11, src_minus12, + src12, src_minus13, src13, src_minus14, 1, + src_zero0, src_zero1, src_zero2, src_zero3); + SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10, + src_plus11); src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2); @@ -2234,6 +2228,7 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst, v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11; v16u8 src_minus10, src10, src_minus11, src11; v8i16 offset_mask0, offset_mask1; + v16i8 zeros = { 0 }; sao_offset = __msa_pckev_b(sao_offset, sao_offset); src_orig = src - 1; @@ -2245,8 +2240,8 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst, for (height -= 2; height; height -= 2) { src_orig += (src_stride << 1); - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11); ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, src_minus11); @@ -2292,8 +2287,8 @@ static void hevc_sao_edge_filter_135degree_4width_msa(uint8_t *dst, dst += dst_stride; } - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11); ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, src_minus11); @@ -2348,6 +2343,7 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst, v16u8 src_minus10, src10, src_minus11, src11; v16i8 src_zero0, src_zero1, dst0; v8i16 offset_mask0, offset_mask1; + v16i8 zeros = { 0 }; sao_offset = __msa_pckev_b(sao_offset, sao_offset); src_orig = src - 1; @@ -2359,8 +2355,8 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst, for (height -= 2; height; height -= 2) { src_orig += (src_stride << 1); - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11); ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, src_minus11); ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, @@ -2404,8 +2400,8 @@ static void hevc_sao_edge_filter_135degree_8width_msa(uint8_t *dst, dst += dst_stride; } - SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1); - SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2); + SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1); + SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11); ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10, src_minus11); ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0, @@ -2630,7 +2626,7 @@ void ff_hevc_sao_edge_filter_8_msa(uint8_t *dst, uint8_t *src, int16_t *sao_offset_val, int eo, int width, int height) { - ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t); + ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t); switch (eo) { case 0: