+ src -= (src_stride + 1);
+
+ filter_vec = LD_SH(filter_x);
+ SPLATI_H2_SH(filter_vec, 0, 1, filt0, filt1);
+
+ filter_vec = LD_SH(filter_y);
+ UNPCK_R_SB_SH(filter_vec, filter_vec);
+
+ SPLATI_W2_SH(filter_vec, 0, filt_h0, filt_h1);
+
+ mask0 = LD_SB(ff_hevc_mask_arr);
+ mask1 = mask0 + 2;
+
+ src_tmp = src;
+ dst_tmp = dst;
+
+ LD_SB3(src_tmp, src_stride, src0, src1, src2);
+ src_tmp += (3 * src_stride);
+
+ XORI_B3_128_SB(src0, src1, src2);
+
+ VSHF_B2_SB(src0, src0, src0, src0, mask0, mask1, vec0, vec1);
+ VSHF_B2_SB(src1, src1, src1, src1, mask0, mask1, vec2, vec3);
+ VSHF_B2_SB(src2, src2, src2, src2, mask0, mask1, vec4, vec5);
+
+ dsth0 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+ dsth1 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+ dsth2 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+
+ ILVRL_H2_SH(dsth1, dsth0, dst10_r, dst10_l);
+ ILVRL_H2_SH(dsth2, dsth1, dst21_r, dst21_l);
+
+ for (loop_cnt = 4; loop_cnt--;) {
+ LD_SB4(src_tmp, src_stride, src3, src4, src5, src6);
+ src_tmp += (4 * src_stride);
+ XORI_B4_128_SB(src3, src4, src5, src6);
+
+ VSHF_B2_SB(src3, src3, src3, src3, mask0, mask1, vec0, vec1);
+ VSHF_B2_SB(src4, src4, src4, src4, mask0, mask1, vec2, vec3);
+ VSHF_B2_SB(src5, src5, src5, src5, mask0, mask1, vec4, vec5);
+ VSHF_B2_SB(src6, src6, src6, src6, mask0, mask1, vec6, vec7);
+
+ dsth3 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+ dsth4 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+ dsth5 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+ dsth6 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+ ILVRL_H2_SH(dsth3, dsth2, dst32_r, dst32_l);
+ ILVRL_H2_SH(dsth4, dsth3, dst43_r, dst43_l);
+ ILVRL_H2_SH(dsth5, dsth4, dst54_r, dst54_l);
+ ILVRL_H2_SH(dsth6, dsth5, dst65_r, dst65_l);
+
+ dst0_r = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+ dst0_l = HEVC_FILT_4TAP(dst10_l, dst32_l, filt_h0, filt_h1);
+ dst1_r = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+ dst1_l = HEVC_FILT_4TAP(dst21_l, dst43_l, filt_h0, filt_h1);
+ dst2_r = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+ dst2_l = HEVC_FILT_4TAP(dst32_l, dst54_l, filt_h0, filt_h1);
+ dst3_r = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+ dst3_l = HEVC_FILT_4TAP(dst43_l, dst65_l, filt_h0, filt_h1);
+
+ SRA_4V(dst0_r, dst0_l, dst1_r, dst1_l, 6);
+ SRA_4V(dst2_r, dst2_l, dst3_r, dst3_l, 6);
+
+ PCKEV_H4_SH(dst0_l, dst0_r, dst1_l, dst1_r, dst2_l, dst2_r, dst3_l,
+ dst3_r, tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST8x4_UB(out0, out1, dst_tmp, dst_stride);
+ dst_tmp += (4 * dst_stride);
+
+ dst10_r = dst54_r;
+ dst10_l = dst54_l;
+ dst21_r = dst65_r;
+ dst21_l = dst65_l;
+ dsth2 = dsth6;
+ }
+
+ src += 8;
+ dst += 8;
+
+ mask2 = LD_SB(ff_hevc_mask_arr + 16);
+ mask3 = mask2 + 2;
+
+ LD_SB3(src, src_stride, src0, src1, src2);
+ src += (3 * src_stride);
+ XORI_B3_128_SB(src0, src1, src2);
+ VSHF_B2_SB(src0, src1, src0, src1, mask2, mask3, vec0, vec1);
+ VSHF_B2_SB(src1, src2, src1, src2, mask2, mask3, vec2, vec3);
+
+ dst10 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+ dst21 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+
+ ILVRL_H2_SH(dst21, dst10, dst10_r, dst21_r);
+ dst22 = (v8i16) __msa_splati_d((v2i64) dst21, 1);
+
+ for (loop_cnt = 2; loop_cnt--;) {
+ LD_SB8(src, src_stride,
+ src3, src4, src5, src6, src7, src8, src9, src10);
+ src += (8 * src_stride);
+ XORI_B8_128_SB(src3, src4, src5, src6, src7, src8, src9, src10);
+ VSHF_B2_SB(src3, src7, src3, src7, mask2, mask3, vec0, vec1);
+ VSHF_B2_SB(src4, src8, src4, src8, mask2, mask3, vec2, vec3);
+ VSHF_B2_SB(src5, src9, src5, src9, mask2, mask3, vec4, vec5);
+ VSHF_B2_SB(src6, src10, src6, src10, mask2, mask3, vec6, vec7);
+
+ dst73 = HEVC_FILT_4TAP_SH(vec0, vec1, filt0, filt1);
+ dst84 = HEVC_FILT_4TAP_SH(vec2, vec3, filt0, filt1);
+ dst95 = HEVC_FILT_4TAP_SH(vec4, vec5, filt0, filt1);
+ dst106 = HEVC_FILT_4TAP_SH(vec6, vec7, filt0, filt1);
+
+ dst32_r = __msa_ilvr_h(dst73, dst22);
+ ILVRL_H2_SH(dst84, dst73, dst43_r, dst87_r);
+ ILVRL_H2_SH(dst95, dst84, dst54_r, dst98_r);
+ ILVRL_H2_SH(dst106, dst95, dst65_r, dst109_r);
+ dst22 = (v8i16) __msa_splati_d((v2i64) dst73, 1);
+ dst76_r = __msa_ilvr_h(dst22, dst106);
+
+ dst0 = HEVC_FILT_4TAP(dst10_r, dst32_r, filt_h0, filt_h1);
+ dst1 = HEVC_FILT_4TAP(dst21_r, dst43_r, filt_h0, filt_h1);
+ dst2 = HEVC_FILT_4TAP(dst32_r, dst54_r, filt_h0, filt_h1);
+ dst3 = HEVC_FILT_4TAP(dst43_r, dst65_r, filt_h0, filt_h1);
+ dst4 = HEVC_FILT_4TAP(dst54_r, dst76_r, filt_h0, filt_h1);
+ dst5 = HEVC_FILT_4TAP(dst65_r, dst87_r, filt_h0, filt_h1);
+ dst6 = HEVC_FILT_4TAP(dst76_r, dst98_r, filt_h0, filt_h1);
+ dst7 = HEVC_FILT_4TAP(dst87_r, dst109_r, filt_h0, filt_h1);
+ SRA_4V(dst0, dst1, dst2, dst3, 6);
+ SRA_4V(dst4, dst5, dst6, dst7, 6);
+ PCKEV_H4_SH(dst1, dst0, dst3, dst2, dst5, dst4, dst7, dst6,
+ tmp0, tmp1, tmp2, tmp3);
+ SRARI_H4_SH(tmp0, tmp1, tmp2, tmp3, 6);
+ SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7);
+ out0 = PCKEV_XORI128_UB(tmp0, tmp1);
+ out1 = PCKEV_XORI128_UB(tmp2, tmp3);
+ ST4x8_UB(out0, out1, dst, dst_stride);
+ dst += (8 * dst_stride);
+
+ dst10_r = dst98_r;
+ dst21_r = dst109_r;
+ dst22 = (v8i16) __msa_splati_d((v2i64) dst106, 1);
+ }