temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst0 = (v16u8) (temp2 + (v8i16) p2_src);
temp1 = temp0 + p2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - p1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst1 = (v16u8) (temp2 + (v8i16) p1_src);
temp1 = (temp0 << 1) + p2_src + q1_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst2 = (v16u8) (temp2 + (v8i16) p0_src);
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst5 = (v16u8) (temp2 + (v8i16) q2_src);
temp1 = temp0 + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - q1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst4 = (v16u8) (temp2 + (v8i16) q1_src);
temp1 = (temp0 << 1) + p1_src + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst3 = (v16u8) (temp2 + (v8i16) q0_src);
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
- ST8x4_UB(dst0, dst1, p2, stride);
- p2 += (4 * stride);
- SD(dst_val0, p2);
- p2 += stride;
- SD(dst_val1, p2);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+ SD(dst_val0, p2 + 4 * stride);
+ SD(dst_val1, p2 + 5 * stride);
/* strong filter ends */
} else if (flag0 == flag1) { /* weak only */
/* weak filter */
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
abs_delta0 = (v8u16) abs_delta0 < temp1;
- delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+ CLIP_SH(delta0, tc_neg, tc_pos);
- temp0 = (v8u16) (delta0 + p0_src);
- temp0 = (v8u16) CLIP_SH_0_255(temp0);
- temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ temp2 = (v8i16) (delta0 + p0_src);
+ CLIP_SH_0_255(temp2);
+ temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
(v16u8) p_is_pcm_vec);
temp2 = (v8i16) (q0_src - delta0);
- temp2 = CLIP_SH_0_255(temp2);
+ CLIP_SH_0_255(temp2);
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
(v16u8) q_is_pcm_vec);
delta1 -= (v8i16) p1_src;
delta1 += delta0;
delta1 >>= 1;
- delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+ CLIP_SH(delta1, tc_neg, tc_pos);
delta1 = (v8i16) p1_src + (v8i16) delta1;
- delta1 = CLIP_SH_0_255(delta1);
+ CLIP_SH_0_255(delta1);
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
(v16u8) p_is_pcm_vec);
delta2 = delta2 - (v8i16) q1_src;
delta2 = delta2 - delta0;
delta2 = delta2 >> 1;
- delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+ CLIP_SH(delta2, tc_neg, tc_pos);
delta2 = (v8i16) q1_src + (v8i16) delta2;
- delta2 = CLIP_SH_0_255(delta2);
+ CLIP_SH_0_255(delta2);
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
(v16u8) q_is_pcm_vec);
dst1 = __msa_bmz_v(dst1, dst3, (v16u8) cmp3);
p2 += stride;
- ST8x4_UB(dst0, dst1, p2, stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
/* weak filter ends */
} else { /* strong + weak */
/* strong filter */
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst0 = (v16u8) (temp2 + (v8i16) p2_src);
temp1 = temp0 + p2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - p1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst1 = (v16u8) (temp2 + (v8i16) p1_src);
temp1 = (temp0 << 1) + p2_src + q1_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst2 = (v16u8) (temp2 + (v8i16) p0_src);
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst5 = (v16u8) (temp2 + (v8i16) q2_src);
temp1 = temp0 + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - q1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst4 = (v16u8) (temp2 + (v8i16) q1_src);
temp1 = (temp0 << 1) + p1_src + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst3 = (v16u8) (temp2 + (v8i16) q0_src);
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
abs_delta0 = (v8u16) abs_delta0 < temp1;
- delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+ CLIP_SH(delta0, tc_neg, tc_pos);
- temp0 = (v8u16) (delta0 + p0_src);
- temp0 = (v8u16) CLIP_SH_0_255(temp0);
- temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ temp2 = (v8i16) (delta0 + p0_src);
+ CLIP_SH_0_255(temp2);
+ temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
(v16u8) p_is_pcm_vec);
temp2 = (v8i16) (q0_src - delta0);
- temp2 = CLIP_SH_0_255(temp2);
+ CLIP_SH_0_255(temp2);
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
(v16u8) q_is_pcm_vec);
delta1 -= (v8i16) p1_src;
delta1 += delta0;
delta1 >>= 1;
- delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+ CLIP_SH(delta1, tc_neg, tc_pos);
delta1 = (v8i16) p1_src + (v8i16) delta1;
- delta1 = CLIP_SH_0_255(delta1);
+ CLIP_SH_0_255(delta1);
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
(v16u8) p_is_pcm_vec);
delta2 = delta2 - (v8i16) q1_src;
delta2 = delta2 - delta0;
delta2 = delta2 >> 1;
- delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+ CLIP_SH(delta2, tc_neg, tc_pos);
delta2 = (v8i16) q1_src + (v8i16) delta2;
- delta2 = CLIP_SH_0_255(delta2);
+ CLIP_SH_0_255(delta2);
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
(v16u8) q_is_pcm_vec);
dst_val0 = __msa_copy_u_d((v2i64) dst2, 0);
dst_val1 = __msa_copy_u_d((v2i64) dst2, 1);
- ST8x4_UB(dst0, dst1, p2, stride);
- p2 += (4 * stride);
- SD(dst_val0, p2);
- p2 += stride;
- SD(dst_val1, p2);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, p2, stride);
+ SD(dst_val0, p2 + 4 * stride);
+ SD(dst_val1, p2 + 5 * stride);
}
}
}
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst0 = (v16u8) (temp2 + (v8i16) p2_src);
temp1 = temp0 + p2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - p1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst1 = (v16u8) (temp2 + (v8i16) p1_src);
temp1 = (temp0 << 1) + p2_src + q1_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst2 = (v16u8) (temp2 + (v8i16) p0_src);
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst5 = (v16u8) (temp2 + (v8i16) q2_src);
temp1 = temp0 + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - q1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst4 = (v16u8) (temp2 + (v8i16) q1_src);
temp1 = (temp0 << 1) + p1_src + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst3 = (v16u8) (temp2 + (v8i16) q0_src);
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
abs_delta0 = (v8u16) abs_delta0 < temp1;
- delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
- temp0 = (v8u16) (delta0 + p0_src);
- temp0 = (v8u16) CLIP_SH_0_255(temp0);
- temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ CLIP_SH(delta0, tc_neg, tc_pos);
+ temp2 = (v8i16) (delta0 + p0_src);
+ CLIP_SH_0_255(temp2);
+ temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
(v16u8) p_is_pcm_vec);
temp2 = (v8i16) (q0_src - delta0);
- temp2 = CLIP_SH_0_255(temp2);
+ CLIP_SH_0_255(temp2);
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
(v16u8) q_is_pcm_vec);
delta1 -= (v8i16) p1_src;
delta1 += delta0;
delta1 >>= 1;
- delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+ CLIP_SH(delta1, tc_neg, tc_pos);
delta1 = (v8i16) p1_src + (v8i16) delta1;
- delta1 = CLIP_SH_0_255(delta1);
+ CLIP_SH_0_255(delta1);
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
(v16u8) p_is_pcm_vec);
delta2 = delta2 - (v8i16) q1_src;
delta2 = delta2 - delta0;
delta2 = delta2 >> 1;
- delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+ CLIP_SH(delta2, tc_neg, tc_pos);
delta2 = (v8i16) q1_src + (v8i16) delta2;
- delta2 = CLIP_SH_0_255(delta2);
+ CLIP_SH_0_255(delta2);
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
(v16u8) q_is_pcm_vec);
temp1 = ((p3_src + p2_src) << 1) + p2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst0 = (v16u8) (temp2 + (v8i16) p2_src);
temp1 = temp0 + p2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - p1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst1 = (v16u8) (temp2 + (v8i16) p1_src);
temp1 = (temp0 << 1) + p2_src + q1_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - p0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst2 = (v16u8) (temp2 + (v8i16) p0_src);
dst0 = __msa_bmz_v(dst0, (v16u8) p2_src, (v16u8) p_is_pcm_vec);
temp1 = ((q3_src + q2_src) << 1) + q2_src + temp0;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q2_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst5 = (v16u8) (temp2 + (v8i16) q2_src);
temp1 = temp0 + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 2);
temp2 = (v8i16) (temp1 - q1_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst4 = (v16u8) (temp2 + (v8i16) q1_src);
temp1 = (temp0 << 1) + p1_src + q2_src;
temp1 = (v8u16) __msa_srari_h((v8i16) temp1, 3);
temp2 = (v8i16) (temp1 - q0_src);
- temp2 = CLIP_SH(temp2, tc_neg, tc_pos);
+ CLIP_SH(temp2, tc_neg, tc_pos);
dst3 = (v16u8) (temp2 + (v8i16) q0_src);
dst3 = __msa_bmz_v(dst3, (v16u8) q0_src, (v16u8) q_is_pcm_vec);
abs_delta0 = __msa_add_a_h(delta0, (v8i16) zero);
abs_delta0 = (v8u16) abs_delta0 < temp1;
- delta0 = CLIP_SH(delta0, tc_neg, tc_pos);
+ CLIP_SH(delta0, tc_neg, tc_pos);
- temp0 = (v8u16) (delta0 + p0_src);
- temp0 = (v8u16) CLIP_SH_0_255(temp0);
- temp0 = (v8u16) __msa_bmz_v((v16u8) temp0, (v16u8) p0_src,
+ temp2 = (v8i16) (delta0 + p0_src);
+ CLIP_SH_0_255(temp2);
+ temp0 = (v8u16) __msa_bmz_v((v16u8) temp2, (v16u8) p0_src,
(v16u8) p_is_pcm_vec);
temp2 = (v8i16) (q0_src - delta0);
- temp2 = CLIP_SH_0_255(temp2);
+ CLIP_SH_0_255(temp2);
temp2 = (v8i16) __msa_bmz_v((v16u8) temp2, (v16u8) q0_src,
(v16u8) q_is_pcm_vec);
delta1 -= (v8i16) p1_src;
delta1 += delta0;
delta1 >>= 1;
- delta1 = CLIP_SH(delta1, tc_neg, tc_pos);
+ CLIP_SH(delta1, tc_neg, tc_pos);
delta1 = (v8i16) p1_src + (v8i16) delta1;
- delta1 = CLIP_SH_0_255(delta1);
+ CLIP_SH_0_255(delta1);
delta1 = (v8i16) __msa_bmnz_v((v16u8) delta1, (v16u8) p1_src,
(v16u8) p_is_pcm_vec);
delta2 = delta2 - (v8i16) q1_src;
delta2 = delta2 - delta0;
delta2 = delta2 >> 1;
- delta2 = CLIP_SH(delta2, tc_neg, tc_pos);
+ CLIP_SH(delta2, tc_neg, tc_pos);
delta2 = (v8i16) q1_src + (v8i16) delta2;
- delta2 = CLIP_SH_0_255(delta2);
+ CLIP_SH_0_255(delta2);
delta2 = (v8i16) __msa_bmnz_v((v16u8) delta2, (v16u8) q1_src,
(v16u8) q_is_pcm_vec);
delta1 = (v8i16) __msa_bmz_v((v16u8) delta1, (v16u8) p1_src,
temp0 <<= 2;
temp0 += temp1;
delta = __msa_srari_h((v8i16) temp0, 3);
- delta = CLIP_SH(delta, tc_neg, tc_pos);
+ CLIP_SH(delta, tc_neg, tc_pos);
temp0 = (v8i16) ((v8i16) p0 + delta);
- temp0 = CLIP_SH_0_255(temp0);
+ CLIP_SH_0_255(temp0);
temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
(v16u8) p_is_pcm_vec);
temp1 = (v8i16) ((v8i16) q0 - delta);
- temp1 = CLIP_SH_0_255(temp1);
+ CLIP_SH_0_255(temp1);
temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
(v16u8) q_is_pcm_vec);
temp1 = (v8i16) __msa_bmnz_v((v16u8) temp1, (v16u8) q0, (v16u8) tc_pos);
temp0 = (v8i16) __msa_pckev_b((v16i8) temp1, (v16i8) temp0);
- ST8x2_UB(temp0, p0_ptr, stride);
+ ST_D2(temp0, 0, 1, p0_ptr, stride);
}
}
temp0 <<= 2;
temp0 += temp1;
delta = __msa_srari_h((v8i16) temp0, 3);
- delta = CLIP_SH(delta, tc_neg, tc_pos);
+ CLIP_SH(delta, tc_neg, tc_pos);
temp0 = (v8i16) ((v8i16) p0 + delta);
- temp0 = CLIP_SH_0_255(temp0);
+ CLIP_SH_0_255(temp0);
temp0 = (v8i16) __msa_bmz_v((v16u8) temp0, (v16u8) p0,
(v16u8) p_is_pcm_vec);
temp1 = (v8i16) ((v8i16) q0 - delta);
- temp1 = CLIP_SH_0_255(temp1);
+ CLIP_SH_0_255(temp1);
temp1 = (v8i16) __msa_bmz_v((v16u8) temp1, (v16u8) q0,
(v16u8) q_is_pcm_vec);
temp0 = (v8i16) __msa_ilvev_b((v16i8) temp1, (v16i8) temp0);
src += 1;
- ST2x4_UB(temp0, 0, src, stride);
- src += (4 * stride);
- ST2x4_UB(temp0, 4, src, stride);
+ ST_H8(temp0, 0, 1, 2, 3, 4, 5, 6, 7, src, stride);
}
}
LD_UB4(src, src_stride, src0, src1, src2, src3);
/* store results */
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
dst += (4 * dst_stride);
}
dst0 = (v16i8) __msa_xori_b((v16u8) dst0, 128);
/* store results */
- ST4x4_UB(dst0, dst0, 0, 1, 2, 3, dst, dst_stride);
+ ST_W4(dst0, 0, 1, 2, 3, dst, dst_stride);
}
static void hevc_sao_band_filter_8width_msa(uint8_t *dst, int32_t dst_stride,
XORI_B2_128_SB(dst0, dst1);
/* store results */
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
dst += dst_stride << 2;
}
XORI_B2_128_SB(dst0, dst1);
/* store results */
- ST8x4_UB(dst0, dst1, dst, dst_stride);
+ ST_D4(dst0, dst1, 0, 1, 0, 1, dst, dst_stride);
}
static void hevc_sao_band_filter_16multiple_msa(uint8_t *dst,
v16u8 cmp_minus10, diff_minus10, diff_minus11;
v16u8 src0, src1, dst0, src_minus10, src_minus11, src_plus10, src_plus11;
v16i8 offset, sao_offset = LD_SB(sao_offset_val);
+ v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src -= 1;
for (height -= 2; height; height -= 2) {
src += (src_stride << 1);
- SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1);
- SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10,
src_minus10, src_plus10);
dst += dst_stride;
}
- SLDI_B2_0_UB(src_minus10, src_minus11, src0, src1, 1);
- SLDI_B2_0_UB(src_minus10, src_minus11, src_plus10, src_plus11, 2);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 1, src0, src1);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_plus10, src_plus11);
PCKEV_D2_UB(src_minus11, src_minus10, src_plus11, src_plus10, src_minus10,
src_plus10);
dst_ptr = dst + v_cnt;
LD_UB4(src_minus1, src_stride, src10, src11, src12, src13);
- SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_zero0,
- src_zero1, 1);
- SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_zero2,
- src_zero3, 1);
- SLDI_B2_SB(src10, src11, src_minus10, src_minus11, src_plus10,
- src_plus11, 2);
- SLDI_B2_SB(src12, src13, src_minus12, src_minus13, src_plus12,
- src_plus13, 2);
+ SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
+ src12, src_minus12, src13, src_minus13, 1,
+ src_zero0, src_zero1, src_zero2, src_zero3);
+ SLDI_B4_SB(src10, src_minus10, src11, src_minus11,
+ src12, src_minus12, src13, src_minus13, 2,
+ src_plus10, src_plus11, src_plus12, src_plus13);
cmp_minus10 = ((v16u8) src_zero0 == src_minus10);
cmp_plus10 = ((v16u8) src_zero0 == (v16u8) src_plus10);
v16u8 src_minus11, src10, src11;
v16i8 src_plus0, src_zero0, src_plus1, src_zero1, dst0;
v8i16 offset_mask0, offset_mask1;
+ v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset);
for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1);
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
src_minus11);
dst += dst_stride;
}
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_SB(src10, src11, src_plus0, src_plus1, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus0, src_plus1);
ILVR_B2_UB(src_plus0, src_minus10, src_plus1, src_minus11, src_minus10,
src_minus11);
v16u8 src_minus10, src10, src_minus11, src11;
v16i8 src_zero0, src_plus10, src_zero1, src_plus11, dst0;
v8i16 offset_mask0, offset_mask1;
+ v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src_orig = src - 1;
for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1);
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11,
src_minus10, src_minus11);
dst += dst_stride;
}
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_SB(src10, src11, src_plus10, src_plus11, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_SB(zeros, src10, zeros, src11, 2, src_plus10, src_plus11);
ILVR_B2_UB(src_plus10, src_minus10, src_plus11, src_minus11, src_minus10,
src_minus11);
ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
src_plus13 = LD_UB(src + 1 + v_cnt + (src_stride << 2));
src_orig += 16;
- SLDI_B2_SB(src10, src11, src_minus11, src_minus12, src_zero0,
- src_zero1, 1);
- SLDI_B2_SB(src12, src13, src_minus13, src_minus14, src_zero2,
- src_zero3, 1);
- SLDI_B2_SB(src11, src12, src_minus12, src_minus13, src_plus10,
- src_plus11, 2);
+ SLDI_B4_SB(src10, src_minus11, src11, src_minus12,
+ src12, src_minus13, src13, src_minus14, 1,
+ src_zero0, src_zero1, src_zero2, src_zero3);
+ SLDI_B2_SB(src11, src_minus12, src12, src_minus13, 2, src_plus10,
+ src_plus11);
src_plus12 = __msa_sldi_b((v16i8) src13, (v16i8) src_minus14, 2);
v16u8 cmp_minus10, diff_minus10, cmp_minus11, diff_minus11;
v16u8 src_minus10, src10, src_minus11, src11;
v8i16 offset_mask0, offset_mask1;
+ v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src_orig = src - 1;
for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1);
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11);
dst += dst_stride;
}
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11);
v16u8 src_minus10, src10, src_minus11, src11;
v16i8 src_zero0, src_zero1, dst0;
v8i16 offset_mask0, offset_mask1;
+ v16i8 zeros = { 0 };
sao_offset = __msa_pckev_b(sao_offset, sao_offset);
src_orig = src - 1;
for (height -= 2; height; height -= 2) {
src_orig += (src_stride << 1);
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11);
ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
dst += dst_stride;
}
- SLDI_B2_0_SB(src_minus11, src10, src_zero0, src_zero1, 1);
- SLDI_B2_0_UB(src_minus10, src_minus11, src_minus10, src_minus11, 2);
+ SLDI_B2_SB(zeros, src_minus11, zeros, src10, 1, src_zero0, src_zero1);
+ SLDI_B2_UB(zeros, src_minus10, zeros, src_minus11, 2, src_minus10, src_minus11);
ILVR_B2_UB(src10, src_minus10, src11, src_minus11, src_minus10,
src_minus11);
ILVR_B2_SB(src_zero0, src_zero0, src_zero1, src_zero1, src_zero0,
int16_t *sao_offset_val,
int eo, int width, int height)
{
- ptrdiff_t stride_src = (2 * 64 + 32) / sizeof(uint8_t);
+ ptrdiff_t stride_src = (2 * MAX_PB_SIZE + AV_INPUT_BUFFER_PADDING_SIZE) / sizeof(uint8_t);
switch (eo) {
case 0: