ILVRL_H2_SH(vec1, vec0, vec2, vec3);
src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
}
void ff_loop_filter_h_44_16_msa(uint8_t *src, ptrdiff_t pitch,
src -= 2;
- ST4x8_UB(tmp2, tmp3, src, pitch);
- src += (8 * pitch);
- ST4x8_UB(tmp4, tmp5, src, pitch);
+ ST_W8(tmp2, tmp3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(tmp4, tmp5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
}
void ff_loop_filter_h_8_8_msa(uint8_t *src, ptrdiff_t pitch,
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
src -= 2;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- src += 4 * pitch;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
vec4 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
src -= 3;
- ST4x4_UB(vec2, vec2, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 0, src + 4, pitch);
+ ST_W4(vec2, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec4, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec4, 4, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec4, 4, 5, 6, 7, src + 4, pitch);
}
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
zero, q2, zero, q3, p3_r, p2_r, p1_r, p0_r, q0_r, q1_r, q2_r,
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src -= 2;
- ST4x8_UB(vec2, vec3, src, pitch);
- src += 8 * pitch;
- ST4x8_UB(vec4, vec5, src, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src + 8 * pitch, pitch);
} else {
ILVL_B4_UH(zero, p3, zero, p2, zero, p1, zero, p0, p3_l, p2_l, p1_l,
p0_l);
ILVRL_B2_SH(q2, q1, vec2, vec5);
src -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 0, src + 4, pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec2, 4, src + 4, pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 0, src + 4, pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, src + 4, pitch);
src += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src, pitch);
- ST2x4_UB(vec5, 4, src + 4, pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, src + 4, pitch);
}
}
v16u8 p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org;
v16i8 tmp0, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7;
v16u8 p7, p6, p5, p4, p3, p2, p1, p0, q0, q1, q2, q3, q4, q5, q6, q7;
+ v16i8 zeros = { 0 };
LD_UB8(input, in_pitch,
p7_org, p6_org, p5_org, p4_org, p3_org, p2_org, p1_org, p0_org);
ILVL_B2_SB(tmp1, tmp0, tmp3, tmp2, tmp5, tmp7);
ILVR_W2_UB(tmp6, tmp4, tmp7, tmp5, q0, q4);
ILVL_W2_UB(tmp6, tmp4, tmp7, tmp5, q2, q6);
- SLDI_B4_0_UB(q0, q2, q4, q6, q1, q3, q5, q7, 8);
+ SLDI_B4_UB(zeros, q0, zeros, q2, zeros, q4, zeros, q6, 8, q1, q3, q5, q7);
ST_UB8(p7, p6, p5, p4, p3, p2, p1, p0, output, out_pitch);
output += (8 * out_pitch);
if (__msa_test_bz_v(flat)) {
ILVR_B2_SH(p0_out, p1_out, q1_out, q0_out, vec0, vec1);
ILVRL_H2_SH(vec1, vec0, vec2, vec3);
- ST4x8_UB(vec2, vec3, (src_org - 2), pitch_org);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, (src_org - 2), pitch_org);
return 1;
} else {
ILVR_B8_UH(zero, p3, zero, p2, zero, p1, zero, p0, zero, q0, zero, q1,
vec2 = (v8i16) __msa_ilvr_b((v16i8) q2, (v16i8) q1);
src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
return 1;
} else {
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p6 = __msa_bmnz_v(p6, (v16u8) r_out, flat2);
- ST8x1_UB(p6, src);
+ ST_D1(p6, 0, src);
src += 16;
/* p5 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p5 = __msa_bmnz_v(p5, (v16u8) r_out, flat2);
- ST8x1_UB(p5, src);
+ ST_D1(p5, 0, src);
src += 16;
/* p4 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p4 = __msa_bmnz_v(p4, (v16u8) r_out, flat2);
- ST8x1_UB(p4, src);
+ ST_D1(p4, 0, src);
src += 16;
/* p3 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
p3 = __msa_bmnz_v(p3, (v16u8) r_out, flat2);
- ST8x1_UB(p3, src);
+ ST_D1(p3, 0, src);
src += 16;
/* p2 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* p1 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* p0 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q0 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q1 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q2 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
filter8 = __msa_bmnz_v(filter8, (v16u8) r_out, flat2);
- ST8x1_UB(filter8, src);
+ ST_D1(filter8, 0, src);
src += 16;
/* q3 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q3 = __msa_bmnz_v(q3, (v16u8) r_out, flat2);
- ST8x1_UB(q3, src);
+ ST_D1(q3, 0, src);
src += 16;
/* q4 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q4 = __msa_bmnz_v(q4, (v16u8) r_out, flat2);
- ST8x1_UB(q4, src);
+ ST_D1(q4, 0, src);
src += 16;
/* q5 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q5 = __msa_bmnz_v(q5, (v16u8) r_out, flat2);
- ST8x1_UB(q5, src);
+ ST_D1(q5, 0, src);
src += 16;
/* q6 */
r_out = __msa_srari_h((v8i16) tmp1_r, 4);
r_out = (v8i16) __msa_pckev_b((v16i8) r_out, (v16i8) r_out);
q6 = __msa_bmnz_v(q6, (v16u8) r_out, flat2);
- ST8x1_UB(q6, src);
+ ST_D1(q6, 0, src);
return 0;
}
ILVRL_H2_SH(vec1, vec0, vec4, vec5);
src_org -= 2;
- ST4x8_UB(vec2, vec3, src_org, pitch);
- src_org += 8 * pitch;
- ST4x8_UB(vec4, vec5, src_org, pitch);
+ ST_W8(vec2, vec3, 0, 1, 2, 3, 0, 1, 2, 3, src_org, pitch);
+ ST_W8(vec4, vec5, 0, 1, 2, 3, 0, 1, 2, 3, src_org + 8 * pitch, pitch);
return 1;
} else {
ILVRL_B2_SH(q2, q1, vec2, vec5);
src_org -= 3;
- ST4x4_UB(vec3, vec3, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 0, (src_org + 4), pitch);
+ ST_W4(vec3, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec4, vec4, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec2, 4, (src_org + 4), pitch);
+ ST_W4(vec4, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec2, 4, 5, 6, 7, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec6, vec6, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 0, (src_org + 4), pitch);
+ ST_W4(vec6, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec5, 0, 1, 2, 3, (src_org + 4), pitch);
src_org += (4 * pitch);
- ST4x4_UB(vec7, vec7, 0, 1, 2, 3, src_org, pitch);
- ST2x4_UB(vec5, 4, (src_org + 4), pitch);
+ ST_W4(vec7, 0, 1, 2, 3, src_org, pitch);
+ ST_H4(vec5, 4, 5, 6, 7, (src_org + 4), pitch);
return 1;
} else {