X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavcodec%2Fmips%2Fvp8_mc_msa.c;h=57af6b45f197f10bec585e10e4c1f107ba928aae;hb=153c60752558369b98dce0b7a0ca7acc687fa630;hp=2bf0abd8c9288f871d7ec8adf24a5d08d7b57631;hpb=e645d7a6d452df83cedcbb1d6708429ceea156da;p=ffmpeg diff --git a/libavcodec/mips/vp8_mc_msa.c b/libavcodec/mips/vp8_mc_msa.c index 2bf0abd8c92..57af6b45f19 100644 --- a/libavcodec/mips/vp8_mc_msa.c +++ b/libavcodec/mips/vp8_mc_msa.c @@ -181,7 +181,7 @@ static void common_hz_6t_4x4_msa(uint8_t *src, int32_t src_stride, SRARI_H2_SH(out0, out1, 7); SAT_SH2_SH(out0, out1, 7); out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, @@ -214,10 +214,9 @@ static void common_hz_6t_4x8_msa(uint8_t *src, int32_t src_stride, SRARI_H4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); out = PCKEV_XORI128_UB(out2, out3); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); } void ff_put_vp8_epel4_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, @@ -263,7 +262,7 @@ void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); for (loop_cnt = (height >> 2) - 1; loop_cnt--;) { @@ -276,7 +275,7 @@ void ff_put_vp8_epel8_h6_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -368,7 +367,7 @@ void ff_put_vp8_epel4_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, SRARI_H2_SH(out10, out32, 7); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); src2110 = src6554; @@ -416,7 +415,7 @@ void ff_put_vp8_epel8_v6_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); src10_r = src76_r; @@ -567,7 +566,7 @@ void ff_put_vp8_epel4_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, SRARI_H2_SH(tmp0, tmp1, 7); SAT_SH2_SH(tmp0, tmp1, 7); out = PCKEV_XORI128_UB(tmp0, tmp1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out3 = hz_out7; @@ -651,7 +650,7 @@ void ff_put_vp8_epel8_h6v6_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); vec0 = PCKEV_XORI128_UB(tmp0, tmp1); vec1 = PCKEV_XORI128_UB(tmp2, tmp3); - ST8x4_UB(vec0, vec1, dst, dst_stride); + ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); hz_out4 = hz_out8; @@ -702,7 +701,7 @@ static void common_hz_4t_4x4_msa(uint8_t *src, int32_t src_stride, SRARI_H2_SH(out0, out1, 7); SAT_SH2_SH(out0, out1, 7); out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); } static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, @@ -735,10 +734,9 @@ static void common_hz_4t_4x8_msa(uint8_t *src, int32_t src_stride, SRARI_H4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); - dst += (4 * dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); out = PCKEV_XORI128_UB(out2, out3); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); } static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, @@ -769,10 +767,10 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, SRARI_H4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); out = PCKEV_XORI128_UB(out2, out3); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); LD_SB8(src, src_stride, src0, src1, src2, src3, src4, src5, src6, src7); @@ -785,10 +783,10 @@ static void common_hz_4t_4x16_msa(uint8_t *src, int32_t src_stride, SRARI_H4_SH(out0, out1, out2, out3, 7); SAT_SH4_SH(out0, out1, out2, out3, 7); out = PCKEV_XORI128_UB(out0, out1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); out = PCKEV_XORI128_UB(out2, out3); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); } void ff_put_vp8_epel4_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, @@ -836,7 +834,7 @@ void ff_put_vp8_epel8_h4_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(out0, out1, out2, out3, 7); tmp0 = PCKEV_XORI128_UB(out0, out1); tmp1 = PCKEV_XORI128_UB(out2, out3); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -932,7 +930,7 @@ void ff_put_vp8_epel4_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, SRARI_H2_SH(out10, out32, 7); SAT_SH2_SH(out10, out32, 7); out = PCKEV_XORI128_UB(out10, out32); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); } } @@ -974,7 +972,7 @@ void ff_put_vp8_epel8_v4_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(out0_r, out1_r, out2_r, out3_r, 7); tmp0 = PCKEV_XORI128_UB(out0_r, out1_r); tmp1 = PCKEV_XORI128_UB(out2_r, out3_r); - ST8x4_UB(tmp0, tmp1, dst, dst_stride); + ST_D4(tmp0, tmp1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); src10_r = src98_r; @@ -1093,7 +1091,7 @@ void ff_put_vp8_epel4_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, SRARI_H2_SH(tmp0, tmp1, 7); SAT_SH2_SH(tmp0, tmp1, 7); out = PCKEV_XORI128_UB(tmp0, tmp1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out1 = hz_out5; @@ -1160,7 +1158,7 @@ void ff_put_vp8_epel8_h4v4_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); out0 = PCKEV_XORI128_UB(tmp0, tmp1); out1 = PCKEV_XORI128_UB(tmp2, tmp3); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); vec0 = vec4; @@ -1240,7 +1238,8 @@ void ff_put_vp8_epel4_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH2_SH(tmp0, tmp1, 7); PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); XORI_B2_128_UB(res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + ST_W2(res0, 0, 1, dst, dst_stride); + ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); dst += (4 * dst_stride); hz_out1 = hz_out5; @@ -1316,7 +1315,7 @@ void ff_put_vp8_epel8_h6v4_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); out0 = PCKEV_XORI128_UB(tmp0, tmp1); out1 = PCKEV_XORI128_UB(tmp2, tmp3); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); } } @@ -1391,7 +1390,7 @@ void ff_put_vp8_epel4_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, SRARI_H2_SH(tmp0, tmp1, 7); SAT_SH2_SH(tmp0, tmp1, 7); out = PCKEV_XORI128_UB(tmp0, tmp1); - ST4x4_UB(out, out, 0, 1, 2, 3, dst, dst_stride); + ST_W4(out, 0, 1, 2, 3, dst, dst_stride); dst += (4 * dst_stride); hz_out3 = hz_out7; @@ -1464,7 +1463,7 @@ void ff_put_vp8_epel8_h4v6_msa(uint8_t *dst, ptrdiff_t dst_stride, SAT_SH4_SH(tmp0, tmp1, tmp2, tmp3, 7); vec0 = PCKEV_XORI128_UB(tmp0, tmp1); vec1 = PCKEV_XORI128_UB(tmp2, tmp3); - ST8x4_UB(vec0, vec1, dst, dst_stride); + ST_D4(vec0, vec1, 0, 1, 0, 1, dst, dst_stride); dst += (4 * dst_stride); hz_out4 = hz_out8; @@ -1509,7 +1508,8 @@ static void common_hz_2t_4x4_msa(uint8_t *src, int32_t src_stride, DOTP_UB2_UH(vec0, vec1, filt0, filt0, vec2, vec3); SRARI_H2_UH(vec2, vec3, 7); PCKEV_B2_UB(vec2, vec2, vec3, vec3, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + ST_W2(res0, 0, 1, dst, dst_stride); + ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); } static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, @@ -1535,9 +1535,10 @@ static void common_hz_2t_4x8_msa(uint8_t *src, int32_t src_stride, SRARI_H4_UH(vec4, vec5, vec6, vec7, 7); PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + ST_W2(res0, 0, 1, dst, dst_stride); + ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); + ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride); + ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride); } void ff_put_vp8_bilinear4_h_msa(uint8_t *dst, ptrdiff_t dst_stride, @@ -1574,7 +1575,7 @@ static void common_hz_2t_8x4_msa(uint8_t *src, int32_t src_stride, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); PCKEV_B2_SB(vec1, vec0, vec3, vec2, src0, src1); - ST8x4_UB(src0, src1, dst, dst_stride); + ST_D4(src0, src1, 0, 1, 0, 1, dst, dst_stride); } static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, @@ -1604,8 +1605,7 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, src += (4 * src_stride); PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); @@ -1613,8 +1613,8 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); + dst += (8 * dst_stride); if (16 == height) { LD_SB4(src, src_stride, src0, src1, src2, src3); @@ -1629,7 +1629,7 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, src += (4 * src_stride); PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); VSHF_B2_UH(src0, src0, src1, src1, mask, mask, vec0, vec1); VSHF_B2_UH(src2, src2, src3, src3, mask, mask, vec2, vec3); @@ -1637,7 +1637,7 @@ static void common_hz_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, vec0, vec1, vec2, vec3); SRARI_H4_UH(vec0, vec1, vec2, vec3, 7); PCKEV_B2_SB(vec1, vec0, vec3, vec2, out0, out1); - ST8x4_UB(out0, out1, dst + 4 * dst_stride, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); } } @@ -1745,7 +1745,7 @@ static void common_vt_2t_4x4_msa(uint8_t *src, int32_t src_stride, SRARI_H2_UH(tmp0, tmp1, 7); SAT_UH2_UH(tmp0, tmp1, 7); src2110 = __msa_pckev_b((v16i8) tmp1, (v16i8) tmp0); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); + ST_W4(src2110, 0, 1, 2, 3, dst, dst_stride); } static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, @@ -1779,8 +1779,7 @@ static void common_vt_2t_4x8_msa(uint8_t *src, int32_t src_stride, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, src2110, src4332); - ST4x4_UB(src2110, src2110, 0, 1, 2, 3, dst, dst_stride); - ST4x4_UB(src4332, src4332, 0, 1, 2, 3, dst + 4 * dst_stride, dst_stride); + ST_W8(src2110, src4332, 0, 1, 2, 3, 0, 1, 2, 3, dst, dst_stride); } void ff_put_vp8_bilinear4_v_msa(uint8_t *dst, ptrdiff_t dst_stride, @@ -1817,7 +1816,7 @@ static void common_vt_2t_8x4_msa(uint8_t *src, int32_t src_stride, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); } static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, @@ -1851,16 +1850,15 @@ static void common_vt_2t_8x8mult_msa(uint8_t *src, int32_t src_stride, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); DOTP_UB4_UH(vec4, vec5, vec6, vec7, filt0, filt0, filt0, filt0, tmp0, tmp1, tmp2, tmp3); SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); + dst += (8 * dst_stride); src0 = src8; } @@ -1964,7 +1962,8 @@ static void common_hv_2ht_2vt_4x4_msa(uint8_t *src, int32_t src_stride, SRARI_H2_UH(tmp0, tmp1, 7); SAT_UH2_UH(tmp0, tmp1, 7); PCKEV_B2_UB(tmp0, tmp0, tmp1, tmp1, res0, res1); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); + ST_W2(res0, 0, 1, dst, dst_stride); + ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); } static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, @@ -2008,9 +2007,10 @@ static void common_hv_2ht_2vt_4x8_msa(uint8_t *src, int32_t src_stride, SAT_UH4_UH(vec4, vec5, vec6, vec7, 7); PCKEV_B4_SB(vec4, vec4, vec5, vec5, vec6, vec6, vec7, vec7, res0, res1, res2, res3); - ST4x4_UB(res0, res1, 0, 1, 0, 1, dst, dst_stride); - dst += (4 * dst_stride); - ST4x4_UB(res2, res3, 0, 1, 0, 1, dst, dst_stride); + ST_W2(res0, 0, 1, dst, dst_stride); + ST_W2(res1, 0, 1, dst + 2 * dst_stride, dst_stride); + ST_W2(res2, 0, 1, dst + 4 * dst_stride, dst_stride); + ST_W2(res3, 0, 1, dst + 6 * dst_stride, dst_stride); } void ff_put_vp8_bilinear4_hv_msa(uint8_t *dst, ptrdiff_t dst_stride, @@ -2070,7 +2070,7 @@ static void common_hv_2ht_2vt_8x4_msa(uint8_t *src, int32_t src_stride, SRARI_H4_UH(tmp0, tmp1, tmp2, tmp3, 7); SAT_UH4_UH(tmp0, tmp1, tmp2, tmp3, 7); PCKEV_B2_SB(tmp1, tmp0, tmp3, tmp2, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); } static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, @@ -2127,8 +2127,7 @@ static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, SRARI_H2_UH(tmp3, tmp4, 7); SAT_UH2_UH(tmp3, tmp4, 7); PCKEV_B2_SB(tmp2, tmp1, tmp4, tmp3, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst, dst_stride); hz_out1 = HORIZ_2TAP_FILT_UH(src1, src1, mask, filt_hz, 7); vec0 = (v16u8) __msa_ilvev_b((v16i8) hz_out1, (v16i8) hz_out0); @@ -2149,8 +2148,8 @@ static void common_hv_2ht_2vt_8x8mult_msa(uint8_t *src, int32_t src_stride, SRARI_H4_UH(tmp5, tmp6, tmp7, tmp8, 7); SAT_UH4_UH(tmp5, tmp6, tmp7, tmp8, 7); PCKEV_B2_SB(tmp6, tmp5, tmp8, tmp7, out0, out1); - ST8x4_UB(out0, out1, dst, dst_stride); - dst += (4 * dst_stride); + ST_D4(out0, out1, 0, 1, 0, 1, dst + 4 * dst_stride, dst_stride); + dst += (8 * dst_stride); } }