avutil/mips: refactor msa SLDI_Bn_0 and SLDI_Bn macros.

[ffmpeg] / libavcodec / mips / vp3dsp_idct_msa.c
diff --git a/libavcodec/mips/vp3dsp_idct_msa.c b/libavcodec/mips/vp3dsp_idct_msa.c
index 5427ac559bcdf8f302321b573d6d059f9c96bd2b..90c578f13487aacb238e707bb13d65bf1375d0c5 100644
--- a/libavcodec/mips/vp3dsp_idct_msa.c
+++ b/libavcodec/mips/vp3dsp_idct_msa.c
@@ -47,7 +47,6 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
      v4i32 cnst8w = {8, 8, 8, 8};
      v4i32 cnst2048w = {2048, 2048, 2048, 2048};
      v4i32 cnst128w = {128, 128, 128, 128};
-    int nstride = stride;
  
      /* Extended input data */
      LD_SH8(input, 8, r0, r1, r2, r3, r4, r5, r6, r7);
@@ -188,14 +187,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
          G += c5;
          H += c6;
      }
-    A = CLIP_SW_0_255(A);
-    B = CLIP_SW_0_255(B);
-    C = CLIP_SW_0_255(C);
-    D = CLIP_SW_0_255(D);
-    E = CLIP_SW_0_255(E);
-    F = CLIP_SW_0_255(F);
-    G = CLIP_SW_0_255(G);
-    H = CLIP_SW_0_255(H);
+    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
      sign_l = __msa_or_v((v16u8)r1_r, (v16u8)r2_r);
      sign_l = __msa_or_v(sign_l, (v16u8)r3_r);
      sign_l = __msa_or_v(sign_l, (v16u8)r0_l);
@@ -206,7 +198,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
      Add = ((r0_r * cnst46341w) + (8 << 16)) >> 20;
      if (type == 1) {
          Bdd = Add + cnst128w;
-        Bdd = CLIP_SW_0_255(Bdd);
+        CLIP_SW_0_255(Bdd);
          Ad = Bdd;
          Bd = Bdd;
          Cd = Bdd;
@@ -224,14 +216,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
          Fd = Add + c5;
          Gd = Add + c6;
          Hd = Add + c7;
-        Ad = CLIP_SW_0_255(Ad);
-        Bd = CLIP_SW_0_255(Bd);
-        Cd = CLIP_SW_0_255(Cd);
-        Dd = CLIP_SW_0_255(Dd);
-        Ed = CLIP_SW_0_255(Ed);
-        Fd = CLIP_SW_0_255(Fd);
-        Gd = CLIP_SW_0_255(Gd);
-        Hd = CLIP_SW_0_255(Hd);
+        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
      }
      Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
      Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
@@ -310,14 +295,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
          G += c5;
          H += c6;
      }
-    A = CLIP_SW_0_255(A);
-    B = CLIP_SW_0_255(B);
-    C = CLIP_SW_0_255(C);
-    D = CLIP_SW_0_255(D);
-    E = CLIP_SW_0_255(E);
-    F = CLIP_SW_0_255(F);
-    G = CLIP_SW_0_255(G);
-    H = CLIP_SW_0_255(H);
+    CLIP_SW8_0_255(A, B, C, D, E, F, G, H);
      sign_l = __msa_or_v((v16u8)r5_r, (v16u8)r6_r);
      sign_l = __msa_or_v(sign_l, (v16u8)r7_r);
      sign_l = __msa_or_v(sign_l, (v16u8)r4_l);
@@ -328,7 +306,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
      Add = ((r4_r * cnst46341w) + (8 << 16)) >> 20;
      if (type == 1) {
          Bdd = Add + cnst128w;
-        Bdd = CLIP_SW_0_255(Bdd);
+        CLIP_SW_0_255(Bdd);
          Ad = Bdd;
          Bd = Bdd;
          Cd = Bdd;
@@ -346,14 +324,7 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
          Fd = Add + c5;
          Gd = Add + c6;
          Hd = Add + c7;
-        Ad = CLIP_SW_0_255(Ad);
-        Bd = CLIP_SW_0_255(Bd);
-        Cd = CLIP_SW_0_255(Cd);
-        Dd = CLIP_SW_0_255(Dd);
-        Ed = CLIP_SW_0_255(Ed);
-        Fd = CLIP_SW_0_255(Fd);
-        Gd = CLIP_SW_0_255(Gd);
-        Hd = CLIP_SW_0_255(Hd);
+        CLIP_SW8_0_255(Ad, Bd, Cd, Dd, Ed, Fd, Gd, Hd);
      }
      Ad = (v4i32)__msa_and_v((v16u8)Ad, (v16u8)sign_t);
      Bd = (v4i32)__msa_and_v((v16u8)Bd, (v16u8)sign_t);
@@ -386,20 +357,14 @@ static void idct_msa(uint8_t *dst, int stride, int16_t *input, int type)
      VSHF_B2_SB(r2_l, r6_l, r3_l, r7_l, mask, mask, d6, d7);
  
      /* Final sequence of operations over-write original dst */
-    ST8x1_UB(d0, dst);
-    ST8x1_UB(d1, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d2, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d3, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d4, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d5, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d6, dst + nstride);
-    nstride += stride;
-    ST8x1_UB(d7, dst + nstride);
+    ST_D1(d0, 0, dst);
+    ST_D1(d1, 0, dst + stride);
+    ST_D1(d2, 0, dst + 2 * stride);
+    ST_D1(d3, 0, dst + 3 * stride);
+    ST_D1(d4, 0, dst + 4 * stride);
+    ST_D1(d5, 0, dst + 5 * stride);
+    ST_D1(d6, 0, dst + 6 * stride);
+    ST_D1(d7, 0, dst + 7 * stride);
  }
  
  void ff_vp3_idct_put_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
@@ -424,7 +389,6 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
      v4i32 r0, r1, r2, r3, r4, r5, r6, r7;
      v16i8 mask = {0, 4, 8, 12, 16, 20, 24, 28, 0, 0, 0, 0, 0, 0, 0, 0};
      v16i8 zero = {0};
-    int nstride = line_size;
  
      LD_SB8(dest, line_size, d0, d1, d2, d3, d4, d5, d6, d7);
      ILVR_B4_SW(zero, d0, zero, d1, zero, d2, zero, d3,
@@ -444,14 +408,7 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
      e5 += dc;
      e6 += dc;
      e7 += dc;
-    e0 = CLIP_SW_0_255(e0);
-    e1 = CLIP_SW_0_255(e1);
-    e2 = CLIP_SW_0_255(e2);
-    e3 = CLIP_SW_0_255(e3);
-    e4 = CLIP_SW_0_255(e4);
-    e5 = CLIP_SW_0_255(e5);
-    e6 = CLIP_SW_0_255(e6);
-    e7 = CLIP_SW_0_255(e7);
+    CLIP_SW8_0_255(e0, e1, e2, e3, e4, e5, e6, e7);
  
      /* Left part */
      ILVL_H4_SW(zero, c0, zero, c1, zero, c2, zero, c3,
@@ -466,34 +423,21 @@ void ff_vp3_idct_dc_add_msa(uint8_t *dest, ptrdiff_t line_size, int16_t *block)
      r5 += dc;
      r6 += dc;
      r7 += dc;
-    r0 = CLIP_SW_0_255(r0);
-    r1 = CLIP_SW_0_255(r1);
-    r2 = CLIP_SW_0_255(r2);
-    r3 = CLIP_SW_0_255(r3);
-    r4 = CLIP_SW_0_255(r4);
-    r5 = CLIP_SW_0_255(r5);
-    r6 = CLIP_SW_0_255(r6);
-    r7 = CLIP_SW_0_255(r7);
+    CLIP_SW8_0_255(r0, r1, r2, r3, r4, r5, r6, r7);
      VSHF_B2_SB(e0, r0, e1, r1, mask, mask, d0, d1);
      VSHF_B2_SB(e2, r2, e3, r3, mask, mask, d2, d3);
      VSHF_B2_SB(e4, r4, e5, r5, mask, mask, d4, d5);
      VSHF_B2_SB(e6, r6, e7, r7, mask, mask, d6, d7);
  
      /* Final sequence of operations over-write original dst */
-    ST8x1_UB(d0, dest);
-    ST8x1_UB(d1, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d2, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d3, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d4, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d5, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d6, dest + nstride);
-    nstride += line_size;
-    ST8x1_UB(d7, dest + nstride);
+    ST_D1(d0, 0, dest);
+    ST_D1(d1, 0, dest + line_size);
+    ST_D1(d2, 0, dest + 2 * line_size);
+    ST_D1(d3, 0, dest + 3 * line_size);
+    ST_D1(d4, 0, dest + 4 * line_size);
+    ST_D1(d5, 0, dest + 5 * line_size);
+    ST_D1(d6, 0, dest + 6 * line_size);
+    ST_D1(d7, 0, dest + 7 * line_size);
  
      block[0] = 0;
  }
@@ -530,15 +474,12 @@ void ff_vp3_v_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
      f1 += e1;
      g0 -= e0;
      g1 -= e1;
-    f0 = CLIP_SW_0_255(f0);
-    f1 = CLIP_SW_0_255(f1);
-    g0 = CLIP_SW_0_255(g0);
-    g1 = CLIP_SW_0_255(g1);
+    CLIP_SW4_0_255(f0, f1, g0, g1);
      VSHF_B2_SB(f0, f1, g0, g1, mask, mask, d1, d2);
  
      /* Final move to first_pixel */
-    ST8x1_UB(d1, first_pixel + nstride);
-    ST8x1_UB(d2, first_pixel);
+    ST_D1(d1, 0, first_pixel + nstride);
+    ST_D1(d2, 0, first_pixel);
  }
  
  void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
@@ -577,14 +518,11 @@ void ff_vp3_h_loop_filter_msa(uint8_t *first_pixel, ptrdiff_t stride,
      f1 += e1;
      g0 -= e0;
      g1 -= e1;
-    f0 = CLIP_SW_0_255(f0);
-    f1 = CLIP_SW_0_255(f1);
-    g0 = CLIP_SW_0_255(g0);
-    g1 = CLIP_SW_0_255(g1);
+    CLIP_SW4_0_255(f0, f1, g0, g1);
      VSHF_B2_SB(f0, g0, f1, g1, mask, mask, d1, d2);
      /* Final move to first_pixel */
-    ST2x4_UB(d1, 0, first_pixel - 1, stride);
-    ST2x4_UB(d2, 0, first_pixel - 1 + 4 * stride, stride);
+    ST_H4(d1, 0, 1, 2, 3, first_pixel - 1, stride);
+    ST_H4(d2, 0, 1, 2, 3, first_pixel - 1 + 4 * stride, stride);
  }
  
  void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
@@ -641,10 +579,8 @@ void ff_put_no_rnd_pixels_l2_msa(uint8_t *dst, const uint8_t *src1,
          f2 = (v4i32) __msa_and_v((v16u8)a3, (v16u8)b3);
          t3 = t3 + (v4u32)f2;
  
-        ST4x4_UB(t0, t0, 0, 1, 2, 3, dst, stride);
-        ST4x4_UB(t1, t1, 0, 1, 2, 3, dst + 4 * stride, stride);
-        ST4x4_UB(t2, t2, 0, 1, 2, 3, dst + 4, stride);
-        ST4x4_UB(t3, t3, 0, 1, 2, 3, dst + 4 + 4 * stride, stride);
+        ST_W8(t0, t1, 0, 1, 2, 3, 0, 1, 2, 3, dst, stride);
+        ST_W8(t2, t3, 0, 1, 2, 3, 0, 1, 2, 3, dst + 4, stride);
      } else {
          int i;