git.sesse.net Git - ffmpeg/commitdiff
arm/aarch64: vp9: Fix vertical alignment
author Martin Storsjö <martin@martin.st>
Sun, 8 Jan 2017 22:04:19 +0000 (00:04 +0200)
committer Martin Storsjö <martin@martin.st>
Sun, 19 Mar 2017 20:53:32 +0000 (22:53 +0200)
Align the second/third operands as they usually are.

Due to the wildly varying sizes of the written out operands
in aarch64 assembly, the column alignment is usually not as clear
as in arm assembly.

This is cherrypicked from libav commit
7995ebfad12002033c73feed422a1cfc62081e8f.

Signed-off-by: Martin Storsjö <martin@martin.st>
libavcodec/aarch64/vp9itxfm_neon.S
libavcodec/arm/vp9itxfm_neon.S
libavcodec/arm/vp9lpf_neon.S

index 3e5da0880c7f2ec809e74e705d95d36c5a392c78..b12890f0db3665ff658de4eff99e6524560b3b21 100644 (file)
@@ -380,7 +380,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
         movrel          x4,  idct_coeffs
 .else
-        movrel          x4, iadst8_coeffs
+        movrel          x4,  iadst8_coeffs
         ld1             {v1.8h}, [x4], #16
 .endif
         ld1             {v0.8h}, [x4]
@@ -480,23 +480,23 @@ itxfm_func8x8 iadst, iadst
 
 
 function idct16x16_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
         ld1             {v0.4h}, [x4]
 
-        movi            v1.4h, #0
+        movi            v1.4h,  #0
 
         ld1             {v2.h}[0], [x2]
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
         dup             v2.8h,  v2.h[0]
         st1             {v1.h}[0], [x2]
 
-        srshr           v2.8h, v2.8h, #6
+        srshr           v2.8h,  v2.8h,  #6
 
-        mov             x3, x0
-        mov             x4, #16
+        mov             x3,  x0
+        mov             x4,  #16
 1:
         // Loop to add the constant from v2 into all 16x16 outputs
         subs            x4,  x4,  #2
@@ -869,7 +869,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
         ld1             {v0.8h,v1.8h}, [x10]
 .endif
-        mov             x9, #32
+        mov             x9,  #32
 
 .ifc \txfm1\()_\txfm2,idct_idct
         cmp             w3,  #10
@@ -1046,10 +1046,10 @@ idct16_partial quarter
 idct16_partial half
 
 function idct32x32_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
         ld1             {v0.4h}, [x4]
 
-        movi            v1.4h, #0
+        movi            v1.4h,  #0
 
         ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h,  v0.h[0]
@@ -1059,10 +1059,10 @@ function idct32x32_dc_add_neon
         dup             v2.8h,  v2.h[0]
         st1             {v1.h}[0], [x2]
 
-        srshr           v0.8h, v2.8h, #6
+        srshr           v0.8h,  v2.8h,  #6
 
-        mov             x3, x0
-        mov             x4, #32
+        mov             x3,  x0
+        mov             x4,  #32
 1:
         // Loop to add the constant v0 into all 32x32 outputs
         subs            x4,  x4,  #2
@@ -1230,7 +1230,7 @@ endfunc
 // x9 = double input stride
 function idct32_1d_8x32_pass1\suffix\()_neon
         mov             x14, x30
-        movi            v2.8h, #0
+        movi            v2.8h,  #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .ifb \suffix
@@ -1295,7 +1295,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
 .endif
         add             x2,  x2,  #64
 
-        movi            v2.8h, #0
+        movi            v2.8h,  #0
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
index 6d4d765c28571f2aae7ed42b466842d37077e550..6c09922caee88dc2b2fd6e2d90781f0ec38c6544 100644 (file)
@@ -530,7 +530,7 @@ function idct16x16_dc_add_neon
         movrel          r12, idct_coeffs
         vld1.16         {d0}, [r12,:64]
 
-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0
 
         vld1.16         {d16[]}, [r2,:16]
         vmull.s16       q8,  d16, d0[0]
@@ -793,7 +793,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         push            {lr}
 
         mov             r12, #32
-        vmov.s16        q2, #0
+        vmov.s16        q2,  #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
@@ -1142,7 +1142,7 @@ function idct32x32_dc_add_neon
         movrel          r12, idct_coeffs
         vld1.16         {d0}, [r12,:64]
 
-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0
 
         vld1.16         {d16[]}, [r2,:16]
         vmull.s16       q8,  d16, d0[0]
@@ -1330,7 +1330,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
 
         @ Double stride of the input, since we only read every other line
         mov             r12, #128
-        vmov.s16        d4, #0
+        vmov.s16        d4,  #0
 
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .ifb \suffix
@@ -1394,7 +1394,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
 .endif
         add             r2,  r2,  #64
 
-        vmov.s16        d8, #0
+        vmov.s16        d8,  #0
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1533,9 +1533,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon
 .endif
         vld1.32         {d12[]},  [r0,:32], r1
         vld1.32         {d12[1]}, [r0,:32], r1
-        vrshr.s16       q4, q4, #6
+        vrshr.s16       q4,  q4,  #6
         vld1.32         {d13[]},  [r0,:32], r1
-        vrshr.s16       q5, q5, #6
+        vrshr.s16       q5,  q5,  #6
         vld1.32         {d13[1]}, [r0,:32], r1
         sub             r0,  r0,  r1, lsl #2
         vaddw.u8        q4,  q4,  d12
index 8d44d58f32e8b4fd04d3dc1293e08529607a706d..4b3608064a5d8805b2ec0e56733b8fe3f3386703 100644 (file)
@@ -828,7 +828,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
 endfunc
 
 function vp9_loop_filter_h_16_neon
-        sub             r12,  r0,  #8
+        sub             r12, r0,  #8
         vld1.8          {d16}, [r12,:64], r1
         vld1.8          {d24}, [r0, :64], r1
         vld1.8          {d17}, [r12,:64], r1