git.sesse.net Git - ffmpeg/commitdiff
arm/aarch64: vp9: Fix vertical alignment
author Martin Storsjö <martin@martin.st>
Sun, 8 Jan 2017 22:04:19 +0000 (00:04 +0200)
committer Martin Storsjö <martin@martin.st>
Sun, 19 Mar 2017 20:53:32 +0000 (22:53 +0200)
Align the second/third operands as they usually are.

Due to the wildly varying sizes of the written out operands
in aarch64 assembly, the column alignment is usually not as clear
as in arm assembly.

This is cherrypicked from libav commit
7995ebfad12002033c73feed422a1cfc62081e8f.

Signed-off-by: Martin Storsjö <martin@martin.st>
libavcodec/aarch64/vp9itxfm_neon.S
libavcodec/arm/vp9itxfm_neon.S
libavcodec/arm/vp9lpf_neon.S

index 3e5da0880c7f2ec809e74e705d95d36c5a392c78..b12890f0db3665ff658de4eff99e6524560b3b21 100644 (file)
@@ -380,7 +380,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_8x8_add_neon, export=1
 .ifc \txfm1\()_\txfm2,idct_idct
         movrel          x4,  idct_coeffs
 .else
-        movrel          x4, iadst8_coeffs
+        movrel          x4,  iadst8_coeffs
         ld1             {v1.8h}, [x4], #16
 .endif
         ld1             {v0.8h}, [x4]
@@ -480,23 +480,23 @@ itxfm_func8x8 iadst, iadst
 
 
 function idct16x16_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
         ld1             {v0.4h}, [x4]
 
-        movi            v1.4h, #0
+        movi            v1.4h,  #0
 
         ld1             {v2.h}[0], [x2]
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
-        smull           v2.4s,  v2.4h, v0.h[0]
-        rshrn           v2.4h,  v2.4s, #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
+        smull           v2.4s,  v2.4h,  v0.h[0]
+        rshrn           v2.4h,  v2.4s,  #14
         dup             v2.8h,  v2.h[0]
         st1             {v1.h}[0], [x2]
 
-        srshr           v2.8h, v2.8h, #6
+        srshr           v2.8h,  v2.8h,  #6
 
-        mov             x3, x0
-        mov             x4, #16
+        mov             x3,  x0
+        mov             x4,  #16
 1:
         // Loop to add the constant from v2 into all 16x16 outputs
         subs            x4,  x4,  #2
@@ -869,7 +869,7 @@ function ff_vp9_\txfm1\()_\txfm2\()_16x16_add_neon, export=1
 .ifc \txfm1,idct
         ld1             {v0.8h,v1.8h}, [x10]
 .endif
-        mov             x9, #32
+        mov             x9,  #32
 
 .ifc \txfm1\()_\txfm2,idct_idct
         cmp             w3,  #10
@@ -1046,10 +1046,10 @@ idct16_partial quarter
 idct16_partial half
 
 function idct32x32_dc_add_neon
-        movrel          x4, idct_coeffs
+        movrel          x4,  idct_coeffs
         ld1             {v0.4h}, [x4]
 
-        movi            v1.4h, #0
+        movi            v1.4h,  #0
 
         ld1             {v2.h}[0], [x2]
         smull           v2.4s,  v2.4h,  v0.h[0]
@@ -1059,10 +1059,10 @@ function idct32x32_dc_add_neon
         dup             v2.8h,  v2.h[0]
         st1             {v1.h}[0], [x2]
 
-        srshr           v0.8h, v2.8h, #6
+        srshr           v0.8h,  v2.8h,  #6
 
-        mov             x3, x0
-        mov             x4, #32
+        mov             x3,  x0
+        mov             x4,  #32
 1:
         // Loop to add the constant v0 into all 32x32 outputs
         subs            x4,  x4,  #2
@@ -1230,7 +1230,7 @@ endfunc
 // x9 = double input stride
 function idct32_1d_8x32_pass1\suffix\()_neon
         mov             x14, x30
-        movi            v2.8h, #0
+        movi            v2.8h,  #0
 
         // v16 = IN(0), v17 = IN(2) ... v31 = IN(30)
 .ifb \suffix
@@ -1295,7 +1295,7 @@ function idct32_1d_8x32_pass1\suffix\()_neon
 .endif
         add             x2,  x2,  #64
 
-        movi            v2.8h, #0
+        movi            v2.8h,  #0
         // v16 = IN(1), v17 = IN(3) ... v31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
index 6d4d765c28571f2aae7ed42b466842d37077e550..6c09922caee88dc2b2fd6e2d90781f0ec38c6544 100644 (file)
@@ -530,7 +530,7 @@ function idct16x16_dc_add_neon
         movrel          r12, idct_coeffs
         vld1.16         {d0}, [r12,:64]
 
-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0
 
         vld1.16         {d16[]}, [r2,:16]
         vmull.s16       q8,  d16, d0[0]
@@ -793,7 +793,7 @@ function \txfm\()16_1d_4x16_pass1_neon
         push            {lr}
 
         mov             r12, #32
-        vmov.s16        q2, #0
+        vmov.s16        q2,  #0
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
         vld1.16         {d\i}, [r2,:64]
         vst1.16         {d4},  [r2,:64], r12
@@ -1142,7 +1142,7 @@ function idct32x32_dc_add_neon
         movrel          r12, idct_coeffs
         vld1.16         {d0}, [r12,:64]
 
-        vmov.i16        q2, #0
+        vmov.i16        q2,  #0
 
         vld1.16         {d16[]}, [r2,:16]
         vmull.s16       q8,  d16, d0[0]
@@ -1330,7 +1330,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
 
         @ Double stride of the input, since we only read every other line
         mov             r12, #128
-        vmov.s16        d4, #0
+        vmov.s16        d4,  #0
 
         @ d16 = IN(0), d17 = IN(2) ... d31 = IN(30)
 .ifb \suffix
@@ -1394,7 +1394,7 @@ function idct32_1d_4x32_pass1\suffix\()_neon
 .endif
         add             r2,  r2,  #64
 
-        vmov.s16        d8, #0
+        vmov.s16        d8,  #0
         @ d16 = IN(1), d17 = IN(3) ... d31 = IN(31)
 .ifb \suffix
 .irp i, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31
@@ -1533,9 +1533,9 @@ function idct32_1d_4x32_pass2\suffix\()_neon
 .endif
         vld1.32         {d12[]},  [r0,:32], r1
         vld1.32         {d12[1]}, [r0,:32], r1
-        vrshr.s16       q4, q4, #6
+        vrshr.s16       q4,  q4,  #6
         vld1.32         {d13[]},  [r0,:32], r1
-        vrshr.s16       q5, q5, #6
+        vrshr.s16       q5,  q5,  #6
         vld1.32         {d13[1]}, [r0,:32], r1
         sub             r0,  r0,  r1, lsl #2
         vaddw.u8        q4,  q4,  d12
index 8d44d58f32e8b4fd04d3dc1293e08529607a706d..4b3608064a5d8805b2ec0e56733b8fe3f3386703 100644 (file)
@@ -828,7 +828,7 @@ function ff_vp9_loop_filter_v_16_16_neon, export=1
 endfunc
 
 function vp9_loop_filter_h_16_neon
-        sub             r12,  r0,  #8
+        sub             r12, r0,  #8
         vld1.8          {d16}, [r12,:64], r1
         vld1.8          {d24}, [r0, :64], r1
         vld1.8          {d17}, [r12,:64], r1