git.sesse.net Git - ffmpeg/commitdiff
Merge commit '41ef1d360bac65032aa32f6b43ae137666507ae5'
author    Michael Niedermayer <michaelni@gmx.at>
          Mon, 22 Jul 2013 09:40:21 +0000 (11:40 +0200)
committer Michael Niedermayer <michaelni@gmx.at>
          Mon, 22 Jul 2013 09:41:37 +0000 (11:41 +0200)
* commit '41ef1d360bac65032aa32f6b43ae137666507ae5':
  arm: Add VFP-accelerated version of synth_filter_float

Merged-by: Michael Niedermayer <michaelni@gmx.at>
libavcodec/arm/fft_init_arm.c
libavcodec/arm/synth_filter_vfp.S

libavcodec/arm/fft_init_arm.c: Simple merge
diff --combined libavcodec/arm/synth_filter_vfp.S
index 0000000000000000000000000000000000000000,1b99e64598eff6dae34429fdbf4e6a1015cc36d5..b13373986b55b06ffd0bf1b60581c190b11bb568
mode 000000,100644..100644
--- /dev/null
+++ b/libavcodec/arm/synth_filter_vfp.S
@@@ -1,0 -1,243 +1,243 @@@
+ /*
+  * Copyright (c) 2013 RISC OS Open Ltd
+  * Author: Ben Avison <bavison@riscosopen.org>
+  *
 - * This file is part of Libav.
++ * This file is part of FFmpeg.
+  *
 - * Libav is free software; you can redistribute it and/or
++ * FFmpeg is free software; you can redistribute it and/or
+  * modify it under the terms of the GNU Lesser General Public
+  * License as published by the Free Software Foundation; either
+  * version 2.1 of the License, or (at your option) any later version.
+  *
 - * Libav is distributed in the hope that it will be useful,
++ * FFmpeg is distributed in the hope that it will be useful,
+  * but WITHOUT ANY WARRANTY; without even the implied warranty of
+  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+  * Lesser General Public License for more details.
+  *
+  * You should have received a copy of the GNU Lesser General Public
 - * License along with Libav; if not, write to the Free Software
++ * License along with FFmpeg; if not, write to the Free Software
+  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
+  */
+
+ #include "libavutil/arm/asm.S"
+
+ IMDCT         .req    r0
+ ORIG_P_SB     .req    r1
+ P_SB_OFF      .req    r2
+ I             .req    r0
+ P_SB2_UP      .req    r1
+ OLDFPSCR      .req    r2
+ P_SB2_DN      .req    r3
+ P_WIN_DN      .req    r4
+ P_OUT_DN      .req    r5
+ P_SB          .req    r6
+ J_WRAP        .req    r7
+ P_WIN_UP      .req    r12
+ P_OUT_UP      .req    r14
+ SCALE         .req    s0
+ SBUF_DAT_REV0 .req    s4
+ SBUF_DAT_REV1 .req    s5
+ SBUF_DAT_REV2 .req    s6
+ SBUF_DAT_REV3 .req    s7
+ VA0           .req    s8
+ VA3           .req    s11
+ VB0           .req    s12
+ VB3           .req    s15
+ VC0           .req    s8
+ VC3           .req    s11
+ VD0           .req    s12
+ VD3           .req    s15
+ SBUF_DAT0     .req    s16
+ SBUF_DAT1     .req    s17
+ SBUF_DAT2     .req    s18
+ SBUF_DAT3     .req    s19
+ SBUF_DAT_ALT0 .req    s20
+ SBUF_DAT_ALT1 .req    s21
+ SBUF_DAT_ALT2 .req    s22
+ SBUF_DAT_ALT3 .req    s23
+ WIN_DN_DAT0   .req    s24
+ WIN_UP_DAT0   .req    s28
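+
+ /* inner_loop: process one 4-float slice of the 512-float window.
+  * "half" selects the accumulator pair: "ab" retires into VA (vmls) and
+  * accumulates into VB (vmla); "cd" retires into VD and accumulates into
+  * VC. "head" and "tail" implement software pipelining: the head issues
+  * the loads and the forward multiply-accumulate for this slice, while
+  * the tail performs the reversed-data multiply-accumulate set up by the
+  * previous head. Subband data is double-buffered between SBUF_DAT and
+  * SBUF_DAT_ALT, chosen by the parity of the assembly-time OFFSET
+  * counter, and the teq/bne pair wraps P_SB back around the 512-float
+  * ring buffer when J reaches the mangled offset held in J_WRAP.
+  */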
+ .macro inner_loop  half, tail, head
+  .if (OFFSET & (64*4)) == 0                @ even numbered call
+         SBUF_DAT_THIS0 .req SBUF_DAT0
+         SBUF_DAT_THIS1 .req SBUF_DAT1
+         SBUF_DAT_THIS2 .req SBUF_DAT2
+         SBUF_DAT_THIS3 .req SBUF_DAT3
+   .ifnc "\head",""
+         vldr    d8, [P_SB, #OFFSET]        @ d8 = SBUF_DAT
+         vldr    d9, [P_SB, #OFFSET+8]
+   .endif
+  .else
+         SBUF_DAT_THIS0 .req SBUF_DAT_ALT0
+         SBUF_DAT_THIS1 .req SBUF_DAT_ALT1
+         SBUF_DAT_THIS2 .req SBUF_DAT_ALT2
+         SBUF_DAT_THIS3 .req SBUF_DAT_ALT3
+   .ifnc "\head",""
+         vldr    d10, [P_SB, #OFFSET]       @ d10 = SBUF_DAT_ALT
+         vldr    d11, [P_SB, #OFFSET+8]
+   .endif
+  .endif
+  .ifnc "\tail",""
+   .ifc "\half","ab"
+         vmls.f  VA0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
+   .else
+         vmla.f  VD0, SBUF_DAT_REV0, WIN_DN_DAT0  @ all operands treated as vectors
+   .endif
+  .endif
+  .ifnc "\head",""
+         vldr    d14, [P_WIN_UP, #OFFSET]   @ d14 = WIN_UP_DAT
+         vldr    d15, [P_WIN_UP, #OFFSET+8]
+         vldr    d12, [P_WIN_DN, #OFFSET]   @ d12 = WIN_DN_DAT
+         vldr    d13, [P_WIN_DN, #OFFSET+8]
+         vmov    SBUF_DAT_REV3, SBUF_DAT_THIS0
+         vmov    SBUF_DAT_REV2, SBUF_DAT_THIS1
+         vmov    SBUF_DAT_REV1, SBUF_DAT_THIS2
+         vmov    SBUF_DAT_REV0, SBUF_DAT_THIS3
+   .ifc "\half","ab"
+         vmla.f  VB0, SBUF_DAT_THIS0, WIN_UP_DAT0
+   .else
+         vmla.f  VC0, SBUF_DAT_THIS0, WIN_UP_DAT0
+   .endif
+         teq     J_WRAP, #J
+         bne     2f             @ strongly predictable, so better than cond exec in this case
+         sub     P_SB, P_SB, #512*4
+ 2:
+   .set J, J - 64
+   .set OFFSET, OFFSET + 64*4
+  .endif
+         .unreq  SBUF_DAT_THIS0
+         .unreq  SBUF_DAT_THIS1
+         .unreq  SBUF_DAT_THIS2
+         .unreq  SBUF_DAT_THIS3
+ .endm
+ /* void ff_synth_filter_float_vfp(FFTContext *imdct,
+  *                                float *synth_buf_ptr, int *synth_buf_offset,
+  *                                float synth_buf2[32], const float window[512],
+  *                                float out[32], const float in[32], float scale)
+  */
+ function ff_synth_filter_float_vfp, export=1
+         push    {r3-r7,lr}
+         vpush   {s16-s31}
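+         @ push (6 words) + vpush (16 words) lower sp by 22 words: the
+         @ saved r3 (synth_buf2) now sits at sp + 16*4, and the caller's
+         @ stacked arguments (window, out, in, and scale when softfp)
+         @ start at sp + (16+6)*4, hence the offsets used below.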
+         ldr     lr, [P_SB_OFF]
+         add     a2, ORIG_P_SB, lr, LSL #2 @ calculate synth_buf to pass to imdct_half
+         mov     P_SB, a2                  @ and keep a copy for ourselves
+         bic     J_WRAP, lr, #63           @ mangled to make testing for wrap easier in inner loop
+         sub     lr, lr, #32
+         and     lr, lr, #512-32
+         str     lr, [P_SB_OFF]            @ rotate offset, modulo buffer size, ready for next call
+         ldr     a3, [sp, #(16+6+2)*4]     @ fetch in from stack, to pass to imdct_half
+ VFP     vmov    s16, SCALE                @ imdct_half is free to corrupt s0, but it contains one of our arguments in hardfp case
+         bl      ff_imdct_half_c
+ VFP     vmov    SCALE, s16
+         fmrx    OLDFPSCR, FPSCR
+         ldr     lr, =0x03030000           @ RunFast mode, short vectors of length 4, stride 1
+         fmxr    FPSCR, lr
+         ldr     P_SB2_DN, [sp, #16*4]
+         ldr     P_WIN_DN, [sp, #(16+6+0)*4]
+         ldr     P_OUT_DN, [sp, #(16+6+1)*4]
+ NOVFP   vldr    SCALE, [sp, #(16+6+3)*4]
+ #define IMM_OFF_SKEW 956                   /* also valid immediate constant when you add 16*4 */
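+ /* vldr immediate offsets only reach +/-1020 bytes; biasing the pointers
+  * by 956 keeps every slice offset (-956 up to +836, plus the paired
+  * load at OFFSET+8) within that range. */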
+         add     P_SB, P_SB, #IMM_OFF_SKEW  @ so we can use -ve offsets to use full immediate offset range
+         add     P_SB2_UP, P_SB2_DN, #16*4
+         add     P_WIN_UP, P_WIN_DN, #16*4+IMM_OFF_SKEW
+         add     P_OUT_UP, P_OUT_DN, #16*4
+         add     P_SB2_DN, P_SB2_DN, #16*4
+         add     P_WIN_DN, P_WIN_DN, #12*4+IMM_OFF_SKEW
+         add     P_OUT_DN, P_OUT_DN, #16*4
+         mov     I, #4
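+         @ First pass: 4 iterations, each windowing 8 slices and producing
+         @ 4 ascending outputs (VB, filling out[16]..out[31]) and 4
+         @ descending outputs (VA, filling out[15]..out[0]), with the
+         @ accumulators seeded from synth_buf2.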
+ 1:
+         vldmia  P_SB2_UP!, {VB0-VB3}
+         vldmdb  P_SB2_DN!, {VA0-VA3}
+  .set J, 512 - 64
+  .set OFFSET, -IMM_OFF_SKEW
+         inner_loop  ab,, head
+  .rept 7
+         inner_loop  ab, tail, head
+  .endr
+         inner_loop  ab, tail
+         add     P_WIN_UP, P_WIN_UP, #4*4
+         sub     P_WIN_DN, P_WIN_DN, #4*4
+         vmul.f  VB0, VB0, SCALE      @ SCALE treated as scalar
+         add     P_SB, P_SB, #(512+4)*4
+         subs    I, I, #1
+         vmul.f  VA0, VA0, SCALE
+         vstmia  P_OUT_UP!, {VB0-VB3}
+         vstmdb  P_OUT_DN!, {VA0-VA3}
+         bne     1b
+         add     P_SB2_DN, P_SB2_DN, #(16+28-12)*4
+         sub     P_SB2_UP, P_SB2_UP, #(16+16)*4
+         add     P_WIN_DN, P_WIN_DN, #(32+16+28-12)*4
+         mov     I, #4
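+         @ Second pass: same slice walk, but the accumulators start from
+         @ zero and the results are stored back to synth_buf2 (VC
+         @ ascending, VD descending) ready for the next call.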
+ 1:
+         vldr.d  d4, zero             @ d4 = VC0
+         vldr.d  d5, zero
+         vldr.d  d6, zero             @ d6 = VD0
+         vldr.d  d7, zero
+  .set J, 512 - 64
+  .set OFFSET, -IMM_OFF_SKEW
+         inner_loop  cd,, head
+  .rept 7
+         inner_loop  cd, tail, head
+  .endr
+         inner_loop  cd, tail
+         add     P_WIN_UP, P_WIN_UP, #4*4
+         sub     P_WIN_DN, P_WIN_DN, #4*4
+         add     P_SB, P_SB, #(512+4)*4
+         subs    I, I, #1
+         vstmia  P_SB2_UP!, {VC0-VC3}
+         vstmdb  P_SB2_DN!, {VD0-VD3}
+         bne     1b
+         fmxr    FPSCR, OLDFPSCR
+         vpop    {s16-s31}
+         pop     {r3-r7,pc}
+ endfunc
+         .unreq  IMDCT
+         .unreq  ORIG_P_SB
+         .unreq  P_SB_OFF
+         .unreq  I
+         .unreq  P_SB2_UP
+         .unreq  OLDFPSCR
+         .unreq  P_SB2_DN
+         .unreq  P_WIN_DN
+         .unreq  P_OUT_DN
+         .unreq  P_SB
+         .unreq  J_WRAP
+         .unreq  P_WIN_UP
+         .unreq  P_OUT_UP
+         .unreq  SCALE
+         .unreq  SBUF_DAT_REV0
+         .unreq  SBUF_DAT_REV1
+         .unreq  SBUF_DAT_REV2
+         .unreq  SBUF_DAT_REV3
+         .unreq  VA0
+         .unreq  VA3
+         .unreq  VB0
+         .unreq  VB3
+         .unreq  VC0
+         .unreq  VC3
+         .unreq  VD0
+         .unreq  VD3
+         .unreq  SBUF_DAT0
+         .unreq  SBUF_DAT1
+         .unreq  SBUF_DAT2
+         .unreq  SBUF_DAT3
+         .unreq  SBUF_DAT_ALT0
+         .unreq  SBUF_DAT_ALT1
+         .unreq  SBUF_DAT_ALT2
+         .unreq  SBUF_DAT_ALT3
+         .unreq  WIN_DN_DAT0
+         .unreq  WIN_UP_DAT0
+         .align  3
+ zero:   .word   0, 0
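
For context, the other file in this merge, libavcodec/arm/fft_init_arm.c, is where the new routine gets registered ("Simple merge" above). Below is a minimal sketch of that selection, assuming the cpu-flag pattern FFmpeg used at the time; the exact guard conditions in the merged tree may differ:

    #include "libavutil/attributes.h"
    #include "libavutil/cpu.h"
    #include "libavutil/arm/cpu.h"
    #include "libavcodec/fft.h"
    #include "libavcodec/synth_filter.h"

    /* Prototypes as given in the diff above; the NEON variant predates
     * this commit. */
    void ff_synth_filter_float_vfp(FFTContext *imdct,
                                   float *synth_buf_ptr, int *synth_buf_offset,
                                   float synth_buf2[32], const float window[512],
                                   float out[32], const float in[32], float scale);
    void ff_synth_filter_float_neon(FFTContext *imdct,
                                    float *synth_buf_ptr, int *synth_buf_offset,
                                    float synth_buf2[32], const float window[512],
                                    float out[32], const float in[32], float scale);

    av_cold void ff_synth_filter_init_arm(SynthFilterContext *s)
    {
        int cpu_flags = av_get_cpu_flags();

        if (have_vfp(cpu_flags))
            s->synth_filter_float = ff_synth_filter_float_vfp;
        if (have_neon(cpu_flags))   /* NEON takes precedence where both exist */
            s->synth_filter_float = ff_synth_filter_float_neon;
    }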