git.sesse.net Git - ffmpeg/blob - libavcodec/arm/mdct_vfp.S

   1 /*
   2  * Copyright (c) 2013 RISC OS Open Ltd
   3  * Author: Ben Avison <bavison@riscosopen.org>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "libavutil/arm/asm.S"
  23
  24 CONTEXT .req    a1              @ 1st argument (FFTContext *s); only read during function setup
  25 ORIGOUT .req    a2              @ 2nd argument (output) as passed in; copied to OUT at entry
  26 IN      .req    a3              @ 3rd argument (input); read only by the prerotation macro
  27 OUT     .req    v1              @ working copy of the output pointer, kept in a register the function saves/restores
  28 REVTAB  .req    v2              @ table pointer loaded from [CONTEXT, #2*4]; supplies the scatter indices
  29 TCOS    .req    v3              @ table pointer loaded from [CONTEXT, #6*4]
  30 TSIN    .req    v4              @ table pointer loaded from [CONTEXT, #7*4]
  31 OLDFPSCR .req   v5              @ FPSCR value read at entry; written back before the FFT call and before return
  32 J0      .req    a2              @ scratch (same register as ORIGOUT, which is dead once OUT has been set)
  33 J1      .req    a4              @ scratch
  34 J2      .req    ip              @ scratch
  35 J3      .req    lr              @ scratch (lr is pushed at entry, so it is free inside the function)
  36
  37 .macro prerotation_innerloop
             @ One unrolled step of the rotation performed before the FFT call.
             @ Takes no macro arguments.  It reads the assembler symbols k and n4,
             @ which the invoking code must .set beforehand, and advances k by 2.
             @
             @ Each expansion handles four table positions: trig_lo, trig_lo+1,
             @ trig_hi and trig_hi+1.  With a = s0-s3, b = s4-s7 (input samples),
             @ c = s16-s19 (from TCOS) and d = s20-s23 (from TSIN) it computes,
             @ element by element,
             @       s8-s11  = a*c - b*d
             @       s12-s15 = a*d + b*c
             @ and stores each (s8+i, s12+i) pair at OUT + 8*index, where the four
             @ indices are taken from REVTAB.
             @
             @ The lines marked as vector operations rely on FPSCR having been
             @ programmed for short vectors of length 4, stride 1 by the code that
             @ expands this macro.
             @
             @ Clobbers: J0-J3, s0-s15, d8-d11 (s16-s23).
             @ NOTE(review): loads and integer address arithmetic are interleaved
             @ with the vector operations, presumably to hide VFP latency; keep
             @ the instruction order when editing.
  38  .set trig_lo, k
  39  .set trig_hi, n4 - k - 2
  40  .set in_lo, trig_lo * 2
  41  .set in_hi, trig_hi * 2
             @ trig_lo/trig_hi are element indices into TCOS, TSIN and REVTAB, counted
             @ from the two ends of the tables; in_lo/in_hi are the matching float
             @ indices into IN (two input floats per table position).
  42         vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17 = TCOS[trig_lo], TCOS[trig_lo+1]
  43         vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19 = TCOS[trig_hi], TCOS[trig_hi+1]
  44         vldr    s0, [IN, #in_hi*4 + 12]
  45         vldr    s1, [IN, #in_hi*4 + 4]
  46         vldr    s2, [IN, #in_lo*4 + 12]
  47         vldr    s3, [IN, #in_lo*4 + 4]
             @ s0-s3 now hold the odd-numbered input floats, taken from the opposite
             @ end of the block to the table position they are paired with.
  48         vmul.f  s8, s0, s16                     @ vector operation: s8-s11 = s0-s3 * s16-s19
  49         vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21 = TSIN[trig_lo], TSIN[trig_lo+1]
  50         vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23 = TSIN[trig_hi], TSIN[trig_hi+1]
  51         vldr    s4, [IN, #in_lo*4]
  52         vldr    s5, [IN, #in_lo*4 + 8]
  53         vldr    s6, [IN, #in_hi*4]
  54         vldr    s7, [IN, #in_hi*4 + 8]
             @ s4-s7 now hold the even-numbered input floats at the table positions.
  55         ldr     J0, [REVTAB, #trig_lo*2]        @ one word = two adjacent 16-bit REVTAB entries
  56         vmul.f  s12, s0, s20                    @ vector operation: s12-s15 = s0-s3 * s20-s23
  57         ldr     J2, [REVTAB, #trig_hi*2]        @ likewise for the trig_hi pair of entries
  58         mov     J1, J0, lsr #16                 @ J1 = upper halfword of the word just loaded
  59         and     J0, J0, #255                    @ halfword value will be < n4, so an 8-bit mask isolates the lower halfword
  60         vmls.f  s8, s4, s20                     @ vector operation: s8-s11 -= s4-s7 * s20-s23
  61         mov     J3, J2, lsr #16                 @ J3 = upper halfword
  62         and     J2, J2, #255                    @ halfword value will be < n4
  63         add     J0, OUT, J0, lsl #3             @ index -> address; each output slot is 8 bytes (two floats)
  64         vmla.f  s12, s4, s16                    @ vector operation: s12-s15 += s4-s7 * s16-s19
  65         add     J1, OUT, J1, lsl #3
  66         add     J2, OUT, J2, lsl #3
  67         add     J3, OUT, J3, lsl #3
             @ Scatter the four result pairs: first float of each slot from s8-s11,
             @ second float from s12-s15.
  68         vstr    s8, [J0]
  69         vstr    s9, [J1]
  70         vstr    s10, [J2]
  71         vstr    s11, [J3]
  72         vstr    s12, [J0, #4]
  73         vstr    s13, [J1, #4]
  74         vstr    s14, [J2, #4]
  75         vstr    s15, [J3, #4]
  76  .set k, k + 2
  77 .endm
  78
  79 .macro postrotation_innerloop tail, head
             @ One stage of the software-pipelined rotation applied in place to OUT
             @ after the FFT call.  Each macro argument switches on one half of the
             @ body when it is non-empty:
             @   head - start the work for the current k: load TSIN/TCOS values and
             @          four float pairs from OUT, and issue the first two vector
             @          multiplies.
             @   tail - finish the work begun by the previous head (at k - 2): issue
             @          the multiply-subtract and multiply-accumulate, then store.
             @ Reads the assembler symbols k and n8; k advances by 2 only when head
             @ is present.
             @
             @ For each of the four pairs (x, y) loaded from OUT, with sn from TSIN
             @ and cs from TCOS at the same table position, the completed result is
             @       s8-s11  = y*sn - x*cs
             @       s12-s15 = x*sn + y*cs
             @ s8-s11 go back to the first float of the slot the pair came from;
             @ s12-s15 go to the second float of the mirrored slots (results of the
             @ lo pairs land in the hi slots and vice versa, in reversed order).
             @
             @ The two halves share s0-s15, so the interleaving below is required
             @ for correctness: every tail instruction that reads or stores a
             @ register comes before the head instruction that overwrites it.
             @ The vector operations rely on FPSCR being set for short vectors of
             @ length 4, stride 1 by the expanding code.
             @
             @ Clobbers: s0-s15, d8-d9 (s16-s19), and d10-d11 or d12-d13.
  80  .set trig_lo_head, n8 - k - 2
  81  .set trig_hi_head, n8 + k
  82  .set out_lo_head, trig_lo_head * 2
  83  .set out_hi_head, trig_hi_head * 2
             @ The tail indices are the head indices of the previous step (k - 2).
  84  .set trig_lo_tail, n8 - (k - 2) - 2
  85  .set trig_hi_tail, n8 + (k - 2)
  86  .set out_lo_tail, trig_lo_tail * 2
  87  .set out_hi_tail, trig_hi_tail * 2
             @ The TCOS values are double-buffered: a head loads into one register
             @ pair while the tail still reads the pair filled by the previous head.
             @ Which pair is which alternates with bit 1 of k.
  88  .if (k & 2) == 0
  89   TCOS_D0_HEAD .req d10 @ s20,s21
  90   TCOS_D1_HEAD .req d11 @ s22,s23
  91   TCOS_S0_TAIL .req s24 @ first register of the s24-s27 vector
  92  .else
  93   TCOS_D0_HEAD .req d12 @ s24,s25
  94   TCOS_D1_HEAD .req d13 @ s26,s27
  95   TCOS_S0_TAIL .req s20 @ first register of the s20-s23 vector
  96  .endif
  97  .ifnc "\tail",""
  98         vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation: s8-s11 -= s0-s3 * TCOS (previous step)
  99  .endif
 100  .ifnc "\head",""
 101         vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
 102         vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
 103         vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
 104  .endif
 105  .ifnc "\tail",""
 106         vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation: s12-s15 += s4-s7 * TCOS (previous step)
 107  .endif
             @ s0-s7 are no longer needed by the tail from here on, so the head may
             @ reload them with the next four pairs.
 108  .ifnc "\head",""
 109         vldr    s0, [OUT, #out_lo_head*4]
 110         vldr    s1, [OUT, #out_lo_head*4 + 8]
 111         vldr    s2, [OUT, #out_hi_head*4]
 112         vldr    s3, [OUT, #out_hi_head*4 + 8]
 113         vldr    s4, [OUT, #out_lo_head*4 + 4]
 114         vldr    s5, [OUT, #out_lo_head*4 + 12]
 115         vldr    s6, [OUT, #out_hi_head*4 + 4]
 116         vldr    s7, [OUT, #out_hi_head*4 + 12]
 117  .endif
 118  .ifnc "\tail",""
 119         vstr    s8, [OUT, #out_lo_tail*4]
 120         vstr    s9, [OUT, #out_lo_tail*4 + 8]
 121         vstr    s10, [OUT, #out_hi_tail*4]
 122         vstr    s11, [OUT, #out_hi_tail*4 + 8]
 123  .endif
             @ s8-s11 have been stored, so the head may start overwriting them.
 124  .ifnc "\head",""
 125         vmul.f  s8, s4, s16                 @ vector operation: s8-s11 = s4-s7 * s16-s19 (TSIN)
 126  .endif
 127  .ifnc "\tail",""
 128         vstr    s12, [OUT, #out_hi_tail*4 + 12]
 129         vstr    s13, [OUT, #out_hi_tail*4 + 4]
 130         vstr    s14, [OUT, #out_lo_tail*4 + 12]
 131         vstr    s15, [OUT, #out_lo_tail*4 + 4]
 132  .endif
             @ s12-s15 have been stored, so the head may start overwriting them.
 133  .ifnc "\head",""
 134         vmul.f  s12, s0, s16                @ vector operation: s12-s15 = s0-s3 * s16-s19 (TSIN)
 135         vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
 136  .endif
             @ Drop the per-expansion aliases so the next expansion can rebind them.
 137  .unreq TCOS_D0_HEAD
 138  .unreq TCOS_D1_HEAD
 139  .unreq TCOS_S0_TAIL
 140  .ifnc "\head",""
 141   .set k, k + 2
 142  .endif
 143 .endm
 144
 145
 146 /* void ff_imdct_half_vfp(FFTContext *s,
 147  *                        FFTSample *output,
 148  *                        const FFTSample *input)
      *
      * Only the mdct_bits == 6 case (n = 64) is implemented here; for any other
      * size the function branches to ff_imdct_half_c with the three arguments
      * still in a1-a3.
      *
      * Outline of the accelerated path:
      *   1. save v1-v5, lr, s16-s27 and the current FPSCR;
      *   2. switch FPSCR to short-vector mode and run the prerotation, which
      *      scatters n4 = 16 float pairs into output in the order given by
      *      the context's index table;
      *   3. restore FPSCR, call ff_fft16_vfp on output, switch back to
      *      short-vector mode;
      *   4. run the software-pipelined postrotation in place on output;
      *   5. restore FPSCR and the saved registers, and return.
      *
      * NOTE(review): the context fields are read at hard-coded word offsets
      * (2, 5, 6, 7); these have to match the layout of the C FFTContext
      * structure - confirm whenever that structure changes.
 149  */
 150 function ff_imdct_half_vfp, export=1
 151         ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
 152         teq     ip, #6
 153         it      ne                          @ IT block for the conditional branch (presumably needed for Thumb-2 builds)
 154         bne     X(ff_imdct_half_c)          @ only case currently accelerated is the one used by DCA
 155
             @ Transform size and its fractions, as assembly-time constants; the
             @ macros expanded below read n4 and n8.  n = 64 matches the
             @ mdct_bits == 6 check above.
 156  .set n, 1<<6
 157  .set n2, n/2
 158  .set n4, n/4
 159  .set n8, n/8
 160
 161         push    {v1-v5,lr}                  @ registers used as OUT, REVTAB, TCOS, TSIN, OLDFPSCR and J3
 162         vpush   {s16-s27}                   @ VFP registers written by the two rotation macros
 163         fmrx    OLDFPSCR, FPSCR
 164         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 165         fmxr    FPSCR, lr
 166         mov     OUT, ORIGOUT                @ a2 is about to be reused as J0, so keep the output pointer in v1
 167         ldr     REVTAB, [CONTEXT, #2*4]
 168         ldr     TCOS, [CONTEXT, #6*4]
 169         ldr     TSIN, [CONTEXT, #7*4]
 170
             @ Prerotation: n8/2 = 4 expansions, each producing 4 float pairs.
 171  .set k, 0
 172  .rept n8/2
 173         prerotation_innerloop
 174  .endr
 175
             @ Leave short-vector mode for the duration of the call.
             @ NOTE(review): presumably ff_fft16_vfp expects the caller FPSCR on entry - confirm.
 176         fmxr    FPSCR, OLDFPSCR
 177         mov     a1, OUT                     @ OUT becomes the first argument of the call
 178         bl      X(ff_fft16_vfp)
             @ bl has overwritten lr, so the mode constant has to be loaded again.
 179         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 180         fmxr    FPSCR, lr
 181
             @ Postrotation pipeline: one head-only stage to prime it, n8/2 - 1 = 3
             @ combined stages, and one tail-only stage to drain it.
 182  .set k, 0
 183         postrotation_innerloop , head
 184  .rept n8/2 - 1
 185         postrotation_innerloop tail, head
 186  .endr
 187         postrotation_innerloop tail
 188
 189         fmxr    FPSCR, OLDFPSCR             @ put back the FPSCR value read at entry
 190         vpop    {s16-s27}
 191         pop     {v1-v5,pc}                  @ restore registers and return by loading the saved lr into pc
 192 endfunc
 193
 194         .unreq  CONTEXT                     @ from here on: drop every .req alias this file defined, so the names are unbound again
 195         .unreq  ORIGOUT
 196         .unreq  IN
 197         .unreq  OUT
 198         .unreq  REVTAB
 199         .unreq  TCOS
 200         .unreq  TSIN
 201         .unreq  OLDFPSCR
 202         .unreq  J0
 203         .unreq  J1
 204         .unreq  J2
 205         .unreq  J3