git.sesse.net Git - ffmpeg/blob - libavcodec/arm/mdct_vfp.S

   1 /*
   2  * Copyright (c) 2013 RISC OS Open Ltd
   3  * Author: Ben Avison <bavison@riscosopen.org>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "libavutil/arm/asm.S"
  23
     /*
      * Symbolic names for the core registers used by ff_imdct_half_vfp and
      * its helper macros.  Some names deliberately share one register, so
      * only one name of each pair is meaningful at any given point:
      *
      *   ORIGOUT / J0         (a2)  ORIGOUT is copied into OUT before the
      *                              pre-rotation code starts using J0.
      *   OLDFPSCR / REVTAB_HI (v5)  the rolled path pushes OLDFPSCR on the
      *                              stack before it sets up REVTAB_HI and
      *                              reloads it once the loop has finished.
      *   IN_HI / OUT_HI       (v6)  IN_HI is used by the rolled pre-rotation,
      *                              OUT_HI by the rolled post-rotation.
      *
      * J0-J3 hold the four scattered store addresses computed during
      * pre-rotation.  The *_HI names are the descending pointers used only
      * by the rolled (general size) loops.
      */
  24 CONTEXT .req    a1
  25 ORIGOUT .req    a2
  26 IN      .req    a3
  27 OUT     .req    v1
  28 REVTAB  .req    v2
  29 TCOS    .req    v3
  30 TSIN    .req    v4
  31 OLDFPSCR .req   v5
  32 J0      .req    a2
  33 J1      .req    a4
  34 J2      .req    ip
  35 J3      .req    lr
  36 REVTAB_HI .req  v5
  37 IN_HI   .req    v6
  38 OUT_HI  .req    v6
  39 TCOS_HI .req    sl
  40 TSIN_HI .req    fp
  41
  42 .macro prerotation_innerloop
     /*
      * One step of the fully unrolled pre-rotation.
      *
      * Uses the assembly-time symbols k and n4, which the invoking code must
      * .set beforehand; k is advanced by 2 at the end of every expansion.
      * TCOS, TSIN, IN and REVTAB are addressed with constant offsets and are
      * not modified.
      *
      * The instructions marked "vector operation" rely on FPSCR selecting
      * short vectors of length 4, stride 1 (set up by the caller), so each of
      * them works on four consecutive registers:
      *
      *   s8-s11  = s0-s3 * s16-s19  -  s4-s7 * s20-s23
      *   s12-s15 = s0-s3 * s20-s23  +  s4-s7 * s16-s19
      *
      * Each of the four (s8+i, s12+i) pairs is stored as two consecutive
      * words at OUT + 8 * (index read from REVTAB).
      *
      * Overwrites J0-J3 and s0-s23.
      */
     /* trig_lo / trig_hi: element indexes into TCOS and TSIN; trig_hi is the
      * position mirrored from the far end of the n4-entry range.
      * in_lo / in_hi: matching element indexes into IN (twice the trig index). */
  43  .set trig_lo, k
  44  .set trig_hi, n4 - k - 2
  45  .set in_lo, trig_lo * 2
  46  .set in_hi, trig_hi * 2
  47         vldr    d8, [TCOS, #trig_lo*4]          @ s16,s17
  48         vldr    d9, [TCOS, #trig_hi*4]          @ s18,s19
     /* s0-s3: odd-indexed inputs, the two from the high group first */
  49         vldr    s0, [IN, #in_hi*4 + 12]
  50         vldr    s1, [IN, #in_hi*4 + 4]
  51         vldr    s2, [IN, #in_lo*4 + 12]
  52         vldr    s3, [IN, #in_lo*4 + 4]
  53         vmul.f  s8, s0, s16                     @ vector operation
  54         vldr    d10, [TSIN, #trig_lo*4]         @ s20,s21
  55         vldr    d11, [TSIN, #trig_hi*4]         @ s22,s23
     /* s4-s7: even-indexed inputs, the two from the low group first */
  56         vldr    s4, [IN, #in_lo*4]
  57         vldr    s5, [IN, #in_lo*4 + 8]
  58         vldr    s6, [IN, #in_hi*4]
  59         vldr    s7, [IN, #in_hi*4 + 8]
     /* Each word load fetches two adjacent 16-bit REVTAB entries at once; they
      * are split into J0/J1 (and J2/J3) below.  The low entry is isolated with
      * an 8-bit mask, which is sufficient only because of the stated < n4 bound. */
  60         ldr     J0, [REVTAB, #trig_lo*2]
  61         vmul.f  s12, s0, s20                    @ vector operation
  62         ldr     J2, [REVTAB, #trig_hi*2]
  63         mov     J1, J0, lsr #16
  64         and     J0, J0, #255                    @ halfword value will be < n4
  65         vmls.f  s8, s4, s20                     @ vector operation
  66         mov     J3, J2, lsr #16
  67         and     J2, J2, #255                    @ halfword value will be < n4
     /* Turn each table index into a byte address: 8 bytes per output pair */
  68         add     J0, OUT, J0, lsl #3
  69         vmla.f  s12, s4, s16                    @ vector operation
  70         add     J1, OUT, J1, lsl #3
  71         add     J2, OUT, J2, lsl #3
  72         add     J3, OUT, J3, lsl #3
     /* Scatter the results: s8-s11 to word 0 and s12-s15 to word 1 of each pair */
  73         vstr    s8, [J0]
  74         vstr    s9, [J1]
  75         vstr    s10, [J2]
  76         vstr    s11, [J3]
  77         vstr    s12, [J0, #4]
  78         vstr    s13, [J1, #4]
  79         vstr    s14, [J2, #4]
  80         vstr    s15, [J3, #4]
  81  .set k, k + 2
  82 .endm
  83
  84 .macro prerotation_innerloop_rolled
     /*
      * Loop body of the pre-rotation for the general (rolled) path.
      *
      * Performs the same arithmetic and the same scattered stores as
      * prerotation_innerloop, but walks pointers instead of using
      * assembly-time offsets:
      *   TCOS, TSIN, IN, REVTAB              move upwards,
      *   TCOS_HI, TSIN_HI, IN_HI, REVTAB_HI  move downwards.
      * Per expansion the trig pointers move by 8 bytes, the input pointers by
      * 16 bytes and the REVTAB pointers by 4 bytes (two halfwords).
      *
      * Requires FPSCR to select short vectors of length 4, stride 1 for the
      * instructions marked "vector operation":
      *   s8-s11  = s0-s3 * s16-s19  -  s4-s7 * s20-s23
      *   s12-s15 = s0-s3 * s20-s23  +  s4-s7 * s16-s19
      *
      * Overwrites J0-J3 and s0-s23 in addition to the pointers listed above.
      */
  85         vldmia  TCOS!, {s16,s17}
  86         vldmdb  TCOS_HI!, {s18,s19}
     /* s0-s3: odd-indexed inputs; the high group is read below IN_HI */
  87         vldr    s0, [IN_HI, #-4]
  88         vldr    s1, [IN_HI, #-12]
  89         vldr    s2, [IN, #12]
  90         vldr    s3, [IN, #4]
  91         vmul.f  s8, s0, s16                     @ vector operation
  92         vldmia  TSIN!, {s20,s21}
  93         vldmdb  TSIN_HI!, {s22,s23}
     /* s4-s7: even-indexed inputs */
  94         vldr    s4, [IN]
  95         vldr    s5, [IN, #8]
  96         vldr    s6, [IN_HI, #-16]
  97         vldr    s7, [IN_HI, #-8]
  98         vmul.f  s12, s0, s20                    @ vector operation
     /* All eight inputs are loaded, so both input pointers can step now */
  99         add     IN, IN, #16
 100         sub     IN_HI, IN_HI, #16
 101         ldrh    J0, [REVTAB], #2
 102         ldrh    J1, [REVTAB], #2
 103         vmls.f  s8, s4, s20                     @ vector operation
     /* REVTAB_HI is walked backwards, so J3 is fetched before J2 to keep
      * J2/J3 in ascending table order. */
 104         ldrh    J3, [REVTAB_HI, #-2]!
 105         ldrh    J2, [REVTAB_HI, #-2]!
     /* Turn each table index into a byte address: 8 bytes per output pair */
 106         add     J0, OUT, J0, lsl #3
 107         vmla.f  s12, s4, s16                    @ vector operation
 108         add     J1, OUT, J1, lsl #3
 109         add     J2, OUT, J2, lsl #3
 110         add     J3, OUT, J3, lsl #3
     /* Scatter the results: s8-s11 to word 0 and s12-s15 to word 1 of each pair */
 111         vstr    s8, [J0]
 112         vstr    s9, [J1]
 113         vstr    s10, [J2]
 114         vstr    s11, [J3]
 115         vstr    s12, [J0, #4]
 116         vstr    s13, [J1, #4]
 117         vstr    s14, [J2, #4]
 118         vstr    s15, [J3, #4]
 119 .endm
 120
 121 .macro postrotation_innerloop tail, head
     /*
      * One software-pipelined step of the fully unrolled post-rotation,
      * operating in place on OUT.
      *
      * Arguments (each is either empty or non-empty):
      *   head  when non-empty, emit the first half of step k: load the
      *         trig factors and eight values from OUT and start both products.
      *   tail  when non-empty, emit the second half of step k - 2: finish
      *         both multiply-accumulates and store the eight results.
      * Head and tail instructions are interleaved, so one expansion with both
      * arguments set completes the previous step while starting the next.
      *
      * Uses the assembly-time symbols k and n8; k is advanced by 2 only when
      * head is non-empty.
      *
      * Requires FPSCR to select short vectors of length 4, stride 1 for the
      * instructions marked "vector operation".  Across a head and the
      * following tail the arithmetic is (c = the four cos values):
      *   s8-s11  = s4-s7 * s16-s19  -  s0-s3 * c
      *   s12-s15 = s0-s3 * s16-s19  +  s4-s7 * c
      *
      * Overwrites s0-s19 and either s20-s23 or s24-s27.
      */
     /* Head indexes start at the middle of the trig tables and move outwards:
      * the lo index counts down from n8 - 2, the hi index up from n8.
      * out_* are the matching element indexes into OUT. */
 122  .set trig_lo_head, n8 - k - 2
 123  .set trig_hi_head, n8 + k
 124  .set out_lo_head, trig_lo_head * 2
 125  .set out_hi_head, trig_hi_head * 2
     /* Tail indexes are the head indexes of the previous step (k - 2) */
 126  .set trig_lo_tail, n8 - (k - 2) - 2
 127  .set trig_hi_tail, n8 + (k - 2)
 128  .set out_lo_tail, trig_lo_tail * 2
 129  .set out_hi_tail, trig_hi_tail * 2
     /* The cos values are double buffered: a head loads them into one register
      * set while the tail in the same expansion still reads the other set,
      * which the previous head filled.  The sets swap on every step. */
 130  .if (k & 2) == 0
 131   TCOS_D0_HEAD .req d10 @ s20,s21
 132   TCOS_D1_HEAD .req d11 @ s22,s23
 133   TCOS_S0_TAIL .req s24
 134  .else
 135   TCOS_D0_HEAD .req d12 @ s24,s25
 136   TCOS_D1_HEAD .req d13 @ s26,s27
 137   TCOS_S0_TAIL .req s20
 138  .endif
     /* Tail: must run before the head reloads s0-s7 further down */
 139  .ifnc "\tail",""
 140         vmls.f  s8, s0, TCOS_S0_TAIL        @ vector operation
 141  .endif
 142  .ifnc "\head",""
 143         vldr    d8, [TSIN, #trig_lo_head*4] @ s16,s17
 144         vldr    d9, [TSIN, #trig_hi_head*4] @ s18,s19
 145         vldr    TCOS_D0_HEAD, [TCOS, #trig_lo_head*4]
 146  .endif
 147  .ifnc "\tail",""
 148         vmla.f  s12, s4, TCOS_S0_TAIL       @ vector operation
 149  .endif
     /* Head: s0-s3 take the even words, s4-s7 the odd words of the lo and hi groups */
 150  .ifnc "\head",""
 151         vldr    s0, [OUT, #out_lo_head*4]
 152         vldr    s1, [OUT, #out_lo_head*4 + 8]
 153         vldr    s2, [OUT, #out_hi_head*4]
 154         vldr    s3, [OUT, #out_hi_head*4 + 8]
 155         vldr    s4, [OUT, #out_lo_head*4 + 4]
 156         vldr    s5, [OUT, #out_lo_head*4 + 12]
 157         vldr    s6, [OUT, #out_hi_head*4 + 4]
 158         vldr    s7, [OUT, #out_hi_head*4 + 12]
 159  .endif
     /* Tail: s8-s11 go to the even words; they must be stored before the head
      * overwrites s8 with its first product. */
 160  .ifnc "\tail",""
 161         vstr    s8, [OUT, #out_lo_tail*4]
 162         vstr    s9, [OUT, #out_lo_tail*4 + 8]
 163         vstr    s10, [OUT, #out_hi_tail*4]
 164         vstr    s11, [OUT, #out_hi_tail*4 + 8]
 165  .endif
 166  .ifnc "\head",""
 167         vmul.f  s8, s4, s16                 @ vector operation
 168  .endif
     /* Tail: s12-s15 go to the odd words in reverse order (hi group first,
      * upper word first); stored before the head overwrites s12. */
 169  .ifnc "\tail",""
 170         vstr    s12, [OUT, #out_hi_tail*4 + 12]
 171         vstr    s13, [OUT, #out_hi_tail*4 + 4]
 172         vstr    s14, [OUT, #out_lo_tail*4 + 12]
 173         vstr    s15, [OUT, #out_lo_tail*4 + 4]
 174  .endif
 175  .ifnc "\head",""
 176         vmul.f  s12, s0, s16                @ vector operation
 177         vldr    TCOS_D1_HEAD, [TCOS, #trig_hi_head*4]
 178  .endif
     /* Release the per-expansion aliases so the next expansion can redefine them */
 179  .unreq TCOS_D0_HEAD
 180  .unreq TCOS_D1_HEAD
 181  .unreq TCOS_S0_TAIL
 182  .ifnc "\head",""
 183   .set k, k + 2
 184  .endif
 185 .endm
 186
 187 .macro postrotation_innerloop_rolled tail, head, tcos_s0_head, tcos_s1_head, tcos_s2_head, tcos_s3_head, tcos_s0_tail, out_offset_head, out_offset_tail
     /*
      * One software-pipelined post-rotation step for the general (rolled)
      * path, operating in place through OUT (ascending side) and OUT_HI
      * (descending side).
      *
      * Arguments:
      *   tail, head        empty or non-empty; select which half is emitted.
      *                     The head starts a step, the tail finishes the
      *                     step started by the previous head.
      *   tcos_s0_head ..
      *   tcos_s3_head      registers that receive the cos values loaded by
      *                     the head: the first two from TCOS (post-increment),
      *                     the last two from TCOS_HI (pre-decrement).
      *   tcos_s0_tail      first of the four consecutive registers holding
      *                     the cos values consumed by the tail.
      *   out_offset_head,
      *   out_offset_tail   byte offsets, added to OUT and subtracted from
      *                     OUT_HI, for the head loads and the tail stores.
      *
      * The head also advances TSIN by 8 bytes and moves TSIN_HI back by
      * 8 bytes.  OUT and OUT_HI are not modified by this macro.
      *
      * Requires FPSCR to select short vectors of length 4, stride 1 for the
      * instructions marked "vector operation".  Across a head and the
      * following tail the arithmetic is (c = the four cos values):
      *   s8-s11  = s4-s7 * s16-s19  -  s0-s3 * c
      *   s12-s15 = s0-s3 * s16-s19  +  s4-s7 * c
      */
     /* Tail: must run before the head reloads s0-s7 further down */
 188  .ifnc "\tail",""
 189         vmls.f  s8, s0, \tcos_s0_tail       @ vector operation
 190  .endif
 191  .ifnc "\head",""
 192         vldmia  TSIN!, {s16,s17}
 193         vldmdb  TSIN_HI!, {s18,s19}
 194         vldmia  TCOS!, {\tcos_s0_head,\tcos_s1_head}
 195  .endif
 196  .ifnc "\tail",""
 197         vmla.f  s12, s4, \tcos_s0_tail      @ vector operation
 198  .endif
     /* Head: s0-s3 take the even words, s4-s7 the odd words of both sides */
 199  .ifnc "\head",""
 200         vldr    s0, [OUT, #+\out_offset_head+0]
 201         vldr    s1, [OUT, #+\out_offset_head+8]
 202         vldr    s2, [OUT_HI, #-\out_offset_head-16]
 203         vldr    s3, [OUT_HI, #-\out_offset_head-8]
 204         vldr    s4, [OUT, #+\out_offset_head+4]
 205         vldr    s5, [OUT, #+\out_offset_head+12]
 206         vldr    s6, [OUT_HI, #-\out_offset_head-12]
 207         vldr    s7, [OUT_HI, #-\out_offset_head-4]
 208  .endif
     /* Tail: s8-s11 go to the even words; stored before the head overwrites s8 */
 209  .ifnc "\tail",""
 210         vstr    s8, [OUT, #+\out_offset_tail+0]
 211         vstr    s9, [OUT, #+\out_offset_tail+8]
 212         vstr    s10, [OUT_HI, #-\out_offset_tail-16]
 213         vstr    s11, [OUT_HI, #-\out_offset_tail-8]
 214  .endif
 215  .ifnc "\head",""
 216         vmul.f  s8, s4, s16                 @ vector operation
 217  .endif
     /* Tail: s12-s15 go to the odd words in reverse order (OUT_HI side first,
      * upper word first); stored before the head overwrites s12. */
 218  .ifnc "\tail",""
 219         vstr    s12, [OUT_HI, #-\out_offset_tail-4]
 220         vstr    s13, [OUT_HI, #-\out_offset_tail-12]
 221         vstr    s14, [OUT, #+\out_offset_tail+12]
 222         vstr    s15, [OUT, #+\out_offset_tail+4]
 223  .endif
 224  .ifnc "\head",""
 225         vmul.f  s12, s0, s16                @ vector operation
 226         vldmdb  TCOS_HI!, {\tcos_s2_head,\tcos_s3_head}
 227  .endif
 228 .endm
 229
 230
 231 /* void ff_imdct_half_vfp(FFTContext *s,
 232  *                        FFTSample *output,
 233  *                        const FFTSample *input)
      *
      * On entry: a1 = s (CONTEXT), a2 = output (ORIGOUT), a3 = input (IN).
      *
      * Three passes are performed:
      *   1. pre-rotation:  input is multiplied by the tables found at context
      *                     words 6 and 7 (TCOS, TSIN) and scattered into
      *                     output through the table at context word 2 (REVTAB);
      *   2. FFT in place on output;
      *   3. post-rotation in place on output.
      *
      * mdct_bits (context word 5) selects the code path:
      *   == 6  fully unrolled code for n = 64; the FFT is ff_fft16_vfp;
      *   else  rolled loops; the FFT is called through the pointer held in
      *         context word 9.
      *
      * FPSCR is switched to RunFast mode with short vectors of length 4 only
      * around the rotation passes; the value found on entry is put back
      * before every call and before returning.  s16-s27 and all callee-saved
      * core registers that are used get saved and restored.
 234  */
 235 function ff_imdct_half_vfp, export=1
 236         ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
 237         teq     ip, #6
 238         bne     10f
 239
     /* ---- Fully unrolled path, n = 64 ---- */
 240  .set n, 1<<6
 241  .set n2, n/2
 242  .set n4, n/4
 243  .set n8, n/8
 244
     /* 6 core words + 12 VFP words = 72 bytes, so sp remains 8-byte aligned
      * for the call below. */
 245         push    {v1-v5,lr}
 246         vpush   {s16-s27}
 247         fmrx    OLDFPSCR, FPSCR
 248         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 249         fmxr    FPSCR, lr
     /* ORIGOUT shares a2 with J0, so move it to OUT before J0 gets used */
 250         mov     OUT, ORIGOUT
 251         ldr     REVTAB, [CONTEXT, #2*4]
 252         ldr     TCOS, [CONTEXT, #6*4]
 253         ldr     TSIN, [CONTEXT, #7*4]
 254
 255  .set k, 0
 256  .rept n8/2
 257         prerotation_innerloop
 258  .endr
 259
     /* Restore the entry FPSCR around the FFT call */
 260         fmxr    FPSCR, OLDFPSCR
 261         mov     a1, OUT
 262         bl      X(ff_fft16_vfp)
 263         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 264         fmxr    FPSCR, lr
 265
     /* Pipelined post-rotation: one head to prime, n8/2 - 1 combined steps,
      * one tail to drain. */
 266  .set k, 0
 267         postrotation_innerloop , head
 268  .rept n8/2 - 1
 269         postrotation_innerloop tail, head
 270  .endr
 271         postrotation_innerloop tail
 272
 273         fmxr    FPSCR, OLDFPSCR
 274         vpop    {s16-s27}
 275         pop     {v1-v5,pc}
 276
     /* ---- General path, any other mdct_bits; ip still holds mdct_bits ---- */
 277 10:
     /* a4 is pushed here (and popped again on exit) purely as padding:
      * 10 core words + 12 VFP words + the 2 words pushed further down make
      * 96 bytes, so sp is 8-byte aligned at the indirect FFT call as the
      * AAPCS requires at a public interface.  With only the 9 registers
      * that actually need saving the total would be 92 bytes. */
 278         push    {a4,v1-v6,sl,fp,lr}
 279         vpush   {s16-s27}
 280         fmrx    OLDFPSCR, FPSCR
 281         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 282         fmxr    FPSCR, lr
 283         mov     lr, #1
 284         mov     OUT, ORIGOUT
 285         ldr     REVTAB, [CONTEXT, #2*4]
 286         ldr     TCOS, [CONTEXT, #6*4]
 287         ldr     TSIN, [CONTEXT, #7*4]
     /* lr = 1 << mdct_bits */
 288         mov     lr, lr, lsl ip
 289
     /* CONTEXT (a1) is needed again after the loop, and OLDFPSCR shares v5
      * with REVTAB_HI, so park both on the stack. */
 290         push    {CONTEXT,OLDFPSCR}
     /* Descending pointers: IN_HI = IN + 2*lr bytes, REVTAB_HI = REVTAB + lr/2
      * bytes, TCOS_HI/TSIN_HI = TCOS/TSIN + lr bytes. */
 291         add     IN_HI, IN, lr, lsl #1
 292         add     REVTAB_HI, REVTAB, lr, lsr #1
 293         add     TCOS_HI, TCOS, lr
 294         add     TSIN_HI, TSIN, lr
     /* Pre-rotation loop: runs until the ascending and descending input
      * pointers meet. */
 295 0:      prerotation_innerloop_rolled
 296         teq     IN, IN_HI
 297         bne     0b
     /* Reload without popping: the pair is needed once more after the call */
 298         ldmia   sp, {CONTEXT,OLDFPSCR}
 299
 300         mov     ORIGOUT, OUT
 301         fmxr    FPSCR, OLDFPSCR
 302         ldr     ip, [CONTEXT, #9*4]
 303         blx     ip                          @ s->fft_calc(s, output)
 304
 305         pop     {CONTEXT,OLDFPSCR}
 306         ldr     lr, =0x03030000             @ RunFast mode, short vectors of length 4, stride 1
 307         ldr     ip, [CONTEXT, #5*4]         @ mdct_bits
 308         fmxr    FPSCR, lr
 309         mov     lr, #1
 310         mov     lr, lr, lsl ip
     /* Step TCOS and TSIN back by lr/2 bytes, then rebuild the descending
      * pointers for the post-rotation. */
 311         sub     TCOS, TCOS, lr, lsr #1
 312         sub     TSIN, TSIN, lr, lsr #1
 313         add     OUT_HI, OUT, lr, lsl #1
 314         add     TCOS_HI, TCOS, lr
 315         add     TSIN_HI, TSIN, lr
     /* Pipelined post-rotation.  A head-only step primes the pipeline and
      * execution enters the loop at label 1.  Each pass through the loop
      * contains two combined steps that alternate between the s20-s23 and
      * s24-s27 cos buffers; OUT/OUT_HI move by 32 bytes per pass, which is
      * why the first tail of a pass uses offset -16 for data loaded at
      * offset 16 before the pointers moved.  The loop ends when TSIN meets
      * TSIN_HI, and a final tail-only step drains the pipeline. */
 316         postrotation_innerloop_rolled , head, s20, s21, s22, s23,, 0
 317         b       1f
 318 0:      add     OUT, OUT, #32
 319         sub     OUT_HI, OUT_HI, #32
 320         postrotation_innerloop_rolled tail, head, s20, s21, s22, s23, s24, 0, -16
 321 1:      postrotation_innerloop_rolled tail, head, s24, s25, s26, s27, s20, 16, 0
 322         teq     TSIN, TSIN_HI
 323         bne     0b
 324         postrotation_innerloop_rolled tail,,,,,, s24,, 16
 325
 326         fmxr    FPSCR, OLDFPSCR
 327         vpop    {s16-s27}
     /* a4 is reloaded only to undo the alignment padding pushed at label 10 */
 328         pop     {a4,v1-v6,sl,fp,pc}
 329 endfunc
 330
     /* Release every register alias defined with .req at the top of this
      * file, one .unreq per alias. */
 331         .unreq  CONTEXT
 332         .unreq  ORIGOUT
 333         .unreq  IN
 334         .unreq  OUT
 335         .unreq  REVTAB
 336         .unreq  TCOS
 337         .unreq  TSIN
 338         .unreq  OLDFPSCR
 339         .unreq  J0
 340         .unreq  J1
 341         .unreq  J2
 342         .unreq  J3
 343         .unreq  REVTAB_HI
 344         .unreq  IN_HI
 345         .unreq  OUT_HI
 346         .unreq  TCOS_HI
 347         .unreq  TSIN_HI