git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/mpegaudiodsp_neon.S

   1 /*
   2  * Copyright (c) 2014 Janne Grunau <janne-libav@jannau.net>
   3  *
   4  * This file is part of FFmpeg.
   5  *
   6  * FFmpeg is free software; you can redistribute it and/or
   7  * modify it under the terms of the GNU Lesser General Public
   8  * License as published by the Free Software Foundation; either
   9  * version 2.1 of the License, or (at your option) any later version.
  10  *
  11  * FFmpeg is distributed in the hope that it will be useful,
  12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  14  * Lesser General Public License for more details.
  15  *
  16  * You should have received a copy of the GNU Lesser General Public
  17  * License along with FFmpeg; if not, write to the Free Software
  18  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  19  */
  20
  21 #include "libavutil/aarch64/asm.S"
  22
  23 #define FRAC_BITS   23   // fractional bits for sb_samples and dct
  24 #define WFRAC_BITS  16   // fractional bits for window
     // Right shift applied to the 64-bit fixed-point sums before they are
     // narrowed; also the width of the remainder mask (1 << OUT_SHIFT) - 1.
     // Evaluates to 16 + 23 - 15 = 24 with the values above.
  25 #define OUT_SHIFT (WFRAC_BITS + FRAC_BITS - 15)
  26
  27 const   tbl_rev128_s, align=4
     // Byte indices for tbl: reverses the order of the four 32-bit lanes of a
     // 128-bit vector while keeping the byte order inside each lane
     // (output lane 0 = input bytes 12-15, ..., output lane 3 = input bytes 0-3).
  28         .byte           12, 13, 14, 15
  29         .byte            8,  9, 10, 11
  30         .byte            4,  5,  6,  7
  31         .byte            0,  1,  2,  3
  32 endconst
  33
  34 .macro   apply_window   type, st
     // Expands to the exported function ff_mpadsp_apply_window_<type>_neon.
     //   type  fixed or float in this file; selects the .ifc paths below and
     //         is pasted into the function name
     //   st    lane suffix of the output stores (h and s in this file)
     // MLA, MLA2, MLS, MLS2 and round_sample must be defined when this macro
     // is expanded; they are purged at the end so that the next instantiation
     // can define its own versions.
     //
     // Register roles as used below:
     //   x0   input buffer of 32-bit words (presumably the synthesis buffer;
     //        TODO confirm against the C prototype)
     //   x1   window coefficients, 32 bits each (w / w2 below)
     //   x2   pointer to the 32-bit dither_state, only accessed for fixed
     //   x3   output samples, written upwards
     //   x4   incr, converted from elements to bytes below
     //   x5   samples2 = samples + 31 * incr, written downwards
     //   x9   load stride of 64 words; x13 = -incr; x14 / x15 loop counters
     // The instructions written here touch only x0-x15, v0-v7 and registers
     // within v16-v31 (the function/movrel helper macros are not visible here).
  35 function ff_mpadsp_apply_window_\type\()_neon, export=1
             // copy the first 32 words (2 x 64 bytes) of the buffer at x0 to
             // word offset 512 of the same buffer
  36         mov             x7,  x0
  37         add             x8,  x0,  #512<<2
  38         ld1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x7],  #64
  39         ld1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x7],  #64
  40         st1             {v0.4s,v1.4s,v2.4s,v3.4s},  [x8],  #64
  41         st1             {v4.4s,v5.4s,v6.4s,v7.4s},  [x8],  #64
             // v27 = lane-reversal index vector used by the tbl instructions
  42         movrel          x15, tbl_rev128_s
  43         ld1             {v27.4s}, [x15]
             // incr: elements -> bytes (shift by 1 for fixed, by 2 for float,
             // matching the h / s lane stores of the two instantiations)
  44 .ifc \type, fixed
  45         lsl             x4,  x4,  #1
  46 .else
  47         lsl             x4,  x4,  #2
  48 .endif
             // x10 = buffer + 45 words; x0 and x1 each advance by 16 words
  49         add             x10, x0,  #45<<2
  50         add             x0,  x0,  #16<<2
  51         add             x1,  x1,  #16<<2
  52         add             x5,  x3,  x4,  lsl #5
  53         sub             x5,  x5,  x4            // samples2
  54         neg             x13, x4                 // -incr
             // x9 = 64 words in bytes, post-index stride of every strided load below
  55         mov             x9,  #64<<2
  56 .ifc \type, fixed
             // 64-bit lanes after this block:
             //   v16 = ( sign-extended dither_state, 0 )
             //   v29 = 0
             //   v30 = ( mask, 0 ),  v31 = ( 0, mask ),  mask = (1 << OUT_SHIFT) - 1
  57         ld1r            {v16.2s}, [x2]          // dither_state
  58         sxtl            v16.2d, v16.2s
  59         movi            v29.2d, #0
  60         movi            v30.2d, #(1<<OUT_SHIFT)-1
  61         trn1            v31.2d, v29.2d, v30.2d
  62         trn2            v30.2d, v30.2d, v29.2d
  63         trn1            v16.2d, v16.2d, v29.2d
  64 .else
             // float: the accumulator starts at zero; v28 is never written again
             // in this path, so the mov v16, v28 at the end of each pass
             // clears v16
  65         movi            v16.4s, #0
  66         movi            v28.4s, #0
  67 .endif
             // outer loop counter: 4 passes, each storing 4 samples through x3
             // and 4 (3 on the first pass) through x5
  68         mov             x14, #4
  69 1:
             // per-pass pointers: x8 reads the buffer from x0; x6 (w) moves
             // forward and x7 (w2) moves backward relative to x1 as x14
             // counts down from 4 to 1
  70         mov             x8,  x0
  71         sub             x7,  x1,  #3<<2
  72         sub             x6,  x1,  x14, lsl #4
  73         add             x7,  x7,  x14, lsl #4
  74         add             x11, x6, #(32)<<2      // w  + 32
  75         add             x12, x7, #(32)<<2      // w2 + 32
             // 8 taps per pass; v17-v19 restart at zero, v16 is not cleared
             // here because it carries the dither remainder (fixed) or was
             // already reset (float)
  76         mov             x15, #8
  77         movi            v17.2d, #0
  78         movi            v18.2d, #0
  79         movi            v19.2d, #0
  80 2:
             // one tap per iteration; every pointer post-increments by x9
             //   v0 = buffer at x8,   v6 = v0 with its four words reversed
             //   v1 = buffer at x10,  v7 = v1 with its four words reversed
             //   v16 (and v17 via MLA2/MLS2) += w  * v0,   -= (w  + 32) * v7
             //   v18 (and v19 via MLS2)      -= w2 * v6,   -= (w2 + 32) * v1
             // the flags tested by b.gt come from the subs below; nothing
             // in the loop body writes NZCV afterwards
  81         subs            x15, x15, #1
  82         ld1             {v0.4s},  [x8],  x9
  83         ld1             {v1.4s},  [x10], x9
  84         ld1             {v2.4s},  [x6],  x9
  85         ld1             {v3.4s},  [x7],  x9
  86         tbl             v6.16b, {v0.16b}, v27.16b
  87         tbl             v7.16b, {v1.16b}, v27.16b
  88         ld1             {v4.4s},  [x11], x9
  89         ld1             {v5.4s},  [x12], x9
  90         MLA             v16, v2, v0
  91         MLA2            v17, v2, v0
  92         MLS             v18, v3, v6
  93         MLS2            v19, v3, v6
  94         MLS             v16, v4, v7
  95         MLS2            v17, v4, v7
  96         MLS             v18, v5, v1
  97         MLS2            v19, v5, v1
  98         b.gt            2b
  99
             // Z is set on the first pass only (x14 == 4). Both b.eq 4f below
             // consume these flags; no instruction in between writes NZCV.
 100         cmp             x14, #4
 101         sub             x10, x10, #64<<5        // 64 * 8 * sizeof(int32_t)
 102
 103 .ifc \type, fixed
             // v28 = low OUT_SHIFT bits of v16 lane 0, moved to lane 1. The
             // round_sample chain then threads that remainder through the
             // accumulator lanes in the same order in which the samples are
             // stored below; each accumulator is narrowed by OUT_SHIFT once
             // both of its lanes have been processed.
 104         and             v28.16b, v16.16b, v30.16b
 105         ext             v28.16b, v29.16b, v28.16b, #8
 106
             // first pass: v19 lane 1 is left out of the chain, its store is
             // skipped below as well
 107         b.eq            4f
 108         round_sample    v19, 1, 1
 109 4:
 110         round_sample    v16, 1, 0
 111         shrn            v16.2s, v16.2d,  #OUT_SHIFT
 112         round_sample    v19, 0, 0
 113         shrn            v19.2s, v19.2d,  #OUT_SHIFT
 114         round_sample    v17, 0, 1
 115         round_sample    v18, 1, 1
 116         round_sample    v17, 1, 0
 117         shrn2           v16.4s, v17.2d,  #OUT_SHIFT
 118         round_sample    v18, 0, 0
 119         shrn2           v19.4s, v18.2d,  #OUT_SHIFT
             // saturate to 16 bits; v18.4h ends up as
             // ( v19 lane 0, v19 lane 1, v18 lane 0, v18 lane 1 )
 120         sqxtn           v16.4h, v16.4s
 121         sqxtn           v18.4h, v19.4s
 122 .else
             // float: swap the 64-bit halves of v18 so that its lane order
             // matches the layout the fixed path produces for the stores below
 123         ext             v18.16b, v18.16b, v18.16b, #8
 124 .endif
 125
             // upward stores through x3 (v16 lanes 0..3) alternate with
             // downward stores through x5 (v18 lanes 1, 0, 3, 2); the first
             // downward store is skipped on the first pass
 126         st1             {v16.\st\()}[0], [x3], x4
 127         b.eq            4f
 128         st1             {v18.\st\()}[1], [x5], x13
 129 4:
 130         st1             {v16.\st\()}[1], [x3], x4
 131         st1             {v18.\st\()}[0], [x5], x13
 132         st1             {v16.\st\()}[2], [x3], x4
 133         st1             {v18.\st\()}[3], [x5], x13
 134         st1             {v16.\st\()}[3], [x3], x4
 135         st1             {v18.\st\()}[2], [x5], x13
 136
             // the next accumulation starts from the carried remainder (fixed)
             // or from zero (float, where v28 is still zero)
 137         mov             v16.16b, v28.16b
 138
             // next pass: x0 moves forward and x10 moves back by 4 words
 139         subs            x14, x14, #1
 140         add             x0,  x0,  #4<<2
 141         sub             x10, x10, #4<<2
 142         b.gt            1b
 143
 144 // computing samples[16]
             // 8 taps (1 + 3 * 2 + 1 MLS expansions) of the coefficients at
             // x1 + 32 words times the buffer at x0, both stepping by x9.
             // Two lanes are loaded per tap, but only lane 0 reaches the
             // stores below.
 145         add             x6,  x1,  #32<<2
 146         ld1             {v0.2s},  [x6],  x9
 147         ld1             {v1.2s},  [x0],  x9
 148 .rept   3
 149         ld1             {v2.2s},  [x6],  x9
 150         ld1             {v3.2s},  [x0],  x9
 151         MLS             v16, v0,  v1
 152         ld1             {v0.2s},  [x6],  x9
 153         ld1             {v1.2s},  [x0],  x9
 154         MLS             v16, v2,  v3
 155 .endr
 156         ld1             {v2.2s},  [x6],  x9
 157         ld1             {v3.2s},  [x0],  x9
 158         MLS             v16, v0,  v1
 159         MLS             v16, v2,  v3
 160
 161 .ifc \type, fixed
             // new dither_state = low OUT_SHIFT bits of lane 0;
             // sample = lane 0 >> OUT_SHIFT, saturated to 16 bits
 162         and             v28.16b, v16.16b, v30.16b
 163         shrn            v20.2s,  v16.2d,  #OUT_SHIFT
 164         xtn             v28.2s,  v28.2d
 165         sqxtn           v20.4h,  v20.4s
 166         st1             {v28.s}[0], [x2]        // save dither_state
 167         st1             {v20.h}[0], [x3]
 168 .else
 169         st1             {v16.s}[0], [x3]
 170 .endif
 171
 172         ret
 173 endfunc
     // drop the per-type helper macros so the next instantiation can
     // define its own versions
 174 .purgem round_sample
 175 .purgem MLA
 176 .purgem MLA2
 177 .purgem MLS
 178 .purgem MLS2
 179 .endm
 180
 181
 182 .macro  round_sample    r, idx, next
     // Fixed-point variant: one step of the remainder chain kept in v28.
     //   r     accumulator register, used as two 64-bit lanes
     //   idx   lane of r (0 or 1) whose remainder is extracted
     //   next  lane in which the following step expects the remainder
     // Adds the carried remainder v28 to r, then sets v28 to the masked low
     // bits of lane idx and, if needed, moves them to lane next.
     // Relies on the register setup done in apply_window: v29 = 0,
     // v30 = mask in lane 0 only, v31 = mask in lane 1 only.
     // r is not shifted here; the caller narrows it afterwards.
 183         add             \r\().2d, \r\().2d, v28.2d
 184 .if \idx == 0
 185         and             v28.16b,  \r\().16b,  v30.16b
 186 .else // \idx == 1
 187         and             v28.16b,  \r\().16b,  v31.16b
 188 .endif
 189 .if \idx != \next
 190   .if \next == 0
             // lane 1 -> lane 0, lane 1 is filled with zero from v29
 191         ext             v28.16b, v28.16b, v29.16b, #8
 192   .else
             // lane 0 -> lane 1, lane 0 is filled with zero from v29
 193         ext             v28.16b, v29.16b, v28.16b, #8
 194   .endif
 195 .endif
 196 .endm
 197 .macro  MLA             d, s1, s2
     // fixed: d.2d += s1 * s2 on the lower two 32-bit lanes
     // (signed widening multiply-accumulate into 64-bit lanes)
 198         smlal           \d\().2d, \s1\().2s, \s2\().2s
 199 .endm
 200 .macro  MLA2            d, s1, s2
     // fixed: d.2d += s1 * s2 on the upper two 32-bit lanes
     // (signed widening multiply-accumulate into 64-bit lanes)
 201         smlal2          \d\().2d, \s1\().4s, \s2\().4s
 202 .endm
 203 .macro  MLS             d, s1, s2
     // fixed: d.2d -= s1 * s2 on the lower two 32-bit lanes
     // (signed widening multiply-subtract into 64-bit lanes)
 204         smlsl           \d\().2d, \s1\().2s, \s2\().2s
 205 .endm
 206 .macro  MLS2            d, s1, s2
     // fixed: d.2d -= s1 * s2 on the upper two 32-bit lanes
     // (signed widening multiply-subtract into 64-bit lanes)
 207         smlsl2          \d\().2d, \s1\().4s, \s2\().4s
 208 .endm
 209 apply_window fixed, h // emits ff_mpadsp_apply_window_fixed_neon; output stores use .h lanes
 210
 211
 212 // nothing to do for round_sample and ML{A,S}2
 213 .macro  round_sample    r, idx, next
     // float: intentionally empty, so the float function contains no
     // remainder or rounding step
 214 .endm
 215 .macro  MLA2            d, s1, s2
     // float: intentionally empty, MLA already operates on all four lanes
 216 .endm
 217 .macro  MLS2            d, s1, s2
     // float: intentionally empty, MLS already operates on all four lanes
 218 .endm
 219 .macro  MLA             d, s1, s2
     // float: d += s1 * s2 on all four 32-bit lanes
 220         fmla            \d\().4s, \s1\().4s, \s2\().4s
 221 .endm
 222 .macro  MLS             d, s1, s2
     // float: d -= s1 * s2 on all four 32-bit lanes
 223         fmls            \d\().4s, \s1\().4s, \s2\().4s
 224 .endm
 225 apply_window float, s // emits ff_mpadsp_apply_window_float_neon; output stores use .s lanes