git.sesse.net Git - ffmpeg/blob - libavcodec/aarch64/synth_filter_neon.S

   1 /*
   2  * Copyright (c) 2010 Mans Rullgard <mans@mansr.com>
   3  * Copyright (c) 2015 Janne Grunau <janne-libav@jannau.net>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "asm-offsets.h"
  23
  24 #include "libavutil/aarch64/asm.S"
  25
  26 .macro inner_loop
             // One multiply-accumulate step, four 32-bit float lanes wide,
             // for each of the four running sums v4-v7.
             //
             // Loads one vector from each of x8-x11 and one from each of
             // x4-x7; every one of these eight pointers is post-incremented
             // by x15 bytes. The vectors loaded through x8 and x11 are
             // lane-reversed (rev64 followed by ext #8) before being used.
             //
             //   v4 -= [x4] * reverse([x8])
             //   v5 += [x5] * [x9]
             //   v6 += [x6] * [x10]
             //   v7 += [x7] * reverse([x11])
             //
             // Scratch: v24-v31. No instruction here writes the condition
             // flags, so callers may place a cmp/subs before the macro and
             // the conditional branch after it.
  27         ld1             {v29.4s},  [x9],  x15   // operand for v5, forward order
  28         ld1             {v28.4s},  [x8],  x15   // operand for v4, reversed below
  29         ld1             {v30.4s},  [x10], x15   // operand for v6, forward order
  30         ld1             {v31.4s},  [x11], x15   // operand for v7, reversed below
  31         rev64           v28.4s, v28.4s          // swap lanes within each 64-bit half
  32         ld1             {v24.4s},  [x4],  x15   // factor for v4
  33         ld1             {v25.4s},  [x5],  x15   // factor for v5
  34         rev64           v31.4s, v31.4s          // swap lanes within each 64-bit half
  35         ld1             {v26.4s},  [x6],  x15   // factor for v6
  36         fmla            v5.4s,  v25.4s, v29.4s  // v5 += v25 * v29
  37         ld1             {v27.4s},  [x7],  x15   // factor for v7
  38         ext             v28.16b, v28.16b, v28.16b, #8   // swap the 64-bit halves: lanes now 3,2,1,0
  39         ext             v31.16b, v31.16b, v31.16b, #8   // swap the 64-bit halves: lanes now 3,2,1,0
  40         fmla            v6.4s,  v26.4s, v30.4s  // v6 += v26 * v30
  41         fmls            v4.4s,  v24.4s, v28.4s  // v4 -= v24 * reversed v28
  42         fmla            v7.4s,  v27.4s, v31.4s  // v7 += v27 * reversed v31
  43 .endm
  44
  45 function ff_synth_filter_float_neon, export=1
             // Runs the imdct_half routine found through x0 on the input
             // block, then accumulates window * synth_buf products into
             // 32 output floats and 32 floats of carried state.
             //
             // Registers on entry, as consumed by the code below:
             //   x0  context pointer; a function pointer is loaded from
             //       [x0, #IMDCT_HALF] and x0 is passed through to that call
             //   x1  synth_buf base
             //   x2  pointer to the 32-bit synth_buf_offset (read, then updated)
             //   x3  synth_buf_2 (read as the initial a/b sums, rewritten with c/d)
             //   x4  window
             //   x5  out
             //   x6  in (handed to imdct_half as its third argument)
             //   s0  scalar multiplied into the two vectors stored to out
             // NOTE(review): the C prototype is not visible here; the argument
             // names above come from the existing comments - confirm against
             // the caller.
             //
             // Stack: one 64-byte frame holding x3, x4, x5, the adjusted x1,
             // the rounded offset, x30 and s0 across the indirect call.
  46         ldr             w7,  [x2]               // w7 = *synth_buf_offset
  47         ldr             x9,  [x0, #IMDCT_HALF]  // imdct_half function pointer
  48         sxtw            x7,  w7                 // sign-extend the offset for address arithmetic
  49         stp             x3,  x4,  [sp, #-64]!   // allocate 64 bytes; save synth_buf_2, window
  50         add             x1,  x1,  x7,  lsl #2   // synth_buf += offset * 4 bytes
  51         sub             w8,  w7,  #32           // offset - 32
  52         stp             x5,  x1,  [sp, #16]     // save out, synth_buf
  53         and             x7,  x7,  #~63          // offset rounded down to a multiple of 64
  54         and             w8,  w8,  #511          // new offset = (offset - 32) & 511
  55         stp             x7,  x30, [sp, #32]     // save rounded offset, return address
  56         str             w8,  [x2]               // *synth_buf_offset = new offset
  57         str             s0,  [sp, #48]          // keep s0 across the call
  58
  59         mov             x2,  x6                 // in
  60
  61         blr             x9                      // imdct_half(x0, synth_buf, in)
  62
  63         ldp             x2,  x4,  [sp]          // synth_buf_2, window
  64         ldp             x13, x9,  [sp, #16]     // out, synth_buf
  65         ldp             x0,  x30, [sp, #32]     // rounded offset (offset & ~63), return address
  66         ldr             s0,  [sp, #48]          // restore s0
  67
  68         add             x3,  x2,  #16*4         // synth_buf_2 + 16
  69         add             x14, x13, #16*4         // out + 16
  70         add             x8,  x9,  #12*4         // synth_buf + 12 (read lane-reversed in inner_loop)
  71         mov             x15, #64*4              // pointer step per inner_loop: 64 floats, in bytes
  72         mov             x1,  #4                 // outer loop count; each pass handles 4 lanes
             // Outer loop: one pass per group of four lanes.
  73 1:
  74         add             x10, x9,  #16*4         // x9 + 16 floats
  75         add             x11, x8,  #16*4         // x8 + 16 floats
  76         add             x5,  x4,  #16*4         // window + 16
  77         add             x6,  x4,  #32*4         // window + 32
  78         add             x7,  x4,  #48*4         // window + 48
  79
  80         ld1             {v4.4s},   [x2]         // a
  81         ld1             {v5.4s},   [x3]         // b
  82         movi            v6.4s,  #0              // c
  83         movi            v7.4s,  #0              // d
  84
  85         mov             x12, #512               // countdown, stepped by 64
             // First pass: runs while the countdown stays above the rounded offset.
  86 2:
  87         sub             x12, x12, #64
  88         cmp             x12, x0                 // flags are consumed by b.gt after the macro
  89         inner_loop
  90         b.gt            2b                      // repeat while x12 > rounded offset (signed)
  91
             // Move the x8/x9 read pointers back by 512 floats.
             // NOTE(review): presumably this wraps reads around a 512-float
             // synth_buf - confirm the buffer size with the caller.
  92         sub             x8,  x8,  #512*4
  93         sub             x9,  x9,  #512*4
  94         cbz             x12, 4f                 // countdown exhausted: no second pass
  95         sub             x10, x10, #512*4        // only needed when the second pass runs;
  96         sub             x11, x11, #512*4        // x10/x11 are recomputed at label 1
             // Second pass: remaining steps until the countdown reaches zero.
  97 3:
  98         subs            x12, x12, #64           // flags are consumed by b.gt after the macro
  99         inner_loop
 100         b.gt            3b
 101 4:
 102         subs            x1,  x1,  #1            // flags survive the fmul/st1 group down to b.le
 103         fmul            v4.4s,  v4.4s,  v0.s[0] // a *= s0
 104         fmul            v5.4s,  v5.4s,  v0.s[0] // b *= s0
 105         st1             {v6.4s},   [x2],  #16   // synth_buf_2[0..]  = c, advance 4 floats
 106         st1             {v7.4s},   [x3],  #16   // synth_buf_2[16..] = d, advance 4 floats
 107         st1             {v4.4s},   [x13], #16   // out[0..]  = a, advance 4 floats
 108         st1             {v5.4s},   [x14], #16   // out[16..] = b, advance 4 floats
 109         b.le            10f                     // all 4 outer passes done
 110
             // Set up the next group of four lanes. With 8 inner_loop steps per
             // outer pass (assumes rounded offset within 0..448 - TODO confirm)
             // x4 has advanced 512 floats, so this leaves it 4 floats further on.
 111         sub             x4,  x4,  #508*4        // window
 112         add             x9,  x9,  #4*4          // synth_buf: forward reads move up 4 floats
 113         sub             x8,  x8,  #4*4          // synth_buf: reversed reads move down 4 floats
 114         b               1b
 115
 116 10:
 117         add             sp,  sp,  #64           // release the 64-byte frame
 118         ret
 119 endfunc