@*****************************************************************************
@ amplify.S : ARM NEON software amplification
@*****************************************************************************
@ Copyright (C) 2012 Rémi Denis-Courmont
@
@ This program is free software; you can redistribute it and/or modify
@ it under the terms of the GNU Lesser General Public License as published by
@ the Free Software Foundation; either version 2.1 of the License, or
@ (at your option) any later version.
@
@ This program is distributed in the hope that it will be useful,
@ but WITHOUT ANY WARRANTY; without even the implied warranty of
@ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
@ GNU Lesser General Public License for more details.
@
@ You should have received a copy of the GNU Lesser General Public License
@ along with this program; if not, write to the Free Software Foundation,
@ Inc., 51 Franklin Street, Fifth Floor, Boston MA 02110-1301, USA.
@****************************************************************************/

        .syntax unified
        .arm
        .fpu    neon
        .text

@ Register roles. Keep comments off the define lines themselves: the
@ preprocessor would otherwise fold them into the macro body.
@   r0 = output pointer, post-incremented by 16 on every store
@   r1 = input pointer, post-incremented by 16 on every load
@   r2 = bytes still to be loaded, decremented by 16 on every load
#define DST     r0
#define SRC     r1
#define SIZE    r2

@-----------------------------------------------------------------------------
@ amplify_float_arm_neon
@
@ Multiplies every 32-bit float read from the input by one scalar factor and
@ writes the products to the output, one 128-bit vector (4 floats) at a time.
@
@ Presumed C prototype (not visible in this file, confirm against the caller):
@   void amplify_float_arm_neon(float *dst, const float *src,
@                               size_t bytes, float amp);
@
@ In:    r0 = destination, r1 = source, r2 = length in BYTES
@        scale factor in s0; when __ARM_PCS is defined (softfp) it is taken
@        from r3 and moved to s0 first
@ Out:   nothing
@ Clobb: r0, r1, r2, flags, d16-d21 (and s0 in the softfp case)
@
@ Requirements and behaviour:
@  - Both pointers must be 16-byte aligned: every load and store carries the
@    :128 alignment qualifier.
@  - A length of 0 returns immediately without touching memory.
@  - The length is consumed in 16-byte steps and a partial trailing vector is
@    processed as a whole one, so callers should pass a multiple of 16.
@  - dst may equal src: every vector is loaded before it is stored.
@
@ The code is software pipelined over three register pairs
@ (A = d16-d17, B = d18-d19, C = d20-d21). Each step loads and scales one
@ vector while storing a vector that was loaded two steps earlier. After each
@ load SIZE holds the bytes that remain, so the pipeline must drain as soon as
@ SIZE is <= 0. The early exits therefore use LS (lower or same), the exact
@ complement of the HI test that closes the loop. Using LO there would miss
@ the "exactly zero" case and load and store one vector past the end.
@-----------------------------------------------------------------------------
        .align  2
        .global amplify_float_arm_neon
        .type   amplify_float_arm_neon, %function
amplify_float_arm_neon:
        cmp     SIZE, #0
        bxeq    lr                              @ empty buffer: nothing to do
#ifdef __ARM_PCS
        vmov    s0, r3                          @ softfp: scalar arrives in r3
#endif

        @ Prologue: fill pipeline slot A.
        pld     [SRC, #64]                      @ prefetch hint ahead of SRC
        vld1.f32 {d16-d17}, [SRC,:128]!
        subs    SIZE, SIZE, #16                 @ SIZE = bytes left after A
        vmul.f32 d16, d16, d0[0]                @ NEON ops keep the subs flags
        vmul.f32 d17, d17, d0[0]
        bls     5f                              @ input exhausted: flush A

        @ Prologue: fill pipeline slot B.
        pld     [SRC, #64]
        vld1.f32 {d18-d19}, [SRC,:128]!
        subs    SIZE, SIZE, #16
        vmul.f32 d18, d18, d0[0]
        vmul.f32 d19, d19, d0[0]
        bls     2f                              @ input exhausted: flush A, B

1:      @ main loop starts; pending on entry: A then B
        pld     [SRC, #64]
        vld1.f32 {d20-d21}, [SRC,:128]!         @ load C
        subs    SIZE, SIZE, #16
        vmul.f32 d20, d20, d0[0]
        vmul.f32 d21, d21, d0[0]
        vst1.f32 {d16-d17}, [DST,:128]!         @ store A (slot A is free again)
        bls     3f                              @ input exhausted: flush B, C

        pld     [SRC, #64]
        vld1.f32 {d16-d17}, [SRC,:128]!         @ load next A
        subs    SIZE, SIZE, #16
        vmul.f32 d16, d16, d0[0]
        vmul.f32 d17, d17, d0[0]
        vst1.f32 {d18-d19}, [DST,:128]!         @ store B
        bls     4f                              @ input exhausted: flush C, A

        pld     [SRC, #64]
        vld1.f32 {d18-d19}, [SRC,:128]!         @ load next B
        subs    SIZE, SIZE, #16
        vmul.f32 d18, d18, d0[0]
        vmul.f32 d19, d19, d0[0]
        vst1.f32 {d20-d21}, [DST,:128]!         @ store C
        bhi     1b                              @ more input: go round again
        @ main loop ends; fall through with A then B still pending

2:      @ drain: A, B
        vst1.f32 {d16-d17}, [DST,:128]!
        vst1.f32 {d18-d19}, [DST,:128]!
        bx      lr

3:      @ drain: B, C
        vst1.f32 {d18-d19}, [DST,:128]!
        vst1.f32 {d20-d21}, [DST,:128]!
        bx      lr

4:      @ drain: C, then fall through to store A
        vst1.f32 {d20-d21}, [DST,:128]!
5:      @ drain: A
        vst1.f32 {d16-d17}, [DST,:128]!
        bx      lr