1 ;******************************************************************************
2 ;* SIMD-optimized MLP DSP functions
3 ;* Copyright (c) 2014 James Almer <jamrial@gmail.com>
5 ;* This file is part of FFmpeg.
7 ;* FFmpeg is free software; you can redistribute it and/or
8 ;* modify it under the terms of the GNU Lesser General Public
9 ;* License as published by the Free Software Foundation; either
10 ;* version 2.1 of the License, or (at your option) any later version.
12 ;* FFmpeg is distributed in the hope that it will be useful,
13 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 ;* Lesser General Public License for more details.
17 ;* You should have received a copy of the GNU Lesser General Public
18 ;* License along with FFmpeg; if not, write to the Free Software
19 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20 ;******************************************************************************
22 %include "libavutil/x86/x86util.asm"
45 movdqa m1, [samplesq + 16] ; aligned load starting 16 bytes into the current samples row
46 movdqa m2, [coeffsq + 16] ; matching aligned load starting 16 bytes into coeffs
54 vextracti128 xm1, m0, 1 ; xm1 = upper 128 bits of m0
60 pshufd xm1, xm0, q0032 ; low qword of xm1 = high qword of xm0 (q0032 selects dwords 2,3)
63 movzx blsbsd, byte [blsbs_ptrq] ; load *bypassed_lsbs, zero-extended
64 sar accumq, 14 ; accum >>= 14 (arithmetic shift)
65 and accumd, maskd ; accum &= mask
66 add accumd, blsbsd ; accum += *bypassed_lsbs
; NOTE(review): dest_chq is added unscaled, so it has to be a byte offset
; already at this point -- confirm where it is scaled.
67 mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
68 add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
69 add samplesq, 32 ; samples += MAX_CHANNELS;
; Scalar tail of one iteration on the noise-shift path: mixes a shifted
; noise_buffer sample into accum, then shifts, masks, adds the bypassed LSBs,
; stores the sample and advances both row pointers.
73 %macro LOOP_SHIFT_END 0
74 pshufd xm1, xm0, q0032 ; low qword of xm1 = high qword of xm0 (q0032 selects dwords 2,3)
77 and indexd, auspd ; index &= access_unit_size_pow2 - 1 (ausp is decremented once during setup)
78 movsx noiseq, byte [noise_bufferq + indexq] ; load noise_buffer[index], sign-extended to 64 bits
79 add indexd, index2d ; index += index2
80 SHLX noiseq, mns ; noise <<= mns; setup adds 7 to matrix_noise_shift before it lands in mns
81 add accumq, noiseq ; accum += shifted noise sample
82 movzx noised, byte [blsbs_ptrq] ; load *bypassed_lsbs (reuse tmp noise register)
83 sar accumq, 14 ; accum >>= 14 (arithmetic shift)
84 and accumd, maskd ; accum &= mask
85 add accumd, noised ; accum += *bypassed_lsbs
; NOTE(review): dest_chq is added unscaled, so it has to be a byte offset
; already at this point -- confirm where it is scaled.
86 mov [samplesq + dest_chq], accumd ; samples[dest_ch] = accum
87 add blsbs_ptrq, 8 ; bypassed_lsbs += MAX_CHANNELS;
88 add samplesq, 32 ; samples += MAX_CHANNELS;
92 ;void ff_mlp_rematrix_channel(int32_t *samples, const int32_t *coeffs,
93 ; const uint8_t *bypassed_lsbs, const int8_t *noise_buffer,
94 ; int index, unsigned int dest_ch, uint16_t blockpos,
95 ; unsigned int maxchan, int matrix_noise_shift,
96 ; int access_unit_size_pow2, int32_t mask)
; Body of mlp_rematrix_channel (C prototype in the comment directly above).
; NOTE(review): presumably expanded once per instruction set -- confirm at the
; instantiation sites.
97 %macro MLP_REMATRIX_CHANNEL 0
; cglobal fields, following x86inc's convention: 0 arguments loaded
; automatically, 13 general-purpose registers, 5 vector registers, then the
; names bound to register slots r0, r1, ... in order. Because nothing is loaded
; automatically, the arguments are fetched by hand with the "m" operands below.
98 cglobal mlp_rematrix_channel, 0, 13, 5, samples, coeffs, blsbs_ptr, blsbs, \
99 index, dest_ch, blockpos, maxchan, mns, \
101 mov mnsd, mnsm ; load matrix_noise_shift
102 movzx blockposq, word blockposm ; load and zero extend blockpos (16bit)
103 mov maxchand, maxchanm ; load maxchan
104 mov maskd, maskm ; load mask
106 mov dest_chd, dest_chm ; load dest_ch (not needed on UNIX64)
109 lea cntq, [blsbs_ptrq + blockposq*8] ; cnt = bypassed_lsbs + blockpos * 8; presumably the end pointer the loops compare against
110 test mnsd, mnsd ; is matrix_noise_shift != 0?
111 jne .shift ; jump if true
112 cmp maxchand, 4 ; is maxchan < 4?
; NOTE(review): jl is a signed comparison while the prototype declares maxchan
; as unsigned int; the two agree only while maxchan stays below 2^31 -- confirm.
113 jl .loop4 ; jump if true
117 ; Process 5 or more channels
125 ; Process up to 4 channels
126 movdqa xm0, [samplesq] ; xm0 = samples[0..3] (aligned 16-byte load)
127 movdqa xm1, [coeffsq ] ; xm1 = coeffs[0..3] (aligned 16-byte load)
128 pshufd xm2, xm0, q2301 ; xm2 = xm0 with the two dwords of each qword swapped
129 pshufd xm3, xm1, q2301 ; xm3 = xm1 with the same swap
139 mov indexd, indexm ; load index (not needed on UNIX64)
; Slot r9 is addressed by number because it only receives the name ausp in the
; DEFINE_ARGS lists below; r9m is the tenth argument of the prototype.
141 mov r9d, r9m ; load access_unit_size_pow2
143 ; bmi2 has shift functions that accept any gpr, not just cl, so keep things in place.
144 DEFINE_ARGS samples, coeffs, blsbs_ptr, noise_buffer, \
145 index, dest_ch, accum, index2, mns, \
146 ausp, mask, cnt, noise
147 add mnsd, 7 ; matrix_noise_shift += 7
; Whatever argument arrived in rcx is parked in slot r6; each of the two
; layouts below names slot 6 after the argument it displaced (samples in the
; first, noise_buffer in the second).
149 mov r6, rcx ; move rcx elsewhere so we can use cl for matrix_noise_shift
; NOTE(review): presumably the WIN64 layout, where x86inc maps r0 to rcx -- confirm.
152 DEFINE_ARGS mns, coeffs, blsbs_ptr, noise_buffer, index, dest_ch, samples, \
153 index2, accum, ausp, mask, cnt, noise
; NOTE(review): presumably the UNIX64 layout, where x86inc maps r3 to rcx -- confirm.
156 DEFINE_ARGS samples, coeffs, blsbs_ptr, mns, index, dest_ch, noise_buffer, \
157 index2, accum, ausp, mask, cnt, noise
; r8 is the slot that was named mns on the cglobal line; its value plus 7 is
; copied into the slot that the layouts above now call mns.
159 lea mnsd, [r8 + 7] ; rcx = matrix_noise_shift + 7
161 sub auspd, 1 ; access_unit_size_pow2 -= 1, giving the mask that is ANDed into index
; Slot r7 was named maxchan on the cglobal line; the DEFINE_ARGS lists rename
; it index2, so it is compared by number before the lea below overwrites it.
162 cmp r7d, 4 ; is maxchan < 4?
163 lea index2q, [indexq*2 + 1] ; index2 = 2 * index + 1; lea leaves the flags set by cmp untouched
164 jl .loop4_shift ; jump if maxchan < 4 (consumes the flags from the cmp above)
168 ; Process 5 or more channels
176 ; Process up to 4 channels
177 movdqa xm0, [samplesq] ; xm0 = samples[0..3] (aligned 16-byte load)
178 movdqa xm1, [coeffsq ] ; xm1 = coeffs[0..3] (aligned 16-byte load)
179 pshufd xm2, xm0, q2301 ; xm2 = xm0 with the two dwords of each qword swapped
180 pshufd xm3, xm1, q2301 ; xm3 = xm1 with the same swap
191 %if HAVE_AVX2_EXTERNAL