1 ;******************************************************************************
2 ;* SIMD optimized Opus encoder DSP function
4 ;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
6 ;* This file is part of FFmpeg.
8 ;* FFmpeg is free software; you can redistribute it and/or
9 ;* modify it under the terms of the GNU Lesser General Public
10 ;* License as published by the Free Software Foundation; either
11 ;* version 2.1 of the License, or (at your option) any later version.
13 ;* FFmpeg is distributed in the hope that it will be useful,
14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 ;* Lesser General Public License for more details.
18 ;* You should have received a copy of the GNU Lesser General Public
19 ;* License along with FFmpeg; if not, write to the Free Software
20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21 ;******************************************************************************
24 %include "libavutil/x86/x86util.asm"
; Memory constants, replicated across all SIMD lanes (8 dwords fills a ymm).
const_float_abs_mask: times 8 dd 0x7fffffff ; clears the float sign bit (fabs mask)
const_align_abs_edge: times 8 dd 0 ; zero pad used to mask away overread tail data
const_float_0_5: times 8 dd 0.5
const_float_1: times 8 dd 1.0
const_float_sign_mask: times 8 dd 0x80000000 ; isolates the float sign bit
; NOTE(review): the const_int32_offsets label and its %rep wrapper are not
; visible in this chunk; this dd presumably emits per-lane byte offsets
; relative to that label — confirm against the full file.
dd $-const_int32_offsets
47 ; Setup High Register to be used
48 ; for holding memory constants
; %1 - the register to be used, assumes it is >= mm8
51 ; %2 - name of the constant.
53 ; Subsequent opcodes are going to use the constant in the form
54 ; "addps m0, mm_const_name" and it would be turned into:
55 ; "addps m0, [const_name]" on 32 bit arch or
56 ; "addps m0, m8" on 64 bit arch
; SET_HI_REG_MM_CONSTANT movop, reg, const_name
; Preloads a memory constant into a high register (reg >= m8) so later code
; can reference it as "mm_const_name" without a memory operand on each use.
; NOTE(review): the mm_const_%3 %define, the 32-bit fallback path and the
; %endmacro are not visible in this chunk — confirm against the full file.
%macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
%{1} %2, [%3] ; movaps m8, [const_name]
67 ; Set Position Independent Code
68 ; Base address of a constant
69 ; %1 - the register to be used, if PIC is set
70 ; %2 - name of the constant.
; Subsequent opcodes are going to use the base address in the form
73 ; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
74 ; "movaps m0, [r5 + r4]" if PIC is enabled
75 ; "movaps m0, [constant_name + r4]" if texrel are used
; SET_PIC_BASE reg_op, reg, const_label
; Loads the base address of a constant for position-independent addressing
; and defines pic_base_<label> to either the register (PIC builds) or the
; label itself (textrel builds).
; NOTE(review): the %if PIC / %else / %endif guards and the %endmacro are not
; visible in this chunk — the two %defines below are alternative branches,
; not sequential definitions. Confirm against the full file.
%macro SET_PIC_BASE 3; reg, const_label
%{1} %2, [%3] ; lea r5, [rip+const]
%define pic_base_%3 %2
%define pic_base_%3 %3
; PULSES_SEARCH %1 ; %1 = "add" or "sub"
; One greedy pass that adds (or removes) a single pulse: for every position i
; it evaluates the distortion metric p = Sxy_new^2/Syy_new (or the rsqrt
; approximation of it), picks the lane/iteration with the maximum p, then
; updates Sxy (m7), Syy (m6) and Y[max_idx] accordingly.
; Register roles on entry: m6 = Syy_norm, m7 = Sxy_norm; m1/m3 accumulate
; per-lane max_idx / p_max across the inner loop.
; NOTE(review): loop labels (%%distortion_search), several %if/%else/%endif
; pairs and the %endmacro are not visible in this chunk — confirm against
; the full file.
%macro PULSES_SEARCH 1
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
movd xm2, dword r4d ; movd zero extends; xm2 = current index i
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
%if USE_APPROXIMATION == 1
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
%if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent approximation error from setting pulses in array padding.
movaps m5, [tmpY + r4] ; m5 = y[i]
xorps m0, m0 ; m0 = 0;
cmpps m0, m0, m5, 1 ; m0 = (0<y)
subps m4, m6, m5 ; m4 = Syy_new = Syy_norm - y[i]
subps m5, m7, [tmpX + r4] ; m5 = Sxy_new = Sxy_norm - X[i]
andps m5, m0 ; (0<y)?m5:0 ; never remove a pulse from a position that has none
%if USE_APPROXIMATION == 1
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
divps m5, m4 ; m5 = p = Sxy_new*Sxy_new/Syy
VPBROADCASTD m2, xm2 ; m2=i (all lanes get same values, we add the offset-per-lane, later)
cmpps m0, m3, m5, 1 ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
maxps m3, m5 ; m3=max(p_max,p)
; maxps here is faster than blendvps, despite blend having lower latency.
pand m2, m0 ; m2 = (p>p_max) ? i : 0 ; This version seems faster than sse41 pblendvb
pmaxsw m1, m2 ; max_idx accumulator; SSE2 signed word, so it would work for N < 32768/4
jb %%distortion_search
por m1, mm_const_int32_offsets ; max_idx offsets per individual lane (skipped in the inner loop)
movdqa m4, m1 ; needed for the aligned y[max_idx]+=1; processing
; Merge parallel maximums round 8 (4 vs 4)
; NOTE(review): m3 holds p values and m1 holds max_idx (see maxps/pmaxsw
; above); the original trailing comments on the blend pairs below had the
; two swapped — corrected here.
vextractf128 xm5, ym3, 1 ; xmm5 = ymm3[1x128] = ymm3[255..128b]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
vextracti128 xm2, ym1, 1 ; xmm2 = ymm1[1x128] = ymm1[255..128b]
BLENDVPS xm3, xm5, xm0 ; p = m0 ? p[1x128] : p[0x128]
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
; Merge parallel maximums round 4 (2 vs 2)
movhlps xm5, xm3 ; m5=p[xx32]
cmpps xm0, xm3, xm5, 1 ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
pshufd xm2, xm1, q3232
BLENDVPS xm3, xm5, xm0 ; p = m0 ? p[3,2] : p[1,0]
PBLENDVB xm1, xm2, xm0 ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
; Merge parallel maximums final round (1 vs 1)
shufps xm0, xm3, xm3, q1111 ; m0 = m3[1] = p[1]
cmpss xm0, xm3, 5 ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
pshufd xm2, xm1, q1111
PBLENDVB xm1, xm2, xm0 ; xm1[0] = winning max_idx
movd dword r4d, xm1 ; zero extends to the rest of r4q
VBROADCASTSS m3, [tmpX + r4]
%{1}ps m7, m3 ; Sxy += X[max_idx]
VBROADCASTSS m5, [tmpY + r4]
%{1}ps m6, m5 ; Syy += Y[max_idx]
; We have to update a single element in Y[i]
; However writing 4 bytes and then doing 16 byte load in the inner loop
; could cause a stall due to breaking write forwarding.
pcmpeqd m1, m1, m4 ; exactly 1 element matches max_idx and this finds it
and r4d, ~(mmsize-1) ; align address down, so the value pointed by max_idx is inside a mmsize load
movaps m5, [tmpY + r4] ; m5 = Y[y3...ym...y0]
andps m1, mm_const_float_1 ; m1 = [ 0...1.0...0]
%{1}ps m5, m1 ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
movaps [tmpY + r4], m5 ; Y[max_idx] +-= 1.0;
; We need one more register for
; PIC relative addressing. Use this
; to count it in cglobal.
; NOTE(review): the %if PIC / %else / %endif guards around these two
; %defines are not visible in this chunk — they are alternative branches,
; not sequential definitions. Confirm against the full file.
%define num_pic_regs 1
%define num_pic_regs 0
205 ; Pyramid Vector Quantization Search implementation
207 ; float * inX - Unaligned (SIMD) access, it will be overread,
208 ; but extra data is masked away.
209 ; int32 * outY - Should be aligned and padded buffer.
210 ; It is used as temp buffer.
; uint32 K - Number of pulses to have after quantization.
212 ; uint32 N - Number of vector elements. Must be 0 < N < 256
; PVQ_FAST_SEARCH %1 ; %1 = function name suffix (_approx / _exact)
; Emits: float ff_pvq_search<suffix>(float *inX, int32 *outY, uint32 K, uint32 N)
; Pipeline: 1) copy |X[i]| into the tmpX stack buffer and sum Sx;
;           2) initial projection y[i] = (int)(K/Sx * |X[i]|), accumulating
;              Sy (pulse count), Sxy and Syy;
;           3) greedily add/remove pulses via PULSES_SEARCH until Sy == K;
;           4) restore the sign of X[i] onto Y[i] and store to outY.
; Returns Syy_norm in xmm0 (64-bit) or on the stack (32-bit, sbrdsp style).
; NOTE(review): numerous lines of this macro (loop labels such as
; %%distortion_search/%%finish/%%zero_input, several %if/%endif pairs, the
; %%add_pulses_loop label and the %endmacro) are not visible in this chunk —
; confirm control flow against the full file.
%macro PVQ_FAST_SEARCH 1
cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
movaps m0, [const_float_abs_mask]
shl Nd, 2 ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
SET_PIC_BASE lea, r5, const_align_abs_edge ; rip+const
movups m2, [pic_base_const_align_abs_edge + r4 - mmsize] ; edge mask for the last partial vector
add Nd, r4d ; N = align(N, mmsize)
lea r4d, [Nd - mmsize] ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
movups m1, [inXq + r4]
movaps [tmpX + r4], m1 ; Sx = abs( X[N-1] )
jc %%end_loop_abs_sum
movups m2, [inXq + r4]
movaps [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
addps m1, m2 ; Sx += abs(X[i])
HSUMPS m1, m2 ; m1 = Sx (horizontal sum broadcast)
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
divss xm0, xm1 ; b = K/Sx
lea r4d, [Nd - mmsize]
pxor m5, m5 ; Sy ( Sum of abs( y[i]) )
xorps m6, m6 ; Syy ( Sum of y[i]*y[i] )
xorps m7, m7 ; Sxy ( Sum of X[i]*y[i] )
movaps m1, [tmpX + r4] ; m1 = X[i]
mulps m2, m0, m1 ; m2 = res*X[i]
cvtps2dq m2, m2 ; yt = (int)lrintf( res*X[i] )
paddd m5, m2 ; Sy += yt
cvtdq2ps m2, m2 ; yt = (float)yt
mulps m1, m2 ; m1 = X[i]*yt
movaps [tmpY + r4], m2 ; y[i] = m2
addps m7, m1 ; Sxy += m1;
mulps m2, m2 ; m2 = yt*yt
addps m6, m2 ; Syy += m2
HSUMPS m6, m1 ; Syy_norm
HADDD m5, m4 ; pulses
movd dword r4d, xm5 ; zero extends to the rest of r4q
sub Kd, r4d ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
jz %%finish ; K - pulses == 0
SET_HI_REG_MM_CONSTANT movaps, m8, const_float_0_5
SET_HI_REG_MM_CONSTANT movaps, m9, const_float_1
SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
; Use Syy/2 in distortion parameter calculations.
; Saves pre and post-calculation to correct Y[] values.
; Same precision, since float mantissa is normalized.
; The SQRT approximation does differ.
HSUMPS m7, m0 ; Sxy_norm
mulps m6, mm_const_float_0_5
jc %%remove_pulses_loop ; K - pulses < 0
align 16 ; K - pulses > 0
PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
jnz %%add_pulses_loop
addps m6, m6 ; Syy*=2 ; undo the earlier Syy/2
%%remove_pulses_loop:
PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
jnz %%remove_pulses_loop
addps m6, m6 ; Syy*=2 ; undo the earlier Syy/2
lea r4d, [Nd - mmsize]
movaps m2, [const_float_sign_mask]
movaps m0, [tmpY + r4] ; m0 = Y[i]
movups m1, [inXq + r4] ; m1 = X[i]
andps m1, m2 ; m1 = sign(X[i])
orps m0, m1 ; m0 = Y[i]*sign
cvtps2dq m3, m0 ; m3 = (int)m0
movaps [outYq + r4], m3
jnc %%restore_sign_loop
%if ARCH_X86_64 == 0 ; sbrdsp
movss r0m, xm6 ; return (float)Syy_norm
movaps m0, m6 ; return (float)Syy_norm
lea r4d, [Nd - mmsize]
movaps [outYq + r4], m0
movaps m6, [const_float_1]
; If 1, use a float op that gives half precision but executes in around 3 cycles.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; which makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
%define USE_APPROXIMATION 1
; NOTE(review): the INIT_XMM/INIT_YMM instruction-set selectors between the
; instantiations below are not visible in this chunk; each PVQ_FAST_SEARCH
; line presumably targets a different SIMD flavor (e.g. sse4/avx/avx2) —
; confirm against the full file.
PVQ_FAST_SEARCH _approx
PVQ_FAST_SEARCH _approx
%define USE_APPROXIMATION 0
PVQ_FAST_SEARCH _exact