summary |
shortlog |
log |
commit | commitdiff |
tree
raw |
patch |
inline | side by side (from parent 1:
1146133)
Using named define properly documents the code paths.
It also avoids passing additional numbered arguments through
multiple levels of macro templates.
The suffix handling is done by concatenation, like in
other asm functions and avoid having two separate
"cglobal" defines.
Signed-off-by: Ivan Kalvachev <ikalvachev@gmail.com>
-%macro PULSES_SEARCH 2 ; %1 - add or sub, %2 - use approximation
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
; m6 Syy_norm
; m7 Sxy_norm
addps m6, mm_const_float_0_5 ; Syy_norm += 1.0/2
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
movaps m4, [tmpY + r4] ; y[i]
movaps m5, [tmpX + r4] ; X[i]
+ %if USE_APPROXIMATION == 1
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
xorps m0, m0
cmpps m0, m0, m5, 4 ; m0 = (X[i] != 0.0)
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
addps m4, m6 ; m4 = Syy_new = y[i] + Syy_norm
addps m5, m7 ; m5 = Sxy_new = X[i] + Sxy_norm
+ %if USE_APPROXIMATION == 1
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
andps m5, m0 ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
%else
movaps m5, [tmpY + r4] ; m5 = y[i]
andps m5, m0 ; (0<y)?m5:0
%endif
andps m5, m0 ; (0<y)?m5:0
%endif
+%if USE_APPROXIMATION == 1
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
rsqrtps m4, m4
mulps m5, m4 ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
%else
; uint32 K - Number of pulses to have after quantizations.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
; uint32 K - Number of pulses to have after quantizations.
; uint32 N - Number of vector elements. Must be 0 < N < 256
;
-%macro PVQ_FAST_SEARCH 1 ; %1 - use approximation
-%if %1
-cglobal pvq_search_approx, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
-%else
-cglobal pvq_search_exact, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
-%endif
-
+%macro PVQ_FAST_SEARCH 1
+cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
%define tmpX rsp
%define tmpY outYq
%define tmpX rsp
%define tmpY outYq
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
jz %%zero_input ; if (Sx==0) goto zero_input
cvtsi2ss xm0, dword Kd ; m0 = K
+%if USE_APPROXIMATION == 1
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
rcpss xm1, xm1 ; m1 = approx(1/Sx)
mulss xm0, xm1 ; m0 = K*(1/Sx)
%else
align 16 ; K - pulses > 0
%%add_pulses_loop:
align 16 ; K - pulses > 0
%%add_pulses_loop:
- PULSES_SEARCH add, %1 ; m6 Syy_norm ; m7 Sxy_norm
+ PULSES_SEARCH add ; m6 Syy_norm ; m7 Sxy_norm
sub Kd, 1
jnz %%add_pulses_loop
sub Kd, 1
jnz %%add_pulses_loop
align 16
%%remove_pulses_loop:
align 16
%%remove_pulses_loop:
- PULSES_SEARCH sub, %1 ; m6 Syy_norm ; m7 Sxy_norm
+ PULSES_SEARCH sub ; m6 Syy_norm ; m7 Sxy_norm
add Kd, 1
jnz %%remove_pulses_loop
add Kd, 1
jnz %%remove_pulses_loop
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
; On Skylake & Ryzen the division is much faster (around 11c/3),
; that makes the full precision code about 2% slower.
; Opus also does use rsqrt approximation in their intrinsics code.
+%define USE_APPROXIMATION 1
+
+PVQ_FAST_SEARCH _approx
+
+%define USE_APPROXIMATION 0