jne .loop
REP_RET
-INIT_XMM sse2
; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
+%macro SBR_QMF_DEINT_BFLY 0 ; emits sbr_qmf_deint_bfly for the current INIT_XMM instruction set
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
- mov cq, 64*4-2*mmsize
- lea vrevq, [vq + 64*4]
+ mov cq, 64*4-2*mmsize ; c = byte offset of the last 2*mmsize chunk of a 64*4-byte span; counts down
+ lea vrevq, [vq + 64*4] ; vrev = v + 64*4 bytes; advances upward while c walks downward
.loop:
- mova m0, [src0q+cq]
- mova m1, [src1q]
- mova m4, [src0q+cq+mmsize]
- mova m5, [src1q+mmsize]
+ mova m0, [src0q+cq] ; aligned load: src0 chunk at offset c
+ mova m1, [src1q] ; aligned load: current src1 chunk (src1 advances each iteration)
- mova m2, [src0q+cq+mmsize]
- mova m3, [src1q+mmsize]
- pshufd m4, m0, q0123
- pshufd m5, m1, q0123
- pshufd m6, m2, q0123
- pshufd m7, m3, q0123
- addps m3, m4
++ mova m4, [src0q+cq+mmsize] ; second src0 chunk
++ mova m5, [src1q+mmsize] ; second src1 chunk
+%if cpuflag(sse2)
- pshufd m2, m0, q0123
- pshufd m3, m1, q0123
- pshufd m6, m4, q0123
- pshufd m7, m5, q0123
++ pshufd m2, m0, q0123 ; m2 = shuffled m0; q0123 is presumably the reversed-dword order - confirm against x86inc
++ pshufd m3, m1, q0123 ; m3 = shuffled m1
++ pshufd m6, m4, q0123 ; m6 = shuffled m4
++ pshufd m7, m5, q0123 ; m7 = shuffled m5
+%else
- shufps m2, m0, m0, q0123
- shufps m3, m1, m1, q0123
- shufps m6, m4, m4, q0123
- shufps m7, m5, m5, q0123
++ shufps m2, m0, m0, q0123 ; same four shuffles written with shufps for the non-sse2 build
++ shufps m3, m1, m1, q0123
++ shufps m6, m4, m4, q0123
++ shufps m7, m5, m5, q0123
+%endif
- addps m5, m2
- subps m0, m7
- addps m1, m6
- subps m4, m3
- mova [vrevq], m1
++ addps m5, m2 ; m5 = src1[+mmsize] + shuffled src0[c]
+ subps m0, m7 ; m0 = src0[c] - shuffled src1[+mmsize] (m7 was taken before m5 changed)
+ addps m1, m6 ; m1 = src1[0] + shuffled src0[c+mmsize]
- subps m2, m5
++ subps m4, m3 ; m4 = src0[c+mmsize] - shuffled src1[0]
+ mova [vrevq], m1 ; sums are stored through vrev
- mova [vrevq+mmsize], m3
+ mova [vrevq+mmsize], m5
- mova [vq+cq], m0
+ mova [vq+cq], m0 ; differences are stored at v + c
- mova [vq+cq+mmsize], m2
+ mova [vq+cq+mmsize], m4
- add src1q, 2*mmsize
- add vrevq, 2*mmsize
- sub cq, 2*mmsize
- jge .loop
+ add src1q, 2*mmsize ; src1 and vrev move forward by two vectors
+ add vrevq, 2*mmsize
+ sub cq, 2*mmsize ; c moves backward by two vectors
+ jge .loop ; keep going while c is still >= 0 after the subtraction
REP_RET
+%endmacro
+
+INIT_XMM sse ; select the sse target, then expand the macro (its %else / shufps branch is presumably the one assembled here - confirm cpuflag semantics)
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2 ; second expansion for sse2 (the cpuflag(sse2) / pshufd branch)
+SBR_QMF_DEINT_BFLY
+
+INIT_XMM sse2
+cglobal sbr_qmf_pre_shuffle, 1,4,7,z ; single argument z; reads from the start of the buffer and writes from z + 64*4 bytes onward
+%define OFFSET (32*4-2*mmsize) ; byte offset of the last 2*mmsize chunk inside a 32*4-byte span
+ mov r3q, OFFSET ; r3 = descending byte offset, loop counter
+ lea r1q, [zq + (32+1)*4] ; r1 = z + 33*4 bytes, ascending read pointer
+ lea r2q, [zq + 64*4] ; r2 = z + 64*4 bytes, output base (never modified below)
+ mova m6, [ps_neg] ; constant defined outside this view; presumably a sign-bit mask - confirm
+.loop:
+ movu m0, [r1q] ; unaligned loads: the +4-byte addresses are not vector aligned
+ movu m2, [r1q + mmsize]
+ movu m1, [zq + r3q + 4 + mmsize]
+ movu m3, [zq + r3q + 4]
+
+ pxor m2, m6 ; xor the ascending-side data with ps_neg (presumably a negation - confirm)
+ pxor m0, m6
+ pshufd m2, m2, q0123 ; reorder dwords; q0123 is presumably the reversed order - confirm against x86inc
+ pshufd m0, m0, q0123
+ SBUTTERFLY dq, 2, 3, 5 ; macro defined elsewhere; presumably interleaves dwords of m2/m3 with m5 as scratch - confirm
+ SBUTTERFLY dq, 0, 1, 4 ; same for m0/m1 with m4 as scratch
+ mova [r2q + 2*r3q + 0*mmsize], m2 ; four result vectors go to the output base at twice the input offset
+ mova [r2q + 2*r3q + 1*mmsize], m3
+ mova [r2q + 2*r3q + 2*mmsize], m0
+ mova [r2q + 2*r3q + 3*mmsize], m1
+ add r1q, 2*mmsize ; ascending pointer forward by two vectors
+ sub r3q, 2*mmsize ; descending offset backward by two vectors
+ jge .loop ; loop while the offset is still >= 0
+ mova m2, [zq] ; after the loop: load the first vector of z ...
+ movq [r2q], m2 ; ... and copy only its low 8 bytes to z + 64*4
+ REP_RET
+
+%if WIN64 == 0
+
+%if WIN64 ; never true here: this whole region is wrapped in %if WIN64 == 0
+%define NREGS 0
+%define NOISE_TABLE sbr_noise_table
+%else
+%ifdef PIC ; PIC build: the table address is kept in a register instead of used as a symbol
+%define NREGS 1 ; one extra GPR is requested in the cglobal lines for that register
+%if UNIX64
+%define NOISE_TABLE r6q ; r5q is m_max
+%else
+%define NOISE_TABLE r5q
+%endif
+%else
+%define NREGS 0 ; non-PIC: no extra register, address the symbol directly
+%define NOISE_TABLE sbr_noise_table
+%endif
+%endif
+
+%macro LOAD_NST 1 ; %1 = address expression; loads m0 from [kxq + %1]
+%if NREGS
+ lea NOISE_TABLE, [%1] ; borrow the NOISE_TABLE register as a temporary base; apply_noise_main reloads it later
+ mova m0, [kxq + NOISE_TABLE]
+%else
+ mova m0, [kxq + %1] ; symbol can be used directly as a displacement
+%endif
+%endmacro
+
+INIT_XMM sse2
+; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise0] ; m0 = fixed multiplier vector for this variant (constant defined outside this view)
+ jmp apply_noise_main ; continue in the tail shared by all four entry points
+
+; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1 ; only the parity of kx matters here
+ shl kxq, 4 ; kx = 0 or 16: byte offset of one of two 16-byte rows
+ LOAD_NST ps_noise13 ; m0 = [ps_noise13 + kx]
+ jmp apply_noise_main ; continue in the tail shared by all four entry points
+
+; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ mova m0, [ps_noise2] ; m0 = fixed multiplier vector for this variant (constant defined outside this view)
+ jmp apply_noise_main ; continue in the tail shared by all four entry points
+
+; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
+; const float *q_filt, int noise,
+; int kx, int m_max)
+cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
+ and kxq, 1 ; only the parity of kx matters here
+ shl kxq, 4 ; kx = 0 or 16: byte offset of one of two 16-byte rows
+ LOAD_NST ps_noise13+16 ; m0 = [ps_noise13 + 16 + kx]; no jmp: execution falls through into apply_noise_main
+
+apply_noise_main: ; shared tail of sbr_hf_apply_noise_0..3; expects the per-variant multiplier in m0
+%if ARCH_X86_64 == 0 || WIN64
+ mov kxd, m_maxm ; fetch m_max into kx (kx was already consumed by the entry points); m_maxm is presumably the argument's memory slot - confirm
+%define count kxq
+%else
+%define count m_maxq
+%endif
+ dec noiseq ; together with the shl by 3 below: noise = (noise - 1) * 8 bytes
+ shl count, 2 ; count = m_max * 4 bytes
+%if NREGS
+ lea NOISE_TABLE, [sbr_noise_table] ; (re)load the table base into its register
+%endif
+ lea Yq, [Yq + 2*count] ; Y, s_m and q_filt are moved to their ends; Y advances twice as many bytes
+ add s_mq, count
+ add q_filtq, count
+ shl noiseq, 3
+ pxor m5, m5 ; m5 = all zero, compared against below
+ neg count ; count runs from -length up to 0
+.loop:
+ mova m1, [q_filtq + count] ; aligned load of one vector of q_filt
+ movu m3, [noiseq + NOISE_TABLE + 1*mmsize] ; two vectors from the noise table, starting mmsize bytes past the current offset
+ movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
+ add noiseq, 2*mmsize
+ and noiseq, 0x1ff<<3 ; wrap the byte offset to 0x200 entries of 8 bytes
+ punpckhdq m2, m1, m1 ; duplicate each dword of q_filt into an adjacent pair (high half -> m2)
+ punpckldq m1, m1 ; (low half -> m1)
+ mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
+ mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
+ mova m3, [s_mq + count] ; aligned load of one vector of s_m
+ ; TODO: replace by a vpermd in AVX2
+ punpckhdq m4, m3, m3 ; duplicate each dword of s_m the same way
+ punpckldq m3, m3
+ pcmpeqd m6, m3, m5 ; m6 = all-ones mask in lanes where the s_m bits are exactly zero
+ pcmpeqd m7, m4, m5 ; m7 = same mask for the high half
+ mulps m3, m0 ; s_m[m] * phi_sign
+ mulps m4, m0 ; s_m[m] * phi_sign
+ pand m1, m6 ; keep the noise term only in lanes where s_m compared equal to zero
+ pand m2, m7
+ movu m6, [Yq + 2*count] ; unaligned load of the matching two vectors of Y
+ movu m7, [Yq + 2*count + mmsize]
+ addps m3, m1 ; s_m term + masked noise term
+ addps m4, m2
+ addps m6, m3 ; accumulate into Y
+ addps m7, m4
+ movu [Yq + 2*count], m6
+ movu [Yq + 2*count + mmsize], m7
+ add count, mmsize ; advance one vector of s_m / q_filt per iteration
+ jl .loop ; loop while count is still negative
+ RET
+
+%endif ; WIN64 == 0