git.sesse.net Git - ffmpeg/blob - libavcodec/x86/celt_pvq_search.asm

   1 ;******************************************************************************
   2 ;* SIMD optimized Opus encoder DSP function
   3 ;*
   4 ;* Copyright (C) 2017 Ivan Kalvachev <ikalvachev@gmail.com>
   5 ;*
   6 ;* This file is part of FFmpeg.
   7 ;*
   8 ;* FFmpeg is free software; you can redistribute it and/or
   9 ;* modify it under the terms of the GNU Lesser General Public
  10 ;* License as published by the Free Software Foundation; either
  11 ;* version 2.1 of the License, or (at your option) any later version.
  12 ;*
  13 ;* FFmpeg is distributed in the hope that it will be useful,
  14 ;* but WITHOUT ANY WARRANTY; without even the implied warranty of
  15 ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  16 ;* Lesser General Public License for more details.
  17 ;*
  18 ;* You should have received a copy of the GNU Lesser General Public
  19 ;* License along with FFmpeg; if not, write to the Free Software
  20 ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  21 ;******************************************************************************
  22
  23 %include "config.asm"
  24 %include "libavutil/x86/x86util.asm"
  25
  26 %ifdef __NASM_VER__
  27 %use "smartalign"
  28 ALIGNMODE p6
  29 %endif
  30
  31 SECTION_RODATA 64
  32
  33 const_float_abs_mask:   times 8 dd 0x7fffffff
  34 const_align_abs_edge:   times 8 dd 0
  35
  36 const_float_0_5:        times 8 dd 0.5
  37 const_float_1:          times 8 dd 1.0
  38 const_float_sign_mask:  times 8 dd 0x80000000
  39
  40 const_int32_offsets:
  41                         %rep 8
  42                                 dd $-const_int32_offsets
  43                         %endrep
  44 SECTION .text
  45
  46 ;
  47 ;   Setup High Register to be used
  48 ;   for holding memory constants
  49 ;
  50 ; %1 - the register to be used, assmues it is >= mm8
  51 ; %2 - name of the constant.
  52 ;
  53 ; Subsequent opcodes are going to use the constant in the form
  54 ; "addps m0, mm_const_name" and it would be turned into:
  55 ; "addps m0, [const_name]" on 32 bit arch or
  56 ; "addps m0, m8" on 64 bit arch
  57 %macro SET_HI_REG_MM_CONSTANT 3 ; movop, reg, const_name
  58 %if num_mmregs > 8
  59     %define  mm_%3   %2
  60     %{1}        %2, [%3]    ; movaps m8, [const_name]
  61 %else
  62     %define  mm_%3  [%3]
  63 %endif
  64 %endmacro
  65
  66 ;
  67 ;   Set Position Independent Code
  68 ;       Base address of a constant
  69 ; %1 - the register to be used, if PIC is set
  70 ; %2 - name of the constant.
  71 ;
  72 ; Subsequent opcode are going to use the base address in the form
  73 ; "movaps m0, [pic_base_constant_name+r4]" and it would be turned into
  74 ; "movaps m0, [r5 + r4]" if PIC is enabled
  75 ; "movaps m0, [constant_name + r4]" if texrel are used
  76 %macro SET_PIC_BASE 3; reg, const_label
  77 %ifdef PIC
  78     %{1}     %2, [%3]      ; lea r5, [rip+const]
  79     %define  pic_base_%3 %2
  80 %else
  81     %define  pic_base_%3 %3
  82 %endif
  83 %endmacro
  84
  85 %macro PULSES_SEARCH 1
  86 ; m6 Syy_norm
  87 ; m7 Sxy_norm
  88     addps          m6, mm_const_float_0_5   ; Syy_norm += 1.0/2
  89     pxor           m1, m1                   ; max_idx
  90     xorps          m3, m3                   ; p_max
  91     xor           r4d, r4d
  92 align 16
  93 %%distortion_search:
  94     movd          xm2, dword r4d    ; movd zero extends
  95 %ifidn %1,add
  96     movaps         m4, [tmpY + r4]  ; y[i]
  97     movaps         m5, [tmpX + r4]  ; X[i]
  98
  99   %if USE_APPROXIMATION == 1
 100     xorps          m0, m0
 101     cmpps          m0, m0, m5, 4    ; m0 = (X[i] != 0.0)
 102   %endif
 103
 104     addps          m4, m6           ; m4 = Syy_new = y[i] + Syy_norm
 105     addps          m5, m7           ; m5 = Sxy_new = X[i] + Sxy_norm
 106
 107   %if USE_APPROXIMATION == 1
 108     andps          m5, m0           ; if(X[i] == 0) Sxy_new = 0; Prevent aproximation error from setting pulses in array padding.
 109   %endif
 110
 111 %else
 112     movaps         m5, [tmpY + r4]      ; m5 = y[i]
 113
 114     xorps          m0, m0               ; m0 = 0;
 115     cmpps          m0, m0, m5, 1        ; m0 = (0<y)
 116
 117     subps          m4, m6, m5           ; m4 = Syy_new = Syy_norm - y[i]
 118     subps          m5, m7, [tmpX + r4]  ; m5 = Sxy_new = Sxy_norm - X[i]
 119     andps          m5, m0               ; (0<y)?m5:0
 120 %endif
 121
 122 %if USE_APPROXIMATION == 1
 123     rsqrtps        m4, m4
 124     mulps          m5, m4           ; m5 = p = Sxy_new*approx(1/sqrt(Syy) )
 125 %else
 126     mulps          m5, m5
 127     divps          m5, m4           ; m5 = p = Sxy_new*Sxy_new/Syy
 128 %endif
 129     VPBROADCASTD   m2, xm2          ; m2=i (all lanes get same values, we add the offset-per-lane, later)
 130
 131     cmpps          m0, m3, m5, 1    ; m0 = (m3 < m5) ; (p_max < p) ; (p > p_max)
 132     maxps          m3, m5           ; m3=max(p_max,p)
 133                                     ; maxps here is faster than blendvps, despite blend having lower latency.
 134
 135     pand           m2, m0           ; This version seems faster than sse41 pblendvb
 136     pmaxsw         m1, m2           ; SSE2 signed word, so it would work for N < 32768/4
 137
 138     add           r4d, mmsize
 139     cmp           r4d, Nd
 140     jb   %%distortion_search
 141
 142     por            m1, mm_const_int32_offsets  ; max_idx offsets per individual lane (skipped in the inner loop)
 143     movdqa         m4, m1                      ; needed for the aligned y[max_idx]+=1; processing
 144
 145 %if mmsize >= 32
 146 ; Merge parallel maximums round 8 (4 vs 4)
 147
 148     vextractf128  xm5, ym3, 1       ; xmm5 = ymm3[1x128] = ymm3[255..128b]
 149     cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[0x128] < p[1x128] )
 150
 151     vextracti128  xm2, ym1, 1       ; xmm2 = ymm1[1x128] = ymm1[255..128b]
 152     BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[1x128] : max_idx[0x128]
 153     PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[1x128]       : p[0x128]
 154 %endif
 155
 156 ; Merge parallel maximums round 4 (2 vs 2)
 157                                     ; m3=p[3210]
 158     movhlps       xm5, xm3          ; m5=p[xx32]
 159     cmpps         xm0, xm3, xm5, 1  ; m0 = (m3 < m5) = ( p[1,0] < p[3,2] )
 160
 161     pshufd        xm2, xm1, q3232
 162     BLENDVPS      xm3, xm5, xm0     ; max_idx = m0 ? max_idx[3,2] : max_idx[1,0]
 163     PBLENDVB      xm1, xm2, xm0     ; p       = m0 ? p[3,2]       : p[1,0]
 164
 165 ; Merge parallel maximums final round (1 vs 1)
 166     shufps        xm0, xm3, xm3, q1111  ; m0 = m3[1] = p[1]
 167     cmpss         xm0, xm3, 5           ; m0 = !(m0 >= m3) = !( p[1] >= p[0] )
 168
 169     pshufd        xm2, xm1, q1111
 170     PBLENDVB      xm1, xm2, xm0
 171
 172     movd    dword r4d, xm1          ; zero extends to the rest of r4q
 173
 174     VBROADCASTSS   m3, [tmpX + r4]
 175     %{1}ps         m7, m3           ; Sxy += X[max_idx]
 176
 177     VBROADCASTSS   m5, [tmpY + r4]
 178     %{1}ps         m6, m5           ; Syy += Y[max_idx]
 179
 180     ; We have to update a single element in Y[i]
 181     ; However writing 4 bytes and then doing 16 byte load in the inner loop
 182     ; could cause a stall due to breaking write forwarding.
 183     VPBROADCASTD   m1, xm1
 184     pcmpeqd        m1, m1, m4           ; exactly 1 element matches max_idx and this finds it
 185
 186     and           r4d, ~(mmsize-1)      ; align address down, so the value pointed by max_idx is inside a mmsize load
 187     movaps         m5, [tmpY + r4]      ; m5 = Y[y3...ym...y0]
 188     andps          m1, mm_const_float_1 ; m1 =  [ 0...1.0...0]
 189     %{1}ps         m5, m1               ; m5 = Y[y3...ym...y0] +/- [0...1.0...0]
 190     movaps [tmpY + r4], m5              ; Y[max_idx] +-= 1.0;
 191 %endmacro
 192
 193 ;
 194 ; We need one more register for
 195 ; PIC relative addressing. Use this
 196 ; to count it in cglobal
 197 ;
 198 %ifdef PIC
 199   %define num_pic_regs 1
 200 %else
 201   %define num_pic_regs 0
 202 %endif
 203
 204 ;
 205 ; Pyramid Vector Quantization Search implementation
 206 ;
 207 ; float * inX   - Unaligned (SIMD) access, it will be overread,
 208 ;                 but extra data is masked away.
 209 ; int32 * outY  - Should be aligned and padded buffer.
 210 ;                 It is used as temp buffer.
 211 ; uint32 K      - Number of pulses to have after quantizations.
 212 ; uint32 N      - Number of vector elements. Must be 0 < N < 256
 213 ;
 214 %macro PVQ_FAST_SEARCH 1
 215 cglobal pvq_search%1, 4, 5+num_pic_regs, 11, 256*4, inX, outY, K, N
 216 %define tmpX rsp
 217 %define tmpY outYq
 218
 219     movaps     m0, [const_float_abs_mask]
 220     shl        Nd, 2    ; N *= sizeof(float); also 32 bit operation zeroes the high 32 bits in 64 bit mode.
 221     mov       r4d, Nd
 222
 223     neg       r4d
 224     and       r4d, mmsize-1
 225
 226     SET_PIC_BASE lea, r5, const_align_abs_edge  ; rip+const
 227     movups     m2, [pic_base_const_align_abs_edge + r4 - mmsize]
 228
 229     add        Nd, r4d              ; N = align(N, mmsize)
 230
 231     lea       r4d, [Nd - mmsize]    ; N is rounded up (aligned up) to mmsize, so r4 can't become negative here, unless N=0.
 232     movups     m1, [inXq + r4]
 233     andps      m1, m2
 234     movaps  [tmpX + r4], m1         ; Sx = abs( X[N-1] )
 235
 236 align 16
 237 %%loop_abs_sum:
 238     sub       r4d, mmsize
 239     jc   %%end_loop_abs_sum
 240
 241     movups     m2, [inXq + r4]
 242     andps      m2, m0
 243
 244     movaps  [tmpX + r4], m2 ; tmpX[i]=abs(X[i])
 245     addps      m1, m2       ; Sx += abs(X[i])
 246     jmp  %%loop_abs_sum
 247
 248 align 16
 249 %%end_loop_abs_sum:
 250
 251     HSUMPS     m1, m2       ; m1  = Sx
 252
 253     xorps      m0, m0
 254     comiss    xm0, xm1      ;
 255     jz   %%zero_input       ; if (Sx==0) goto zero_input
 256
 257     cvtsi2ss  xm0, dword Kd ; m0 = K
 258 %if USE_APPROXIMATION == 1
 259     rcpss     xm1, xm1      ; m1 = approx(1/Sx)
 260     mulss     xm0, xm1      ; m0 = K*(1/Sx)
 261 %else
 262     divss     xm0, xm1      ; b = K/Sx
 263                             ; b = K/max_x
 264 %endif
 265
 266     VBROADCASTSS  m0, xm0
 267
 268     lea       r4d, [Nd - mmsize]
 269     pxor       m5, m5             ; Sy    ( Sum of abs( y[i]) )
 270     xorps      m6, m6             ; Syy   ( Sum of y[i]*y[i]  )
 271     xorps      m7, m7             ; Sxy   ( Sum of X[i]*y[i]  )
 272 align 16
 273 %%loop_guess:
 274     movaps     m1, [tmpX + r4]    ; m1   = X[i]
 275     mulps      m2, m0, m1         ; m2   = res*X[i]
 276     cvtps2dq   m2, m2             ; yt   = (int)lrintf( res*X[i] )
 277     paddd      m5, m2             ; Sy  += yt
 278     cvtdq2ps   m2, m2             ; yt   = (float)yt
 279     mulps      m1, m2             ; m1   = X[i]*yt
 280     movaps  [tmpY + r4], m2       ; y[i] = m2
 281     addps      m7, m1             ; Sxy += m1;
 282     mulps      m2, m2             ; m2   = yt*yt
 283     addps      m6, m2             ; Syy += m2
 284
 285     sub       r4d, mmsize
 286     jnc  %%loop_guess
 287
 288     HSUMPS     m6, m1       ; Syy_norm
 289     HADDD      m5, m4       ; pulses
 290
 291     movd  dword r4d, xm5    ; zero extends to the rest of r4q
 292
 293     sub        Kd, r4d      ; K -= pulses , also 32 bit operation zeroes high 32 bit in 64 bit mode.
 294     jz   %%finish           ; K - pulses == 0
 295
 296     SET_HI_REG_MM_CONSTANT movaps,  m8, const_float_0_5
 297     SET_HI_REG_MM_CONSTANT movaps,  m9, const_float_1
 298     SET_HI_REG_MM_CONSTANT movdqa, m10, const_int32_offsets
 299     ; Use Syy/2 in distortion parameter calculations.
 300     ; Saves pre and post-caclulation to correct Y[] values.
 301     ; Same precision, since float mantisa is normalized.
 302     ; The SQRT approximation does differ.
 303     HSUMPS     m7, m0         ; Sxy_norm
 304     mulps      m6, mm_const_float_0_5
 305
 306     jc   %%remove_pulses_loop   ; K - pulses < 0
 307
 308 align 16                        ; K - pulses > 0
 309 %%add_pulses_loop:
 310
 311     PULSES_SEARCH add   ; m6 Syy_norm ; m7 Sxy_norm
 312
 313     sub        Kd, 1
 314     jnz  %%add_pulses_loop
 315
 316     addps      m6, m6 ; Syy*=2
 317
 318     jmp  %%finish
 319
 320 align 16
 321 %%remove_pulses_loop:
 322
 323     PULSES_SEARCH sub   ; m6 Syy_norm ; m7 Sxy_norm
 324
 325     add        Kd, 1
 326     jnz  %%remove_pulses_loop
 327
 328     addps      m6, m6 ; Syy*=2
 329
 330 align 16
 331 %%finish:
 332     lea       r4d, [Nd - mmsize]
 333     movaps     m2, [const_float_sign_mask]
 334
 335 align 16
 336 %%restore_sign_loop:
 337     movaps     m0, [tmpY + r4]    ; m0 = Y[i]
 338     movups     m1, [inXq + r4]    ; m1 = X[i]
 339     andps      m1, m2             ; m1 = sign(X[i])
 340     orps       m0, m1             ; m0 = Y[i]*sign
 341     cvtps2dq   m3, m0             ; m3 = (int)m0
 342     movaps  [outYq + r4], m3
 343
 344     sub       r4d, mmsize
 345     jnc  %%restore_sign_loop
 346 %%return:
 347
 348 %if ARCH_X86_64 == 0    ; sbrdsp
 349     movss     r0m, xm6  ; return (float)Syy_norm
 350     fld dword r0m
 351 %else
 352     movaps     m0, m6   ; return (float)Syy_norm
 353 %endif
 354
 355     RET
 356
 357 align 16
 358 %%zero_input:
 359     lea       r4d, [Nd - mmsize]
 360     xorps      m0, m0
 361 %%zero_loop:
 362     movaps  [outYq + r4], m0
 363     sub       r4d, mmsize
 364     jnc  %%zero_loop
 365
 366     movaps     m6, [const_float_1]
 367     jmp  %%return
 368 %endmacro
 369
 370 ; if 1, use a float op that give half precision but execute for around 3 cycles.
 371 ; On Skylake & Ryzen the division is much faster (around 11c/3),
 372 ; that makes the full precision code about 2% slower.
 373 ; Opus also does use rsqrt approximation in their intrinsics code.
 374 %define USE_APPROXIMATION   1
 375
 376 INIT_XMM sse2
 377 PVQ_FAST_SEARCH _approx
 378
 379 INIT_XMM sse4
 380 PVQ_FAST_SEARCH _approx
 381
 382 %define USE_APPROXIMATION   0
 383
 384 INIT_XMM avx
 385 PVQ_FAST_SEARCH _exact