git.sesse.net Git - ffmpeg/blob - libavcodec/wmavoice.c

   1 /*
   2  * Windows Media Audio Voice decoder.
   3  * Copyright (c) 2009 Ronald S. Bultje
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * @brief Windows Media Audio Voice compatible decoder
  25  * @author Ronald S. Bultje <rsbultje@gmail.com>
  26  */
  27
  28 #include <math.h>
  29
  30 #include "libavutil/channel_layout.h"
  31 #include "libavutil/float_dsp.h"
  32 #include "libavutil/mem.h"
  33 #include "avcodec.h"
  34 #include "internal.h"
  35 #include "get_bits.h"
  36 #include "put_bits.h"
  37 #include "wmavoice_data.h"
  38 #include "celp_filters.h"
  39 #include "acelp_vectors.h"
  40 #include "acelp_filters.h"
  41 #include "lsp.h"
  42 #include "dct.h"
  43 #include "rdft.h"
  44 #include "sinewin.h"
  45
  46 #define MAX_BLOCKS           8   ///< maximum number of blocks per frame
  47 #define MAX_LSPS             16  ///< maximum filter order
  48 #define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
  49                                  ///< of 16 for ASM input buffer alignment
  50 #define MAX_FRAMES           3   ///< maximum number of frames per superframe
  51 #define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
  52 #define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
  53 #define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
  54                                  ///< maximum number of samples per superframe
  55 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
  56                                  ///< was split over two packets
  57 #define VLC_NBITS            6   ///< number of bits to read per VLC iteration
  58
/**
 * Frame type VLC coding.
 *
 * Shared (file-scope) VLC table; it is filled in by
 * wmavoice_init_static_data() through INIT_VLC_STATIC().
 */
static VLC frame_type_vlc;
  63
  64 /**
  65  * Adaptive codebook types.
  66  */
  67 enum {
  68     ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
  69     ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
  70                              ///< we interpolate to get a per-sample pitch.
  71                              ///< Signal is generated using an asymmetric sinc
  72                              ///< window function
  73                              ///< @note see #wmavoice_ipol1_coeffs
  74     ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
  75                              ///< a Hamming sinc window function
  76                              ///< @note see #wmavoice_ipol2_coeffs
  77 };
  78
  79 /**
  80  * Fixed codebook types.
  81  */
  82 enum {
  83     FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
  84                              ///< generated from a hardcoded (fixed) codebook
  85                              ///< with per-frame (low) gain values
  86     FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
  87                              ///< gain values
  88     FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
  89                              ///< used in particular for low-bitrate streams
  90     FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
  91                              ///< combinations of either single pulses or
  92                              ///< pulse pairs
  93 };
  94
  95 /**
  96  * Description of frame types.
  97  */
  98 static const struct frame_type_desc {
  99     uint8_t n_blocks;     ///< amount of blocks per frame (each block
 100                           ///< (contains 160/#n_blocks samples)
 101     uint8_t log_n_blocks; ///< log2(#n_blocks)
 102     uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
 103     uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
 104     uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
 105                           ///< (rather than just one single pulse)
 106                           ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
 107 } frame_descs[17] = {
 108     { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
 109     { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
 110     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
 111     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
 112     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
 113     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
 114     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
 115     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
 116     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
 117     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
 118     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
 119     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
 120     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
 121     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
 122     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
 123     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
 124     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
 125 };
 126
 127 /**
 128  * WMA Voice decoding context.
 129  */
 130 typedef struct WMAVoiceContext {
 131     /**
 132      * @name Global values specified in the stream header / extradata or used all over.
 133      * @{
 134      */
 135     GetBitContext gb;             ///< packet bitreader. During decoder init,
 136                                   ///< it contains the extradata from the
 137                                   ///< demuxer. During decoding, it contains
 138                                   ///< packet data.
 139     int8_t vbm_tree[25];          ///< converts VLC codes to frame type
 140
 141     int spillover_bitsize;        ///< number of bits used to specify
 142                                   ///< #spillover_nbits in the packet header
 143                                   ///< = ceil(log2(ctx->block_align << 3))
 144     int history_nsamples;         ///< number of samples in history for signal
 145                                   ///< prediction (through ACB)
 146
 147     /* postfilter specific values */
 148     int do_apf;                   ///< whether to apply the averaged
 149                                   ///< projection filter (APF)
 150     int denoise_strength;         ///< strength of denoising in Wiener filter
 151                                   ///< [0-11]
 152     int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
 153                                   ///< Wiener filter coefficients (postfilter)
 154     int dc_level;                 ///< Predicted amount of DC noise, based
 155                                   ///< on which a DC removal filter is used
 156
 157     int lsps;                     ///< number of LSPs per frame [10 or 16]
 158     int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
 159     int lsp_def_mode;             ///< defines different sets of LSP defaults
 160                                   ///< [0, 1]
 161
 162     int min_pitch_val;            ///< base value for pitch parsing code
 163     int max_pitch_val;            ///< max value + 1 for pitch parsing
 164     int pitch_nbits;              ///< number of bits used to specify the
 165                                   ///< pitch value in the frame header
 166     int block_pitch_nbits;        ///< number of bits used to specify the
 167                                   ///< first block's pitch value
 168     int block_pitch_range;        ///< range of the block pitch
 169     int block_delta_pitch_nbits;  ///< number of bits used to specify the
 170                                   ///< delta pitch between this and the last
 171                                   ///< block's pitch value, used in all but
 172                                   ///< first block
 173     int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
 174                                   ///< from -this to +this-1)
 175     uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
 176                                   ///< conversion
 177
 178     /**
 179      * @}
 180      *
 181      * @name Packet values specified in the packet header or related to a packet.
 182      *
 183      * A packet is considered to be a single unit of data provided to this
 184      * decoder by the demuxer.
 185      * @{
 186      */
 187     int spillover_nbits;          ///< number of bits of the previous packet's
 188                                   ///< last superframe preceding this
 189                                   ///< packet's first full superframe (useful
 190                                   ///< for re-synchronization also)
 191     int has_residual_lsps;        ///< if set, superframes contain one set of
 192                                   ///< LSPs that cover all frames, encoded as
 193                                   ///< independent and residual LSPs; if not
 194                                   ///< set, each frame contains its own, fully
 195                                   ///< independent, LSPs
 196     int skip_bits_next;           ///< number of bits to skip at the next call
 197                                   ///< to #wmavoice_decode_packet() (since
 198                                   ///< they're part of the previous superframe)
 199
 200     uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE];
 201                                   ///< cache for superframe data split over
 202                                   ///< multiple packets
 203     int sframe_cache_size;        ///< set to >0 if we have data from an
 204                                   ///< (incomplete) superframe from a previous
 205                                   ///< packet that spilled over in the current
 206                                   ///< packet; specifies the amount of bits in
 207                                   ///< #sframe_cache
 208     PutBitContext pb;             ///< bitstream writer for #sframe_cache
 209
 210     /**
 211      * @}
 212      *
 213      * @name Frame and superframe values
 214      * Superframe and frame data - these can change from frame to frame,
 215      * although some of them do in that case serve as a cache / history for
 216      * the next frame or superframe.
 217      * @{
 218      */
 219     double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
 220                                   ///< superframe
 221     int last_pitch_val;           ///< pitch value of the previous frame
 222     int last_acb_type;            ///< frame type [0-2] of the previous frame
 223     int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
 224                                   ///< << 16) / #MAX_FRAMESIZE
 225     float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
 226
 227     int aw_idx_is_ext;            ///< whether the AW index was encoded in
 228                                   ///< 8 bits (instead of 6)
 229     int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
 230                                   ///< can apply the pulse, relative to the
 231                                   ///< value in aw_first_pulse_off. The exact
 232                                   ///< position of the first AW-pulse is within
 233                                   ///< [pulse_off, pulse_off + this], and
 234                                   ///< depends on bitstream values; [16 or 24]
 235     int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
 236                                   ///< that this number can be negative (in
 237                                   ///< which case it basically means "zero")
 238     int aw_first_pulse_off[2];    ///< index of first sample to which to
 239                                   ///< apply AW-pulses, or -0xff if unset
 240     int aw_next_pulse_off_cache;  ///< the position (relative to start of the
 241                                   ///< second block) at which pulses should
 242                                   ///< start to be positioned, serves as a
 243                                   ///< cache for pitch-adaptive window pulses
 244                                   ///< between blocks
 245
 246     int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
 247                                   ///< only used for comfort noise in #pRNG()
 248     int nb_superframes;           ///< number of superframes in current packet
 249     float gain_pred_err[6];       ///< cache for gain prediction
 250     float excitation_history[MAX_SIGNAL_HISTORY];
 251                                   ///< cache of the signal of previous
 252                                   ///< superframes, used as a history for
 253                                   ///< signal generation
 254     float synth_history[MAX_LSPS]; ///< see #excitation_history
 255     /**
 256      * @}
 257      *
 258      * @name Postfilter values
 259      *
 260      * Variables used for postfilter implementation, mostly history for
 261      * smoothing and so on, and context variables for FFT/iFFT.
 262      * @{
 263      */
 264     RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
 265                                   ///< postfilter (for denoise filter)
 266     DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
 267                                   ///< transform, part of postfilter)
 268     float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
 269                                   ///< range
 270     float postfilter_agc;         ///< gain control memory, used in
 271                                   ///< #adaptive_gain_control()
 272     float dcf_mem[2];             ///< DC filter history
 273     float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
 274                                   ///< zero filter output (i.e. excitation)
 275                                   ///< by postfilter
 276     float denoise_filter_cache[MAX_FRAMESIZE];
 277     int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
 278     DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
 279                                   ///< aligned buffer for LPC tilting
 280     DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
 281                                   ///< aligned buffer for denoise coefficients
 282     DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
 283                                   ///< aligned buffer for postfilter speech
 284                                   ///< synthesis
 285     /**
 286      * @}
 287      */
 288 } WMAVoiceContext;
 289
 290 /**
 291  * Set up the variable bit mode (VBM) tree from container extradata.
 292  * @param gb bit I/O context.
 293  *           The bit context (s->gb) should be loaded with byte 23-46 of the
 294  *           container extradata (i.e. the ones containing the VBM tree).
 295  * @param vbm_tree pointer to array to which the decoded VBM tree will be
 296  *                 written.
 297  * @return 0 on success, <0 on error.
 298  */
 299 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
 300 {
 301     int cntr[8] = { 0 }, n, res;
 302
 303     memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
 304     for (n = 0; n < 17; n++) {
 305         res = get_bits(gb, 3);
 306         if (cntr[res] > 3) // should be >= 3 + (res == 7))
 307             return -1;
 308         vbm_tree[res * 3 + cntr[res]++] = n;
 309     }
 310     return 0;
 311 }
 312
 313 static av_cold void wmavoice_init_static_data(AVCodec *codec)
 314 {
 315     static const uint8_t bits[] = {
 316          2,  2,  2,  4,  4,  4,
 317          6,  6,  6,  8,  8,  8,
 318         10, 10, 10, 12, 12, 12,
 319         14, 14, 14, 14
 320     };
 321     static const uint16_t codes[] = {
 322           0x0000, 0x0001, 0x0002,        //              00/01/10
 323           0x000c, 0x000d, 0x000e,        //           11+00/01/10
 324           0x003c, 0x003d, 0x003e,        //         1111+00/01/10
 325           0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
 326           0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
 327           0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
 328           0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
 329     };
 330
 331     INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
 332                     bits, 1, 1, codes, 2, 2, 132);
 333 }
 334
 335 static av_cold void wmavoice_flush(AVCodecContext *ctx)
 336 {
 337     WMAVoiceContext *s = ctx->priv_data;
 338     int n;
 339
 340     s->postfilter_agc    = 0;
 341     s->sframe_cache_size = 0;
 342     s->skip_bits_next    = 0;
 343     for (n = 0; n < s->lsps; n++)
 344         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
 345     memset(s->excitation_history, 0,
 346            sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
 347     memset(s->synth_history,      0,
 348            sizeof(*s->synth_history)      * MAX_LSPS);
 349     memset(s->gain_pred_err,      0,
 350            sizeof(s->gain_pred_err));
 351
 352     if (s->do_apf) {
 353         memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
 354                sizeof(*s->synth_filter_out_buf) * s->lsps);
 355         memset(s->dcf_mem,              0,
 356                sizeof(*s->dcf_mem)              * 2);
 357         memset(s->zero_exc_pf,          0,
 358                sizeof(*s->zero_exc_pf)          * s->history_nsamples);
 359         memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
 360     }
 361 }
 362
 363 /**
 364  * Set up decoder with parameters from demuxer (extradata etc.).
 365  */
 366 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
 367 {
 368     int n, flags, pitch_range, lsp16_flag;
 369     WMAVoiceContext *s = ctx->priv_data;
 370
 371     /**
 372      * Extradata layout:
 373      * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
 374      * - byte 19-22: flags field (annoyingly in LE; see below for known
 375      *               values),
 376      * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
 377      *               rest is 0).
 378      */
 379     if (ctx->extradata_size != 46) {
 380         av_log(ctx, AV_LOG_ERROR,
 381                "Invalid extradata size %d (should be 46)\n",
 382                ctx->extradata_size);
 383         return AVERROR_INVALIDDATA;
 384     }
 385     if (ctx->block_align <= 0) {
 386         av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
 387         return AVERROR_INVALIDDATA;
 388     }
 389
 390     flags                = AV_RL32(ctx->extradata + 18);
 391     s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
 392     s->do_apf            =    flags & 0x1;
 393     if (s->do_apf) {
 394         ff_rdft_init(&s->rdft,  7, DFT_R2C);
 395         ff_rdft_init(&s->irdft, 7, IDFT_C2R);
 396         ff_dct_init(&s->dct,  6, DCT_I);
 397         ff_dct_init(&s->dst,  6, DST_I);
 398
 399         ff_sine_window_init(s->cos, 256);
 400         memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
 401         for (n = 0; n < 255; n++) {
 402             s->sin[n]       = -s->sin[510 - n];
 403             s->cos[510 - n] =  s->cos[n];
 404         }
 405     }
 406     s->denoise_strength  =   (flags >> 2) & 0xF;
 407     if (s->denoise_strength >= 12) {
 408         av_log(ctx, AV_LOG_ERROR,
 409                "Invalid denoise filter strength %d (max=11)\n",
 410                s->denoise_strength);
 411         return AVERROR_INVALIDDATA;
 412     }
 413     s->denoise_tilt_corr = !!(flags & 0x40);
 414     s->dc_level          =   (flags >> 7) & 0xF;
 415     s->lsp_q_mode        = !!(flags & 0x2000);
 416     s->lsp_def_mode      = !!(flags & 0x4000);
 417     lsp16_flag           =    flags & 0x1000;
 418     if (lsp16_flag) {
 419         s->lsps               = 16;
 420     } else {
 421         s->lsps               = 10;
 422     }
 423     for (n = 0; n < s->lsps; n++)
 424         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
 425
 426     init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
 427     if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
 428         av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
 429         return AVERROR_INVALIDDATA;
 430     }
 431
 432     s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
 433     s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
 434     pitch_range         = s->max_pitch_val - s->min_pitch_val;
 435     if (pitch_range <= 0) {
 436         av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
 437         return AVERROR_INVALIDDATA;
 438     }
 439     s->pitch_nbits      = av_ceil_log2(pitch_range);
 440     s->last_pitch_val   = 40;
 441     s->last_acb_type    = ACB_TYPE_NONE;
 442     s->history_nsamples = s->max_pitch_val + 8;
 443
 444     if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
 445         int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
 446             max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
 447
 448         av_log(ctx, AV_LOG_ERROR,
 449                "Unsupported samplerate %d (min=%d, max=%d)\n",
 450                ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
 451
 452         return AVERROR(ENOSYS);
 453     }
 454
 455     s->block_conv_table[0]      = s->min_pitch_val;
 456     s->block_conv_table[1]      = (pitch_range * 25) >> 6;
 457     s->block_conv_table[2]      = (pitch_range * 44) >> 6;
 458     s->block_conv_table[3]      = s->max_pitch_val - 1;
 459     s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
 460     if (s->block_delta_pitch_hrange <= 0) {
 461         av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
 462         return AVERROR_INVALIDDATA;
 463     }
 464     s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
 465     s->block_pitch_range        = s->block_conv_table[2] +
 466                                   s->block_conv_table[3] + 1 +
 467                                   2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
 468     s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
 469
 470     ctx->channels               = 1;
 471     ctx->channel_layout         = AV_CH_LAYOUT_MONO;
 472     ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
 473
 474     return 0;
 475 }
 476
 477 /**
 478  * @name Postfilter functions
 479  * Postfilter functions (gain control, wiener denoise filter, DC filter,
 480  * kalman smoothening, plus surrounding code to wrap it)
 481  * @{
 482  */
 483 /**
 484  * Adaptive gain control (as used in postfilter).
 485  *
 486  * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 487  * that the energy here is calculated using sum(abs(...)), whereas the
 488  * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 489  *
 490  * @param out output buffer for filtered samples
 491  * @param in input buffer containing the samples as they are after the
 492  *           postfilter steps so far
 493  * @param speech_synth input buffer containing speech synth before postfilter
 494  * @param size input buffer size
 495  * @param alpha exponential filter factor
 496  * @param gain_mem pointer to filter memory (single float)
 497  */
 498 static void adaptive_gain_control(float *out, const float *in,
 499                                   const float *speech_synth,
 500                                   int size, float alpha, float *gain_mem)
 501 {
 502     int i;
 503     float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
 504     float mem = *gain_mem;
 505
 506     for (i = 0; i < size; i++) {
 507         speech_energy     += fabsf(speech_synth[i]);
 508         postfilter_energy += fabsf(in[i]);
 509     }
 510     gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
 511                         (1.0 - alpha) * speech_energy / postfilter_energy;
 512
 513     for (i = 0; i < size; i++) {
 514         mem = alpha * mem + gain_scale_factor;
 515         out[i] = in[i] * mem;
 516     }
 517
 518     *gain_mem = mem;
 519 }
 520
 521 /**
 522  * Kalman smoothing function.
 523  *
 524  * This function looks back pitch +/- 3 samples back into history to find
 525  * the best fitting curve (that one giving the optimal gain of the two
 526  * signals, i.e. the highest dot product between the two), and then
 527  * uses that signal history to smoothen the output of the speech synthesis
 528  * filter.
 529  *
 530  * @param s WMA Voice decoding context
 531  * @param pitch pitch of the speech signal
 532  * @param in input speech signal
 533  * @param out output pointer for smoothened signal
 534  * @param size input/output buffer size
 535  *
 536  * @returns -1 if no smoothening took place, e.g. because no optimal
 537  *          fit could be found, or 0 on success.
 538  */
 539 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
 540                            const float *in, float *out, int size)
 541 {
 542     int n;
 543     float optimal_gain = 0, dot;
 544     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
 545                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
 546                 *best_hist_ptr = NULL;
 547
 548     /* find best fitting point in history */
 549     do {
 550         dot = avpriv_scalarproduct_float_c(in, ptr, size);
 551         if (dot > optimal_gain) {
 552             optimal_gain  = dot;
 553             best_hist_ptr = ptr;
 554         }
 555     } while (--ptr >= end);
 556
 557     if (optimal_gain <= 0)
 558         return -1;
 559     dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
 560     if (dot <= 0) // would be 1.0
 561         return -1;
 562
 563     if (optimal_gain <= dot) {
 564         dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
 565     } else
 566         dot = 0.625;
 567
 568     /* actual smoothing */
 569     for (n = 0; n < size; n++)
 570         out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
 571
 572     return 0;
 573 }
 574
 575 /**
 576  * Get the tilt factor of a formant filter from its transfer function
 577  * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
 578  *      but somehow (??) it does a speech synthesis filter in the
 579  *      middle, which is missing here
 580  *
 581  * @param lpcs LPC coefficients
 582  * @param n_lpcs Size of LPC buffer
 583  * @returns the tilt factor
 584  */
 585 static float tilt_factor(const float *lpcs, int n_lpcs)
 586 {
 587     float rh0, rh1;
 588
 589     rh0 = 1.0     + avpriv_scalarproduct_float_c(lpcs,  lpcs,    n_lpcs);
 590     rh1 = lpcs[0] + avpriv_scalarproduct_float_c(lpcs, &lpcs[1], n_lpcs - 1);
 591
 592     return rh1 / rh0;
 593 }
 594
 595 /**
 596  * Derive denoise filter coefficients (in real domain) from the LPCs.
 597  */
 598 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
 599                                 int fcb_type, float *coeffs, int remainder)
 600 {
 601     float last_coeff, min = 15.0, max = -15.0;
 602     float irange, angle_mul, gain_mul, range, sq;
 603     int n, idx;
 604
 605     /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
 606     s->rdft.rdft_calc(&s->rdft, lpcs);
 607 #define log_range(var, assign) do { \
 608         float tmp = log10f(assign);  var = tmp; \
 609         max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
 610     } while (0)
 611     log_range(last_coeff,  lpcs[1]         * lpcs[1]);
 612     for (n = 1; n < 64; n++)
 613         log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
 614                            lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
 615     log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
 616 #undef log_range
 617     range    = max - min;
 618     lpcs[64] = last_coeff;
 619
 620     /* Now, use this spectrum to pick out these frequencies with higher
 621      * (relative) power/energy (which we then take to be "not noise"),
 622      * and set up a table (still in lpc[]) of (relative) gains per frequency.
 623      * These frequencies will be maintained, while others ("noise") will be
 624      * decreased in the filter output. */
 625     irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
 626     gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
 627                                                           (5.0 / 14.7));
 628     angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
 629     for (n = 0; n <= 64; n++) {
 630         float pwr;
 631
 632         idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
 633         pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
 634         lpcs[n] = angle_mul * pwr;
 635
 636         /* 70.57 =~ 1/log10(1.0331663) */
 637         idx = (pwr * gain_mul - 0.0295) * 70.570526123;
 638         if (idx > 127) { // fall back if index falls outside table range
 639             coeffs[n] = wmavoice_energy_table[127] *
 640                         powf(1.0331663, idx - 127);
 641         } else
 642             coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
 643     }
 644
 645     /* calculate the Hilbert transform of the gains, which we do (since this
 646      * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
 647      * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
 648      * "moment" of the LPCs in this filter. */
 649     s->dct.dct_calc(&s->dct, lpcs);
 650     s->dst.dct_calc(&s->dst, lpcs);
 651
 652     /* Split out the coefficient indexes into phase/magnitude pairs */
 653     idx = 255 + av_clip(lpcs[64],               -255, 255);
 654     coeffs[0]  = coeffs[0]  * s->cos[idx];
 655     idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
 656     last_coeff = coeffs[64] * s->cos[idx];
 657     for (n = 63;; n--) {
 658         idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
 659         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
 660         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
 661
 662         if (!--n) break;
 663
 664         idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
 665         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
 666         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
 667     }
 668     coeffs[1] = last_coeff;
 669
 670     /* move into real domain */
 671     s->irdft.rdft_calc(&s->irdft, coeffs);
 672
 673     /* tilt correction and normalize scale */
 674     memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
 675     if (s->denoise_tilt_corr) {
 676         float tilt_mem = 0;
 677
 678         coeffs[remainder - 1] = 0;
 679         ff_tilt_compensation(&tilt_mem,
 680                              -1.8 * tilt_factor(coeffs, remainder - 1),
 681                              coeffs, remainder);
 682     }
 683     sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
 684                                                                remainder));
 685     for (n = 0; n < remainder; n++)
 686         coeffs[n] *= sq;
 687 }
 688
 689 /**
 690  * This function applies a Wiener filter on the (noisy) speech signal as
 691  * a means to denoise it.
 692  *
 693  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
 694  * - using this power spectrum, calculate (for each frequency) the Wiener
 695  *    filter gain, which depends on the frequency power and desired level
 696  *    of noise subtraction (when set too high, this leads to artifacts)
 697  *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
 698  *    of 4-8kHz);
 699  * - by doing a phase shift, calculate the Hilbert transform of this array
 700  *    of per-frequency filter-gains to get the filtering coefficients;
 701  * - smoothen/normalize/de-tilt these filter coefficients as desired;
 702  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
 703  *    to get the denoised speech signal;
 704  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
 705  *    the frame boundary) are saved and applied to subsequent frames by an
 706  *    overlap-add method (otherwise you get clicking-artifacts).
 707  *
      * For FCB_TYPE_SILENCE frames no filtering is done at all; only the
      * leftover cached from earlier frames is added to the input.
      *
 708  * @param s WMA Voice decoding context
 709  * @param fcb_type Frame (codebook) type
 710  * @param synth_pf input: the noisy speech signal, output: denoised speech
 711  *                 data; should be 16-byte aligned (for ASM purposes).
      *                 For non-silence frames the entries from index size up
      *                 to 127 are zeroed and then overwritten by the
      *                 transforms, so 128 floats must be writable.
 712  * @param size size of the speech data
 713  * @param lpcs LPCs used to synthesize this frame's speech data
 714  */
 715 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
 716                            float *synth_pf, int size,
 717                            const float *lpcs)
 718 {
 719     int remainder, lim, n;
 720
 721     if (fcb_type != FCB_TYPE_SILENCE) {
 722         float *tilted_lpcs = s->tilted_lpcs_pf,
 723               *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
 724
             /* build the 128-entry filter input: a leading 1.0, the s->lsps
              * LPCs, then zero padding; tilt compensation is applied to the
              * first s->lsps + 2 entries */
 725         tilted_lpcs[0]           = 1.0;
 726         memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
 727         memset(&tilted_lpcs[s->lsps + 1], 0,
 728                sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
 729         ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
 730                              tilted_lpcs, s->lsps + 2);
 731
 732         /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
 733          * size is applied to the next frame. All input beyond this is zero,
 734          * and thus all output beyond this will go towards zero, hence we can
 735          * limit to min(size-1, 127-size) as a performance consideration. */
 736         remainder = FFMIN(127 - size, size - 1);
 737         calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
 738
 739         /* apply coefficients (in frequency spectrum domain), i.e. complex
 740          * number multiplication */
             /* the speech data is zero-padded to 128 samples first */
 741         memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
 742         s->rdft.rdft_calc(&s->rdft, synth_pf);
 743         s->rdft.rdft_calc(&s->rdft, coeffs);
             /* entries 0 and 1 are scaled as two independent real values
              * (presumably the packed DC/Nyquist terms of the RDFT output;
              * confirm against rdft.h); the other 63 pairs are multiplied
              * as complex (re, im) numbers */
 744         synth_pf[0] *= coeffs[0];
 745         synth_pf[1] *= coeffs[1];
 746         for (n = 1; n < 64; n++) {
 747             float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
 748             synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
 749             synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
 750         }
 751         s->irdft.rdft_calc(&s->irdft, synth_pf);
 752     }
 753
 754     /* merge filter output with the history of previous runs */
 755     if (s->denoise_filter_cache_size) {
 756         lim = FFMIN(s->denoise_filter_cache_size, size);
 757         for (n = 0; n < lim; n++)
 758             synth_pf[n] += s->denoise_filter_cache[n];
 759         s->denoise_filter_cache_size -= lim;
             /* if any cached samples remain, lim was equal to size, so index
              * size is the first unconsumed entry; otherwise the remaining
              * count is 0 and nothing is moved */
 760         memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
 761                 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
 762     }
 763
 764     /* move remainder of filter output into a cache for future runs */
         /* same condition as the branch that assigned remainder above, so
          * remainder is always initialized when it is read here */
 765     if (fcb_type != FCB_TYPE_SILENCE) {
             /* samples overlapping what is still cached are summed into the
              * cache; any samples beyond that are appended and the cache then
              * holds remainder entries */
 766         lim = FFMIN(remainder, s->denoise_filter_cache_size);
 767         for (n = 0; n < lim; n++)
 768             s->denoise_filter_cache[n] += synth_pf[size + n];
 769         if (lim < remainder) {
 770             memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
 771                    sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
 772             s->denoise_filter_cache_size = remainder;
 773         }
 774     }
 775 }
 776
 777 /**
 778  * Averaging projection filter, the postfilter used in WMAVoice.
 779  *
 780  * This uses the following steps:
 781  * - A zero-synthesis filter (generate excitation from synth signal)
 782  * - Kalman smoothing on excitation, based on pitch
 783  * - Re-synthesized smoothened output
 784  * - Iterative Wiener denoise filter
 785  * - Adaptive gain filter
 786  * - DC filter
 787  *
      * Kalman smoothing is only attempted when fcb_type is at least
      * FCB_TYPE_AW_PULSES, and the DC filter only runs when s->dc_level
      * is greater than 8.
      *
 788  * @param s WMAVoice decoding context
 789  * @param synth Speech synthesis output (before postfilter)
 790  * @param samples Output buffer for filtered samples
 791  * @param size Buffer size of synth & samples; must not exceed
      *             MAX_FRAMESIZE / 2 (enforced by an assertion)
 792  * @param lpcs Generated LPCs used for speech synthesis
 793  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
 794  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
 795  * @param pitch Pitch of the input signal
 796  */
 797 static void postfilter(WMAVoiceContext *s, const float *synth,
 798                        float *samples,    int size,
 799                        const float *lpcs, float *zero_exc_pf,
 800                        int fcb_type,      int pitch)
 801 {
         /* synth_pf points MAX_LSPS_ALIGN16 floats into the context buffer,
          * leaving room in front of it for the filter history copied below */
 802     float synth_filter_in_buf[MAX_FRAMESIZE / 2],
 803           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
 804           *synth_filter_in = zero_exc_pf;
 805
 806     av_assert0(size <= MAX_FRAMESIZE / 2);
 807
 808     /* generate excitation from input signal */
 809     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
 810
         /* the smoothed buffer replaces the raw excitation only when
          * kalman_smoothen() returns 0 */
 811     if (fcb_type >= FCB_TYPE_AW_PULSES &&
 812         !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
 813         synth_filter_in = synth_filter_in_buf;
 814
 815     /* re-synthesize speech after smoothening, and keep history */
 816     ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
 817                                  synth_filter_in, size, s->lsps);
         /* the last s->lsps output samples are stored just before synth_pf */
 818     memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
 819            sizeof(synth_pf[0]) * s->lsps);
 820
 821     wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
 822
 823     adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
 824                           &s->postfilter_agc);
 825
 826     if (s->dc_level > 8) {
 827         /* remove ultra-low frequency DC noise / highpass filter;
 828          * coefficients are identical to those used in SIPR decoding,
 829          * and very closely resemble those used in AMR-NB decoding. */
             /* filtering is done in place on samples, with state kept in
              * s->dcf_mem */
 830         ff_acelp_apply_order_2_transfer_function(samples, samples,
 831             (const float[2]) { -1.99997,      1.0 },
 832             (const float[2]) { -1.9330735188, 0.93589198496 },
 833             0.93980580475, s->dcf_mem, size);
 834     }
 835 }
 836 /**
 837  * @}
 838  */
 839
 840 /**
 841  * Dequantize LSPs
      *
      * The output is zeroed and then, for each of the n_stages stages, the
      * value base_q[stage] + mul_q[stage] * entry is added to every LSP,
      * where entry comes from the table row selected by values[stage].
      * The table holds the stages back to back: stage n occupies
      * sizes[n] rows of num bytes each.
      *
 842  * @param lsps output pointer to the array that will hold the LSPs
 843  * @param num number of LSPs to be dequantized
 844  * @param values quantized values, contains n_stages values
 845  * @param sizes range (i.e. max value) of each quantized value
 846  * @param n_stages number of dequantization runs
 847  * @param table dequantization table to be used
 848  * @param mul_q LSF multiplier
 849  * @param base_q base (lowest) LSF values
 850  */
 851 static void dequant_lsps(double *lsps, int num,
 852                          const uint16_t *values,
 853                          const uint16_t *sizes,
 854                          int n_stages, const uint8_t *table,
 855                          const double *mul_q,
 856                          const double *base_q)
 857 {
 858     int n, m;
 859
 860     memset(lsps, 0, num * sizeof(*lsps));
 861     for (n = 0; n < n_stages; n++) {
             /* row of num entries chosen by this stage's quantized value */
 862         const uint8_t *t_off = &table[values[n] * num];
 863         double base = base_q[n], mul = mul_q[n];
 864
 865         for (m = 0; m < num; m++)
 866             lsps[m] += base + mul * t_off[m];
 867
             /* skip all rows of this stage to reach the next stage's rows */
 868         table += sizes[n] * num;
 869     }
 870 }
 871
 872 /**
 873  * @name LSP dequantization routines
 874  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
 875  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
 876  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
 877  * @{
 878  */
 879 /**
 880  * Parse 10 independently-coded LSPs.
      *
      * Reads four codebook indices of 8, 6, 5 and 5 bits (24 bits in total)
      * and dequantizes them in four stages using wmavoice_dq_lsp10i.
      *
      * @param gb bit I/O context to read the indices from
      * @param lsps output array, receives 10 values
 881  */
 882 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
 883 {
         /* number of table rows per stage; matches the index bit widths
          * read below (2^8, 2^6, 2^5, 2^5) */
 884     static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
 885     static const double mul_lsf[4] = {
 886         5.2187144800e-3,    1.4626986422e-3,
 887         9.6179549166e-4,    1.1325736225e-3
 888     };
 889     static const double base_lsf[4] = {
 890         M_PI * -2.15522e-1, M_PI * -6.1646e-2,
 891         M_PI * -3.3486e-2,  M_PI * -5.7408e-2
 892     };
 893     uint16_t v[4];
 894
 895     v[0] = get_bits(gb, 8);
 896     v[1] = get_bits(gb, 6);
 897     v[2] = get_bits(gb, 5);
 898     v[3] = get_bits(gb, 5);
 899
 900     dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
 901                  mul_lsf, base_lsf);
 902 }
 903
 904 /**
 905  * Parse 10 independently-coded LSPs, and then derive the tables to
 906  * generate LSPs for the other frames from them (residual coding).
      *
      * After the 24 bits consumed by dequant_lsp10i(), this reads a 5-bit
      * interpolation index and three residual indices of 7, 6 and 6 bits.
      *
      * @param gb bit I/O context
      * @param i_lsps output, the 10 independently-coded LSPs
      * @param old 10 LSPs that are interpolated against i_lsps
      * @param a1 output, 20 values: two sets of 10 LSPs interpolated between
      *           i_lsps and old
      * @param a2 output, 20 dequantized residual values
      * @param q_mode selects the interpolation table: non-zero uses
      *               wmavoice_lsp10_intercoeff_b, zero uses
      *               wmavoice_lsp10_intercoeff_a
 907  */
 908 static void dequant_lsp10r(GetBitContext *gb,
 909                            double *i_lsps, const double *old,
 910                            double *a1, double *a2, int q_mode)
 911 {
 912     static const uint16_t vec_sizes[3] = { 128, 64, 64 };
 913     static const double mul_lsf[3] = {
 914         2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
 915     };
 916     static const double base_lsf[3] = {
 917         M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
 918     };
 919     const float (*ipol_tab)[2][10] = q_mode ?
 920         wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
 921     uint16_t interpol, v[3];
 922     int n;
 923
 924     dequant_lsp10i(gb, i_lsps);
 925
 926     interpol = get_bits(gb, 5);
 927     v[0]     = get_bits(gb, 7);
 928     v[1]     = get_bits(gb, 6);
 929     v[2]     = get_bits(gb, 6);
 930
         /* each output is i_lsps plus a table-weighted share of the
          * difference to old; a1[0..9] and a1[10..19] use the two weight
          * rows of the selected table entry */
 931     for (n = 0; n < 10; n++) {
 932         double delta = old[n] - i_lsps[n];
 933         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
 934         a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
 935     }
 936
 937     dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
 938                  mul_lsf, base_lsf);
 939 }
 940
 941 /**
 942  * Parse 16 independently-coded LSPs.
      *
      * Reads five codebook indices of 8, 6, 7, 6 and 7 bits (34 bits in
      * total). The 16 LSPs are dequantized as three separate groups:
      * LSPs 0-4 (two stages), LSPs 5-9 (two stages) and LSPs 10-15
      * (one stage), each group with its own table.
      *
      * @param gb bit I/O context to read the indices from
      * @param lsps output array, receives 16 values
 943  */
 944 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
 945 {
 946     static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
 947     static const double mul_lsf[5] = {
 948         3.3439586280e-3,    6.9908173703e-4,
 949         3.3216608306e-3,    1.0334960326e-3,
 950         3.1899104283e-3
 951     };
 952     static const double base_lsf[5] = {
 953         M_PI * -1.27576e-1, M_PI * -2.4292e-2,
 954         M_PI * -1.28094e-1, M_PI * -3.2128e-2,
 955         M_PI * -1.29816e-1
 956     };
 957     uint16_t v[5];
 958
 959     v[0] = get_bits(gb, 8);
 960     v[1] = get_bits(gb, 6);
 961     v[2] = get_bits(gb, 7);
 962     v[3] = get_bits(gb, 6);
 963     v[4] = get_bits(gb, 7);
 964
         /* the index, size, multiplier and base arrays are shared; each call
          * is handed the slice that belongs to its group */
 965     dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
 966                  wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
 967     dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
 968                  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
 969     dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
 970                  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
 971 }
 972
 973 /**
 974  * Parse 16 independently-coded LSPs, and then derive the tables to
 975  * generate LSPs for the other frames from them (residual coding).
      *
      * After the 34 bits consumed by dequant_lsp16i(), this reads a 5-bit
      * interpolation index and three 7-bit residual indices.
      *
      * @param gb bit I/O context
      * @param i_lsps output, the 16 independently-coded LSPs
      * @param old 16 LSPs that are interpolated against i_lsps
      * @param a1 output, 32 values: two sets of 16 LSPs interpolated between
      *           i_lsps and old
      * @param a2 output, 32 dequantized residual values, produced as groups
      *           of 10, 10 and 12
      * @param q_mode selects the interpolation table: non-zero uses
      *               wmavoice_lsp16_intercoeff_b, zero uses
      *               wmavoice_lsp16_intercoeff_a
 976  */
 977 static void dequant_lsp16r(GetBitContext *gb,
 978                            double *i_lsps, const double *old,
 979                            double *a1, double *a2, int q_mode)
 980 {
 981     static const uint16_t vec_sizes[3] = { 128, 128, 128 };
 982     static const double mul_lsf[3] = {
 983         1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
 984     };
 985     static const double base_lsf[3] = {
 986         M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
 987     };
 988     const float (*ipol_tab)[2][16] = q_mode ?
 989         wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
 990     uint16_t interpol, v[3];
 991     int n;
 992
 993     dequant_lsp16i(gb, i_lsps);
 994
 995     interpol = get_bits(gb, 5);
 996     v[0]     = get_bits(gb, 7);
 997     v[1]     = get_bits(gb, 7);
 998     v[2]     = get_bits(gb, 7);
 999
         /* each output is i_lsps plus a table-weighted share of the
          * difference to old; a1[0..15] and a1[16..31] use the two weight
          * rows of the selected table entry */
1000     for (n = 0; n < 16; n++) {
1001         double delta = old[n] - i_lsps[n];
1002         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1003         a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1004     }
1005
         /* one single-stage dequantization per residual group */
1006     dequant_lsps( a2,     10,  v,     vec_sizes,    1,
1007                  wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
1008     dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1009                  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1010     dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1011                  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1012 }
1013
1014 /**
1015  * @}
1016  * @name Pitch-adaptive window coding functions
1017  * The next few functions are for pitch-adaptive window coding.
1018  * @{
1019  */
1020 /**
1021  * Parse the offset of the first pitch-adaptive window pulses, and
1022  * the distribution of pulses between the two blocks in this frame.
      *
      * The results are stored in s->aw_idx_is_ext, s->aw_pulse_range,
      * s->aw_n_pulses[] and s->aw_first_pulse_off[].
      *
1023  * @param s WMA Voice decoding context private data
1024  * @param gb bit I/O context
1025  * @param pitch pitch for each block in this frame (two entries are read)
1026  */
1027 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1028                             const int *pitch)
1029 {
1030     static const int16_t start_offset[94] = {
1031         -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
1032          13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
1033          27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
1034          45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
1035          69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
1036          93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
1037         117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1038         141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1039     };
1040     int bits, offset;
1041
1042     /* position of pulse */
         /* a 6-bit index; values of 54 and above are extended with 2 more
          * bits, which yields indices 54..93 and so stays within the
          * 94-entry table */
1043     s->aw_idx_is_ext = 0;
1044     if ((bits = get_bits(gb, 6)) >= 54) {
1045         s->aw_idx_is_ext = 1;
1046         bits += (bits - 54) * 3 + get_bits(gb, 2);
1047     }
1048
1049     /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1050      * the distribution of the pulses in each block contained in this frame. */
         /* NOTE(review): pitch[0] and pitch[1] are used as loop step and as
          * divisor below, so both are presumably guaranteed to be positive
          * by the caller - confirm */
1051     s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
         /* empty-bodied loop: advance a negative start offset by whole
          * pitch periods until it is no longer negative */
1052     for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1053     s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1054     s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1055     offset                  += s->aw_n_pulses[0] * pitch[0];
1056     s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1057     s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1058
1059     /* if continuing from a position before the block, reset position to
1060      * start of block (when corrected for the range over which it can be
1061      * spread in aw_pulse_set1()). */
1062     if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1063         while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1064             s->aw_first_pulse_off[1] -= pitch[1];
1065         if (start_offset[bits] < 0)
1066             while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1067                 s->aw_first_pulse_off[0] -= pitch[0];
1068     }
1069 }
1070
1071 /**
1072  * Apply second set of pitch-adaptive window pulses.
      *
      * Reads an index of 3, 4 or 5 bits plus one sign bit, appends exactly
      * one pulse to fcb at the selected position that is not already
      * covered by aw_pulse_set1(), and updates s->aw_next_pulse_off_cache
      * for the next block.
      *
1073  * @param s WMA Voice decoding context private data
1074  * @param gb bit I/O context
1075  * @param block_idx block index in frame [0, 1]
1076  * @param fcb structure containing fixed codebook vector info
1077  * @return -1 on error (no usable position is left in the mask),
      *         0 otherwise
1078  */
1079 static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1080                          int block_idx, AMRFixed *fcb)
1081 {
1082     uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1083     uint16_t *use_mask = use_mask_mem + 2;
1084     /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1085      * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1086      * of idx are the position of the bit within a particular item in the
1087      * array (0 being the most significant bit, and 15 being the least
1088      * significant bit), and the remainder (>> 4) is the index in the
1089      * use_mask[]-array. This is faster and uses less memory than using a
1090      * 80-byte/80-int array. */
1091     int pulse_off = s->aw_first_pulse_off[block_idx],
1092         pulse_start, n, idx, range, aidx, start_off = 0;
1093
1094     /* set offset of first pulse to within this block */
1095     if (s->aw_n_pulses[block_idx] > 0)
1096         while (pulse_off + s->aw_pulse_range < 1)
1097             pulse_off += fcb->pitch_lag;
1098
1099     /* find range per pulse */
1100     if (s->aw_n_pulses[0] > 0) {
1101         if (block_idx == 0) {
1102             range = 32;
1103         } else /* block_idx = 1 */ {
1104             range = 8;
1105             if (s->aw_n_pulses[block_idx] > 0)
1106                 pulse_off = s->aw_next_pulse_off_cache;
1107         }
1108     } else
1109         range = 16;
1110     pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1111
1112     /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1113      * in the range of [pulse_off, pulse_off + s->aw_pulse_range]), and thus
1114      * we exclude that range from being pulsed again in this function. */
         /* mask layout: 2 zeroed padding words, 5 words with all 80 bits
          * set (= usable), 2 zeroed padding words.
          * NOTE(review): the leading padding presumably absorbs the writes
          * made below when idx is negative - confirm */
1115     memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1116     memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1117     memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1118     if (s->aw_n_pulses[block_idx] > 0)
1119         for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
                 /* clear excl_range consecutive bits starting at bit idx;
                  * this touches two or three adjacent mask words */
1120             int excl_range         = s->aw_pulse_range; // always 16 or 24
1121             uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1122             int first_sh           = 16 - (idx & 15);
1123             *use_mask_ptr++       &= 0xFFFFu << first_sh;
1124             excl_range            -= first_sh;
1125             if (excl_range >= 16) {
1126                 *use_mask_ptr++    = 0;
1127                 *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1128             } else
1129                 *use_mask_ptr     &= 0xFFFF >> excl_range;
1130         }
1131
1132     /* find the 'aidx'th offset that is not excluded */
         /* index width: 5 bits for block 0 and 3 bits for block 1 when
          * block 0 has pulses, 4 bits otherwise */
1133     aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1134     for (n = 0; n <= aidx; pulse_start++) {
1135         for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1136         if (idx >= MAX_FRAMESIZE / 2) { // find from zero
                 /* take the first mask word that still has a bit set and
                  * locate a set bit in it (assuming av_log2_16bit() returns
                  * the index of the highest set bit, this is the first
                  * usable position); fail when all five words are empty */
1137             if (use_mask[0])      idx = 0x0F;
1138             else if (use_mask[1]) idx = 0x1F;
1139             else if (use_mask[2]) idx = 0x2F;
1140             else if (use_mask[3]) idx = 0x3F;
1141             else if (use_mask[4]) idx = 0x4F;
1142             else return -1;
1143             idx -= av_log2_16bit(use_mask[idx >> 4]);
1144         }
             /* a usable position is consumed (its bit is cleared) and
              * counted; excluded positions are skipped without counting */
1145         if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1146             use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1147             n++;
1148             start_off = idx;
1149         }
1150     }
1151
         /* append the pulse at the last position found, sign from one bit */
1152     fcb->x[fcb->n] = start_off;
1153     fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1154     fcb->n++;
1155
1156     /* set offset for next block, relative to start of that block */
1157     n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1158     s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1159     return 0;
1160 }
1161
1162 /**
1163  * Apply first set of pitch-adaptive window pulses.
      *
      * Reads 12 bits, or 10 bits for block 0 when s->aw_idx_is_ext is set.
      * If the block has pitch-adaptive pulses (s->aw_n_pulses[block_idx] > 0)
      * the value is split into 3 or 4 sign/index fields, one per pulse;
      * otherwise it encodes a pair of pulses that are flagged in
      * fcb->no_repeat_mask.
      *
1164  * @param s WMA Voice decoding context private data
1165  * @param gb bit I/O context
1166  * @param block_idx block index in frame [0, 1]
1167  * @param fcb storage location for fixed codebook pulse info
1168  */
1169 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1170                           int block_idx, AMRFixed *fcb)
1171 {
1172     int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1173     float v;
1174
1175     if (s->aw_n_pulses[block_idx] > 0) {
1176         int n, v_mask, i_mask, sh, n_pulses;
1177
1178         if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1179             n_pulses = 3;
1180             v_mask   = 8;
1181             i_mask   = 7;
1182             sh       = 4;
1183         } else { // 4 pulses, 1:sign + 2:index each
1184             n_pulses = 4;
1185             v_mask   = 4;
1186             i_mask   = 3;
1187             sh       = 3;
1188         }
1189
             /* fields are taken from the least significant end of val, which
              * is shifted right by one field per iteration */
1190         for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1191             fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1192             fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1193                                  s->aw_first_pulse_off[block_idx];
                 /* negative positions are moved forward by whole pitch lags;
                  * a pulse is only kept (fcb->n advanced) if it falls inside
                  * the block, otherwise its slot is reused */
1194             while (fcb->x[fcb->n] < 0)
1195                 fcb->x[fcb->n] += fcb->pitch_lag;
1196             if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1197                 fcb->n++;
1198         }
1199     } else {
             /* bits 1-8 of val select the pair position, bit 9 the sign of
              * the first pulse and bit 0 whether the second pulse has the
              * opposite sign */
1200         int num2 = (val & 0x1FF) >> 1, delta, idx;
1201
1202         if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1203         else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1204         else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1205         else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1206         v = (val & 0x200) ? -1.0 : 1.0;
1207
             /* two pulses, delta samples apart, both marked in
              * no_repeat_mask */
1208         fcb->no_repeat_mask |= 3 << fcb->n;
1209         fcb->x[fcb->n]       = idx - delta;
1210         fcb->y[fcb->n]       = v;
1211         fcb->x[fcb->n + 1]   = idx;
1212         fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1213         fcb->n              += 2;
1214     }
1215 }
1216
1217 /**
1218  * @}
1219  *
1220  * Generate a random number from frame_cntr and block_num, which will live
1221  * in the range [0, 1000 - block_size) (so it can be used as an index in a
1222  * table of size 1000 of which you want to read block_size entries).
      * The upper bound is exclusive because the result is reduced modulo
      * (1000 - block_size).
1223  *
1224  * @param frame_cntr current frame number
1225  * @param block_num current block index
1226  * @param block_size amount of entries we want to read from a table
1227  *                   that has 1000 entries; must be different from 1000,
      *                   since 1000 - block_size is used as a divisor
1228  * @return a (non-)random number in the [0, 1000 - block_size) range.
1229  */
1230 static int pRNG(int frame_cntr, int block_num, int block_size)
1231 {
1232     /* array to simplify the calculation of z:
1233      * y = (x % 9) * 5 + 6;
1234      * z = (49995 * x) / y;
1235      * Since y only has 9 values, we can remove the division by using a
1236      * LUT and using FASTDIV-style divisions. For each of the 9 values
1237      * of y, we can rewrite z as:
1238      * z = x * (49995 / y) + x * ((49995 % y) / y)
1239      * In this table, each col represents one possible value of y, the
1240      * first number is 49995 / y, and the second is the FASTDIV variant
1241      * of 49995 % y / y. */
1242     static const unsigned int div_tbl[9][2] = {
1243         { 8332,  3 * 715827883U }, // y =  6
1244         { 4545,  0 * 390451573U }, // y = 11
1245         { 3124, 11 * 268435456U }, // y = 16
1246         { 2380, 15 * 204522253U }, // y = 21
1247         { 1922, 23 * 165191050U }, // y = 26
1248         { 1612, 23 * 138547333U }, // y = 31
1249         { 1388, 27 * 119304648U }, // y = 36
1250         { 1219, 16 * 104755300U }, // y = 41
1251         { 1086, 39 *  93368855U }  // y = 46
1252     };
1253     unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1254     if (x >= 0xFFFF) x -= 0xFFFF;   // max value of x is 8*1877+0xFFFE=0x13AA6,
1255                                     // so this is effectively a modulo (%)
         /* from here on y is the table row (x % 9), not the divisor named y
          * in the comment above the table */
1256     y = x - 9 * MULH(477218589, x); // x % 9
         /* the sum is truncated to its low 16 bits */
1257     z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1258                                     // z = x * 49995 / (y * 5 + 6)
1259     return z % (1000 - block_size);
1260 }
1261
1262 /**
1263  * Parse hardcoded signal for a single block.
      *
      * The excitation is a run of size consecutive wmavoice_std_codebook
      * entries scaled by a gain. For FCB_TYPE_SILENCE frames the start
      * index comes from pRNG() and the gain from s->silence_gain, and no
      * bits are read; otherwise an 8-bit start index and a 6-bit gain index
      * are read from the bitstream. The gain prediction history in
      * s->gain_pred_err is cleared in both cases.
      *
      * @param s WMA Voice decoding context private data
      * @param gb bit I/O context
      * @param block_idx index of the block, used to seed pRNG()
      * @param size number of samples to generate; must not exceed
      *             MAX_FRAMESIZE (enforced by an assertion)
      * @param frame_desc frame type descriptor
      * @param excitation output, receives size samples
      *
1264  * @note see #synth_block().
1265  */
1266 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1267                                  int block_idx, int size,
1268                                  const struct frame_type_desc *frame_desc,
1269                                  float *excitation)
1270 {
1271     float gain;
1272     int n, r_idx;
1273
1274     av_assert0(size <= MAX_FRAMESIZE);
1275
1276     /* Set the offset from which we start reading wmavoice_std_codebook */
1277     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1278         r_idx = pRNG(s->frame_cntr, block_idx, size);
1279         gain  = s->silence_gain;
1280     } else /* FCB_TYPE_HARDCODED */ {
1281         r_idx = get_bits(gb, 8);
1282         gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1283     }
1284
1285     /* Clear gain prediction parameters */
1286     memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1287
1288     /* Apply gain to hardcoded codebook and use that as excitation signal */
1289     for (n = 0; n < size; n++)
1290         excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1291 }
1292
1293 /**
1294  * Parse FCB/ACB signal for a single block.
      *
      * Decodes the fixed codebook pulses (pitch-adaptive window pulses or
      * plain excitation pulses, depending on frame_desc->fcb_type), reads
      * the 7-bit gain index, updates the gain prediction history, builds
      * the adaptive codebook contribution from earlier excitation samples
      * and finally stores the gain-weighted sum of both in excitation.
      *
      * @param s WMA Voice decoding context private data
      * @param gb bit I/O context
      * @param block_idx index of the block within the frame
      * @param size number of samples in this block; must not exceed
      *             MAX_FRAMESIZE / 2 (enforced by an assertion)
      * @param block_pitch_sh2 pitch for this block << 2
      * @param frame_desc frame type descriptor
      * @param excitation output for this block; samples located before this
      *                   pointer are read as adaptive codebook history
      *
1295  * @note see #synth_block().
1296  */
1297 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1298                                 int block_idx, int size,
1299                                 int block_pitch_sh2,
1300                                 const struct frame_type_desc *frame_desc,
1301                                 float *excitation)
1302 {
1303     static const float gain_coeff[6] = {
1304         0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1305     };
1306     float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1307     int n, idx, gain_weight;
1308     AMRFixed fcb;
1309
1310     av_assert0(size <= MAX_FRAMESIZE / 2);
1311     memset(pulses, 0, sizeof(*pulses) * size);
1312
1313     fcb.pitch_lag      = block_pitch_sh2 >> 2;
1314     fcb.pitch_fac      = 1.0;
1315     fcb.no_repeat_mask = 0;
1316     fcb.n              = 0;
1317
1318     /* For the other frame types, this is where we apply the innovation
1319      * (fixed) codebook pulses of the speech signal. */
1320     if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1321         aw_pulse_set1(s, gb, block_idx, &fcb);
1322         if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1323             /* Conceal the block with silence and return.
1324              * Skip the correct amount of bits to read the next
1325              * block from the correct offset. */
                 /* note that this early return leaves s->gain_pred_err
                  * untouched */
1326             int r_idx = pRNG(s->frame_cntr, block_idx, size);
1327
1328             for (n = 0; n < size; n++)
1329                 excitation[n] =
1330                     wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1331             skip_bits(gb, 7 + 1);
1332             return;
1333         }
1334     } else /* FCB_TYPE_EXC_PULSES */ {
1335         int offset_nbits = 5 - frame_desc->log_n_blocks;
1336
1337         fcb.no_repeat_mask = -1;
1338         /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1339          * (instead of double) for a subset of pulses */
             /* five interleaved tracks: track n holds positions n + 5 * pos;
              * the first dbl_pulses tracks carry a second pulse whose sign
              * is flipped when its position index is the larger one */
1340         for (n = 0; n < 5; n++) {
1341             float sign;
1342             int pos1, pos2;
1343
1344             sign           = get_bits1(gb) ? 1.0 : -1.0;
1345             pos1           = get_bits(gb, offset_nbits);
1346             fcb.x[fcb.n]   = n + 5 * pos1;
1347             fcb.y[fcb.n++] = sign;
1348             if (n < frame_desc->dbl_pulses) {
1349                 pos2           = get_bits(gb, offset_nbits);
1350                 fcb.x[fcb.n]   = n + 5 * pos2;
1351                 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1352             }
1353         }
1354     }
1355     ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1356
1357     /* Calculate gain for adaptive & fixed codebook signal.
1358      * see ff_amr_set_fixed_gain(). */
         /* one 7-bit index selects both gains; the fixed codebook gain is
          * additionally predicted from the six stored prediction errors */
1359     idx = get_bits(gb, 7);
1360     fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
1361                                                  gain_coeff, 6) -
1362                     5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1363     acb_gain = wmavoice_gain_codebook_acb[idx];
1364     pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1365                         -2.9957322736 /* log(0.05) */,
1366                          1.6094379124 /* log(5.0)  */);
1367
         /* shift the history by gain_weight entries and store the new
          * (clipped) error gain_weight times at the front.
          * NOTE(review): the memmove length 6 - gain_weight is only
          * non-negative when log_n_blocks >= 1; presumably all frame types
          * that reach this function satisfy that - confirm */
1368     gain_weight = 8 >> frame_desc->log_n_blocks;
1369     memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1370             sizeof(*s->gain_pred_err) * (6 - gain_weight));
1371     for (n = 0; n < gain_weight; n++)
1372         s->gain_pred_err[n] = pred_err;
1373
1374     /* Calculation of adaptive codebook */
1375     if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1376         int len;
             /* the block is processed in runs of len samples; pitch and
              * interpolation index are recomputed for every run from
              * s->last_pitch_val and s->pitch_diff_sh16 (16.16 fixed point) */
1377         for (n = 0; n < size; n += len) {
1378             int next_idx_sh16;
1379             int abs_idx    = block_idx * size + n;
1380             int pitch_sh16 = (s->last_pitch_val << 16) +
1381                              s->pitch_diff_sh16 * abs_idx;
1382             int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1383             int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1384             idx            = idx_sh16 >> 16;
1385             if (s->pitch_diff_sh16) {
1386                 if (s->pitch_diff_sh16 > 0) {
1387                     next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1388                 } else
1389                     next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
                     /* run length is clipped to [1, size - n], so the loop
                      * always advances and never runs past the block */
1390                 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1391                               1, size - n);
1392             } else
1393                 len = size;
1394
1395             ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1396                                   wmavoice_ipol1_coeffs, 17,
1397                                   idx, 9, len);
1398         }
1399     } else /* ACB_TYPE_HAMMING */ {
             /* the low 2 bits of block_pitch_sh2 are the fractional pitch;
              * when they are zero the history is copied without
              * interpolation */
1400         int block_pitch = block_pitch_sh2 >> 2;
1401         idx             = block_pitch_sh2 & 3;
1402         if (idx) {
1403             ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1404                                   wmavoice_ipol2_coeffs, 4,
1405                                   idx, 8, size);
1406         } else
1407             av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1408                               sizeof(float) * size);
1409     }
1410
1411     /* Interpolate ACB/FCB and use as excitation signal */
1412     ff_weighted_vector_sumf(excitation, excitation, pulses,
1413                             acb_gain, fcb_gain, size);
1414 }
1415
1416 /**
1417  * Parse data in a single block.
1418  *
1419  * @param s WMA Voice decoding context private data
1420  * @param gb bit I/O context
1421  * @param block_idx index of the to-be-read block
1422  * @param size amount of samples to be read in this block
1423  * @param block_pitch_sh2 pitch for this block << 2
1424  * @param lsps LSPs for (the end of) this frame
1425  * @param prev_lsps LSPs for the last frame
1426  * @param frame_desc frame type descriptor
1427  * @param excitation target memory for the ACB+FCB interpolated signal
1428  * @param synth target memory for the speech synthesis filter output
1429  * @return 0 on success, <0 on error.
1430  */
1431 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1432                         int block_idx, int size,
1433                         int block_pitch_sh2,
1434                         const double *lsps, const double *prev_lsps,
1435                         const struct frame_type_desc *frame_desc,
1436                         float *excitation, float *synth)
1437 {
1438     double i_lsps[MAX_LSPS];
1439     float lpcs[MAX_LSPS];
1440     float fac;
1441     int n;
1442
1443     if (frame_desc->acb_type == ACB_TYPE_NONE)
1444         synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1445     else
1446         synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1447                             frame_desc, excitation);
1448
1449     /* convert interpolated LSPs to LPCs */
1450     fac = (block_idx + 0.5) / frame_desc->n_blocks;
1451     for (n = 0; n < s->lsps; n++) // LSF -> LSP
1452         i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1453     ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1454
1455     /* Speech synthesis */
1456     ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1457 }
1458
1459 /**
1460  * Synthesize output samples for a single frame.
1461  *
1462  * @param ctx WMA Voice decoder context
1463  * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1464  * @param frame_idx Frame number within superframe [0-2]
1465  * @param samples pointer to output sample buffer, has space for at least 160
1466  *                samples
1467  * @param lsps LSP array
1468  * @param prev_lsps array of previous frame's LSPs
1469  * @param excitation target buffer for excitation signal
1470  * @param synth target buffer for synthesized speech data
1471  * @return 0 on success, <0 on error.
1472  */
1473 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1474                        float *samples,
1475                        const double *lsps, const double *prev_lsps,
1476                        float *excitation, float *synth)
1477 {
1478     WMAVoiceContext *s = ctx->priv_data;
1479     int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
1480     int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);
1481
1482     /* Parse frame type ("frame header"), see frame_descs */
1483     int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;
1484
1485     if (bd_idx < 0) {
1486         av_log(ctx, AV_LOG_ERROR,
1487                "Invalid frame type VLC code, skipping\n");
1488         return AVERROR_INVALIDDATA;
1489     }
1490
1491     block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1492
1493     /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1494     if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1495         /* Pitch is provided per frame, which is interpreted as the pitch of
1496          * the last sample of the last block of this frame. We can interpolate
1497          * the pitch of other blocks (and even pitch-per-sample) by gradually
1498          * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1499         n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
1500         log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
1501         cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1502         cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1503         if (s->last_acb_type == ACB_TYPE_NONE ||
1504             20 * abs(cur_pitch_val - s->last_pitch_val) >
1505                 (cur_pitch_val + s->last_pitch_val))
1506             s->last_pitch_val = cur_pitch_val;
1507
1508         /* pitch per block */
1509         for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1510             int fac = n * 2 + 1;
1511
1512             pitch[n] = (MUL16(fac,                 cur_pitch_val) +
1513                         MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1514                         frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1515         }
1516
1517         /* "pitch-diff-per-sample" for calculation of pitch per sample */
1518         s->pitch_diff_sh16 =
1519             ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1520     }
1521
1522     /* Global gain (if silence) and pitch-adaptive window coordinates */
1523     switch (frame_descs[bd_idx].fcb_type) {
1524     case FCB_TYPE_SILENCE:
1525         s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1526         break;
1527     case FCB_TYPE_AW_PULSES:
1528         aw_parse_coords(s, gb, pitch);
1529         break;
1530     }
1531
1532     for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1533         int bl_pitch_sh2;
1534
1535         /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1536         switch (frame_descs[bd_idx].acb_type) {
1537         case ACB_TYPE_HAMMING: {
1538             /* Pitch is given per block. Per-block pitches are encoded as an
1539              * absolute value for the first block, and then delta values
1540              * relative to this value) for all subsequent blocks. The scale of
1541              * this pitch value is semi-logarithmic compared to its use in the
1542              * decoder, so we convert it to normal scale also. */
1543             int block_pitch,
1544                 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1545                 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1546                 t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;
1547
1548             if (n == 0) {
1549                 block_pitch = get_bits(gb, s->block_pitch_nbits);
1550             } else
1551                 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1552                                  get_bits(gb, s->block_delta_pitch_nbits);
1553             /* Convert last_ so that any next delta is within _range */
1554             last_block_pitch = av_clip(block_pitch,
1555                                        s->block_delta_pitch_hrange,
1556                                        s->block_pitch_range -
1557                                            s->block_delta_pitch_hrange);
1558
1559             /* Convert semi-log-style scale back to normal scale */
1560             if (block_pitch < t1) {
1561                 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1562             } else {
1563                 block_pitch -= t1;
1564                 if (block_pitch < t2) {
1565                     bl_pitch_sh2 =
1566                         (s->block_conv_table[1] << 2) + (block_pitch << 1);
1567                 } else {
1568                     block_pitch -= t2;
1569                     if (block_pitch < t3) {
1570                         bl_pitch_sh2 =
1571                             (s->block_conv_table[2] + block_pitch) << 2;
1572                     } else
1573                         bl_pitch_sh2 = s->block_conv_table[3] << 2;
1574                 }
1575             }
1576             pitch[n] = bl_pitch_sh2 >> 2;
1577             break;
1578         }
1579
1580         case ACB_TYPE_ASYMMETRIC: {
1581             bl_pitch_sh2 = pitch[n] << 2;
1582             break;
1583         }
1584
1585         default: // ACB_TYPE_NONE has no pitch
1586             bl_pitch_sh2 = 0;
1587             break;
1588         }
1589
1590         synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1591                     lsps, prev_lsps, &frame_descs[bd_idx],
1592                     &excitation[n * block_nsamples],
1593                     &synth[n * block_nsamples]);
1594     }
1595
1596     /* Averaging projection filter, if applicable. Else, just copy samples
1597      * from synthesis buffer */
1598     if (s->do_apf) {
1599         double i_lsps[MAX_LSPS];
1600         float lpcs[MAX_LSPS];
1601
1602         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1603             i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1604         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1605         postfilter(s, synth, samples, 80, lpcs,
1606                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1607                    frame_descs[bd_idx].fcb_type, pitch[0]);
1608
1609         for (n = 0; n < s->lsps; n++) // LSF -> LSP
1610             i_lsps[n] = cos(lsps[n]);
1611         ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1612         postfilter(s, &synth[80], &samples[80], 80, lpcs,
1613                    &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1614                    frame_descs[bd_idx].fcb_type, pitch[0]);
1615     } else
1616         memcpy(samples, synth, 160 * sizeof(synth[0]));
1617
1618     /* Cache values for next frame */
1619     s->frame_cntr++;
1620     if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1621     s->last_acb_type = frame_descs[bd_idx].acb_type;
1622     switch (frame_descs[bd_idx].acb_type) {
1623     case ACB_TYPE_NONE:
1624         s->last_pitch_val = 0;
1625         break;
1626     case ACB_TYPE_ASYMMETRIC:
1627         s->last_pitch_val = cur_pitch_val;
1628         break;
1629     case ACB_TYPE_HAMMING:
1630         s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1631         break;
1632     }
1633
1634     return 0;
1635 }
1636
/**
 * Force an LSP vector into a usable shape: bound the first value from
 * below, keep a minimum distance between neighbours, bound the last value
 * from above, and finally make sure the values are in ascending order.
 *
 * @param lsps array of LSPs, modified in place
 * @param num size of LSP array
 *
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
 *       useful to put in a generic location later on. Parts are also
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
 *       which is in float.
 */
static void stabilize_lsps(double *lsps, int num)
{
    const double lower_bound = 0.0015 * M_PI;
    const double min_gap     = 0.0125 * M_PI;
    const double upper_bound = 0.9985 * M_PI;
    int i, j, k;

    /* Minimum for the first value, minimum spacing between consecutive
     * values, maximum for the last value.
     * Very similar to ff_set_min_dist_lsf(), but in double. */
    if (!(lsps[0] > lower_bound))
        lsps[0] = lower_bound;
    for (i = 1; i < num; i++) {
        double floor_val = lsps[i - 1] + min_gap;

        if (!(lsps[i] > floor_val))
            lsps[i] = floor_val;
    }
    if (lsps[num - 1] > upper_bound)
        lsps[num - 1] = upper_bound;

    /* Look for the first pair that is out of order; nothing to do if the
     * array is already ascending. */
    for (i = 1; i < num; i++) {
        if (lsps[i] < lsps[i - 1])
            break;
    }
    if (i >= num)
        return;

    /* One insertion-sort pass over the whole array.
     * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
    for (j = 1; j < num; j++) {
        double val = lsps[j];

        k = j - 1;
        while (k >= 0 && !(lsps[k] <= val)) {
            lsps[k + 1] = lsps[k];
            k--;
        }
        lsps[k + 1] = val;
    }
}
1677
1678 /**
1679  * Synthesize output samples for a single superframe. If we have any data
1680  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1681  * in s->gb.
1682  *
1683  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1684  * to give a total of 480 samples per frame. See #synth_frame() for frame
1685  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1686  * (if these are globally specified for all frames (residually); they can
1687  * also be specified individually per-frame. See the s->has_residual_lsps
1688  * option), and can specify the number of samples encoded in this superframe
1689  * (if less than 480), usually used to prevent blanks at track boundaries.
1690  *
1691  * @param ctx WMA Voice decoder context
1692  * @return 0 on success, <0 on error or 1 if there was not enough data to
1693  *         fully parse the superframe
1694  */
1695 static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1696                             int *got_frame_ptr)
1697 {
1698     WMAVoiceContext *s = ctx->priv_data;
1699     GetBitContext *gb = &s->gb, s_gb;
1700     int n, res, n_samples = MAX_SFRAMESIZE;
1701     double lsps[MAX_FRAMES][MAX_LSPS];
1702     const double *mean_lsf = s->lsps == 16 ?
1703         wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1704     float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1705     float synth[MAX_LSPS + MAX_SFRAMESIZE];
1706     float *samples;
1707
1708     memcpy(synth,      s->synth_history,
1709            s->lsps             * sizeof(*synth));
1710     memcpy(excitation, s->excitation_history,
1711            s->history_nsamples * sizeof(*excitation));
1712
1713     if (s->sframe_cache_size > 0) {
1714         gb = &s_gb;
1715         init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1716         s->sframe_cache_size = 0;
1717     }
1718
1719     /* First bit is speech/music bit, it differentiates between WMAVoice
1720      * speech samples (the actual codec) and WMAVoice music samples, which
1721      * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1722      * the wild yet. */
1723     if (!get_bits1(gb)) {
1724         avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1725         return AVERROR_PATCHWELCOME;
1726     }
1727
1728     /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1729     if (get_bits1(gb)) {
1730         if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1731             av_log(ctx, AV_LOG_ERROR,
1732                    "Superframe encodes > %d samples (%d), not allowed\n",
1733                    MAX_SFRAMESIZE, n_samples);
1734             return AVERROR_INVALIDDATA;
1735         }
1736     }
1737
1738     /* Parse LSPs, if global for the superframe (can also be per-frame). */
1739     if (s->has_residual_lsps) {
1740         double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1741
1742         for (n = 0; n < s->lsps; n++)
1743             prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1744
1745         if (s->lsps == 10) {
1746             dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1747         } else /* s->lsps == 16 */
1748             dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1749
1750         for (n = 0; n < s->lsps; n++) {
1751             lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1752             lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1753             lsps[2][n] += mean_lsf[n];
1754         }
1755         for (n = 0; n < 3; n++)
1756             stabilize_lsps(lsps[n], s->lsps);
1757     }
1758
1759     /* get output buffer */
1760     frame->nb_samples = MAX_SFRAMESIZE;
1761     if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1762         return res;
1763     frame->nb_samples = n_samples;
1764     samples = (float *)frame->data[0];
1765
1766     /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1767     for (n = 0; n < 3; n++) {
1768         if (!s->has_residual_lsps) {
1769             int m;
1770
1771             if (s->lsps == 10) {
1772                 dequant_lsp10i(gb, lsps[n]);
1773             } else /* s->lsps == 16 */
1774                 dequant_lsp16i(gb, lsps[n]);
1775
1776             for (m = 0; m < s->lsps; m++)
1777                 lsps[n][m] += mean_lsf[m];
1778             stabilize_lsps(lsps[n], s->lsps);
1779         }
1780
1781         if ((res = synth_frame(ctx, gb, n,
1782                                &samples[n * MAX_FRAMESIZE],
1783                                lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1784                                &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1785                                &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1786             *got_frame_ptr = 0;
1787             return res;
1788         }
1789     }
1790
1791     /* Statistics? FIXME - we don't check for length, a slight overrun
1792      * will be caught by internal buffer padding, and anything else
1793      * will be skipped, not read. */
1794     if (get_bits1(gb)) {
1795         res = get_bits(gb, 4);
1796         skip_bits(gb, 10 * (res + 1));
1797     }
1798
1799     if (get_bits_left(gb) < 0) {
1800         wmavoice_flush(ctx);
1801         return AVERROR_INVALIDDATA;
1802     }
1803
1804     *got_frame_ptr = 1;
1805
1806     /* Update history */
1807     memcpy(s->prev_lsps,           lsps[2],
1808            s->lsps             * sizeof(*s->prev_lsps));
1809     memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1810            s->lsps             * sizeof(*synth));
1811     memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1812            s->history_nsamples * sizeof(*excitation));
1813     if (s->do_apf)
1814         memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1815                 s->history_nsamples * sizeof(*s->zero_exc_pf));
1816
1817     return 0;
1818 }
1819
1820 /**
1821  * Parse the packet header at the start of each packet (input data to this
1822  * decoder).
1823  *
1824  * @param s WMA Voice decoding context private data
1825  * @return <0 on error, nb_superframes on success.
1826  */
1827 static int parse_packet_header(WMAVoiceContext *s)
1828 {
1829     GetBitContext *gb = &s->gb;
1830     unsigned int res, n_superframes = 0;
1831
1832     skip_bits(gb, 4);          // packet sequence number
1833     s->has_residual_lsps = get_bits1(gb);
1834     do {
1835         res = get_bits(gb, 6); // number of superframes per packet
1836                                // (minus first one if there is spillover)
1837         n_superframes += res;
1838     } while (res == 0x3F);
1839     s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1840
1841     return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1842 }
1843
1844 /**
1845  * Copy (unaligned) bits from gb/data/size to pb.
1846  *
1847  * @param pb target buffer to copy bits into
1848  * @param data source buffer to copy bits from
1849  * @param size size of the source data, in bytes
1850  * @param gb bit I/O context specifying the current position in the source.
1851  *           data. This function might use this to align the bit position to
1852  *           a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1853  *           source data
1854  * @param nbits the amount of bits to copy from source to target
1855  *
1856  * @note after calling this function, the current position in the input bit
1857  *       I/O context is undefined.
1858  */
1859 static void copy_bits(PutBitContext *pb,
1860                       const uint8_t *data, int size,
1861                       GetBitContext *gb, int nbits)
1862 {
1863     int rmn_bytes, rmn_bits;
1864
1865     rmn_bits = rmn_bytes = get_bits_left(gb);
1866     if (rmn_bits < nbits)
1867         return;
1868     if (nbits > pb->size_in_bits - put_bits_count(pb))
1869         return;
1870     rmn_bits &= 7; rmn_bytes >>= 3;
1871     if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1872         put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1873     avpriv_copy_bits(pb, data + size - rmn_bytes,
1874                  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1875 }
1876
1877 /**
1878  * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1879  * and we expect that the demuxer / application provides it to us as such
1880  * (else you'll probably get garbage as output). Every packet has a size of
1881  * ctx->block_align bytes, starts with a packet header (see
1882  * #parse_packet_header()), and then a series of superframes. Superframe
1883  * boundaries may exceed packets, i.e. superframes can split data over
1884  * multiple (two) packets.
1885  *
1886  * For more information about frames, see #synth_superframe().
1887  */
1888 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1889                                   int *got_frame_ptr, AVPacket *avpkt)
1890 {
1891     WMAVoiceContext *s = ctx->priv_data;
1892     GetBitContext *gb = &s->gb;
1893     int size, res, pos;
1894
1895     /* Packets are sometimes a multiple of ctx->block_align, with a packet
1896      * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1897      * feeds us ASF packets, which may concatenate multiple "codec" packets
1898      * in a single "muxer" packet, so we artificially emulate that by
1899      * capping the packet size at ctx->block_align. */
1900     for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1901     init_get_bits(&s->gb, avpkt->data, size << 3);
1902
1903     /* size == ctx->block_align is used to indicate whether we are dealing with
1904      * a new packet or a packet of which we already read the packet header
1905      * previously. */
1906     if (!(size % ctx->block_align)) { // new packet header
1907         if (!size) {
1908             s->spillover_nbits = 0;
1909             s->nb_superframes = 0;
1910         } else {
1911             if ((res = parse_packet_header(s)) < 0)
1912                 return res;
1913             s->nb_superframes = res;
1914         }
1915
1916         /* If the packet header specifies a s->spillover_nbits, then we want
1917          * to push out all data of the previous packet (+ spillover) before
1918          * continuing to parse new superframes in the current packet. */
1919         if (s->sframe_cache_size > 0) {
1920             int cnt = get_bits_count(gb);
1921             if (cnt + s->spillover_nbits > avpkt->size * 8) {
1922                 s->spillover_nbits = avpkt->size * 8 - cnt;
1923             }
1924             copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1925             flush_put_bits(&s->pb);
1926             s->sframe_cache_size += s->spillover_nbits;
1927             if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
1928                 *got_frame_ptr) {
1929                 cnt += s->spillover_nbits;
1930                 s->skip_bits_next = cnt & 7;
1931                 res = cnt >> 3;
1932                 return res;
1933             } else
1934                 skip_bits_long (gb, s->spillover_nbits - cnt +
1935                                 get_bits_count(gb)); // resync
1936         } else if (s->spillover_nbits) {
1937             skip_bits_long(gb, s->spillover_nbits);  // resync
1938         }
1939     } else if (s->skip_bits_next)
1940         skip_bits(gb, s->skip_bits_next);
1941
1942     /* Try parsing superframes in current packet */
1943     s->sframe_cache_size = 0;
1944     s->skip_bits_next = 0;
1945     pos = get_bits_left(gb);
1946     if (s->nb_superframes-- == 0) {
1947         *got_frame_ptr = 0;
1948         return size;
1949     } else if (s->nb_superframes > 0) {
1950         if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
1951             return res;
1952         } else if (*got_frame_ptr) {
1953             int cnt = get_bits_count(gb);
1954             s->skip_bits_next = cnt & 7;
1955             res = cnt >> 3;
1956             return res;
1957         }
1958     } else if ((s->sframe_cache_size = pos) > 0) {
1959         /* ... cache it for spillover in next packet */
1960         init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1961         copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1962         // FIXME bad - just copy bytes as whole and add use the
1963         // skip_bits_next field
1964     }
1965
1966     return size;
1967 }
1968
1969 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1970 {
1971     WMAVoiceContext *s = ctx->priv_data;
1972
1973     if (s->do_apf) {
1974         ff_rdft_end(&s->rdft);
1975         ff_rdft_end(&s->irdft);
1976         ff_dct_end(&s->dct);
1977         ff_dct_end(&s->dst);
1978     }
1979
1980     return 0;
1981 }
1982
1983 AVCodec ff_wmavoice_decoder = {
1984     .name             = "wmavoice",
1985     .long_name        = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
1986     .type             = AVMEDIA_TYPE_AUDIO,
1987     .id               = AV_CODEC_ID_WMAVOICE,
1988     .priv_data_size   = sizeof(WMAVoiceContext),
1989     .init             = wmavoice_decode_init,
1990     .init_static_data = wmavoice_init_static_data,
1991     .close            = wmavoice_decode_end,
1992     .decode           = wmavoice_decode_packet,
1993     .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
1994     .flush            = wmavoice_flush,
1995 };