git.sesse.net Git - ffmpeg/blob - libavcodec/wmavoice.c

   1 /*
   2  * Windows Media Audio Voice decoder.
   3  * Copyright (c) 2009 Ronald S. Bultje
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * @brief Windows Media Audio Voice compatible decoder
  25  * @author Ronald S. Bultje <rsbultje@gmail.com>
  26  */
  27
  28 #include <math.h>
  29
  30 #include "libavutil/channel_layout.h"
  31 #include "libavutil/float_dsp.h"
  32 #include "libavutil/mem.h"
  33 #include "libavutil/thread.h"
  34 #include "avcodec.h"
  35 #include "internal.h"
  36 #include "get_bits.h"
  37 #include "put_bits.h"
  38 #include "wmavoice_data.h"
  39 #include "celp_filters.h"
  40 #include "acelp_vectors.h"
  41 #include "acelp_filters.h"
  42 #include "lsp.h"
  43 #include "dct.h"
  44 #include "rdft.h"
  45 #include "sinewin.h"
  46
  47 #define MAX_BLOCKS           8   ///< maximum number of blocks per frame
  48 #define MAX_LSPS             16  ///< maximum filter order
                                      ///< (decoder init selects 10 or 16 LSPs)
  49 #define MAX_LSPS_ALIGN16     16  ///< same as #MAX_LSPS; needs to be multiple
  50                                  ///< of 16 for ASM input buffer alignment
  51 #define MAX_FRAMES           3   ///< maximum number of frames per superframe
  52 #define MAX_FRAMESIZE        160 ///< maximum number of samples per frame
  53 #define MAX_SIGNAL_HISTORY   416 ///< maximum excitation signal history
                                      ///< in samples; decoder init rejects
                                      ///< streams whose history (maximum pitch
                                      ///< value + 8) would exceed this
  54 #define MAX_SFRAMESIZE       (MAX_FRAMESIZE * MAX_FRAMES)
  55                                  ///< maximum number of samples per superframe
  56 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
  57                                  ///< was split over two packets; used as
                                      ///< the byte size of the superframe cache
                                      ///< array (excluding its padding)
  58 #define VLC_NBITS            6   ///< number of bits to read per VLC iteration
  59
  60 /**
  61  * Frame type VLC coding.
      *
      * Filled in by wmavoice_init_static_data() from a table of 22 codes,
      * with #VLC_NBITS passed as the per-lookup bit count. Decoder init runs
      * that setup through ff_thread_once().
  62  */
  63 static VLC frame_type_vlc;
  64
  65 /**
  66  * Adaptive codebook types.
      *
      * Each frame type lists one of these values in frame_type_desc.acb_type;
      * the value of the previous frame is kept in
      * WMAVoiceContext.last_acb_type (initialised to #ACB_TYPE_NONE).
  67  */
  68 enum {
  69     ACB_TYPE_NONE       = 0, ///< no adaptive codebook (only hardcoded fixed)
  70     ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
  71                              ///< we interpolate to get a per-sample pitch.
  72                              ///< Signal is generated using an asymmetric sinc
  73                              ///< window function
  74                              ///< @note see #wmavoice_ipol1_coeffs
  75     ACB_TYPE_HAMMING    = 2  ///< Per-block pitch with signal generation using
  76                              ///< a Hamming sinc window function
  77                              ///< @note see #wmavoice_ipol2_coeffs
  78 };
  79
  80 /**
  81  * Fixed codebook types.
      *
      * Each frame type lists one of these values in frame_type_desc.fcb_type.
      * The denoise filter setup (calc_input_response()) uses a different gain
      * multiplier for #FCB_TYPE_HARDCODED than for the other types.
  82  */
  83 enum {
  84     FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence
  85                              ///< generated from a hardcoded (fixed) codebook
  86                              ///< with per-frame (low) gain values
  87     FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
  88                              ///< gain values
  89     FCB_TYPE_AW_PULSES  = 2, ///< Pitch-adaptive window (AW) pulse signals,
  90                              ///< used in particular for low-bitrate streams
  91     FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
  92                              ///< combinations of either single pulses or
  93                              ///< pulse pairs
  94 };
  95
  96 /**
  97  * Description of frame types.
      *
      * The table has 17 entries, the same count as the frame type values
      * (0-16) that decode_vbmtree() stores in the VBM tree.
      * NOTE(review): presumably indexed by that frame type value; the lookup
      * itself is outside the part of the file reviewed here - confirm.
  98  */
  99 static const struct frame_type_desc {
 100     uint8_t n_blocks;     ///< amount of blocks per frame (each block
 101                           ///< contains 160/#n_blocks samples)
 102     uint8_t log_n_blocks; ///< log2(#n_blocks)
 103     uint8_t acb_type;     ///< Adaptive codebook type (ACB_TYPE_*)
 104     uint8_t fcb_type;     ///< Fixed codebook type (FCB_TYPE_*)
 105     uint8_t dbl_pulses;   ///< how many pulse vectors have pulse pairs
 106                           ///< (rather than just one single pulse)
 107                           ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
 108 } frame_descs[17] = {
 109     { 1, 0, ACB_TYPE_NONE,       FCB_TYPE_SILENCE,    0 },
 110     { 2, 1, ACB_TYPE_NONE,       FCB_TYPE_HARDCODED,  0 },
 111     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES,  0 },
 112     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
 113     { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
 114     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0 },
 115     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2 },
 116     { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5 },
 117     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
 118     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
 119     { 2, 1, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
 120     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
 121     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
 122     { 4, 2, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 },
 123     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 0 },
 124     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 2 },
 125     { 8, 3, ACB_TYPE_HAMMING,    FCB_TYPE_EXC_PULSES, 5 }
 126 };
 127
 128 /**
 129  * WMA Voice decoding context.
 130  */
 131 typedef struct WMAVoiceContext {
 132     /**
 133      * @name Global values specified in the stream header / extradata or used all over.
 134      * @{
 135      */
 136     GetBitContext gb;             ///< packet bitreader. During decoder init,
 137                                   ///< it contains the extradata from the
 138                                   ///< demuxer. During decoding, it contains
 139                                   ///< packet data.
 140     int8_t vbm_tree[25];          ///< converts VLC codes to frame type
 141
 142     int spillover_bitsize;        ///< number of bits used to specify
 143                                   ///< #spillover_nbits in the packet header
 144                                   ///< = ceil(log2(ctx->block_align << 3))
 145     int history_nsamples;         ///< number of samples in history for signal
 146                                   ///< prediction (through ACB)
                                       ///< (set to #max_pitch_val + 8 at init and
                                       ///< checked against #MAX_SIGNAL_HISTORY)
 147
 148     /* postfilter specific values */
 149     int do_apf;                   ///< whether to apply the averaged
 150                                   ///< projection filter (APF)
 151     int denoise_strength;         ///< strength of denoising in Wiener filter
 152                                   ///< [0-11]
 153     int denoise_tilt_corr;        ///< Whether to apply tilt correction to the
 154                                   ///< Wiener filter coefficients (postfilter)
 155     int dc_level;                 ///< Predicted amount of DC noise, based
 156                                   ///< on which a DC removal filter is used
 157
 158     int lsps;                     ///< number of LSPs per frame [10 or 16]
 159     int lsp_q_mode;               ///< defines quantizer defaults [0, 1]
 160     int lsp_def_mode;             ///< defines different sets of LSP defaults
 161                                   ///< [0, 1]
 162
 163     int min_pitch_val;            ///< base value for pitch parsing code
 164     int max_pitch_val;            ///< max value + 1 for pitch parsing
 165     int pitch_nbits;              ///< number of bits used to specify the
 166                                   ///< pitch value in the frame header
 167     int block_pitch_nbits;        ///< number of bits used to specify the
 168                                   ///< first block's pitch value
 169     int block_pitch_range;        ///< range of the block pitch
 170     int block_delta_pitch_nbits;  ///< number of bits used to specify the
 171                                   ///< delta pitch between this and the last
 172                                   ///< block's pitch value, used in all but
 173                                   ///< first block
 174     int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
 175                                   ///< from -this to +this-1)
 176     uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
 177                                   ///< conversion
 178
 179     /**
 180      * @}
 181      *
 182      * @name Packet values specified in the packet header or related to a packet.
 183      *
 184      * A packet is considered to be a single unit of data provided to this
 185      * decoder by the demuxer.
 186      * @{
 187      */
 188     int spillover_nbits;          ///< number of bits of the previous packet's
 189                                   ///< last superframe preceding this
 190                                   ///< packet's first full superframe (useful
 191                                   ///< for re-synchronization also)
 192     int has_residual_lsps;        ///< if set, superframes contain one set of
 193                                   ///< LSPs that cover all frames, encoded as
 194                                   ///< independent and residual LSPs; if not
 195                                   ///< set, each frame contains its own, fully
 196                                   ///< independent, LSPs
 197     int skip_bits_next;           ///< number of bits to skip at the next call
 198                                   ///< to #wmavoice_decode_packet() (since
 199                                   ///< they're part of the previous superframe)
 200
 201     uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + AV_INPUT_BUFFER_PADDING_SIZE];
 202                                   ///< cache for superframe data split over
 203                                   ///< multiple packets
 204     int sframe_cache_size;        ///< set to >0 if we have data from an
 205                                   ///< (incomplete) superframe from a previous
 206                                   ///< packet that spilled over in the current
 207                                   ///< packet; specifies the amount of bits in
 208                                   ///< #sframe_cache
 209     PutBitContext pb;             ///< bitstream writer for #sframe_cache
 210
 211     /**
 212      * @}
 213      *
 214      * @name Frame and superframe values
 215      * Superframe and frame data - these can change from frame to frame,
 216      * although some of them do in that case serve as a cache / history for
 217      * the next frame or superframe.
 218      * @{
 219      */
 220     double prev_lsps[MAX_LSPS];   ///< LSPs of the last frame of the previous
 221                                   ///< superframe
 222     int last_pitch_val;           ///< pitch value of the previous frame
 223     int last_acb_type;            ///< frame type [0-2] of the previous frame
 224     int pitch_diff_sh16;          ///< ((cur_pitch_val - #last_pitch_val)
 225                                   ///< << 16) / #MAX_FRAMESIZE
 226     float silence_gain;           ///< set for use in blocks if #ACB_TYPE_NONE
 227
 228     int aw_idx_is_ext;            ///< whether the AW index was encoded in
 229                                   ///< 8 bits (instead of 6)
 230     int aw_pulse_range;           ///< the range over which #aw_pulse_set1()
 231                                   ///< can apply the pulse, relative to the
 232                                   ///< value in aw_first_pulse_off. The exact
 233                                   ///< position of the first AW-pulse is within
 234                                   ///< [pulse_off, pulse_off + this], and
 235                                   ///< depends on bitstream values; [16 or 24]
 236     int aw_n_pulses[2];           ///< number of AW-pulses in each block; note
 237                                   ///< that this number can be negative (in
 238                                   ///< which case it basically means "zero")
 239     int aw_first_pulse_off[2];    ///< index of first sample to which to
 240                                   ///< apply AW-pulses, or -0xff if unset
 241     int aw_next_pulse_off_cache;  ///< the position (relative to start of the
 242                                   ///< second block) at which pulses should
 243                                   ///< start to be positioned, serves as a
 244                                   ///< cache for pitch-adaptive window pulses
 245                                   ///< between blocks
 246
 247     int frame_cntr;               ///< current frame index [0 - 0xFFFE]; is
 248                                   ///< only used for comfort noise in #pRNG()
 249     int nb_superframes;           ///< number of superframes in current packet
 250     float gain_pred_err[6];       ///< cache for gain prediction
 251     float excitation_history[MAX_SIGNAL_HISTORY];
 252                                   ///< cache of the signal of previous
 253                                   ///< superframes, used as a history for
 254                                   ///< signal generation
 255     float synth_history[MAX_LSPS]; ///< see #excitation_history
 256     /**
 257      * @}
 258      *
 259      * @name Postfilter values
 260      *
 261      * Variables used for postfilter implementation, mostly history for
 262      * smoothing and so on, and context variables for FFT/iFFT.
 263      * @{
 264      */
 265     RDFTContext rdft, irdft;      ///< contexts for FFT-calculation in the
 266                                   ///< postfilter (for denoise filter)
 267     DCTContext dct, dst;          ///< contexts for phase shift (in Hilbert
 268                                   ///< transform, part of postfilter)
 269     float sin[511], cos[511];     ///< 8-bit cosine/sine windows over [-pi,pi]
 270                                   ///< range
                                       ///< (filled at init only when #do_apf is
                                       ///< set; the denoise code looks them up at
                                       ///< index 255 + clip(x, -255, 255))
 271     float postfilter_agc;         ///< gain control memory, used in
 272                                   ///< #adaptive_gain_control()
 273     float dcf_mem[2];             ///< DC filter history
 274     float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
 275                                   ///< zero filter output (i.e. excitation)
 276                                   ///< by postfilter
 277     float denoise_filter_cache[MAX_FRAMESIZE];
                                       ///< zeroed by wmavoice_flush();
                                       ///< NOTE(review): presumably the leftover
                                       ///< of the denoise filter kept for
                                       ///< overlap-add - confirm against its user
 278     int   denoise_filter_cache_size; ///< samples in #denoise_filter_cache
 279     DECLARE_ALIGNED(32, float, tilted_lpcs_pf)[0x80];
 280                                   ///< aligned buffer for LPC tilting
 281     DECLARE_ALIGNED(32, float, denoise_coeffs_pf)[0x80];
 282                                   ///< aligned buffer for denoise coefficients
 283     DECLARE_ALIGNED(32, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
 284                                   ///< aligned buffer for postfilter speech
 285                                   ///< synthesis
 286     /**
 287      * @}
 288      */
 289 } WMAVoiceContext;
 290
 291 /**
 292  * Set up the variable bit mode (VBM) tree from container extradata.
 293  * @param gb bit I/O context.
 294  *           The bit context (s->gb) should be loaded with byte 23-46 of the
 295  *           container extradata (i.e. the ones containing the VBM tree).
 296  * @param vbm_tree pointer to array to which the decoded VBM tree will be
 297  *                 written.
 298  * @return 0 on success, <0 on error.
 299  */
 300 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
 301 {
 302     int cntr[8] = { 0 }, n, res;
 303
 304     memset(vbm_tree, 0xff, sizeof(vbm_tree[0]) * 25);
 305     for (n = 0; n < 17; n++) {
 306         res = get_bits(gb, 3);
 307         if (cntr[res] > 3) // should be >= 3 + (res == 7))
 308             return -1;
 309         vbm_tree[res * 3 + cntr[res]++] = n;
 310     }
 311     return 0;
 312 }
 313
 314 static av_cold void wmavoice_init_static_data(void)
 315 {
 316     static const uint8_t bits[] = {
 317          2,  2,  2,  4,  4,  4,
 318          6,  6,  6,  8,  8,  8,
 319         10, 10, 10, 12, 12, 12,
 320         14, 14, 14, 14
 321     };
 322     static const uint16_t codes[] = {
 323           0x0000, 0x0001, 0x0002,        //              00/01/10
 324           0x000c, 0x000d, 0x000e,        //           11+00/01/10
 325           0x003c, 0x003d, 0x003e,        //         1111+00/01/10
 326           0x00fc, 0x00fd, 0x00fe,        //       111111+00/01/10
 327           0x03fc, 0x03fd, 0x03fe,        //     11111111+00/01/10
 328           0x0ffc, 0x0ffd, 0x0ffe,        //   1111111111+00/01/10
 329           0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
 330     };
 331
 332     INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
 333                     bits, 1, 1, codes, 2, 2, 132);
 334 }
 335
 336 static av_cold void wmavoice_flush(AVCodecContext *ctx)
 337 {
 338     WMAVoiceContext *s = ctx->priv_data;
 339     int n;
 340
 341     s->postfilter_agc    = 0;
 342     s->sframe_cache_size = 0;
 343     s->skip_bits_next    = 0;
 344     for (n = 0; n < s->lsps; n++)
 345         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
 346     memset(s->excitation_history, 0,
 347            sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
 348     memset(s->synth_history,      0,
 349            sizeof(*s->synth_history)      * MAX_LSPS);
 350     memset(s->gain_pred_err,      0,
 351            sizeof(s->gain_pred_err));
 352
 353     if (s->do_apf) {
 354         memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
 355                sizeof(*s->synth_filter_out_buf) * s->lsps);
 356         memset(s->dcf_mem,              0,
 357                sizeof(*s->dcf_mem)              * 2);
 358         memset(s->zero_exc_pf,          0,
 359                sizeof(*s->zero_exc_pf)          * s->history_nsamples);
 360         memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
 361     }
 362 }
 363
 364 /**
 365  * Set up decoder with parameters from demuxer (extradata etc.).
 366  */
 367 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
 368 {
 369     static AVOnce init_static_once = AV_ONCE_INIT;
 370     int n, flags, pitch_range, lsp16_flag;
 371     WMAVoiceContext *s = ctx->priv_data;
 372
 373     ff_thread_once(&init_static_once, wmavoice_init_static_data);
 374
 375     /**
 376      * Extradata layout:
 377      * - byte  0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
 378      * - byte 19-22: flags field (annoyingly in LE; see below for known
 379      *               values),
 380      * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
 381      *               rest is 0).
 382      */
 383     if (ctx->extradata_size != 46) {
 384         av_log(ctx, AV_LOG_ERROR,
 385                "Invalid extradata size %d (should be 46)\n",
 386                ctx->extradata_size);
 387         return AVERROR_INVALIDDATA;
 388     }
 389     if (ctx->block_align <= 0 || ctx->block_align > (1<<22)) {
 390         av_log(ctx, AV_LOG_ERROR, "Invalid block alignment %d.\n", ctx->block_align);
 391         return AVERROR_INVALIDDATA;
 392     }
 393
 394     flags                = AV_RL32(ctx->extradata + 18);
 395     s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
 396     s->do_apf            =    flags & 0x1;
 397     if (s->do_apf) {
 398         ff_rdft_init(&s->rdft,  7, DFT_R2C);
 399         ff_rdft_init(&s->irdft, 7, IDFT_C2R);
 400         ff_dct_init(&s->dct,  6, DCT_I);
 401         ff_dct_init(&s->dst,  6, DST_I);
 402
 403         ff_sine_window_init(s->cos, 256);
 404         memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
 405         for (n = 0; n < 255; n++) {
 406             s->sin[n]       = -s->sin[510 - n];
 407             s->cos[510 - n] =  s->cos[n];
 408         }
 409     }
 410     s->denoise_strength  =   (flags >> 2) & 0xF;
 411     if (s->denoise_strength >= 12) {
 412         av_log(ctx, AV_LOG_ERROR,
 413                "Invalid denoise filter strength %d (max=11)\n",
 414                s->denoise_strength);
 415         return AVERROR_INVALIDDATA;
 416     }
 417     s->denoise_tilt_corr = !!(flags & 0x40);
 418     s->dc_level          =   (flags >> 7) & 0xF;
 419     s->lsp_q_mode        = !!(flags & 0x2000);
 420     s->lsp_def_mode      = !!(flags & 0x4000);
 421     lsp16_flag           =    flags & 0x1000;
 422     if (lsp16_flag) {
 423         s->lsps               = 16;
 424     } else {
 425         s->lsps               = 10;
 426     }
 427     for (n = 0; n < s->lsps; n++)
 428         s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
 429
 430     init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
 431     if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
 432         av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
 433         return AVERROR_INVALIDDATA;
 434     }
 435
 436     if (ctx->sample_rate >= INT_MAX / (256 * 37))
 437         return AVERROR_INVALIDDATA;
 438
 439     s->min_pitch_val    = ((ctx->sample_rate << 8)      /  400 + 50) >> 8;
 440     s->max_pitch_val    = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
 441     pitch_range         = s->max_pitch_val - s->min_pitch_val;
 442     if (pitch_range <= 0) {
 443         av_log(ctx, AV_LOG_ERROR, "Invalid pitch range; broken extradata?\n");
 444         return AVERROR_INVALIDDATA;
 445     }
 446     s->pitch_nbits      = av_ceil_log2(pitch_range);
 447     s->last_pitch_val   = 40;
 448     s->last_acb_type    = ACB_TYPE_NONE;
 449     s->history_nsamples = s->max_pitch_val + 8;
 450
 451     if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
 452         int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
 453             max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
 454
 455         av_log(ctx, AV_LOG_ERROR,
 456                "Unsupported samplerate %d (min=%d, max=%d)\n",
 457                ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
 458
 459         return AVERROR(ENOSYS);
 460     }
 461
 462     s->block_conv_table[0]      = s->min_pitch_val;
 463     s->block_conv_table[1]      = (pitch_range * 25) >> 6;
 464     s->block_conv_table[2]      = (pitch_range * 44) >> 6;
 465     s->block_conv_table[3]      = s->max_pitch_val - 1;
 466     s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
 467     if (s->block_delta_pitch_hrange <= 0) {
 468         av_log(ctx, AV_LOG_ERROR, "Invalid delta pitch hrange; broken extradata?\n");
 469         return AVERROR_INVALIDDATA;
 470     }
 471     s->block_delta_pitch_nbits  = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
 472     s->block_pitch_range        = s->block_conv_table[2] +
 473                                   s->block_conv_table[3] + 1 +
 474                                   2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
 475     s->block_pitch_nbits        = av_ceil_log2(s->block_pitch_range);
 476
 477     ctx->channels               = 1;
 478     ctx->channel_layout         = AV_CH_LAYOUT_MONO;
 479     ctx->sample_fmt             = AV_SAMPLE_FMT_FLT;
 480
 481     return 0;
 482 }
 483
 484 /**
 485  * @name Postfilter functions
 486  * Postfilter functions (gain control, wiener denoise filter, DC filter,
 487  * kalman smoothening, plus surrounding code to wrap it)
 488  * @{
 489  */
 /**
  * Adaptive gain control (as used in postfilter).
  *
  * Same scheme as #ff_adaptive_gain_control() in acelp_vectors.c, with one
  * difference: the energies are measured as sum(abs(...)) here, whereas the
  * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
  *
  * The gain applied to each sample is a running value that is updated per
  * sample as alpha * gain + (1 - alpha) * speech_energy / postfilter_energy
  * (the second term is zero when the postfilter energy is zero).
  *
  * @param out output buffer for filtered samples
  * @param in input buffer containing the samples as they are after the
  *           postfilter steps so far
  * @param speech_synth input buffer containing speech synth before postfilter
  * @param size input buffer size
  * @param alpha exponential filter factor
  * @param gain_mem pointer to filter memory (single float); read on entry
  *                 and updated with the last running gain on exit
  */
 static void adaptive_gain_control(float *out, const float *in,
                                   const float *speech_synth,
                                   int size, float alpha, float *gain_mem)
 {
     float synth_sum = 0.0, filtered_sum = 0.0;
     float step;
     float running_gain = *gain_mem;
     int k;

     /* accumulate both energies before anything is written to out */
     for (k = 0; k < size; k++) {
         synth_sum    += fabsf(speech_synth[k]);
         filtered_sum += fabsf(in[k]);
     }

     if (filtered_sum == 0.0)
         step = 0.0;
     else
         step = (1.0 - alpha) * synth_sum / filtered_sum;

     for (k = 0; k < size; k++) {
         running_gain = alpha * running_gain + step;
         out[k]       = in[k] * running_gain;
     }

     *gain_mem = running_gain;
 }
 527
 528 /**
 529  * Kalman smoothing function.
 530  *
 531  * This function looks back pitch +/- 3 samples back into history to find
 532  * the best fitting curve (that one giving the optimal gain of the two
 533  * signals, i.e. the highest dot product between the two), and then
 534  * uses that signal history to smoothen the output of the speech synthesis
 535  * filter.
 536  *
 537  * @param s WMA Voice decoding context
 538  * @param pitch pitch of the speech signal
 539  * @param in input speech signal
 540  * @param out output pointer for smoothened signal
 541  * @param size input/output buffer size
 542  *
 543  * @returns -1 if no smoothening took place, e.g. because no optimal
 544  *          fit could be found, or 0 on success.
 545  */
 546 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
 547                            const float *in, float *out, int size)
 548 {
 549     int n;
 550     float optimal_gain = 0, dot;
 551     const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
 552                 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
 553                 *best_hist_ptr = NULL;
 554
 555     /* find best fitting point in history */
 556     do {
 557         dot = avpriv_scalarproduct_float_c(in, ptr, size);
 558         if (dot > optimal_gain) {
 559             optimal_gain  = dot;
 560             best_hist_ptr = ptr;
 561         }
 562     } while (--ptr >= end);
 563
 564     if (optimal_gain <= 0)
 565         return -1;
 566     dot = avpriv_scalarproduct_float_c(best_hist_ptr, best_hist_ptr, size);
 567     if (dot <= 0) // would be 1.0
 568         return -1;
 569
 570     if (optimal_gain <= dot) {
 571         dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
 572     } else
 573         dot = 0.625;
 574
 575     /* actual smoothing */
 576     for (n = 0; n < size; n++)
 577         out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
 578
 579     return 0;
 580 }
 581
 /**
  * Get the tilt factor of a formant filter from its transfer function.
  *
  * Computed as rh1 / rh0, where rh0 is 1 plus the energy of the
  * coefficients and rh1 is the first coefficient plus the dot product of
  * the coefficients with themselves shifted by one.
  *
  * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
  *      but somehow (??) it does a speech synthesis filter in the
  *      middle, which is missing here
  *
  * @param lpcs LPC coefficients
  * @param n_lpcs Size of LPC buffer
  * @returns the tilt factor
  */
 static float tilt_factor(const float *lpcs, int n_lpcs)
 {
     const float energy = 1.0 +
         avpriv_scalarproduct_float_c(lpcs, lpcs, n_lpcs);
     const float shifted = lpcs[0] +
         avpriv_scalarproduct_float_c(lpcs, lpcs + 1, n_lpcs - 1);

     return shifted / energy;
 }
 601
 602 /**
 603  * Derive denoise filter coefficients (in real domain) from the LPCs.
      *
      * @param s WMA Voice decoding context; supplies the RDFT/IRDFT and DCT/DST
      *          contexts, the sin/cos lookup tables, the denoise strength and
      *          the tilt correction flag
      * @param lpcs input LPC buffer, used as scratch space: it is transformed
      *             in place by the RDFT and its entries 0..64 are then
      *             overwritten with per-frequency values.
      *             NOTE(review): presumably 128 floats, since the RDFT is set
      *             up with 7 bits in decoder init - confirm against the caller
      * @param fcb_type fixed codebook type (FCB_TYPE_*); only selects the gain
      *                 multiplier (#FCB_TYPE_HARDCODED versus all other types)
      * @param coeffs output buffer; entries 0..127 are written. On return the
      *               first remainder entries hold the scaled coefficients and
      *               entries remainder..127 are zero
      * @param remainder number of leading entries of coeffs that are kept
 604  */
 605 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
 606                                 int fcb_type, float *coeffs, int remainder)
 607 {
 608     float last_coeff, min = 15.0, max = -15.0;
 609     float irange, angle_mul, gain_mul, range, sq;
 610     int n, idx;
 611
 612     /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
 613     s->rdft.rdft_calc(&s->rdft, lpcs);
 614 #define log_range(var, assign) do { \
 615         float tmp = log10f(assign);  var = tmp; \
 616         max       = FFMAX(max, tmp); min = FFMIN(min, tmp); \
 617     } while (0)
         /* log10 of the squared magnitude per bin, tracking the overall
          * min/max. The value derived from lpcs[1] is parked in last_coeff
          * and stored to lpcs[64] once the loop has consumed its inputs.
          * NOTE(review): presumably lpcs[1] is the packed highest-frequency
          * term of the RDFT output - confirm against the RDFT documentation. */
 618     log_range(last_coeff,  lpcs[1]         * lpcs[1]);
 619     for (n = 1; n < 64; n++)
 620         log_range(lpcs[n], lpcs[n * 2]     * lpcs[n * 2] +
 621                            lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
 622     log_range(lpcs[0],     lpcs[0]         * lpcs[0]);
 623 #undef log_range
 624     range    = max - min;
 625     lpcs[64] = last_coeff;
 626
 627     /* Now, use this spectrum to pick out these frequencies with higher
 628      * (relative) power/energy (which we then take to be "not noise"),
 629      * and set up a table (still in lpc[]) of (relative) gains per frequency.
 630      * These frequencies will be maintained, while others ("noise") will be
 631      * decreased in the filter output. */
 632     irange    = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
 633     gain_mul  = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
 634                                                           (5.0 / 14.7));
 635     angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
 636     for (n = 0; n <= 64; n++) {
 637         float pwr;
 638
             /* distance below the spectral maximum, scaled and shifted down
              * by one; negative results are clamped to index 0 */
 639         idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
 640         pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
 641         lpcs[n] = angle_mul * pwr;
 642
 643         /* 70.57 =~ 1/log10(1.0331663) */
 644         idx = (pwr * gain_mul - 0.0295) * 70.570526123;
 645         if (idx > 127) { // fall back if index falls outside table range
 646             coeffs[n] = wmavoice_energy_table[127] *
 647                         powf(1.0331663, idx - 127);
 648         } else
 649             coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
 650     }
 651
 652     /* calculate the Hilbert transform of the gains, which we do (since this
 653      * is a sine input) by doing a phase shift (in theory, H(sin())=cos()).
 654      * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
 655      * "moment" of the LPCs in this filter. */
 656     s->dct.dct_calc(&s->dct, lpcs);
 657     s->dst.dct_calc(&s->dst, lpcs);
 658
 659     /* Split out the coefficient indexes into phase/magnitude pairs */
 660     idx = 255 + av_clip(lpcs[64],               -255, 255);
 661     coeffs[0]  = coeffs[0]  * s->cos[idx];
 662     idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
         /* coeffs[64] is consumed here, before the loop below overwrites it */
 663     last_coeff = coeffs[64] * s->cos[idx];
         /* Expand coeffs[1..63] in place into pairs at coeffs[2n] (cos part)
          * and coeffs[2n + 1] (sin part). n runs downwards so that coeffs[n]
          * is still the unexpanded value when it is read. Each pass of the
          * loop body handles two values of n (n is decremented in the middle
          * and again by the for statement); the two halves differ in the
          * sign applied to lpcs[64]. */
 664     for (n = 63;; n--) {
 665         idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
 666         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
 667         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
 668
 669         if (!--n) break;
 670
 671         idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
 672         coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
 673         coeffs[n * 2]     = coeffs[n] * s->cos[idx];
 674     }
 675     coeffs[1] = last_coeff;
 676
 677     /* move into real domain */
 678     s->irdft.rdft_calc(&s->irdft, coeffs);
 679
 680     /* tilt correction and normalize scale */
 681     memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
 682     if (s->denoise_tilt_corr) {
 683         float tilt_mem = 0;
 684
 685         coeffs[remainder - 1] = 0;
 686         ff_tilt_compensation(&tilt_mem,
 687                              -1.8 * tilt_factor(coeffs, remainder - 1),
 688                              coeffs, remainder);
 689     }
         /* scale so that the energy of the kept coefficients is (1/64)^2 */
 690     sq = (1.0 / 64.0) * sqrtf(1 / avpriv_scalarproduct_float_c(coeffs, coeffs,
 691                                                                remainder));
 692     for (n = 0; n < remainder; n++)
 693         coeffs[n] *= sq;
 694 }
 695
 696 /**
 697  * This function applies a Wiener filter on the (noisy) speech signal as
 698  * a means to denoise it.
 699  *
 700  * - take RDFT of LPCs to get the power spectrum of the noise + speech;
 701  * - using this power spectrum, calculate (for each frequency) the Wiener
 702  *    filter gain, which depends on the frequency power and desired level
 703  *    of noise subtraction (when set too high, this leads to artifacts)
 704  *    We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
 705  *    of 4-8kHz);
 706  * - by doing a phase shift, calculate the Hilbert transform of this array
 707  *    of per-frequency filter-gains to get the filtering coefficients;
 708  * - smoothen/normalize/de-tilt these filter coefficients as desired;
 709  * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
 710  *    to get the denoised speech signal;
 711  * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
 712  *    the frame boundary) are saved and applied to subsequent frames by an
 713  *    overlap-add method (otherwise you get clicking-artifacts).
 714  *
 715  * @param s WMA Voice decoding context
 716  * @param fcb_type Frame (codebook) type
 717  * @param synth_pf input: the noisy speech signal, output: denoised speech
 718  *                 data; should be 16-byte aligned (for ASM purposes)
 719  * @param size size of the speech data
 720  * @param lpcs LPCs used to synthesize this frame's speech data
 721  */
 722 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
 723                            float *synth_pf, int size,
 724                            const float *lpcs)
 725 {
 726     int remainder, lim, n;
 727
 728     if (fcb_type != FCB_TYPE_SILENCE) {
 729         float *tilted_lpcs = s->tilted_lpcs_pf,
 730               *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
 731
 732         tilted_lpcs[0]           = 1.0;
 733         memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
 734         memset(&tilted_lpcs[s->lsps + 1], 0,
 735                sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
 736         ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
 737                              tilted_lpcs, s->lsps + 2);
 738
 739         /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
 740          * size is applied to the next frame. All input beyond this is zero,
 741          * and thus all output beyond this will go towards zero, hence we can
 742          * limit to min(size-1, 127-size) as a performance consideration. */
 743         remainder = FFMIN(127 - size, size - 1);
 744         calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
 745
 746         /* apply coefficients (in frequency spectrum domain), i.e. complex
 747          * number multiplication */
 748         memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
 749         s->rdft.rdft_calc(&s->rdft, synth_pf);
 750         s->rdft.rdft_calc(&s->rdft, coeffs);
 751         synth_pf[0] *= coeffs[0];
 752         synth_pf[1] *= coeffs[1];
 753         for (n = 1; n < 64; n++) {
 754             float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
 755             synth_pf[n * 2]     = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
 756             synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
 757         }
 758         s->irdft.rdft_calc(&s->irdft, synth_pf);
 759     }
 760
 761     /* merge filter output with the history of previous runs */
 762     if (s->denoise_filter_cache_size) {
 763         lim = FFMIN(s->denoise_filter_cache_size, size);
 764         for (n = 0; n < lim; n++)
 765             synth_pf[n] += s->denoise_filter_cache[n];
 766         s->denoise_filter_cache_size -= lim;
 767         memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
 768                 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
 769     }
 770
 771     /* move remainder of filter output into a cache for future runs */
 772     if (fcb_type != FCB_TYPE_SILENCE) {
 773         lim = FFMIN(remainder, s->denoise_filter_cache_size);
 774         for (n = 0; n < lim; n++)
 775             s->denoise_filter_cache[n] += synth_pf[size + n];
 776         if (lim < remainder) {
 777             memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
 778                    sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
 779             s->denoise_filter_cache_size = remainder;
 780         }
 781     }
 782 }
 783
 784 /**
 785  * Averaging projection filter, the postfilter used in WMAVoice.
 786  *
 787  * This uses the following steps:
 788  * - A zero-synthesis filter (generate excitation from synth signal)
 789  * - Kalman smoothing on excitation, based on pitch
 790  * - Re-synthesized smoothened output
 791  * - Iterative Wiener denoise filter
 792  * - Adaptive gain filter
 793  * - DC filter
 794  *
 795  * @param s WMAVoice decoding context
 796  * @param synth Speech synthesis output (before postfilter)
 797  * @param samples Output buffer for filtered samples
 798  * @param size Buffer size of synth & samples
 799  * @param lpcs Generated LPCs used for speech synthesis
 800  * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
 801  * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
 802  * @param pitch Pitch of the input signal
 803  */
 804 static void postfilter(WMAVoiceContext *s, const float *synth,
 805                        float *samples,    int size,
 806                        const float *lpcs, float *zero_exc_pf,
 807                        int fcb_type,      int pitch)
 808 {
 809     float synth_filter_in_buf[MAX_FRAMESIZE / 2],
 810           *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
 811           *synth_filter_in = zero_exc_pf;
 812
 813     av_assert0(size <= MAX_FRAMESIZE / 2);
 814
 815     /* generate excitation from input signal */
 816     ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
 817
 818     if (fcb_type >= FCB_TYPE_AW_PULSES &&
 819         !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
 820         synth_filter_in = synth_filter_in_buf;
 821
 822     /* re-synthesize speech after smoothening, and keep history */
 823     ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
 824                                  synth_filter_in, size, s->lsps);
 825     memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
 826            sizeof(synth_pf[0]) * s->lsps);
 827
 828     wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
 829
 830     adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
 831                           &s->postfilter_agc);
 832
 833     if (s->dc_level > 8) {
 834         /* remove ultra-low frequency DC noise / highpass filter;
 835          * coefficients are identical to those used in SIPR decoding,
 836          * and very closely resemble those used in AMR-NB decoding. */
 837         ff_acelp_apply_order_2_transfer_function(samples, samples,
 838             (const float[2]) { -1.99997,      1.0 },
 839             (const float[2]) { -1.9330735188, 0.93589198496 },
 840             0.93980580475, s->dcf_mem, size);
 841     }
 842 }
 843 /**
 844  * @}
 845  */
 846
 847 /**
 848  * Dequantize LSPs
 849  * @param lsps output pointer to the array that will hold the LSPs
 850  * @param num number of LSPs to be dequantized
 851  * @param values quantized values, contains n_stages values
 852  * @param sizes range (i.e. max value) of each quantized value
 853  * @param n_stages number of dequantization runs
 854  * @param table dequantization table to be used
 855  * @param mul_q LSF multiplier
 856  * @param base_q base (lowest) LSF values
 857  */
 858 static void dequant_lsps(double *lsps, int num,
 859                          const uint16_t *values,
 860                          const uint16_t *sizes,
 861                          int n_stages, const uint8_t *table,
 862                          const double *mul_q,
 863                          const double *base_q)
 864 {
 865     int n, m;
 866
 867     memset(lsps, 0, num * sizeof(*lsps));
 868     for (n = 0; n < n_stages; n++) {
 869         const uint8_t *t_off = &table[values[n] * num];
 870         double base = base_q[n], mul = mul_q[n];
 871
 872         for (m = 0; m < num; m++)
 873             lsps[m] += base + mul * t_off[m];
 874
 875         table += sizes[n] * num;
 876     }
 877 }
 878
 879 /**
 880  * @name LSP dequantization routines
 881  * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
 882  * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
 883  * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
 884  * @{
 885  */
 886 /**
 887  * Parse 10 independently-coded LSPs.
 888  */
 889 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
 890 {
 891     static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
 892     static const double mul_lsf[4] = {
 893         5.2187144800e-3,    1.4626986422e-3,
 894         9.6179549166e-4,    1.1325736225e-3
 895     };
 896     static const double base_lsf[4] = {
 897         M_PI * -2.15522e-1, M_PI * -6.1646e-2,
 898         M_PI * -3.3486e-2,  M_PI * -5.7408e-2
 899     };
 900     uint16_t v[4];
 901
 902     v[0] = get_bits(gb, 8);
 903     v[1] = get_bits(gb, 6);
 904     v[2] = get_bits(gb, 5);
 905     v[3] = get_bits(gb, 5);
 906
 907     dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
 908                  mul_lsf, base_lsf);
 909 }
 910
 911 /**
 912  * Parse 10 independently-coded LSPs, and then derive the tables to
 913  * generate LSPs for the other frames from them (residual coding).
 914  */
 915 static void dequant_lsp10r(GetBitContext *gb,
 916                            double *i_lsps, const double *old,
 917                            double *a1, double *a2, int q_mode)
 918 {
 919     static const uint16_t vec_sizes[3] = { 128, 64, 64 };
 920     static const double mul_lsf[3] = {
 921         2.5807601174e-3,    1.2354460219e-3,   1.1763821673e-3
 922     };
 923     static const double base_lsf[3] = {
 924         M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
 925     };
 926     const float (*ipol_tab)[2][10] = q_mode ?
 927         wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
 928     uint16_t interpol, v[3];
 929     int n;
 930
 931     dequant_lsp10i(gb, i_lsps);
 932
 933     interpol = get_bits(gb, 5);
 934     v[0]     = get_bits(gb, 7);
 935     v[1]     = get_bits(gb, 6);
 936     v[2]     = get_bits(gb, 6);
 937
 938     for (n = 0; n < 10; n++) {
 939         double delta = old[n] - i_lsps[n];
 940         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
 941         a1[10 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
 942     }
 943
 944     dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
 945                  mul_lsf, base_lsf);
 946 }
 947
 948 /**
 949  * Parse 16 independently-coded LSPs.
 950  */
 951 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
 952 {
 953     static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
 954     static const double mul_lsf[5] = {
 955         3.3439586280e-3,    6.9908173703e-4,
 956         3.3216608306e-3,    1.0334960326e-3,
 957         3.1899104283e-3
 958     };
 959     static const double base_lsf[5] = {
 960         M_PI * -1.27576e-1, M_PI * -2.4292e-2,
 961         M_PI * -1.28094e-1, M_PI * -3.2128e-2,
 962         M_PI * -1.29816e-1
 963     };
 964     uint16_t v[5];
 965
 966     v[0] = get_bits(gb, 8);
 967     v[1] = get_bits(gb, 6);
 968     v[2] = get_bits(gb, 7);
 969     v[3] = get_bits(gb, 6);
 970     v[4] = get_bits(gb, 7);
 971
 972     dequant_lsps( lsps,     5,  v,     vec_sizes,    2,
 973                  wmavoice_dq_lsp16i1,  mul_lsf,     base_lsf);
 974     dequant_lsps(&lsps[5],  5, &v[2], &vec_sizes[2], 2,
 975                  wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
 976     dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
 977                  wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
 978 }
 979
 980 /**
 981  * Parse 16 independently-coded LSPs, and then derive the tables to
 982  * generate LSPs for the other frames from them (residual coding).
 983  */
 984 static void dequant_lsp16r(GetBitContext *gb,
 985                            double *i_lsps, const double *old,
 986                            double *a1, double *a2, int q_mode)
 987 {
 988     static const uint16_t vec_sizes[3] = { 128, 128, 128 };
 989     static const double mul_lsf[3] = {
 990         1.2232979501e-3,   1.4062241527e-3,   1.6114744851e-3
 991     };
 992     static const double base_lsf[3] = {
 993         M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
 994     };
 995     const float (*ipol_tab)[2][16] = q_mode ?
 996         wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
 997     uint16_t interpol, v[3];
 998     int n;
 999
1000     dequant_lsp16i(gb, i_lsps);
1001
1002     interpol = get_bits(gb, 5);
1003     v[0]     = get_bits(gb, 7);
1004     v[1]     = get_bits(gb, 7);
1005     v[2]     = get_bits(gb, 7);
1006
1007     for (n = 0; n < 16; n++) {
1008         double delta = old[n] - i_lsps[n];
1009         a1[n]        = ipol_tab[interpol][0][n] * delta + i_lsps[n];
1010         a1[16 + n]   = ipol_tab[interpol][1][n] * delta + i_lsps[n];
1011     }
1012
1013     dequant_lsps( a2,     10,  v,     vec_sizes,    1,
1014                  wmavoice_dq_lsp16r1,  mul_lsf,     base_lsf);
1015     dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
1016                  wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
1017     dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
1018                  wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
1019 }
1020
1021 /**
1022  * @}
1023  * @name Pitch-adaptive window coding functions
1024  * The next few functions are for pitch-adaptive window coding.
1025  * @{
1026  */
1027 /**
1028  * Parse the offset of the first pitch-adaptive window pulses, and
1029  * the distribution of pulses between the two blocks in this frame.
1030  * @param s WMA Voice decoding context private data
1031  * @param gb bit I/O context
1032  * @param pitch pitch for each block in this frame
1033  */
1034 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
1035                             const int *pitch)
1036 {
1037     static const int16_t start_offset[94] = {
1038         -11,  -9,  -7,  -5,  -3,  -1,   1,   3,   5,   7,   9,  11,
1039          13,  15,  18,  17,  19,  20,  21,  22,  23,  24,  25,  26,
1040          27,  28,  29,  30,  31,  32,  33,  35,  37,  39,  41,  43,
1041          45,  47,  49,  51,  53,  55,  57,  59,  61,  63,  65,  67,
1042          69,  71,  73,  75,  77,  79,  81,  83,  85,  87,  89,  91,
1043          93,  95,  97,  99, 101, 103, 105, 107, 109, 111, 113, 115,
1044         117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
1045         141, 143, 145, 147, 149, 151, 153, 155, 157, 159
1046     };
1047     int bits, offset;
1048
1049     /* position of pulse */
1050     s->aw_idx_is_ext = 0;
1051     if ((bits = get_bits(gb, 6)) >= 54) {
1052         s->aw_idx_is_ext = 1;
1053         bits += (bits - 54) * 3 + get_bits(gb, 2);
1054     }
1055
1056     /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1057      * the distribution of the pulses in each block contained in this frame. */
1058     s->aw_pulse_range        = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
1059     for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
1060     s->aw_n_pulses[0]        = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1061     s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1062     offset                  += s->aw_n_pulses[0] * pitch[0];
1063     s->aw_n_pulses[1]        = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
1064     s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1065
1066     /* if continuing from a position before the block, reset position to
1067      * start of block (when corrected for the range over which it can be
1068      * spread in aw_pulse_set1()). */
1069     if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1070         while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1071             s->aw_first_pulse_off[1] -= pitch[1];
1072         if (start_offset[bits] < 0)
1073             while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1074                 s->aw_first_pulse_off[0] -= pitch[0];
1075     }
1076 }
1077
1078 /**
1079  * Apply second set of pitch-adaptive window pulses.
1080  * @param s WMA Voice decoding context private data
1081  * @param gb bit I/O context
1082  * @param block_idx block index in frame [0, 1]
1083  * @param fcb structure containing fixed codebook vector info
1084  * @return -1 on error, 0 otherwise
1085  */
1086 static int aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1087                          int block_idx, AMRFixed *fcb)
1088 {
1089     uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
1090     uint16_t *use_mask = use_mask_mem + 2;
1091     /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1092      * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1093      * of idx are the position of the bit within a particular item in the
1094      * array (0 being the most significant bit, and 15 being the least
1095      * significant bit), and the remainder (>> 4) is the index in the
1096      * use_mask[]-array. This is faster and uses less memory than using a
1097      * 80-byte/80-int array. */
1098     int pulse_off = s->aw_first_pulse_off[block_idx],
1099         pulse_start, n, idx, range, aidx, start_off = 0;
1100
1101     /* set offset of first pulse to within this block */
1102     if (s->aw_n_pulses[block_idx] > 0)
1103         while (pulse_off + s->aw_pulse_range < 1)
1104             pulse_off += fcb->pitch_lag;
1105
1106     /* find range per pulse */
1107     if (s->aw_n_pulses[0] > 0) {
1108         if (block_idx == 0) {
1109             range = 32;
1110         } else /* block_idx = 1 */ {
1111             range = 8;
1112             if (s->aw_n_pulses[block_idx] > 0)
1113                 pulse_off = s->aw_next_pulse_off_cache;
1114         }
1115     } else
1116         range = 16;
1117     pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1118
1119     /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
1120      * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
1121      * we exclude that range from being pulsed again in this function. */
1122     memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1123     memset( use_mask,   -1, 5 * sizeof(use_mask[0]));
1124     memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1125     if (s->aw_n_pulses[block_idx] > 0)
1126         for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1127             int excl_range         = s->aw_pulse_range; // always 16 or 24
1128             uint16_t *use_mask_ptr = &use_mask[idx >> 4];
1129             int first_sh           = 16 - (idx & 15);
1130             *use_mask_ptr++       &= 0xFFFFu << first_sh;
1131             excl_range            -= first_sh;
1132             if (excl_range >= 16) {
1133                 *use_mask_ptr++    = 0;
1134                 *use_mask_ptr     &= 0xFFFF >> (excl_range - 16);
1135             } else
1136                 *use_mask_ptr     &= 0xFFFF >> excl_range;
1137         }
1138
1139     /* find the 'aidx'th offset that is not excluded */
1140     aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1141     for (n = 0; n <= aidx; pulse_start++) {
1142         for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1143         if (idx >= MAX_FRAMESIZE / 2) { // find from zero
1144             if (use_mask[0])      idx = 0x0F;
1145             else if (use_mask[1]) idx = 0x1F;
1146             else if (use_mask[2]) idx = 0x2F;
1147             else if (use_mask[3]) idx = 0x3F;
1148             else if (use_mask[4]) idx = 0x4F;
1149             else return -1;
1150             idx -= av_log2_16bit(use_mask[idx >> 4]);
1151         }
1152         if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1153             use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1154             n++;
1155             start_off = idx;
1156         }
1157     }
1158
1159     fcb->x[fcb->n] = start_off;
1160     fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1161     fcb->n++;
1162
1163     /* set offset for next block, relative to start of that block */
1164     n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1165     s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1166     return 0;
1167 }
1168
1169 /**
1170  * Apply first set of pitch-adaptive window pulses.
1171  * @param s WMA Voice decoding context private data
1172  * @param gb bit I/O context
1173  * @param block_idx block index in frame [0, 1]
1174  * @param fcb storage location for fixed codebook pulse info
1175  */
1176 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1177                           int block_idx, AMRFixed *fcb)
1178 {
1179     int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1180     float v;
1181
1182     if (s->aw_n_pulses[block_idx] > 0) {
1183         int n, v_mask, i_mask, sh, n_pulses;
1184
1185         if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1186             n_pulses = 3;
1187             v_mask   = 8;
1188             i_mask   = 7;
1189             sh       = 4;
1190         } else { // 4 pulses, 1:sign + 2:index each
1191             n_pulses = 4;
1192             v_mask   = 4;
1193             i_mask   = 3;
1194             sh       = 3;
1195         }
1196
1197         for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1198             fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1199             fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1200                                  s->aw_first_pulse_off[block_idx];
1201             while (fcb->x[fcb->n] < 0)
1202                 fcb->x[fcb->n] += fcb->pitch_lag;
1203             if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
1204                 fcb->n++;
1205         }
1206     } else {
1207         int num2 = (val & 0x1FF) >> 1, delta, idx;
1208
1209         if (num2 < 1 * 79)      { delta = 1; idx = num2 + 1; }
1210         else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1211         else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1212         else                    { delta = 7; idx = num2 + 1 - 3 * 75; }
1213         v = (val & 0x200) ? -1.0 : 1.0;
1214
1215         fcb->no_repeat_mask |= 3 << fcb->n;
1216         fcb->x[fcb->n]       = idx - delta;
1217         fcb->y[fcb->n]       = v;
1218         fcb->x[fcb->n + 1]   = idx;
1219         fcb->y[fcb->n + 1]   = (val & 1) ? -v : v;
1220         fcb->n              += 2;
1221     }
1222 }
1223
/**
 * @}
 *
 * Generate a deterministic pseudo-random number from frame_cntr and
 * block_num. The result is computed modulo (1000 - block_size), so it is
 * at least 0 and strictly smaller than 1000 - block_size, which makes it
 * usable as a start index in a table of size 1000 of which you want to
 * read block_size entries.
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size amount of entries we want to read from a table
 *                   that has 1000 entries
 * @return a (non-)random number, 0 <= result < 1000 - block_size.
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* The value we are after is
     *   y = (x % 9) * 5 + 6;
     *   z = (49995 * x) / y;
     * Because y only takes 9 different values, the division is replaced
     * by a LUT with FASTDIV-style multiplications, rewriting z as
     *   z = x * (49995 / y) + x * ((49995 % y) / y)
     * Each row below belongs to one value of y: the first entry is
     * 49995 / y, the second is the FASTDIV variant of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int seed = MUL16(block_num, 1877) + frame_cntr;
    unsigned int row, mixed;

    /* max value of seed is 8*1877+0xFFFE=0x13AA6, so a single subtraction
     * is effectively a modulo (%) */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    row   = seed - 9 * MULH(477218589, seed); // seed % 9
    /* mixed = seed * 49995 / (row * 5 + 6), truncated to 16 bits */
    mixed = (uint16_t) (seed * div_tbl[row][0] + UMULH(seed, div_tbl[row][1]));

    return mixed % (1000 - block_size);
}
1268
1269 /**
1270  * Parse hardcoded signal for a single block.
1271  * @note see #synth_block().
1272  */
1273 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1274                                  int block_idx, int size,
1275                                  const struct frame_type_desc *frame_desc,
1276                                  float *excitation)
1277 {
1278     float gain;
1279     int n, r_idx;
1280
1281     av_assert0(size <= MAX_FRAMESIZE);
1282
1283     /* Set the offset from which we start reading wmavoice_std_codebook */
1284     if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1285         r_idx = pRNG(s->frame_cntr, block_idx, size);
1286         gain  = s->silence_gain;
1287     } else /* FCB_TYPE_HARDCODED */ {
1288         r_idx = get_bits(gb, 8);
1289         gain  = wmavoice_gain_universal[get_bits(gb, 6)];
1290     }
1291
1292     /* Clear gain prediction parameters */
1293     memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1294
1295     /* Apply gain to hardcoded codebook and use that as excitation signal */
1296     for (n = 0; n < size; n++)
1297         excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1298 }
1299
1300 /**
1301  * Parse FCB/ACB signal for a single block.
1302  * @note see #synth_block().
1303  */
1304 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1305                                 int block_idx, int size,
1306                                 int block_pitch_sh2,
1307                                 const struct frame_type_desc *frame_desc,
1308                                 float *excitation)
1309 {
1310     static const float gain_coeff[6] = {
1311         0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1312     };
1313     float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1314     int n, idx, gain_weight;
1315     AMRFixed fcb;
1316
1317     av_assert0(size <= MAX_FRAMESIZE / 2);
1318     memset(pulses, 0, sizeof(*pulses) * size);
1319
1320     fcb.pitch_lag      = block_pitch_sh2 >> 2;
1321     fcb.pitch_fac      = 1.0;
1322     fcb.no_repeat_mask = 0;
1323     fcb.n              = 0;
1324
1325     /* For the other frame types, this is where we apply the innovation
1326      * (fixed) codebook pulses of the speech signal. */
1327     if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1328         aw_pulse_set1(s, gb, block_idx, &fcb);
1329         if (aw_pulse_set2(s, gb, block_idx, &fcb)) {
1330             /* Conceal the block with silence and return.
1331              * Skip the correct amount of bits to read the next
1332              * block from the correct offset. */
1333             int r_idx = pRNG(s->frame_cntr, block_idx, size);
1334
1335             for (n = 0; n < size; n++)
1336                 excitation[n] =
1337                     wmavoice_std_codebook[r_idx + n] * s->silence_gain;
1338             skip_bits(gb, 7 + 1);
1339             return;
1340         }
1341     } else /* FCB_TYPE_EXC_PULSES */ {
1342         int offset_nbits = 5 - frame_desc->log_n_blocks;
1343
1344         fcb.no_repeat_mask = -1;
1345         /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1346          * (instead of double) for a subset of pulses */
1347         for (n = 0; n < 5; n++) {
1348             float sign;
1349             int pos1, pos2;
1350
1351             sign           = get_bits1(gb) ? 1.0 : -1.0;
1352             pos1           = get_bits(gb, offset_nbits);
1353             fcb.x[fcb.n]   = n + 5 * pos1;
1354             fcb.y[fcb.n++] = sign;
1355             if (n < frame_desc->dbl_pulses) {
1356                 pos2           = get_bits(gb, offset_nbits);
1357                 fcb.x[fcb.n]   = n + 5 * pos2;
1358                 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1359             }
1360         }
1361     }
1362     ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1363
1364     /* Calculate gain for adaptive & fixed codebook signal.
1365      * see ff_amr_set_fixed_gain(). */
1366     idx = get_bits(gb, 7);
1367     fcb_gain = expf(avpriv_scalarproduct_float_c(s->gain_pred_err,
1368                                                  gain_coeff, 6) -
1369                     5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1370     acb_gain = wmavoice_gain_codebook_acb[idx];
1371     pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1372                         -2.9957322736 /* log(0.05) */,
1373                          1.6094379124 /* log(5.0)  */);
1374
1375     gain_weight = 8 >> frame_desc->log_n_blocks;
1376     memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1377             sizeof(*s->gain_pred_err) * (6 - gain_weight));
1378     for (n = 0; n < gain_weight; n++)
1379         s->gain_pred_err[n] = pred_err;
1380
1381     /* Calculation of adaptive codebook */
1382     if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1383         int len;
1384         for (n = 0; n < size; n += len) {
1385             int next_idx_sh16;
1386             int abs_idx    = block_idx * size + n;
1387             int pitch_sh16 = (s->last_pitch_val << 16) +
1388                              s->pitch_diff_sh16 * abs_idx;
1389             int pitch      = (pitch_sh16 + 0x6FFF) >> 16;
1390             int idx_sh16   = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1391             idx            = idx_sh16 >> 16;
1392             if (s->pitch_diff_sh16) {
1393                 if (s->pitch_diff_sh16 > 0) {
1394                     next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1395                 } else
1396                     next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1397                 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
1398                               1, size - n);
1399             } else
1400                 len = size;
1401
1402             ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1403                                   wmavoice_ipol1_coeffs, 17,
1404                                   idx, 9, len);
1405         }
1406     } else /* ACB_TYPE_HAMMING */ {
1407         int block_pitch = block_pitch_sh2 >> 2;
1408         idx             = block_pitch_sh2 & 3;
1409         if (idx) {
1410             ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1411                                   wmavoice_ipol2_coeffs, 4,
1412                                   idx, 8, size);
1413         } else
1414             av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1415                               sizeof(float) * size);
1416     }
1417
1418     /* Interpolate ACB/FCB and use as excitation signal */
1419     ff_weighted_vector_sumf(excitation, excitation, pulses,
1420                             acb_gain, fcb_gain, size);
1421 }
1422
1423 /**
1424  * Parse data in a single block.
1425  *
1426  * @param s WMA Voice decoding context private data
1427  * @param gb bit I/O context
1428  * @param block_idx index of the to-be-read block
1429  * @param size amount of samples to be read in this block
1430  * @param block_pitch_sh2 pitch for this block << 2
1431  * @param lsps LSPs for (the end of) this frame
1432  * @param prev_lsps LSPs for the last frame
1433  * @param frame_desc frame type descriptor
1434  * @param excitation target memory for the ACB+FCB interpolated signal
1435  * @param synth target memory for the speech synthesis filter output
1436  * @return 0 on success, <0 on error.
1437  */
1438 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1439                         int block_idx, int size,
1440                         int block_pitch_sh2,
1441                         const double *lsps, const double *prev_lsps,
1442                         const struct frame_type_desc *frame_desc,
1443                         float *excitation, float *synth)
1444 {
1445     double i_lsps[MAX_LSPS];
1446     float lpcs[MAX_LSPS];
1447     float fac;
1448     int n;
1449
1450     if (frame_desc->acb_type == ACB_TYPE_NONE)
1451         synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1452     else
1453         synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1454                             frame_desc, excitation);
1455
1456     /* convert interpolated LSPs to LPCs */
1457     fac = (block_idx + 0.5) / frame_desc->n_blocks;
1458     for (n = 0; n < s->lsps; n++) // LSF -> LSP
1459         i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1460     ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1461
1462     /* Speech synthesis */
1463     ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1464 }
1465
/**
 * Synthesize output samples for a single frame.
 *
 * Reads the frame type from the bitstream, derives a pitch value per block
 * (depending on the frame's ACB type), decodes every block through
 * synth_block(), optionally post-filters the result, and finally caches
 * the state the next frame depends on (frame counter, last ACB type and
 * last pitch value).
 *
 * @param ctx WMA Voice decoder context
 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
 * @param frame_idx Frame number within superframe [0-2]
 * @param samples pointer to output sample buffer, has space for at least 160
 *                samples
 * @param lsps LSP array
 * @param prev_lsps array of previous frame's LSPs
 * @param excitation target buffer for excitation signal
 * @param synth target buffer for synthesized speech data
 * @return 0 on success, AVERROR_INVALIDDATA if the frame type code is
 *         invalid.
 */
static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
                       float *samples,
                       const double *lsps, const double *prev_lsps,
                       float *excitation, float *synth)
{
    WMAVoiceContext *s = ctx->priv_data;
    int n, n_blocks_x2, log_n_blocks_x2, av_uninit(cur_pitch_val);
    /* NOTE(review): pitch[] is only written on the ACB_TYPE_ASYMMETRIC and
     * ACB_TYPE_HAMMING paths below, yet pitch[0] is handed to postfilter()
     * for every frame type when s->do_apf is set. Presumably postfilter()
     * ignores it for frames without ACB - confirm against postfilter(). */
    int pitch[MAX_BLOCKS], av_uninit(last_block_pitch);

    /* Parse frame type ("frame header"), see frame_descs */
    int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)], block_nsamples;

    /* A negative tree entry marks a VLC code with no frame type assigned. */
    if (bd_idx < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Invalid frame type VLC code, skipping\n");
        return AVERROR_INVALIDDATA;
    }

    /* All blocks of a frame have the same length. */
    block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;

    /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
    if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
        /* Pitch is provided per frame, which is interpreted as the pitch of
         * the last sample of the last block of this frame. We can interpolate
         * the pitch of other blocks (and even pitch-per-sample) by gradually
         * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
        n_blocks_x2      = frame_descs[bd_idx].n_blocks << 1;
        log_n_blocks_x2  = frame_descs[bd_idx].log_n_blocks + 1;
        cur_pitch_val    = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
        cur_pitch_val    = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
        /* Do not interpolate from the previous pitch if the previous frame
         * had no adaptive codebook, or if the pitch changed by more than
         * 1/20 of the sum of both values: in that case the previous pitch
         * is replaced by the current one, so the interpolation is flat. */
        if (s->last_acb_type == ACB_TYPE_NONE ||
            20 * abs(cur_pitch_val - s->last_pitch_val) >
                (cur_pitch_val + s->last_pitch_val))
            s->last_pitch_val = cur_pitch_val;

        /* pitch per block: weighted mean of the previous and the current
         * pitch, with the weight (fac = 2n + 1 out of 2 * n_blocks)
         * corresponding to the centre of block n; the added n_blocks term
         * rounds the result before the shift (assuming
         * 1 << log_n_blocks == n_blocks - TODO confirm in frame_descs). */
        for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
            int fac = n * 2 + 1;

            pitch[n] = (MUL16(fac,                 cur_pitch_val) +
                        MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
                        frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
        }

        /* "pitch-diff-per-sample" for calculation of pitch per sample */
        s->pitch_diff_sh16 =
            (cur_pitch_val - s->last_pitch_val) * (1 << 16) / MAX_FRAMESIZE;
    }

    /* Global gain (if silence) and pitch-adaptive window coordinates */
    switch (frame_descs[bd_idx].fcb_type) {
    case FCB_TYPE_SILENCE:
        s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
        break;
    case FCB_TYPE_AW_PULSES:
        aw_parse_coords(s, gb, pitch);
        break;
    }

    /* Decode the blocks in bitstream order. */
    for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
        int bl_pitch_sh2;

        /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
        switch (frame_descs[bd_idx].acb_type) {
        case ACB_TYPE_HAMMING: {
            /* Pitch is given per block. Per-block pitches are encoded as an
             * absolute value for the first block, and then delta values
             * (relative to this value) for all subsequent blocks. The scale
             * of this pitch value is semi-logarithmic compared to its use in
             * the decoder, so we convert it to normal scale also. */
            /* t1/t2/t3: number of coded steps inside each of the three
             * segments delimited by s->block_conv_table[0..3]. */
            int block_pitch,
                t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
                t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
                t3 =  s->block_conv_table[3] - s->block_conv_table[2] + 1;

            if (n == 0) {
                block_pitch = get_bits(gb, s->block_pitch_nbits);
            } else
                block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
                                 get_bits(gb, s->block_delta_pitch_nbits);
            /* Convert last_ so that any next delta is within _range */
            last_block_pitch = av_clip(block_pitch,
                                       s->block_delta_pitch_hrange,
                                       s->block_pitch_range -
                                           s->block_delta_pitch_hrange);

            /* Convert semi-log-style scale back to normal scale: the first
             * segment advances the << 2 pitch by 1 per step, the second by
             * 2, the third by 4; anything beyond is pinned to
             * block_conv_table[3]. */
            if (block_pitch < t1) {
                bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
            } else {
                block_pitch -= t1;
                if (block_pitch < t2) {
                    bl_pitch_sh2 =
                        (s->block_conv_table[1] << 2) + (block_pitch << 1);
                } else {
                    block_pitch -= t2;
                    if (block_pitch < t3) {
                        bl_pitch_sh2 =
                            (s->block_conv_table[2] + block_pitch) << 2;
                    } else
                        bl_pitch_sh2 = s->block_conv_table[3] << 2;
                }
            }
            /* Keep the integer part for the postfilter / next frame. */
            pitch[n] = bl_pitch_sh2 >> 2;
            break;
        }

        case ACB_TYPE_ASYMMETRIC: {
            /* Per-block pitch was already interpolated above. */
            bl_pitch_sh2 = pitch[n] << 2;
            break;
        }

        default: // ACB_TYPE_NONE has no pitch
            bl_pitch_sh2 = 0;
            break;
        }

        synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
                    lsps, prev_lsps, &frame_descs[bd_idx],
                    &excitation[n * block_nsamples],
                    &synth[n * block_nsamples]);
    }

    /* Averaging projection filter, if applicable. Else, just copy samples
     * from synthesis buffer */
    if (s->do_apf) {
        double i_lsps[MAX_LSPS];
        float lpcs[MAX_LSPS];

        /* First 80 samples: LPCs from the mean of the previous and the
         * current frame's LSFs. */
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, synth, samples, 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
                   frame_descs[bd_idx].fcb_type, pitch[0]);

        /* Last 80 samples: LPCs from the current frame's LSFs only. */
        for (n = 0; n < s->lsps; n++) // LSF -> LSP
            i_lsps[n] = cos(lsps[n]);
        ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
        postfilter(s, &synth[80], &samples[80], 80, lpcs,
                   &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
                   frame_descs[bd_idx].fcb_type, pitch[0]);
    } else
        memcpy(samples, synth, 160 * sizeof(synth[0]));

    /* Cache values for next frame */
    s->frame_cntr++;
    if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
    s->last_acb_type = frame_descs[bd_idx].acb_type;
    /* The "last pitch" is whatever pitch applies at the end of this frame. */
    switch (frame_descs[bd_idx].acb_type) {
    case ACB_TYPE_NONE:
        s->last_pitch_val = 0;
        break;
    case ACB_TYPE_ASYMMETRIC:
        s->last_pitch_val = cur_pitch_val;
        break;
    case ACB_TYPE_HAMMING:
        s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
        break;
    }

    return 0;
}
1643
/**
 * Make an LSF vector usable: enforce a floor on the first value, a minimum
 * distance between neighbours and a ceiling on the last value, then restore
 * ascending order if the ceiling broke it.
 *
 * @param lsps array of LSPs, modified in place
 * @param num size of LSP array
 *
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
 *       useful to put in a generic location later on. Parts are also
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
 *       which is in float.
 */
static void stabilize_lsps(double *lsps, int num)
{
    const double first_floor = 0.0015 * M_PI;
    const double min_gap     = 0.0125 * M_PI;
    const double last_ceil   = 0.9985 * M_PI;
    int i, out_of_order = 0;

    /* Floor / spacing / ceiling pass. The comparisons are written so that
     * anything that is not strictly greater than the bound (NaN included)
     * is replaced by the bound, as a "a > b ? a : b" maximum would do. */
    if (!(lsps[0] > first_floor))
        lsps[0] = first_floor;
    for (i = 1; i < num; i++) {
        const double lowest_allowed = lsps[i - 1] + min_gap;

        if (!(lsps[i] > lowest_allowed))
            lsps[i] = lowest_allowed;
    }
    if (lsps[num - 1] > last_ceil)
        lsps[num - 1] = last_ceil;

    /* Only sort when at least one neighbouring pair is inverted. */
    for (i = 1; i < num && !out_of_order; i++)
        out_of_order = lsps[i] < lsps[i - 1];
    if (!out_of_order)
        return;

    /* Plain insertion sort over the whole vector. */
    for (i = 1; i < num; i++) {
        const double key = lsps[i];
        int slot = i;

        while (slot > 0 && !(lsps[slot - 1] <= key)) {
            lsps[slot] = lsps[slot - 1];
            slot--;
        }
        lsps[slot] = key;
    }
}
1684
1685 /**
1686  * Synthesize output samples for a single superframe. If we have any data
1687  * cached in s->sframe_cache, that will be used instead of whatever is loaded
1688  * in s->gb.
1689  *
1690  * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1691  * to give a total of 480 samples per frame. See #synth_frame() for frame
1692  * parsing. In addition to 3 frames, superframes can also contain the LSPs
1693  * (if these are globally specified for all frames (residually); they can
1694  * also be specified individually per-frame. See the s->has_residual_lsps
1695  * option), and can specify the number of samples encoded in this superframe
1696  * (if less than 480), usually used to prevent blanks at track boundaries.
1697  *
1698  * @param ctx WMA Voice decoder context
1699  * @return 0 on success, <0 on error or 1 if there was not enough data to
1700  *         fully parse the superframe
1701  */
1702 static int synth_superframe(AVCodecContext *ctx, AVFrame *frame,
1703                             int *got_frame_ptr)
1704 {
1705     WMAVoiceContext *s = ctx->priv_data;
1706     GetBitContext *gb = &s->gb, s_gb;
1707     int n, res, n_samples = MAX_SFRAMESIZE;
1708     double lsps[MAX_FRAMES][MAX_LSPS];
1709     const double *mean_lsf = s->lsps == 16 ?
1710         wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1711     float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1712     float synth[MAX_LSPS + MAX_SFRAMESIZE];
1713     float *samples;
1714
1715     memcpy(synth,      s->synth_history,
1716            s->lsps             * sizeof(*synth));
1717     memcpy(excitation, s->excitation_history,
1718            s->history_nsamples * sizeof(*excitation));
1719
1720     if (s->sframe_cache_size > 0) {
1721         gb = &s_gb;
1722         init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1723         s->sframe_cache_size = 0;
1724     }
1725
1726     /* First bit is speech/music bit, it differentiates between WMAVoice
1727      * speech samples (the actual codec) and WMAVoice music samples, which
1728      * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1729      * the wild yet. */
1730     if (!get_bits1(gb)) {
1731         avpriv_request_sample(ctx, "WMAPro-in-WMAVoice");
1732         return AVERROR_PATCHWELCOME;
1733     }
1734
1735     /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1736     if (get_bits1(gb)) {
1737         if ((n_samples = get_bits(gb, 12)) > MAX_SFRAMESIZE) {
1738             av_log(ctx, AV_LOG_ERROR,
1739                    "Superframe encodes > %d samples (%d), not allowed\n",
1740                    MAX_SFRAMESIZE, n_samples);
1741             return AVERROR_INVALIDDATA;
1742         }
1743     }
1744
1745     /* Parse LSPs, if global for the superframe (can also be per-frame). */
1746     if (s->has_residual_lsps) {
1747         double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1748
1749         for (n = 0; n < s->lsps; n++)
1750             prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1751
1752         if (s->lsps == 10) {
1753             dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1754         } else /* s->lsps == 16 */
1755             dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1756
1757         for (n = 0; n < s->lsps; n++) {
1758             lsps[0][n]  = mean_lsf[n] + (a1[n]           - a2[n * 2]);
1759             lsps[1][n]  = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1760             lsps[2][n] += mean_lsf[n];
1761         }
1762         for (n = 0; n < 3; n++)
1763             stabilize_lsps(lsps[n], s->lsps);
1764     }
1765
1766     /* synth_superframe can run multiple times per packet
1767      * free potential previous frame */
1768     av_frame_unref(frame);
1769
1770     /* get output buffer */
1771     frame->nb_samples = MAX_SFRAMESIZE;
1772     if ((res = ff_get_buffer(ctx, frame, 0)) < 0)
1773         return res;
1774     frame->nb_samples = n_samples;
1775     samples = (float *)frame->data[0];
1776
1777     /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1778     for (n = 0; n < 3; n++) {
1779         if (!s->has_residual_lsps) {
1780             int m;
1781
1782             if (s->lsps == 10) {
1783                 dequant_lsp10i(gb, lsps[n]);
1784             } else /* s->lsps == 16 */
1785                 dequant_lsp16i(gb, lsps[n]);
1786
1787             for (m = 0; m < s->lsps; m++)
1788                 lsps[n][m] += mean_lsf[m];
1789             stabilize_lsps(lsps[n], s->lsps);
1790         }
1791
1792         if ((res = synth_frame(ctx, gb, n,
1793                                &samples[n * MAX_FRAMESIZE],
1794                                lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1795                                &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1796                                &synth[s->lsps + n * MAX_FRAMESIZE]))) {
1797             *got_frame_ptr = 0;
1798             return res;
1799         }
1800     }
1801
1802     /* Statistics? FIXME - we don't check for length, a slight overrun
1803      * will be caught by internal buffer padding, and anything else
1804      * will be skipped, not read. */
1805     if (get_bits1(gb)) {
1806         res = get_bits(gb, 4);
1807         skip_bits(gb, 10 * (res + 1));
1808     }
1809
1810     if (get_bits_left(gb) < 0) {
1811         wmavoice_flush(ctx);
1812         return AVERROR_INVALIDDATA;
1813     }
1814
1815     *got_frame_ptr = 1;
1816
1817     /* Update history */
1818     memcpy(s->prev_lsps,           lsps[2],
1819            s->lsps             * sizeof(*s->prev_lsps));
1820     memcpy(s->synth_history,      &synth[MAX_SFRAMESIZE],
1821            s->lsps             * sizeof(*synth));
1822     memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1823            s->history_nsamples * sizeof(*excitation));
1824     if (s->do_apf)
1825         memmove(s->zero_exc_pf,       &s->zero_exc_pf[MAX_SFRAMESIZE],
1826                 s->history_nsamples * sizeof(*s->zero_exc_pf));
1827
1828     return 0;
1829 }
1830
1831 /**
1832  * Parse the packet header at the start of each packet (input data to this
1833  * decoder).
1834  *
1835  * @param s WMA Voice decoding context private data
1836  * @return <0 on error, nb_superframes on success.
1837  */
1838 static int parse_packet_header(WMAVoiceContext *s)
1839 {
1840     GetBitContext *gb = &s->gb;
1841     unsigned int res, n_superframes = 0;
1842
1843     skip_bits(gb, 4);          // packet sequence number
1844     s->has_residual_lsps = get_bits1(gb);
1845     do {
1846         if (get_bits_left(gb) < 6 + s->spillover_bitsize)
1847             return AVERROR_INVALIDDATA;
1848
1849         res = get_bits(gb, 6); // number of superframes per packet
1850                                // (minus first one if there is spillover)
1851         n_superframes += res;
1852     } while (res == 0x3F);
1853     s->spillover_nbits   = get_bits(gb, s->spillover_bitsize);
1854
1855     return get_bits_left(gb) >= 0 ? n_superframes : AVERROR_INVALIDDATA;
1856 }
1857
1858 /**
1859  * Copy (unaligned) bits from gb/data/size to pb.
1860  *
1861  * @param pb target buffer to copy bits into
1862  * @param data source buffer to copy bits from
1863  * @param size size of the source data, in bytes
1864  * @param gb bit I/O context specifying the current position in the source.
1865  *           data. This function might use this to align the bit position to
1866  *           a whole-byte boundary before calling #avpriv_copy_bits() on aligned
1867  *           source data
1868  * @param nbits the amount of bits to copy from source to target
1869  *
1870  * @note after calling this function, the current position in the input bit
1871  *       I/O context is undefined.
1872  */
1873 static void copy_bits(PutBitContext *pb,
1874                       const uint8_t *data, int size,
1875                       GetBitContext *gb, int nbits)
1876 {
1877     int rmn_bytes, rmn_bits;
1878
1879     rmn_bits = rmn_bytes = get_bits_left(gb);
1880     if (rmn_bits < nbits)
1881         return;
1882     if (nbits > pb->size_in_bits - put_bits_count(pb))
1883         return;
1884     rmn_bits &= 7; rmn_bytes >>= 3;
1885     if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1886         put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1887     avpriv_copy_bits(pb, data + size - rmn_bytes,
1888                  FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1889 }
1890
/**
 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
 * and we expect that the demuxer / application provides it to us as such
 * (else you'll probably get garbage as output). Every packet has a size of
 * ctx->block_align bytes, starts with a packet header (see
 * #parse_packet_header()), and then a series of superframes. Superframe
 * boundaries may exceed packets, i.e. superframes can split data over
 * multiple (two) packets.
 *
 * For more information about frames, see #synth_superframe().
 *
 * Each call decodes at most one superframe. State that has to survive
 * between calls (remaining superframe count, bit offset into the next
 * byte, cached bits of a split superframe) lives in the private context.
 *
 * @param ctx WMA Voice decoder context
 * @param data output frame, handed on unchanged to #synth_superframe()
 * @param got_frame_ptr set to non-zero by #synth_superframe() when a
 *                      superframe was decoded; cleared here when there is
 *                      nothing left to decode in the packet
 * @param avpkt input packet
 * @return number of bytes consumed (the capped packet size, or the byte
 *         position reached by the bit reader after a decoded superframe),
 *         <0 on error
 */
static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
                                  int *got_frame_ptr, AVPacket *avpkt)
{
    WMAVoiceContext *s = ctx->priv_data;
    GetBitContext *gb = &s->gb;
    int size, res, pos;

    /* Packets are sometimes a multiple of ctx->block_align, with a packet
     * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
     * feeds us ASF packets, which may concatenate multiple "codec" packets
     * in a single "muxer" packet, so we artificially emulate that by
     * capping the packet size at ctx->block_align. */
    /* NOTE(review): this loop and the modulo below rely on
     * ctx->block_align > 0; presumably enforced at init - confirm. */
    for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
    init_get_bits8(&s->gb, avpkt->data, size);

    /* size == ctx->block_align is used to indicate whether we are dealing with
     * a new packet or a packet of which we already read the packet header
     * previously. */
    if (!(size % ctx->block_align)) { // new packet header
        /* An empty packet carries no header: nothing to decode, no
         * spillover. */
        if (!size) {
            s->spillover_nbits = 0;
            s->nb_superframes = 0;
        } else {
            if ((res = parse_packet_header(s)) < 0)
                return res;
            s->nb_superframes = res;
        }

        /* If the packet header specifies a s->spillover_nbits, then we want
         * to push out all data of the previous packet (+ spillover) before
         * continuing to parse new superframes in the current packet. */
        if (s->sframe_cache_size > 0) {
            int cnt = get_bits_count(gb);
            /* Never take more spillover than the packet actually holds. */
            if (cnt + s->spillover_nbits > avpkt->size * 8) {
                s->spillover_nbits = avpkt->size * 8 - cnt;
            }
            /* Append the spillover bits to the cached start of the
             * superframe and decode it from the cache. */
            copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
            flush_put_bits(&s->pb);
            s->sframe_cache_size += s->spillover_nbits;
            if ((res = synth_superframe(ctx, data, got_frame_ptr)) == 0 &&
                *got_frame_ptr) {
                /* Report header + spillover as consumed; the sub-byte
                 * remainder is skipped at the start of the next call. */
                cnt += s->spillover_nbits;
                s->skip_bits_next = cnt & 7;
                res = cnt >> 3;
                return res;
            } else
                skip_bits_long (gb, s->spillover_nbits - cnt +
                                get_bits_count(gb)); // resync
        } else if (s->spillover_nbits) {
            /* Nothing cached to complete: the spillover bits are useless. */
            skip_bits_long(gb, s->spillover_nbits);  // resync
        }
    } else if (s->skip_bits_next)
        /* Continuation of a packet: realign to the bit position where the
         * previous call stopped. */
        skip_bits(gb, s->skip_bits_next);

    /* Try parsing superframes in current packet */
    s->sframe_cache_size = 0;
    s->skip_bits_next = 0;
    pos = get_bits_left(gb);
    /* Post-decrement: the test sees the count from before this call. */
    if (s->nb_superframes-- == 0) {
        /* No superframes left in this packet. */
        *got_frame_ptr = 0;
        return size;
    } else if (s->nb_superframes > 0) {
        /* More superframes follow after this one, so it is decoded
         * directly from the packet. */
        if ((res = synth_superframe(ctx, data, got_frame_ptr)) < 0) {
            return res;
        } else if (*got_frame_ptr) {
            int cnt = get_bits_count(gb);
            s->skip_bits_next = cnt & 7;
            res = cnt >> 3;
            return res;
        }
    } else if ((s->sframe_cache_size = pos) > 0) {
        /* This was the last superframe announced for the packet: whatever
         * bits remain are not decoded now ... */
        /* ... cache it for spillover in next packet */
        init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
        copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
        // FIXME bad - just copy bytes as whole and add use the
        // skip_bits_next field
    }

    return size;
}
1982
1983 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1984 {
1985     WMAVoiceContext *s = ctx->priv_data;
1986
1987     if (s->do_apf) {
1988         ff_rdft_end(&s->rdft);
1989         ff_rdft_end(&s->irdft);
1990         ff_dct_end(&s->dct);
1991         ff_dct_end(&s->dst);
1992     }
1993
1994     return 0;
1995 }
1996
1997 AVCodec ff_wmavoice_decoder = {
1998     .name             = "wmavoice",
1999     .long_name        = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),
2000     .type             = AVMEDIA_TYPE_AUDIO,
2001     .id               = AV_CODEC_ID_WMAVOICE,
2002     .priv_data_size   = sizeof(WMAVoiceContext),
2003     .init             = wmavoice_decode_init,
2004     .close            = wmavoice_decode_end,
2005     .decode           = wmavoice_decode_packet,
2006     .capabilities     = AV_CODEC_CAP_SUBFRAMES | AV_CODEC_CAP_DR1 | AV_CODEC_CAP_DELAY,
2007     .caps_internal    = FF_CODEC_CAP_INIT_CLEANUP,
2008     .flush            = wmavoice_flush,
2009 };