2 * Windows Media Audio Voice decoder.
3 * Copyright (c) 2009 Ronald S. Bultje
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 * @brief Windows Media Audio Voice compatible decoder
25 * @author Ronald S. Bultje <rsbultje@gmail.com>
32 #include "wmavoice_data.h"
33 #include "celp_math.h"
34 #include "celp_filters.h"
35 #include "acelp_vectors.h"
36 #include "acelp_filters.h"
38 #include "libavutil/lzo.h"
43 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame
44 #define MAX_LSPS 16 ///< maximum filter order
45 #define MAX_LSPS_ALIGN16 16 ///< same as #MAX_LSPS; needs to be multiple
46 ///< of 16 for ASM input buffer alignment
47 #define MAX_FRAMES 3 ///< maximum number of frames per superframe
48 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame
49 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history
50 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
51 ///< maximum number of samples per superframe
52 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
53 ///< was split over two packets
54 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration
57 * Frame type VLC coding.
59 static VLC frame_type_vlc;
62 * Adaptive codebook types.
65 ACB_TYPE_NONE = 0, ///< no adaptive codebook (only hardcoded fixed)
66 ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with per-frame pitch, which
67 ///< we interpolate to get a per-sample pitch.
68 ///< Signal is generated using an asymmetric sinc
70 ///< @note see #wmavoice_ipol1_coeffs
71 ACB_TYPE_HAMMING = 2 ///< Per-block pitch with signal generation using
72 ///< a Hamming sinc window function
73 ///< @note see #wmavoice_ipol2_coeffs
77 * Fixed codebook types.
80 FCB_TYPE_SILENCE = 0, ///< comfort noise during silence
81 ///< generated from a hardcoded (fixed) codebook
82 ///< with per-frame (low) gain values
83 FCB_TYPE_HARDCODED = 1, ///< hardcoded (fixed) codebook with per-block
85 FCB_TYPE_AW_PULSES = 2, ///< Pitch-adaptive window (AW) pulse signals,
86 ///< used in particular for low-bitrate streams
87 FCB_TYPE_EXC_PULSES = 3, ///< Innovation (fixed) codebook pulse sets in
88 ///< combinations of either single pulses or
93 * Description of frame types.
95 static const struct frame_type_desc {
96 uint8_t n_blocks; ///< amount of blocks per frame (each block
97 ///< (contains 160/#n_blocks samples)
98 uint8_t log_n_blocks; ///< log2(#n_blocks)
99 uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
100 uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
101 uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
102 ///< (rather than just one single pulse)
103 ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
104 uint16_t frame_size; ///< the amount of bits that make up the block
105 ///< data (per frame)
106 } frame_descs[17] = {
107 { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
108 { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
109 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
110 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
111 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
112 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
113 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
114 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
115 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
116 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
117 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
118 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
119 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
120 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
121 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
122 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
123 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
127 * WMA Voice decoding context.
131 * @defgroup struct_global Global values
132 * Global values, specified in the stream header / extradata or used
136 GetBitContext gb; ///< packet bitreader. During decoder init,
137 ///< it contains the extradata from the
138 ///< demuxer. During decoding, it contains
140 int8_t vbm_tree[25]; ///< converts VLC codes to frame type
142 int spillover_bitsize; ///< number of bits used to specify
143 ///< #spillover_nbits in the packet header
144 ///< = ceil(log2(ctx->block_align << 3))
145 int history_nsamples; ///< number of samples in history for signal
146 ///< prediction (through ACB)
148 /* postfilter specific values */
149 int do_apf; ///< whether to apply the averaged
150 ///< projection filter (APF)
151 int denoise_strength; ///< strength of denoising in Wiener filter
153 int denoise_tilt_corr; ///< Whether to apply tilt correction to the
154 ///< Wiener filter coefficients (postfilter)
155 int dc_level; ///< Predicted amount of DC noise, based
156 ///< on which a DC removal filter is used
158 int lsps; ///< number of LSPs per frame [10 or 16]
159 int lsp_q_mode; ///< defines quantizer defaults [0, 1]
160 int lsp_def_mode; ///< defines different sets of LSP defaults
162 int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
163 ///< per-frame (independent coding)
164 int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
165 ///< per superframe (residual coding)
167 int min_pitch_val; ///< base value for pitch parsing code
168 int max_pitch_val; ///< max value + 1 for pitch parsing
169 int pitch_nbits; ///< number of bits used to specify the
170 ///< pitch value in the frame header
171 int block_pitch_nbits; ///< number of bits used to specify the
172 ///< first block's pitch value
173 int block_pitch_range; ///< range of the block pitch
174 int block_delta_pitch_nbits; ///< number of bits used to specify the
175 ///< delta pitch between this and the last
176 ///< block's pitch value, used in all but
178 int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
179 ///< from -this to +this-1)
180 uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
185 * @defgroup struct_packet Packet values
186 * Packet values, specified in the packet header or related to a packet.
187 * A packet is considered to be a single unit of data provided to this
188 * decoder by the demuxer.
191 int spillover_nbits; ///< number of bits of the previous packet's
192 ///< last superframe preceeding this
193 ///< packet's first full superframe (useful
194 ///< for re-synchronization also)
195 int has_residual_lsps; ///< if set, superframes contain one set of
196 ///< LSPs that cover all frames, encoded as
197 ///< independent and residual LSPs; if not
198 ///< set, each frame contains its own, fully
199 ///< independent, LSPs
200 int skip_bits_next; ///< number of bits to skip at the next call
201 ///< to #wmavoice_decode_packet() (since
202 ///< they're part of the previous superframe)
204 uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
205 ///< cache for superframe data split over
206 ///< multiple packets
207 int sframe_cache_size; ///< set to >0 if we have data from an
208 ///< (incomplete) superframe from a previous
209 ///< packet that spilled over in the current
210 ///< packet; specifies the amount of bits in
212 PutBitContext pb; ///< bitstream writer for #sframe_cache
216 * @defgroup struct_frame Frame and superframe values
217 * Superframe and frame data - these can change from frame to frame,
218 * although some of them do in that case serve as a cache / history for
219 * the next frame or superframe.
222 double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
224 int last_pitch_val; ///< pitch value of the previous frame
225 int last_acb_type; ///< frame type [0-2] of the previous frame
226 int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
227 ///< << 16) / #MAX_FRAMESIZE
228 float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
230 int aw_idx_is_ext; ///< whether the AW index was encoded in
231 ///< 8 bits (instead of 6)
232 int aw_pulse_range; ///< the range over which #aw_pulse_set1()
233 ///< can apply the pulse, relative to the
234 ///< value in aw_first_pulse_off. The exact
235 ///< position of the first AW-pulse is within
236 ///< [pulse_off, pulse_off + this], and
237 ///< depends on bitstream values; [16 or 24]
238 int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
239 ///< that this number can be negative (in
240 ///< which case it basically means "zero")
241 int aw_first_pulse_off[2]; ///< index of first sample to which to
242 ///< apply AW-pulses, or -0xff if unset
243 int aw_next_pulse_off_cache; ///< the position (relative to start of the
244 ///< second block) at which pulses should
245 ///< start to be positioned, serves as a
246 ///< cache for pitch-adaptive window pulses
249 int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
250 ///< only used for comfort noise in #pRNG()
251 float gain_pred_err[6]; ///< cache for gain prediction
252 float excitation_history[MAX_SIGNAL_HISTORY];
253 ///< cache of the signal of previous
254 ///< superframes, used as a history for
255 ///< signal generation
256 float synth_history[MAX_LSPS]; ///< see #excitation_history
259 * @defgroup post_filter Postfilter values
260 * Variables used for postfilter implementation, mostly history for
261 * smoothing and so on, and context variables for FFT/iFFT.
264 RDFTContext rdft, irdft; ///< contexts for FFT-calculation in the
265 ///< postfilter (for denoise filter)
266 DCTContext dct, dst; ///< contexts for phase shift (in Hilbert
267 ///< transform, part of postfilter)
268 float sin[511], cos[511]; ///< 8-bit cosine/sine windows over [-pi,pi]
270 float postfilter_agc; ///< gain control memory, used in
271 ///< #adaptive_gain_control()
272 float dcf_mem[2]; ///< DC filter history
273 float zero_exc_pf[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE];
274 ///< zero filter output (i.e. excitation)
276 float denoise_filter_cache[MAX_FRAMESIZE];
277 int denoise_filter_cache_size; ///< samples in #denoise_filter_cache
278 DECLARE_ALIGNED(16, float, tilted_lpcs_pf)[0x80];
279 ///< aligned buffer for LPC tilting
280 DECLARE_ALIGNED(16, float, denoise_coeffs_pf)[0x80];
281 ///< aligned buffer for denoise coefficients
282 DECLARE_ALIGNED(16, float, synth_filter_out_buf)[0x80 + MAX_LSPS_ALIGN16];
283 ///< aligned buffer for postfilter speech
291 * Set up the variable bit mode (VBM) tree from container extradata.
292 * @param gb bit I/O context.
293 * The bit context (s->gb) should be loaded with byte 23-46 of the
294 * container extradata (i.e. the ones containing the VBM tree).
295 * @param vbm_tree pointer to array to which the decoded VBM tree will be
297 * @return 0 on success, <0 on error.
299 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
301 static const uint8_t bits[] = {
304 10, 10, 10, 12, 12, 12,
307 static const uint16_t codes[] = {
308 0x0000, 0x0001, 0x0002, // 00/01/10
309 0x000c, 0x000d, 0x000e, // 11+00/01/10
310 0x003c, 0x003d, 0x003e, // 1111+00/01/10
311 0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
312 0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
313 0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
314 0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
318 memset(vbm_tree, 0xff, sizeof(vbm_tree));
319 memset(cntr, 0, sizeof(cntr));
320 for (n = 0; n < 17; n++) {
321 res = get_bits(gb, 3);
322 if (cntr[res] > 3) // should be >= 3 + (res == 7))
324 vbm_tree[res * 3 + cntr[res]++] = n;
326 INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
327 bits, 1, 1, codes, 2, 2, 132);
332 * Set up decoder with parameters from demuxer (extradata etc.).
334 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
336 int n, flags, pitch_range, lsp16_flag;
337 WMAVoiceContext *s = ctx->priv_data;
341 * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
342 * - byte 19-22: flags field (annoyingly in LE; see below for known
344 * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
347 if (ctx->extradata_size != 46) {
348 av_log(ctx, AV_LOG_ERROR,
349 "Invalid extradata size %d (should be 46)\n",
350 ctx->extradata_size);
353 flags = AV_RL32(ctx->extradata + 18);
354 s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
355 s->do_apf = flags & 0x1;
357 ff_rdft_init(&s->rdft, 7, DFT_R2C);
358 ff_rdft_init(&s->irdft, 7, IDFT_C2R);
359 ff_dct_init(&s->dct, 6, DCT_I);
360 ff_dct_init(&s->dst, 6, DST_I);
362 ff_sine_window_init(s->cos, 256);
363 memcpy(&s->sin[255], s->cos, 256 * sizeof(s->cos[0]));
364 for (n = 0; n < 255; n++) {
365 s->sin[n] = -s->sin[510 - n];
366 s->cos[510 - n] = s->cos[n];
369 s->denoise_strength = (flags >> 2) & 0xF;
370 if (s->denoise_strength >= 12) {
371 av_log(ctx, AV_LOG_ERROR,
372 "Invalid denoise filter strength %d (max=11)\n",
373 s->denoise_strength);
376 s->denoise_tilt_corr = !!(flags & 0x40);
377 s->dc_level = (flags >> 7) & 0xF;
378 s->lsp_q_mode = !!(flags & 0x2000);
379 s->lsp_def_mode = !!(flags & 0x4000);
380 lsp16_flag = flags & 0x1000;
383 s->frame_lsp_bitsize = 34;
384 s->sframe_lsp_bitsize = 60;
387 s->frame_lsp_bitsize = 24;
388 s->sframe_lsp_bitsize = 48;
390 for (n = 0; n < s->lsps; n++)
391 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
393 init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
394 if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
395 av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
399 s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
400 s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
401 pitch_range = s->max_pitch_val - s->min_pitch_val;
402 s->pitch_nbits = av_ceil_log2(pitch_range);
403 s->last_pitch_val = 40;
404 s->last_acb_type = ACB_TYPE_NONE;
405 s->history_nsamples = s->max_pitch_val + 8;
407 if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
408 int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
409 max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
411 av_log(ctx, AV_LOG_ERROR,
412 "Unsupported samplerate %d (min=%d, max=%d)\n",
413 ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
418 s->block_conv_table[0] = s->min_pitch_val;
419 s->block_conv_table[1] = (pitch_range * 25) >> 6;
420 s->block_conv_table[2] = (pitch_range * 44) >> 6;
421 s->block_conv_table[3] = s->max_pitch_val - 1;
422 s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
423 s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
424 s->block_pitch_range = s->block_conv_table[2] +
425 s->block_conv_table[3] + 1 +
426 2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
427 s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
429 ctx->sample_fmt = AV_SAMPLE_FMT_FLT;
435 * @defgroup postfilter Postfilter functions
436 * Postfilter functions (gain control, wiener denoise filter, DC filter,
437 * kalman smoothening, plus surrounding code to wrap it)
/**
 * Adaptive gain control (as used in postfilter).
 *
 * Identical to #ff_adaptive_gain_control() in acelp_vectors.c, except
 * that the energy here is calculated using sum(abs(...)), whereas the
 * other codecs (e.g. AMR-NB, SIPRO) use sqrt(dotproduct(...)).
 *
 * Each output sample is the input sample scaled by a gain that is
 * smoothed exponentially towards speech_energy / postfilter_energy.
 * If the postfiltered input has zero energy, the target contribution is
 * taken as zero so that the gain memory stays finite.
 *
 * @param out output buffer for filtered samples
 * @param in input buffer containing the samples as they are after the
 *           postfilter steps so far
 * @param speech_synth input buffer containing speech synth before postfilter
 * @param size input buffer size
 * @param alpha exponential filter factor
 * @param gain_mem pointer to filter memory (single float); read on entry
 *                 and updated with the final gain on return
 */
static void adaptive_gain_control(float *out, const float *in,
                                  const float *speech_synth,
                                  int size, float alpha, float *gain_mem)
{
    int i;
    float speech_energy = 0.0, postfilter_energy = 0.0, gain_scale_factor;
    float mem = *gain_mem;

    for (i = 0; i < size; i++) {
        speech_energy     += fabsf(speech_synth[i]);
        postfilter_energy += fabsf(in[i]);
    }
    /* avoid 0/0 or x/0: an all-zero input would otherwise poison the
     * gain memory with NaN/Inf for all following calls */
    gain_scale_factor = postfilter_energy == 0.0 ? 0.0 :
                        (1.0 - alpha) * speech_energy / postfilter_energy;

    for (i = 0; i < size; i++) {
        mem = alpha * mem + gain_scale_factor;
        out[i] = in[i] * mem;
    }

    *gain_mem = mem;
}
478 * Kalman smoothing function.
480 * This function looks back pitch +/- 3 samples back into history to find
481 * the best fitting curve (that one giving the optimal gain of the two
482 * signals, i.e. the highest dot product between the two), and then
483 * uses that signal history to smoothen the output of the speech synthesis
486 * @param s WMA Voice decoding context
487 * @param pitch pitch of the speech signal
488 * @param in input speech signal
489 * @param out output pointer for smoothened signal
490 * @param size input/output buffer size
492 * @returns -1 if no smoothening took place, e.g. because no optimal
493 * fit could be found, or 0 on success.
495 static int kalman_smoothen(WMAVoiceContext *s, int pitch,
496 const float *in, float *out, int size)
499 float optimal_gain = 0, dot;
500 const float *ptr = &in[-FFMAX(s->min_pitch_val, pitch - 3)],
501 *end = &in[-FFMIN(s->max_pitch_val, pitch + 3)],
504 /* find best fitting point in history */
506 dot = ff_dot_productf(in, ptr, size);
507 if (dot > optimal_gain) {
511 } while (--ptr >= end);
513 if (optimal_gain <= 0)
515 dot = ff_dot_productf(best_hist_ptr, best_hist_ptr, size);
516 if (dot <= 0) // would be 1.0
519 if (optimal_gain <= dot) {
520 dot = dot / (dot + 0.6 * optimal_gain); // 0.625-1.000
524 /* actual smoothing */
525 for (n = 0; n < size; n++)
526 out[n] = best_hist_ptr[n] + dot * (in[n] - best_hist_ptr[n]);
532 * Get the tilt factor of a formant filter from its transfer function
533 * @see #tilt_factor() in amrnbdec.c, which does essentially the same,
534 * but somehow (??) it does a speech synthesis filter in the
535 * middle, which is missing here
537 * @param lpcs LPC coefficients
538 * @param n_lpcs Size of LPC buffer
539 * @returns the tilt factor
541 static float tilt_factor(const float *lpcs, int n_lpcs)
545 rh0 = 1.0 + ff_dot_productf(lpcs, lpcs, n_lpcs);
546 rh1 = lpcs[0] + ff_dot_productf(lpcs, &lpcs[1], n_lpcs - 1);
552 * Derive denoise filter coefficients (in real domain) from the LPCs.
554 static void calc_input_response(WMAVoiceContext *s, float *lpcs,
555 int fcb_type, float *coeffs, int remainder)
557 float last_coeff, min = 15.0, max = -15.0;
558 float irange, angle_mul, gain_mul, range, sq;
561 /* Create frequency power spectrum of speech input (i.e. RDFT of LPCs) */
562 s->rdft.rdft_calc(&s->rdft, lpcs);
563 #define log_range(var, assign) do { \
564 float tmp = log10f(assign); var = tmp; \
565 max = FFMAX(max, tmp); min = FFMIN(min, tmp); \
567 log_range(last_coeff, lpcs[1] * lpcs[1]);
568 for (n = 1; n < 64; n++)
569 log_range(lpcs[n], lpcs[n * 2] * lpcs[n * 2] +
570 lpcs[n * 2 + 1] * lpcs[n * 2 + 1]);
571 log_range(lpcs[0], lpcs[0] * lpcs[0]);
574 lpcs[64] = last_coeff;
576 /* Now, use this spectrum to pick out these frequencies with higher
577 * (relative) power/energy (which we then take to be "not noise"),
578 * and set up a table (still in lpc[]) of (relative) gains per frequency.
579 * These frequencies will be maintained, while others ("noise") will be
580 * decreased in the filter output. */
581 irange = 64.0 / range; // so irange*(max-value) is in the range [0, 63]
582 gain_mul = range * (fcb_type == FCB_TYPE_HARDCODED ? (5.0 / 13.0) :
584 angle_mul = gain_mul * (8.0 * M_LN10 / M_PI);
585 for (n = 0; n <= 64; n++) {
588 idx = FFMAX(0, lrint((max - lpcs[n]) * irange) - 1);
589 pwr = wmavoice_denoise_power_table[s->denoise_strength][idx];
590 lpcs[n] = angle_mul * pwr;
592 /* 70.57 =~ 1/log10(1.0331663) */
593 idx = (pwr * gain_mul - 0.0295) * 70.570526123;
594 if (idx > 127) { // fallback if index falls outside table range
595 coeffs[n] = wmavoice_energy_table[127] *
596 powf(1.0331663, idx - 127);
598 coeffs[n] = wmavoice_energy_table[FFMAX(0, idx)];
601 /* calculate the Hilbert transform of the gains, which we do (since this
602 * is a sinus input) by doing a phase shift (in theory, H(sin())=cos()).
603 * Hilbert_Transform(RDFT(x)) = Laplace_Transform(x), which calculates the
604 * "moment" of the LPCs in this filter. */
605 s->dct.dct_calc(&s->dct, lpcs);
606 s->dst.dct_calc(&s->dst, lpcs);
608 /* Split out the coefficient indexes into phase/magnitude pairs */
609 idx = 255 + av_clip(lpcs[64], -255, 255);
610 coeffs[0] = coeffs[0] * s->cos[idx];
611 idx = 255 + av_clip(lpcs[64] - 2 * lpcs[63], -255, 255);
612 last_coeff = coeffs[64] * s->cos[idx];
614 idx = 255 + av_clip(-lpcs[64] - 2 * lpcs[n - 1], -255, 255);
615 coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
616 coeffs[n * 2] = coeffs[n] * s->cos[idx];
620 idx = 255 + av_clip( lpcs[64] - 2 * lpcs[n - 1], -255, 255);
621 coeffs[n * 2 + 1] = coeffs[n] * s->sin[idx];
622 coeffs[n * 2] = coeffs[n] * s->cos[idx];
624 coeffs[1] = last_coeff;
626 /* move into real domain */
627 s->irdft.rdft_calc(&s->irdft, coeffs);
629 /* tilt correction and normalize scale */
630 memset(&coeffs[remainder], 0, sizeof(coeffs[0]) * (128 - remainder));
631 if (s->denoise_tilt_corr) {
634 coeffs[remainder - 1] = 0;
635 ff_tilt_compensation(&tilt_mem,
636 -1.8 * tilt_factor(coeffs, remainder - 1),
639 sq = (1.0 / 64.0) * sqrtf(1 / ff_dot_productf(coeffs, coeffs, remainder));
640 for (n = 0; n < remainder; n++)
645 * This function applies a Wiener filter on the (noisy) speech signal as
646 * a means to denoise it.
648 * - take RDFT of LPCs to get the power spectrum of the noise + speech;
649 * - using this power spectrum, calculate (for each frequency) the Wiener
650 * filter gain, which depends on the frequency power and desired level
651 * of noise subtraction (when set too high, this leads to artifacts)
652 * We can do this symmetrically over the X-axis (so 0-4kHz is the inverse
654 * - by doing a phase shift, calculate the Hilbert transform of this array
655 * of per-frequency filter-gains to get the filtering coefficients;
656 * - smoothen/normalize/de-tilt these filter coefficients as desired;
657 * - take RDFT of noisy sound, apply the coefficients and take its IRDFT
658 * to get the denoised speech signal;
659 * - the leftover (i.e. output of the IRDFT on denoised speech data beyond
660 * the frame boundary) are saved and applied to subsequent frames by an
661 * overlap-add method (otherwise you get clicking-artifacts).
663 * @param s WMA Voice decoding context
664 * @param fcb_type Frame (codebook) type
665 * @param synth_pf input: the noisy speech signal, output: denoised speech
666 * data; should be 16-byte aligned (for ASM purposes)
667 * @param size size of the speech data
668 * @param lpcs LPCs used to synthesize this frame's speech data
670 static void wiener_denoise(WMAVoiceContext *s, int fcb_type,
671 float *synth_pf, int size,
674 int remainder, lim, n;
676 if (fcb_type != FCB_TYPE_SILENCE) {
677 float *tilted_lpcs = s->tilted_lpcs_pf,
678 *coeffs = s->denoise_coeffs_pf, tilt_mem = 0;
680 tilted_lpcs[0] = 1.0;
681 memcpy(&tilted_lpcs[1], lpcs, sizeof(lpcs[0]) * s->lsps);
682 memset(&tilted_lpcs[s->lsps + 1], 0,
683 sizeof(tilted_lpcs[0]) * (128 - s->lsps - 1));
684 ff_tilt_compensation(&tilt_mem, 0.7 * tilt_factor(lpcs, s->lsps),
685 tilted_lpcs, s->lsps + 2);
687 /* The IRDFT output (127 samples for 7-bit filter) beyond the frame
688 * size is applied to the next frame. All input beyond this is zero,
689 * and thus all output beyond this will go towards zero, hence we can
690 * limit to min(size-1, 127-size) as a performance consideration. */
691 remainder = FFMIN(127 - size, size - 1);
692 calc_input_response(s, tilted_lpcs, fcb_type, coeffs, remainder);
694 /* apply coefficients (in frequency spectrum domain), i.e. complex
695 * number multiplication */
696 memset(&synth_pf[size], 0, sizeof(synth_pf[0]) * (128 - size));
697 s->rdft.rdft_calc(&s->rdft, synth_pf);
698 s->rdft.rdft_calc(&s->rdft, coeffs);
699 synth_pf[0] *= coeffs[0];
700 synth_pf[1] *= coeffs[1];
701 for (n = 1; n < 64; n++) {
702 float v1 = synth_pf[n * 2], v2 = synth_pf[n * 2 + 1];
703 synth_pf[n * 2] = v1 * coeffs[n * 2] - v2 * coeffs[n * 2 + 1];
704 synth_pf[n * 2 + 1] = v2 * coeffs[n * 2] + v1 * coeffs[n * 2 + 1];
706 s->irdft.rdft_calc(&s->irdft, synth_pf);
709 /* merge filter output with the history of previous runs */
710 if (s->denoise_filter_cache_size) {
711 lim = FFMIN(s->denoise_filter_cache_size, size);
712 for (n = 0; n < lim; n++)
713 synth_pf[n] += s->denoise_filter_cache[n];
714 s->denoise_filter_cache_size -= lim;
715 memmove(s->denoise_filter_cache, &s->denoise_filter_cache[size],
716 sizeof(s->denoise_filter_cache[0]) * s->denoise_filter_cache_size);
719 /* move remainder of filter output into a cache for future runs */
720 if (fcb_type != FCB_TYPE_SILENCE) {
721 lim = FFMIN(remainder, s->denoise_filter_cache_size);
722 for (n = 0; n < lim; n++)
723 s->denoise_filter_cache[n] += synth_pf[size + n];
724 if (lim < remainder) {
725 memcpy(&s->denoise_filter_cache[lim], &synth_pf[size + lim],
726 sizeof(s->denoise_filter_cache[0]) * (remainder - lim));
727 s->denoise_filter_cache_size = remainder;
733 * Averaging projection filter, the postfilter used in WMAVoice.
735 * This uses the following steps:
736 * - A zero-synthesis filter (generate excitation from synth signal)
737 * - Kalman smoothing on excitation, based on pitch
738 * - Re-synthesized smoothened output
739 * - Iterative Wiener denoise filter
740 * - Adaptive gain filter
743 * @param s WMAVoice decoding context
744 * @param synth Speech synthesis output (before postfilter)
745 * @param samples Output buffer for filtered samples
746 * @param size Buffer size of synth & samples
747 * @param lpcs Generated LPCs used for speech synthesis
748 * @param zero_exc_pf destination for zero synthesis filter (16-byte aligned)
749 * @param fcb_type Frame type (silence, hardcoded, AW-pulses or FCB-pulses)
750 * @param pitch Pitch of the input signal
752 static void postfilter(WMAVoiceContext *s, const float *synth,
753 float *samples, int size,
754 const float *lpcs, float *zero_exc_pf,
755 int fcb_type, int pitch)
757 float synth_filter_in_buf[MAX_FRAMESIZE / 2],
758 *synth_pf = &s->synth_filter_out_buf[MAX_LSPS_ALIGN16],
759 *synth_filter_in = zero_exc_pf;
761 assert(size <= MAX_FRAMESIZE / 2);
763 /* generate excitation from input signal */
764 ff_celp_lp_zero_synthesis_filterf(zero_exc_pf, lpcs, synth, size, s->lsps);
766 if (fcb_type >= FCB_TYPE_AW_PULSES &&
767 !kalman_smoothen(s, pitch, zero_exc_pf, synth_filter_in_buf, size))
768 synth_filter_in = synth_filter_in_buf;
770 /* re-synthesize speech after smoothening, and keep history */
771 ff_celp_lp_synthesis_filterf(synth_pf, lpcs,
772 synth_filter_in, size, s->lsps);
773 memcpy(&synth_pf[-s->lsps], &synth_pf[size - s->lsps],
774 sizeof(synth_pf[0]) * s->lsps);
776 wiener_denoise(s, fcb_type, synth_pf, size, lpcs);
778 adaptive_gain_control(samples, synth_pf, synth, size, 0.99,
781 if (s->dc_level > 8) {
782 /* remove ultra-low frequency DC noise / highpass filter;
783 * coefficients are identical to those used in SIPR decoding,
784 * and very closely resemble those used in AMR-NB decoding. */
785 ff_acelp_apply_order_2_transfer_function(samples, samples,
786 (const float[2]) { -1.99997, 1.0 },
787 (const float[2]) { -1.9330735188, 0.93589198496 },
788 0.93980580475, s->dcf_mem, size);
/**
 * Dequantize LSPs by summing the contributions of several codebook stages.
 *
 * The output is cleared first; then, for each stage n, the table row
 * selected by values[n] is scaled by mul_q[n], offset by base_q[n] and
 * added to the output. The tables of consecutive stages are stored back
 * to back, each holding sizes[n] rows of num entries.
 *
 * @param lsps output pointer to the array that will hold the LSPs
 * @param num number of LSPs to be dequantized
 * @param values quantized values, contains n_stages values
 * @param sizes range (i.e. max value) of each quantized value
 * @param n_stages number of dequantization runs
 * @param table dequantization table to be used
 * @param mul_q LSF multiplier
 * @param base_q base (lowest) LSF values
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    int n, m;

    memset(lsps, 0, num * sizeof(*lsps));
    for (n = 0; n < n_stages; n++) {
        /* row of this stage's table selected by the quantized value */
        const uint8_t *t_off = &table[values[n] * num];
        double base = base_q[n], mul = mul_q[n];

        for (m = 0; m < num; m++)
            lsps[m] += base + mul * t_off[m];

        /* skip over this stage's table to reach the next one */
        table += sizes[n] * num;
    }
}
828 * @defgroup lsp_dequant LSP dequantization routines
829 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
830 * @note we assume enough bits are available, caller should check.
831 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
832 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
836 * Parse 10 independently-coded LSPs.
838 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
840 static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
841 static const double mul_lsf[4] = {
842 5.2187144800e-3, 1.4626986422e-3,
843 9.6179549166e-4, 1.1325736225e-3
845 static const double base_lsf[4] = {
846 M_PI * -2.15522e-1, M_PI * -6.1646e-2,
847 M_PI * -3.3486e-2, M_PI * -5.7408e-2
851 v[0] = get_bits(gb, 8);
852 v[1] = get_bits(gb, 6);
853 v[2] = get_bits(gb, 5);
854 v[3] = get_bits(gb, 5);
856 dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
861 * Parse 10 independently-coded LSPs, and then derive the tables to
862 * generate LSPs for the other frames from them (residual coding).
864 static void dequant_lsp10r(GetBitContext *gb,
865 double *i_lsps, const double *old,
866 double *a1, double *a2, int q_mode)
868 static const uint16_t vec_sizes[3] = { 128, 64, 64 };
869 static const double mul_lsf[3] = {
870 2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
872 static const double base_lsf[3] = {
873 M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
875 const float (*ipol_tab)[2][10] = q_mode ?
876 wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
877 uint16_t interpol, v[3];
880 dequant_lsp10i(gb, i_lsps);
882 interpol = get_bits(gb, 5);
883 v[0] = get_bits(gb, 7);
884 v[1] = get_bits(gb, 6);
885 v[2] = get_bits(gb, 6);
887 for (n = 0; n < 10; n++) {
888 double delta = old[n] - i_lsps[n];
889 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
890 a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
893 dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
898 * Parse 16 independently-coded LSPs.
900 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
902 static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
903 static const double mul_lsf[5] = {
904 3.3439586280e-3, 6.9908173703e-4,
905 3.3216608306e-3, 1.0334960326e-3,
908 static const double base_lsf[5] = {
909 M_PI * -1.27576e-1, M_PI * -2.4292e-2,
910 M_PI * -1.28094e-1, M_PI * -3.2128e-2,
915 v[0] = get_bits(gb, 8);
916 v[1] = get_bits(gb, 6);
917 v[2] = get_bits(gb, 7);
918 v[3] = get_bits(gb, 6);
919 v[4] = get_bits(gb, 7);
921 dequant_lsps( lsps, 5, v, vec_sizes, 2,
922 wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
923 dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
924 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
925 dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
926 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
930 * Parse 16 independently-coded LSPs, and then derive the tables to
931 * generate LSPs for the other frames from them (residual coding).
/* Parses the independently-coded LSPs into i_lsps via dequant_lsp16i(), then
 * reads a 5-bit interpolation index and three 7-bit codebook indices.
 * a1[0..15] and a1[16..31] receive values interpolated between old[] and
 * i_lsps[] using the two coefficient rows of the selected ipol_tab entry;
 * a2 is filled by three dequant_lsps() calls starting at a2[0], a2[10] and
 * a2[20]. q_mode selects between the _a and _b interpolation tables.
 * NOTE(review): embedded line numbers skip (e.g. 935 -> 937, 946 -> 949), so
 * short lines such as braces and the declaration of n are absent here. */
933 static void dequant_lsp16r(GetBitContext *gb,
934 double *i_lsps, const double *old,
935 double *a1, double *a2, int q_mode)
937 static const uint16_t vec_sizes[3] = { 128, 128, 128 };
938 static const double mul_lsf[3] = {
939 1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
941 static const double base_lsf[3] = {
942 M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
944 const float (*ipol_tab)[2][16] = q_mode ?
945 wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
946 uint16_t interpol, v[3];
949 dequant_lsp16i(gb, i_lsps);
/* 5 bits select one of the ipol_tab entries; 7 bits per index match the
 * vec_sizes[] entries of 128. */
951 interpol = get_bits(gb, 5);
952 v[0] = get_bits(gb, 7);
953 v[1] = get_bits(gb, 7);
954 v[2] = get_bits(gb, 7);
/* Each output is i_lsps[n] moved towards old[n] by a per-coefficient
 * factor taken from the selected table entry. */
956 for (n = 0; n < 16; n++) {
957 double delta = old[n] - i_lsps[n];
958 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
959 a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
/* One index per call; the three calls cover a2 from offsets 0, 10 and 20
 * (presumably 10 + 10 + 12 = 32 values - confirm against dequant_lsps()). */
962 dequant_lsps( a2, 10, v, vec_sizes, 1,
963 wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
964 dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
965 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
966 dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
967 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
972 * @defgroup aw Pitch-adaptive window coding functions
973 * The next few functions are for pitch-adaptive window coding.
977 * Parse the offset of the first pitch-adaptive window pulses, and
978 * the distribution of pulses between the two blocks in this frame.
979 * @param s WMA Voice decoding context private data
980 * @param gb bit I/O context
981 * @param pitch pitch for each block in this frame
/* Reads the coded start position of the pitch-adaptive window pulses and
 * derives, for each of the two blocks, the pulse count (s->aw_n_pulses[])
 * and the offset of the first pulse (s->aw_first_pulse_off[]). Also sets
 * s->aw_idx_is_ext and s->aw_pulse_range.
 * NOTE(review): embedded line numbers skip (e.g. 983 -> 986, 994 -> 998), so
 * the remainder of the parameter list and the local declarations are absent
 * from this listing. */
983 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
986 static const int16_t start_offset[94] = {
987 -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
988 13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
989 27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
990 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
991 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
992 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
993 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
994 141, 143, 145, 147, 149, 151, 153, 155, 157, 159
998 /* position of pulse */
999 s->aw_idx_is_ext = 0;
/* 6-bit codes 54..63 are extended by two more bits, mapping them onto
 * table indices 54..93 (4 indices per code); 93 is the last valid index of
 * start_offset[94]. */
1000 if ((bits = get_bits(gb, 6)) >= 54) {
1001 s->aw_idx_is_ext = 1;
1002 bits += (bits - 54) * 3 + get_bits(gb, 2);
1005 /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
1006 * the distribution of the pulses in each block contained in this frame. */
/* range is 24 only when both block pitches exceed 32, otherwise 16 */
1007 s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
/* empty-bodied loop: step a negative start offset forward by whole pitch
 * periods until it is >= 0 */
1008 for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
/* rounded-up division by the pitch; the result can be <= 0 when offset
 * already lies beyond the first half of the frame (users test "> 0") */
1009 s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
1010 s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
1011 offset += s->aw_n_pulses[0] * pitch[0];
1012 s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
/* expressed relative to the start of the second block (MAX_FRAMESIZE / 2) */
1013 s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
1015 /* if continuing from a position before the block, reset position to
1016 * start of block (when corrected for the range over which it can be
1017 * spread in aw_pulse_set1()). */
1018 if (start_offset[bits] < MAX_FRAMESIZE / 2) {
1019 while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
1020 s->aw_first_pulse_off[1] -= pitch[1];
1021 if (start_offset[bits] < 0)
1022 while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
1023 s->aw_first_pulse_off[0] -= pitch[0];
1028 * Apply second set of pitch-adaptive window pulses.
1029 * @param s WMA Voice decoding context private data
1030 * @param gb bit I/O context
1031 * @param block_idx block index in frame [0, 1]
1032 * @param fcb structure containing fixed codebook vector info
/* Reads one pulse position (aidx) and one sign bit from gb and appends the
 * pulse to fcb. The position is the aidx-th sample that is still marked
 * available in a bit mask from which the ranges already pulsed by
 * aw_pulse_set1() have been cleared. Finally caches the pulse offset for
 * the next block in s->aw_next_pulse_off_cache.
 * NOTE(review): embedded line numbers skip in several places (e.g. 1056 ->
 * 1058 -> 1060, 1101 -> 1107), so the assignments to range and start_off and
 * a number of braces/else lines are absent from this listing. */
1034 static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
1035 int block_idx, AMRFixed *fcb)
1037 uint16_t use_mask_mem[9]; // only 5 are used, rest is padding
/* use_mask[-2..6] are valid indices: two padding words on each side */
1038 uint16_t *use_mask = use_mask_mem + 2;
1039 /* in this function, idx is the index in the 80-bit (+ padding) use_mask
1040 * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
1041 * of idx are the position of the bit within a particular item in the
1042 * array (0 being the most significant bit, and 15 being the least
1043 * significant bit), and the remainder (>> 4) is the index in the
1044 * use_mask[]-array. This is faster and uses less memory than using a
1045 * 80-byte/80-int array. */
1046 int pulse_off = s->aw_first_pulse_off[block_idx],
1047 pulse_start, n, idx, range, aidx, start_off = 0;
1049 /* set offset of first pulse to within this block */
1050 if (s->aw_n_pulses[block_idx] > 0)
1051 while (pulse_off + s->aw_pulse_range < 1)
1052 pulse_off += fcb->pitch_lag;
1054 /* find range per pulse */
1055 if (s->aw_n_pulses[0] > 0) {
1056 if (block_idx == 0) {
1058 } else /* block_idx = 1 */ {
1060 if (s->aw_n_pulses[block_idx] > 0)
1061 pulse_off = s->aw_next_pulse_off_cache;
1065 pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
1067 /* aw_pulse_set1() already applies pulses around pulse_off (to be exact,
1068 * in the range of [pulse_off, pulse_off + s->aw_pulse_range]), and thus
1069 * we exclude that range from being pulsed again in this function. */
/* padding words start cleared, the five data words start fully available */
1070 memset(&use_mask[-2], 0, 2 * sizeof(use_mask[0]));
1071 memset( use_mask, -1, 5 * sizeof(use_mask[0]));
1072 memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
1073 if (s->aw_n_pulses[block_idx] > 0)
1074 for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
1075 int excl_range = s->aw_pulse_range; // always 16 or 24
1076 uint16_t *use_mask_ptr = &use_mask[idx >> 4];
/* first_sh = number of bits from position idx to the end of its word;
 * the mask keeps only the bits in front of idx */
1077 int first_sh = 16 - (idx & 15);
1078 *use_mask_ptr++ &= 0xFFFF << first_sh;
1079 excl_range -= first_sh;
/* the remainder of the excluded range spills into the next word(s) */
1080 if (excl_range >= 16) {
1081 *use_mask_ptr++ = 0;
1082 *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
1084 *use_mask_ptr &= 0xFFFF >> excl_range;
1087 /* find the 'aidx'th offset that is not excluded */
/* 5 bits for block 0 and 3 bits for block 1 when block 0 has pulses,
 * otherwise 4 bits */
1088 aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
1089 for (n = 0; n <= aidx; pulse_start++) {
1090 for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
1091 if (idx >= MAX_FRAMESIZE / 2) { // find from zero
/* pick the last bit position of the first non-empty word ... */
1092 if (use_mask[0]) idx = 0x0F;
1093 else if (use_mask[1]) idx = 0x1F;
1094 else if (use_mask[2]) idx = 0x2F;
1095 else if (use_mask[3]) idx = 0x3F;
1096 else if (use_mask[4]) idx = 0x4F;
/* ... then step back to its most significant set bit (presumably
 * av_log2_16bit() returns floor(log2) - confirm) */
1098 idx -= av_log2_16bit(use_mask[idx >> 4]);
/* consume the position if it is still available */
1100 if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
1101 use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
1107 fcb->x[fcb->n] = start_off;
1108 fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
1111 /* set offset for next block, relative to start of that block */
1112 n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
1113 s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
1117 * Apply first set of pitch-adaptive window pulses.
1118 * @param s WMA Voice decoding context private data
1119 * @param gb bit I/O context
1120 * @param block_idx block index in frame [0, 1]
1121 * @param fcb storage location for fixed codebook pulse info
/* Reads one packed value from gb and decodes it into fixed-codebook pulses.
 * When the block has pitch-adaptive pulses (s->aw_n_pulses[block_idx] > 0)
 * the value holds n_pulses sign+index fields; the other visible path
 * decodes a pair of pulses from a 10-bit layout.
 * NOTE(review): embedded line numbers skip (e.g. 1132 -> 1137 -> 1144,
 * 1150 -> 1154), so the assignments to n_pulses/v_mask/i_mask/sh, the else
 * line and several statements are absent from this listing. */
1123 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
1124 int block_idx, AMRFixed *fcb)
/* 12 bits, or 10 bits for block 0 when the window index was extended */
1126 int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
1129 if (s->aw_n_pulses[block_idx] > 0) {
1130 int n, v_mask, i_mask, sh, n_pulses;
1132 if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
1137 } else { // 4 pulses, 1:sign + 2:index each
/* fields are consumed from the low bits of val upwards, last pulse first */
1144 for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
1145 fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
1146 fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
1147 s->aw_first_pulse_off[block_idx];
/* wrap positions before the block start forward by whole pitch lags */
1148 while (fcb->x[fcb->n] < 0)
1149 fcb->x[fcb->n] += fcb->pitch_lag;
1150 if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
/* bits 1..8 of val select distance (delta) and position (idx) of the pair */
1154 int num2 = (val & 0x1FF) >> 1, delta, idx;
1156 if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
1157 else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
1158 else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
1159 else { delta = 7; idx = num2 + 1 - 3 * 75; }
/* bit 9 gives the base sign; bit 0 flips the sign of the second pulse */
1160 v = (val & 0x200) ? -1.0 : 1.0;
/* flag both pulses of the pair (entries n and n + 1) in no_repeat_mask */
1162 fcb->no_repeat_mask |= 3 << fcb->n;
1163 fcb->x[fcb->n] = idx - delta;
1165 fcb->x[fcb->n + 1] = idx;
1166 fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
1174 * Generate a random number from frame_cntr and block_num, which will lie
1175 * in the range [0, 1000 - block_size] (so it can be used as an index in a
1176 * table of size 1000 of which you want to read block_size entries).
1178 * @param frame_cntr current frame number
1179 * @param block_num current block index
1180 * @param block_size amount of entries we want to read from a table
1181 * that has 1000 entries
1182 * @return a (non-)random number in the [0, 1000 - block_size] range.
/* Deterministic pseudo-random index derived from frame_cntr and block_num.
 * Because the result is z % (1000 - block_size), it lies in
 * [0, 999 - block_size] for block_size < 1000.
 * NOTE(review): embedded line numbers skip (1184 -> 1186, 1205 -> 1207), so
 * the braces around the body and the table are absent from this listing. */
1184 static int pRNG(int frame_cntr, int block_num, int block_size)
1186 /* array to simplify the calculation of z:
1187 * y = (x % 9) * 5 + 6;
1188 * z = (49995 * x) / y;
1189 * Since y only has 9 values, we can remove the division by using a
1190 * LUT and using FASTDIV-style divisions. For each of the 9 values
1191 * of y, we can rewrite z as:
1192 * z = x * (49995 / y) + x * ((49995 % y) / y)
1193 * In this table, each col represents one possible value of y, the
1194 * first number is 49995 / y, and the second is the FASTDIV variant
1195 * of 49995 % y / y. */
1196 static const unsigned int div_tbl[9][2] = {
1197 { 8332, 3 * 715827883U }, // y = 6
1198 { 4545, 0 * 390451573U }, // y = 11
1199 { 3124, 11 * 268435456U }, // y = 16
1200 { 2380, 15 * 204522253U }, // y = 21
1201 { 1922, 23 * 165191050U }, // y = 26
1202 { 1612, 23 * 138547333U }, // y = 31
1203 { 1388, 27 * 119304648U }, // y = 36
1204 { 1219, 16 * 104755300U }, // y = 41
1205 { 1086, 39 * 93368855U } // y = 46
/* seed: block number scaled by 1877 plus the running frame counter */
1207 unsigned int z, y, x = MUL16(block_num, 1877) + frame_cntr;
1208 if (x >= 0xFFFF) x -= 0xFFFF; // max value of x is 8*1877+0xFFFE=0x13AA6,
1209 // so this is effectively a modulo (%)
/* from here on y holds x % 9, i.e. the row of div_tbl to use (not the
 * "y" of the formula above); 477218589 is approximately 2^32 / 9 */
1210 y = x - 9 * MULH(477218589, x); // x % 9
/* the cast keeps only the low 16 bits of the sum */
1211 z = (uint16_t) (x * div_tbl[y][0] + UMULH(x, div_tbl[y][1]));
1212 // z = x * 49995 / (y * 5 + 6)
1213 return z % (1000 - block_size);
1217 * Parse hardcoded signal for a single block.
1218 * @note see #synth_block().
/* Fills excitation[0..size-1] with a slice of wmavoice_std_codebook scaled by
 * a gain. For FCB_TYPE_SILENCE the slice start comes from pRNG() and the gain
 * from s->silence_gain; otherwise both are read from the bitstream
 * (8-bit start index, 6-bit gain index). Also zeroes s->gain_pred_err.
 * NOTE(review): embedded line numbers skip (1222 -> 1228, 1236 -> 1239), so
 * the last parameter and the local declarations are absent from this
 * listing. */
1220 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
1221 int block_idx, int size,
1222 const struct frame_type_desc *frame_desc,
1228 assert(size <= MAX_FRAMESIZE);
1230 /* Set the offset from which we start reading wmavoice_std_codebook */
1231 if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1232 r_idx = pRNG(s->frame_cntr, block_idx, size);
1233 gain = s->silence_gain;
1234 } else /* FCB_TYPE_HARDCODED */ {
/* r_idx is 0..255 here, so the codebook is read up to index 255 + size - 1 */
1235 r_idx = get_bits(gb, 8);
1236 gain = wmavoice_gain_universal[get_bits(gb, 6)];
1239 /* Clear gain prediction parameters */
1240 memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
1242 /* Apply gain to hardcoded codebook and use that as excitation signal */
1243 for (n = 0; n < size; n++)
1244 excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
1248 * Parse FCB/ACB signal for a single block.
1249 * @note see #synth_block().
/* Builds the excitation of one block from a fixed-codebook pulse vector and
 * an adaptive-codebook (pitch-delayed) signal, then mixes both in place into
 * excitation[] with gains derived from a 7-bit gain index.
 * NOTE(review): embedded line numbers skip in many places (e.g. 1254 -> 1257,
 * 1283 -> 1287, 1332 -> 1337, 1338 -> 1341), so the last parameter, several
 * declarations, else lines and call arguments are absent from this
 * listing. */
1251 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
1252 int block_idx, int size,
1253 int block_pitch_sh2,
1254 const struct frame_type_desc *frame_desc,
1257 static const float gain_coeff[6] = {
1258 0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
1260 float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
1261 int n, idx, gain_weight;
1264 assert(size <= MAX_FRAMESIZE / 2);
1265 memset(pulses, 0, sizeof(*pulses) * size);
/* drop the two fractional bits of the pitch */
1267 fcb.pitch_lag = block_pitch_sh2 >> 2;
1268 fcb.pitch_fac = 1.0;
1269 fcb.no_repeat_mask = 0;
1272 /* For the other frame types, this is where we apply the innovation
1273 * (fixed) codebook pulses of the speech signal. */
1274 if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1275 aw_pulse_set1(s, gb, block_idx, &fcb);
1276 aw_pulse_set2(s, gb, block_idx, &fcb);
1277 } else /* FCB_TYPE_EXC_PULSES */ {
1278 int offset_nbits = 5 - frame_desc->log_n_blocks;
1280 fcb.no_repeat_mask = -1;
1281 /* similar to ff_decode_10_pulses_35bits(), but with single pulses
1282 * (instead of double) for a subset of pulses */
/* track n places its pulses at positions n, n + 5, n + 10, ... */
1283 for (n = 0; n < 5; n++) {
1287 sign = get_bits1(gb) ? 1.0 : -1.0;
1288 pos1 = get_bits(gb, offset_nbits);
1289 fcb.x[fcb.n] = n + 5 * pos1;
1290 fcb.y[fcb.n++] = sign;
/* the first dbl_pulses tracks carry a second pulse; its sign is
 * inverted when it lies after the first one */
1291 if (n < frame_desc->dbl_pulses) {
1292 pos2 = get_bits(gb, offset_nbits);
1293 fcb.x[fcb.n] = n + 5 * pos2;
1294 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
1298 ff_set_fixed_vector(pulses, &fcb, 1.0, size);
1300 /* Calculate gain for adaptive & fixed codebook signal.
1301 * see ff_amr_set_fixed_gain(). */
/* one 7-bit index addresses both gain codebooks */
1302 idx = get_bits(gb, 7);
1303 fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
1304 5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
1305 acb_gain = wmavoice_gain_codebook_acb[idx];
1306 pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
1307 -2.9957322736 /* log(0.05) */,
1308 1.6094379124 /* log(5.0) */);
/* shift the 6-entry prediction history by gain_weight entries and fill the
 * freed front entries with the new (clipped) value */
1310 gain_weight = 8 >> frame_desc->log_n_blocks;
1311 memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
1312 sizeof(*s->gain_pred_err) * (6 - gain_weight));
1313 for (n = 0; n < gain_weight; n++)
1314 s->gain_pred_err[n] = pred_err;
1316 /* Calculation of adaptive codebook */
1317 if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
/* processed in runs of len samples; the pitch is re-evaluated per run from
 * s->last_pitch_val and the per-sample slope s->pitch_diff_sh16 */
1319 for (n = 0; n < size; n += len) {
1321 int abs_idx = block_idx * size + n;
1322 int pitch_sh16 = (s->last_pitch_val << 16) +
1323 s->pitch_diff_sh16 * abs_idx;
1324 int pitch = (pitch_sh16 + 0x6FFF) >> 16;
1325 int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
1326 idx = idx_sh16 >> 16;
1327 if (s->pitch_diff_sh16) {
1328 if (s->pitch_diff_sh16 > 0) {
1329 next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
1331 next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
1332 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
/* source is the excitation signal delayed by pitch samples */
1337 ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
1338 wmavoice_ipol1_coeffs, 17,
1341 } else /* ACB_TYPE_HAMMING */ {
/* integer part of the pitch; idx below is its 2-bit fractional part */
1342 int block_pitch = block_pitch_sh2 >> 2;
1343 idx = block_pitch_sh2 & 3;
1345 ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
1346 wmavoice_ipol2_coeffs, 4,
1349 av_memcpy_backptr((uint8_t *) excitation, sizeof(float) * block_pitch,
1350 sizeof(float) * size);
1353 /* Interpolate ACB/FCB and use as excitation signal */
1354 ff_weighted_vector_sumf(excitation, excitation, pulses,
1355 acb_gain, fcb_gain, size);
1359 * Parse data in a single block.
1360 * @note we assume enough bits are available, caller should check.
1362 * @param s WMA Voice decoding context private data
1363 * @param gb bit I/O context
1364 * @param block_idx index of the to-be-read block
1365 * @param size amount of samples to be read in this block
1366 * @param block_pitch_sh2 pitch for this block << 2
1367 * @param lsps LSPs for (the end of) this frame
1368 * @param prev_lsps LSPs for the last frame
1369 * @param frame_desc frame type descriptor
1370 * @param excitation target memory for the ACB+FCB interpolated signal
1371 * @param synth target memory for the speech synthesis filter output
1372 * @note the function is declared void, so no status is returned to the caller.
/* Produces the excitation for one block (hardcoded codebook when the frame
 * type has no adaptive codebook, FCB+ACB otherwise), interpolates the LSPs
 * for this block between prev_lsps and lsps, converts them to LPCs and runs
 * the synthesis filter into synth[]. Declared void: nothing is returned.
 * NOTE(review): embedded line numbers skip (1382 -> 1386, 1387 -> 1389), so
 * the declarations of fac and n and the else line are absent from this
 * listing. */
1374 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
1375 int block_idx, int size,
1376 int block_pitch_sh2,
1377 const double *lsps, const double *prev_lsps,
1378 const struct frame_type_desc *frame_desc,
1379 float *excitation, float *synth)
1381 double i_lsps[MAX_LSPS];
1382 float lpcs[MAX_LSPS];
1386 if (frame_desc->acb_type == ACB_TYPE_NONE)
1387 synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
1389 synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
1390 frame_desc, excitation);
1392 /* convert interpolated LSPs to LPCs */
/* fac is the position of the middle of this block as a fraction of the
 * frame: 0 would mean prev_lsps, 1 would mean lsps */
1393 fac = (block_idx + 0.5) / frame_desc->n_blocks;
1394 for (n = 0; n < s->lsps; n++) // LSF -> LSP
1395 i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
1396 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1398 /* Speech synthesis */
1399 ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
1403 * Synthesize output samples for a single frame.
1404 * @note we assume enough bits are available, caller should check.
1406 * @param ctx WMA Voice decoder context
1407 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
1408 * @param frame_idx Frame number within superframe [0-2]
1409 * @param samples pointer to output sample buffer, has space for at least 160
1411 * @param lsps LSP array
1412 * @param prev_lsps array of previous frame's LSPs
1413 * @param excitation target buffer for excitation signal
1414 * @param synth target buffer for synthesized speech data
1415 * @return 0 on success, <0 on error.
/* Decodes one frame: reads the frame type, derives the pitch per block
 * (per-frame for ACB_TYPE_ASYMMETRIC, per-block for ACB_TYPE_HAMMING), reads
 * the silence gain or pitch-adaptive window coordinates where applicable,
 * synthesises every block via synth_block(), produces the output samples
 * (via postfilter() in two halves of 80 samples, or by a plain copy of 160
 * samples) and stores state for the next frame in s.
 * NOTE(review): embedded line numbers skip in many places (e.g. 1417 -> 1419,
 * 1428 -> 1431, 1432 -> 1436, 1557 -> 1559, 1561 -> 1563), so one parameter
 * line, conditions, else/break/return lines and some declarations are absent
 * from this listing. */
1417 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb, int frame_idx,
1419 const double *lsps, const double *prev_lsps,
1420 float *excitation, float *synth)
1422 WMAVoiceContext *s = ctx->priv_data;
1423 int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
1424 int pitch[MAX_BLOCKS], last_block_pitch;
1426 /* Parse frame type ("frame header"), see frame_descs */
/* the VLC symbol is mapped through s->vbm_tree to an index into frame_descs */
1427 int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
1428 block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1431 av_log(ctx, AV_LOG_ERROR,
1432 "Invalid frame type VLC code, skipping\n");
1436 /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1437 if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1438 /* Pitch is provided per frame, which is interpreted as the pitch of
1439 * the last sample of the last block of this frame. We can interpolate
1440 * the pitch of other blocks (and even pitch-per-sample) by gradually
1441 * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1442 n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1443 log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1444 cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1445 cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
/* do not interpolate from the previous pitch when the last frame had no
 * adaptive codebook, or when the two pitches differ by more than a tenth of
 * their mean */
1446 if (s->last_acb_type == ACB_TYPE_NONE ||
1447 20 * abs(cur_pitch_val - s->last_pitch_val) >
1448 (cur_pitch_val + s->last_pitch_val))
1449 s->last_pitch_val = cur_pitch_val;
1451 /* pitch per block */
/* rounded linear interpolation between last_pitch_val and cur_pitch_val,
 * evaluated at the middle of each block (weights fac and 2 * n_blocks - fac) */
1452 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1453 int fac = n * 2 + 1;
1455 pitch[n] = (MUL16(fac, cur_pitch_val) +
1456 MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1457 frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1460 /* "pitch-diff-per-sample" for calculation of pitch per sample */
1461 s->pitch_diff_sh16 =
1462 ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1465 /* Global gain (if silence) and pitch-adaptive window coordinates */
1466 switch (frame_descs[bd_idx].fcb_type) {
1467 case FCB_TYPE_SILENCE:
1468 s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1470 case FCB_TYPE_AW_PULSES:
1471 aw_parse_coords(s, gb, pitch);
1475 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1478 /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1479 switch (frame_descs[bd_idx].acb_type) {
1480 case ACB_TYPE_HAMMING: {
1481 /* Pitch is given per block. Per-block pitches are encoded as an
1482 * absolute value for the first block, and then delta values
1483 * (relative to the previous, range-clipped value) for all subsequent
1484 * blocks. The scale of this pitch value is semi-logarithmic compared
1485 * to its use in the decoder, so we convert it to normal scale also. */
/* t1..t3: upper bounds of the three coded ranges, which use a resolution
 * of 1/4, 1/2 and 1 sample respectively (see the shifts below) */
1487 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1488 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1489 t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1492 block_pitch = get_bits(gb, s->block_pitch_nbits);
1494 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1495 get_bits(gb, s->block_delta_pitch_nbits);
1496 /* Convert last_ so that any next delta is within _range */
1497 last_block_pitch = av_clip(block_pitch,
1498 s->block_delta_pitch_hrange,
1499 s->block_pitch_range -
1500 s->block_delta_pitch_hrange);
1502 /* Convert semi-log-style scale back to normal scale */
1503 if (block_pitch < t1) {
1504 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1507 if (block_pitch < t2) {
1509 (s->block_conv_table[1] << 2) + (block_pitch << 1);
1512 if (block_pitch < t3) {
1514 (s->block_conv_table[2] + block_pitch) << 2;
1516 bl_pitch_sh2 = s->block_conv_table[3] << 2;
/* integer part only; the fractional bits stay in bl_pitch_sh2 */
1519 pitch[n] = bl_pitch_sh2 >> 2;
1523 case ACB_TYPE_ASYMMETRIC: {
1524 bl_pitch_sh2 = pitch[n] << 2;
1528 default: // ACB_TYPE_NONE has no pitch
1533 synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1534 lsps, prev_lsps, &frame_descs[bd_idx],
1535 &excitation[n * block_nsamples],
1536 &synth[n * block_nsamples]);
1539 /* Averaging projection filter, if applicable. Else, just copy samples
1540 * from synthesis buffer */
1542 double i_lsps[MAX_LSPS];
1543 float lpcs[MAX_LSPS];
/* first 80 samples: LPCs from the mean of the previous and current LSFs */
1545 for (n = 0; n < s->lsps; n++) // LSF -> LSP
1546 i_lsps[n] = cos(0.5 * (prev_lsps[n] + lsps[n]));
1547 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1548 postfilter(s, synth, samples, 80, lpcs,
1549 &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx],
1550 frame_descs[bd_idx].fcb_type, pitch[0]);
/* last 80 samples: LPCs from the current LSFs; note that pitch[0] is
 * passed for this half as well */
1552 for (n = 0; n < s->lsps; n++) // LSF -> LSP
1553 i_lsps[n] = cos(lsps[n]);
1554 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
1555 postfilter(s, &synth[80], &samples[80], 80, lpcs,
1556 &s->zero_exc_pf[s->history_nsamples + MAX_FRAMESIZE * frame_idx + 80],
1557 frame_descs[bd_idx].fcb_type, pitch[0]);
1559 memcpy(samples, synth, 160 * sizeof(synth[0]));
1561 /* Cache values for next frame */
1563 if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1564 s->last_acb_type = frame_descs[bd_idx].acb_type;
1565 switch (frame_descs[bd_idx].acb_type) {
1567 s->last_pitch_val = 0;
1569 case ACB_TYPE_ASYMMETRIC:
1570 s->last_pitch_val = cur_pitch_val;
1572 case ACB_TYPE_HAMMING:
1573 s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
1581 * Ensure minimum value for first item, maximum value for last value,
1582 * proper spacing between each value and proper ordering.
1584 * @param lsps array of LSPs
1585 * @param num size of LSP array
1587 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
1588 * useful to put in a generic location later on. Parts are also
1589 * present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
1590 * which is in float.
/* Clamps lsps[0..num-1] in place: lsps[0] is at least 0.0015 * pi, every
 * following value at least 0.0125 * pi above its predecessor, and the last
 * value at most 0.9985 * pi. If that final clamp leaves any neighbouring pair
 * out of order, the array is re-sorted with an insertion-style pass.
 * NOTE(review): embedded line numbers skip (1592 -> 1596, 1612 -> 1622), so
 * the declarations of n, m and l and the tail of the sort loop are absent
 * from this listing. */
1592 static void stabilize_lsps(double *lsps, int num)
1596 /* set minimum value for first, maximum value for last and minimum
1597 * spacing between LSF values.
1598 * Very similar to ff_set_min_dist_lsf(), but in double. */
1599 lsps[0] = FFMAX(lsps[0], 0.0015 * M_PI);
1600 for (n = 1; n < num; n++)
1601 lsps[n] = FFMAX(lsps[n], lsps[n - 1] + 0.0125 * M_PI);
1602 lsps[num - 1] = FFMIN(lsps[num - 1], 0.9985 * M_PI);
1604 /* reorder (looks like one-time / non-recursed bubblesort).
1605 * Very similar to ff_sort_nearly_sorted_floats(), but in double. */
1606 for (n = 1; n < num; n++) {
/* the first out-of-order pair triggers a pass over the whole array */
1607 if (lsps[n] < lsps[n - 1]) {
1608 for (m = 1; m < num; m++) {
1609 double tmp = lsps[m];
/* shift larger predecessors up by one slot to make room for tmp */
1610 for (l = m - 1; l >= 0; l--) {
1611 if (lsps[l] <= tmp) break;
1612 lsps[l + 1] = lsps[l];
1622 * Test if there's enough bits to read 1 superframe.
1624 * @param orig_gb bit I/O context used for reading. This function
1625 * does not modify the state of the bitreader; it
1626 * only uses it to copy the current stream position
1627 * @param s WMA Voice decoding context private data
1628 * @return -1 if unsupported, 1 on not enough bits or 0 if OK.
/* Walks a private copy of the bit reader through one superframe (header,
 * optional residual LSPs, then MAX_FRAMES frames) and checks at every step
 * that enough bits remain; orig_gb itself is only read, never advanced.
 * NOTE(review): embedded line numbers skip in many places (e.g. 1630 -> 1633,
 * 1643 -> 1646, 1674 -> 1682), so the second parameter, most return
 * statements and parts of the AW-pulses branch are absent from this
 * listing. */
1630 static int check_bits_for_superframe(GetBitContext *orig_gb,
1633 GetBitContext s_gb, *gb = &s_gb;
1634 int n, need_bits, bd_idx;
1635 const struct frame_type_desc *frame_desc;
1637 /* initialize a copy */
/* same buffer and size, then skip forward to the original's read position */
1638 init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1639 skip_bits_long(gb, get_bits_count(orig_gb));
1640 assert(get_bits_left(gb) == get_bits_left(orig_gb));
1642 /* superframe header */
1643 if (get_bits_left(gb) < 14)
1646 return -1; // WMAPro-in-WMAVoice superframe
1647 if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
1648 if (s->has_residual_lsps) { // residual LSPs (for all frames)
1649 if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1651 skip_bits_long(gb, s->sframe_lsp_bitsize);
1655 for (n = 0; n < MAX_FRAMES; n++) {
1656 int aw_idx_is_ext = 0;
1658 if (!s->has_residual_lsps) { // independent LSPs (per-frame)
1659 if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1660 skip_bits_long(gb, s->frame_lsp_bitsize);
/* same frame-type lookup as in synth_frame() */
1662 bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1664 return -1; // invalid frame type VLC code
1665 frame_desc = &frame_descs[bd_idx];
1666 if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1667 if (get_bits_left(gb) < s->pitch_nbits)
1669 skip_bits_long(gb, s->pitch_nbits);
1671 if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1673 } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1674 int tmp = get_bits(gb, 6);
/* per-frame payload: per-block pitch bits (absolute for the first block,
 * deltas for the rest) or an AW-pulses adjustment, plus the descriptor's
 * frame_size */
1682 if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1683 need_bits = s->block_pitch_nbits +
1684 (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1685 } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1686 need_bits = 2 * !aw_idx_is_ext;
1689 need_bits += frame_desc->frame_size;
1690 if (get_bits_left(gb) < need_bits)
1692 skip_bits_long(gb, need_bits);
1699 * Synthesize output samples for a single superframe. If we have any data
1700 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1703 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1704 * to give a total of 480 samples per superframe. See #synth_frame() for frame
1705 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1706 * (if these are globally specified for all frames (residually); they can
1707 * also be specified individually per-frame. See the s->has_residual_lsps
1708 * option), and can specify the number of samples encoded in this superframe
1709 * (if less than 480), usually used to prevent blanks at track boundaries.
1711 * @param ctx WMA Voice decoder context
1712 * @param samples pointer to output buffer for voice samples
1713 * @param data_size pointer containing the size of #samples on input, and the
1714 * amount of #samples filled on output
1715 * @return 0 on success, <0 on error or 1 if there was not enough data to
1716 * fully parse the superframe
/* Decodes one superframe: seeds local synth/excitation buffers with the
 * stored history, parses the superframe header and (when
 * s->has_residual_lsps is set) the shared LSPs, decodes three frames via
 * synth_frame(), reports the output size through *data_size and copies the
 * tail of the buffers back into the context as history for the next call.
 * NOTE(review): embedded line numbers skip in many places (e.g. 1733 -> 1735,
 * 1738 -> 1741, 1756 -> 1761, 1801 -> 1805, 1825 -> 1831), so several
 * return statements, braces and declarations are absent from this
 * listing. */
1718 static int synth_superframe(AVCodecContext *ctx,
1719 float *samples, int *data_size)
1721 WMAVoiceContext *s = ctx->priv_data;
1722 GetBitContext *gb = &s->gb, s_gb;
1723 int n, res, n_samples = 480;
1724 double lsps[MAX_FRAMES][MAX_LSPS];
1725 const double *mean_lsf = s->lsps == 16 ?
1726 wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1727 float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1728 float synth[MAX_LSPS + MAX_SFRAMESIZE];
/* history goes in front; new samples are written behind it (see the
 * offsets passed to synth_frame() below) */
1730 memcpy(synth, s->synth_history,
1731 s->lsps * sizeof(*synth));
1732 memcpy(excitation, s->excitation_history,
1733 s->history_nsamples * sizeof(*excitation));
/* cached bits from a previous packet take precedence over s->gb */
1735 if (s->sframe_cache_size > 0) {
1737 init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1738 s->sframe_cache_size = 0;
1741 if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;
1743 /* First bit is speech/music bit, it differentiates between WMAVoice
1744 * speech samples (the actual codec) and WMAVoice music samples, which
1745 * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1747 if (!get_bits1(gb)) {
1748 av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
1752 /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1753 if (get_bits1(gb)) {
1754 if ((n_samples = get_bits(gb, 12)) > 480) {
1755 av_log(ctx, AV_LOG_ERROR,
1756 "Superframe encodes >480 samples (%d), not allowed\n",
1761 /* Parse LSPs, if global for the superframe (can also be per-frame). */
1762 if (s->has_residual_lsps) {
1763 double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
/* the dequantisers work on mean-removed values */
1765 for (n = 0; n < s->lsps; n++)
1766 prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1768 if (s->lsps == 10) {
1769 dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1770 } else /* s->lsps == 16 */
1771 dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
/* frames 0 and 1: interpolated value (a1) minus an a2 term, with the a2
 * entries for the two frames interleaved (even: frame 0, odd: frame 1);
 * frame 2 uses the independently coded LSPs; the mean is added back */
1773 for (n = 0; n < s->lsps; n++) {
1774 lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1775 lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1776 lsps[2][n] += mean_lsf[n];
1778 for (n = 0; n < 3; n++)
1779 stabilize_lsps(lsps[n], s->lsps);
1782 /* Parse frames, optionally preceded by per-frame (independent) LSPs. */
1783 for (n = 0; n < 3; n++) {
1784 if (!s->has_residual_lsps) {
1787 if (s->lsps == 10) {
1788 dequant_lsp10i(gb, lsps[n]);
1789 } else /* s->lsps == 16 */
1790 dequant_lsp16i(gb, lsps[n]);
1792 for (m = 0; m < s->lsps; m++)
1793 lsps[n][m] += mean_lsf[m];
1794 stabilize_lsps(lsps[n], s->lsps);
/* frame 0 interpolates from the LSPs stored by the previous superframe,
 * later frames from the preceding frame of this superframe */
1797 if ((res = synth_frame(ctx, gb, n,
1798 &samples[n * MAX_FRAMESIZE],
1799 lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1800 &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1801 &synth[s->lsps + n * MAX_FRAMESIZE])))
1805 /* Statistics? FIXME - we don't check for length, a slight overrun
1806 * will be caught by internal buffer padding, and anything else
1807 * will be skipped, not read. */
1808 if (get_bits1(gb)) {
1809 res = get_bits(gb, 4);
1810 skip_bits(gb, 10 * (res + 1));
1813 /* Specify nr. of output samples */
/* value stored is a byte count (samples times sizeof(float)) */
1814 *data_size = n_samples * sizeof(float);
1816 /* Update history */
/* source offset MAX_SFRAMESIZE skips one superframe of samples, so the
 * copied ranges end where this superframe's data ends */
1817 memcpy(s->prev_lsps, lsps[2],
1818 s->lsps * sizeof(*s->prev_lsps));
1819 memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1820 s->lsps * sizeof(*synth));
1821 memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1822 s->history_nsamples * sizeof(*excitation));
1824 memmove(s->zero_exc_pf, &s->zero_exc_pf[MAX_SFRAMESIZE],
1825 s->history_nsamples * sizeof(*s->zero_exc_pf));
1831 * Parse the packet header at the start of each packet (input data to this
1834 * @param s WMA Voice decoding context private data
1835 * @return 1 if not enough bits were available, or 0 on success.
/* Reads the packet header from s->gb: skips the 4-bit sequence number,
 * stores the residual-LSP flag in s->has_residual_lsps, consumes the
 * superframe-count field(s) and stores the spillover bit count in
 * s->spillover_nbits.
 * NOTE(review): embedded line numbers skip (e.g. 1839 -> 1842 -> 1844,
 * 1849 -> 1851), so the declaration of res, the "do {" line and the return
 * statements are absent from this listing. */
1837 static int parse_packet_header(WMAVoiceContext *s)
1839 GetBitContext *gb = &s->gb;
/* 11 = 4 (sequence number) + 1 (LSP flag) + 6 (first count field) */
1842 if (get_bits_left(gb) < 11)
1844 skip_bits(gb, 4); // packet sequence number
1845 s->has_residual_lsps = get_bits1(gb);
1847 res = get_bits(gb, 6); // number of superframes per packet
1848 // (minus first one if there is spillover)
/* a count of 0x3F means another 6-bit count field follows, so 6 more bits
 * must be available in addition to the spillover field */
1849 if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1851 } while (res == 0x3F);
1852 s->spillover_nbits = get_bits(gb, s->spillover_bitsize);
1858 * Copy (unaligned) bits from gb/data/size to pb.
1860 * @param pb target buffer to copy bits into
1861 * @param data source buffer to copy bits from
1862 * @param size size of the source data, in bytes
1863 * @param gb bit I/O context specifying the current position in the source
1864 * data. This function might use this to align the bit position to
1865 * a whole-byte boundary before calling #ff_copy_bits() on aligned
1867 * @param nbits the amount of bits to copy from source to target
1869 * @note after calling this function, the current position in the input bit
1870 * I/O context is undefined.
/* Copies up to nbits bits from the current position of gb into pb. The
 * bits in front of the next byte boundary are moved one field at a time with
 * get_bits()/put_bits(); the rest is handed to ff_copy_bits() starting at
 * the corresponding whole byte of data.
 * NOTE(review): embedded line numbers skip (1874 -> 1876, 1879 -> 1881), so
 * the statement guarded by "rmn_bits < nbits" is absent from this listing. */
1872 static void copy_bits(PutBitContext *pb,
1873 const uint8_t *data, int size,
1874 GetBitContext *gb, int nbits)
1876 int rmn_bytes, rmn_bits;
/* both start out as the number of bits left in gb */
1878 rmn_bits = rmn_bytes = get_bits_left(gb);
1879 if (rmn_bits < nbits)
/* split into whole remaining bytes and the 0..7 bits in front of them */
1881 rmn_bits &= 7; rmn_bytes >>= 3;
1882 if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1883 put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
/* data + size - rmn_bytes addresses the first of the remaining whole bytes */
1884 ff_copy_bits(pb, data + size - rmn_bytes,
1885 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1889 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1890 * and we expect that the demuxer / application provides it to us as such
1891 * (else you'll probably get garbage as output). Every packet has a size of
1892 * ctx->block_align bytes, starts with a packet header (see
1893 * #parse_packet_header()), and then a series of superframes. Superframe
1894 * boundaries may exceed packets, i.e. superframes can split data over
1895 * multiple (two) packets.
1897 * For more information about frames, see #synth_superframe().
1899 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1900 int *data_size, AVPacket *avpkt)
1902 WMAVoiceContext *s = ctx->priv_data;
1903 GetBitContext *gb = &s->gb;
1906 if (*data_size < 480 * sizeof(float)) {
1907 av_log(ctx, AV_LOG_ERROR,
1908 "Output buffer too small (%d given - %zu needed)\n",
1909 *data_size, 480 * sizeof(float));
1914 /* Packets are sometimes a multiple of ctx->block_align, with a packet
1915 * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1916 * feeds us ASF packets, which may concatenate multiple "codec" packets
1917 * in a single "muxer" packet, so we artificially emulate that by
1918 * capping the packet size at ctx->block_align. */
1919 for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1922 init_get_bits(&s->gb, avpkt->data, size << 3);
1924 /* size == ctx->block_align is used to indicate whether we are dealing with
1925 * a new packet or a packet of which we already read the packet header
1927 if (size == ctx->block_align) { // new packet header
1928 if ((res = parse_packet_header(s)) < 0)
1931 /* If the packet header specifies a s->spillover_nbits, then we want
1932 * to push out all data of the previous packet (+ spillover) before
1933 * continuing to parse new superframes in the current packet. */
1934 if (s->spillover_nbits > 0) {
1935 if (s->sframe_cache_size > 0) {
1936 int cnt = get_bits_count(gb);
1937 copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1938 flush_put_bits(&s->pb);
1939 s->sframe_cache_size += s->spillover_nbits;
1940 if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
1942 cnt += s->spillover_nbits;
1943 s->skip_bits_next = cnt & 7;
1946 skip_bits_long (gb, s->spillover_nbits - cnt +
1947 get_bits_count(gb)); // resync
1949 skip_bits_long(gb, s->spillover_nbits); // resync
1951 } else if (s->skip_bits_next)
1952 skip_bits(gb, s->skip_bits_next);
1954 /* Try parsing superframes in current packet */
1955 s->sframe_cache_size = 0;
1956 s->skip_bits_next = 0;
1957 pos = get_bits_left(gb);
1958 if ((res = synth_superframe(ctx, data, data_size)) < 0) {
1960 } else if (*data_size > 0) {
1961 int cnt = get_bits_count(gb);
1962 s->skip_bits_next = cnt & 7;
1964 } else if ((s->sframe_cache_size = pos) > 0) {
1965 /* rewind bit reader to start of last (incomplete) superframe... */
1966 init_get_bits(gb, avpkt->data, size << 3);
1967 skip_bits_long(gb, (size << 3) - pos);
1968 assert(get_bits_left(gb) == pos);
1970 /* ...and cache it for spillover in next packet */
1971 init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1972 copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1973 // FIXME bad - just copy bytes as whole and add use the
1974 // skip_bits_next field
1980 static av_cold int wmavoice_decode_end(AVCodecContext *ctx)
1982 WMAVoiceContext *s = ctx->priv_data;
1985 ff_rdft_end(&s->rdft);
1986 ff_rdft_end(&s->irdft);
1987 ff_dct_end(&s->dct);
1988 ff_dct_end(&s->dst);
1994 static av_cold void wmavoice_flush(AVCodecContext *ctx)
1996 WMAVoiceContext *s = ctx->priv_data;
1999 s->postfilter_agc = 0;
2000 s->sframe_cache_size = 0;
2001 s->skip_bits_next = 0;
2002 for (n = 0; n < s->lsps; n++)
2003 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
2004 memset(s->excitation_history, 0,
2005 sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
2006 memset(s->synth_history, 0,
2007 sizeof(*s->synth_history) * MAX_LSPS);
2008 memset(s->gain_pred_err, 0,
2009 sizeof(s->gain_pred_err));
2012 memset(&s->synth_filter_out_buf[MAX_LSPS_ALIGN16 - s->lsps], 0,
2013 sizeof(*s->synth_filter_out_buf) * s->lsps);
2014 memset(s->dcf_mem, 0,
2015 sizeof(*s->dcf_mem) * 2);
2016 memset(s->zero_exc_pf, 0,
2017 sizeof(*s->zero_exc_pf) * s->history_nsamples);
2018 memset(s->denoise_filter_cache, 0, sizeof(s->denoise_filter_cache));
2022 AVCodec ff_wmavoice_decoder = {
2026 sizeof(WMAVoiceContext),
2027 wmavoice_decode_init,
2029 wmavoice_decode_end,
2030 wmavoice_decode_packet,
2031 CODEC_CAP_SUBFRAMES,
2032 .flush = wmavoice_flush,
2033 .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),