2 * Windows Media Audio Voice decoder.
3 * Copyright (c) 2009 Ronald S. Bultje
5 * This file is part of FFmpeg.
7 * FFmpeg is free software; you can redistribute it and/or
8 * modify it under the terms of the GNU Lesser General Public
9 * License as published by the Free Software Foundation; either
10 * version 2.1 of the License, or (at your option) any later version.
12 * FFmpeg is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
15 * Lesser General Public License for more details.
17 * You should have received a copy of the GNU Lesser General Public
18 * License along with FFmpeg; if not, write to the Free Software
19 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * @file libavcodec/wmavoice.c
24 * @brief Windows Media Audio Voice compatible decoder
25 * @author Ronald S. Bultje <rsbultje@gmail.com>
32 #include "wmavoice_data.h"
33 #include "celp_math.h"
34 #include "celp_filters.h"
35 #include "acelp_vectors.h"
36 #include "acelp_filters.h"
38 #include "libavutil/lzo.h"
40 #define MAX_BLOCKS 8 ///< maximum number of blocks per frame
41 #define MAX_LSPS 16 ///< maximum filter order
42 #define MAX_FRAMES 3 ///< maximum number of frames per superframe
43 #define MAX_FRAMESIZE 160 ///< maximum number of samples per frame
44 #define MAX_SIGNAL_HISTORY 416 ///< maximum excitation signal history
45 #define MAX_SFRAMESIZE (MAX_FRAMESIZE * MAX_FRAMES)
46 ///< maximum number of samples per superframe
47 #define SFRAME_CACHE_MAXSIZE 256 ///< maximum cache size for frame data that
48 ///< was split over two packets
49 #define VLC_NBITS 6 ///< number of bits to read per VLC iteration
52 * Frame type VLC coding.
54 static VLC frame_type_vlc;
/**
 * Adaptive codebook (ACB) types.
 */
enum {
    ACB_TYPE_NONE       = 0, ///< no adaptive codebook; only the hardcoded
                             ///< fixed codebook is used
    ACB_TYPE_ASYMMETRIC = 1, ///< adaptive codebook with one pitch value per
                             ///< frame, which is interpolated to obtain a
                             ///< per-sample pitch. The signal is generated
                             ///< using an asymmetric sinc
                             ///< @note see #wmavoice_ipol1_coeffs
    ACB_TYPE_HAMMING    = 2  ///< adaptive codebook with one pitch value per
                             ///< block; the signal is generated using a
                             ///< Hamming sinc window function
                             ///< @note see #wmavoice_ipol2_coeffs
};
/**
 * Fixed codebook (FCB) types.
 */
enum {
    FCB_TYPE_SILENCE    = 0, ///< comfort noise during silence, generated
                             ///< from a hardcoded (fixed) codebook with
                             ///< per-frame (low) gain values
    FCB_TYPE_HARDCODED  = 1, ///< hardcoded (fixed) codebook with per-block
                             ///< gain values
    FCB_TYPE_AW_PULSES  = 2, ///< pitch-adaptive window (AW) pulse signals,
                             ///< used in particular for low-bitrate streams
    FCB_TYPE_EXC_PULSES = 3, ///< innovation (fixed) codebook pulse sets,
                             ///< made up of single pulses and/or pulse
                             ///< pairs (see frame_type_desc.dbl_pulses)
};
88 * Description of frame types.
90 static const struct frame_type_desc {
91 uint8_t n_blocks; ///< amount of blocks per frame (each block
92 ///< (contains 160/#n_blocks samples)
93 uint8_t log_n_blocks; ///< log2(#n_blocks)
94 uint8_t acb_type; ///< Adaptive codebook type (ACB_TYPE_*)
95 uint8_t fcb_type; ///< Fixed codebook type (FCB_TYPE_*)
96 uint8_t dbl_pulses; ///< how many pulse vectors have pulse pairs
97 ///< (rather than just one single pulse)
98 ///< only if #fcb_type == #FCB_TYPE_EXC_PULSES
99 uint16_t frame_size; ///< the amount of bits that make up the block
100 ///< data (per frame)
101 } frame_descs[17] = {
102 { 1, 0, ACB_TYPE_NONE, FCB_TYPE_SILENCE, 0, 0 },
103 { 2, 1, ACB_TYPE_NONE, FCB_TYPE_HARDCODED, 0, 28 },
104 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_AW_PULSES, 0, 46 },
105 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 80 },
106 { 2, 1, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 104 },
107 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 0, 108 },
108 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 2, 132 },
109 { 4, 2, ACB_TYPE_ASYMMETRIC, FCB_TYPE_EXC_PULSES, 5, 168 },
110 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 64 },
111 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 80 },
112 { 2, 1, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 104 },
113 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 108 },
114 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 132 },
115 { 4, 2, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 168 },
116 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 0, 176 },
117 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 2, 208 },
118 { 8, 3, ACB_TYPE_HAMMING, FCB_TYPE_EXC_PULSES, 5, 256 }
122 * WMA Voice decoding context.
126 * @defgroup struct_global Global values
127 * Global values, specified in the stream header / extradata or used
131 GetBitContext gb; ///< packet bitreader. During decoder init,
132 ///< it contains the extradata from the
133 ///< demuxer. During decoding, it contains
135 int8_t vbm_tree[25]; ///< converts VLC codes to frame type
137 int spillover_bitsize; ///< number of bits used to specify
138 ///< #spillover_nbits in the packet header
139 ///< = ceil(log2(ctx->block_align << 3))
140 int history_nsamples; ///< number of samples in history for signal
141 ///< prediction (through ACB)
143 int do_apf; ///< whether to apply the averaged
144 ///< projection filter (APF)
146 int lsps; ///< number of LSPs per frame [10 or 16]
147 int lsp_q_mode; ///< defines quantizer defaults [0, 1]
148 int lsp_def_mode; ///< defines different sets of LSP defaults
150 int frame_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
151 ///< per-frame (independent coding)
152 int sframe_lsp_bitsize; ///< size (in bits) of LSPs, when encoded
153 ///< per superframe (residual coding)
155 int min_pitch_val; ///< base value for pitch parsing code
156 int max_pitch_val; ///< max value + 1 for pitch parsing
157 int pitch_nbits; ///< number of bits used to specify the
158 ///< pitch value in the frame header
159 int block_pitch_nbits; ///< number of bits used to specify the
160 ///< first block's pitch value
161 int block_pitch_range; ///< range of the block pitch
162 int block_delta_pitch_nbits; ///< number of bits used to specify the
163 ///< delta pitch between this and the last
164 ///< block's pitch value, used in all but
166 int block_delta_pitch_hrange; ///< 1/2 range of the delta (full range is
167 ///< from -this to +this-1)
168 uint16_t block_conv_table[4]; ///< boundaries for block pitch unit/scale
173 * @defgroup struct_packet Packet values
174 * Packet values, specified in the packet header or related to a packet.
175 * A packet is considered to be a single unit of data provided to this
176 * decoder by the demuxer.
179 int spillover_nbits; ///< number of bits of the previous packet's
180 ///< last superframe preceeding this
181 ///< packet's first full superframe (useful
182 ///< for re-synchronization also)
183 int has_residual_lsps; ///< if set, superframes contain one set of
184 ///< LSPs that cover all frames, encoded as
185 ///< independent and residual LSPs; if not
186 ///< set, each frame contains its own, fully
187 ///< independent, LSPs
188 int skip_bits_next; ///< number of bits to skip at the next call
189 ///< to #wmavoice_decode_packet() (since
190 ///< they're part of the previous superframe)
192 uint8_t sframe_cache[SFRAME_CACHE_MAXSIZE + FF_INPUT_BUFFER_PADDING_SIZE];
193 ///< cache for superframe data split over
194 ///< multiple packets
195 int sframe_cache_size; ///< set to >0 if we have data from an
196 ///< (incomplete) superframe from a previous
197 ///< packet that spilled over in the current
198 ///< packet; specifies the amount of bits in
200 PutBitContext pb; ///< bitstream writer for #sframe_cache
204 * @defgroup struct_frame Frame and superframe values
205 * Superframe and frame data - these can change from frame to frame,
206 * although some of them do in that case serve as a cache / history for
207 * the next frame or superframe.
210 double prev_lsps[MAX_LSPS]; ///< LSPs of the last frame of the previous
212 int last_pitch_val; ///< pitch value of the previous frame
213 int last_acb_type; ///< frame type [0-2] of the previous frame
214 int pitch_diff_sh16; ///< ((cur_pitch_val - #last_pitch_val)
215 ///< << 16) / #MAX_FRAMESIZE
216 float silence_gain; ///< set for use in blocks if #ACB_TYPE_NONE
218 int aw_idx_is_ext; ///< whether the AW index was encoded in
219 ///< 8 bits (instead of 6)
220 int aw_pulse_range; ///< the range over which #aw_pulse_set1()
221 ///< can apply the pulse, relative to the
222 ///< value in aw_first_pulse_off. The exact
223 ///< position of the first AW-pulse is within
224 ///< [pulse_off, pulse_off + this], and
225 ///< depends on bitstream values; [16 or 24]
226 int aw_n_pulses[2]; ///< number of AW-pulses in each block; note
227 ///< that this number can be negative (in
228 ///< which case it basically means "zero")
229 int aw_first_pulse_off[2]; ///< index of first sample to which to
230 ///< apply AW-pulses, or -0xff if unset
231 int aw_next_pulse_off_cache; ///< the position (relative to start of the
232 ///< second block) at which pulses should
233 ///< start to be positioned, serves as a
234 ///< cache for pitch-adaptive window pulses
237 int frame_cntr; ///< current frame index [0 - 0xFFFE]; is
238 ///< only used for comfort noise in #pRNG()
239 float gain_pred_err[6]; ///< cache for gain prediction
240 float excitation_history[MAX_SIGNAL_HISTORY];
241 ///< cache of the signal of previous
242 ///< superframes, used as a history for
243 ///< signal generation
244 float synth_history[MAX_LSPS]; ///< see #excitation_history
251 * Sets up the variable bit mode (VBM) tree from container extradata.
252 * @param gb bit I/O context.
253 * The bit context (s->gb) should be loaded with byte 23-46 of the
254 * container extradata (i.e. the ones containing the VBM tree).
255 * @param vbm_tree pointer to array to which the decoded VBM tree will be
257 * @return 0 on success, <0 on error.
259 static av_cold int decode_vbmtree(GetBitContext *gb, int8_t vbm_tree[25])
261 static const uint8_t bits[] = {
264 10, 10, 10, 12, 12, 12,
267 static const uint16_t codes[] = {
268 0x0000, 0x0001, 0x0002, // 00/01/10
269 0x000c, 0x000d, 0x000e, // 11+00/01/10
270 0x003c, 0x003d, 0x003e, // 1111+00/01/10
271 0x00fc, 0x00fd, 0x00fe, // 111111+00/01/10
272 0x03fc, 0x03fd, 0x03fe, // 11111111+00/01/10
273 0x0ffc, 0x0ffd, 0x0ffe, // 1111111111+00/01/10
274 0x3ffc, 0x3ffd, 0x3ffe, 0x3fff // 111111111111+xx
278 memset(vbm_tree, 0xff, sizeof(vbm_tree));
279 memset(cntr, 0, sizeof(cntr));
280 for (n = 0; n < 17; n++) {
281 res = get_bits(gb, 3);
282 if (cntr[res] > 3) // should be >= 3 + (res == 7))
284 vbm_tree[res * 3 + cntr[res]++] = n;
286 INIT_VLC_STATIC(&frame_type_vlc, VLC_NBITS, sizeof(bits),
287 bits, 1, 1, codes, 2, 2, 132);
292 * Set up decoder with parameters from demuxer (extradata etc.).
294 static av_cold int wmavoice_decode_init(AVCodecContext *ctx)
296 int n, flags, pitch_range, lsp16_flag;
297 WMAVoiceContext *s = ctx->priv_data;
301 * - byte 0-18: WMAPro-in-WMAVoice extradata (see wmaprodec.c),
302 * - byte 19-22: flags field (annoyingly in LE; see below for known
304 * - byte 23-46: variable bitmode tree (really just 17 * 3 bits,
307 if (ctx->extradata_size != 46) {
308 av_log(ctx, AV_LOG_ERROR,
309 "Invalid extradata size %d (should be 46)\n",
310 ctx->extradata_size);
313 flags = AV_RL32(ctx->extradata + 18);
314 s->spillover_bitsize = 3 + av_ceil_log2(ctx->block_align);
315 s->do_apf = flags & 0x1;
316 s->lsp_q_mode = !!(flags & 0x2000);
317 s->lsp_def_mode = !!(flags & 0x4000);
318 lsp16_flag = flags & 0x1000;
321 s->frame_lsp_bitsize = 34;
322 s->sframe_lsp_bitsize = 60;
325 s->frame_lsp_bitsize = 24;
326 s->sframe_lsp_bitsize = 48;
328 for (n = 0; n < s->lsps; n++)
329 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
331 init_get_bits(&s->gb, ctx->extradata + 22, (ctx->extradata_size - 22) << 3);
332 if (decode_vbmtree(&s->gb, s->vbm_tree) < 0) {
333 av_log(ctx, AV_LOG_ERROR, "Invalid VBM tree; broken extradata?\n");
337 s->min_pitch_val = ((ctx->sample_rate << 8) / 400 + 50) >> 8;
338 s->max_pitch_val = ((ctx->sample_rate << 8) * 37 / 2000 + 50) >> 8;
339 pitch_range = s->max_pitch_val - s->min_pitch_val;
340 s->pitch_nbits = av_ceil_log2(pitch_range);
341 s->last_pitch_val = 40;
342 s->last_acb_type = ACB_TYPE_NONE;
343 s->history_nsamples = s->max_pitch_val + 8;
345 if (s->min_pitch_val < 1 || s->history_nsamples > MAX_SIGNAL_HISTORY) {
346 int min_sr = ((((1 << 8) - 50) * 400) + 0xFF) >> 8,
347 max_sr = ((((MAX_SIGNAL_HISTORY - 8) << 8) + 205) * 2000 / 37) >> 8;
349 av_log(ctx, AV_LOG_ERROR,
350 "Unsupported samplerate %d (min=%d, max=%d)\n",
351 ctx->sample_rate, min_sr, max_sr); // 322-22097 Hz
356 s->block_conv_table[0] = s->min_pitch_val;
357 s->block_conv_table[1] = (pitch_range * 25) >> 6;
358 s->block_conv_table[2] = (pitch_range * 44) >> 6;
359 s->block_conv_table[3] = s->max_pitch_val - 1;
360 s->block_delta_pitch_hrange = (pitch_range >> 3) & ~0xF;
361 s->block_delta_pitch_nbits = 1 + av_ceil_log2(s->block_delta_pitch_hrange);
362 s->block_pitch_range = s->block_conv_table[2] +
363 s->block_conv_table[3] + 1 +
364 2 * (s->block_conv_table[1] - 2 * s->min_pitch_val);
365 s->block_pitch_nbits = av_ceil_log2(s->block_pitch_range);
367 ctx->sample_fmt = SAMPLE_FMT_FLT;
/**
 * Dequantize LSPs by summing the contributions of several codebook stages.
 *
 * For each stage, one @p num-entry vector is picked from that stage's
 * codebook, and base + multiplier * entry is added to every output LSP.
 *
 * @param lsps output pointer to the array that will hold the LSPs
 * @param num number of LSPs to be dequantized
 * @param values quantized values, contains n_stages values
 * @param sizes range (i.e. max value) of each quantized value
 * @param n_stages number of dequantization runs
 * @param table dequantization table to be used; the codebooks of the
 *              individual stages are stored back to back
 * @param mul_q LSF multiplier, one per stage
 * @param base_q base (lowest) LSF values, one per stage
 */
static void dequant_lsps(double *lsps, int num,
                         const uint16_t *values,
                         const uint16_t *sizes,
                         int n_stages, const uint8_t *table,
                         const double *mul_q,
                         const double *base_q)
{
    int stage, i;

    memset(lsps, 0, num * sizeof(*lsps));

    for (stage = 0; stage < n_stages; stage++) {
        const uint8_t *vec    = table + values[stage] * num;
        const double   offset = base_q[stage];
        const double   scale  = mul_q[stage];

        for (i = 0; i < num; i++)
            lsps[i] += offset + scale * vec[i];

        /* step over this stage's whole codebook (sizes[stage] vectors) */
        table += sizes[stage] * num;
    }
}
405 * @defgroup lsp_dequant LSP dequantization routines
406 * LSP dequantization routines, for 10/16LSPs and independent/residual coding.
407 * @note we assume enough bits are available, caller should check.
408 * lsp10i() consumes 24 bits; lsp10r() consumes an additional 24 bits;
409 * lsp16i() consumes 34 bits; lsp16r() consumes an additional 26 bits.
413 * Parse 10 independently-coded LSPs.
415 static void dequant_lsp10i(GetBitContext *gb, double *lsps)
417 static const uint16_t vec_sizes[4] = { 256, 64, 32, 32 };
418 static const double mul_lsf[4] = {
419 5.2187144800e-3, 1.4626986422e-3,
420 9.6179549166e-4, 1.1325736225e-3
422 static const double base_lsf[4] = {
423 M_PI * -2.15522e-1, M_PI * -6.1646e-2,
424 M_PI * -3.3486e-2, M_PI * -5.7408e-2
428 v[0] = get_bits(gb, 8);
429 v[1] = get_bits(gb, 6);
430 v[2] = get_bits(gb, 5);
431 v[3] = get_bits(gb, 5);
433 dequant_lsps(lsps, 10, v, vec_sizes, 4, wmavoice_dq_lsp10i,
438 * Parse 10 independently-coded LSPs, and then derive the tables to
439 * generate LSPs for the other frames from them (residual coding).
441 static void dequant_lsp10r(GetBitContext *gb,
442 double *i_lsps, const double *old,
443 double *a1, double *a2, int q_mode)
445 static const uint16_t vec_sizes[3] = { 128, 64, 64 };
446 static const double mul_lsf[3] = {
447 2.5807601174e-3, 1.2354460219e-3, 1.1763821673e-3
449 static const double base_lsf[3] = {
450 M_PI * -1.07448e-1, M_PI * -5.2706e-2, M_PI * -5.1634e-2
452 const float (*ipol_tab)[2][10] = q_mode ?
453 wmavoice_lsp10_intercoeff_b : wmavoice_lsp10_intercoeff_a;
454 uint16_t interpol, v[3];
457 dequant_lsp10i(gb, i_lsps);
459 interpol = get_bits(gb, 5);
460 v[0] = get_bits(gb, 7);
461 v[1] = get_bits(gb, 6);
462 v[2] = get_bits(gb, 6);
464 for (n = 0; n < 10; n++) {
465 double delta = old[n] - i_lsps[n];
466 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
467 a1[10 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
470 dequant_lsps(a2, 20, v, vec_sizes, 3, wmavoice_dq_lsp10r,
/* NOTE(review): this listing carries a stray source-line number at the start
 * of every line and has lost lines: the braces, the declaration of v[], and
 * the fifth entry of both mul_lsf[] and base_lsf[] (each is declared with
 * five elements but only four values are visible). Restore the block from
 * the upstream file before building; the notes below only describe what is
 * visible here. */
475 * Parse 16 independently-coded LSPs.
477 static void dequant_lsp16i(GetBitContext *gb, double *lsps)
/* number of vectors in the codebook of each of the five stages */
479 static const uint16_t vec_sizes[5] = { 256, 64, 128, 64, 128 };
/* per-stage multiplier applied to the codebook entries */
480 static const double mul_lsf[5] = {
481 3.3439586280e-3, 6.9908173703e-4,
482 3.3216608306e-3, 1.0334960326e-3,
/* per-stage base value added to every dequantized LSP */
485 static const double base_lsf[5] = {
486 M_PI * -1.27576e-1, M_PI * -2.4292e-2,
487 M_PI * -1.28094e-1, M_PI * -3.2128e-2,
/* read the five codebook indices: 8 + 6 + 7 + 6 + 7 = 34 bits */
492 v[0] = get_bits(gb, 8);
493 v[1] = get_bits(gb, 6);
494 v[2] = get_bits(gb, 7);
495 v[3] = get_bits(gb, 6);
496 v[4] = get_bits(gb, 7);
/* The 16 LSPs are dequantized in three groups: LSPs 0-4 from two stages
 * (v[0], v[1]), LSPs 5-9 from two stages (v[2], v[3]) and LSPs 10-15 from a
 * single stage (v[4]); each group has its own table. */
498 dequant_lsps( lsps, 5, v, vec_sizes, 2,
499 wmavoice_dq_lsp16i1, mul_lsf, base_lsf);
500 dequant_lsps(&lsps[5], 5, &v[2], &vec_sizes[2], 2,
501 wmavoice_dq_lsp16i2, &mul_lsf[2], &base_lsf[2]);
502 dequant_lsps(&lsps[10], 6, &v[4], &vec_sizes[4], 1,
503 wmavoice_dq_lsp16i3, &mul_lsf[4], &base_lsf[4]);
507 * Parse 16 independently-coded LSPs, and then derive the tables to
508 * generate LSPs for the other frames from them (residual coding).
510 static void dequant_lsp16r(GetBitContext *gb,
511 double *i_lsps, const double *old,
512 double *a1, double *a2, int q_mode)
514 static const uint16_t vec_sizes[3] = { 128, 128, 128 };
515 static const double mul_lsf[3] = {
516 1.2232979501e-3, 1.4062241527e-3, 1.6114744851e-3
518 static const double base_lsf[3] = {
519 M_PI * -5.5830e-2, M_PI * -5.2908e-2, M_PI * -5.4776e-2
521 const float (*ipol_tab)[2][16] = q_mode ?
522 wmavoice_lsp16_intercoeff_b : wmavoice_lsp16_intercoeff_a;
523 uint16_t interpol, v[3];
526 dequant_lsp16i(gb, i_lsps);
528 interpol = get_bits(gb, 5);
529 v[0] = get_bits(gb, 7);
530 v[1] = get_bits(gb, 7);
531 v[2] = get_bits(gb, 7);
533 for (n = 0; n < 16; n++) {
534 double delta = old[n] - i_lsps[n];
535 a1[n] = ipol_tab[interpol][0][n] * delta + i_lsps[n];
536 a1[16 + n] = ipol_tab[interpol][1][n] * delta + i_lsps[n];
539 dequant_lsps( a2, 10, v, vec_sizes, 1,
540 wmavoice_dq_lsp16r1, mul_lsf, base_lsf);
541 dequant_lsps(&a2[10], 10, &v[1], &vec_sizes[1], 1,
542 wmavoice_dq_lsp16r2, &mul_lsf[1], &base_lsf[1]);
543 dequant_lsps(&a2[20], 12, &v[2], &vec_sizes[2], 1,
544 wmavoice_dq_lsp16r3, &mul_lsf[2], &base_lsf[2]);
549 * @defgroup aw Pitch-adaptive window coding functions
550 * The next few functions are for pitch-adaptive window coding.
554 * Parse the offset of the first pitch-adaptive window pulses, and
555 * the distribution of pulses between the two blocks in this frame.
556 * @param s WMA Voice decoding context private data
557 * @param gb bit I/O context
558 * @param pitch pitch for each block in this frame
560 static void aw_parse_coords(WMAVoiceContext *s, GetBitContext *gb,
563 static const int16_t start_offset[94] = {
564 -11, -9, -7, -5, -3, -1, 1, 3, 5, 7, 9, 11,
565 13, 15, 18, 17, 19, 20, 21, 22, 23, 24, 25, 26,
566 27, 28, 29, 30, 31, 32, 33, 35, 37, 39, 41, 43,
567 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67,
568 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91,
569 93, 95, 97, 99, 101, 103, 105, 107, 109, 111, 113, 115,
570 117, 119, 121, 123, 125, 127, 129, 131, 133, 135, 137, 139,
571 141, 143, 145, 147, 149, 151, 153, 155, 157, 159
575 /* position of pulse */
576 s->aw_idx_is_ext = 0;
577 if ((bits = get_bits(gb, 6)) >= 54) {
578 s->aw_idx_is_ext = 1;
579 bits += (bits - 54) * 3 + get_bits(gb, 2);
582 /* for a repeated pulse at pulse_off with a pitch_lag of pitch[], count
583 * the distribution of the pulses in each block contained in this frame. */
584 s->aw_pulse_range = FFMIN(pitch[0], pitch[1]) > 32 ? 24 : 16;
585 for (offset = start_offset[bits]; offset < 0; offset += pitch[0]) ;
586 s->aw_n_pulses[0] = (pitch[0] - 1 + MAX_FRAMESIZE / 2 - offset) / pitch[0];
587 s->aw_first_pulse_off[0] = offset - s->aw_pulse_range / 2;
588 offset += s->aw_n_pulses[0] * pitch[0];
589 s->aw_n_pulses[1] = (pitch[1] - 1 + MAX_FRAMESIZE - offset) / pitch[1];
590 s->aw_first_pulse_off[1] = offset - (MAX_FRAMESIZE + s->aw_pulse_range) / 2;
592 /* if continuing from a position before the block, reset position to
593 * start of block (when corrected for the range over which it can be
594 * spread in aw_pulse_set1()). */
595 if (start_offset[bits] < MAX_FRAMESIZE / 2) {
596 while (s->aw_first_pulse_off[1] - pitch[1] + s->aw_pulse_range > 0)
597 s->aw_first_pulse_off[1] -= pitch[1];
598 if (start_offset[bits] < 0)
599 while (s->aw_first_pulse_off[0] - pitch[0] + s->aw_pulse_range > 0)
600 s->aw_first_pulse_off[0] -= pitch[0];
605 * Apply second set of pitch-adaptive window pulses.
606 * @param s WMA Voice decoding context private data
607 * @param gb bit I/O context
608 * @param block_idx block index in frame [0, 1]
609 * @param fcb structure containing fixed codebook vector info
611 static void aw_pulse_set2(WMAVoiceContext *s, GetBitContext *gb,
612 int block_idx, AMRFixed *fcb)
614 uint16_t use_mask[7]; // only 5 are used, rest is padding
615 /* in this function, idx is the index in the 80-bit (+ padding) use_mask
616 * bit-array. Since use_mask consists of 16-bit values, the lower 4 bits
617 * of idx are the position of the bit within a particular item in the
618 * array (0 being the most significant bit, and 15 being the least
619 * significant bit), and the remainder (>> 4) is the index in the
620 * use_mask[]-array. This is faster and uses less memory than using a
621 * 80-byte/80-int array. */
622 int pulse_off = s->aw_first_pulse_off[block_idx],
623 pulse_start, n, idx, range, aidx, start_off = 0;
625 /* set offset of first pulse to within this block */
626 if (s->aw_n_pulses[block_idx] > 0)
627 while (pulse_off + s->aw_pulse_range < 1)
628 pulse_off += fcb->pitch_lag;
630 /* find range per pulse */
631 if (s->aw_n_pulses[0] > 0) {
632 if (block_idx == 0) {
634 } else /* block_idx = 1 */ {
636 if (s->aw_n_pulses[block_idx] > 0)
637 pulse_off = s->aw_next_pulse_off_cache;
641 pulse_start = s->aw_n_pulses[block_idx] > 0 ? pulse_off - range / 2 : 0;
643 /* aw_pulse_set1() already applies pulses around pulse_off (to be exactly,
644 * in the range of [pulse_off, pulse_off + s->aw_pulse_range], and thus
645 * we exclude that range from being pulsed again in this function. */
646 memset( use_mask, -1, 5 * sizeof(use_mask[0]));
647 memset(&use_mask[5], 0, 2 * sizeof(use_mask[0]));
648 if (s->aw_n_pulses[block_idx] > 0)
649 for (idx = pulse_off; idx < MAX_FRAMESIZE / 2; idx += fcb->pitch_lag) {
650 int excl_range = s->aw_pulse_range; // always 16 or 24
651 uint16_t *use_mask_ptr = &use_mask[idx >> 4];
652 int first_sh = 16 - (idx & 15);
653 *use_mask_ptr++ &= 0xFFFF << first_sh;
654 excl_range -= first_sh;
655 if (excl_range >= 16) {
657 *use_mask_ptr &= 0xFFFF >> (excl_range - 16);
659 *use_mask_ptr &= 0xFFFF >> excl_range;
662 /* find the 'aidx'th offset that is not excluded */
663 aidx = get_bits(gb, s->aw_n_pulses[0] > 0 ? 5 - 2 * block_idx : 4);
664 for (n = 0; n <= aidx; pulse_start++) {
665 for (idx = pulse_start; idx < 0; idx += fcb->pitch_lag) ;
666 if (idx >= MAX_FRAMESIZE / 2) { // find from zero
667 if (use_mask[0]) idx = 0x0F;
668 else if (use_mask[1]) idx = 0x1F;
669 else if (use_mask[2]) idx = 0x2F;
670 else if (use_mask[3]) idx = 0x3F;
671 else if (use_mask[4]) idx = 0x4F;
673 idx -= av_log2_16bit(use_mask[idx >> 4]);
675 if (use_mask[idx >> 4] & (0x8000 >> (idx & 15))) {
676 use_mask[idx >> 4] &= ~(0x8000 >> (idx & 15));
682 fcb->x[fcb->n] = start_off;
683 fcb->y[fcb->n] = get_bits1(gb) ? -1.0 : 1.0;
686 /* set offset for next block, relative to start of that block */
687 n = (MAX_FRAMESIZE / 2 - start_off) % fcb->pitch_lag;
688 s->aw_next_pulse_off_cache = n ? fcb->pitch_lag - n : 0;
692 * Apply first set of pitch-adaptive window pulses.
693 * @param s WMA Voice decoding context private data
694 * @param gb bit I/O context
695 * @param block_idx block index in frame [0, 1]
696 * @param fcb storage location for fixed codebook pulse info
698 static void aw_pulse_set1(WMAVoiceContext *s, GetBitContext *gb,
699 int block_idx, AMRFixed *fcb)
701 int val = get_bits(gb, 12 - 2 * (s->aw_idx_is_ext && !block_idx));
704 if (s->aw_n_pulses[block_idx] > 0) {
705 int n, v_mask, i_mask, sh, n_pulses;
707 if (s->aw_pulse_range == 24) { // 3 pulses, 1:sign + 3:index each
712 } else { // 4 pulses, 1:sign + 2:index each
719 for (n = n_pulses - 1; n >= 0; n--, val >>= sh) {
720 fcb->y[fcb->n] = (val & v_mask) ? -1.0 : 1.0;
721 fcb->x[fcb->n] = (val & i_mask) * n_pulses + n +
722 s->aw_first_pulse_off[block_idx];
723 while (fcb->x[fcb->n] < 0)
724 fcb->x[fcb->n] += fcb->pitch_lag;
725 if (fcb->x[fcb->n] < MAX_FRAMESIZE / 2)
729 int num2 = (val & 0x1FF) >> 1, delta, idx;
731 if (num2 < 1 * 79) { delta = 1; idx = num2 + 1; }
732 else if (num2 < 2 * 78) { delta = 3; idx = num2 + 1 - 1 * 77; }
733 else if (num2 < 3 * 77) { delta = 5; idx = num2 + 1 - 2 * 76; }
734 else { delta = 7; idx = num2 + 1 - 3 * 75; }
735 v = (val & 0x200) ? -1.0 : 1.0;
737 fcb->no_repeat_mask |= 3 << fcb->n;
738 fcb->x[fcb->n] = idx - delta;
740 fcb->x[fcb->n + 1] = idx;
741 fcb->y[fcb->n + 1] = (val & 1) ? -v : v;
/**
 * Generate a pseudo-random number from frame_cntr and block_num, which can
 * be used as a start index in a table of size 1000 from which block_size
 * entries are to be read.
 *
 * @param frame_cntr current frame number
 * @param block_num current block index
 * @param block_size number of entries we want to read from a table
 *                   that has 1000 entries
 * @returns a (non-)random number, computed modulo (1000 - block_size).
 */
static int pRNG(int frame_cntr, int block_num, int block_size)
{
    /* array to simplify the calculation of z:
     * y = (x % 9) * 5 + 6;
     * z = (49995 * x) / y;
     * Since y only has 9 values, we can remove the division by using a
     * LUT and using FASTDIV-style divisions. For each of the 9 values
     * of y, we can rewrite z as:
     * z = x * (49995 / y) + x * ((49995 % y) / y)
     * In this table, each row represents one possible value of y, the
     * first number is 49995 / y, and the second is the FASTDIV variant
     * of 49995 % y / y. */
    static const unsigned int div_tbl[9][2] = {
        { 8332,  3 * 715827883U }, // y =  6
        { 4545,  0 * 390451573U }, // y = 11
        { 3124, 11 * 268435456U }, // y = 16
        { 2380, 15 * 204522253U }, // y = 21
        { 1922, 23 * 165191050U }, // y = 26
        { 1612, 23 * 138547333U }, // y = 31
        { 1388, 27 * 119304648U }, // y = 36
        { 1219, 16 * 104755300U }, // y = 41
        { 1086, 39 *  93368855U }  // y = 46
    };
    unsigned int seed, row, quot;

    seed = MUL16(block_num, 1877) + frame_cntr;
    /* max value of seed is 8*1877+0xFFFE=0x13AA6, so a single subtraction
     * is effectively a modulo (%) */
    if (seed >= 0xFFFF)
        seed -= 0xFFFF;

    row  = seed - 9 * MULH(477218589, seed); // seed % 9
    /* quot = seed * 49995 / (row * 5 + 6), truncated to 16 bits */
    quot = (uint16_t) (seed * div_tbl[row][0] + UMULH(seed, div_tbl[row][1]));

    return quot % (1000 - block_size);
}
792 * Parse hardcoded signal for a single block.
793 * @note see #synth_block().
795 static void synth_block_hardcoded(WMAVoiceContext *s, GetBitContext *gb,
796 int block_idx, int size,
797 const struct frame_type_desc *frame_desc,
803 assert(size <= MAX_FRAMESIZE);
805 /* Set the offset from which we start reading wmavoice_std_codebook */
806 if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
807 r_idx = pRNG(s->frame_cntr, block_idx, size);
808 gain = s->silence_gain;
809 } else /* FCB_TYPE_HARDCODED */ {
810 r_idx = get_bits(gb, 8);
811 gain = wmavoice_gain_universal[get_bits(gb, 6)];
814 /* Clear gain prediction parameters */
815 memset(s->gain_pred_err, 0, sizeof(s->gain_pred_err));
817 /* Apply gain to hardcoded codebook and use that as excitation signal */
818 for (n = 0; n < size; n++)
819 excitation[n] = wmavoice_std_codebook[r_idx + n] * gain;
/* NOTE(review): this listing carries a stray source-line number at the start
 * of every line and has lost lines: braces, two parameters of the signature
 * (block_pitch_sh2 and excitation are used below but not declared in what is
 * visible), the declarations of fcb, sign, pos1, pos2, len and
 * next_idx_sh16, and the trailing arguments of the av_clip() and
 * ff_acelp_interpolatef() calls. Restore the block from the upstream file
 * before building; the notes below only describe what is visible here. */
823 * Parse FCB/ACB signal for a single block.
824 * @note see #synth_block().
826 static void synth_block_fcb_acb(WMAVoiceContext *s, GetBitContext *gb,
827 int block_idx, int size,
829 const struct frame_type_desc *frame_desc,
/* weights applied to the six cached gain prediction errors */
832 static const float gain_coeff[6] = {
833 0.8169, -0.06545, 0.1726, 0.0185, -0.0359, 0.0458
835 float pulses[MAX_FRAMESIZE / 2], pred_err, acb_gain, fcb_gain;
836 int n, idx, gain_weight;
/* pulses[] holds MAX_FRAMESIZE / 2 entries, hence the limit on size */
839 assert(size <= MAX_FRAMESIZE / 2);
840 memset(pulses, 0, sizeof(*pulses) * size);
/* block_pitch_sh2 carries two fractional bits; the pitch lag is its integer
 * part */
842 fcb.pitch_lag = block_pitch_sh2 >> 2;
844 fcb.no_repeat_mask = 0;
847 /* For the other frame types, this is where we apply the innovation
848 * (fixed) codebook pulses of the speech signal. */
849 if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
850 aw_pulse_set1(s, gb, block_idx, &fcb);
851 aw_pulse_set2(s, gb, block_idx, &fcb);
852 } else /* FCB_TYPE_EXC_PULSES */ {
/* bits per pulse position: 4, 3 or 2 for 2, 4 or 8 blocks per frame */
853 int offset_nbits = 5 - frame_desc->log_n_blocks;
855 fcb.no_repeat_mask = -1;
856 /* similar to ff_decode_10_pulses_35bits(), but with single pulses
857 * (instead of double) for a subset of pulses */
/* Five interleaved tracks: a pulse on track n sits at sample n + 5 * pos.
 * Each track has a sign bit and a position; the first dbl_pulses tracks
 * carry a second position whose pulse gets the opposite sign when
 * pos1 < pos2. */
858 for (n = 0; n < 5; n++) {
862 sign = get_bits1(gb) ? 1.0 : -1.0;
863 pos1 = get_bits(gb, offset_nbits);
864 fcb.x[fcb.n] = n + 5 * pos1;
865 fcb.y[fcb.n++] = sign;
866 if (n < frame_desc->dbl_pulses) {
867 pos2 = get_bits(gb, offset_nbits);
868 fcb.x[fcb.n] = n + 5 * pos2;
869 fcb.y[fcb.n++] = (pos1 < pos2) ? -sign : sign;
873 ff_set_fixed_vector(pulses, &fcb, 1.0, size);
875 /* Calculate gain for adaptive & fixed codebook signal.
876 * see ff_amr_set_fixed_gain(). */
/* one 7-bit index selects both the ACB gain and the FCB gain term */
877 idx = get_bits(gb, 7);
878 fcb_gain = expf(ff_dot_productf(s->gain_pred_err, gain_coeff, 6) -
879 5.2409161640 + wmavoice_gain_codebook_fcb[idx]);
880 acb_gain = wmavoice_gain_codebook_acb[idx];
881 pred_err = av_clipf(wmavoice_gain_codebook_fcb[idx],
882 -2.9957322736 /* log(0.05) */,
883 1.6094379124 /* log(5.0) */);
/* Shift the prediction error history by gain_weight entries (4, 2 or 1 for
 * log_n_blocks of 1, 2 or 3) and fill the freed front entries with the new
 * error. */
885 gain_weight = 8 >> frame_desc->log_n_blocks;
886 memmove(&s->gain_pred_err[gain_weight], s->gain_pred_err,
887 sizeof(*s->gain_pred_err) * (6 - gain_weight));
888 for (n = 0; n < gain_weight; n++)
889 s->gain_pred_err[n] = pred_err;
891 /* Calculation of adaptive codebook */
892 if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
/* The pitch is interpolated per sample in 16.16 fixed point from
 * last_pitch_val using pitch_diff_sh16; the block is processed in runs of
 * len samples. */
894 for (n = 0; n < size; n += len) {
896 int abs_idx = block_idx * size + n;
897 int pitch_sh16 = (s->last_pitch_val << 16) +
898 s->pitch_diff_sh16 * abs_idx;
899 int pitch = (pitch_sh16 + 0x6FFF) >> 16;
900 int idx_sh16 = ((pitch << 16) - pitch_sh16) * 8 + 0x58000;
901 idx = idx_sh16 >> 16;
902 if (s->pitch_diff_sh16) {
903 if (s->pitch_diff_sh16 > 0) {
904 next_idx_sh16 = (idx_sh16) &~ 0xFFFF;
906 next_idx_sh16 = (idx_sh16 + 0x10000) &~ 0xFFFF;
907 len = av_clip((idx_sh16 - next_idx_sh16) / s->pitch_diff_sh16 / 8,
/* the source samples lie pitch samples back in the excitation buffer */
912 ff_acelp_interpolatef(&excitation[n], &excitation[n - pitch],
913 wmavoice_ipol1_coeffs, 17,
916 } else /* ACB_TYPE_HAMMING */ {
/* integer part and 2-bit fraction of the per-block pitch */
917 int block_pitch = block_pitch_sh2 >> 2;
918 idx = block_pitch_sh2 & 3;
920 ff_acelp_interpolatef(excitation, &excitation[-block_pitch],
921 wmavoice_ipol2_coeffs, 4,
924 av_memcpy_backptr(excitation, sizeof(float) * block_pitch,
925 sizeof(float) * size);
928 /* Interpolate ACB/FCB and use as excitation signal */
929 ff_weighted_vector_sumf(excitation, excitation, pulses,
930 acb_gain, fcb_gain, size);
934 * Parse data in a single block.
935 * @note we assume enough bits are available, caller should check.
937 * @param s WMA Voice decoding context private data
938 * @param gb bit I/O context
939 * @param block_idx index of the to-be-read block
940 * @param size amount of samples to be read in this block
941 * @param block_pitch_sh2 pitch for this block << 2
942 * @param lsps LSPs for (the end of) this frame
943 * @param prev_lsps LSPs for the last frame
944 * @param frame_desc frame type descriptor
945 * @param excitation target memory for the ACB+FCB interpolated signal
946 * @param synth target memory for the speech synthesis filter output
947 * @return 0 on success, <0 on error.
949 static void synth_block(WMAVoiceContext *s, GetBitContext *gb,
950 int block_idx, int size,
952 const double *lsps, const double *prev_lsps,
953 const struct frame_type_desc *frame_desc,
954 float *excitation, float *synth)
956 double i_lsps[MAX_LSPS];
957 float lpcs[MAX_LSPS];
961 if (frame_desc->acb_type == ACB_TYPE_NONE)
962 synth_block_hardcoded(s, gb, block_idx, size, frame_desc, excitation);
964 synth_block_fcb_acb(s, gb, block_idx, size, block_pitch_sh2,
965 frame_desc, excitation);
967 /* convert interpolated LSPs to LPCs */
968 fac = (block_idx + 0.5) / frame_desc->n_blocks;
969 for (n = 0; n < s->lsps; n++) // LSF -> LSP
970 i_lsps[n] = cos(prev_lsps[n] + fac * (lsps[n] - prev_lsps[n]));
971 ff_acelp_lspd2lpc(i_lsps, lpcs, s->lsps >> 1);
973 /* Speech synthesis */
974 ff_celp_lp_synthesis_filterf(synth, lpcs, excitation, size, s->lsps);
978 * Synthesize output samples for a single frame.
979 * @note we assume enough bits are available, caller should check.
981 * @param ctx WMA Voice decoder context
982 * @param gb bit I/O context (s->gb or one for cross-packet superframes)
983 * @param samples pointer to output sample buffer, has space for at least 160
985 * @param lsps LSP array
986 * @param prev_lsps array of previous frame's LSPs
987 * @param excitation target buffer for excitation signal
988 * @param synth target buffer for synthesized speech data
989 * @return 0 on success, <0 on error.
991 static int synth_frame(AVCodecContext *ctx, GetBitContext *gb,
993 const double *lsps, const double *prev_lsps,
994 float *excitation, float *synth)
996 WMAVoiceContext *s = ctx->priv_data;
997 int n, n_blocks_x2, log_n_blocks_x2, cur_pitch_val;
998 int pitch[MAX_BLOCKS], last_block_pitch;
1000 /* Parse frame type ("frame header"), see frame_descs */
1001 int bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)],
1002 block_nsamples = MAX_FRAMESIZE / frame_descs[bd_idx].n_blocks;
1005 av_log(ctx, AV_LOG_ERROR,
1006 "Invalid frame type VLC code, skipping\n");
1010 /* Pitch calculation for ACB_TYPE_ASYMMETRIC ("pitch-per-frame") */
1011 if (frame_descs[bd_idx].acb_type == ACB_TYPE_ASYMMETRIC) {
1012 /* Pitch is provided per frame, which is interpreted as the pitch of
1013 * the last sample of the last block of this frame. We can interpolate
1014 * the pitch of other blocks (and even pitch-per-sample) by gradually
1015 * incrementing/decrementing prev_frame_pitch to cur_pitch_val. */
1016 n_blocks_x2 = frame_descs[bd_idx].n_blocks << 1;
1017 log_n_blocks_x2 = frame_descs[bd_idx].log_n_blocks + 1;
1018 cur_pitch_val = s->min_pitch_val + get_bits(gb, s->pitch_nbits);
1019 cur_pitch_val = FFMIN(cur_pitch_val, s->max_pitch_val - 1);
1020 if (s->last_acb_type == ACB_TYPE_NONE ||
1021 20 * abs(cur_pitch_val - s->last_pitch_val) >
1022 (cur_pitch_val + s->last_pitch_val))
1023 s->last_pitch_val = cur_pitch_val;
1025 /* pitch per block */
1026 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1027 int fac = n * 2 + 1;
1029 pitch[n] = (MUL16(fac, cur_pitch_val) +
1030 MUL16((n_blocks_x2 - fac), s->last_pitch_val) +
1031 frame_descs[bd_idx].n_blocks) >> log_n_blocks_x2;
1034 /* "pitch-diff-per-sample" for calculation of pitch per sample */
1035 s->pitch_diff_sh16 =
1036 ((cur_pitch_val - s->last_pitch_val) << 16) / MAX_FRAMESIZE;
1039 /* Global gain (if silence) and pitch-adaptive window coordinates */
1040 switch (frame_descs[bd_idx].fcb_type) {
1041 case FCB_TYPE_SILENCE:
1042 s->silence_gain = wmavoice_gain_silence[get_bits(gb, 8)];
1044 case FCB_TYPE_AW_PULSES:
1045 aw_parse_coords(s, gb, pitch);
1049 for (n = 0; n < frame_descs[bd_idx].n_blocks; n++) {
1052 /* Pitch calculation for ACB_TYPE_HAMMING ("pitch-per-block") */
1053 switch (frame_descs[bd_idx].acb_type) {
1054 case ACB_TYPE_HAMMING: {
1055 /* Pitch is given per block. Per-block pitches are encoded as an
1056 * absolute value for the first block, and then delta values
1057 * relative to this value) for all subsequent blocks. The scale of
1058 * this pitch value is semi-logaritmic compared to its use in the
1059 * decoder, so we convert it to normal scale also. */
1061 t1 = (s->block_conv_table[1] - s->block_conv_table[0]) << 2,
1062 t2 = (s->block_conv_table[2] - s->block_conv_table[1]) << 1,
1063 t3 = s->block_conv_table[3] - s->block_conv_table[2] + 1;
1066 block_pitch = get_bits(gb, s->block_pitch_nbits);
1068 block_pitch = last_block_pitch - s->block_delta_pitch_hrange +
1069 get_bits(gb, s->block_delta_pitch_nbits);
1070 /* Convert last_ so that any next delta is within _range */
1071 last_block_pitch = av_clip(block_pitch,
1072 s->block_delta_pitch_hrange,
1073 s->block_pitch_range -
1074 s->block_delta_pitch_hrange);
1076 /* Convert semi-log-style scale back to normal scale */
1077 if (block_pitch < t1) {
1078 bl_pitch_sh2 = (s->block_conv_table[0] << 2) + block_pitch;
1081 if (block_pitch < t2) {
1083 (s->block_conv_table[1] << 2) + (block_pitch << 1);
1086 if (block_pitch < t3) {
1088 (s->block_conv_table[2] + block_pitch) << 2;
1090 bl_pitch_sh2 = s->block_conv_table[3] << 2;
1093 pitch[n] = bl_pitch_sh2 >> 2;
1097 case ACB_TYPE_ASYMMETRIC: {
1098 bl_pitch_sh2 = pitch[n] << 2;
1102 default: // ACB_TYPE_NONE has no pitch
1107 synth_block(s, gb, n, block_nsamples, bl_pitch_sh2,
1108 lsps, prev_lsps, &frame_descs[bd_idx],
1109 &excitation[n * block_nsamples],
1110 &synth[n * block_nsamples]);
1113 /* Averaging projection filter, if applicable. Else, just copy samples
1114 * from synthesis buffer */
1116 // FIXME this is where APF would take place, currently not implemented
1117 av_log_missing_feature(ctx, "APF", 0);
1120 for (n = 0; n < 160; n++)
1121 samples[n] = av_clipf(synth[n], -1.0, 1.0);
1123 /* Cache values for next frame */
1125 if (s->frame_cntr >= 0xFFFF) s->frame_cntr -= 0xFFFF; // i.e. modulo (%)
1126 s->last_acb_type = frame_descs[bd_idx].acb_type;
1127 switch (frame_descs[bd_idx].acb_type) {
1129 s->last_pitch_val = 0;
1131 case ACB_TYPE_ASYMMETRIC:
1132 s->last_pitch_val = cur_pitch_val;
1134 case ACB_TYPE_HAMMING:
1135 s->last_pitch_val = pitch[frame_descs[bd_idx].n_blocks - 1];
/**
 * Bring an LSF vector into a usable shape: raise the first entry to a
 * minimum value, keep a minimum distance between neighbours, cap the last
 * entry at a maximum value and make sure the result is in ascending order.
 *
 * @param lsps array of LSPs
 * @param num size of LSP array
 *
 * @note basically a double version of #ff_acelp_reorder_lsf(), might be
 *       useful to put in a generic location later on. Parts are also
 *       present in #ff_set_min_dist_lsf() + #ff_sort_nearly_sorted_floats(),
 *       which is in float.
 */
static void stabilize_lsps(double *lsps, int num)
{
    const double lowest  = 0.0015 * M_PI;
    const double min_gap = 0.0125 * M_PI;
    const double highest = 0.9985 * M_PI;
    int i, j, k;

    /* Lower bound for the first value, minimum spacing for the others and
     * an upper bound for the last one. */
    if (!(lsps[0] > lowest))
        lsps[0] = lowest;
    for (i = 1; i < num; i++) {
        const double floor_i = lsps[i - 1] + min_gap;

        if (!(lsps[i] > floor_i))
            lsps[i] = floor_i;
    }
    if (lsps[num - 1] > highest)
        lsps[num - 1] = highest;

    /* Capping the last value can break the ordering. Look for the first
     * inversion; when there is none the vector is already fine. */
    for (i = 1; i < num; i++)
        if (lsps[i] < lsps[i - 1])
            break;
    if (i >= num)
        return;

    /* One insertion-sort pass over the complete vector. */
    for (j = 1; j < num; j++) {
        const double cur = lsps[j];

        k = j;
        while (k > 0 && !(lsps[k - 1] <= cur)) {
            lsps[k] = lsps[k - 1];
            k--;
        }
        lsps[k] = cur;
    }
}
1184 * Test if there's enough bits to read 1 superframe.
1186 * @param orig_gb bit I/O context used for reading. This function
1187 * does not modify the state of the bitreader; it
1188 * only uses it to copy the current stream position
1189 * @param s WMA Voice decoding context private data
1190 * @returns -1 if unsupported, 1 on not enough bits or 0 if OK.
1192 static int check_bits_for_superframe(GetBitContext *orig_gb,
1195 GetBitContext s_gb, *gb = &s_gb;
1196 int n, need_bits, bd_idx;
1197 const struct frame_type_desc *frame_desc;
1199 /* initialize a copy */
1200 init_get_bits(gb, orig_gb->buffer, orig_gb->size_in_bits);
1201 skip_bits_long(gb, get_bits_count(orig_gb));
1202 assert(get_bits_left(gb) == get_bits_left(orig_gb));
1204 /* superframe header */
1205 if (get_bits_left(gb) < 14)
1208 return -1; // WMAPro-in-WMAVoice superframe
1209 if (get_bits1(gb)) skip_bits(gb, 12); // number of samples in superframe
1210 if (s->has_residual_lsps) { // residual LSPs (for all frames)
1211 if (get_bits_left(gb) < s->sframe_lsp_bitsize)
1213 skip_bits_long(gb, s->sframe_lsp_bitsize);
1217 for (n = 0; n < MAX_FRAMES; n++) {
1218 int aw_idx_is_ext = 0;
1220 if (!s->has_residual_lsps) { // independent LSPs (per-frame)
1221 if (get_bits_left(gb) < s->frame_lsp_bitsize) return 1;
1222 skip_bits_long(gb, s->frame_lsp_bitsize);
1224 bd_idx = s->vbm_tree[get_vlc2(gb, frame_type_vlc.table, 6, 3)];
1226 return -1; // invalid frame type VLC code
1227 frame_desc = &frame_descs[bd_idx];
1228 if (frame_desc->acb_type == ACB_TYPE_ASYMMETRIC) {
1229 if (get_bits_left(gb) < s->pitch_nbits)
1231 skip_bits_long(gb, s->pitch_nbits);
1233 if (frame_desc->fcb_type == FCB_TYPE_SILENCE) {
1235 } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1236 int tmp = get_bits(gb, 6);
1244 if (frame_desc->acb_type == ACB_TYPE_HAMMING) {
1245 need_bits = s->block_pitch_nbits +
1246 (frame_desc->n_blocks - 1) * s->block_delta_pitch_nbits;
1247 } else if (frame_desc->fcb_type == FCB_TYPE_AW_PULSES) {
1248 need_bits = 2 * !aw_idx_is_ext;
1251 need_bits += frame_desc->frame_size;
1252 if (get_bits_left(gb) < need_bits)
1254 skip_bits_long(gb, need_bits);
1261 * Synthesize output samples for a single superframe. If we have any data
1262 * cached in s->sframe_cache, that will be used instead of whatever is loaded
1265 * WMA Voice superframes contain 3 frames, each containing 160 audio samples,
1266 * to give a total of 480 samples per frame. See #synth_frame() for frame
1267 * parsing. In addition to 3 frames, superframes can also contain the LSPs
1268 * (if these are globally specified for all frames (residually); they can
1269 * also be specified individually per-frame. See the s->has_residual_lsps
1270 * option), and can specify the number of samples encoded in this superframe
1271 * (if less than 480), usually used to prevent blanks at track boundaries.
1273 * @param ctx WMA Voice decoder context
1274 * @param samples pointer to output buffer for voice samples
1275 * @param data_size pointer containing the size of #samples on input, and the
1276 * amount of #samples filled on output
1277 * @return 0 on success, <0 on error or 1 if there was not enough data to
1278 * fully parse the superframe
1280 static int synth_superframe(AVCodecContext *ctx,
1281 float *samples, int *data_size)
1283 WMAVoiceContext *s = ctx->priv_data;
1284 GetBitContext *gb = &s->gb, s_gb;
1285 int n, res, n_samples = 480;
1286 double lsps[MAX_FRAMES][MAX_LSPS];
1287 const double *mean_lsf = s->lsps == 16 ?
1288 wmavoice_mean_lsf16[s->lsp_def_mode] : wmavoice_mean_lsf10[s->lsp_def_mode];
1289 float excitation[MAX_SIGNAL_HISTORY + MAX_SFRAMESIZE + 12];
1290 float synth[MAX_LSPS + MAX_SFRAMESIZE];
1292 memcpy(synth, s->synth_history,
1293 s->lsps * sizeof(*synth));
1294 memcpy(excitation, s->excitation_history,
1295 s->history_nsamples * sizeof(*excitation));
1297 if (s->sframe_cache_size > 0) {
1299 init_get_bits(gb, s->sframe_cache, s->sframe_cache_size);
1300 s->sframe_cache_size = 0;
1303 if ((res = check_bits_for_superframe(gb, s)) == 1) return 1;
1305 /* First bit is speech/music bit, it differentiates between WMAVoice
1306 * speech samples (the actual codec) and WMAVoice music samples, which
1307 * are really WMAPro-in-WMAVoice-superframes. I've never seen those in
1309 if (!get_bits1(gb)) {
1310 av_log_missing_feature(ctx, "WMAPro-in-WMAVoice support", 1);
1314 /* (optional) nr. of samples in superframe; always <= 480 and >= 0 */
1315 if (get_bits1(gb)) {
1316 if ((n_samples = get_bits(gb, 12)) > 480) {
1317 av_log(ctx, AV_LOG_ERROR,
1318 "Superframe encodes >480 samples (%d), not allowed\n",
1323 /* Parse LSPs, if global for the superframe (can also be per-frame). */
1324 if (s->has_residual_lsps) {
1325 double prev_lsps[MAX_LSPS], a1[MAX_LSPS * 2], a2[MAX_LSPS * 2];
1327 for (n = 0; n < s->lsps; n++)
1328 prev_lsps[n] = s->prev_lsps[n] - mean_lsf[n];
1330 if (s->lsps == 10) {
1331 dequant_lsp10r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1332 } else /* s->lsps == 16 */
1333 dequant_lsp16r(gb, lsps[2], prev_lsps, a1, a2, s->lsp_q_mode);
1335 for (n = 0; n < s->lsps; n++) {
1336 lsps[0][n] = mean_lsf[n] + (a1[n] - a2[n * 2]);
1337 lsps[1][n] = mean_lsf[n] + (a1[s->lsps + n] - a2[n * 2 + 1]);
1338 lsps[2][n] += mean_lsf[n];
1340 for (n = 0; n < 3; n++)
1341 stabilize_lsps(lsps[n], s->lsps);
1344 /* Parse frames, optionally preceeded by per-frame (independent) LSPs. */
1345 for (n = 0; n < 3; n++) {
1346 if (!s->has_residual_lsps) {
1349 if (s->lsps == 10) {
1350 dequant_lsp10i(gb, lsps[n]);
1351 } else /* s->lsps == 16 */
1352 dequant_lsp16i(gb, lsps[n]);
1354 for (m = 0; m < s->lsps; m++)
1355 lsps[n][m] += mean_lsf[m];
1356 stabilize_lsps(lsps[n], s->lsps);
1359 if ((res = synth_frame(ctx, gb,
1360 &samples[n * MAX_FRAMESIZE],
1361 lsps[n], n == 0 ? s->prev_lsps : lsps[n - 1],
1362 &excitation[s->history_nsamples + n * MAX_FRAMESIZE],
1363 &synth[s->lsps + n * MAX_FRAMESIZE])))
1367 /* Statistics? FIXME - we don't check for length, a slight overrun
1368 * will be caught by internal buffer padding, and anything else
1369 * will be skipped, not read. */
1370 if (get_bits1(gb)) {
1371 res = get_bits(gb, 4);
1372 skip_bits(gb, 10 * (res + 1));
1375 /* Specify nr. of output samples */
1376 *data_size = n_samples * sizeof(float);
1378 /* Update history */
1379 memcpy(s->prev_lsps, lsps[2],
1380 s->lsps * sizeof(*s->prev_lsps));
1381 memcpy(s->synth_history, &synth[MAX_SFRAMESIZE],
1382 s->lsps * sizeof(*synth));
1383 memcpy(s->excitation_history, &excitation[MAX_SFRAMESIZE],
1384 s->history_nsamples * sizeof(*excitation));
1390 * Parse the packet header at the start of each packet (input data to this
1393 * @param s WMA Voice decoding context private data
1394 * @returns 1 if not enough bits were available, or 0 on success.
1396 static int parse_packet_header(WMAVoiceContext *s)
1398 GetBitContext *gb = &s->gb;
1401 if (get_bits_left(gb) < 11)
1403 skip_bits(gb, 4); // packet sequence number
1404 s->has_residual_lsps = get_bits1(gb);
1406 res = get_bits(gb, 6); // number of superframes per packet
1407 // (minus first one if there is spillover)
1408 if (get_bits_left(gb) < 6 * (res == 0x3F) + s->spillover_bitsize)
1410 } while (res == 0x3F);
1411 s->spillover_nbits = get_bits(gb, s->spillover_bitsize);
1417 * Copy (unaligned) bits from gb/data/size to pb.
1419 * @param pb target buffer to copy bits into
1420 * @param data source buffer to copy bits from
1421 * @param size size of the source data, in bytes
1422 * @param gb bit I/O context specifying the current position in the source.
1423 * data. This function might use this to align the bit position to
1424 * a whole-byte boundary before calling #ff_copy_bits() on aligned
1426 * @param nbits the amount of bits to copy from source to target
1428 * @note after calling this function, the current position in the input bit
1429 * I/O context is undefined.
1431 static void copy_bits(PutBitContext *pb,
1432 const uint8_t *data, int size,
1433 GetBitContext *gb, int nbits)
1435 int rmn_bytes, rmn_bits;
1437 rmn_bits = rmn_bytes = get_bits_left(gb);
1438 if (rmn_bits < nbits)
1440 rmn_bits &= 7; rmn_bytes >>= 3;
1441 if ((rmn_bits = FFMIN(rmn_bits, nbits)) > 0)
1442 put_bits(pb, rmn_bits, get_bits(gb, rmn_bits));
1443 ff_copy_bits(pb, data + size - rmn_bytes,
1444 FFMIN(nbits - rmn_bits, rmn_bytes << 3));
1448 * Packet decoding: a packet is anything that the (ASF) demuxer contains,
1449 * and we expect that the demuxer / application provides it to us as such
1450 * (else you'll probably get garbage as output). Every packet has a size of
1451 * ctx->block_align bytes, starts with a packet header (see
1452 * #parse_packet_header()), and then a series of superframes. Superframe
1453 * boundaries may exceed packets, i.e. superframes can split data over
1454 * multiple (two) packets.
1456 * For more information about frames, see #synth_superframe().
1458 static int wmavoice_decode_packet(AVCodecContext *ctx, void *data,
1459 int *data_size, AVPacket *avpkt)
1461 WMAVoiceContext *s = ctx->priv_data;
1462 GetBitContext *gb = &s->gb;
1465 if (*data_size < 480 * sizeof(float)) {
1466 av_log(ctx, AV_LOG_ERROR,
1467 "Output buffer too small (%d given - %lu needed)\n",
1468 *data_size, 480 * sizeof(float));
1473 /* Packets are sometimes a multiple of ctx->block_align, with a packet
1474 * header at each ctx->block_align bytes. However, FFmpeg's ASF demuxer
1475 * feeds us ASF packets, which may concatenate multiple "codec" packets
1476 * in a single "muxer" packet, so we artificially emulate that by
1477 * capping the packet size at ctx->block_align. */
1478 for (size = avpkt->size; size > ctx->block_align; size -= ctx->block_align);
1481 init_get_bits(&s->gb, avpkt->data, size << 3);
1483 /* size == ctx->block_align is used to indicate whether we are dealing with
1484 * a new packet or a packet of which we already read the packet header
1486 if (size == ctx->block_align) { // new packet header
1487 if ((res = parse_packet_header(s)) < 0)
1490 /* If the packet header specifies a s->spillover_nbits, then we want
1491 * to push out all data of the previous packet (+ spillover) before
1492 * continuing to parse new superframes in the current packet. */
1493 if (s->spillover_nbits > 0) {
1494 if (s->sframe_cache_size > 0) {
1495 int cnt = get_bits_count(gb);
1496 copy_bits(&s->pb, avpkt->data, size, gb, s->spillover_nbits);
1497 flush_put_bits(&s->pb);
1498 s->sframe_cache_size += s->spillover_nbits;
1499 if ((res = synth_superframe(ctx, data, data_size)) == 0 &&
1501 cnt += s->spillover_nbits;
1502 s->skip_bits_next = cnt & 7;
1505 skip_bits_long (gb, s->spillover_nbits - cnt +
1506 get_bits_count(gb)); // resync
1508 skip_bits_long(gb, s->spillover_nbits); // resync
1510 } else if (s->skip_bits_next)
1511 skip_bits(gb, s->skip_bits_next);
1513 /* Try parsing superframes in current packet */
1514 s->sframe_cache_size = 0;
1515 s->skip_bits_next = 0;
1516 pos = get_bits_left(gb);
1517 if ((res = synth_superframe(ctx, data, data_size)) < 0) {
1519 } else if (*data_size > 0) {
1520 int cnt = get_bits_count(gb);
1521 s->skip_bits_next = cnt & 7;
1523 } else if ((s->sframe_cache_size = pos) > 0) {
1524 /* rewind bit reader to start of last (incomplete) superframe... */
1525 init_get_bits(gb, avpkt->data, size << 3);
1526 skip_bits_long(gb, (size << 3) - pos);
1527 assert(get_bits_left(gb) == pos);
1529 /* ...and cache it for spillover in next packet */
1530 init_put_bits(&s->pb, s->sframe_cache, SFRAME_CACHE_MAXSIZE);
1531 copy_bits(&s->pb, avpkt->data, size, gb, s->sframe_cache_size);
1532 // FIXME bad - just copy bytes as whole and add use the
1533 // skip_bits_next field
1539 static av_cold void wmavoice_flush(AVCodecContext *ctx)
1541 WMAVoiceContext *s = ctx->priv_data;
1544 s->sframe_cache_size = 0;
1545 s->skip_bits_next = 0;
1546 for (n = 0; n < s->lsps; n++)
1547 s->prev_lsps[n] = M_PI * (n + 1.0) / (s->lsps + 1.0);
1548 memset(s->excitation_history, 0,
1549 sizeof(*s->excitation_history) * MAX_SIGNAL_HISTORY);
1550 memset(s->synth_history, 0,
1551 sizeof(*s->synth_history) * MAX_LSPS);
1552 memset(s->gain_pred_err, 0,
1553 sizeof(s->gain_pred_err));
1556 AVCodec wmavoice_decoder = {
1560 sizeof(WMAVoiceContext),
1561 wmavoice_decode_init,
1564 wmavoice_decode_packet,
1565 CODEC_CAP_SUBFRAMES,
1566 .flush = wmavoice_flush,
1567 .long_name = NULL_IF_CONFIG_SMALL("Windows Media Audio Voice"),