git.sesse.net Git - ffmpeg/blob - libavcodec/opusenc_psy.c

   1 /*
   2  * Opus encoder
   3  * Copyright (c) 2017 Rostislav Pehlivanov <atomnuker@gmail.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 #include "opusenc_psy.h"
  23 #include "opus_pvq.h"
  24 #include "opustab.h"
  25 #include "mdct15.h"
  26 #include "libavutil/qsort.h"
  27
  28 /* Populate metrics without taking into consideration neighbouring steps */
  29 static void step_collect_psy_metrics(OpusPsyContext *s, int index)
  30 {
  31     int silence = 0, ch, i, j;
  32     OpusPsyStep *st = s->steps[index];
  33
  34     st->index = index;
  35
  36     for (ch = 0; ch < s->avctx->channels; ch++) {
  37         const int lap_size = (1 << s->bsize_analysis);
  38         for (i = 1; i <= FFMIN(lap_size, index); i++) {
  39             const int offset = i*120;
  40             AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index - i);
  41             memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
  42         }
  43         for (i = 0; i < lap_size; i++) {
  44             const int offset = i*120 + lap_size;
  45             AVFrame *cur = ff_bufqueue_peek(s->bufqueue, index + i);
  46             memcpy(&s->scratch[offset], cur->extended_data[ch], cur->nb_samples*sizeof(float));
  47         }
  48
  49         s->dsp->vector_fmul(s->scratch, s->scratch, s->window[s->bsize_analysis],
  50                             (OPUS_BLOCK_SIZE(s->bsize_analysis) << 1));
  51
  52         s->mdct[s->bsize_analysis]->mdct(s->mdct[s->bsize_analysis], st->coeffs[ch], s->scratch, 1);
  53
  54         for (i = 0; i < CELT_MAX_BANDS; i++)
  55             st->bands[ch][i] = &st->coeffs[ch][ff_celt_freq_bands[i] << s->bsize_analysis];
  56     }
  57
  58     for (ch = 0; ch < s->avctx->channels; ch++) {
  59         for (i = 0; i < CELT_MAX_BANDS; i++) {
  60             float avg_c_s, energy = 0.0f, dist_dev = 0.0f;
  61             const int range = ff_celt_freq_range[i] << s->bsize_analysis;
  62             const float *coeffs = st->bands[ch][i];
  63             for (j = 0; j < range; j++)
  64                 energy += coeffs[j]*coeffs[j];
  65
  66             st->energy[ch][i] += sqrtf(energy);
  67             silence |= !!st->energy[ch][i];
  68             avg_c_s = energy / range;
  69
  70             for (j = 0; j < range; j++) {
  71                 const float c_s = coeffs[j]*coeffs[j];
  72                 dist_dev = (avg_c_s - c_s)*(avg_c_s - c_s);
  73             }
  74
  75             st->tone[ch][i] += sqrtf(dist_dev);
  76         }
  77     }
  78
  79     st->silence = !silence;
  80
  81     if (s->avctx->channels > 1) {
  82         for (i = 0; i < CELT_MAX_BANDS; i++) {
  83             float incompat = 0.0f;
  84             const float *coeffs1 = st->bands[0][i];
  85             const float *coeffs2 = st->bands[1][i];
  86             const int range = ff_celt_freq_range[i] << s->bsize_analysis;
  87             for (j = 0; j < range; j++)
  88                 incompat += (coeffs1[j] - coeffs2[j])*(coeffs1[j] - coeffs2[j]);
  89             st->stereo[i] = sqrtf(incompat);
  90         }
  91     }
  92
  93     for (ch = 0; ch < s->avctx->channels; ch++) {
  94         for (i = 0; i < CELT_MAX_BANDS; i++) {
  95             OpusBandExcitation *ex = &s->ex[ch][i];
  96             float bp_e = bessel_filter(&s->bfilter_lo[ch][i], st->energy[ch][i]);
  97             bp_e = bessel_filter(&s->bfilter_hi[ch][i], bp_e);
  98             bp_e *= bp_e;
  99             if (bp_e > ex->excitation) {
 100                 st->change_amp[ch][i] = bp_e - ex->excitation;
 101                 st->total_change += st->change_amp[ch][i];
 102                 ex->excitation = ex->excitation_init = bp_e;
 103                 ex->excitation_dist = 0.0f;
 104             }
 105             if (ex->excitation > 0.0f) {
 106                 ex->excitation -= av_clipf((1/expf(ex->excitation_dist)), ex->excitation_init/20, ex->excitation_init/1.09);
 107                 ex->excitation = FFMAX(ex->excitation, 0.0f);
 108                 ex->excitation_dist += 1.0f;
 109             }
 110         }
 111     }
 112 }
 113
 114 static void search_for_change_points(OpusPsyContext *s, float tgt_change,
 115                                      int offset_s, int offset_e, int resolution,
 116                                      int level)
 117 {
 118     int i;
 119     float c_change = 0.0f;
 120     if ((offset_e - offset_s) <= resolution)
 121         return;
 122     for (i = offset_s; i < offset_e; i++) {
 123         c_change += s->steps[i]->total_change;
 124         if (c_change > tgt_change)
 125             break;
 126     }
 127     if (i == offset_e)
 128         return;
 129     search_for_change_points(s, tgt_change / 2.0f, offset_s, i + 0, resolution, level + 1);
 130     s->inflection_points[s->inflection_points_count++] = i;
 131     search_for_change_points(s, tgt_change / 2.0f, i + 1, offset_e, resolution, level + 1);
 132 }
 133
 134 static int flush_silent_frames(OpusPsyContext *s)
 135 {
 136     int fsize, silent_frames;
 137
 138     for (silent_frames = 0; silent_frames < s->buffered_steps; silent_frames++)
 139         if (!s->steps[silent_frames]->silence)
 140             break;
 141     if (--silent_frames < 0)
 142         return 0;
 143
 144     for (fsize = CELT_BLOCK_960; fsize > CELT_BLOCK_120; fsize--) {
 145         if ((1 << fsize) > silent_frames)
 146             continue;
 147         s->p.frames = FFMIN(silent_frames / (1 << fsize), 48 >> fsize);
 148         s->p.framesize = fsize;
 149         return 1;
 150     }
 151
 152     return 0;
 153 }
 154
 155 /* Main function which decides frame size and frames per current packet */
 156 static void psy_output_groups(OpusPsyContext *s)
 157 {
 158     int max_delay_samples = (s->options->max_delay_ms*s->avctx->sample_rate)/1000;
 159     int max_bsize = FFMIN(OPUS_SAMPLES_TO_BLOCK_SIZE(max_delay_samples), CELT_BLOCK_960);
 160
 161     /* These don't change for now */
 162     s->p.mode      = OPUS_MODE_CELT;
 163     s->p.bandwidth = OPUS_BANDWIDTH_FULLBAND;
 164
 165     /* Flush silent frames ASAP */
 166     if (s->steps[0]->silence && flush_silent_frames(s))
 167         return;
 168
 169     s->p.framesize = FFMIN(max_bsize, CELT_BLOCK_960);
 170     s->p.frames    = 1;
 171 }
 172
 173 int ff_opus_psy_process(OpusPsyContext *s, OpusPacketInfo *p)
 174 {
 175     int i;
 176     float total_energy_change = 0.0f;
 177
 178     if (s->buffered_steps < s->max_steps && !s->eof) {
 179         const int awin = (1 << s->bsize_analysis);
 180         if (++s->steps_to_process >= awin) {
 181             step_collect_psy_metrics(s, s->buffered_steps - awin + 1);
 182             s->steps_to_process = 0;
 183         }
 184         if ((++s->buffered_steps) < s->max_steps)
 185             return 1;
 186     }
 187
 188     for (i = 0; i < s->buffered_steps; i++)
 189         total_energy_change += s->steps[i]->total_change;
 190
 191     search_for_change_points(s, total_energy_change / 2.0f, 0,
 192                              s->buffered_steps, 1, 0);
 193
 194     psy_output_groups(s);
 195
 196     p->frames    = s->p.frames;
 197     p->framesize = s->p.framesize;
 198     p->mode      = s->p.mode;
 199     p->bandwidth = s->p.bandwidth;
 200
 201     return 0;
 202 }
 203
 204 void ff_opus_psy_celt_frame_init(OpusPsyContext *s, CeltFrame *f, int index)
 205 {
 206     int i, neighbouring_points = 0, start_offset = 0;
 207     int radius = (1 << s->p.framesize), step_offset = radius*index;
 208     int silence = 1;
 209
 210     f->start_band = (s->p.mode == OPUS_MODE_HYBRID) ? 17 : 0;
 211     f->end_band   = ff_celt_band_end[s->p.bandwidth];
 212     f->channels   = s->avctx->channels;
 213     f->size       = s->p.framesize;
 214
 215     for (i = 0; i < (1 << f->size); i++)
 216         silence &= s->steps[index*(1 << f->size) + i]->silence;
 217
 218     f->silence = silence;
 219     if (f->silence) {
 220         f->framebits = 0; /* Otherwise the silence flag eats up 16(!) bits */
 221         return;
 222     }
 223
 224     for (i = 0; i < s->inflection_points_count; i++) {
 225         if (s->inflection_points[i] >= step_offset) {
 226             start_offset = i;
 227             break;
 228         }
 229     }
 230
 231     for (i = start_offset; i < FFMIN(radius, s->inflection_points_count - start_offset); i++) {
 232         if (s->inflection_points[i] < (step_offset + radius)) {
 233             neighbouring_points++;
 234         }
 235     }
 236
 237     /* Transient flagging */
 238     f->transient = neighbouring_points > 0;
 239     f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
 240
 241     /* Some sane defaults */
 242     f->pfilter   = 0;
 243     f->pf_gain   = 0.5f;
 244     f->pf_octave = 2;
 245     f->pf_period = 1;
 246     f->pf_tapset = 2;
 247
 248     /* More sane defaults */
 249     f->tf_select = 0;
 250     f->anticollapse = 1;
 251     f->alloc_trim = 5;
 252     f->skip_band_floor = f->end_band;
 253     f->intensity_stereo = f->end_band;
 254     f->dual_stereo = 0;
 255     f->spread = CELT_SPREAD_NORMAL;
 256     memset(f->tf_change, 0, sizeof(int)*CELT_MAX_BANDS);
 257     memset(f->alloc_boost, 0, sizeof(int)*CELT_MAX_BANDS);
 258 }
 259
 260 static void celt_gauge_psy_weight(OpusPsyContext *s, OpusPsyStep **start,
 261                                   CeltFrame *f_out)
 262 {
 263     int i, f, ch;
 264     int frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
 265     float rate, frame_bits = 0;
 266
 267     /* Used for the global ROTATE flag */
 268     float tonal = 0.0f;
 269
 270     /* Pseudo-weights */
 271     float band_score[CELT_MAX_BANDS] = { 0 };
 272     float max_score = 1.0f;
 273
 274     /* Pass one - one loop around each band, computing unquant stuff */
 275     for (i = 0; i < CELT_MAX_BANDS; i++) {
 276         float weight = 0.0f;
 277         float tonal_contrib = 0.0f;
 278         for (f = 0; f < (1 << s->p.framesize); f++) {
 279             weight = start[f]->stereo[i];
 280             for (ch = 0; ch < s->avctx->channels; ch++) {
 281                 weight += start[f]->change_amp[ch][i] + start[f]->tone[ch][i] + start[f]->energy[ch][i];
 282                 tonal_contrib += start[f]->tone[ch][i];
 283             }
 284         }
 285         tonal += tonal_contrib;
 286         band_score[i] = weight;
 287     }
 288
 289     tonal /= (float)CELT_MAX_BANDS;
 290
 291     for (i = 0; i < CELT_MAX_BANDS; i++) {
 292         if (band_score[i] > max_score)
 293             max_score = band_score[i];
 294     }
 295
 296     for (i = 0; i < CELT_MAX_BANDS; i++) {
 297         f_out->alloc_boost[i] = (int)((band_score[i]/max_score)*3.0f);
 298         frame_bits += band_score[i]*8.0f;
 299     }
 300
 301     tonal /= 1333136.0f;
 302     f_out->spread = av_clip_uintp2(lrintf(tonal), 2);
 303
 304     rate = ((float)s->avctx->bit_rate) + frame_bits*frame_size*16;
 305     rate *= s->lambda;
 306     rate /= s->avctx->sample_rate/frame_size;
 307
 308     f_out->framebits = lrintf(rate);
 309     f_out->framebits = FFMIN(f_out->framebits, OPUS_MAX_PACKET_SIZE*8);
 310     f_out->framebits = FFALIGN(f_out->framebits, 8);
 311 }
 312
 313 static int bands_dist(OpusPsyContext *s, CeltFrame *f, float *total_dist)
 314 {
 315     int i, tdist = 0.0f;
 316     OpusRangeCoder dump;
 317
 318     ff_opus_rc_enc_init(&dump);
 319     ff_celt_enc_bitalloc(&dump, f);
 320
 321     for (i = 0; i < CELT_MAX_BANDS; i++) {
 322         float bits = 0.0f;
 323         float dist = f->pvq->band_cost(f->pvq, f, &dump, i, &bits, s->lambda);
 324         tdist += dist;
 325     }
 326
 327     *total_dist = tdist;
 328
 329     return 0;
 330 }
 331
 332 static void celt_search_for_dual_stereo(OpusPsyContext *s, CeltFrame *f)
 333 {
 334     float td1, td2;
 335     f->dual_stereo = 0;
 336     bands_dist(s, f, &td1);
 337     f->dual_stereo = 1;
 338     bands_dist(s, f, &td2);
 339
 340     f->dual_stereo = td2 < td1;
 341     s->dual_stereo_used += td2 < td1;
 342 }
 343
 344 static void celt_search_for_intensity(OpusPsyContext *s, CeltFrame *f)
 345 {
 346     int i, best_band = CELT_MAX_BANDS - 1;
 347     float dist, best_dist = FLT_MAX;
 348
 349     /* TODO: fix, make some heuristic up here using the lambda value */
 350     float end_band = 0;
 351
 352     for (i = f->end_band; i >= end_band; i--) {
 353         f->intensity_stereo = i;
 354         bands_dist(s, f, &dist);
 355         if (best_dist > dist) {
 356             best_dist = dist;
 357             best_band = i;
 358         }
 359     }
 360
 361     f->intensity_stereo = best_band;
 362     s->avg_is_band = (s->avg_is_band + f->intensity_stereo)/2.0f;
 363 }
 364
 365 static int celt_search_for_tf(OpusPsyContext *s, OpusPsyStep **start, CeltFrame *f)
 366 {
 367     int i, j, k, cway, config[2][CELT_MAX_BANDS] = { { 0 } };
 368     float score[2] = { 0 };
 369
 370     for (cway = 0; cway < 2; cway++) {
 371         int mag[2];
 372         int base = f->transient ? 120 : 960;
 373
 374         for (i = 0; i < 2; i++) {
 375             int c = ff_celt_tf_select[f->size][f->transient][cway][i];
 376             mag[i] = c < 0 ? base >> FFABS(c) : base << FFABS(c);
 377         }
 378
 379         for (i = 0; i < CELT_MAX_BANDS; i++) {
 380             float iscore0 = 0.0f;
 381             float iscore1 = 0.0f;
 382             for (j = 0; j < (1 << f->size); j++) {
 383                 for (k = 0; k < s->avctx->channels; k++) {
 384                     iscore0 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[0];
 385                     iscore1 += start[j]->tone[k][i]*start[j]->change_amp[k][i]/mag[1];
 386                 }
 387             }
 388             config[cway][i] = FFABS(iscore0 - 1.0f) < FFABS(iscore1 - 1.0f);
 389             score[cway] += config[cway][i] ? iscore1 : iscore0;
 390         }
 391     }
 392
 393     f->tf_select = score[0] < score[1];
 394     memcpy(f->tf_change, config[f->tf_select], sizeof(int)*CELT_MAX_BANDS);
 395
 396     return 0;
 397 }
 398
 399 int ff_opus_psy_celt_frame_process(OpusPsyContext *s, CeltFrame *f, int index)
 400 {
 401     int start_transient_flag = f->transient;
 402     OpusPsyStep **start = &s->steps[index * (1 << s->p.framesize)];
 403
 404     if (f->silence)
 405         return 0;
 406
 407     celt_gauge_psy_weight(s, start, f);
 408     celt_search_for_intensity(s, f);
 409     celt_search_for_dual_stereo(s, f);
 410     celt_search_for_tf(s, start, f);
 411
 412     if (f->transient != start_transient_flag) {
 413         f->blocks = f->transient ? OPUS_BLOCK_SIZE(s->p.framesize)/CELT_OVERLAP : 1;
 414         s->redo_analysis = 1;
 415         return 1;
 416     }
 417
 418     s->redo_analysis = 0;
 419
 420     return 0;
 421 }
 422
 423 void ff_opus_psy_postencode_update(OpusPsyContext *s, CeltFrame *f, OpusRangeCoder *rc)
 424 {
 425     int i, frame_size = OPUS_BLOCK_SIZE(s->p.framesize);
 426     int steps_out = s->p.frames*(frame_size/120);
 427     void *tmp[FF_BUFQUEUE_SIZE];
 428     float ideal_fbits;
 429
 430     for (i = 0; i < steps_out; i++)
 431         memset(s->steps[i], 0, sizeof(OpusPsyStep));
 432
 433     for (i = 0; i < s->max_steps; i++)
 434         tmp[i] = s->steps[i];
 435
 436     for (i = 0; i < s->max_steps; i++) {
 437         const int i_new = i - steps_out;
 438         s->steps[i_new < 0 ? s->max_steps + i_new : i_new] = tmp[i];
 439     }
 440
 441     for (i = steps_out; i < s->buffered_steps; i++)
 442         s->steps[i]->index -= steps_out;
 443
 444     ideal_fbits = s->avctx->bit_rate/(s->avctx->sample_rate/frame_size);
 445
 446     for (i = 0; i < s->p.frames; i++) {
 447         s->avg_is_band += f[i].intensity_stereo;
 448         s->lambda *= ideal_fbits / f[i].framebits;
 449     }
 450
 451     s->avg_is_band /= (s->p.frames + 1);
 452
 453     s->cs_num = 0;
 454     s->steps_to_process = 0;
 455     s->buffered_steps -= steps_out;
 456     s->total_packets_out += s->p.frames;
 457     s->inflection_points_count = 0;
 458 }
 459
 460 av_cold int ff_opus_psy_init(OpusPsyContext *s, AVCodecContext *avctx,
 461                              struct FFBufQueue *bufqueue, OpusEncOptions *options)
 462 {
 463     int i, ch, ret;
 464
 465     s->redo_analysis = 0;
 466     s->lambda = 1.0f;
 467     s->options = options;
 468     s->avctx = avctx;
 469     s->bufqueue = bufqueue;
 470     s->max_steps = ceilf(s->options->max_delay_ms/2.5f);
 471     s->bsize_analysis = CELT_BLOCK_960;
 472     s->avg_is_band = CELT_MAX_BANDS - 1;
 473     s->inflection_points_count = 0;
 474
 475     s->inflection_points = av_mallocz(sizeof(*s->inflection_points)*s->max_steps);
 476     if (!s->inflection_points) {
 477         ret = AVERROR(ENOMEM);
 478         goto fail;
 479     }
 480
 481     s->dsp = avpriv_float_dsp_alloc(avctx->flags & AV_CODEC_FLAG_BITEXACT);
 482     if (!s->dsp) {
 483         ret = AVERROR(ENOMEM);
 484         goto fail;
 485     }
 486
 487     for (ch = 0; ch < s->avctx->channels; ch++) {
 488         for (i = 0; i < CELT_MAX_BANDS; i++) {
 489             bessel_init(&s->bfilter_hi[ch][i], 1.0f, 19.0f, 100.0f, 1);
 490             bessel_init(&s->bfilter_lo[ch][i], 1.0f, 20.0f, 100.0f, 0);
 491         }
 492     }
 493
 494     for (i = 0; i < s->max_steps; i++) {
 495         s->steps[i] = av_mallocz(sizeof(OpusPsyStep));
 496         if (!s->steps[i]) {
 497             ret = AVERROR(ENOMEM);
 498             goto fail;
 499         }
 500     }
 501
 502     for (i = 0; i < CELT_BLOCK_NB; i++) {
 503         float tmp;
 504         const int len = OPUS_BLOCK_SIZE(i);
 505         s->window[i] = av_malloc(2*len*sizeof(float));
 506         if (!s->window[i]) {
 507             ret = AVERROR(ENOMEM);
 508             goto fail;
 509         }
 510         generate_window_func(s->window[i], 2*len, WFUNC_SINE, &tmp);
 511         if ((ret = ff_mdct15_init(&s->mdct[i], 0, i + 3, 68 << (CELT_BLOCK_NB - 1 - i))))
 512             goto fail;
 513     }
 514
 515     return 0;
 516
 517 fail:
 518     av_freep(&s->inflection_points);
 519     av_freep(&s->dsp);
 520
 521     for (i = 0; i < CELT_BLOCK_NB; i++) {
 522         ff_mdct15_uninit(&s->mdct[i]);
 523         av_freep(&s->window[i]);
 524     }
 525
 526     for (i = 0; i < s->max_steps; i++)
 527         av_freep(&s->steps[i]);
 528
 529     return ret;
 530 }
 531
 532 void ff_opus_psy_signal_eof(OpusPsyContext *s)
 533 {
 534     s->eof = 1;
 535 }
 536
 537 av_cold int ff_opus_psy_end(OpusPsyContext *s)
 538 {
 539     int i;
 540
 541     av_freep(&s->inflection_points);
 542     av_freep(&s->dsp);
 543
 544     for (i = 0; i < CELT_BLOCK_NB; i++) {
 545         ff_mdct15_uninit(&s->mdct[i]);
 546         av_freep(&s->window[i]);
 547     }
 548
 549     for (i = 0; i < s->max_steps; i++)
 550         av_freep(&s->steps[i]);
 551
 552     av_log(s->avctx, AV_LOG_INFO, "Average Intensity Stereo band: %0.1f\n", s->avg_is_band);
 553     av_log(s->avctx, AV_LOG_INFO, "Dual Stereo used: %0.2f%%\n", ((float)s->dual_stereo_used/s->total_packets_out)*100.0f);
 554
 555     return 0;
 556 }