git.sesse.net Git - ffmpeg/blob - libavfilter/af_amix.c

   1 /*
   2  * Audio Mix Filter
   3  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * Audio Mix Filter
  25  *
  26  * Mixes audio from multiple sources into a single output. The channel layout,
  27  * sample rate, and sample format will be the same for all inputs and the
  28  * output.
  29  */
  30
  31 #include "libavutil/attributes.h"
  32 #include "libavutil/audio_fifo.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/avstring.h"
  35 #include "libavutil/channel_layout.h"
  36 #include "libavutil/common.h"
  37 #include "libavutil/eval.h"
  38 #include "libavutil/float_dsp.h"
  39 #include "libavutil/mathematics.h"
  40 #include "libavutil/opt.h"
  41 #include "libavutil/samplefmt.h"
  42
  43 #include "audio.h"
  44 #include "avfilter.h"
  45 #include "filters.h"
  46 #include "formats.h"
  47 #include "internal.h"
  48
  49 #define INPUT_ON       1    /**< input is active */
  50 #define INPUT_EOF      2    /**< input has reached EOF (may still be active) */
  51
  52 #define DURATION_LONGEST  0
  53 #define DURATION_SHORTEST 1
  54 #define DURATION_FIRST    2
  55
  56
  57 typedef struct FrameInfo {
  58     int nb_samples;
  59     int64_t pts;
  60     struct FrameInfo *next;
  61 } FrameInfo;
  62
  63 /**
  64  * Linked list used to store timestamps and frame sizes of all frames in the
  65  * FIFO for the first input.
  66  *
  67  * This is needed to keep timestamps synchronized for the case where multiple
  68  * input frames are pushed to the filter for processing before a frame is
  69  * requested by the output link.
  70  */
  71 typedef struct FrameList {
  72     int nb_frames;
  73     int nb_samples;
  74     FrameInfo *list;
  75     FrameInfo *end;
  76 } FrameList;
  77
  78 static void frame_list_clear(FrameList *frame_list)
  79 {
  80     if (frame_list) {
  81         while (frame_list->list) {
  82             FrameInfo *info = frame_list->list;
  83             frame_list->list = info->next;
  84             av_free(info);
  85         }
  86         frame_list->nb_frames  = 0;
  87         frame_list->nb_samples = 0;
  88         frame_list->end        = NULL;
  89     }
  90 }
  91
  92 static int frame_list_next_frame_size(FrameList *frame_list)
  93 {
  94     if (!frame_list->list)
  95         return 0;
  96     return frame_list->list->nb_samples;
  97 }
  98
  99 static int64_t frame_list_next_pts(FrameList *frame_list)
 100 {
 101     if (!frame_list->list)
 102         return AV_NOPTS_VALUE;
 103     return frame_list->list->pts;
 104 }
 105
 106 static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
 107 {
 108     if (nb_samples >= frame_list->nb_samples) {
 109         frame_list_clear(frame_list);
 110     } else {
 111         int samples = nb_samples;
 112         while (samples > 0) {
 113             FrameInfo *info = frame_list->list;
 114             av_assert0(info);
 115             if (info->nb_samples <= samples) {
 116                 samples -= info->nb_samples;
 117                 frame_list->list = info->next;
 118                 if (!frame_list->list)
 119                     frame_list->end = NULL;
 120                 frame_list->nb_frames--;
 121                 frame_list->nb_samples -= info->nb_samples;
 122                 av_free(info);
 123             } else {
 124                 info->nb_samples       -= samples;
 125                 info->pts              += samples;
 126                 frame_list->nb_samples -= samples;
 127                 samples = 0;
 128             }
 129         }
 130     }
 131 }
 132
 133 static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
 134 {
 135     FrameInfo *info = av_malloc(sizeof(*info));
 136     if (!info)
 137         return AVERROR(ENOMEM);
 138     info->nb_samples = nb_samples;
 139     info->pts        = pts;
 140     info->next       = NULL;
 141
 142     if (!frame_list->list) {
 143         frame_list->list = info;
 144         frame_list->end  = info;
 145     } else {
 146         av_assert0(frame_list->end);
 147         frame_list->end->next = info;
 148         frame_list->end       = info;
 149     }
 150     frame_list->nb_frames++;
 151     frame_list->nb_samples += nb_samples;
 152
 153     return 0;
 154 }
 155
 156 /* FIXME: use directly links fifo */
 157
 158 typedef struct MixContext {
 159     const AVClass *class;       /**< class for AVOptions */
 160     AVFloatDSPContext *fdsp;
 161
 162     int nb_inputs;              /**< number of inputs */
 163     int active_inputs;          /**< number of input currently active */
 164     int duration_mode;          /**< mode for determining duration */
 165     float dropout_transition;   /**< transition time when an input drops out */
 166     char *weights_str;          /**< string for custom weights for every input */
 167
 168     int nb_channels;            /**< number of channels */
 169     int sample_rate;            /**< sample rate */
 170     int planar;
 171     AVAudioFifo **fifos;        /**< audio fifo for each input */
 172     uint8_t *input_state;       /**< current state of each input */
 173     float *input_scale;         /**< mixing scale factor for each input */
 174     float *weights;             /**< custom weights for every input */
 175     float weight_sum;           /**< sum of custom weights for every input */
 176     float *scale_norm;          /**< normalization factor for every input */
 177     int64_t next_pts;           /**< calculated pts for next output frame */
 178     FrameList *frame_list;      /**< list of frame info for the first input */
 179 } MixContext;
 180
 181 #define OFFSET(x) offsetof(MixContext, x)
 182 #define A AV_OPT_FLAG_AUDIO_PARAM
 183 #define F AV_OPT_FLAG_FILTERING_PARAM
 184 static const AVOption amix_options[] = {
 185     { "inputs", "Number of inputs.",
 186             OFFSET(nb_inputs), AV_OPT_TYPE_INT, { .i64 = 2 }, 1, INT16_MAX, A|F },
 187     { "duration", "How to determine the end-of-stream.",
 188             OFFSET(duration_mode), AV_OPT_TYPE_INT, { .i64 = DURATION_LONGEST }, 0,  2, A|F, "duration" },
 189         { "longest",  "Duration of longest input.",  0, AV_OPT_TYPE_CONST, { .i64 = DURATION_LONGEST  }, 0, 0, A|F, "duration" },
 190         { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_SHORTEST }, 0, 0, A|F, "duration" },
 191         { "first",    "Duration of first input.",    0, AV_OPT_TYPE_CONST, { .i64 = DURATION_FIRST    }, 0, 0, A|F, "duration" },
 192     { "dropout_transition", "Transition time, in seconds, for volume "
 193                             "renormalization when an input stream ends.",
 194             OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { .dbl = 2.0 }, 0, INT_MAX, A|F },
 195     { "weights", "Set weight for each input.",
 196             OFFSET(weights_str), AV_OPT_TYPE_STRING, {.str="1 1"}, 0, 0, A|F },
 197     { NULL }
 198 };
 199
 200 AVFILTER_DEFINE_CLASS(amix);
 201
 202 /**
 203  * Update the scaling factors to apply to each input during mixing.
 204  *
 205  * This balances the full volume range between active inputs and handles
 206  * volume transitions when EOF is encountered on an input but mixing continues
 207  * with the remaining inputs.
 208  */
 209 static void calculate_scales(MixContext *s, int nb_samples)
 210 {
 211     float weight_sum = 0.f;
 212     int i;
 213
 214     for (i = 0; i < s->nb_inputs; i++)
 215         if (s->input_state[i] & INPUT_ON)
 216             weight_sum += FFABS(s->weights[i]);
 217
 218     for (i = 0; i < s->nb_inputs; i++) {
 219         if (s->input_state[i] & INPUT_ON) {
 220             if (s->scale_norm[i] > weight_sum / FFABS(s->weights[i])) {
 221                 s->scale_norm[i] -= ((s->weight_sum / FFABS(s->weights[i])) / s->nb_inputs) *
 222                                     nb_samples / (s->dropout_transition * s->sample_rate);
 223                 s->scale_norm[i] = FFMAX(s->scale_norm[i], weight_sum / FFABS(s->weights[i]));
 224             }
 225         }
 226     }
 227
 228     for (i = 0; i < s->nb_inputs; i++) {
 229         if (s->input_state[i] & INPUT_ON)
 230             s->input_scale[i] = 1.0f / s->scale_norm[i] * FFSIGN(s->weights[i]);
 231         else
 232             s->input_scale[i] = 0.0f;
 233     }
 234 }
 235
 236 static int config_output(AVFilterLink *outlink)
 237 {
 238     AVFilterContext *ctx = outlink->src;
 239     MixContext *s      = ctx->priv;
 240     int i;
 241     char buf[64];
 242
 243     s->planar          = av_sample_fmt_is_planar(outlink->format);
 244     s->sample_rate     = outlink->sample_rate;
 245     outlink->time_base = (AVRational){ 1, outlink->sample_rate };
 246     s->next_pts        = AV_NOPTS_VALUE;
 247
 248     s->frame_list = av_mallocz(sizeof(*s->frame_list));
 249     if (!s->frame_list)
 250         return AVERROR(ENOMEM);
 251
 252     s->fifos = av_mallocz_array(s->nb_inputs, sizeof(*s->fifos));
 253     if (!s->fifos)
 254         return AVERROR(ENOMEM);
 255
 256     s->nb_channels = outlink->channels;
 257     for (i = 0; i < s->nb_inputs; i++) {
 258         s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
 259         if (!s->fifos[i])
 260             return AVERROR(ENOMEM);
 261     }
 262
 263     s->input_state = av_malloc(s->nb_inputs);
 264     if (!s->input_state)
 265         return AVERROR(ENOMEM);
 266     memset(s->input_state, INPUT_ON, s->nb_inputs);
 267     s->active_inputs = s->nb_inputs;
 268
 269     s->input_scale = av_mallocz_array(s->nb_inputs, sizeof(*s->input_scale));
 270     s->scale_norm  = av_mallocz_array(s->nb_inputs, sizeof(*s->scale_norm));
 271     if (!s->input_scale || !s->scale_norm)
 272         return AVERROR(ENOMEM);
 273     for (i = 0; i < s->nb_inputs; i++)
 274         s->scale_norm[i] = s->weight_sum / FFABS(s->weights[i]);
 275     calculate_scales(s, 0);
 276
 277     av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
 278
 279     av_log(ctx, AV_LOG_VERBOSE,
 280            "inputs:%d fmt:%s srate:%d cl:%s\n", s->nb_inputs,
 281            av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
 282
 283     return 0;
 284 }
 285
 286 /**
 287  * Read samples from the input FIFOs, mix, and write to the output link.
 288  */
 289 static int output_frame(AVFilterLink *outlink)
 290 {
 291     AVFilterContext *ctx = outlink->src;
 292     MixContext      *s = ctx->priv;
 293     AVFrame *out_buf, *in_buf;
 294     int nb_samples, ns, i;
 295
 296     if (s->input_state[0] & INPUT_ON) {
 297         /* first input live: use the corresponding frame size */
 298         nb_samples = frame_list_next_frame_size(s->frame_list);
 299         for (i = 1; i < s->nb_inputs; i++) {
 300             if (s->input_state[i] & INPUT_ON) {
 301                 ns = av_audio_fifo_size(s->fifos[i]);
 302                 if (ns < nb_samples) {
 303                     if (!(s->input_state[i] & INPUT_EOF))
 304                         /* unclosed input with not enough samples */
 305                         return 0;
 306                     /* closed input to drain */
 307                     nb_samples = ns;
 308                 }
 309             }
 310         }
 311     } else {
 312         /* first input closed: use the available samples */
 313         nb_samples = INT_MAX;
 314         for (i = 1; i < s->nb_inputs; i++) {
 315             if (s->input_state[i] & INPUT_ON) {
 316                 ns = av_audio_fifo_size(s->fifos[i]);
 317                 nb_samples = FFMIN(nb_samples, ns);
 318             }
 319         }
 320         if (nb_samples == INT_MAX) {
 321             ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
 322             return 0;
 323         }
 324     }
 325
 326     s->next_pts = frame_list_next_pts(s->frame_list);
 327     frame_list_remove_samples(s->frame_list, nb_samples);
 328
 329     calculate_scales(s, nb_samples);
 330
 331     if (nb_samples == 0)
 332         return 0;
 333
 334     out_buf = ff_get_audio_buffer(outlink, nb_samples);
 335     if (!out_buf)
 336         return AVERROR(ENOMEM);
 337
 338     in_buf = ff_get_audio_buffer(outlink, nb_samples);
 339     if (!in_buf) {
 340         av_frame_free(&out_buf);
 341         return AVERROR(ENOMEM);
 342     }
 343
 344     for (i = 0; i < s->nb_inputs; i++) {
 345         if (s->input_state[i] & INPUT_ON) {
 346             int planes, plane_size, p;
 347
 348             av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
 349                                nb_samples);
 350
 351             planes     = s->planar ? s->nb_channels : 1;
 352             plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
 353             plane_size = FFALIGN(plane_size, 16);
 354
 355             if (out_buf->format == AV_SAMPLE_FMT_FLT ||
 356                 out_buf->format == AV_SAMPLE_FMT_FLTP) {
 357                 for (p = 0; p < planes; p++) {
 358                     s->fdsp->vector_fmac_scalar((float *)out_buf->extended_data[p],
 359                                                 (float *) in_buf->extended_data[p],
 360                                                 s->input_scale[i], plane_size);
 361                 }
 362             } else {
 363                 for (p = 0; p < planes; p++) {
 364                     s->fdsp->vector_dmac_scalar((double *)out_buf->extended_data[p],
 365                                                 (double *) in_buf->extended_data[p],
 366                                                 s->input_scale[i], plane_size);
 367                 }
 368             }
 369         }
 370     }
 371     av_frame_free(&in_buf);
 372
 373     out_buf->pts = s->next_pts;
 374     if (s->next_pts != AV_NOPTS_VALUE)
 375         s->next_pts += nb_samples;
 376
 377     return ff_filter_frame(outlink, out_buf);
 378 }
 379
 380 /**
 381  * Requests a frame, if needed, from each input link other than the first.
 382  */
 383 static int request_samples(AVFilterContext *ctx, int min_samples)
 384 {
 385     MixContext *s = ctx->priv;
 386     int i;
 387
 388     av_assert0(s->nb_inputs > 1);
 389
 390     for (i = 1; i < s->nb_inputs; i++) {
 391         if (!(s->input_state[i] & INPUT_ON) ||
 392              (s->input_state[i] & INPUT_EOF))
 393             continue;
 394         if (av_audio_fifo_size(s->fifos[i]) >= min_samples)
 395             continue;
 396         ff_inlink_request_frame(ctx->inputs[i]);
 397     }
 398     return output_frame(ctx->outputs[0]);
 399 }
 400
 401 /**
 402  * Calculates the number of active inputs and determines EOF based on the
 403  * duration option.
 404  *
 405  * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
 406  */
 407 static int calc_active_inputs(MixContext *s)
 408 {
 409     int i;
 410     int active_inputs = 0;
 411     for (i = 0; i < s->nb_inputs; i++)
 412         active_inputs += !!(s->input_state[i] & INPUT_ON);
 413     s->active_inputs = active_inputs;
 414
 415     if (!active_inputs ||
 416         (s->duration_mode == DURATION_FIRST && !(s->input_state[0] & INPUT_ON)) ||
 417         (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
 418         return AVERROR_EOF;
 419     return 0;
 420 }
 421
 422 static int activate(AVFilterContext *ctx)
 423 {
 424     AVFilterLink *outlink = ctx->outputs[0];
 425     MixContext *s = ctx->priv;
 426     AVFrame *buf = NULL;
 427     int i, ret;
 428
 429     FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, ctx);
 430
 431     for (i = 0; i < s->nb_inputs; i++) {
 432         AVFilterLink *inlink = ctx->inputs[i];
 433
 434         if ((ret = ff_inlink_consume_frame(ctx->inputs[i], &buf)) > 0) {
 435             if (i == 0) {
 436                 int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
 437                                            outlink->time_base);
 438                 ret = frame_list_add_frame(s->frame_list, buf->nb_samples, pts);
 439                 if (ret < 0) {
 440                     av_frame_free(&buf);
 441                     return ret;
 442                 }
 443             }
 444
 445             ret = av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
 446                                       buf->nb_samples);
 447             if (ret < 0) {
 448                 av_frame_free(&buf);
 449                 return ret;
 450             }
 451
 452             av_frame_free(&buf);
 453
 454             ret = output_frame(outlink);
 455             if (ret < 0)
 456                 return ret;
 457         }
 458     }
 459
 460     for (i = 0; i < s->nb_inputs; i++) {
 461         int64_t pts;
 462         int status;
 463
 464         if (ff_inlink_acknowledge_status(ctx->inputs[i], &status, &pts)) {
 465             if (status == AVERROR_EOF) {
 466                 if (i == 0) {
 467                     s->input_state[i] = 0;
 468                     if (s->nb_inputs == 1) {
 469                         ff_outlink_set_status(outlink, status, pts);
 470                         return 0;
 471                     }
 472                 } else {
 473                     s->input_state[i] |= INPUT_EOF;
 474                     if (av_audio_fifo_size(s->fifos[i]) == 0) {
 475                         s->input_state[i] = 0;
 476                     }
 477                 }
 478             }
 479         }
 480     }
 481
 482     if (calc_active_inputs(s)) {
 483         ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
 484         return 0;
 485     }
 486
 487     if (ff_outlink_frame_wanted(outlink)) {
 488         int wanted_samples;
 489
 490         if (!(s->input_state[0] & INPUT_ON))
 491             return request_samples(ctx, 1);
 492
 493         if (s->frame_list->nb_frames == 0) {
 494             ff_inlink_request_frame(ctx->inputs[0]);
 495             return 0;
 496         }
 497         av_assert0(s->frame_list->nb_frames > 0);
 498
 499         wanted_samples = frame_list_next_frame_size(s->frame_list);
 500
 501         return request_samples(ctx, wanted_samples);
 502     }
 503
 504     return 0;
 505 }
 506
 507 static av_cold int init(AVFilterContext *ctx)
 508 {
 509     MixContext *s = ctx->priv;
 510     float last_weight = 1.f;
 511     int i, ret;
 512     char *p;
 513
 514     for (i = 0; i < s->nb_inputs; i++) {
 515         AVFilterPad pad = { 0 };
 516
 517         pad.type           = AVMEDIA_TYPE_AUDIO;
 518         pad.name           = av_asprintf("input%d", i);
 519         if (!pad.name)
 520             return AVERROR(ENOMEM);
 521
 522         if ((ret = ff_insert_inpad(ctx, i, &pad)) < 0) {
 523             av_freep(&pad.name);
 524             return ret;
 525         }
 526     }
 527
 528     s->fdsp = avpriv_float_dsp_alloc(0);
 529     if (!s->fdsp)
 530         return AVERROR(ENOMEM);
 531
 532     s->weights = av_mallocz_array(s->nb_inputs, sizeof(*s->weights));
 533     if (!s->weights)
 534         return AVERROR(ENOMEM);
 535
 536     p = s->weights_str;
 537     for (i = 0; i < s->nb_inputs; i++) {
 538         last_weight = av_strtod(p, &p);
 539         s->weights[i] = last_weight;
 540         s->weight_sum += FFABS(last_weight);
 541         if (p && *p) {
 542             p++;
 543         } else {
 544             i++;
 545             break;
 546         }
 547     }
 548
 549     for (; i < s->nb_inputs; i++) {
 550         s->weights[i] = last_weight;
 551         s->weight_sum += FFABS(last_weight);
 552     }
 553
 554     return 0;
 555 }
 556
 557 static av_cold void uninit(AVFilterContext *ctx)
 558 {
 559     int i;
 560     MixContext *s = ctx->priv;
 561
 562     if (s->fifos) {
 563         for (i = 0; i < s->nb_inputs; i++)
 564             av_audio_fifo_free(s->fifos[i]);
 565         av_freep(&s->fifos);
 566     }
 567     frame_list_clear(s->frame_list);
 568     av_freep(&s->frame_list);
 569     av_freep(&s->input_state);
 570     av_freep(&s->input_scale);
 571     av_freep(&s->scale_norm);
 572     av_freep(&s->weights);
 573     av_freep(&s->fdsp);
 574
 575     for (i = 0; i < ctx->nb_inputs; i++)
 576         av_freep(&ctx->input_pads[i].name);
 577 }
 578
 579 static int query_formats(AVFilterContext *ctx)
 580 {
 581     AVFilterFormats *formats = NULL;
 582     AVFilterChannelLayouts *layouts;
 583     int ret;
 584
 585     layouts = ff_all_channel_counts();
 586     if (!layouts) {
 587         ret = AVERROR(ENOMEM);
 588         goto fail;
 589     }
 590
 591     if ((ret = ff_add_format(&formats, AV_SAMPLE_FMT_FLT ))          < 0 ||
 592         (ret = ff_add_format(&formats, AV_SAMPLE_FMT_FLTP))          < 0 ||
 593         (ret = ff_add_format(&formats, AV_SAMPLE_FMT_DBL ))          < 0 ||
 594         (ret = ff_add_format(&formats, AV_SAMPLE_FMT_DBLP))          < 0 ||
 595         (ret = ff_set_common_formats        (ctx, formats))          < 0 ||
 596         (ret = ff_set_common_channel_layouts(ctx, layouts))          < 0 ||
 597         (ret = ff_set_common_samplerates(ctx, ff_all_samplerates())) < 0)
 598         goto fail;
 599     return 0;
 600 fail:
 601     if (layouts)
 602         av_freep(&layouts->channel_layouts);
 603     av_freep(&layouts);
 604     return ret;
 605 }
 606
 607 static const AVFilterPad avfilter_af_amix_outputs[] = {
 608     {
 609         .name          = "default",
 610         .type          = AVMEDIA_TYPE_AUDIO,
 611         .config_props  = config_output,
 612     },
 613     { NULL }
 614 };
 615
 616 AVFilter ff_af_amix = {
 617     .name           = "amix",
 618     .description    = NULL_IF_CONFIG_SMALL("Audio mixing."),
 619     .priv_size      = sizeof(MixContext),
 620     .priv_class     = &amix_class,
 621     .init           = init,
 622     .uninit         = uninit,
 623     .activate       = activate,
 624     .query_formats  = query_formats,
 625     .inputs         = NULL,
 626     .outputs        = avfilter_af_amix_outputs,
 627     .flags          = AVFILTER_FLAG_DYNAMIC_INPUTS,
 628 };