git.sesse.net Git - ffmpeg/blob - libavfilter/af_amix.c

   1 /*
   2  * Audio Mix Filter
   3  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
   4  *
   5  * This file is part of FFmpeg.
   6  *
   7  * FFmpeg is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * FFmpeg is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with FFmpeg; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * Audio Mix Filter
  25  *
  26  * Mixes audio from multiple sources into a single output. The channel layout,
  27  * sample rate, and sample format will be the same for all inputs and the
  28  * output.
  29  */
  30
  31 #include "libavutil/attributes.h"
  32 #include "libavutil/audio_fifo.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/avstring.h"
  35 #include "libavutil/channel_layout.h"
  36 #include "libavutil/common.h"
  37 #include "libavutil/float_dsp.h"
  38 #include "libavutil/mathematics.h"
  39 #include "libavutil/opt.h"
  40 #include "libavutil/samplefmt.h"
  41
  42 #include "audio.h"
  43 #include "avfilter.h"
  44 #include "filters.h"
  45 #include "formats.h"
  46 #include "internal.h"
  47
  48 #define INPUT_ON       1    /**< input is active */
  49 #define INPUT_EOF      2    /**< input has reached EOF (may still be active) */
  50
  51 #define DURATION_LONGEST  0
  52 #define DURATION_SHORTEST 1
  53 #define DURATION_FIRST    2
  54
  55
  56 typedef struct FrameInfo {
  57     int nb_samples;
  58     int64_t pts;
  59     struct FrameInfo *next;
  60 } FrameInfo;
  61
  62 /**
  63  * Linked list used to store timestamps and frame sizes of all frames in the
  64  * FIFO for the first input.
  65  *
  66  * This is needed to keep timestamps synchronized for the case where multiple
  67  * input frames are pushed to the filter for processing before a frame is
  68  * requested by the output link.
  69  */
  70 typedef struct FrameList {
  71     int nb_frames;
  72     int nb_samples;
  73     FrameInfo *list;
  74     FrameInfo *end;
  75 } FrameList;
  76
  77 static void frame_list_clear(FrameList *frame_list)
  78 {
  79     if (frame_list) {
  80         while (frame_list->list) {
  81             FrameInfo *info = frame_list->list;
  82             frame_list->list = info->next;
  83             av_free(info);
  84         }
  85         frame_list->nb_frames  = 0;
  86         frame_list->nb_samples = 0;
  87         frame_list->end        = NULL;
  88     }
  89 }
  90
  91 static int frame_list_next_frame_size(FrameList *frame_list)
  92 {
  93     if (!frame_list->list)
  94         return 0;
  95     return frame_list->list->nb_samples;
  96 }
  97
  98 static int64_t frame_list_next_pts(FrameList *frame_list)
  99 {
 100     if (!frame_list->list)
 101         return AV_NOPTS_VALUE;
 102     return frame_list->list->pts;
 103 }
 104
 105 static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
 106 {
 107     if (nb_samples >= frame_list->nb_samples) {
 108         frame_list_clear(frame_list);
 109     } else {
 110         int samples = nb_samples;
 111         while (samples > 0) {
 112             FrameInfo *info = frame_list->list;
 113             av_assert0(info);
 114             if (info->nb_samples <= samples) {
 115                 samples -= info->nb_samples;
 116                 frame_list->list = info->next;
 117                 if (!frame_list->list)
 118                     frame_list->end = NULL;
 119                 frame_list->nb_frames--;
 120                 frame_list->nb_samples -= info->nb_samples;
 121                 av_free(info);
 122             } else {
 123                 info->nb_samples       -= samples;
 124                 info->pts              += samples;
 125                 frame_list->nb_samples -= samples;
 126                 samples = 0;
 127             }
 128         }
 129     }
 130 }
 131
 132 static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
 133 {
 134     FrameInfo *info = av_malloc(sizeof(*info));
 135     if (!info)
 136         return AVERROR(ENOMEM);
 137     info->nb_samples = nb_samples;
 138     info->pts        = pts;
 139     info->next       = NULL;
 140
 141     if (!frame_list->list) {
 142         frame_list->list = info;
 143         frame_list->end  = info;
 144     } else {
 145         av_assert0(frame_list->end);
 146         frame_list->end->next = info;
 147         frame_list->end       = info;
 148     }
 149     frame_list->nb_frames++;
 150     frame_list->nb_samples += nb_samples;
 151
 152     return 0;
 153 }
 154
/* FIXME: use the links' FIFOs directly */
 156
 157 typedef struct MixContext {
 158     const AVClass *class;       /**< class for AVOptions */
 159     AVFloatDSPContext *fdsp;
 160
 161     int nb_inputs;              /**< number of inputs */
 162     int active_inputs;          /**< number of input currently active */
 163     int duration_mode;          /**< mode for determining duration */
 164     float dropout_transition;   /**< transition time when an input drops out */
 165     char *weights_str;          /**< string for custom weights for every input */
 166
 167     int nb_channels;            /**< number of channels */
 168     int sample_rate;            /**< sample rate */
 169     int planar;
 170     AVAudioFifo **fifos;        /**< audio fifo for each input */
 171     uint8_t *input_state;       /**< current state of each input */
 172     float *input_scale;         /**< mixing scale factor for each input */
 173     float *weights;             /**< custom weights for every input */
 174     float weight_sum;           /**< sum of custom weights for every input */
 175     float *scale_norm;          /**< normalization factor for every input */
 176     int64_t next_pts;           /**< calculated pts for next output frame */
 177     FrameList *frame_list;      /**< list of frame info for the first input */
 178 } MixContext;
 179
 180 #define OFFSET(x) offsetof(MixContext, x)
 181 #define A AV_OPT_FLAG_AUDIO_PARAM
 182 #define F AV_OPT_FLAG_FILTERING_PARAM
 183 static const AVOption amix_options[] = {
 184     { "inputs", "Number of inputs.",
 185             OFFSET(nb_inputs), AV_OPT_TYPE_INT, { .i64 = 2 }, 1, 1024, A|F },
 186     { "duration", "How to determine the end-of-stream.",
 187             OFFSET(duration_mode), AV_OPT_TYPE_INT, { .i64 = DURATION_LONGEST }, 0,  2, A|F, "duration" },
 188         { "longest",  "Duration of longest input.",  0, AV_OPT_TYPE_CONST, { .i64 = DURATION_LONGEST  }, 0, 0, A|F, "duration" },
 189         { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_SHORTEST }, 0, 0, A|F, "duration" },
 190         { "first",    "Duration of first input.",    0, AV_OPT_TYPE_CONST, { .i64 = DURATION_FIRST    }, 0, 0, A|F, "duration" },
 191     { "dropout_transition", "Transition time, in seconds, for volume "
 192                             "renormalization when an input stream ends.",
 193             OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { .dbl = 2.0 }, 0, INT_MAX, A|F },
 194     { "weights", "Set weight for each input.",
 195             OFFSET(weights_str), AV_OPT_TYPE_STRING, {.str="1 1"}, 0, 0, A|F },
 196     { NULL }
 197 };
 198
 199 AVFILTER_DEFINE_CLASS(amix);
 200
 201 /**
 202  * Update the scaling factors to apply to each input during mixing.
 203  *
 204  * This balances the full volume range between active inputs and handles
 205  * volume transitions when EOF is encountered on an input but mixing continues
 206  * with the remaining inputs.
 207  */
 208 static void calculate_scales(MixContext *s, int nb_samples)
 209 {
 210     float weight_sum = 0.f;
 211     int i;
 212
 213     for (i = 0; i < s->nb_inputs; i++)
 214         if (s->input_state[i] & INPUT_ON)
 215             weight_sum += s->weights[i];
 216
 217     for (i = 0; i < s->nb_inputs; i++) {
 218         if (s->input_state[i] & INPUT_ON) {
 219             if (s->scale_norm[i] > weight_sum / s->weights[i]) {
 220                 s->scale_norm[i] -= ((s->weight_sum / s->weights[i]) / s->nb_inputs) *
 221                                     nb_samples / (s->dropout_transition * s->sample_rate);
 222                 s->scale_norm[i] = FFMAX(s->scale_norm[i], weight_sum / s->weights[i]);
 223             }
 224         }
 225     }
 226
 227     for (i = 0; i < s->nb_inputs; i++) {
 228         if (s->input_state[i] & INPUT_ON)
 229             s->input_scale[i] = 1.0f / s->scale_norm[i];
 230         else
 231             s->input_scale[i] = 0.0f;
 232     }
 233 }
 234
 235 static int config_output(AVFilterLink *outlink)
 236 {
 237     AVFilterContext *ctx = outlink->src;
 238     MixContext *s      = ctx->priv;
 239     int i;
 240     char buf[64];
 241
 242     s->planar          = av_sample_fmt_is_planar(outlink->format);
 243     s->sample_rate     = outlink->sample_rate;
 244     outlink->time_base = (AVRational){ 1, outlink->sample_rate };
 245     s->next_pts        = AV_NOPTS_VALUE;
 246
 247     s->frame_list = av_mallocz(sizeof(*s->frame_list));
 248     if (!s->frame_list)
 249         return AVERROR(ENOMEM);
 250
 251     s->fifos = av_mallocz_array(s->nb_inputs, sizeof(*s->fifos));
 252     if (!s->fifos)
 253         return AVERROR(ENOMEM);
 254
 255     s->nb_channels = outlink->channels;
 256     for (i = 0; i < s->nb_inputs; i++) {
 257         s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
 258         if (!s->fifos[i])
 259             return AVERROR(ENOMEM);
 260     }
 261
 262     s->input_state = av_malloc(s->nb_inputs);
 263     if (!s->input_state)
 264         return AVERROR(ENOMEM);
 265     memset(s->input_state, INPUT_ON, s->nb_inputs);
 266     s->active_inputs = s->nb_inputs;
 267
 268     s->input_scale = av_mallocz_array(s->nb_inputs, sizeof(*s->input_scale));
 269     s->scale_norm  = av_mallocz_array(s->nb_inputs, sizeof(*s->scale_norm));
 270     if (!s->input_scale || !s->scale_norm)
 271         return AVERROR(ENOMEM);
 272     for (i = 0; i < s->nb_inputs; i++)
 273         s->scale_norm[i] = s->weight_sum / s->weights[i];
 274     calculate_scales(s, 0);
 275
 276     av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
 277
 278     av_log(ctx, AV_LOG_VERBOSE,
 279            "inputs:%d fmt:%s srate:%d cl:%s\n", s->nb_inputs,
 280            av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
 281
 282     return 0;
 283 }
 284
 285 /**
 286  * Read samples from the input FIFOs, mix, and write to the output link.
 287  */
 288 static int output_frame(AVFilterLink *outlink)
 289 {
 290     AVFilterContext *ctx = outlink->src;
 291     MixContext      *s = ctx->priv;
 292     AVFrame *out_buf, *in_buf;
 293     int nb_samples, ns, i;
 294
 295     if (s->input_state[0] & INPUT_ON) {
 296         /* first input live: use the corresponding frame size */
 297         nb_samples = frame_list_next_frame_size(s->frame_list);
 298         for (i = 1; i < s->nb_inputs; i++) {
 299             if (s->input_state[i] & INPUT_ON) {
 300                 ns = av_audio_fifo_size(s->fifos[i]);
 301                 if (ns < nb_samples) {
 302                     if (!(s->input_state[i] & INPUT_EOF))
 303                         /* unclosed input with not enough samples */
 304                         return 0;
 305                     /* closed input to drain */
 306                     nb_samples = ns;
 307                 }
 308             }
 309         }
 310     } else {
 311         /* first input closed: use the available samples */
 312         nb_samples = INT_MAX;
 313         for (i = 1; i < s->nb_inputs; i++) {
 314             if (s->input_state[i] & INPUT_ON) {
 315                 ns = av_audio_fifo_size(s->fifos[i]);
 316                 nb_samples = FFMIN(nb_samples, ns);
 317             }
 318         }
 319         if (nb_samples == INT_MAX) {
 320             ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
 321             return 0;
 322         }
 323     }
 324
 325     s->next_pts = frame_list_next_pts(s->frame_list);
 326     frame_list_remove_samples(s->frame_list, nb_samples);
 327
 328     calculate_scales(s, nb_samples);
 329
 330     if (nb_samples == 0)
 331         return 0;
 332
 333     out_buf = ff_get_audio_buffer(outlink, nb_samples);
 334     if (!out_buf)
 335         return AVERROR(ENOMEM);
 336
 337     in_buf = ff_get_audio_buffer(outlink, nb_samples);
 338     if (!in_buf) {
 339         av_frame_free(&out_buf);
 340         return AVERROR(ENOMEM);
 341     }
 342
 343     for (i = 0; i < s->nb_inputs; i++) {
 344         if (s->input_state[i] & INPUT_ON) {
 345             int planes, plane_size, p;
 346
 347             av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
 348                                nb_samples);
 349
 350             planes     = s->planar ? s->nb_channels : 1;
 351             plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
 352             plane_size = FFALIGN(plane_size, 16);
 353
 354             if (out_buf->format == AV_SAMPLE_FMT_FLT ||
 355                 out_buf->format == AV_SAMPLE_FMT_FLTP) {
 356                 for (p = 0; p < planes; p++) {
 357                     s->fdsp->vector_fmac_scalar((float *)out_buf->extended_data[p],
 358                                                 (float *) in_buf->extended_data[p],
 359                                                 s->input_scale[i], plane_size);
 360                 }
 361             } else {
 362                 for (p = 0; p < planes; p++) {
 363                     s->fdsp->vector_dmac_scalar((double *)out_buf->extended_data[p],
 364                                                 (double *) in_buf->extended_data[p],
 365                                                 s->input_scale[i], plane_size);
 366                 }
 367             }
 368         }
 369     }
 370     av_frame_free(&in_buf);
 371
 372     out_buf->pts = s->next_pts;
 373     if (s->next_pts != AV_NOPTS_VALUE)
 374         s->next_pts += nb_samples;
 375
 376     return ff_filter_frame(outlink, out_buf);
 377 }
 378
 379 /**
 380  * Requests a frame, if needed, from each input link other than the first.
 381  */
 382 static int request_samples(AVFilterContext *ctx, int min_samples)
 383 {
 384     MixContext *s = ctx->priv;
 385     int i;
 386
 387     av_assert0(s->nb_inputs > 1);
 388
 389     for (i = 1; i < s->nb_inputs; i++) {
 390         if (!(s->input_state[i] & INPUT_ON) ||
 391              (s->input_state[i] & INPUT_EOF))
 392             continue;
 393         if (av_audio_fifo_size(s->fifos[i]) >= min_samples)
 394             continue;
 395         ff_inlink_request_frame(ctx->inputs[i]);
 396     }
 397     return output_frame(ctx->outputs[0]);
 398 }
 399
 400 /**
 401  * Calculates the number of active inputs and determines EOF based on the
 402  * duration option.
 403  *
 404  * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
 405  */
 406 static int calc_active_inputs(MixContext *s)
 407 {
 408     int i;
 409     int active_inputs = 0;
 410     for (i = 0; i < s->nb_inputs; i++)
 411         active_inputs += !!(s->input_state[i] & INPUT_ON);
 412     s->active_inputs = active_inputs;
 413
 414     if (!active_inputs ||
 415         (s->duration_mode == DURATION_FIRST && !(s->input_state[0] & INPUT_ON)) ||
 416         (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
 417         return AVERROR_EOF;
 418     return 0;
 419 }
 420
 421 static int activate(AVFilterContext *ctx)
 422 {
 423     AVFilterLink *outlink = ctx->outputs[0];
 424     MixContext *s = ctx->priv;
 425     AVFrame *buf = NULL;
 426     int i, ret;
 427
 428     FF_FILTER_FORWARD_STATUS_BACK_ALL(outlink, ctx);
 429
 430     for (i = 0; i < s->nb_inputs; i++) {
 431         AVFilterLink *inlink = ctx->inputs[i];
 432
 433         if ((ret = ff_inlink_consume_frame(ctx->inputs[i], &buf)) > 0) {
 434             if (i == 0) {
 435                 int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
 436                                            outlink->time_base);
 437                 ret = frame_list_add_frame(s->frame_list, buf->nb_samples, pts);
 438                 if (ret < 0) {
 439                     av_frame_free(&buf);
 440                     return ret;
 441                 }
 442             }
 443
 444             ret = av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
 445                                       buf->nb_samples);
 446             if (ret < 0) {
 447                 av_frame_free(&buf);
 448                 return ret;
 449             }
 450
 451             av_frame_free(&buf);
 452
 453             ret = output_frame(outlink);
 454             if (ret < 0)
 455                 return ret;
 456         }
 457     }
 458
 459     for (i = 0; i < s->nb_inputs; i++) {
 460         int64_t pts;
 461         int status;
 462
 463         if (ff_inlink_acknowledge_status(ctx->inputs[i], &status, &pts)) {
 464             if (status == AVERROR_EOF) {
 465                 if (i == 0) {
 466                     s->input_state[i] = 0;
 467                     if (s->nb_inputs == 1) {
 468                         ff_outlink_set_status(outlink, status, pts);
 469                         return 0;
 470                     }
 471                 } else {
 472                     s->input_state[i] |= INPUT_EOF;
 473                     if (av_audio_fifo_size(s->fifos[i]) == 0) {
 474                         s->input_state[i] = 0;
 475                     }
 476                 }
 477             }
 478         }
 479     }
 480
 481     if (calc_active_inputs(s)) {
 482         ff_outlink_set_status(outlink, AVERROR_EOF, s->next_pts);
 483         return 0;
 484     }
 485
 486     if (ff_outlink_frame_wanted(outlink)) {
 487         int wanted_samples;
 488
 489         if (!(s->input_state[0] & INPUT_ON))
 490             return request_samples(ctx, 1);
 491
 492         if (s->frame_list->nb_frames == 0) {
 493             ff_inlink_request_frame(ctx->inputs[0]);
 494             return 0;
 495         }
 496         av_assert0(s->frame_list->nb_frames > 0);
 497
 498         wanted_samples = frame_list_next_frame_size(s->frame_list);
 499
 500         return request_samples(ctx, wanted_samples);
 501     }
 502
 503     return 0;
 504 }
 505
 506 static av_cold int init(AVFilterContext *ctx)
 507 {
 508     MixContext *s = ctx->priv;
 509     char *p, *arg, *saveptr = NULL;
 510     float last_weight = 1.f;
 511     int i, ret;
 512
 513     for (i = 0; i < s->nb_inputs; i++) {
 514         AVFilterPad pad = { 0 };
 515
 516         pad.type           = AVMEDIA_TYPE_AUDIO;
 517         pad.name           = av_asprintf("input%d", i);
 518         if (!pad.name)
 519             return AVERROR(ENOMEM);
 520
 521         if ((ret = ff_insert_inpad(ctx, i, &pad)) < 0) {
 522             av_freep(&pad.name);
 523             return ret;
 524         }
 525     }
 526
 527     s->fdsp = avpriv_float_dsp_alloc(0);
 528     if (!s->fdsp)
 529         return AVERROR(ENOMEM);
 530
 531     s->weights = av_mallocz_array(s->nb_inputs, sizeof(*s->weights));
 532     if (!s->weights)
 533         return AVERROR(ENOMEM);
 534
 535     p = s->weights_str;
 536     for (i = 0; i < s->nb_inputs; i++) {
 537         if (!(arg = av_strtok(p, " ", &saveptr)))
 538             break;
 539
 540         p = NULL;
 541         sscanf(arg, "%f", &last_weight);
 542         s->weights[i] = last_weight;
 543         s->weight_sum += last_weight;
 544     }
 545
 546     for (; i < s->nb_inputs; i++) {
 547         s->weights[i] = last_weight;
 548         s->weight_sum += last_weight;
 549     }
 550
 551     return 0;
 552 }
 553
 554 static av_cold void uninit(AVFilterContext *ctx)
 555 {
 556     int i;
 557     MixContext *s = ctx->priv;
 558
 559     if (s->fifos) {
 560         for (i = 0; i < s->nb_inputs; i++)
 561             av_audio_fifo_free(s->fifos[i]);
 562         av_freep(&s->fifos);
 563     }
 564     frame_list_clear(s->frame_list);
 565     av_freep(&s->frame_list);
 566     av_freep(&s->input_state);
 567     av_freep(&s->input_scale);
 568     av_freep(&s->scale_norm);
 569     av_freep(&s->weights);
 570     av_freep(&s->fdsp);
 571
 572     for (i = 0; i < ctx->nb_inputs; i++)
 573         av_freep(&ctx->input_pads[i].name);
 574 }
 575
 576 static int query_formats(AVFilterContext *ctx)
 577 {
 578     AVFilterFormats *formats = NULL;
 579     AVFilterChannelLayouts *layouts;
 580     int ret;
 581
 582     layouts = ff_all_channel_counts();
 583     if (!layouts) {
 584         ret = AVERROR(ENOMEM);
 585         goto fail;
 586     }
 587
 588     if ((ret = ff_add_format(&formats, AV_SAMPLE_FMT_FLT ))          < 0 ||
 589         (ret = ff_add_format(&formats, AV_SAMPLE_FMT_FLTP))          < 0 ||
 590         (ret = ff_add_format(&formats, AV_SAMPLE_FMT_DBL ))          < 0 ||
 591         (ret = ff_add_format(&formats, AV_SAMPLE_FMT_DBLP))          < 0 ||
 592         (ret = ff_set_common_formats        (ctx, formats))          < 0 ||
 593         (ret = ff_set_common_channel_layouts(ctx, layouts))          < 0 ||
 594         (ret = ff_set_common_samplerates(ctx, ff_all_samplerates())) < 0)
 595         goto fail;
 596     return 0;
 597 fail:
 598     if (layouts)
 599         av_freep(&layouts->channel_layouts);
 600     av_freep(&layouts);
 601     return ret;
 602 }
 603
 604 static const AVFilterPad avfilter_af_amix_outputs[] = {
 605     {
 606         .name          = "default",
 607         .type          = AVMEDIA_TYPE_AUDIO,
 608         .config_props  = config_output,
 609     },
 610     { NULL }
 611 };
 612
 613 AVFilter ff_af_amix = {
 614     .name           = "amix",
 615     .description    = NULL_IF_CONFIG_SMALL("Audio mixing."),
 616     .priv_size      = sizeof(MixContext),
 617     .priv_class     = &amix_class,
 618     .init           = init,
 619     .uninit         = uninit,
 620     .activate       = activate,
 621     .query_formats  = query_formats,
 622     .inputs         = NULL,
 623     .outputs        = avfilter_af_amix_outputs,
 624     .flags          = AVFILTER_FLAG_DYNAMIC_INPUTS,
 625 };