git.sesse.net Git - ffmpeg/blob - libavfilter/af_amix.c

   1 /*
   2  * Audio Mix Filter
   3  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * Audio Mix Filter
  25  *
  26  * Mixes audio from multiple sources into a single output. The channel layout,
  27  * sample rate, and sample format will be the same for all inputs and the
  28  * output.
  29  */
  30
  31 #include "libavutil/audioconvert.h"
  32 #include "libavutil/audio_fifo.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/avstring.h"
  35 #include "libavutil/common.h"
  36 #include "libavutil/float_dsp.h"
  37 #include "libavutil/mathematics.h"
  38 #include "libavutil/opt.h"
  39 #include "libavutil/samplefmt.h"
  40
  41 #include "audio.h"
  42 #include "avfilter.h"
  43 #include "formats.h"
  44 #include "internal.h"
  45
  46 #define INPUT_OFF      0    /**< input has reached EOF */
  47 #define INPUT_ON       1    /**< input is active */
  48 #define INPUT_INACTIVE 2    /**< input is on, but is currently inactive */
  49
  50 #define DURATION_LONGEST  0
  51 #define DURATION_SHORTEST 1
  52 #define DURATION_FIRST    2
  53
  54
  55 typedef struct FrameInfo {
  56     int nb_samples;
  57     int64_t pts;
  58     struct FrameInfo *next;
  59 } FrameInfo;
  60
  61 /**
  62  * Linked list used to store timestamps and frame sizes of all frames in the
  63  * FIFO for the first input.
  64  *
  65  * This is needed to keep timestamps synchronized for the case where multiple
  66  * input frames are pushed to the filter for processing before a frame is
  67  * requested by the output link.
  68  */
  69 typedef struct FrameList {
  70     int nb_frames;
  71     int nb_samples;
  72     FrameInfo *list;
  73     FrameInfo *end;
  74 } FrameList;
  75
  76 static void frame_list_clear(FrameList *frame_list)
  77 {
  78     if (frame_list) {
  79         while (frame_list->list) {
  80             FrameInfo *info = frame_list->list;
  81             frame_list->list = info->next;
  82             av_free(info);
  83         }
  84         frame_list->nb_frames  = 0;
  85         frame_list->nb_samples = 0;
  86         frame_list->end        = NULL;
  87     }
  88 }
  89
  90 static int frame_list_next_frame_size(FrameList *frame_list)
  91 {
  92     if (!frame_list->list)
  93         return 0;
  94     return frame_list->list->nb_samples;
  95 }
  96
  97 static int64_t frame_list_next_pts(FrameList *frame_list)
  98 {
  99     if (!frame_list->list)
 100         return AV_NOPTS_VALUE;
 101     return frame_list->list->pts;
 102 }
 103
 104 static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
 105 {
 106     if (nb_samples >= frame_list->nb_samples) {
 107         frame_list_clear(frame_list);
 108     } else {
 109         int samples = nb_samples;
 110         while (samples > 0) {
 111             FrameInfo *info = frame_list->list;
 112             av_assert0(info != NULL);
 113             if (info->nb_samples <= samples) {
 114                 samples -= info->nb_samples;
 115                 frame_list->list = info->next;
 116                 if (!frame_list->list)
 117                     frame_list->end = NULL;
 118                 frame_list->nb_frames--;
 119                 frame_list->nb_samples -= info->nb_samples;
 120                 av_free(info);
 121             } else {
 122                 info->nb_samples       -= samples;
 123                 info->pts              += samples;
 124                 frame_list->nb_samples -= samples;
 125                 samples = 0;
 126             }
 127         }
 128     }
 129 }
 130
 131 static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
 132 {
 133     FrameInfo *info = av_malloc(sizeof(*info));
 134     if (!info)
 135         return AVERROR(ENOMEM);
 136     info->nb_samples = nb_samples;
 137     info->pts        = pts;
 138     info->next       = NULL;
 139
 140     if (!frame_list->list) {
 141         frame_list->list = info;
 142         frame_list->end  = info;
 143     } else {
 144         av_assert0(frame_list->end != NULL);
 145         frame_list->end->next = info;
 146         frame_list->end       = info;
 147     }
 148     frame_list->nb_frames++;
 149     frame_list->nb_samples += nb_samples;
 150
 151     return 0;
 152 }
 153
 154
 155 typedef struct MixContext {
 156     const AVClass *class;       /**< class for AVOptions */
 157     AVFloatDSPContext fdsp;
 158
 159     int nb_inputs;              /**< number of inputs */
 160     int active_inputs;          /**< number of input currently active */
 161     int duration_mode;          /**< mode for determining duration */
 162     float dropout_transition;   /**< transition time when an input drops out */
 163
 164     int nb_channels;            /**< number of channels */
 165     int sample_rate;            /**< sample rate */
 166     int planar;
 167     AVAudioFifo **fifos;        /**< audio fifo for each input */
 168     uint8_t *input_state;       /**< current state of each input */
 169     float *input_scale;         /**< mixing scale factor for each input */
 170     float scale_norm;           /**< normalization factor for all inputs */
 171     int64_t next_pts;           /**< calculated pts for next output frame */
 172     FrameList *frame_list;      /**< list of frame info for the first input */
 173 } MixContext;
 174
 175 #define OFFSET(x) offsetof(MixContext, x)
 176 #define A AV_OPT_FLAG_AUDIO_PARAM
 177 static const AVOption options[] = {
 178     { "inputs", "Number of inputs.",
 179             OFFSET(nb_inputs), AV_OPT_TYPE_INT, { .i64 = 2 }, 1, 32, A },
 180     { "duration", "How to determine the end-of-stream.",
 181             OFFSET(duration_mode), AV_OPT_TYPE_INT, { .i64 = DURATION_LONGEST }, 0,  2, A, "duration" },
 182         { "longest",  "Duration of longest input.",  0, AV_OPT_TYPE_CONST, { .i64 = DURATION_LONGEST  }, INT_MIN, INT_MAX, A, "duration" },
 183         { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { .i64 = DURATION_SHORTEST }, INT_MIN, INT_MAX, A, "duration" },
 184         { "first",    "Duration of first input.",    0, AV_OPT_TYPE_CONST, { .i64 = DURATION_FIRST    }, INT_MIN, INT_MAX, A, "duration" },
 185     { "dropout_transition", "Transition time, in seconds, for volume "
 186                             "renormalization when an input stream ends.",
 187             OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { .dbl = 2.0 }, 0, INT_MAX, A },
 188     { NULL },
 189 };
 190
 191 static const AVClass amix_class = {
 192     .class_name = "amix filter",
 193     .item_name  = av_default_item_name,
 194     .option     = options,
 195     .version    = LIBAVUTIL_VERSION_INT,
 196 };
 197
 198
 199 /**
 200  * Update the scaling factors to apply to each input during mixing.
 201  *
 202  * This balances the full volume range between active inputs and handles
 203  * volume transitions when EOF is encountered on an input but mixing continues
 204  * with the remaining inputs.
 205  */
 206 static void calculate_scales(MixContext *s, int nb_samples)
 207 {
 208     int i;
 209
 210     if (s->scale_norm > s->active_inputs) {
 211         s->scale_norm -= nb_samples / (s->dropout_transition * s->sample_rate);
 212         s->scale_norm = FFMAX(s->scale_norm, s->active_inputs);
 213     }
 214
 215     for (i = 0; i < s->nb_inputs; i++) {
 216         if (s->input_state[i] == INPUT_ON)
 217             s->input_scale[i] = 1.0f / s->scale_norm;
 218         else
 219             s->input_scale[i] = 0.0f;
 220     }
 221 }
 222
 223 static int config_output(AVFilterLink *outlink)
 224 {
 225     AVFilterContext *ctx = outlink->src;
 226     MixContext *s      = ctx->priv;
 227     int i;
 228     char buf[64];
 229
 230     s->planar          = av_sample_fmt_is_planar(outlink->format);
 231     s->sample_rate     = outlink->sample_rate;
 232     outlink->time_base = (AVRational){ 1, outlink->sample_rate };
 233     s->next_pts        = AV_NOPTS_VALUE;
 234
 235     s->frame_list = av_mallocz(sizeof(*s->frame_list));
 236     if (!s->frame_list)
 237         return AVERROR(ENOMEM);
 238
 239     s->fifos = av_mallocz(s->nb_inputs * sizeof(*s->fifos));
 240     if (!s->fifos)
 241         return AVERROR(ENOMEM);
 242
 243     s->nb_channels = av_get_channel_layout_nb_channels(outlink->channel_layout);
 244     for (i = 0; i < s->nb_inputs; i++) {
 245         s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
 246         if (!s->fifos[i])
 247             return AVERROR(ENOMEM);
 248     }
 249
 250     s->input_state = av_malloc(s->nb_inputs);
 251     if (!s->input_state)
 252         return AVERROR(ENOMEM);
 253     memset(s->input_state, INPUT_ON, s->nb_inputs);
 254     s->active_inputs = s->nb_inputs;
 255
 256     s->input_scale = av_mallocz(s->nb_inputs * sizeof(*s->input_scale));
 257     if (!s->input_scale)
 258         return AVERROR(ENOMEM);
 259     s->scale_norm = s->active_inputs;
 260     calculate_scales(s, 0);
 261
 262     av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
 263
 264     av_log(ctx, AV_LOG_VERBOSE,
 265            "inputs:%d fmt:%s srate:%d cl:%s\n", s->nb_inputs,
 266            av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
 267
 268     return 0;
 269 }
 270
 271 /**
 272  * Read samples from the input FIFOs, mix, and write to the output link.
 273  */
 274 static int output_frame(AVFilterLink *outlink, int nb_samples)
 275 {
 276     AVFilterContext *ctx = outlink->src;
 277     MixContext      *s = ctx->priv;
 278     AVFilterBufferRef *out_buf, *in_buf;
 279     int i;
 280
 281     calculate_scales(s, nb_samples);
 282
 283     out_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
 284     if (!out_buf)
 285         return AVERROR(ENOMEM);
 286
 287     in_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
 288     if (!in_buf)
 289         return AVERROR(ENOMEM);
 290
 291     for (i = 0; i < s->nb_inputs; i++) {
 292         if (s->input_state[i] == INPUT_ON) {
 293             int planes, plane_size, p;
 294
 295             av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
 296                                nb_samples);
 297
 298             planes     = s->planar ? s->nb_channels : 1;
 299             plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
 300             plane_size = FFALIGN(plane_size, 16);
 301
 302             for (p = 0; p < planes; p++) {
 303                 s->fdsp.vector_fmac_scalar((float *)out_buf->extended_data[p],
 304                                            (float *) in_buf->extended_data[p],
 305                                            s->input_scale[i], plane_size);
 306             }
 307         }
 308     }
 309     avfilter_unref_buffer(in_buf);
 310
 311     out_buf->pts = s->next_pts;
 312     if (s->next_pts != AV_NOPTS_VALUE)
 313         s->next_pts += nb_samples;
 314
 315     return ff_filter_samples(outlink, out_buf);
 316 }
 317
 318 /**
 319  * Returns the smallest number of samples available in the input FIFOs other
 320  * than that of the first input.
 321  */
 322 static int get_available_samples(MixContext *s)
 323 {
 324     int i;
 325     int available_samples = INT_MAX;
 326
 327     av_assert0(s->nb_inputs > 1);
 328
 329     for (i = 1; i < s->nb_inputs; i++) {
 330         int nb_samples;
 331         if (s->input_state[i] == INPUT_OFF)
 332             continue;
 333         nb_samples = av_audio_fifo_size(s->fifos[i]);
 334         available_samples = FFMIN(available_samples, nb_samples);
 335     }
 336     if (available_samples == INT_MAX)
 337         return 0;
 338     return available_samples;
 339 }
 340
 341 /**
 342  * Requests a frame, if needed, from each input link other than the first.
 343  */
 344 static int request_samples(AVFilterContext *ctx, int min_samples)
 345 {
 346     MixContext *s = ctx->priv;
 347     int i, ret;
 348
 349     av_assert0(s->nb_inputs > 1);
 350
 351     for (i = 1; i < s->nb_inputs; i++) {
 352         ret = 0;
 353         if (s->input_state[i] == INPUT_OFF)
 354             continue;
 355         while (!ret && av_audio_fifo_size(s->fifos[i]) < min_samples)
 356             ret = ff_request_frame(ctx->inputs[i]);
 357         if (ret == AVERROR_EOF) {
 358             if (av_audio_fifo_size(s->fifos[i]) == 0) {
 359                 s->input_state[i] = INPUT_OFF;
 360                 continue;
 361             }
 362         } else if (ret < 0)
 363             return ret;
 364     }
 365     return 0;
 366 }
 367
 368 /**
 369  * Calculates the number of active inputs and determines EOF based on the
 370  * duration option.
 371  *
 372  * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
 373  */
 374 static int calc_active_inputs(MixContext *s)
 375 {
 376     int i;
 377     int active_inputs = 0;
 378     for (i = 0; i < s->nb_inputs; i++)
 379         active_inputs += !!(s->input_state[i] != INPUT_OFF);
 380     s->active_inputs = active_inputs;
 381
 382     if (!active_inputs ||
 383         (s->duration_mode == DURATION_FIRST && s->input_state[0] == INPUT_OFF) ||
 384         (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
 385         return AVERROR_EOF;
 386     return 0;
 387 }
 388
 389 static int request_frame(AVFilterLink *outlink)
 390 {
 391     AVFilterContext *ctx = outlink->src;
 392     MixContext      *s = ctx->priv;
 393     int ret;
 394     int wanted_samples, available_samples;
 395
 396     ret = calc_active_inputs(s);
 397     if (ret < 0)
 398         return ret;
 399
 400     if (s->input_state[0] == INPUT_OFF) {
 401         ret = request_samples(ctx, 1);
 402         if (ret < 0)
 403             return ret;
 404
 405         ret = calc_active_inputs(s);
 406         if (ret < 0)
 407             return ret;
 408
 409         available_samples = get_available_samples(s);
 410         if (!available_samples)
 411             return AVERROR(EAGAIN);
 412
 413         return output_frame(outlink, available_samples);
 414     }
 415
 416     if (s->frame_list->nb_frames == 0) {
 417         ret = ff_request_frame(ctx->inputs[0]);
 418         if (ret == AVERROR_EOF) {
 419             s->input_state[0] = INPUT_OFF;
 420             if (s->nb_inputs == 1)
 421                 return AVERROR_EOF;
 422             else
 423                 return AVERROR(EAGAIN);
 424         } else if (ret < 0)
 425             return ret;
 426     }
 427     av_assert0(s->frame_list->nb_frames > 0);
 428
 429     wanted_samples = frame_list_next_frame_size(s->frame_list);
 430
 431     if (s->active_inputs > 1) {
 432         ret = request_samples(ctx, wanted_samples);
 433         if (ret < 0)
 434             return ret;
 435
 436         ret = calc_active_inputs(s);
 437         if (ret < 0)
 438             return ret;
 439     }
 440
 441     if (s->active_inputs > 1) {
 442         available_samples = get_available_samples(s);
 443         if (!available_samples)
 444             return AVERROR(EAGAIN);
 445         available_samples = FFMIN(available_samples, wanted_samples);
 446     } else {
 447         available_samples = wanted_samples;
 448     }
 449
 450     s->next_pts = frame_list_next_pts(s->frame_list);
 451     frame_list_remove_samples(s->frame_list, available_samples);
 452
 453     return output_frame(outlink, available_samples);
 454 }
 455
 456 static int filter_samples(AVFilterLink *inlink, AVFilterBufferRef *buf)
 457 {
 458     AVFilterContext  *ctx = inlink->dst;
 459     MixContext       *s = ctx->priv;
 460     AVFilterLink *outlink = ctx->outputs[0];
 461     int i, ret = 0;
 462
 463     for (i = 0; i < ctx->nb_inputs; i++)
 464         if (ctx->inputs[i] == inlink)
 465             break;
 466     if (i >= ctx->nb_inputs) {
 467         av_log(ctx, AV_LOG_ERROR, "unknown input link\n");
 468         ret = AVERROR(EINVAL);
 469         goto fail;
 470     }
 471
 472     if (i == 0) {
 473         int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
 474                                    outlink->time_base);
 475         ret = frame_list_add_frame(s->frame_list, buf->audio->nb_samples, pts);
 476         if (ret < 0)
 477             goto fail;
 478     }
 479
 480     ret = av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
 481                               buf->audio->nb_samples);
 482
 483 fail:
 484     avfilter_unref_buffer(buf);
 485
 486     return ret;
 487 }
 488
 489 static int init(AVFilterContext *ctx, const char *args)
 490 {
 491     MixContext *s = ctx->priv;
 492     int i, ret;
 493
 494     s->class = &amix_class;
 495     av_opt_set_defaults(s);
 496
 497     if ((ret = av_set_options_string(s, args, "=", ":")) < 0) {
 498         av_log(ctx, AV_LOG_ERROR, "Error parsing options string '%s'.\n", args);
 499         return ret;
 500     }
 501     av_opt_free(s);
 502
 503     for (i = 0; i < s->nb_inputs; i++) {
 504         char name[32];
 505         AVFilterPad pad = { 0 };
 506
 507         snprintf(name, sizeof(name), "input%d", i);
 508         pad.type           = AVMEDIA_TYPE_AUDIO;
 509         pad.name           = av_strdup(name);
 510         pad.filter_samples = filter_samples;
 511
 512         ff_insert_inpad(ctx, i, &pad);
 513     }
 514
 515     avpriv_float_dsp_init(&s->fdsp, 0);
 516
 517     return 0;
 518 }
 519
 520 static void uninit(AVFilterContext *ctx)
 521 {
 522     int i;
 523     MixContext *s = ctx->priv;
 524
 525     if (s->fifos) {
 526         for (i = 0; i < s->nb_inputs; i++)
 527             av_audio_fifo_free(s->fifos[i]);
 528         av_freep(&s->fifos);
 529     }
 530     frame_list_clear(s->frame_list);
 531     av_freep(&s->frame_list);
 532     av_freep(&s->input_state);
 533     av_freep(&s->input_scale);
 534
 535     for (i = 0; i < ctx->nb_inputs; i++)
 536         av_freep(&ctx->input_pads[i].name);
 537 }
 538
 539 static int query_formats(AVFilterContext *ctx)
 540 {
 541     AVFilterFormats *formats = NULL;
 542     ff_add_format(&formats, AV_SAMPLE_FMT_FLT);
 543     ff_add_format(&formats, AV_SAMPLE_FMT_FLTP);
 544     ff_set_common_formats(ctx, formats);
 545     ff_set_common_channel_layouts(ctx, ff_all_channel_layouts());
 546     ff_set_common_samplerates(ctx, ff_all_samplerates());
 547     return 0;
 548 }
 549
 550 AVFilter avfilter_af_amix = {
 551     .name          = "amix",
 552     .description   = NULL_IF_CONFIG_SMALL("Audio mixing."),
 553     .priv_size     = sizeof(MixContext),
 554
 555     .init           = init,
 556     .uninit         = uninit,
 557     .query_formats  = query_formats,
 558
 559     .inputs    = (const AVFilterPad[]) {{ .name = NULL}},
 560     .outputs   = (const AVFilterPad[]) {{ .name          = "default",
 561                                           .type          = AVMEDIA_TYPE_AUDIO,
 562                                           .config_props  = config_output,
 563                                           .request_frame = request_frame },
 564                                         { .name = NULL}},
 565 };