git.sesse.net Git - ffmpeg/blob - libavfilter/af_amix.c

   1 /*
   2  * Audio Mix Filter
   3  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * Audio Mix Filter
  25  *
  26  * Mixes audio from multiple sources into a single output. The channel layout,
  27  * sample rate, and sample format will be the same for all inputs and the
  28  * output.
  29  */
  30
  31 #include "libavutil/audioconvert.h"
  32 #include "libavutil/audio_fifo.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/avstring.h"
  35 #include "libavutil/mathematics.h"
  36 #include "libavutil/opt.h"
  37 #include "libavutil/samplefmt.h"
  38
  39 #include "audio.h"
  40 #include "avfilter.h"
  41 #include "formats.h"
  42 #include "internal.h"
  43
/* Per-input state values, stored in MixContext.input_state[]. */
#define INPUT_OFF      0    /**< input has reached EOF */
#define INPUT_ON       1    /**< input is active */
#define INPUT_INACTIVE 2    /**< input is on, but is currently inactive */

/* Values of the "duration" option (MixContext.duration_mode). */
#define DURATION_LONGEST  0 /**< mix until every input has reached EOF */
#define DURATION_SHORTEST 1 /**< stop as soon as any input has reached EOF */
#define DURATION_FIRST    2 /**< stop when the first input has reached EOF */
  51
  52
/**
 * Size and timestamp of one queued frame; a node of the singly linked
 * FrameList.
 */
typedef struct FrameInfo {
    int nb_samples;             /**< samples of this frame not yet consumed */
    int64_t pts;                /**< timestamp of the first remaining sample */
    struct FrameInfo *next;     /**< next node, or NULL for the last node */
} FrameInfo;
  58
/**
 * Linked list used to store timestamps and frame sizes of all frames in the
 * FIFO for the first input.
 *
 * This is needed to keep timestamps synchronized for the case where multiple
 * input frames are pushed to the filter for processing before a frame is
 * requested by the output link.
 */
typedef struct FrameList {
    int nb_frames;      /**< number of nodes currently in the list */
    int nb_samples;     /**< sum of nb_samples over all nodes */
    FrameInfo *list;    /**< head (oldest frame), or NULL when empty */
    FrameInfo *end;     /**< tail (newest frame), or NULL when empty */
} FrameList;
  73
  74 static void frame_list_clear(FrameList *frame_list)
  75 {
  76     if (frame_list) {
  77         while (frame_list->list) {
  78             FrameInfo *info = frame_list->list;
  79             frame_list->list = info->next;
  80             av_free(info);
  81         }
  82         frame_list->nb_frames  = 0;
  83         frame_list->nb_samples = 0;
  84         frame_list->end        = NULL;
  85     }
  86 }
  87
  88 static int frame_list_next_frame_size(FrameList *frame_list)
  89 {
  90     if (!frame_list->list)
  91         return 0;
  92     return frame_list->list->nb_samples;
  93 }
  94
  95 static int64_t frame_list_next_pts(FrameList *frame_list)
  96 {
  97     if (!frame_list->list)
  98         return AV_NOPTS_VALUE;
  99     return frame_list->list->pts;
 100 }
 101
 102 static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
 103 {
 104     if (nb_samples >= frame_list->nb_samples) {
 105         frame_list_clear(frame_list);
 106     } else {
 107         int samples = nb_samples;
 108         while (samples > 0) {
 109             FrameInfo *info = frame_list->list;
 110             av_assert0(info != NULL);
 111             if (info->nb_samples <= samples) {
 112                 samples -= info->nb_samples;
 113                 frame_list->list = info->next;
 114                 if (!frame_list->list)
 115                     frame_list->end = NULL;
 116                 frame_list->nb_frames--;
 117                 frame_list->nb_samples -= info->nb_samples;
 118                 av_free(info);
 119             } else {
 120                 info->nb_samples       -= samples;
 121                 info->pts              += samples;
 122                 frame_list->nb_samples -= samples;
 123                 samples = 0;
 124             }
 125         }
 126     }
 127 }
 128
 129 static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
 130 {
 131     FrameInfo *info = av_malloc(sizeof(*info));
 132     if (!info)
 133         return AVERROR(ENOMEM);
 134     info->nb_samples = nb_samples;
 135     info->pts        = pts;
 136     info->next       = NULL;
 137
 138     if (!frame_list->list) {
 139         frame_list->list = info;
 140         frame_list->end  = info;
 141     } else {
 142         av_assert0(frame_list->end != NULL);
 143         frame_list->end->next = info;
 144         frame_list->end       = info;
 145     }
 146     frame_list->nb_frames++;
 147     frame_list->nb_samples += nb_samples;
 148
 149     return 0;
 150 }
 151
 152
/**
 * Private context of the amix filter.
 *
 * The first group of fields is set through AVOptions in init(); the remaining
 * fields are set up in config_output().
 */
typedef struct MixContext {
    const AVClass *class;       /**< class for AVOptions */

    int nb_inputs;              /**< number of inputs */
    int active_inputs;          /**< number of inputs currently active */
    int duration_mode;          /**< mode for determining duration */
    float dropout_transition;   /**< transition time when an input drops out */

    int nb_channels;            /**< number of channels */
    int sample_rate;            /**< sample rate */
    AVAudioFifo **fifos;        /**< audio fifo for each input */
    uint8_t *input_state;       /**< current state of each input */
    float *input_scale;         /**< mixing scale factor for each input */
    float scale_norm;           /**< normalization factor for all inputs */
    int64_t next_pts;           /**< calculated pts for next output frame */
    FrameList *frame_list;      /**< list of frame info for the first input */
} MixContext;
 170
/* Byte offset of a MixContext field, used by the option table below. */
#define OFFSET(x) offsetof(MixContext, x)
/* Flags shared by every option below. */
#define A AV_OPT_FLAG_AUDIO_PARAM
/**
 * User-settable options of the amix filter; init() applies them from the
 * filter argument string.
 *
 * NOTE(review): the bare numeric columns are presumably default, minimum and
 * maximum, following the AVOption field order -- confirm against opt.h.
 */
static const AVOption options[] = {
    { "inputs", "Number of inputs.",
            OFFSET(nb_inputs), AV_OPT_TYPE_INT, { 2 }, 1, 32, A },
    { "duration", "How to determine the end-of-stream.",
            OFFSET(duration_mode), AV_OPT_TYPE_INT, { DURATION_LONGEST }, 0,  2, A, "duration" },
        /* named constants accepted for the "duration" option */
        { "longest",  "Duration of longest input.",  0, AV_OPT_TYPE_CONST, { DURATION_LONGEST  }, INT_MIN, INT_MAX, A, "duration" },
        { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { DURATION_SHORTEST }, INT_MIN, INT_MAX, A, "duration" },
        { "first",    "Duration of first input.",    0, AV_OPT_TYPE_CONST, { DURATION_FIRST    }, INT_MIN, INT_MAX, A, "duration" },
    { "dropout_transition", "Transition time, in seconds, for volume "
                            "renormalization when an input stream ends.",
            OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { 2.0 }, 0, INT_MAX, A },
    { NULL },   /* table terminator */
};
 186
/** AVClass that ties the option table to MixContext; installed in init(). */
static const AVClass amix_class = {
    .class_name = "amix filter",
    .item_name  = av_default_item_name,
    .option     = options,
    .version    = LIBAVUTIL_VERSION_INT,
};
 193
 194
 195 /**
 196  * Update the scaling factors to apply to each input during mixing.
 197  *
 198  * This balances the full volume range between active inputs and handles
 199  * volume transitions when EOF is encountered on an input but mixing continues
 200  * with the remaining inputs.
 201  */
 202 static void calculate_scales(MixContext *s, int nb_samples)
 203 {
 204     int i;
 205
 206     if (s->scale_norm > s->active_inputs) {
 207         s->scale_norm -= nb_samples / (s->dropout_transition * s->sample_rate);
 208         s->scale_norm = FFMAX(s->scale_norm, s->active_inputs);
 209     }
 210
 211     for (i = 0; i < s->nb_inputs; i++) {
 212         if (s->input_state[i] == INPUT_ON)
 213             s->input_scale[i] = 1.0f / s->scale_norm;
 214         else
 215             s->input_scale[i] = 0.0f;
 216     }
 217 }
 218
 219 static int config_output(AVFilterLink *outlink)
 220 {
 221     AVFilterContext *ctx = outlink->src;
 222     MixContext *s      = ctx->priv;
 223     int i;
 224     char buf[64];
 225
 226     s->sample_rate     = outlink->sample_rate;
 227     outlink->time_base = (AVRational){ 1, outlink->sample_rate };
 228     s->next_pts        = AV_NOPTS_VALUE;
 229
 230     s->frame_list = av_mallocz(sizeof(*s->frame_list));
 231     if (!s->frame_list)
 232         return AVERROR(ENOMEM);
 233
 234     s->fifos = av_mallocz(s->nb_inputs * sizeof(*s->fifos));
 235     if (!s->fifos)
 236         return AVERROR(ENOMEM);
 237
 238     s->nb_channels = av_get_channel_layout_nb_channels(outlink->channel_layout);
 239     for (i = 0; i < s->nb_inputs; i++) {
 240         s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
 241         if (!s->fifos[i])
 242             return AVERROR(ENOMEM);
 243     }
 244
 245     s->input_state = av_malloc(s->nb_inputs);
 246     if (!s->input_state)
 247         return AVERROR(ENOMEM);
 248     memset(s->input_state, INPUT_ON, s->nb_inputs);
 249     s->active_inputs = s->nb_inputs;
 250
 251     s->input_scale = av_mallocz(s->nb_inputs * sizeof(*s->input_scale));
 252     if (!s->input_scale)
 253         return AVERROR(ENOMEM);
 254     s->scale_norm = s->active_inputs;
 255     calculate_scales(s, 0);
 256
 257     av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
 258
 259     av_log(ctx, AV_LOG_VERBOSE,
 260            "inputs:%d fmt:%s srate:%"PRId64" cl:%s\n", s->nb_inputs,
 261            av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
 262
 263     return 0;
 264 }
 265
 266 /* TODO: move optimized version from DSPContext to libavutil */
 267 static void vector_fmac_scalar(float *dst, const float *src, float mul, int len)
 268 {
 269     int i;
 270     for (i = 0; i < len; i++)
 271         dst[i] += src[i] * mul;
 272 }
 273
 274 /**
 275  * Read samples from the input FIFOs, mix, and write to the output link.
 276  */
 277 static int output_frame(AVFilterLink *outlink, int nb_samples)
 278 {
 279     AVFilterContext *ctx = outlink->src;
 280     MixContext      *s = ctx->priv;
 281     AVFilterBufferRef *out_buf, *in_buf;
 282     int i;
 283
 284     calculate_scales(s, nb_samples);
 285
 286     out_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
 287     if (!out_buf)
 288         return AVERROR(ENOMEM);
 289
 290     in_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
 291     if (!in_buf)
 292         return AVERROR(ENOMEM);
 293
 294     for (i = 0; i < s->nb_inputs; i++) {
 295         if (s->input_state[i] == INPUT_ON) {
 296             av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
 297                                nb_samples);
 298             vector_fmac_scalar((float *)out_buf->extended_data[0],
 299                                (float *) in_buf->extended_data[0],
 300                                s->input_scale[i], nb_samples * s->nb_channels);
 301         }
 302     }
 303     avfilter_unref_buffer(in_buf);
 304
 305     out_buf->pts = s->next_pts;
 306     if (s->next_pts != AV_NOPTS_VALUE)
 307         s->next_pts += nb_samples;
 308
 309     ff_filter_samples(outlink, out_buf);
 310
 311     return 0;
 312 }
 313
 314 /**
 315  * Returns the smallest number of samples available in the input FIFOs other
 316  * than that of the first input.
 317  */
 318 static int get_available_samples(MixContext *s)
 319 {
 320     int i;
 321     int available_samples = INT_MAX;
 322
 323     av_assert0(s->nb_inputs > 1);
 324
 325     for (i = 1; i < s->nb_inputs; i++) {
 326         int nb_samples;
 327         if (s->input_state[i] == INPUT_OFF)
 328             continue;
 329         nb_samples = av_audio_fifo_size(s->fifos[i]);
 330         available_samples = FFMIN(available_samples, nb_samples);
 331     }
 332     if (available_samples == INT_MAX)
 333         return 0;
 334     return available_samples;
 335 }
 336
 337 /**
 338  * Requests a frame, if needed, from each input link other than the first.
 339  */
 340 static int request_samples(AVFilterContext *ctx, int min_samples)
 341 {
 342     MixContext *s = ctx->priv;
 343     int i, ret;
 344
 345     av_assert0(s->nb_inputs > 1);
 346
 347     for (i = 1; i < s->nb_inputs; i++) {
 348         ret = 0;
 349         if (s->input_state[i] == INPUT_OFF)
 350             continue;
 351         while (!ret && av_audio_fifo_size(s->fifos[i]) < min_samples)
 352             ret = avfilter_request_frame(ctx->inputs[i]);
 353         if (ret == AVERROR_EOF) {
 354             if (av_audio_fifo_size(s->fifos[i]) == 0) {
 355                 s->input_state[i] = INPUT_OFF;
 356                 continue;
 357             }
 358         } else if (ret)
 359             return ret;
 360     }
 361     return 0;
 362 }
 363
 364 /**
 365  * Calculates the number of active inputs and determines EOF based on the
 366  * duration option.
 367  *
 368  * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
 369  */
 370 static int calc_active_inputs(MixContext *s)
 371 {
 372     int i;
 373     int active_inputs = 0;
 374     for (i = 0; i < s->nb_inputs; i++)
 375         active_inputs += !!(s->input_state[i] != INPUT_OFF);
 376     s->active_inputs = active_inputs;
 377
 378     if (!active_inputs ||
 379         (s->duration_mode == DURATION_FIRST && s->input_state[0] == INPUT_OFF) ||
 380         (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
 381         return AVERROR_EOF;
 382     return 0;
 383 }
 384
/**
 * Output-pad request_frame callback: pull input as needed and emit at most
 * one mixed frame.
 *
 * While the first input is on, the size and pts of the output frame follow
 * the oldest frame queued for the first input (s->frame_list), limited to
 * what the other active inputs have available. Once the first input is off,
 * whatever the remaining inputs have in common is output and the pts simply
 * continues from s->next_pts.
 *
 * @return 0 on success (also when nothing was output because an active input
 *         has no samples queued), AVERROR_EOF when mixing is finished,
 *         AVERROR(EAGAIN) right after the first input reached EOF while other
 *         inputs exist, or another negative error code from a callee
 */
static int request_frame(AVFilterLink *outlink)
{
    AVFilterContext *ctx = outlink->src;
    MixContext      *s = ctx->priv;
    int ret;
    int wanted_samples, available_samples;

    ret = calc_active_inputs(s);
    if (ret < 0)
        return ret;

    /* First input already finished: mix only the remaining inputs. */
    if (s->input_state[0] == INPUT_OFF) {
        ret = request_samples(ctx, 1);
        if (ret < 0)
            return ret;

        /* request_samples() may have switched inputs off; re-evaluate EOF */
        ret = calc_active_inputs(s);
        if (ret < 0)
            return ret;

        available_samples = get_available_samples(s);
        if (!available_samples)
            return 0;

        return output_frame(outlink, available_samples);
    }

    /* Make sure at least one frame of the first input is queued. */
    if (s->frame_list->nb_frames == 0) {
        ret = avfilter_request_frame(ctx->inputs[0]);
        if (ret == AVERROR_EOF) {
            s->input_state[0] = INPUT_OFF;
            if (s->nb_inputs == 1)
                return AVERROR_EOF;
            else
                return AVERROR(EAGAIN);
        } else if (ret)
            return ret;
    }
    av_assert0(s->frame_list->nb_frames > 0);

    /* The first input's oldest frame dictates how many samples we want. */
    wanted_samples = frame_list_next_frame_size(s->frame_list);

    if (s->active_inputs > 1) {
        ret = request_samples(ctx, wanted_samples);
        if (ret < 0)
            return ret;

        /* request_samples() may have switched inputs off; re-evaluate EOF */
        ret = calc_active_inputs(s);
        if (ret < 0)
            return ret;

        available_samples = get_available_samples(s);
        if (!available_samples)
            return 0;
        available_samples = FFMIN(available_samples, wanted_samples);
    } else {
        /* only the first input is left: pass its frame size through */
        available_samples = wanted_samples;
    }

    /* Take the pts from the first input, then consume the samples we mix. */
    s->next_pts = frame_list_next_pts(s->frame_list);
    frame_list_remove_samples(s->frame_list, available_samples);

    return output_frame(outlink, available_samples);
}
 449
 450 static void filter_samples(AVFilterLink *inlink, AVFilterBufferRef *buf)
 451 {
 452     AVFilterContext  *ctx = inlink->dst;
 453     MixContext       *s = ctx->priv;
 454     AVFilterLink *outlink = ctx->outputs[0];
 455     int i;
 456
 457     for (i = 0; i < ctx->input_count; i++)
 458         if (ctx->inputs[i] == inlink)
 459             break;
 460     if (i >= ctx->input_count) {
 461         av_log(ctx, AV_LOG_ERROR, "unknown input link\n");
 462         return;
 463     }
 464
 465     if (i == 0) {
 466         int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
 467                                    outlink->time_base);
 468         frame_list_add_frame(s->frame_list, buf->audio->nb_samples, pts);
 469     }
 470
 471     av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
 472                         buf->audio->nb_samples);
 473
 474     avfilter_unref_buffer(buf);
 475 }
 476
 477 static int init(AVFilterContext *ctx, const char *args, void *opaque)
 478 {
 479     MixContext *s = ctx->priv;
 480     int i, ret;
 481
 482     s->class = &amix_class;
 483     av_opt_set_defaults(s);
 484
 485     if ((ret = av_set_options_string(s, args, "=", ":")) < 0) {
 486         av_log(ctx, AV_LOG_ERROR, "Error parsing options string '%s'.\n", args);
 487         return ret;
 488     }
 489     av_opt_free(s);
 490
 491     for (i = 0; i < s->nb_inputs; i++) {
 492         char name[32];
 493         AVFilterPad pad = { 0 };
 494
 495         snprintf(name, sizeof(name), "input%d", i);
 496         pad.type           = AVMEDIA_TYPE_AUDIO;
 497         pad.name           = av_strdup(name);
 498         pad.filter_samples = filter_samples;
 499
 500         avfilter_insert_inpad(ctx, i, &pad);
 501     }
 502
 503     return 0;
 504 }
 505
 506 static void uninit(AVFilterContext *ctx)
 507 {
 508     int i;
 509     MixContext *s = ctx->priv;
 510
 511     if (s->fifos) {
 512         for (i = 0; i < s->nb_inputs; i++)
 513             av_audio_fifo_free(s->fifos[i]);
 514         av_freep(&s->fifos);
 515     }
 516     frame_list_clear(s->frame_list);
 517     av_freep(&s->frame_list);
 518     av_freep(&s->input_state);
 519     av_freep(&s->input_scale);
 520
 521     for (i = 0; i < ctx->input_count; i++)
 522         av_freep(&ctx->input_pads[i].name);
 523 }
 524
/**
 * Filter query_formats callback: restrict the sample format to
 * AV_SAMPLE_FMT_FLT and allow every channel layout and sample rate.
 *
 * The results of the helper calls are not checked.
 *
 * @return always 0
 */
static int query_formats(AVFilterContext *ctx)
{
    AVFilterFormats *formats = NULL;
    /* the mixing code operates on float samples only */
    ff_add_format(&formats, AV_SAMPLE_FMT_FLT);
    ff_set_common_formats(ctx, formats);
    ff_set_common_channel_layouts(ctx, ff_all_channel_layouts());
    ff_set_common_samplerates(ctx, ff_all_samplerates());
    return 0;
}
 534
/**
 * Definition of the "amix" filter.
 *
 * The static input pad list is empty because the input pads are created in
 * init() according to the "inputs" option. There is a single audio output
 * pad named "default".
 */
AVFilter avfilter_af_amix = {
    .name          = "amix",
    .description   = NULL_IF_CONFIG_SMALL("Audio mixing."),
    .priv_size     = sizeof(MixContext),

    .init           = init,
    .uninit         = uninit,
    .query_formats  = query_formats,

    /* input pads are inserted dynamically in init() */
    .inputs    = (const AVFilterPad[]) {{ .name = NULL}},
    .outputs   = (const AVFilterPad[]) {{ .name          = "default",
                                          .type          = AVMEDIA_TYPE_AUDIO,
                                          .config_props  = config_output,
                                          .request_frame = request_frame },
                                        { .name = NULL}},
};