git.sesse.net Git - ffmpeg/blob - libavfilter/af_amix.c

   1 /*
   2  * Audio Mix Filter
   3  * Copyright (c) 2012 Justin Ruggles <justin.ruggles@gmail.com>
   4  *
   5  * This file is part of Libav.
   6  *
   7  * Libav is free software; you can redistribute it and/or
   8  * modify it under the terms of the GNU Lesser General Public
   9  * License as published by the Free Software Foundation; either
  10  * version 2.1 of the License, or (at your option) any later version.
  11  *
  12  * Libav is distributed in the hope that it will be useful,
  13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  15  * Lesser General Public License for more details.
  16  *
  17  * You should have received a copy of the GNU Lesser General Public
  18  * License along with Libav; if not, write to the Free Software
  19  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  20  */
  21
  22 /**
  23  * @file
  24  * Audio Mix Filter
  25  *
  26  * Mixes audio from multiple sources into a single output. The channel layout,
  27  * sample rate, and sample format will be the same for all inputs and the
  28  * output.
  29  */
  30
  31 #include "libavutil/audioconvert.h"
  32 #include "libavutil/audio_fifo.h"
  33 #include "libavutil/avassert.h"
  34 #include "libavutil/avstring.h"
  35 #include "libavutil/float_dsp.h"
  36 #include "libavutil/mathematics.h"
  37 #include "libavutil/opt.h"
  38 #include "libavutil/samplefmt.h"
  39
  40 #include "audio.h"
  41 #include "avfilter.h"
  42 #include "formats.h"
  43 #include "internal.h"
  44
  45 #define INPUT_OFF      0    /**< input has reached EOF */
  46 #define INPUT_ON       1    /**< input is active */
  47 #define INPUT_INACTIVE 2    /**< input is on, but is currently inactive */
  48
  49 #define DURATION_LONGEST  0
  50 #define DURATION_SHORTEST 1
  51 #define DURATION_FIRST    2
  52
  53
  54 typedef struct FrameInfo {
  55     int nb_samples;
  56     int64_t pts;
  57     struct FrameInfo *next;
  58 } FrameInfo;
  59
  60 /**
  61  * Linked list used to store timestamps and frame sizes of all frames in the
  62  * FIFO for the first input.
  63  *
  64  * This is needed to keep timestamps synchronized for the case where multiple
  65  * input frames are pushed to the filter for processing before a frame is
  66  * requested by the output link.
  67  */
  68 typedef struct FrameList {
  69     int nb_frames;
  70     int nb_samples;
  71     FrameInfo *list;
  72     FrameInfo *end;
  73 } FrameList;
  74
  75 static void frame_list_clear(FrameList *frame_list)
  76 {
  77     if (frame_list) {
  78         while (frame_list->list) {
  79             FrameInfo *info = frame_list->list;
  80             frame_list->list = info->next;
  81             av_free(info);
  82         }
  83         frame_list->nb_frames  = 0;
  84         frame_list->nb_samples = 0;
  85         frame_list->end        = NULL;
  86     }
  87 }
  88
  89 static int frame_list_next_frame_size(FrameList *frame_list)
  90 {
  91     if (!frame_list->list)
  92         return 0;
  93     return frame_list->list->nb_samples;
  94 }
  95
  96 static int64_t frame_list_next_pts(FrameList *frame_list)
  97 {
  98     if (!frame_list->list)
  99         return AV_NOPTS_VALUE;
 100     return frame_list->list->pts;
 101 }
 102
 103 static void frame_list_remove_samples(FrameList *frame_list, int nb_samples)
 104 {
 105     if (nb_samples >= frame_list->nb_samples) {
 106         frame_list_clear(frame_list);
 107     } else {
 108         int samples = nb_samples;
 109         while (samples > 0) {
 110             FrameInfo *info = frame_list->list;
 111             av_assert0(info != NULL);
 112             if (info->nb_samples <= samples) {
 113                 samples -= info->nb_samples;
 114                 frame_list->list = info->next;
 115                 if (!frame_list->list)
 116                     frame_list->end = NULL;
 117                 frame_list->nb_frames--;
 118                 frame_list->nb_samples -= info->nb_samples;
 119                 av_free(info);
 120             } else {
 121                 info->nb_samples       -= samples;
 122                 info->pts              += samples;
 123                 frame_list->nb_samples -= samples;
 124                 samples = 0;
 125             }
 126         }
 127     }
 128 }
 129
 130 static int frame_list_add_frame(FrameList *frame_list, int nb_samples, int64_t pts)
 131 {
 132     FrameInfo *info = av_malloc(sizeof(*info));
 133     if (!info)
 134         return AVERROR(ENOMEM);
 135     info->nb_samples = nb_samples;
 136     info->pts        = pts;
 137     info->next       = NULL;
 138
 139     if (!frame_list->list) {
 140         frame_list->list = info;
 141         frame_list->end  = info;
 142     } else {
 143         av_assert0(frame_list->end != NULL);
 144         frame_list->end->next = info;
 145         frame_list->end       = info;
 146     }
 147     frame_list->nb_frames++;
 148     frame_list->nb_samples += nb_samples;
 149
 150     return 0;
 151 }
 152
 153
 154 typedef struct MixContext {
 155     const AVClass *class;       /**< class for AVOptions */
 156     AVFloatDSPContext fdsp;
 157
 158     int nb_inputs;              /**< number of inputs */
 159     int active_inputs;          /**< number of input currently active */
 160     int duration_mode;          /**< mode for determining duration */
 161     float dropout_transition;   /**< transition time when an input drops out */
 162
 163     int nb_channels;            /**< number of channels */
 164     int sample_rate;            /**< sample rate */
 165     int planar;
 166     AVAudioFifo **fifos;        /**< audio fifo for each input */
 167     uint8_t *input_state;       /**< current state of each input */
 168     float *input_scale;         /**< mixing scale factor for each input */
 169     float scale_norm;           /**< normalization factor for all inputs */
 170     int64_t next_pts;           /**< calculated pts for next output frame */
 171     FrameList *frame_list;      /**< list of frame info for the first input */
 172 } MixContext;
 173
 174 #define OFFSET(x) offsetof(MixContext, x)
 175 #define A AV_OPT_FLAG_AUDIO_PARAM
 176 static const AVOption amix_options[] = {
 177     { "inputs", "Number of inputs.",
 178             OFFSET(nb_inputs), AV_OPT_TYPE_INT, { 2 }, 1, 32, A },
 179     { "duration", "How to determine the end-of-stream.",
 180             OFFSET(duration_mode), AV_OPT_TYPE_INT, { DURATION_LONGEST }, 0,  2, A, "duration" },
 181         { "longest",  "Duration of longest input.",  0, AV_OPT_TYPE_CONST, { DURATION_LONGEST  }, INT_MIN, INT_MAX, A, "duration" },
 182         { "shortest", "Duration of shortest input.", 0, AV_OPT_TYPE_CONST, { DURATION_SHORTEST }, INT_MIN, INT_MAX, A, "duration" },
 183         { "first",    "Duration of first input.",    0, AV_OPT_TYPE_CONST, { DURATION_FIRST    }, INT_MIN, INT_MAX, A, "duration" },
 184     { "dropout_transition", "Transition time, in seconds, for volume "
 185                             "renormalization when an input stream ends.",
 186             OFFSET(dropout_transition), AV_OPT_TYPE_FLOAT, { 2.0 }, 0, INT_MAX, A },
 187     { NULL },
 188 };
 189
 190 AVFILTER_DEFINE_CLASS(amix);
 191
 192 /**
 193  * Update the scaling factors to apply to each input during mixing.
 194  *
 195  * This balances the full volume range between active inputs and handles
 196  * volume transitions when EOF is encountered on an input but mixing continues
 197  * with the remaining inputs.
 198  */
 199 static void calculate_scales(MixContext *s, int nb_samples)
 200 {
 201     int i;
 202
 203     if (s->scale_norm > s->active_inputs) {
 204         s->scale_norm -= nb_samples / (s->dropout_transition * s->sample_rate);
 205         s->scale_norm = FFMAX(s->scale_norm, s->active_inputs);
 206     }
 207
 208     for (i = 0; i < s->nb_inputs; i++) {
 209         if (s->input_state[i] == INPUT_ON)
 210             s->input_scale[i] = 1.0f / s->scale_norm;
 211         else
 212             s->input_scale[i] = 0.0f;
 213     }
 214 }
 215
 216 static int config_output(AVFilterLink *outlink)
 217 {
 218     AVFilterContext *ctx = outlink->src;
 219     MixContext *s      = ctx->priv;
 220     int i;
 221     char buf[64];
 222
 223     s->planar          = av_sample_fmt_is_planar(outlink->format);
 224     s->sample_rate     = outlink->sample_rate;
 225     outlink->time_base = (AVRational){ 1, outlink->sample_rate };
 226     s->next_pts        = AV_NOPTS_VALUE;
 227
 228     s->frame_list = av_mallocz(sizeof(*s->frame_list));
 229     if (!s->frame_list)
 230         return AVERROR(ENOMEM);
 231
 232     s->fifos = av_mallocz(s->nb_inputs * sizeof(*s->fifos));
 233     if (!s->fifos)
 234         return AVERROR(ENOMEM);
 235
 236     s->nb_channels = av_get_channel_layout_nb_channels(outlink->channel_layout);
 237     for (i = 0; i < s->nb_inputs; i++) {
 238         s->fifos[i] = av_audio_fifo_alloc(outlink->format, s->nb_channels, 1024);
 239         if (!s->fifos[i])
 240             return AVERROR(ENOMEM);
 241     }
 242
 243     s->input_state = av_malloc(s->nb_inputs);
 244     if (!s->input_state)
 245         return AVERROR(ENOMEM);
 246     memset(s->input_state, INPUT_ON, s->nb_inputs);
 247     s->active_inputs = s->nb_inputs;
 248
 249     s->input_scale = av_mallocz(s->nb_inputs * sizeof(*s->input_scale));
 250     if (!s->input_scale)
 251         return AVERROR(ENOMEM);
 252     s->scale_norm = s->active_inputs;
 253     calculate_scales(s, 0);
 254
 255     av_get_channel_layout_string(buf, sizeof(buf), -1, outlink->channel_layout);
 256
 257     av_log(ctx, AV_LOG_VERBOSE,
 258            "inputs:%d fmt:%s srate:%d cl:%s\n", s->nb_inputs,
 259            av_get_sample_fmt_name(outlink->format), outlink->sample_rate, buf);
 260
 261     return 0;
 262 }
 263
 264 /**
 265  * Read samples from the input FIFOs, mix, and write to the output link.
 266  */
 267 static int output_frame(AVFilterLink *outlink, int nb_samples)
 268 {
 269     AVFilterContext *ctx = outlink->src;
 270     MixContext      *s = ctx->priv;
 271     AVFilterBufferRef *out_buf, *in_buf;
 272     int i;
 273
 274     calculate_scales(s, nb_samples);
 275
 276     out_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
 277     if (!out_buf)
 278         return AVERROR(ENOMEM);
 279
 280     in_buf = ff_get_audio_buffer(outlink, AV_PERM_WRITE, nb_samples);
 281     if (!in_buf)
 282         return AVERROR(ENOMEM);
 283
 284     for (i = 0; i < s->nb_inputs; i++) {
 285         if (s->input_state[i] == INPUT_ON) {
 286             int planes, plane_size, p;
 287
 288             av_audio_fifo_read(s->fifos[i], (void **)in_buf->extended_data,
 289                                nb_samples);
 290
 291             planes     = s->planar ? s->nb_channels : 1;
 292             plane_size = nb_samples * (s->planar ? 1 : s->nb_channels);
 293             plane_size = FFALIGN(plane_size, 16);
 294
 295             for (p = 0; p < planes; p++) {
 296                 s->fdsp.vector_fmac_scalar((float *)out_buf->extended_data[p],
 297                                            (float *) in_buf->extended_data[p],
 298                                            s->input_scale[i], plane_size);
 299             }
 300         }
 301     }
 302     avfilter_unref_buffer(in_buf);
 303
 304     out_buf->pts = s->next_pts;
 305     if (s->next_pts != AV_NOPTS_VALUE)
 306         s->next_pts += nb_samples;
 307
 308     return ff_filter_samples(outlink, out_buf);
 309 }
 310
 311 /**
 312  * Returns the smallest number of samples available in the input FIFOs other
 313  * than that of the first input.
 314  */
 315 static int get_available_samples(MixContext *s)
 316 {
 317     int i;
 318     int available_samples = INT_MAX;
 319
 320     av_assert0(s->nb_inputs > 1);
 321
 322     for (i = 1; i < s->nb_inputs; i++) {
 323         int nb_samples;
 324         if (s->input_state[i] == INPUT_OFF)
 325             continue;
 326         nb_samples = av_audio_fifo_size(s->fifos[i]);
 327         available_samples = FFMIN(available_samples, nb_samples);
 328     }
 329     if (available_samples == INT_MAX)
 330         return 0;
 331     return available_samples;
 332 }
 333
 334 /**
 335  * Requests a frame, if needed, from each input link other than the first.
 336  */
 337 static int request_samples(AVFilterContext *ctx, int min_samples)
 338 {
 339     MixContext *s = ctx->priv;
 340     int i, ret;
 341
 342     av_assert0(s->nb_inputs > 1);
 343
 344     for (i = 1; i < s->nb_inputs; i++) {
 345         ret = 0;
 346         if (s->input_state[i] == INPUT_OFF)
 347             continue;
 348         while (!ret && av_audio_fifo_size(s->fifos[i]) < min_samples)
 349             ret = ff_request_frame(ctx->inputs[i]);
 350         if (ret == AVERROR_EOF) {
 351             if (av_audio_fifo_size(s->fifos[i]) == 0) {
 352                 s->input_state[i] = INPUT_OFF;
 353                 continue;
 354             }
 355         } else if (ret < 0)
 356             return ret;
 357     }
 358     return 0;
 359 }
 360
 361 /**
 362  * Calculates the number of active inputs and determines EOF based on the
 363  * duration option.
 364  *
 365  * @return 0 if mixing should continue, or AVERROR_EOF if mixing should stop.
 366  */
 367 static int calc_active_inputs(MixContext *s)
 368 {
 369     int i;
 370     int active_inputs = 0;
 371     for (i = 0; i < s->nb_inputs; i++)
 372         active_inputs += !!(s->input_state[i] != INPUT_OFF);
 373     s->active_inputs = active_inputs;
 374
 375     if (!active_inputs ||
 376         (s->duration_mode == DURATION_FIRST && s->input_state[0] == INPUT_OFF) ||
 377         (s->duration_mode == DURATION_SHORTEST && active_inputs != s->nb_inputs))
 378         return AVERROR_EOF;
 379     return 0;
 380 }
 381
 382 static int request_frame(AVFilterLink *outlink)
 383 {
 384     AVFilterContext *ctx = outlink->src;
 385     MixContext      *s = ctx->priv;
 386     int ret;
 387     int wanted_samples, available_samples;
 388
 389     ret = calc_active_inputs(s);
 390     if (ret < 0)
 391         return ret;
 392
 393     if (s->input_state[0] == INPUT_OFF) {
 394         ret = request_samples(ctx, 1);
 395         if (ret < 0)
 396             return ret;
 397
 398         ret = calc_active_inputs(s);
 399         if (ret < 0)
 400             return ret;
 401
 402         available_samples = get_available_samples(s);
 403         if (!available_samples)
 404             return AVERROR(EAGAIN);
 405
 406         return output_frame(outlink, available_samples);
 407     }
 408
 409     if (s->frame_list->nb_frames == 0) {
 410         ret = ff_request_frame(ctx->inputs[0]);
 411         if (ret == AVERROR_EOF) {
 412             s->input_state[0] = INPUT_OFF;
 413             if (s->nb_inputs == 1)
 414                 return AVERROR_EOF;
 415             else
 416                 return AVERROR(EAGAIN);
 417         } else if (ret < 0)
 418             return ret;
 419     }
 420     av_assert0(s->frame_list->nb_frames > 0);
 421
 422     wanted_samples = frame_list_next_frame_size(s->frame_list);
 423
 424     if (s->active_inputs > 1) {
 425         ret = request_samples(ctx, wanted_samples);
 426         if (ret < 0)
 427             return ret;
 428
 429         ret = calc_active_inputs(s);
 430         if (ret < 0)
 431             return ret;
 432     }
 433
 434     if (s->active_inputs > 1) {
 435         available_samples = get_available_samples(s);
 436         if (!available_samples)
 437             return AVERROR(EAGAIN);
 438         available_samples = FFMIN(available_samples, wanted_samples);
 439     } else {
 440         available_samples = wanted_samples;
 441     }
 442
 443     s->next_pts = frame_list_next_pts(s->frame_list);
 444     frame_list_remove_samples(s->frame_list, available_samples);
 445
 446     return output_frame(outlink, available_samples);
 447 }
 448
 449 static int filter_samples(AVFilterLink *inlink, AVFilterBufferRef *buf)
 450 {
 451     AVFilterContext  *ctx = inlink->dst;
 452     MixContext       *s = ctx->priv;
 453     AVFilterLink *outlink = ctx->outputs[0];
 454     int i, ret = 0;
 455
 456     for (i = 0; i < ctx->nb_inputs; i++)
 457         if (ctx->inputs[i] == inlink)
 458             break;
 459     if (i >= ctx->nb_inputs) {
 460         av_log(ctx, AV_LOG_ERROR, "unknown input link\n");
 461         ret = AVERROR(EINVAL);
 462         goto fail;
 463     }
 464
 465     if (i == 0) {
 466         int64_t pts = av_rescale_q(buf->pts, inlink->time_base,
 467                                    outlink->time_base);
 468         ret = frame_list_add_frame(s->frame_list, buf->audio->nb_samples, pts);
 469         if (ret < 0)
 470             goto fail;
 471     }
 472
 473     ret = av_audio_fifo_write(s->fifos[i], (void **)buf->extended_data,
 474                               buf->audio->nb_samples);
 475
 476 fail:
 477     avfilter_unref_buffer(buf);
 478
 479     return ret;
 480 }
 481
 482 static int init(AVFilterContext *ctx, const char *args)
 483 {
 484     MixContext *s = ctx->priv;
 485     int i, ret;
 486
 487     s->class = &amix_class;
 488     av_opt_set_defaults(s);
 489
 490     if ((ret = av_set_options_string(s, args, "=", ":")) < 0)
 491         return ret;
 492     av_opt_free(s);
 493
 494     for (i = 0; i < s->nb_inputs; i++) {
 495         char name[32];
 496         AVFilterPad pad = { 0 };
 497
 498         snprintf(name, sizeof(name), "input%d", i);
 499         pad.type           = AVMEDIA_TYPE_AUDIO;
 500         pad.name           = av_strdup(name);
 501         pad.filter_samples = filter_samples;
 502
 503         ff_insert_inpad(ctx, i, &pad);
 504     }
 505
 506     avpriv_float_dsp_init(&s->fdsp, 0);
 507
 508     return 0;
 509 }
 510
 511 static void uninit(AVFilterContext *ctx)
 512 {
 513     int i;
 514     MixContext *s = ctx->priv;
 515
 516     if (s->fifos) {
 517         for (i = 0; i < s->nb_inputs; i++)
 518             av_audio_fifo_free(s->fifos[i]);
 519         av_freep(&s->fifos);
 520     }
 521     frame_list_clear(s->frame_list);
 522     av_freep(&s->frame_list);
 523     av_freep(&s->input_state);
 524     av_freep(&s->input_scale);
 525
 526     for (i = 0; i < ctx->nb_inputs; i++)
 527         av_freep(&ctx->input_pads[i].name);
 528 }
 529
 530 static int query_formats(AVFilterContext *ctx)
 531 {
 532     AVFilterFormats *formats = NULL;
 533     ff_add_format(&formats, AV_SAMPLE_FMT_FLT);
 534     ff_add_format(&formats, AV_SAMPLE_FMT_FLTP);
 535     ff_set_common_formats(ctx, formats);
 536     ff_set_common_channel_layouts(ctx, ff_all_channel_layouts());
 537     ff_set_common_samplerates(ctx, ff_all_samplerates());
 538     return 0;
 539 }
 540
 541 AVFilter avfilter_af_amix = {
 542     .name          = "amix",
 543     .description   = NULL_IF_CONFIG_SMALL("Audio mixing."),
 544     .priv_size     = sizeof(MixContext),
 545
 546     .init           = init,
 547     .uninit         = uninit,
 548     .query_formats  = query_formats,
 549
 550     .inputs    = (const AVFilterPad[]) {{ .name = NULL}},
 551     .outputs   = (const AVFilterPad[]) {{ .name          = "default",
 552                                           .type          = AVMEDIA_TYPE_AUDIO,
 553                                           .config_props  = config_output,
 554                                           .request_frame = request_frame },
 555                                         { .name = NULL}},
 556 };