git.sesse.net Git - ffmpeg/blob - libavfilter/af_silenceremove.c
avfilter/af_silenceremove: add options to keep min duration of silence
[ffmpeg] / libavfilter / af_silenceremove.c
1 /*
2  * Copyright (c) 2001 Heikki Leinonen
3  * Copyright (c) 2001 Chris Bagwell
4  * Copyright (c) 2003 Donnie Smith
5  * Copyright (c) 2014 Paul B Mahol
6  *
7  * This file is part of FFmpeg.
8  *
9  * FFmpeg is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU Lesser General Public
11  * License as published by the Free Software Foundation; either
12  * version 2.1 of the License, or (at your option) any later version.
13  *
14  * FFmpeg is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
17  * Lesser General Public License for more details.
18  *
19  * You should have received a copy of the GNU Lesser General Public
20  * License along with FFmpeg; if not, write to the Free Software
21  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
22  */
23
24 #include <float.h> /* DBL_MAX */
25
26 #include "libavutil/opt.h"
27 #include "libavutil/timestamp.h"
28 #include "audio.h"
29 #include "formats.h"
30 #include "avfilter.h"
31 #include "internal.h"
32
/* States of the filter_frame() state machine. */
enum SilenceMode {
    SILENCE_TRIM       = 0, /* discarding leading silence, holding candidate audio */
    SILENCE_TRIM_FLUSH = 1, /* emitting the audio (and kept silence) held while trimming */
    SILENCE_COPY       = 2, /* copying audio to the output, watching for silence */
    SILENCE_COPY_FLUSH = 3, /* emitting the samples held in the stop holdoff buffer */
    SILENCE_STOP       = 4  /* finished: incoming frames are dropped */
};
40
41 typedef struct SilenceRemoveContext {
42     const AVClass *class;
43
44     enum SilenceMode mode;
45
46     int start_periods;
47     int64_t start_duration;
48     int64_t start_duration_opt;
49     double start_threshold;
50     int64_t start_silence;
51     int64_t start_silence_opt;
52
53     int stop_periods;
54     int64_t stop_duration;
55     int64_t stop_duration_opt;
56     double stop_threshold;
57     int64_t stop_silence;
58     int64_t stop_silence_opt;
59
60     double *start_holdoff;
61     double *start_silence_hold;
62     size_t start_holdoff_offset;
63     size_t start_holdoff_end;
64     size_t start_silence_offset;
65     size_t start_silence_end;
66     int    start_found_periods;
67
68     double *stop_holdoff;
69     double *stop_silence_hold;
70     size_t stop_holdoff_offset;
71     size_t stop_holdoff_end;
72     size_t stop_silence_offset;
73     size_t stop_silence_end;
74     int    stop_found_periods;
75
76     double window_ratio;
77     double *window;
78     double *window_current;
79     double *window_end;
80     int window_size;
81     double sum;
82
83     int restart;
84     int64_t next_pts;
85
86     int detection;
87     void (*update)(struct SilenceRemoveContext *s, double sample);
88     double(*compute)(struct SilenceRemoveContext *s, double sample);
89 } SilenceRemoveContext;
90
91 #define OFFSET(x) offsetof(SilenceRemoveContext, x)
92 #define AF AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM
93
94 static const AVOption silenceremove_options[] = {
95     { "start_periods",   NULL, OFFSET(start_periods),       AV_OPT_TYPE_INT,      {.i64=0},     0,      9000, AF },
96     { "start_duration",  NULL, OFFSET(start_duration_opt),  AV_OPT_TYPE_DURATION, {.i64=0},     0, INT32_MAX, AF },
97     { "start_threshold", NULL, OFFSET(start_threshold),     AV_OPT_TYPE_DOUBLE,   {.dbl=0},     0,   DBL_MAX, AF },
98     { "start_silence",   NULL, OFFSET(start_silence_opt),   AV_OPT_TYPE_DURATION, {.i64=0},     0, INT32_MAX, AF },
99     { "stop_periods",    NULL, OFFSET(stop_periods),        AV_OPT_TYPE_INT,      {.i64=0}, -9000,      9000, AF },
100     { "stop_duration",   NULL, OFFSET(stop_duration_opt),   AV_OPT_TYPE_DURATION, {.i64=0},     0, INT32_MAX, AF },
101     { "stop_threshold",  NULL, OFFSET(stop_threshold),      AV_OPT_TYPE_DOUBLE,   {.dbl=0},     0,   DBL_MAX, AF },
102     { "stop_silence",    NULL, OFFSET(stop_silence_opt),    AV_OPT_TYPE_DURATION, {.i64=0},     0, INT32_MAX, AF },
103     { "detection",       NULL, OFFSET(detection),           AV_OPT_TYPE_INT,      {.i64=1},     0,         1, AF, "detection" },
104     {   "peak",          0,    0,                           AV_OPT_TYPE_CONST,    {.i64=0},     0,         0, AF, "detection" },
105     {   "rms",           0,    0,                           AV_OPT_TYPE_CONST,    {.i64=1},     0,         0, AF, "detection" },
106     { "window",          NULL, OFFSET(window_ratio),        AV_OPT_TYPE_DOUBLE,   {.dbl=0.02},  0,        10, AF },
107     { NULL }
108 };
109
110 AVFILTER_DEFINE_CLASS(silenceremove);
111
112 static double compute_peak(SilenceRemoveContext *s, double sample)
113 {
114     double new_sum;
115
116     new_sum  = s->sum;
117     new_sum -= *s->window_current;
118     new_sum += fabs(sample);
119
120     return new_sum / s->window_size;
121 }
122
123 static void update_peak(SilenceRemoveContext *s, double sample)
124 {
125     s->sum -= *s->window_current;
126     *s->window_current = fabs(sample);
127     s->sum += *s->window_current;
128
129     s->window_current++;
130     if (s->window_current >= s->window_end)
131         s->window_current = s->window;
132 }
133
134 static double compute_rms(SilenceRemoveContext *s, double sample)
135 {
136     double new_sum;
137
138     new_sum  = s->sum;
139     new_sum -= *s->window_current;
140     new_sum += sample * sample;
141
142     return sqrt(new_sum / s->window_size);
143 }
144
145 static void update_rms(SilenceRemoveContext *s, double sample)
146 {
147     s->sum -= *s->window_current;
148     *s->window_current = sample * sample;
149     s->sum += *s->window_current;
150
151     s->window_current++;
152     if (s->window_current >= s->window_end)
153         s->window_current = s->window;
154 }
155
156 static av_cold int init(AVFilterContext *ctx)
157 {
158     SilenceRemoveContext *s = ctx->priv;
159
160     if (s->stop_periods < 0) {
161         s->stop_periods = -s->stop_periods;
162         s->restart = 1;
163     }
164
165     switch (s->detection) {
166     case 0:
167         s->update = update_peak;
168         s->compute = compute_peak;
169         break;
170     case 1:
171         s->update = update_rms;
172         s->compute = compute_rms;
173         break;
174     }
175
176     return 0;
177 }
178
179 static void clear_window(SilenceRemoveContext *s)
180 {
181     memset(s->window, 0, s->window_size * sizeof(*s->window));
182
183     s->window_current = s->window;
184     s->window_end = s->window + s->window_size;
185     s->sum = 0;
186 }
187
188 static int config_input(AVFilterLink *inlink)
189 {
190     AVFilterContext *ctx = inlink->dst;
191     SilenceRemoveContext *s = ctx->priv;
192
193     s->window_size = FFMAX((inlink->sample_rate * s->window_ratio), 1) * inlink->channels;
194     s->window = av_malloc_array(s->window_size, sizeof(*s->window));
195     if (!s->window)
196         return AVERROR(ENOMEM);
197
198     clear_window(s);
199
200     s->start_duration = av_rescale(s->start_duration_opt, inlink->sample_rate,
201                                    AV_TIME_BASE);
202     s->start_silence  = av_rescale(s->start_silence_opt, inlink->sample_rate,
203                                    AV_TIME_BASE);
204     s->stop_duration  = av_rescale(s->stop_duration_opt, inlink->sample_rate,
205                                    AV_TIME_BASE);
206     s->stop_silence   = av_rescale(s->stop_silence_opt, inlink->sample_rate,
207                                    AV_TIME_BASE);
208
209     s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1),
210                                        sizeof(*s->start_holdoff) *
211                                        inlink->channels);
212     if (!s->start_holdoff)
213         return AVERROR(ENOMEM);
214
215     s->start_silence_hold = av_malloc_array(FFMAX(s->start_silence, 1),
216                                             sizeof(*s->start_silence_hold) *
217                                             inlink->channels);
218     if (!s->start_silence_hold)
219         return AVERROR(ENOMEM);
220
221     s->start_holdoff_offset = 0;
222     s->start_holdoff_end    = 0;
223     s->start_found_periods  = 0;
224
225     s->stop_holdoff = av_malloc_array(FFMAX(s->stop_duration, 1),
226                                       sizeof(*s->stop_holdoff) *
227                                       inlink->channels);
228     if (!s->stop_holdoff)
229         return AVERROR(ENOMEM);
230
231     s->stop_silence_hold = av_malloc_array(FFMAX(s->stop_silence, 1),
232                                            sizeof(*s->stop_silence_hold) *
233                                            inlink->channels);
234     if (!s->stop_silence_hold)
235         return AVERROR(ENOMEM);
236
237     s->stop_holdoff_offset = 0;
238     s->stop_holdoff_end    = 0;
239     s->stop_found_periods  = 0;
240
241     if (s->start_periods)
242         s->mode = SILENCE_TRIM;
243     else
244         s->mode = SILENCE_COPY;
245
246     return 0;
247 }
248
249 static void flush(SilenceRemoveContext *s,
250                   AVFrame *out, AVFilterLink *outlink,
251                   int *nb_samples_written, int *ret, int flush_silence)
252 {
253     AVFrame *silence;
254
255     if (*nb_samples_written) {
256         out->nb_samples = *nb_samples_written / outlink->channels;
257
258         out->pts = s->next_pts;
259         s->next_pts += av_rescale_q(out->nb_samples,
260                                     (AVRational){1, outlink->sample_rate},
261                                     outlink->time_base);
262
263         *ret = ff_filter_frame(outlink, out);
264         if (*ret < 0)
265             return;
266         *nb_samples_written = 0;
267     } else {
268         av_frame_free(&out);
269     }
270
271     if (s->stop_silence_end <= 0 || !flush_silence)
272         return;
273
274     silence = ff_get_audio_buffer(outlink, s->stop_silence_end / outlink->channels);
275     if (!silence) {
276         *ret = AVERROR(ENOMEM);
277         return;
278     }
279
280     if (s->stop_silence_offset < s->stop_silence_end) {
281         memcpy(silence->data[0],
282                &s->stop_silence_hold[s->stop_silence_offset],
283                (s->stop_silence_end - s->stop_silence_offset) * sizeof(double));
284     }
285
286     if (s->stop_silence_offset > 0) {
287         memcpy(silence->data[0] + (s->stop_silence_end - s->stop_silence_offset) * sizeof(double),
288                &s->stop_silence_hold[0],
289                s->stop_silence_offset * sizeof(double));
290     }
291
292     s->stop_silence_offset = 0;
293     s->stop_silence_end = 0;
294
295     silence->pts = s->next_pts;
296     s->next_pts += av_rescale_q(silence->nb_samples,
297                                 (AVRational){1, outlink->sample_rate},
298                                 outlink->time_base);
299
300     *ret = ff_filter_frame(outlink, silence);
301 }
302
303 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
304 {
305     AVFilterContext *ctx = inlink->dst;
306     AVFilterLink *outlink = ctx->outputs[0];
307     SilenceRemoveContext *s = ctx->priv;
308     int i, j, threshold, ret = 0;
309     int nbs, nb_samples_read, nb_samples_written;
310     double *obuf, *ibuf = (double *)in->data[0];
311     AVFrame *out;
312
313     nb_samples_read = nb_samples_written = 0;
314
315     switch (s->mode) {
316     case SILENCE_TRIM:
317 silence_trim:
318         nbs = in->nb_samples - nb_samples_read / inlink->channels;
319         if (!nbs)
320             break;
321
322         for (i = 0; i < nbs; i++) {
323             threshold = 0;
324             for (j = 0; j < inlink->channels; j++) {
325                 threshold |= s->compute(s, ibuf[j]) > s->start_threshold;
326             }
327
328             if (threshold) {
329                 for (j = 0; j < inlink->channels; j++) {
330                     s->update(s, *ibuf);
331                     s->start_holdoff[s->start_holdoff_end++] = *ibuf++;
332                 }
333                 nb_samples_read += inlink->channels;
334
335                 if (s->start_holdoff_end >= s->start_duration * inlink->channels) {
336                     if (++s->start_found_periods >= s->start_periods) {
337                         s->mode = SILENCE_TRIM_FLUSH;
338                         goto silence_trim_flush;
339                     }
340
341                     s->start_holdoff_offset = 0;
342                     s->start_holdoff_end = 0;
343                     s->start_silence_offset = 0;
344                     s->start_silence_end = 0;
345                 }
346             } else {
347                 s->start_holdoff_end = 0;
348
349                 for (j = 0; j < inlink->channels; j++) {
350                     s->update(s, ibuf[j]);
351                     if (s->start_silence) {
352                         s->start_silence_hold[s->start_silence_offset++] = ibuf[j];
353                         s->start_silence_end = FFMIN(s->start_silence_end + 1, inlink->channels * s->start_silence);
354                         if (s->start_silence_offset >= inlink->channels * s->start_silence) {
355                             s->start_silence_offset = 0;
356                         }
357                     }
358                 }
359
360                 ibuf += inlink->channels;
361                 nb_samples_read += inlink->channels;
362             }
363         }
364         break;
365
366     case SILENCE_TRIM_FLUSH:
367 silence_trim_flush:
368         nbs  = s->start_holdoff_end - s->start_holdoff_offset;
369         nbs -= nbs % inlink->channels;
370         if (!nbs)
371             break;
372
373         out = ff_get_audio_buffer(inlink, nbs / inlink->channels + s->start_silence_end / inlink->channels);
374         if (!out) {
375             av_frame_free(&in);
376             return AVERROR(ENOMEM);
377         }
378
379         if (s->start_silence_end > 0) {
380             if (s->start_silence_offset < s->start_silence_end) {
381                 memcpy(out->data[0],
382                        &s->start_silence_hold[s->start_silence_offset],
383                        (s->start_silence_end - s->start_silence_offset) * sizeof(double));
384             }
385
386             if (s->start_silence_offset > 0) {
387                 memcpy(out->data[0] + (s->start_silence_end - s->start_silence_offset) * sizeof(double),
388                        &s->start_silence_hold[0],
389                        s->start_silence_offset * sizeof(double));
390             }
391         }
392
393         memcpy(out->data[0] + s->start_silence_end * sizeof(double),
394                &s->start_holdoff[s->start_holdoff_offset],
395                nbs * sizeof(double));
396
397         out->pts = s->next_pts;
398         s->next_pts += av_rescale_q(out->nb_samples,
399                                     (AVRational){1, outlink->sample_rate},
400                                     outlink->time_base);
401
402         s->start_holdoff_offset += nbs;
403
404         ret = ff_filter_frame(outlink, out);
405
406         if (s->start_holdoff_offset == s->start_holdoff_end) {
407             s->start_holdoff_offset = 0;
408             s->start_holdoff_end = 0;
409             s->start_silence_offset = 0;
410             s->start_silence_end = 0;
411             s->mode = SILENCE_COPY;
412             goto silence_copy;
413         }
414         break;
415
416     case SILENCE_COPY:
417 silence_copy:
418         nbs = in->nb_samples - nb_samples_read / inlink->channels;
419         if (!nbs)
420             break;
421
422         out = ff_get_audio_buffer(inlink, nbs);
423         if (!out) {
424             av_frame_free(&in);
425             return AVERROR(ENOMEM);
426         }
427         obuf = (double *)out->data[0];
428
429         if (s->stop_periods) {
430             for (i = 0; i < nbs; i++) {
431                 threshold = 1;
432                 for (j = 0; j < inlink->channels; j++)
433                     threshold &= s->compute(s, ibuf[j]) > s->stop_threshold;
434
435                 if (threshold && s->stop_holdoff_end && !s->stop_silence) {
436                     s->mode = SILENCE_COPY_FLUSH;
437                     flush(s, out, outlink, &nb_samples_written, &ret, 0);
438                     goto silence_copy_flush;
439                 } else if (threshold) {
440                     for (j = 0; j < inlink->channels; j++) {
441                         s->update(s, *ibuf);
442                         *obuf++ = *ibuf++;
443                     }
444                     nb_samples_read    += inlink->channels;
445                     nb_samples_written += inlink->channels;
446                 } else if (!threshold) {
447                     for (j = 0; j < inlink->channels; j++) {
448                         s->update(s, *ibuf);
449                         if (s->stop_silence) {
450                             s->stop_silence_hold[s->stop_silence_offset++] = *ibuf;
451                             s->stop_silence_end = FFMIN(s->stop_silence_end + 1, inlink->channels * s->stop_silence);
452                             if (s->stop_silence_offset >= inlink->channels * s->stop_silence) {
453                                 s->stop_silence_offset = 0;
454                             }
455                         }
456
457                         s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++;
458                     }
459                     nb_samples_read += inlink->channels;
460
461                     if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) {
462                         if (++s->stop_found_periods >= s->stop_periods) {
463                             s->stop_holdoff_offset = 0;
464                             s->stop_holdoff_end = 0;
465
466                             if (!s->restart) {
467                                 s->mode = SILENCE_STOP;
468                                 flush(s, out, outlink, &nb_samples_written, &ret, 1);
469                                 goto silence_stop;
470                             } else {
471                                 s->stop_found_periods = 0;
472                                 s->start_found_periods = 0;
473                                 s->start_holdoff_offset = 0;
474                                 s->start_holdoff_end = 0;
475                                 s->start_silence_offset = 0;
476                                 s->start_silence_end = 0;
477                                 clear_window(s);
478                                 s->mode = SILENCE_TRIM;
479                                 flush(s, out, outlink, &nb_samples_written, &ret, 1);
480                                 goto silence_trim;
481                             }
482                         }
483                         s->mode = SILENCE_COPY_FLUSH;
484                         flush(s, out, outlink, &nb_samples_written, &ret, 0);
485                         goto silence_copy_flush;
486                     }
487                 }
488             }
489             flush(s, out, outlink, &nb_samples_written, &ret, 0);
490         } else {
491             memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels);
492
493             out->pts = s->next_pts;
494             s->next_pts += av_rescale_q(out->nb_samples,
495                                         (AVRational){1, outlink->sample_rate},
496                                         outlink->time_base);
497
498             ret = ff_filter_frame(outlink, out);
499         }
500         break;
501
502     case SILENCE_COPY_FLUSH:
503 silence_copy_flush:
504         nbs  = s->stop_holdoff_end - s->stop_holdoff_offset;
505         nbs -= nbs % inlink->channels;
506         if (!nbs)
507             break;
508
509         out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
510         if (!out) {
511             av_frame_free(&in);
512             return AVERROR(ENOMEM);
513         }
514
515         memcpy(out->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
516                nbs * sizeof(double));
517         s->stop_holdoff_offset += nbs;
518
519         out->pts = s->next_pts;
520         s->next_pts += av_rescale_q(out->nb_samples,
521                                     (AVRational){1, outlink->sample_rate},
522                                     outlink->time_base);
523
524         ret = ff_filter_frame(outlink, out);
525
526         if (s->stop_holdoff_offset == s->stop_holdoff_end) {
527             s->stop_holdoff_offset = 0;
528             s->stop_holdoff_end = 0;
529             s->stop_silence_offset = 0;
530             s->stop_silence_end = 0;
531             s->mode = SILENCE_COPY;
532             goto silence_copy;
533         }
534         break;
535     case SILENCE_STOP:
536 silence_stop:
537         break;
538     }
539
540     av_frame_free(&in);
541
542     return ret;
543 }
544
545 static int request_frame(AVFilterLink *outlink)
546 {
547     AVFilterContext *ctx = outlink->src;
548     SilenceRemoveContext *s = ctx->priv;
549     int ret;
550
551     ret = ff_request_frame(ctx->inputs[0]);
552     if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH ||
553                                s->mode == SILENCE_COPY)) {
554         int nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
555         if (nbs) {
556             AVFrame *frame;
557
558             frame = ff_get_audio_buffer(outlink, nbs / outlink->channels);
559             if (!frame)
560                 return AVERROR(ENOMEM);
561
562             memcpy(frame->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
563                    nbs * sizeof(double));
564
565             frame->pts = s->next_pts;
566             s->next_pts += av_rescale_q(frame->nb_samples,
567                                         (AVRational){1, outlink->sample_rate},
568                                         outlink->time_base);
569
570             ret = ff_filter_frame(outlink, frame);
571         }
572         s->mode = SILENCE_STOP;
573     }
574     return ret;
575 }
576
577 static int query_formats(AVFilterContext *ctx)
578 {
579     AVFilterFormats *formats = NULL;
580     AVFilterChannelLayouts *layouts = NULL;
581     static const enum AVSampleFormat sample_fmts[] = {
582         AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_NONE
583     };
584     int ret;
585
586     layouts = ff_all_channel_counts();
587     if (!layouts)
588         return AVERROR(ENOMEM);
589     ret = ff_set_common_channel_layouts(ctx, layouts);
590     if (ret < 0)
591         return ret;
592
593     formats = ff_make_format_list(sample_fmts);
594     if (!formats)
595         return AVERROR(ENOMEM);
596     ret = ff_set_common_formats(ctx, formats);
597     if (ret < 0)
598         return ret;
599
600     formats = ff_all_samplerates();
601     if (!formats)
602         return AVERROR(ENOMEM);
603     return ff_set_common_samplerates(ctx, formats);
604 }
605
606 static av_cold void uninit(AVFilterContext *ctx)
607 {
608     SilenceRemoveContext *s = ctx->priv;
609
610     av_freep(&s->start_holdoff);
611     av_freep(&s->start_silence_hold);
612     av_freep(&s->stop_holdoff);
613     av_freep(&s->stop_silence_hold);
614     av_freep(&s->window);
615 }
616
617 static const AVFilterPad silenceremove_inputs[] = {
618     {
619         .name         = "default",
620         .type         = AVMEDIA_TYPE_AUDIO,
621         .config_props = config_input,
622         .filter_frame = filter_frame,
623     },
624     { NULL }
625 };
626
627 static const AVFilterPad silenceremove_outputs[] = {
628     {
629         .name          = "default",
630         .type          = AVMEDIA_TYPE_AUDIO,
631         .request_frame = request_frame,
632     },
633     { NULL }
634 };
635
636 AVFilter ff_af_silenceremove = {
637     .name          = "silenceremove",
638     .description   = NULL_IF_CONFIG_SMALL("Remove silence."),
639     .priv_size     = sizeof(SilenceRemoveContext),
640     .priv_class    = &silenceremove_class,
641     .init          = init,
642     .uninit        = uninit,
643     .query_formats = query_formats,
644     .inputs        = silenceremove_inputs,
645     .outputs       = silenceremove_outputs,
646 };