2 * Copyright (c) 2001 Heikki Leinonen
3 * Copyright (c) 2001 Chris Bagwell
4 * Copyright (c) 2003 Donnie Smith
5 * Copyright (c) 2014 Paul B Mahol
7 * This file is part of FFmpeg.
9 * FFmpeg is free software; you can redistribute it and/or
10 * modify it under the terms of the GNU Lesser General Public
11 * License as published by the Free Software Foundation; either
12 * version 2.1 of the License, or (at your option) any later version.
14 * FFmpeg is distributed in the hope that it will be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
17 * Lesser General Public License for more details.
19 * You should have received a copy of the GNU Lesser General Public
20 * License along with FFmpeg; if not, write to the Free Software
21 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24 #include <float.h> /* DBL_MAX */
26 #include "libavutil/opt.h"
27 #include "libavutil/timestamp.h"
/* Private state of the silenceremove filter; the callbacks reach it through ctx->priv. */
41 typedef struct SilenceRemoveContext {
/* Current state of the processing state machine driven by filter_frame(). */
44 enum SilenceMode mode;
/* Start duration option; config_input() rescales it by the input sample rate. */
47 int64_t start_duration;
/* RMS level a sample frame must exceed to count as signal while trimming the start. */
48 double start_threshold;
/* Stop duration option; config_input() rescales it by the input sample rate. */
51 int64_t stop_duration;
/* RMS level every channel must exceed for a sample frame to count as signal while copying. */
52 double stop_threshold;
/* Buffer of samples held back while the start of the signal is being detected. */
54 double *start_holdoff;
/* Read position in start_holdoff used when the buffer is emitted. */
55 size_t start_holdoff_offset;
/* Number of samples currently stored in start_holdoff (write position). */
56 size_t start_holdoff_end;
/* Number of times start_holdoff has filled up; compared against start_periods. */
57 int start_found_periods;
/* Read position in stop_holdoff used when the buffer is emitted. */
60 size_t stop_holdoff_offset;
/* Number of samples currently stored in stop_holdoff (write position). */
61 size_t stop_holdoff_end;
/* Number of times stop_holdoff has filled up; compared against stop_periods. */
62 int stop_found_periods;
/* Slot of the RMS window that compute_rms()/update_rms() read and replace next. */
65 double *window_current;
73 } SilenceRemoveContext;
/* Byte offset of an option's backing field inside SilenceRemoveContext. */
75 #define OFFSET(x) offsetof(SilenceRemoveContext, x)
/* All options are audio filtering parameters. */
76 #define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM
/*
 * User-settable options. Every option defaults to 0.
 * stop_periods also accepts negative values; init() replaces a negative
 * value by its absolute value.
 * NOTE(review): start_duration and stop_duration are AV_OPT_TYPE_DURATION
 * with an upper bound of 9000. If duration options are expressed in
 * microseconds this caps them at 9 ms - confirm the intended range.
 */
77 static const AVOption silenceremove_options[] = {
78 { "start_periods", NULL, OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, FLAGS },
79 { "start_duration", NULL, OFFSET(start_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
80 { "start_threshold", NULL, OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
81 { "stop_periods", NULL, OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, FLAGS },
82 { "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS },
83 { "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS },
84 { "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
/* Defines silenceremove_class, which the filter definition references as its priv_class. */
88 AVFILTER_DEFINE_CLASS(silenceremove);
/* Filter init callback: normalizes the option values stored in the private context. */
90 static av_cold int init(AVFilterContext *ctx)
92 SilenceRemoveContext *s = ctx->priv;
/* A negative stop_periods is replaced by its absolute value. */
94 if (s->stop_periods < 0) {
95 s->stop_periods = -s->stop_periods;
/* Resets the RMS window: zeroes all window_size slots and rewinds the window pointers. */
102 static void clear_rms(SilenceRemoveContext *s)
104 memset(s->window, 0, s->window_size * sizeof(*s->window));
/* Next slot to replace is the first one; window_end points one past the last slot. */
106 s->window_current = s->window;
107 s->window_end = s->window + s->window_size;
/*
 * Input link configuration callback: sizes and allocates the RMS window,
 * rescales the duration options by the input sample rate, allocates both
 * holdoff buffers and selects the initial mode.
 * Returns AVERROR(ENOMEM) when an allocation fails.
 */
111 static int config_input(AVFilterLink *inlink)
113 AVFilterContext *ctx = inlink->dst;
114 SilenceRemoveContext *s = ctx->priv;
/* Window length: sample_rate / 50 (integer division) slots per channel. */
116 s->window_size = (inlink->sample_rate / 50) * inlink->channels;
117 s->window = av_malloc_array(s->window_size, sizeof(*s->window));
119 return AVERROR(ENOMEM);
/* Rescale both duration options by the input sample rate. */
123 s->start_duration = av_rescale(s->start_duration, inlink->sample_rate,
125 s->stop_duration = av_rescale(s->stop_duration, inlink->sample_rate,
/* FFMAX(..., 1) keeps the allocation non-empty even when the duration is 0. */
128 s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1),
129 sizeof(*s->start_holdoff) *
131 if (!s->start_holdoff)
132 return AVERROR(ENOMEM);
/* Start with an empty start holdoff buffer and no periods found. */
134 s->start_holdoff_offset = 0;
135 s->start_holdoff_end = 0;
136 s->start_found_periods = 0;
/* Same sizing rule for the stop holdoff buffer. */
138 s->stop_holdoff = av_malloc_array(FFMAX(s->stop_duration, 1),
139 sizeof(*s->stop_holdoff) *
141 if (!s->stop_holdoff)
142 return AVERROR(ENOMEM);
/* Start with an empty stop holdoff buffer and no periods found. */
144 s->stop_holdoff_offset = 0;
145 s->stop_holdoff_end = 0;
146 s->stop_found_periods = 0;
/* Trimming at the start is selected only when start_periods is non-zero. */
148 if (s->start_periods)
149 s->mode = SILENCE_TRIM;
151 s->mode = SILENCE_COPY;
/*
 * Returns the RMS the window would have if the slot at window_current were
 * replaced by sample squared. The result is only computed, not stored.
 */
156 static double compute_rms(SilenceRemoveContext *s, double sample)
/* Running sum with the outgoing slot removed and the new squared sample added. */
160 new_sum = s->rms_sum;
161 new_sum -= *s->window_current;
162 new_sum += sample * sample;
/* Mean over window_size slots, then square root. */
164 return sqrt(new_sum / s->window_size);
/*
 * Stores sample squared in the slot at window_current and keeps the running
 * sum rms_sum consistent with the window contents.
 */
167 static void update_rms(SilenceRemoveContext *s, double sample)
/* Remove the old slot value from the sum, overwrite the slot, add the new value. */
169 s->rms_sum -= *s->window_current;
170 *s->window_current = sample * sample;
171 s->rms_sum += *s->window_current;
/* Wrap back to the first slot once the end of the window is reached. */
174 if (s->window_current >= s->window_end)
175 s->window_current = s->window;
/*
 * Sends `out` downstream if any samples were written to it.
 * *nb_samples_written counts individual samples across all channels; it is
 * converted to a per-channel count for out->nb_samples and reset to 0
 * afterwards. The result of ff_filter_frame() is stored in *ret.
 */
178 static void flush(AVFrame *out, AVFilterLink *outlink,
179 int *nb_samples_written, int *ret)
181 if (*nb_samples_written) {
182 out->nb_samples = *nb_samples_written / outlink->channels;
183 *ret = ff_filter_frame(outlink, out);
184 *nb_samples_written = 0;
/*
 * Input pad callback: runs one input frame through the state machine held in
 * s->mode. Samples are read from in->data[0] as doubles, one value per
 * channel for each sample frame.
 * Returns the result of the last ff_filter_frame() call (0 if none was made),
 * or AVERROR(ENOMEM) when an output buffer cannot be allocated.
 */
190 static int filter_frame(AVFilterLink *inlink, AVFrame *in)
192 AVFilterContext *ctx = inlink->dst;
193 AVFilterLink *outlink = ctx->outputs[0];
194 SilenceRemoveContext *s = ctx->priv;
195 int i, j, threshold, ret = 0;
196 int nbs, nb_samples_read, nb_samples_written;
197 double *obuf, *ibuf = (double *)in->data[0];
/* Both counters are kept in individual samples (sample frames * channels). */
200 nb_samples_read = nb_samples_written = 0;
/* Sample frames of the input that have not been consumed yet. */
205 nbs = in->nb_samples - nb_samples_read / inlink->channels;
209 for (i = 0; i < nbs; i++) {
/* threshold is OR-ed over the channels: one channel above start_threshold is enough. */
211 for (j = 0; j < inlink->channels; j++) {
212 threshold |= compute_rms(s, ibuf[j]) > s->start_threshold;
/* Feed the sample frame to the RMS window and append it to the start holdoff buffer. */
216 for (j = 0; j < inlink->channels; j++) {
217 update_rms(s, *ibuf);
218 s->start_holdoff[s->start_holdoff_end++] = *ibuf++;
/* The holdoff buffer now holds start_duration sample frames: one period found. */
222 if (s->start_holdoff_end >= s->start_duration * inlink->channels) {
/* Enough periods found: switch to emitting the held samples. */
223 if (++s->start_found_periods >= s->start_periods) {
224 s->mode = SILENCE_TRIM_FLUSH;
225 goto silence_trim_flush;
/* Empty the holdoff buffer again. */
228 s->start_holdoff_offset = 0;
229 s->start_holdoff_end = 0;
/* NOTE(review): presumably the below-threshold path - held samples are dropped and only the RMS window is updated; confirm against the full source. */
232 s->start_holdoff_end = 0;
234 for (j = 0; j < inlink->channels; j++)
235 update_rms(s, ibuf[j]);
/* Skip the sample frame and account for it as read. */
237 ibuf += inlink->channels;
238 nb_samples_read += inlink->channels;
/* Emit the pending part of the start holdoff buffer as one output frame. */
243 case SILENCE_TRIM_FLUSH:
245 nbs = s->start_holdoff_end - s->start_holdoff_offset;
/* Only whole sample frames are emitted. */
246 nbs -= nbs % inlink->channels;
250 out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
253 return AVERROR(ENOMEM);
256 memcpy(out->data[0], &s->start_holdoff[s->start_holdoff_offset],
257 nbs * sizeof(double));
258 s->start_holdoff_offset += nbs;
260 ret = ff_filter_frame(outlink, out);
/* Everything held has been emitted: reset the buffer and continue in copy mode. */
262 if (s->start_holdoff_offset == s->start_holdoff_end) {
263 s->start_holdoff_offset = 0;
264 s->start_holdoff_end = 0;
265 s->mode = SILENCE_COPY;
/* Sample frames of the input that have not been consumed yet. */
272 nbs = in->nb_samples - nb_samples_read / inlink->channels;
/* Output frame with room for every remaining input sample frame. */
276 out = ff_get_audio_buffer(inlink, nbs);
279 return AVERROR(ENOMEM);
281 obuf = (double *)out->data[0];
/* With stop_periods set, every sample frame is checked against stop_threshold. */
283 if (s->stop_periods) {
284 for (i = 0; i < nbs; i++) {
/* threshold is AND-ed over the channels: every channel must exceed stop_threshold. */
286 for (j = 0; j < inlink->channels; j++)
287 threshold &= compute_rms(s, ibuf[j]) > s->stop_threshold;
/* Signal while samples are held back and leave_silence is off: send what was written, then flush the held samples. */
289 if (threshold && s->stop_holdoff_end && !s->leave_silence) {
290 s->mode = SILENCE_COPY_FLUSH;
291 flush(out, outlink, &nb_samples_written, &ret);
292 goto silence_copy_flush;
/* Signal with nothing to flush first: update the RMS window and count the samples as written. */
293 } else if (threshold) {
294 for (j = 0; j < inlink->channels; j++) {
295 update_rms(s, *ibuf);
298 nb_samples_written++;
/* Below threshold: samples go to the stop holdoff buffer; with leave_silence they are also counted as written. */
300 } else if (!threshold) {
301 for (j = 0; j < inlink->channels; j++) {
302 update_rms(s, *ibuf);
303 if (s->leave_silence) {
305 nb_samples_written++;
308 s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++;
/* The holdoff buffer now holds stop_duration sample frames: one period found. */
312 if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) {
/* Enough periods found: discard the held samples. */
313 if (++s->stop_found_periods >= s->stop_periods) {
314 s->stop_holdoff_offset = 0;
315 s->stop_holdoff_end = 0;
/* Enter SILENCE_STOP and send what was written so far. */
318 s->mode = SILENCE_STOP;
319 flush(out, outlink, &nb_samples_written, &ret);
/* Alternative path: reset the period counters and the start holdoff buffer and return to SILENCE_TRIM. */
/* NOTE(review): the condition choosing between this path and SILENCE_STOP is not visible here - confirm against the full source. */
322 s->stop_found_periods = 0;
323 s->start_found_periods = 0;
324 s->start_holdoff_offset = 0;
325 s->start_holdoff_end = 0;
327 s->mode = SILENCE_TRIM;
328 flush(out, outlink, &nb_samples_written, &ret);
/* Send what was written so far, then emit the held samples. */
332 s->mode = SILENCE_COPY_FLUSH;
333 flush(out, outlink, &nb_samples_written, &ret);
334 goto silence_copy_flush;
/* Send whatever was written to the output frame. */
338 flush(out, outlink, &nb_samples_written, &ret);
/* Bulk copy of all remaining input samples (NOTE(review): presumably the path taken when stop_periods is 0). */
340 memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels);
341 ret = ff_filter_frame(outlink, out);
/* Emit the pending part of the stop holdoff buffer as one output frame. */
345 case SILENCE_COPY_FLUSH:
347 nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
/* Only whole sample frames are emitted. */
348 nbs -= nbs % inlink->channels;
352 out = ff_get_audio_buffer(inlink, nbs / inlink->channels);
355 return AVERROR(ENOMEM);
358 memcpy(out->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
359 nbs * sizeof(double));
360 s->stop_holdoff_offset += nbs;
362 ret = ff_filter_frame(outlink, out);
/* Everything held has been emitted: reset the buffer and continue in copy mode. */
364 if (s->stop_holdoff_offset == s->stop_holdoff_end) {
365 s->stop_holdoff_offset = 0;
366 s->stop_holdoff_end = 0;
367 s->mode = SILENCE_COPY;
/*
 * Output pad callback: requests a frame from the input. When the input
 * reports EOF while the filter is in SILENCE_COPY or SILENCE_COPY_FLUSH, the
 * samples still pending in the stop holdoff buffer are copied into a new
 * frame and sent on, and the mode becomes SILENCE_STOP.
 */
381 static int request_frame(AVFilterLink *outlink)
383 AVFilterContext *ctx = outlink->src;
384 SilenceRemoveContext *s = ctx->priv;
387 ret = ff_request_frame(ctx->inputs[0]);
388 if (ret == AVERROR_EOF && (s->mode == SILENCE_COPY_FLUSH ||
389 s->mode == SILENCE_COPY)) {
/* Number of individual samples still pending in the stop holdoff buffer. */
390 int nbs = s->stop_holdoff_end - s->stop_holdoff_offset;
394 frame = ff_get_audio_buffer(outlink, nbs / outlink->channels);
396 return AVERROR(ENOMEM);
398 memcpy(frame->data[0], &s->stop_holdoff[s->stop_holdoff_offset],
399 nbs * sizeof(double));
/* NOTE(review): the frame is allocated on outlink but sent to ctx->inputs[0], this filter's own input link; every flush path in filter_frame() sends to outlink - confirm whether outlink was intended here. */
400 ret = ff_filter_frame(ctx->inputs[0], frame);
/* No further output is produced after the end-of-stream flush. */
402 s->mode = SILENCE_STOP;
/*
 * Format negotiation callback: sets the common channel layouts from
 * ff_all_channel_counts(), the common sample formats to AV_SAMPLE_FMT_DBL
 * only, and the common sample rates from ff_all_samplerates().
 * Returns AVERROR(ENOMEM) when a list cannot be allocated.
 */
407 static int query_formats(AVFilterContext *ctx)
409 AVFilterFormats *formats = NULL;
410 AVFilterChannelLayouts *layouts = NULL;
/* The processing code reads and writes samples as double. */
411 static const enum AVSampleFormat sample_fmts[] = {
412 AV_SAMPLE_FMT_DBL, AV_SAMPLE_FMT_NONE
416 layouts = ff_all_channel_counts();
418 return AVERROR(ENOMEM);
419 ret = ff_set_common_channel_layouts(ctx, layouts);
423 formats = ff_make_format_list(sample_fmts);
425 return AVERROR(ENOMEM);
426 ret = ff_set_common_formats(ctx, formats);
/* The formats variable is reused for the sample rate list. */
430 formats = ff_all_samplerates();
432 return AVERROR(ENOMEM);
433 return ff_set_common_samplerates(ctx, formats);
/* Filter teardown callback: releases the buffers allocated in config_input(). */
436 static av_cold void uninit(AVFilterContext *ctx)
438 SilenceRemoveContext *s = ctx->priv;
/* av_freep() is given the address of each pointer field. */
440 av_freep(&s->start_holdoff);
441 av_freep(&s->stop_holdoff);
442 av_freep(&s->window);
/* Input pad: audio; config_input() configures the link and filter_frame() receives the frames. */
445 static const AVFilterPad silenceremove_inputs[] = {
448 .type = AVMEDIA_TYPE_AUDIO,
449 .config_props = config_input,
450 .filter_frame = filter_frame,
/* Output pad: audio; request_frame() handles frame requests and the end-of-stream flush. */
455 static const AVFilterPad silenceremove_outputs[] = {
458 .type = AVMEDIA_TYPE_AUDIO,
459 .request_frame = request_frame,
/* Filter definition: ties the name, private context size, option class, format negotiation and pads together. */
464 AVFilter ff_af_silenceremove = {
465 .name = "silenceremove",
466 .description = NULL_IF_CONFIG_SMALL("Remove silence."),
/* Size of the per-instance private context (SilenceRemoveContext). */
467 .priv_size = sizeof(SilenceRemoveContext),
/* Class defined by AVFILTER_DEFINE_CLASS(silenceremove). */
468 .priv_class = &silenceremove_class,
471 .query_formats = query_formats,
472 .inputs = silenceremove_inputs,
473 .outputs = silenceremove_outputs,