X-Git-Url: https://git.sesse.net/?a=blobdiff_plain;f=libavfilter%2Faf_silenceremove.c;h=e4b4cd71d23ee65bb949214c24e651bb17c4b925;hb=a04ad248a05e7b613abe09b3bb067f555108d794;hp=d826a22e9dea6a8bfc8ccab543399d34d27dd093;hpb=03210fe138f3b3bd7f5272fe29aca810cf517329;p=ffmpeg diff --git a/libavfilter/af_silenceremove.c b/libavfilter/af_silenceremove.c index d826a22e9de..e4b4cd71d23 100644 --- a/libavfilter/af_silenceremove.c +++ b/libavfilter/af_silenceremove.c @@ -30,6 +30,16 @@ #include "avfilter.h" #include "internal.h" +enum SilenceDetect { + D_PEAK, + D_RMS, +}; + +enum ThresholdMode { + T_ANY, + T_ALL, +}; + enum SilenceMode { SILENCE_TRIM, SILENCE_TRIM_FLUSH, @@ -45,20 +55,34 @@ typedef struct SilenceRemoveContext { int start_periods; int64_t start_duration; + int64_t start_duration_opt; double start_threshold; + int64_t start_silence; + int64_t start_silence_opt; + int start_mode; int stop_periods; int64_t stop_duration; + int64_t stop_duration_opt; double stop_threshold; + int64_t stop_silence; + int64_t stop_silence_opt; + int stop_mode; double *start_holdoff; + double *start_silence_hold; size_t start_holdoff_offset; size_t start_holdoff_end; + size_t start_silence_offset; + size_t start_silence_end; int start_found_periods; double *stop_holdoff; + double *stop_silence_hold; size_t stop_holdoff_offset; size_t stop_holdoff_end; + size_t stop_silence_offset; + size_t stop_silence_end; int stop_found_periods; double window_ratio; @@ -68,7 +92,6 @@ typedef struct SilenceRemoveContext { int window_size; double sum; - int leave_silence; int restart; int64_t next_pts; @@ -78,19 +101,25 @@ typedef struct SilenceRemoveContext { } SilenceRemoveContext; #define OFFSET(x) offsetof(SilenceRemoveContext, x) -#define FLAGS AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM +#define AF AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_AUDIO_PARAM + static const AVOption silenceremove_options[] = { - { "start_periods", NULL, OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, FLAGS }, - { "start_duration", NULL, OFFSET(start_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS }, - { "start_threshold", NULL, OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS }, - { "stop_periods", NULL, OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, FLAGS }, - { "stop_duration", NULL, OFFSET(stop_duration), AV_OPT_TYPE_DURATION, {.i64=0}, 0, 9000, FLAGS }, - { "stop_threshold", NULL, OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, FLAGS }, - { "leave_silence", NULL, OFFSET(leave_silence), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS }, - { "detection", NULL, OFFSET(detection), AV_OPT_TYPE_INT, {.i64=1}, 0, 1, FLAGS, "detection" }, - { "peak", 0, 0, AV_OPT_TYPE_CONST, {.i64=0}, 0, 0, FLAGS, "detection" }, - { "rms", 0, 0, AV_OPT_TYPE_CONST, {.i64=1}, 0, 0, FLAGS, "detection" }, - { "window", NULL, OFFSET(window_ratio), AV_OPT_TYPE_DOUBLE, {.dbl=0.02}, 0, 10, FLAGS }, + { "start_periods", NULL, OFFSET(start_periods), AV_OPT_TYPE_INT, {.i64=0}, 0, 9000, AF }, + { "start_duration", "set start duration of non-silence part", OFFSET(start_duration_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF }, + { "start_threshold", "set threshold for start silence detection", OFFSET(start_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, AF }, + { "start_silence", "set start duration of silence part to keep", OFFSET(start_silence_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF }, + { "start_mode", "set which channel will trigger trimming from start", OFFSET(start_mode), AV_OPT_TYPE_INT, {.i64=T_ANY}, T_ANY, T_ALL, AF, "mode" }, + { "any", 0, 0, AV_OPT_TYPE_CONST, {.i64=T_ANY}, 0, 0, AF, "mode" }, + { "all", 0, 0, AV_OPT_TYPE_CONST, {.i64=T_ALL}, 0, 0, AF, "mode" }, + { "stop_periods", NULL, OFFSET(stop_periods), AV_OPT_TYPE_INT, {.i64=0}, -9000, 9000, AF }, + { "stop_duration", "set stop duration of non-silence part", OFFSET(stop_duration_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF }, + { "stop_threshold", "set threshold for stop silence detection", OFFSET(stop_threshold), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0, DBL_MAX, AF }, + { "stop_silence", "set stop duration of silence part to keep", OFFSET(stop_silence_opt), AV_OPT_TYPE_DURATION, {.i64=0}, 0, INT32_MAX, AF }, + { "stop_mode", "set which channel will trigger trimming from end", OFFSET(stop_mode), AV_OPT_TYPE_INT, {.i64=T_ANY}, T_ANY, T_ALL, AF, "mode" }, + { "detection", "set how silence is detected", OFFSET(detection), AV_OPT_TYPE_INT, {.i64=D_RMS}, D_PEAK,D_RMS, AF, "detection" }, + { "peak", "use absolute values of samples", 0, AV_OPT_TYPE_CONST, {.i64=D_PEAK},0, 0, AF, "detection" }, + { "rms", "use squared values of samples", 0, AV_OPT_TYPE_CONST, {.i64=D_RMS}, 0, 0, AF, "detection" }, + { "window", "set duration of window in seconds", OFFSET(window_ratio), AV_OPT_TYPE_DOUBLE, {.dbl=0.02}, 0, 10, AF }, { NULL } }; @@ -150,15 +179,15 @@ static av_cold int init(AVFilterContext *ctx) } switch (s->detection) { - case 0: + case D_PEAK: s->update = update_peak; s->compute = compute_peak; break; - case 1: + case D_RMS: s->update = update_rms; s->compute = compute_rms; break; - }; + } return 0; } @@ -177,6 +206,7 @@ static int config_input(AVFilterLink *inlink) AVFilterContext *ctx = inlink->dst; SilenceRemoveContext *s = ctx->priv; + s->next_pts = AV_NOPTS_VALUE; s->window_size = FFMAX((inlink->sample_rate * s->window_ratio), 1) * inlink->channels; s->window = av_malloc_array(s->window_size, sizeof(*s->window)); if (!s->window) @@ -184,19 +214,14 @@ static int config_input(AVFilterLink *inlink) clear_window(s); - s->start_duration = av_rescale(s->start_duration, inlink->sample_rate, + s->start_duration = av_rescale(s->start_duration_opt, inlink->sample_rate, AV_TIME_BASE); - if (s->start_duration < 0) { - av_log(ctx, AV_LOG_WARNING, "start duration must be non-negative\n"); - s->start_duration = -s->start_duration; - } - - s->stop_duration = av_rescale(s->stop_duration, inlink->sample_rate, + s->start_silence = av_rescale(s->start_silence_opt, inlink->sample_rate, + AV_TIME_BASE); + s->stop_duration = av_rescale(s->stop_duration_opt, inlink->sample_rate, + AV_TIME_BASE); + s->stop_silence = av_rescale(s->stop_silence_opt, inlink->sample_rate, AV_TIME_BASE); - if (s->stop_duration < 0) { - av_log(ctx, AV_LOG_WARNING, "stop duration must be non-negative\n"); - s->stop_duration = -s->stop_duration; - } s->start_holdoff = av_malloc_array(FFMAX(s->start_duration, 1), sizeof(*s->start_holdoff) * @@ -204,6 +229,12 @@ static int config_input(AVFilterLink *inlink) if (!s->start_holdoff) return AVERROR(ENOMEM); + s->start_silence_hold = av_malloc_array(FFMAX(s->start_silence, 1), + sizeof(*s->start_silence_hold) * + inlink->channels); + if (!s->start_silence_hold) + return AVERROR(ENOMEM); + s->start_holdoff_offset = 0; s->start_holdoff_end = 0; s->start_found_periods = 0; @@ -214,6 +245,12 @@ static int config_input(AVFilterLink *inlink) if (!s->stop_holdoff) return AVERROR(ENOMEM); + s->stop_silence_hold = av_malloc_array(FFMAX(s->stop_silence, 1), + sizeof(*s->stop_silence_hold) * + inlink->channels); + if (!s->stop_silence_hold) + return AVERROR(ENOMEM); + s->stop_holdoff_offset = 0; s->stop_holdoff_end = 0; s->stop_found_periods = 0; @@ -228,8 +265,10 @@ static int config_input(AVFilterLink *inlink) static void flush(SilenceRemoveContext *s, AVFrame *out, AVFilterLink *outlink, - int *nb_samples_written, int *ret) + int *nb_samples_written, int *ret, int flush_silence) { + AVFrame *silence; + if (*nb_samples_written) { out->nb_samples = *nb_samples_written / outlink->channels; @@ -239,10 +278,43 @@ static void flush(SilenceRemoveContext *s, outlink->time_base); *ret = ff_filter_frame(outlink, out); + if (*ret < 0) + return; *nb_samples_written = 0; } else { av_frame_free(&out); } + + if (s->stop_silence_end <= 0 || !flush_silence) + return; + + silence = ff_get_audio_buffer(outlink, s->stop_silence_end / outlink->channels); + if (!silence) { + *ret = AVERROR(ENOMEM); + return; + } + + if (s->stop_silence_offset < s->stop_silence_end) { + memcpy(silence->data[0], + &s->stop_silence_hold[s->stop_silence_offset], + (s->stop_silence_end - s->stop_silence_offset) * sizeof(double)); + } + + if (s->stop_silence_offset > 0) { + memcpy(silence->data[0] + (s->stop_silence_end - s->stop_silence_offset) * sizeof(double), + &s->stop_silence_hold[0], + s->stop_silence_offset * sizeof(double)); + } + + s->stop_silence_offset = 0; + s->stop_silence_end = 0; + + silence->pts = s->next_pts; + s->next_pts += av_rescale_q(silence->nb_samples, + (AVRational){1, outlink->sample_rate}, + outlink->time_base); + + *ret = ff_filter_frame(outlink, silence); } static int filter_frame(AVFilterLink *inlink, AVFrame *in) @@ -257,27 +329,37 @@ static int filter_frame(AVFilterLink *inlink, AVFrame *in) nb_samples_read = nb_samples_written = 0; + if (s->next_pts == AV_NOPTS_VALUE) + s->next_pts = in->pts; + switch (s->mode) { case SILENCE_TRIM: silence_trim: - nbs = in->nb_samples - nb_samples_read / inlink->channels; + nbs = in->nb_samples - nb_samples_read / outlink->channels; if (!nbs) break; for (i = 0; i < nbs; i++) { - threshold = 0; - for (j = 0; j < inlink->channels; j++) { - threshold |= s->compute(s, ibuf[j]) > s->start_threshold; + if (s->start_mode == T_ANY) { + threshold = 0; + for (j = 0; j < outlink->channels; j++) { + threshold |= s->compute(s, ibuf[j]) > s->start_threshold; + } + } else { + threshold = 1; + for (j = 0; j < outlink->channels; j++) { + threshold &= s->compute(s, ibuf[j]) > s->start_threshold; + } } if (threshold) { - for (j = 0; j < inlink->channels; j++) { + for (j = 0; j < outlink->channels; j++) { s->update(s, *ibuf); s->start_holdoff[s->start_holdoff_end++] = *ibuf++; } - nb_samples_read += inlink->channels; + nb_samples_read += outlink->channels; - if (s->start_holdoff_end >= s->start_duration * inlink->channels) { + if (s->start_holdoff_end >= s->start_duration * outlink->channels) { if (++s->start_found_periods >= s->start_periods) { s->mode = SILENCE_TRIM_FLUSH; goto silence_trim_flush; @@ -285,15 +367,25 @@ silence_trim: s->start_holdoff_offset = 0; s->start_holdoff_end = 0; + s->start_silence_offset = 0; + s->start_silence_end = 0; } } else { s->start_holdoff_end = 0; - for (j = 0; j < inlink->channels; j++) + for (j = 0; j < outlink->channels; j++) { s->update(s, ibuf[j]); + if (s->start_silence) { + s->start_silence_hold[s->start_silence_offset++] = ibuf[j]; + s->start_silence_end = FFMIN(s->start_silence_end + 1, outlink->channels * s->start_silence); + if (s->start_silence_offset >= outlink->channels * s->start_silence) { + s->start_silence_offset = 0; + } + } + } - ibuf += inlink->channels; - nb_samples_read += inlink->channels; + ibuf += outlink->channels; + nb_samples_read += outlink->channels; } } break; @@ -301,17 +393,32 @@ silence_trim: case SILENCE_TRIM_FLUSH: silence_trim_flush: nbs = s->start_holdoff_end - s->start_holdoff_offset; - nbs -= nbs % inlink->channels; + nbs -= nbs % outlink->channels; if (!nbs) break; - out = ff_get_audio_buffer(inlink, nbs / inlink->channels); + out = ff_get_audio_buffer(outlink, nbs / outlink->channels + s->start_silence_end / outlink->channels); if (!out) { av_frame_free(&in); return AVERROR(ENOMEM); } - memcpy(out->data[0], &s->start_holdoff[s->start_holdoff_offset], + if (s->start_silence_end > 0) { + if (s->start_silence_offset < s->start_silence_end) { + memcpy(out->data[0], + &s->start_silence_hold[s->start_silence_offset], + (s->start_silence_end - s->start_silence_offset) * sizeof(double)); + } + + if (s->start_silence_offset > 0) { + memcpy(out->data[0] + (s->start_silence_end - s->start_silence_offset) * sizeof(double), + &s->start_silence_hold[0], + s->start_silence_offset * sizeof(double)); + } + } + + memcpy(out->data[0] + s->start_silence_end * sizeof(double), + &s->start_holdoff[s->start_holdoff_offset], nbs * sizeof(double)); out->pts = s->next_pts; @@ -326,6 +433,8 @@ silence_trim_flush: if (s->start_holdoff_offset == s->start_holdoff_end) { s->start_holdoff_offset = 0; s->start_holdoff_end = 0; + s->start_silence_offset = 0; + s->start_silence_end = 0; s->mode = SILENCE_COPY; goto silence_copy; } @@ -333,11 +442,11 @@ silence_trim_flush: case SILENCE_COPY: silence_copy: - nbs = in->nb_samples - nb_samples_read / inlink->channels; + nbs = in->nb_samples - nb_samples_read / outlink->channels; if (!nbs) break; - out = ff_get_audio_buffer(inlink, nbs); + out = ff_get_audio_buffer(outlink, nbs); if (!out) { av_frame_free(&in); return AVERROR(ENOMEM); @@ -346,62 +455,75 @@ silence_copy: if (s->stop_periods) { for (i = 0; i < nbs; i++) { - threshold = 1; - for (j = 0; j < inlink->channels; j++) - threshold &= s->compute(s, ibuf[j]) > s->stop_threshold; + if (s->stop_mode == T_ANY) { + threshold = 0; + for (j = 0; j < outlink->channels; j++) { + threshold |= s->compute(s, ibuf[j]) > s->stop_threshold; + } + } else { + threshold = 1; + for (j = 0; j < outlink->channels; j++) { + threshold &= s->compute(s, ibuf[j]) > s->stop_threshold; + } + } - if (threshold && s->stop_holdoff_end && !s->leave_silence) { + if (threshold && s->stop_holdoff_end && !s->stop_silence) { s->mode = SILENCE_COPY_FLUSH; - flush(s, out, outlink, &nb_samples_written, &ret); + flush(s, out, outlink, &nb_samples_written, &ret, 0); goto silence_copy_flush; } else if (threshold) { - for (j = 0; j < inlink->channels; j++) { + for (j = 0; j < outlink->channels; j++) { s->update(s, *ibuf); *obuf++ = *ibuf++; } - nb_samples_read += inlink->channels; - nb_samples_written += inlink->channels; + nb_samples_read += outlink->channels; + nb_samples_written += outlink->channels; } else if (!threshold) { - for (j = 0; j < inlink->channels; j++) { + for (j = 0; j < outlink->channels; j++) { s->update(s, *ibuf); - if (s->leave_silence) { - *obuf++ = *ibuf; - nb_samples_written++; + if (s->stop_silence) { + s->stop_silence_hold[s->stop_silence_offset++] = *ibuf; + s->stop_silence_end = FFMIN(s->stop_silence_end + 1, outlink->channels * s->stop_silence); + if (s->stop_silence_offset >= outlink->channels * s->stop_silence) { + s->stop_silence_offset = 0; + } } s->stop_holdoff[s->stop_holdoff_end++] = *ibuf++; } - nb_samples_read += inlink->channels; + nb_samples_read += outlink->channels; - if (s->stop_holdoff_end >= s->stop_duration * inlink->channels) { + if (s->stop_holdoff_end >= s->stop_duration * outlink->channels) { if (++s->stop_found_periods >= s->stop_periods) { s->stop_holdoff_offset = 0; s->stop_holdoff_end = 0; if (!s->restart) { s->mode = SILENCE_STOP; - flush(s, out, outlink, &nb_samples_written, &ret); + flush(s, out, outlink, &nb_samples_written, &ret, 1); goto silence_stop; } else { s->stop_found_periods = 0; s->start_found_periods = 0; s->start_holdoff_offset = 0; s->start_holdoff_end = 0; + s->start_silence_offset = 0; + s->start_silence_end = 0; clear_window(s); s->mode = SILENCE_TRIM; - flush(s, out, outlink, &nb_samples_written, &ret); + flush(s, out, outlink, &nb_samples_written, &ret, 1); goto silence_trim; } } s->mode = SILENCE_COPY_FLUSH; - flush(s, out, outlink, &nb_samples_written, &ret); + flush(s, out, outlink, &nb_samples_written, &ret, 0); goto silence_copy_flush; } } } - flush(s, out, outlink, &nb_samples_written, &ret); + flush(s, out, outlink, &nb_samples_written, &ret, 0); } else { - memcpy(obuf, ibuf, sizeof(double) * nbs * inlink->channels); + memcpy(obuf, ibuf, sizeof(double) * nbs * outlink->channels); out->pts = s->next_pts; s->next_pts += av_rescale_q(out->nb_samples, @@ -415,11 +537,11 @@ silence_copy: case SILENCE_COPY_FLUSH: silence_copy_flush: nbs = s->stop_holdoff_end - s->stop_holdoff_offset; - nbs -= nbs % inlink->channels; + nbs -= nbs % outlink->channels; if (!nbs) break; - out = ff_get_audio_buffer(inlink, nbs / inlink->channels); + out = ff_get_audio_buffer(outlink, nbs / outlink->channels); if (!out) { av_frame_free(&in); return AVERROR(ENOMEM); @@ -439,6 +561,8 @@ silence_copy_flush: if (s->stop_holdoff_offset == s->stop_holdoff_end) { s->stop_holdoff_offset = 0; s->stop_holdoff_end = 0; + s->stop_silence_offset = 0; + s->stop_silence_end = 0; s->mode = SILENCE_COPY; goto silence_copy; } @@ -519,7 +643,9 @@ static av_cold void uninit(AVFilterContext *ctx) SilenceRemoveContext *s = ctx->priv; av_freep(&s->start_holdoff); + av_freep(&s->start_silence_hold); av_freep(&s->stop_holdoff); + av_freep(&s->stop_silence_hold); av_freep(&s->window); } @@ -542,7 +668,7 @@ static const AVFilterPad silenceremove_outputs[] = { { NULL } }; -AVFilter ff_af_silenceremove = { +const AVFilter ff_af_silenceremove = { .name = "silenceremove", .description = NULL_IF_CONFIG_SMALL("Remove silence."), .priv_size = sizeof(SilenceRemoveContext),