]> git.sesse.net Git - ffmpeg/blob - libavfilter/af_speechnorm.c
avfilter: Constify all AVFilters
[ffmpeg] / libavfilter / af_speechnorm.c
1 /*
2  * Copyright (c) 2020 Paul B Mahol
3  *
4  * Speech Normalizer
5  *
6  * This file is part of FFmpeg.
7  *
8  * FFmpeg is free software; you can redistribute it and/or
9  * modify it under the terms of the GNU Lesser General Public
10  * License as published by the Free Software Foundation; either
11  * version 2.1 of the License, or (at your option) any later version.
12  *
13  * FFmpeg is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16  * Lesser General Public License for more details.
17  *
18  * You should have received a copy of the GNU Lesser General Public
19  * License along with FFmpeg; if not, write to the Free Software
20  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21  */
22
23 /**
24  * @file
25  * Speech Normalizer
26  */
27
28 #include <float.h>
29
30 #include "libavutil/avassert.h"
31 #include "libavutil/opt.h"
32
33 #define FF_BUFQUEUE_SIZE (1024)
34 #include "bufferqueue.h"
35
36 #include "audio.h"
37 #include "avfilter.h"
38 #include "filters.h"
39 #include "internal.h"
40
41 #define MAX_ITEMS  882000
42 #define MIN_PEAK (1. / 32768.)
43
/**
 * One half-period of the signal: a maximal run of consecutive samples that
 * all lie on the same side of zero (or a run force-split once it exceeds
 * max_period samples, see ANALYZE_CHANNEL()).
 */
typedef struct PeriodItem {
    int size;        ///< number of samples in this half-period
    int type;        ///< 0 while still being filled, 1 once finalized by the analyzer
    double max_peak; ///< absolute peak value seen within this half-period
} PeriodItem;
49
/**
 * Per-channel analysis and filtering state.
 *
 * pi[] is used as a ring buffer of half-periods: the analysis pass appends
 * at pi_end while the filtering pass consumes from pi_start.
 */
typedef struct ChannelContext {
    int state;          ///< sign of the current half-period; -1 until the first sample is seen
    int bypass;         ///< set when this channel is excluded via the "channels" option (linked mode)
    PeriodItem pi[MAX_ITEMS];
    double gain_state;  ///< most recently computed gain for this channel
    double pi_max_peak; ///< peak of the half-period currently being consumed
    int pi_start;       ///< ring-buffer read index
    int pi_end;         ///< ring-buffer write index
    int pi_size;        ///< samples left to consume from the current half-period
} ChannelContext;
60
typedef struct SpeechNormalizerContext {
    const AVClass *class;

    double peak_value;      ///< target peak value ("peak" option)
    double max_expansion;   ///< maximum allowed expansion factor ("expansion")
    double max_compression; ///< maximum allowed compression factor ("compression")
    double threshold_value; ///< expansion/compression decision threshold ("threshold")
    double raise_amount;    ///< per-period gain raise step ("raise")
    double fall_amount;     ///< per-period gain fall step ("fall")
    uint64_t channels;      ///< channel-layout mask of channels to process
    int invert;             ///< invert the threshold comparison ("invert")
    int link;               ///< filter all channels with a single linked gain ("link")

    ChannelContext *cc;     ///< per-channel state, allocated in config_input()
    double prev_gain;       ///< previous linked gain, used for interpolation in linked mode

    int max_period;         ///< half-period size cap: sample_rate / 10 (see config_input())
    int eof;                ///< set once input EOF has been acknowledged
    int64_t pts;            ///< pts to report when signalling EOF downstream

    struct FFBufQueue queue; ///< frames buffered while analysis runs ahead of filtering

    /* per-sample-format implementations, selected in config_input();
     * filter_channels[0] is the independent path, [1] the linked path */
    void (*analyze_channel)(AVFilterContext *ctx, ChannelContext *cc,
                            const uint8_t *srcp, int nb_samples);
    void (*filter_channels[2])(AVFilterContext *ctx,
                               AVFrame *in, int nb_samples);
} SpeechNormalizerContext;
88
#define OFFSET(x) offsetof(SpeechNormalizerContext, x)
/* all options may also be changed at runtime via process_command() */
#define FLAGS AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM

static const AVOption speechnorm_options[] = {
    { "peak", "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS },
    { "p",    "set the peak value", OFFSET(peak_value), AV_OPT_TYPE_DOUBLE, {.dbl=0.95}, 0.0, 1.0, FLAGS },
    { "expansion", "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "e",         "set the max expansion factor", OFFSET(max_expansion), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "compression", "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "c",           "set the max compression factor", OFFSET(max_compression), AV_OPT_TYPE_DOUBLE, {.dbl=2.0}, 1.0, 50.0, FLAGS },
    { "threshold", "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS },
    { "t",         "set the threshold value", OFFSET(threshold_value), AV_OPT_TYPE_DOUBLE, {.dbl=0}, 0.0, 1.0, FLAGS },
    { "raise", "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    { "r",     "set the expansion raising amount", OFFSET(raise_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    { "fall", "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    { "f",    "set the compression raising amount", OFFSET(fall_amount), AV_OPT_TYPE_DOUBLE, {.dbl=0.001}, 0.0, 1.0, FLAGS },
    /* short form is "h" because "c" is already taken by compression */
    { "channels", "set channels to filter", OFFSET(channels), AV_OPT_TYPE_CHANNEL_LAYOUT, {.i64=-1}, INT64_MIN, INT64_MAX, FLAGS },
    { "h",        "set channels to filter", OFFSET(channels), AV_OPT_TYPE_CHANNEL_LAYOUT, {.i64=-1}, INT64_MIN, INT64_MAX, FLAGS },
    { "invert", "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { "i",      "set inverted filtering", OFFSET(invert), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { "link", "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { "l",    "set linked channels filtering", OFFSET(link), AV_OPT_TYPE_BOOL, {.i64=0}, 0, 1, FLAGS },
    { NULL }
};

AVFILTER_DEFINE_CLASS(speechnorm);
115
116 static int query_formats(AVFilterContext *ctx)
117 {
118     AVFilterFormats *formats;
119     AVFilterChannelLayouts *layouts;
120     static const enum AVSampleFormat sample_fmts[] = {
121         AV_SAMPLE_FMT_FLTP, AV_SAMPLE_FMT_DBLP,
122         AV_SAMPLE_FMT_NONE
123     };
124     int ret;
125
126     layouts = ff_all_channel_counts();
127     if (!layouts)
128         return AVERROR(ENOMEM);
129     ret = ff_set_common_channel_layouts(ctx, layouts);
130     if (ret < 0)
131         return ret;
132
133     formats = ff_make_format_list(sample_fmts);
134     if (!formats)
135         return AVERROR(ENOMEM);
136     ret = ff_set_common_formats(ctx, formats);
137     if (ret < 0)
138         return ret;
139
140     formats = ff_all_samplerates();
141     if (!formats)
142         return AVERROR(ENOMEM);
143     return ff_set_common_samplerates(ctx, formats);
144 }
145
146 static int get_pi_samples(PeriodItem *pi, int start, int end, int remain)
147 {
148     int sum;
149
150     if (pi[start].type == 0)
151         return remain;
152
153     sum = remain;
154     while (start != end) {
155         start++;
156         if (start >= MAX_ITEMS)
157             start = 0;
158         if (pi[start].type == 0)
159             break;
160         av_assert0(pi[start].size > 0);
161         sum += pi[start].size;
162     }
163
164     return sum;
165 }
166
167 static int available_samples(AVFilterContext *ctx)
168 {
169     SpeechNormalizerContext *s = ctx->priv;
170     AVFilterLink *inlink = ctx->inputs[0];
171     int min_pi_nb_samples;
172
173     min_pi_nb_samples = get_pi_samples(s->cc[0].pi, s->cc[0].pi_start, s->cc[0].pi_end, s->cc[0].pi_size);
174     for (int ch = 1; ch < inlink->channels && min_pi_nb_samples > 0; ch++) {
175         ChannelContext *cc = &s->cc[ch];
176
177         min_pi_nb_samples = FFMIN(min_pi_nb_samples, get_pi_samples(cc->pi, cc->pi_start, cc->pi_end, cc->pi_size));
178     }
179
180     return min_pi_nb_samples;
181 }
182
183 static void consume_pi(ChannelContext *cc, int nb_samples)
184 {
185     if (cc->pi_size >= nb_samples) {
186         cc->pi_size -= nb_samples;
187     } else {
188         av_assert0(0);
189     }
190 }
191
192 static double next_gain(AVFilterContext *ctx, double pi_max_peak, int bypass, double state)
193 {
194     SpeechNormalizerContext *s = ctx->priv;
195     const double expansion = FFMIN(s->max_expansion, s->peak_value / pi_max_peak);
196     const double compression = 1. / s->max_compression;
197     const int type = s->invert ? pi_max_peak <= s->threshold_value : pi_max_peak >= s->threshold_value;
198
199     if (bypass) {
200         return 1.;
201     } else if (type) {
202         return FFMIN(expansion, state + s->raise_amount);
203     } else {
204         return FFMIN(expansion, FFMAX(compression, state - s->fall_amount));
205     }
206 }
207
/**
 * Advance to the next half-period for one channel.
 *
 * Once the current half-period has been fully consumed (pi_size == 0), pop
 * the next item from the ring buffer, load its size and peak, and recompute
 * the channel gain from that peak.
 */
static void next_pi(AVFilterContext *ctx, ChannelContext *cc, int bypass)
{
    av_assert0(cc->pi_size >= 0);
    if (cc->pi_size == 0) {
        SpeechNormalizerContext *s = ctx->priv;
        int start = cc->pi_start;

        /* before EOF only finalized (type > 0) items may be consumed */
        av_assert0(cc->pi[start].size > 0);
        av_assert0(cc->pi[start].type > 0 || s->eof);
        cc->pi_size = cc->pi[start].size;
        cc->pi_max_peak = cc->pi[start].max_peak;
        av_assert0(cc->pi_start != cc->pi_end || s->eof);
        /* advance the read index with ring wrap-around */
        start++;
        if (start >= MAX_ITEMS)
            start = 0;
        cc->pi_start = start;
        cc->gain_state = next_gain(ctx, cc->pi_max_peak, bypass, cc->gain_state);
    }
}
227
/**
 * Lowest gain that would be applied over (at least) the next @p max_size
 * samples of this channel, starting from the current gain state and walking
 * the pending half-periods in the ring buffer without consuming them.
 * Used by the linked filtering path so every channel gets the same gain.
 */
static double min_gain(AVFilterContext *ctx, ChannelContext *cc, int max_size)
{
    SpeechNormalizerContext *s = ctx->priv;
    double min_gain = s->max_expansion;
    double gain_state = cc->gain_state;
    int size = cc->pi_size;
    int idx = cc->pi_start;

    min_gain = FFMIN(min_gain, gain_state);
    while (size <= max_size) {
        if (idx == cc->pi_end) /* no more queued half-periods */
            break;
        gain_state = next_gain(ctx, cc->pi[idx].max_peak, 0, gain_state);
        min_gain = FFMIN(min_gain, gain_state);
        size += cc->pi[idx].size;
        idx++;
        if (idx >= MAX_ITEMS) /* ring wrap-around */
            idx = 0;
    }

    return min_gain;
}
250
/**
 * Analysis pass, instantiated per sample format.
 *
 * Scans the input and splits it into half-periods (runs of samples on one
 * side of zero), recording each run's length and absolute peak in the
 * channel's ring buffer. A run is finalized (type = 1) when the sign flips,
 * provided its peak reached MIN_PEAK, or unconditionally once it grows past
 * s->max_period samples (so silence cannot stall the filter forever).
 */
#define ANALYZE_CHANNEL(name, ptype, zero)                                                 \
static void analyze_channel_## name (AVFilterContext *ctx, ChannelContext *cc,             \
                                     const uint8_t *srcp, int nb_samples)                  \
{                                                                                          \
    SpeechNormalizerContext *s = ctx->priv;                                                \
    const ptype *src = (const ptype *)srcp;                                                \
    int n = 0;                                                                             \
                                                                                           \
    if (cc->state < 0) /* first sample ever seen decides the initial sign */               \
        cc->state = src[0] >= zero;                                                        \
                                                                                           \
    while (n < nb_samples) {                                                               \
        if ((cc->state != (src[n] >= zero)) ||                                             \
            (cc->pi[cc->pi_end].size > s->max_period)) {                                   \
            double max_peak = cc->pi[cc->pi_end].max_peak;                                 \
            int state = cc->state;                                                         \
            cc->state = src[n] >= zero;                                                    \
            av_assert0(cc->pi[cc->pi_end].size > 0);                                       \
            if (cc->pi[cc->pi_end].max_peak >= MIN_PEAK ||                                 \
                cc->pi[cc->pi_end].size > s->max_period) {                                 \
                /* finalize the current item and start a fresh one */                      \
                cc->pi[cc->pi_end].type = 1;                                               \
                cc->pi_end++;                                                              \
                if (cc->pi_end >= MAX_ITEMS)                                               \
                    cc->pi_end = 0;                                                        \
                /* an oversize run keeps its peak; a sign flip starts over */              \
                if (cc->state != state)                                                    \
                    cc->pi[cc->pi_end].max_peak = DBL_MIN;                                 \
                else                                                                       \
                    cc->pi[cc->pi_end].max_peak = max_peak;                                \
                cc->pi[cc->pi_end].type = 0;                                               \
                cc->pi[cc->pi_end].size = 0;                                               \
                av_assert0(cc->pi_end != cc->pi_start);                                    \
            }                                                                              \
        }                                                                                  \
                                                                                           \
        if (cc->state) { /* positive half-period */                                        \
            while (src[n] >= zero) {                                                       \
                cc->pi[cc->pi_end].max_peak = FFMAX(cc->pi[cc->pi_end].max_peak,  src[n]); \
                cc->pi[cc->pi_end].size++;                                                 \
                n++;                                                                       \
                if (n >= nb_samples)                                                       \
                    break;                                                                 \
            }                                                                              \
        } else {         /* negative half-period */                                        \
            while (src[n] < zero) {                                                        \
                cc->pi[cc->pi_end].max_peak = FFMAX(cc->pi[cc->pi_end].max_peak, -src[n]); \
                cc->pi[cc->pi_end].size++;                                                 \
                n++;                                                                       \
                if (n >= nb_samples)                                                       \
                    break;                                                                 \
            }                                                                              \
        }                                                                                  \
    }                                                                                      \
}

ANALYZE_CHANNEL(dbl, double, 0.0)
ANALYZE_CHANNEL(flt, float,  0.f)
307
/**
 * Independent (non-linked) filtering pass, instantiated per sample format.
 *
 * Each channel is scaled by its own gain, kept constant over each
 * half-period; channels excluded by the "channels" mask get unity gain.
 */
#define FILTER_CHANNELS(name, ptype)                                            \
static void filter_channels_## name (AVFilterContext *ctx,                      \
                                     AVFrame *in, int nb_samples)               \
{                                                                               \
    SpeechNormalizerContext *s = ctx->priv;                                     \
    AVFilterLink *inlink = ctx->inputs[0];                                      \
                                                                                \
    for (int ch = 0; ch < inlink->channels; ch++) {                             \
        ChannelContext *cc = &s->cc[ch];                                        \
        ptype *dst = (ptype *)in->extended_data[ch];                            \
        const int bypass = !(av_channel_layout_extract_channel(inlink->channel_layout, ch) & s->channels); \
        int n = 0;                                                              \
                                                                                \
        while (n < nb_samples) {                                                \
            ptype gain;                                                         \
            int size;                                                           \
                                                                                \
            next_pi(ctx, cc, bypass);                                           \
            /* apply the current gain to the rest of this half-period */        \
            size = FFMIN(nb_samples - n, cc->pi_size);                          \
            av_assert0(size > 0);                                               \
            gain = cc->gain_state;                                              \
            consume_pi(cc, size);                                               \
            for (int i = n; i < n + size; i++)                                  \
                dst[i] *= gain;                                                 \
            n += size;                                                          \
        }                                                                       \
    }                                                                           \
}

FILTER_CHANNELS(dbl, double)
FILTER_CHANNELS(flt, float)
339
/**
 * Linear interpolation: returns min when mix == 0 and max when mix == 1.
 */
static double lerp(double min, double max, double mix)
{
    const double delta = max - min;

    return min + delta * mix;
}
344
/**
 * Linked filtering pass, instantiated per sample format.
 *
 * All channels are scaled by a shared gain: the minimum of every channel's
 * min_gain() over the longest pending half-period. The gain is linearly
 * interpolated from the previous shared gain across each chunk to avoid
 * zipper noise. Bypassed channels still consume their periods but keep
 * their samples untouched.
 */
#define FILTER_LINK_CHANNELS(name, ptype)                                       \
static void filter_link_channels_## name (AVFilterContext *ctx,                 \
                                          AVFrame *in, int nb_samples)          \
{                                                                               \
    SpeechNormalizerContext *s = ctx->priv;                                     \
    AVFilterLink *inlink = ctx->inputs[0];                                      \
    int n = 0;                                                                  \
                                                                                \
    while (n < nb_samples) {                                                    \
        int min_size = nb_samples - n;                                          \
        int max_size = 1;                                                       \
        ptype gain = s->max_expansion;                                          \
                                                                                \
        /* align the chunk to the shortest current half-period */               \
        for (int ch = 0; ch < inlink->channels; ch++) {                         \
            ChannelContext *cc = &s->cc[ch];                                    \
                                                                                \
            cc->bypass = !(av_channel_layout_extract_channel(inlink->channel_layout, ch) & s->channels); \
                                                                                \
            next_pi(ctx, cc, cc->bypass);                                       \
            min_size = FFMIN(min_size, cc->pi_size);                            \
            max_size = FFMAX(max_size, cc->pi_size);                            \
        }                                                                       \
                                                                                \
        av_assert0(min_size > 0);                                               \
        /* shared gain: minimum over all non-bypassed channels */               \
        for (int ch = 0; ch < inlink->channels; ch++) {                         \
            ChannelContext *cc = &s->cc[ch];                                    \
                                                                                \
            if (cc->bypass)                                                     \
                continue;                                                       \
            gain = FFMIN(gain, min_gain(ctx, cc, max_size));                    \
        }                                                                       \
                                                                                \
        for (int ch = 0; ch < inlink->channels; ch++) {                         \
            ChannelContext *cc = &s->cc[ch];                                    \
            ptype *dst = (ptype *)in->extended_data[ch];                        \
                                                                                \
            consume_pi(cc, min_size);                                           \
            if (cc->bypass)                                                     \
                continue;                                                       \
                                                                                \
            /* ramp from the previous gain to the new one over the chunk */     \
            for (int i = n; i < n + min_size; i++) {                            \
                ptype g = lerp(s->prev_gain, gain, (i - n) / (double)min_size); \
                dst[i] *= g;                                                    \
            }                                                                   \
        }                                                                       \
                                                                                \
        s->prev_gain = gain;                                                    \
        n += min_size;                                                          \
    }                                                                           \
}

FILTER_LINK_CHANNELS(dbl, double)
FILTER_LINK_CHANNELS(flt, float)
398
399 static int filter_frame(AVFilterContext *ctx)
400 {
401     SpeechNormalizerContext *s = ctx->priv;
402     AVFilterLink *outlink = ctx->outputs[0];
403     AVFilterLink *inlink = ctx->inputs[0];
404     int ret;
405
406     while (s->queue.available > 0) {
407         int min_pi_nb_samples;
408         AVFrame *in;
409
410         in = ff_bufqueue_peek(&s->queue, 0);
411         if (!in)
412             break;
413
414         min_pi_nb_samples = available_samples(ctx);
415         if (min_pi_nb_samples < in->nb_samples && !s->eof)
416             break;
417
418         in = ff_bufqueue_get(&s->queue);
419
420         av_frame_make_writable(in);
421
422         s->filter_channels[s->link](ctx, in, in->nb_samples);
423
424         s->pts = in->pts + in->nb_samples;
425
426         return ff_filter_frame(outlink, in);
427     }
428
429     for (int f = 0; f < ff_inlink_queued_frames(inlink); f++) {
430         AVFrame *in;
431
432         ret = ff_inlink_consume_frame(inlink, &in);
433         if (ret < 0)
434             return ret;
435         if (ret == 0)
436             break;
437
438         ff_bufqueue_add(ctx, &s->queue, in);
439
440         for (int ch = 0; ch < inlink->channels; ch++) {
441             ChannelContext *cc = &s->cc[ch];
442
443             s->analyze_channel(ctx, cc, in->extended_data[ch], in->nb_samples);
444         }
445     }
446
447     return 1;
448 }
449
/**
 * Activate callback: drives the input/analysis/output state machine.
 */
static int activate(AVFilterContext *ctx)
{
    AVFilterLink *inlink = ctx->inputs[0];
    AVFilterLink *outlink = ctx->outputs[0];
    SpeechNormalizerContext *s = ctx->priv;
    int ret, status;
    int64_t pts;

    FF_FILTER_FORWARD_STATUS_BACK(outlink, inlink);

    ret = filter_frame(ctx);
    if (ret <= 0) /* a frame was emitted, or an error occurred */
        return ret;

    /* latch input EOF so queued frames can be flushed */
    if (!s->eof && ff_inlink_acknowledge_status(inlink, &status, &pts)) {
        if (status == AVERROR_EOF)
            s->eof = 1;
    }

    /* everything consumed and flushed: propagate EOF downstream */
    if (s->eof && ff_inlink_queued_samples(inlink) == 0 &&
        s->queue.available == 0) {
        ff_outlink_set_status(outlink, AVERROR_EOF, s->pts);
        return 0;
    }

    /* a queued frame is ready (or must be flushed): re-schedule ourselves */
    if (s->queue.available > 0) {
        AVFrame *in = ff_bufqueue_peek(&s->queue, 0);
        const int nb_samples = available_samples(ctx);

        if (nb_samples >= in->nb_samples || s->eof) {
            ff_filter_set_ready(ctx, 10);
            return 0;
        }
    }

    FF_FILTER_FORWARD_WANTED(outlink, inlink);

    return FFERROR_NOT_READY;
}
489
490 static int config_input(AVFilterLink *inlink)
491 {
492     AVFilterContext *ctx = inlink->dst;
493     SpeechNormalizerContext *s = ctx->priv;
494
495     s->max_period = inlink->sample_rate / 10;
496
497     s->prev_gain = 1.;
498     s->cc = av_calloc(inlink->channels, sizeof(*s->cc));
499     if (!s->cc)
500         return AVERROR(ENOMEM);
501
502     for (int ch = 0; ch < inlink->channels; ch++) {
503         ChannelContext *cc = &s->cc[ch];
504
505         cc->state = -1;
506         cc->gain_state = 1.;
507     }
508
509     switch (inlink->format) {
510     case AV_SAMPLE_FMT_FLTP:
511         s->analyze_channel = analyze_channel_flt;
512         s->filter_channels[0] = filter_channels_flt;
513         s->filter_channels[1] = filter_link_channels_flt;
514         break;
515     case AV_SAMPLE_FMT_DBLP:
516         s->analyze_channel = analyze_channel_dbl;
517         s->filter_channels[0] = filter_channels_dbl;
518         s->filter_channels[1] = filter_link_channels_dbl;
519         break;
520     default:
521         av_assert0(0);
522     }
523
524     return 0;
525 }
526
527 static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
528                            char *res, int res_len, int flags)
529 {
530     SpeechNormalizerContext *s = ctx->priv;
531     int link = s->link;
532     int ret;
533
534     ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
535     if (ret < 0)
536         return ret;
537     if (link != s->link)
538         s->prev_gain = 1.;
539
540     return 0;
541 }
542
/**
 * Release all filter state: queued frames and per-channel contexts.
 */
static av_cold void uninit(AVFilterContext *ctx)
{
    SpeechNormalizerContext *s = ctx->priv;

    ff_bufqueue_discard_all(&s->queue);
    av_freep(&s->cc);
}
550
/* single audio input; per-channel state is set up once the link is known */
static const AVFilterPad inputs[] = {
    {
        .name         = "default",
        .type         = AVMEDIA_TYPE_AUDIO,
        .config_props = config_input,
    },
    { NULL }
};
559
/* single audio output in the negotiated common format */
static const AVFilterPad outputs[] = {
    {
        .name = "default",
        .type = AVMEDIA_TYPE_AUDIO,
    },
    { NULL }
};
567
/* filter registration; uses the activate() scheduling API rather than
 * per-pad filter_frame callbacks because output lags behind input */
const AVFilter ff_af_speechnorm = {
    .name            = "speechnorm",
    .description     = NULL_IF_CONFIG_SMALL("Speech Normalizer."),
    .query_formats   = query_formats,
    .priv_size       = sizeof(SpeechNormalizerContext),
    .priv_class      = &speechnorm_class,
    .activate        = activate,
    .uninit          = uninit,
    .inputs          = inputs,
    .outputs         = outputs,
    .process_command = process_command,
};