#include "libavutil/avassert.h"
#include "libavutil/avstring.h"
#include "libavutil/float_dsp.h"
+#include "libavutil/mem_internal.h"
#include "libavutil/opt.h"
#include "libavutil/tx.h"
#include "avfilter.h"
int last_period;
float mem_hp_x[2];
float lastg[NB_BANDS];
- RNNState rnn;
+ float history[FRAME_SIZE];
+ RNNState rnn[2];
AVTXContext *tx, *txi;
av_tx_fn tx_fn, txi_fn;
} DenoiseState;
const AVClass *class;
char *model_name;
+ float mix;
int channels;
DenoiseState *st;
DECLARE_ALIGNED(32, float, window)[WINDOW_SIZE];
DECLARE_ALIGNED(32, float, dct_table)[FFALIGN(NB_BANDS, 4)][FFALIGN(NB_BANDS, 4)];
- RNNModel *model;
+ RNNModel *model[2];
AVFloatDSPContext *fdsp;
} AudioRNNContext;
av_free(model);
}
-static RNNModel *rnnoise_model_from_file(FILE *f)
+static int rnnoise_model_from_file(FILE *f, RNNModel **rnn)
{
- RNNModel *ret;
+ RNNModel *ret = NULL;
DenseLayer *input_dense;
GRULayer *vad_gru;
GRULayer *noise_gru;
int in;
if (fscanf(f, "rnnoise-nu model file version %d\n", &in) != 1 || in != 1)
- return NULL;
+ return AVERROR_INVALIDDATA;
ret = av_calloc(1, sizeof(RNNModel));
if (!ret)
- return NULL;
+ return AVERROR(ENOMEM);
#define ALLOC_LAYER(type, name) \
name = av_calloc(1, sizeof(type)); \
if (!name) { \
rnnoise_model_free(ret); \
- return NULL; \
+ return AVERROR(ENOMEM); \
} \
ret->name = name
#define INPUT_VAL(name) do { \
if (fscanf(f, "%d", &in) != 1 || in < 0 || in > 128) { \
rnnoise_model_free(ret); \
- return NULL; \
+ return AVERROR(EINVAL); \
} \
name = in; \
} while (0)
float *values = av_calloc((len), sizeof(float)); \
if (!values) { \
rnnoise_model_free(ret); \
- return NULL; \
+ return AVERROR(ENOMEM); \
} \
name = values; \
for (int i = 0; i < (len); i++) { \
if (fscanf(f, "%d", &in) != 1) { \
rnnoise_model_free(ret); \
- return NULL; \
+ return AVERROR(EINVAL); \
} \
values[i] = in; \
} \
float *values = av_calloc(FFALIGN((len0), 4) * FFALIGN((len1), 4) * (len2), sizeof(float)); \
if (!values) { \
rnnoise_model_free(ret); \
- return NULL; \
+ return AVERROR(ENOMEM); \
} \
name = values; \
for (int k = 0; k < (len0); k++) { \
for (int j = 0; j < (len1); j++) { \
if (fscanf(f, "%d", &in) != 1) { \
rnnoise_model_free(ret); \
- return NULL; \
+ return AVERROR(EINVAL); \
} \
values[j * (len2) * FFALIGN((len0), 4) + i * FFALIGN((len0), 4) + k] = in; \
} \
} \
} while (0)
+#define NEW_LINE() do { \
+ int c; \
+ while ((c = fgetc(f)) != EOF) { \
+ if (c == '\n') \
+ break; \
+ } \
+ } while (0)
+
#define INPUT_DENSE(name) do { \
INPUT_VAL(name->nb_inputs); \
INPUT_VAL(name->nb_neurons); \
ret->name ## _size = name->nb_neurons; \
INPUT_ACTIVATION(name->activation); \
+ NEW_LINE(); \
INPUT_ARRAY(name->input_weights, name->nb_inputs * name->nb_neurons); \
+ NEW_LINE(); \
INPUT_ARRAY(name->bias, name->nb_neurons); \
+ NEW_LINE(); \
} while (0)
#define INPUT_GRU(name) do { \
INPUT_VAL(name->nb_neurons); \
ret->name ## _size = name->nb_neurons; \
INPUT_ACTIVATION(name->activation); \
+ NEW_LINE(); \
INPUT_ARRAY3(name->input_weights, name->nb_inputs, name->nb_neurons, 3); \
+ NEW_LINE(); \
INPUT_ARRAY3(name->recurrent_weights, name->nb_neurons, name->nb_neurons, 3); \
+ NEW_LINE(); \
INPUT_ARRAY(name->bias, name->nb_neurons * 3); \
+ NEW_LINE(); \
} while (0)
INPUT_DENSE(input_dense);
if (vad_output->nb_neurons != 1) {
rnnoise_model_free(ret);
- return NULL;
+ return AVERROR(EINVAL);
}
- return ret;
+ *rnn = ret;
+
+ return 0;
}
static int query_formats(AVFilterContext *ctx)
s->channels = inlink->channels;
- s->st = av_calloc(s->channels, sizeof(DenoiseState));
+ if (!s->st)
+ s->st = av_calloc(s->channels, sizeof(DenoiseState));
if (!s->st)
return AVERROR(ENOMEM);
for (int i = 0; i < s->channels; i++) {
DenoiseState *st = &s->st[i];
- st->rnn.model = s->model;
- st->rnn.vad_gru_state = av_calloc(sizeof(float), FFALIGN(s->model->vad_gru_size, 16));
- st->rnn.noise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model->noise_gru_size, 16));
- st->rnn.denoise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model->denoise_gru_size, 16));
- if (!st->rnn.vad_gru_state ||
- !st->rnn.noise_gru_state ||
- !st->rnn.denoise_gru_state)
+ st->rnn[0].model = s->model[0];
+ st->rnn[0].vad_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->vad_gru_size, 16));
+ st->rnn[0].noise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->noise_gru_size, 16));
+ st->rnn[0].denoise_gru_state = av_calloc(sizeof(float), FFALIGN(s->model[0]->denoise_gru_size, 16));
+ if (!st->rnn[0].vad_gru_state ||
+ !st->rnn[0].noise_gru_state ||
+ !st->rnn[0].denoise_gru_state)
return AVERROR(ENOMEM);
+ }
+
+ for (int i = 0; i < s->channels; i++) {
+ DenoiseState *st = &s->st[i];
- ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, NULL, 0);
+ if (!st->tx)
+ ret = av_tx_init(&st->tx, &st->tx_fn, AV_TX_FLOAT_FFT, 0, WINDOW_SIZE, NULL, 0);
if (ret < 0)
return ret;
- ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, NULL, 0);
+ if (!st->txi)
+ ret = av_tx_init(&st->txi, &st->txi_fn, AV_TX_FLOAT_FFT, 1, WINDOW_SIZE, NULL, 0);
if (ret < 0)
return ret;
}
static void frame_synthesis(AudioRNNContext *s, DenoiseState *st, float *out, const AVComplexFloat *y)
{
    LOCAL_ALIGNED_32(float, x, [WINDOW_SIZE]);
    const float wet = s->mix;                /* weight of the denoised signal (mix option, [-1,1]) */
    const float dry = 1.f - FFMAX(wet, 0.f); /* weight of the unprocessed input */
    const float *dry_src = st->history;      /* input frame saved by the caller on the previous run */

    /* Back to the time domain, window, then overlap-add with the tail
     * kept from the previous frame. */
    inverse_transform(st, x, y);
    s->fdsp->vector_fmul(x, x, s->window, WINDOW_SIZE);
    s->fdsp->vector_fmac_scalar(x, st->synthesis_mem, 1.f, FRAME_SIZE);
    RNN_COPY(out, x, FRAME_SIZE);
    RNN_COPY(st->synthesis_mem, &x[FRAME_SIZE], FRAME_SIZE);

    /* Dry/wet blend of the processed output with the delayed input. */
    for (int n = 0; n < FRAME_SIZE; n++)
        out[n] = out[n] * wet + dry_src[n] * dry;
}
static inline void xcorr_kernel(const float *x, const float *y, float sum[4], int len)
return xy / sqrtf(1.f + xx * yy);
}
-static const int second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
+static const uint8_t second_check[16] = {0, 0, 3, 2, 3, 2, 5, 2, 3, 2, 3, 2, 5, 2, 3, 2};
static float remove_doubling(float *x, int maxperiod, int minperiod, int N,
int *T0_, int prev_period, float prev_gain)
{
compute_dense(rnn->model->denoise_output, gains, rnn->denoise_gru_state);
}
-static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in)
+static float rnnoise_channel(AudioRNNContext *s, DenoiseState *st, float *out, const float *in,
+ int disabled)
{
AVComplexFloat X[FREQ_SIZE];
AVComplexFloat P[WINDOW_SIZE];
float g[NB_BANDS];
float gf[FREQ_SIZE];
float vad_prob = 0;
+ float *history = st->history;
static const float a_hp[2] = {-1.99599, 0.99600};
static const float b_hp[2] = {-2, 1};
int silence;
biquad(x, st->mem_hp_x, in, b_hp, a_hp, FRAME_SIZE);
silence = compute_frame_features(s, st, X, P, Ex, Ep, Exp, features, x);
- if (!silence) {
- compute_rnn(s, &st->rnn, g, &vad_prob, features);
+ if (!silence && !disabled) {
+ compute_rnn(s, &st->rnn[0], g, &vad_prob, features);
pitch_filter(X, P, Ex, Ep, Exp, g);
for (int i = 0; i < NB_BANDS; i++) {
float alpha = .6f;
}
frame_synthesis(s, st, out, X);
+ memcpy(history, in, FRAME_SIZE * sizeof(*history));
return vad_prob;
}
for (int ch = start; ch < end; ch++) {
rnnoise_channel(s, &s->st[ch],
(float *)out->extended_data[ch],
- (const float *)in->extended_data[ch]);
+ (const float *)in->extended_data[ch],
+ ctx->is_disabled);
}
return 0;
return FFERROR_NOT_READY;
}
-static av_cold int init(AVFilterContext *ctx)
+static int open_model(AVFilterContext *ctx, RNNModel **model)
{
AudioRNNContext *s = ctx->priv;
+ int ret;
FILE *f;
- s->fdsp = avpriv_float_dsp_alloc(0);
- if (!s->fdsp)
- return AVERROR(ENOMEM);
-
if (!s->model_name)
return AVERROR(EINVAL);
f = av_fopen_utf8(s->model_name, "r");
- if (!f)
+ if (!f) {
+ av_log(ctx, AV_LOG_ERROR, "Failed to open model file: %s\n", s->model_name);
return AVERROR(EINVAL);
+ }
- s->model = rnnoise_model_from_file(f);
+ ret = rnnoise_model_from_file(f, model);
fclose(f);
- if (!s->model)
- return AVERROR(EINVAL);
+ if (!*model || ret < 0)
+ return ret;
+
+ return 0;
+}
+
+static av_cold int init(AVFilterContext *ctx)
+{
+ AudioRNNContext *s = ctx->priv;
+ int ret;
+
+ s->fdsp = avpriv_float_dsp_alloc(0);
+ if (!s->fdsp)
+ return AVERROR(ENOMEM);
+
+ ret = open_model(ctx, &s->model[0]);
+ if (ret < 0)
+ return ret;
for (int i = 0; i < FRAME_SIZE; i++) {
s->window[i] = sin(.5*M_PI*sin(.5*M_PI*(i+.5)/FRAME_SIZE) * sin(.5*M_PI*(i+.5)/FRAME_SIZE));
return 0;
}
+static void free_model(AVFilterContext *ctx, int n)
+{
+ AudioRNNContext *s = ctx->priv;
+
+ rnnoise_model_free(s->model[n]);
+ s->model[n] = NULL;
+
+ for (int ch = 0; ch < s->channels && s->st; ch++) {
+ av_freep(&s->st[ch].rnn[n].vad_gru_state);
+ av_freep(&s->st[ch].rnn[n].noise_gru_state);
+ av_freep(&s->st[ch].rnn[n].denoise_gru_state);
+ }
+}
+
+static int process_command(AVFilterContext *ctx, const char *cmd, const char *args,
+ char *res, int res_len, int flags)
+{
+ AudioRNNContext *s = ctx->priv;
+ int ret;
+
+ ret = ff_filter_process_command(ctx, cmd, args, res, res_len, flags);
+ if (ret < 0)
+ return ret;
+
+ ret = open_model(ctx, &s->model[1]);
+ if (ret < 0)
+ return ret;
+
+ FFSWAP(RNNModel *, s->model[0], s->model[1]);
+ for (int ch = 0; ch < s->channels; ch++)
+ FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
+
+ ret = config_input(ctx->inputs[0]);
+ if (ret < 0) {
+ for (int ch = 0; ch < s->channels; ch++)
+ FFSWAP(RNNState, s->st[ch].rnn[0], s->st[ch].rnn[1]);
+ FFSWAP(RNNModel *, s->model[0], s->model[1]);
+ return ret;
+ }
+
+ free_model(ctx, 1);
+ return 0;
+}
+
static av_cold void uninit(AVFilterContext *ctx)
{
    AudioRNNContext *s = ctx->priv;

    av_freep(&s->fdsp);
    free_model(ctx, 0);
    /* A failed runtime model swap can leave a loaded model in the standby
     * slot; release it too so nothing leaks at teardown. Guarded because
     * slot 1 is normally already NULL. */
    if (s->model[1])
        free_model(ctx, 1);
    for (int ch = 0; ch < s->channels && s->st; ch++) {
        av_tx_uninit(&s->st[ch].tx);
        av_tx_uninit(&s->st[ch].txi);
    }
    av_freep(&s->st);
}
};
#define OFFSET(x) offsetof(AudioRNNContext, x)
-#define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM
+#define AF AV_OPT_FLAG_AUDIO_PARAM|AV_OPT_FLAG_FILTERING_PARAM|AV_OPT_FLAG_RUNTIME_PARAM
static const AVOption arnndn_options[] = {
{ "model", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
{ "m", "set model name", OFFSET(model_name), AV_OPT_TYPE_STRING, {.str=NULL}, 0, 0, AF },
+ { "mix", "set output vs input mix", OFFSET(mix), AV_OPT_TYPE_FLOAT, {.dbl=1.0},-1, 1, AF },
{ NULL }
};
AVFILTER_DEFINE_CLASS(arnndn);
-AVFilter ff_af_arnndn = {
+const AVFilter ff_af_arnndn = {
.name = "arnndn",
.description = NULL_IF_CONFIG_SMALL("Reduce noise from speech using Recurrent Neural Networks."),
.query_formats = query_formats,
.uninit = uninit,
.inputs = inputs,
.outputs = outputs,
- .flags = AVFILTER_FLAG_SLICE_THREADS,
+ .flags = AVFILTER_FLAG_SUPPORT_TIMELINE_INTERNAL |
+ AVFILTER_FLAG_SLICE_THREADS,
+ .process_command = process_command,
};