2 * Copyright (c) 2012 Pavel Koshevoy <pkoshevoy at gmail dot com>
4 * This file is part of FFmpeg.
6 * FFmpeg is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU Lesser General Public
8 * License as published by the Free Software Foundation; either
9 * version 2.1 of the License, or (at your option) any later version.
11 * FFmpeg is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
14 * Lesser General Public License for more details.
16 * You should have received a copy of the GNU Lesser General Public
17 * License along with FFmpeg; if not, write to the Free Software
18 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
23 * tempo scaling audio filter -- an implementation of WSOLA algorithm
25 * Based on MIT licensed yaeAudioTempoFilter.h and yaeAudioFragment.h
26 * from Apprentice Video player by Pavel Koshevoy.
27 * https://sourceforge.net/projects/apprenticevideo/
29 * An explanation of SOLA algorithm is available at
30 * http://www.surina.net/article/time-and-pitch-scaling.html
32 * WSOLA is very similar to SOLA, only one major difference exists between
33 * these algorithms. SOLA shifts audio fragments along the output stream,
34 * whereas WSOLA shifts audio fragments along the input stream.
36 * The advantage of WSOLA algorithm is that the overlap region size is
37 * always the same, therefore the blending function is constant and
42 #include "libavcodec/avfft.h"
43 #include "libavutil/audioconvert.h"
44 #include "libavutil/avassert.h"
45 #include "libavutil/avstring.h"
46 #include "libavutil/eval.h"
47 #include "libavutil/opt.h"
48 #include "libavutil/samplefmt.h"
54 * A fragment of audio waveform
57 // index of the first sample of this fragment in the overall waveform;
58 // 0: input sample position
59 // 1: output sample position
62 // original packed multi-channel samples:
65 // number of samples in this fragment:
68 // rDFT transform of the down-mixed mono fragment, used for
69 // fast waveform alignment via correlation in frequency domain:
74 * Filter state machine states
80 YAE_OUTPUT_OVERLAP_ADD,
85 * Filter state machine
88 // ring-buffer of input samples, necessary because some times
89 // input fragment position may be adjusted backwards:
92 // ring-buffer maximum capacity, expressed in sample rate time base:
95 // ring-buffer house keeping:
100 // 0: input sample position corresponding to the ring buffer tail
101 // 1: output sample position
105 enum AVSampleFormat format;
107 // number of channels:
110 // row of bytes to skip from one sample to next, across multple channels;
111 // stride = (number-of-channels * bits-per-sample-per-channel) / 8
114 // fragment window size, power-of-two integer:
117 // Hann window coefficients, for feathering
118 // (blending) the overlapping fragment region:
121 // tempo scaling factor:
124 // cumulative alignment drift:
127 // current/previous fragment ring-buffer:
128 AudioFragment frag[2];
130 // current fragment index:
136 // for fast correlation calculation in frequency domain:
137 RDFTContext *real_to_complex;
138 RDFTContext *complex_to_real;
139 FFTSample *correlation;
141 // for managing AVFilterPad.request_frame and AVFilterPad.filter_samples
142 int request_fulfilled;
143 AVFilterBufferRef *dst_buffer;
146 uint64_t nsamples_in;
147 uint64_t nsamples_out;
151 * Reset filter to initial state, do not deallocate existing local buffers.
153 static void yae_clear(ATempoContext *atempo)
161 atempo->state = YAE_LOAD_FRAGMENT;
163 atempo->position[0] = 0;
164 atempo->position[1] = 0;
166 atempo->frag[0].position[0] = 0;
167 atempo->frag[0].position[1] = 0;
168 atempo->frag[0].nsamples = 0;
170 atempo->frag[1].position[0] = 0;
171 atempo->frag[1].position[1] = 0;
172 atempo->frag[1].nsamples = 0;
174 // shift left position of 1st fragment by half a window
175 // so that no re-normalization would be required for
176 // the left half of the 1st fragment:
177 atempo->frag[0].position[0] = -(int64_t)(atempo->window / 2);
178 atempo->frag[0].position[1] = -(int64_t)(atempo->window / 2);
180 avfilter_unref_bufferp(&atempo->dst_buffer);
182 atempo->dst_end = NULL;
184 atempo->request_fulfilled = 0;
185 atempo->nsamples_in = 0;
186 atempo->nsamples_out = 0;
190 * Reset filter to initial state and deallocate all buffers.
192 static void yae_release_buffers(ATempoContext *atempo)
196 av_freep(&atempo->frag[0].data);
197 av_freep(&atempo->frag[1].data);
198 av_freep(&atempo->frag[0].xdat);
199 av_freep(&atempo->frag[1].xdat);
201 av_freep(&atempo->buffer);
202 av_freep(&atempo->hann);
203 av_freep(&atempo->correlation);
205 av_rdft_end(atempo->real_to_complex);
206 atempo->real_to_complex = NULL;
208 av_rdft_end(atempo->complex_to_real);
209 atempo->complex_to_real = NULL;
212 #define REALLOC_OR_FAIL(field, field_size) \
214 void * new_field = av_realloc(field, (field_size)); \
216 yae_release_buffers(atempo); \
217 return AVERROR(ENOMEM); \
223 * Prepare filter for processing audio data of given format,
224 * sample rate and number of channels.
226 static int yae_reset(ATempoContext *atempo,
227 enum AVSampleFormat format,
231 const int sample_size = av_get_bytes_per_sample(format);
232 uint32_t nlevels = 0;
236 atempo->format = format;
237 atempo->channels = channels;
238 atempo->stride = sample_size * channels;
240 // pick a segment window size:
241 atempo->window = sample_rate / 24;
243 // adjust window size to be a power-of-two integer:
244 nlevels = av_log2(atempo->window);
246 av_assert0(pot <= atempo->window);
248 if (pot < atempo->window) {
249 atempo->window = pot * 2;
253 // initialize audio fragment buffers:
254 REALLOC_OR_FAIL(atempo->frag[0].data, atempo->window * atempo->stride);
255 REALLOC_OR_FAIL(atempo->frag[1].data, atempo->window * atempo->stride);
256 REALLOC_OR_FAIL(atempo->frag[0].xdat, atempo->window * sizeof(FFTComplex));
257 REALLOC_OR_FAIL(atempo->frag[1].xdat, atempo->window * sizeof(FFTComplex));
259 // initialize rDFT contexts:
260 av_rdft_end(atempo->real_to_complex);
261 atempo->real_to_complex = NULL;
263 av_rdft_end(atempo->complex_to_real);
264 atempo->complex_to_real = NULL;
266 atempo->real_to_complex = av_rdft_init(nlevels + 1, DFT_R2C);
267 if (!atempo->real_to_complex) {
268 yae_release_buffers(atempo);
269 return AVERROR(ENOMEM);
272 atempo->complex_to_real = av_rdft_init(nlevels + 1, IDFT_C2R);
273 if (!atempo->complex_to_real) {
274 yae_release_buffers(atempo);
275 return AVERROR(ENOMEM);
278 REALLOC_OR_FAIL(atempo->correlation, atempo->window * sizeof(FFTComplex));
280 atempo->ring = atempo->window * 3;
281 REALLOC_OR_FAIL(atempo->buffer, atempo->ring * atempo->stride);
283 // initialize the Hann window function:
284 REALLOC_OR_FAIL(atempo->hann, atempo->window * sizeof(float));
286 for (i = 0; i < atempo->window; i++) {
287 double t = (double)i / (double)(atempo->window - 1);
288 double h = 0.5 * (1.0 - cos(2.0 * M_PI * t));
289 atempo->hann[i] = (float)h;
296 static int yae_set_tempo(AVFilterContext *ctx, const char *arg_tempo)
298 ATempoContext *atempo = ctx->priv;
300 double tempo = av_strtod(arg_tempo, &tail);
303 av_log(ctx, AV_LOG_ERROR, "Invalid tempo value '%s'\n", arg_tempo);
304 return AVERROR(EINVAL);
307 if (tempo < 0.5 || tempo > 2.0) {
308 av_log(ctx, AV_LOG_ERROR, "Tempo value %f exceeds [0.5, 2.0] range\n",
310 return AVERROR(EINVAL);
313 atempo->tempo = tempo;
317 inline static AudioFragment *yae_curr_frag(ATempoContext *atempo)
319 return &atempo->frag[atempo->nfrag % 2];
322 inline static AudioFragment *yae_prev_frag(ATempoContext *atempo)
324 return &atempo->frag[(atempo->nfrag + 1) % 2];
328 * A helper macro for initializing complex data buffer with scalar data
331 #define yae_init_xdat(scalar_type, scalar_max) \
333 const uint8_t *src_end = src + \
334 frag->nsamples * atempo->channels * sizeof(scalar_type); \
336 FFTSample *xdat = frag->xdat; \
339 if (atempo->channels == 1) { \
340 for (; src < src_end; xdat++) { \
341 tmp = *(const scalar_type *)src; \
342 src += sizeof(scalar_type); \
344 *xdat = (FFTSample)tmp; \
347 FFTSample s, max, ti, si; \
350 for (; src < src_end; xdat++) { \
351 tmp = *(const scalar_type *)src; \
352 src += sizeof(scalar_type); \
354 max = (FFTSample)tmp; \
355 s = FFMIN((FFTSample)scalar_max, \
356 (FFTSample)fabsf(max)); \
358 for (i = 1; i < atempo->channels; i++) { \
359 tmp = *(const scalar_type *)src; \
360 src += sizeof(scalar_type); \
362 ti = (FFTSample)tmp; \
363 si = FFMIN((FFTSample)scalar_max, \
364 (FFTSample)fabsf(ti)); \
378 * Initialize complex data buffer of a given audio fragment
379 * with down-mixed mono data of appropriate scalar type.
381 static void yae_downmix(ATempoContext *atempo, AudioFragment *frag)
384 const uint8_t *src = frag->data;
386 // init complex data buffer used for FFT and Correlation:
387 memset(frag->xdat, 0, sizeof(FFTComplex) * atempo->window);
389 if (atempo->format == AV_SAMPLE_FMT_U8) {
390 yae_init_xdat(uint8_t, 127);
391 } else if (atempo->format == AV_SAMPLE_FMT_S16) {
392 yae_init_xdat(int16_t, 32767);
393 } else if (atempo->format == AV_SAMPLE_FMT_S32) {
394 yae_init_xdat(int, 2147483647);
395 } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
396 yae_init_xdat(float, 1);
397 } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
398 yae_init_xdat(double, 1);
403 * Populate the internal data buffer on as-needed basis.
406 * 0 if requested data was already available or was successfully loaded,
407 * AVERROR(EAGAIN) if more input data is required.
409 static int yae_load_data(ATempoContext *atempo,
410 const uint8_t **src_ref,
411 const uint8_t *src_end,
415 const uint8_t *src = *src_ref;
416 const int read_size = stop_here - atempo->position[0];
418 if (stop_here <= atempo->position[0]) {
422 // samples are not expected to be skipped:
423 av_assert0(read_size <= atempo->ring);
425 while (atempo->position[0] < stop_here && src < src_end) {
426 int src_samples = (src_end - src) / atempo->stride;
428 // load data piece-wise, in order to avoid complicating the logic:
429 int nsamples = FFMIN(read_size, src_samples);
433 nsamples = FFMIN(nsamples, atempo->ring);
434 na = FFMIN(nsamples, atempo->ring - atempo->tail);
435 nb = FFMIN(nsamples - na, atempo->ring);
438 uint8_t *a = atempo->buffer + atempo->tail * atempo->stride;
439 memcpy(a, src, na * atempo->stride);
441 src += na * atempo->stride;
442 atempo->position[0] += na;
444 atempo->size = FFMIN(atempo->size + na, atempo->ring);
445 atempo->tail = (atempo->tail + na) % atempo->ring;
447 atempo->size < atempo->ring ?
448 atempo->tail - atempo->size :
453 uint8_t *b = atempo->buffer;
454 memcpy(b, src, nb * atempo->stride);
456 src += nb * atempo->stride;
457 atempo->position[0] += nb;
459 atempo->size = FFMIN(atempo->size + nb, atempo->ring);
460 atempo->tail = (atempo->tail + nb) % atempo->ring;
462 atempo->size < atempo->ring ?
463 atempo->tail - atempo->size :
468 // pass back the updated source buffer pointer:
472 av_assert0(atempo->position[0] <= stop_here);
474 return atempo->position[0] == stop_here ? 0 : AVERROR(EAGAIN);
478 * Populate current audio fragment data buffer.
481 * 0 when the fragment is ready,
482 * AVERROR(EAGAIN) if more input data is required.
484 static int yae_load_frag(ATempoContext *atempo,
485 const uint8_t **src_ref,
486 const uint8_t *src_end)
489 AudioFragment *frag = yae_curr_frag(atempo);
491 int64_t missing, start, zeros;
493 const uint8_t *a, *b;
494 int i0, i1, n0, n1, na, nb;
496 int64_t stop_here = frag->position[0] + atempo->window;
497 if (src_ref && yae_load_data(atempo, src_ref, src_end, stop_here) != 0) {
498 return AVERROR(EAGAIN);
501 // calculate the number of samples we don't have:
503 stop_here > atempo->position[0] ?
504 stop_here - atempo->position[0] : 0;
507 missing < (int64_t)atempo->window ?
508 (uint32_t)(atempo->window - missing) : 0;
510 // setup the output buffer:
511 frag->nsamples = nsamples;
514 start = atempo->position[0] - atempo->size;
517 if (frag->position[0] < start) {
518 // what we don't have we substitute with zeros:
519 zeros = FFMIN(start - frag->position[0], (int64_t)nsamples);
520 av_assert0(zeros != nsamples);
522 memset(dst, 0, zeros * atempo->stride);
523 dst += zeros * atempo->stride;
526 if (zeros == nsamples) {
530 // get the remaining data from the ring buffer:
531 na = (atempo->head < atempo->tail ?
532 atempo->tail - atempo->head :
533 atempo->ring - atempo->head);
535 nb = atempo->head < atempo->tail ? 0 : atempo->tail;
538 av_assert0(nsamples <= zeros + na + nb);
540 a = atempo->buffer + atempo->head * atempo->stride;
543 i0 = frag->position[0] + zeros - start;
544 i1 = i0 < na ? 0 : i0 - na;
546 n0 = i0 < na ? FFMIN(na - i0, (int)(nsamples - zeros)) : 0;
547 n1 = nsamples - zeros - n0;
550 memcpy(dst, a + i0 * atempo->stride, n0 * atempo->stride);
551 dst += n0 * atempo->stride;
555 memcpy(dst, b + i1 * atempo->stride, n1 * atempo->stride);
562 * Prepare for loading next audio fragment.
564 static void yae_advance_to_next_frag(ATempoContext *atempo)
566 const double fragment_step = atempo->tempo * (double)(atempo->window / 2);
568 const AudioFragment *prev;
572 prev = yae_prev_frag(atempo);
573 frag = yae_curr_frag(atempo);
575 frag->position[0] = prev->position[0] + (int64_t)fragment_step;
576 frag->position[1] = prev->position[1] + atempo->window / 2;
581 * Calculate cross-correlation via rDFT.
583 * Multiply two vectors of complex numbers (result of real_to_complex rDFT)
584 * and transform back via complex_to_real rDFT.
586 static void yae_xcorr_via_rdft(FFTSample *xcorr,
587 RDFTContext *complex_to_real,
588 const FFTComplex *xa,
589 const FFTComplex *xb,
592 FFTComplex *xc = (FFTComplex *)xcorr;
595 // NOTE: first element requires special care -- Given Y = rDFT(X),
596 // Im(Y[0]) and Im(Y[N/2]) are always zero, therefore av_rdft_calc
597 // stores Re(Y[N/2]) in place of Im(Y[0]).
599 xc->re = xa->re * xb->re;
600 xc->im = xa->im * xb->im;
605 for (i = 1; i < window; i++, xa++, xb++, xc++) {
606 xc->re = (xa->re * xb->re + xa->im * xb->im);
607 xc->im = (xa->im * xb->re - xa->re * xb->im);
610 // apply inverse rDFT:
611 av_rdft_calc(complex_to_real, xcorr);
615 * Calculate alignment offset for given fragment
616 * relative to the previous fragment.
618 * @return alignment offset of current fragment relative to previous.
620 static int yae_align(AudioFragment *frag,
621 const AudioFragment *prev,
625 FFTSample *correlation,
626 RDFTContext *complex_to_real)
628 int best_offset = -drift;
629 FFTSample best_metric = -FLT_MAX;
636 yae_xcorr_via_rdft(correlation,
638 (const FFTComplex *)prev->xdat,
639 (const FFTComplex *)frag->xdat,
642 // identify search window boundaries:
643 i0 = FFMAX(window / 2 - delta_max - drift, 0);
644 i0 = FFMIN(i0, window);
646 i1 = FFMIN(window / 2 + delta_max - drift, window - window / 16);
649 // identify cross-correlation peaks within search window:
650 xcorr = correlation + i0;
652 for (i = i0; i < i1; i++, xcorr++) {
653 FFTSample metric = *xcorr;
656 FFTSample drifti = (FFTSample)(drift + i);
657 metric *= drifti * (FFTSample)(i - i0) * (FFTSample)(i1 - i);
659 if (metric > best_metric) {
660 best_metric = metric;
661 best_offset = i - window / 2;
669 * Adjust current fragment position for better alignment
670 * with previous fragment.
672 * @return alignment correction.
674 static int yae_adjust_position(ATempoContext *atempo)
676 const AudioFragment *prev = yae_prev_frag(atempo);
677 AudioFragment *frag = yae_curr_frag(atempo);
679 const int delta_max = atempo->window / 2;
680 const int correction = yae_align(frag,
686 atempo->complex_to_real);
689 // adjust fragment position:
690 frag->position[0] -= correction;
692 // clear so that the fragment can be reloaded:
695 // update cumulative correction drift counter:
696 atempo->drift += correction;
703 * A helper macro for blending the overlap region of previous
704 * and current audio fragment.
706 #define yae_blend(scalar_type) \
708 const scalar_type *aaa = (const scalar_type *)a; \
709 const scalar_type *bbb = (const scalar_type *)b; \
711 scalar_type *out = (scalar_type *)dst; \
712 scalar_type *out_end = (scalar_type *)dst_end; \
715 for (i = 0; i < overlap && out < out_end; \
716 i++, atempo->position[1]++, wa++, wb++) { \
721 for (j = 0; j < atempo->channels; \
722 j++, aaa++, bbb++, out++) { \
723 float t0 = (float)*aaa; \
724 float t1 = (float)*bbb; \
727 frag->position[0] + i < 0 ? \
729 (scalar_type)(t0 * w0 + t1 * w1); \
732 dst = (uint8_t *)out; \
736 * Blend the overlap region of previous and current audio fragment
737 * and output the results to the given destination buffer.
740 * 0 if the overlap region was completely stored in the dst buffer,
741 * AVERROR(EAGAIN) if more destination buffer space is required.
743 static int yae_overlap_add(ATempoContext *atempo,
748 const AudioFragment *prev = yae_prev_frag(atempo);
749 const AudioFragment *frag = yae_curr_frag(atempo);
751 const int64_t start_here = FFMAX(atempo->position[1],
754 const int64_t stop_here = FFMIN(prev->position[1] + prev->nsamples,
755 frag->position[1] + frag->nsamples);
757 const int64_t overlap = stop_here - start_here;
759 const int64_t ia = start_here - prev->position[1];
760 const int64_t ib = start_here - frag->position[1];
762 const float *wa = atempo->hann + ia;
763 const float *wb = atempo->hann + ib;
765 const uint8_t *a = prev->data + ia * atempo->stride;
766 const uint8_t *b = frag->data + ib * atempo->stride;
768 uint8_t *dst = *dst_ref;
770 av_assert0(start_here <= stop_here &&
771 frag->position[1] <= start_here &&
772 overlap <= frag->nsamples);
774 if (atempo->format == AV_SAMPLE_FMT_U8) {
776 } else if (atempo->format == AV_SAMPLE_FMT_S16) {
778 } else if (atempo->format == AV_SAMPLE_FMT_S32) {
780 } else if (atempo->format == AV_SAMPLE_FMT_FLT) {
782 } else if (atempo->format == AV_SAMPLE_FMT_DBL) {
786 // pass-back the updated destination buffer pointer:
789 return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
793 * Feed as much data to the filter as it is able to consume
794 * and receive as much processed data in the destination buffer
795 * as it is able to produce or store.
798 yae_apply(ATempoContext *atempo,
799 const uint8_t **src_ref,
800 const uint8_t *src_end,
805 if (atempo->state == YAE_LOAD_FRAGMENT) {
806 // load additional data for the current fragment:
807 if (yae_load_frag(atempo, src_ref, src_end) != 0) {
812 yae_downmix(atempo, yae_curr_frag(atempo));
815 av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
817 // must load the second fragment before alignment can start:
818 if (!atempo->nfrag) {
819 yae_advance_to_next_frag(atempo);
823 atempo->state = YAE_ADJUST_POSITION;
826 if (atempo->state == YAE_ADJUST_POSITION) {
827 // adjust position for better alignment:
828 if (yae_adjust_position(atempo)) {
829 // reload the fragment at the corrected position, so that the
830 // Hann window blending would not require normalization:
831 atempo->state = YAE_RELOAD_FRAGMENT;
833 atempo->state = YAE_OUTPUT_OVERLAP_ADD;
837 if (atempo->state == YAE_RELOAD_FRAGMENT) {
838 // load additional data if necessary due to position adjustment:
839 if (yae_load_frag(atempo, src_ref, src_end) != 0) {
844 yae_downmix(atempo, yae_curr_frag(atempo));
847 av_rdft_calc(atempo->real_to_complex, yae_curr_frag(atempo)->xdat);
849 atempo->state = YAE_OUTPUT_OVERLAP_ADD;
852 if (atempo->state == YAE_OUTPUT_OVERLAP_ADD) {
853 // overlap-add and output the result:
854 if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
858 // advance to the next fragment, repeat:
859 yae_advance_to_next_frag(atempo);
860 atempo->state = YAE_LOAD_FRAGMENT;
866 * Flush any buffered data from the filter.
869 * 0 if all data was completely stored in the dst buffer,
870 * AVERROR(EAGAIN) if more destination buffer space is required.
872 static int yae_flush(ATempoContext *atempo,
876 AudioFragment *frag = yae_curr_frag(atempo);
889 atempo->state = YAE_FLUSH_OUTPUT;
891 if (atempo->position[0] == frag->position[0] + frag->nsamples &&
892 atempo->position[1] == frag->position[1] + frag->nsamples) {
893 // the current fragment is already flushed:
897 if (frag->position[0] + frag->nsamples < atempo->position[0]) {
898 // finish loading the current (possibly partial) fragment:
899 yae_load_frag(atempo, NULL, NULL);
903 yae_downmix(atempo, frag);
906 av_rdft_calc(atempo->real_to_complex, frag->xdat);
908 // align current fragment to previous fragment:
909 if (yae_adjust_position(atempo)) {
910 // reload the current fragment due to adjusted position:
911 yae_load_frag(atempo, NULL, NULL);
916 // flush the overlap region:
917 overlap_end = frag->position[1] + FFMIN(atempo->window / 2,
920 while (atempo->position[1] < overlap_end) {
921 if (yae_overlap_add(atempo, dst_ref, dst_end) != 0) {
922 return AVERROR(EAGAIN);
926 // flush the remaininder of the current fragment:
927 start_here = FFMAX(atempo->position[1], overlap_end);
928 stop_here = frag->position[1] + frag->nsamples;
929 offset = start_here - frag->position[1];
930 av_assert0(start_here <= stop_here && frag->position[1] <= start_here);
932 src = frag->data + offset * atempo->stride;
933 dst = (uint8_t *)*dst_ref;
935 src_size = (int)(stop_here - start_here) * atempo->stride;
936 dst_size = dst_end - dst;
937 nbytes = FFMIN(src_size, dst_size);
939 memcpy(dst, src, nbytes);
942 atempo->position[1] += (nbytes / atempo->stride);
944 // pass-back the updated destination buffer pointer:
945 *dst_ref = (uint8_t *)dst;
947 return atempo->position[1] == stop_here ? 0 : AVERROR(EAGAIN);
950 static av_cold int init(AVFilterContext *ctx, const char *args)
952 ATempoContext *atempo = ctx->priv;
954 // NOTE: this assumes that the caller has memset ctx->priv to 0:
955 atempo->format = AV_SAMPLE_FMT_NONE;
957 atempo->state = YAE_LOAD_FRAGMENT;
959 return args ? yae_set_tempo(ctx, args) : 0;
962 static av_cold void uninit(AVFilterContext *ctx)
964 ATempoContext *atempo = ctx->priv;
965 yae_release_buffers(atempo);
968 static int query_formats(AVFilterContext *ctx)
970 AVFilterChannelLayouts *layouts = NULL;
971 AVFilterFormats *formats = NULL;
973 // WSOLA necessitates an internal sliding window ring buffer
974 // for incoming audio stream.
976 // Planar sample formats are too cumbersome to store in a ring buffer,
977 // therefore planar sample formats are not supported.
979 enum AVSampleFormat sample_fmts[] = {
988 layouts = ff_all_channel_layouts();
990 return AVERROR(ENOMEM);
992 ff_set_common_channel_layouts(ctx, layouts);
994 formats = ff_make_format_list(sample_fmts);
996 return AVERROR(ENOMEM);
998 ff_set_common_formats(ctx, formats);
1000 formats = ff_all_samplerates();
1002 return AVERROR(ENOMEM);
1004 ff_set_common_samplerates(ctx, formats);
1009 static int config_props(AVFilterLink *inlink)
1011 AVFilterContext *ctx = inlink->dst;
1012 ATempoContext *atempo = ctx->priv;
1014 enum AVSampleFormat format = inlink->format;
1015 int sample_rate = (int)inlink->sample_rate;
1016 int channels = av_get_channel_layout_nb_channels(inlink->channel_layout);
1018 return yae_reset(atempo, format, sample_rate, channels);
1021 static void push_samples(ATempoContext *atempo,
1022 AVFilterLink *outlink,
1025 atempo->dst_buffer->audio->sample_rate = outlink->sample_rate;
1026 atempo->dst_buffer->audio->nb_samples = n_out;
1029 atempo->dst_buffer->pts =
1030 av_rescale_q(atempo->nsamples_out,
1031 (AVRational){ 1, outlink->sample_rate },
1032 outlink->time_base);
1034 ff_filter_samples(outlink, atempo->dst_buffer);
1035 atempo->dst_buffer = NULL;
1037 atempo->dst_end = NULL;
1039 atempo->nsamples_out += n_out;
1042 static int filter_samples(AVFilterLink *inlink,
1043 AVFilterBufferRef *src_buffer)
1045 AVFilterContext *ctx = inlink->dst;
1046 ATempoContext *atempo = ctx->priv;
1047 AVFilterLink *outlink = ctx->outputs[0];
1049 int n_in = src_buffer->audio->nb_samples;
1050 int n_out = (int)(0.5 + ((double)n_in) / atempo->tempo);
1052 const uint8_t *src = src_buffer->data[0];
1053 const uint8_t *src_end = src + n_in * atempo->stride;
1055 while (src < src_end) {
1056 if (!atempo->dst_buffer) {
1057 atempo->dst_buffer = ff_get_audio_buffer(outlink,
1060 avfilter_copy_buffer_ref_props(atempo->dst_buffer, src_buffer);
1062 atempo->dst = atempo->dst_buffer->data[0];
1063 atempo->dst_end = atempo->dst + n_out * atempo->stride;
1066 yae_apply(atempo, &src, src_end, &atempo->dst, atempo->dst_end);
1068 if (atempo->dst == atempo->dst_end) {
1069 push_samples(atempo, outlink, n_out);
1070 atempo->request_fulfilled = 1;
1074 atempo->nsamples_in += n_in;
1075 avfilter_unref_bufferp(&src_buffer);
1079 static int request_frame(AVFilterLink *outlink)
1081 AVFilterContext *ctx = outlink->src;
1082 ATempoContext *atempo = ctx->priv;
1085 atempo->request_fulfilled = 0;
1087 ret = ff_request_frame(ctx->inputs[0]);
1089 while (!atempo->request_fulfilled && ret >= 0);
1091 if (ret == AVERROR_EOF) {
1092 // flush the filter:
1093 int n_max = atempo->ring;
1095 int err = AVERROR(EAGAIN);
1097 while (err == AVERROR(EAGAIN)) {
1098 if (!atempo->dst_buffer) {
1099 atempo->dst_buffer = ff_get_audio_buffer(outlink,
1103 atempo->dst = atempo->dst_buffer->data[0];
1104 atempo->dst_end = atempo->dst + n_max * atempo->stride;
1107 err = yae_flush(atempo, &atempo->dst, atempo->dst_end);
1109 n_out = ((atempo->dst - atempo->dst_buffer->data[0]) /
1113 push_samples(atempo, outlink, n_out);
1117 avfilter_unref_bufferp(&atempo->dst_buffer);
1119 atempo->dst_end = NULL;
1127 static int process_command(AVFilterContext *ctx,
1134 return !strcmp(cmd, "tempo") ? yae_set_tempo(ctx, arg) : AVERROR(ENOSYS);
1137 AVFilter avfilter_af_atempo = {
1139 .description = NULL_IF_CONFIG_SMALL("Adjust audio tempo."),
1142 .query_formats = query_formats,
1143 .process_command = process_command,
1144 .priv_size = sizeof(ATempoContext),
1146 .inputs = (const AVFilterPad[]) {
1147 { .name = "default",
1148 .type = AVMEDIA_TYPE_AUDIO,
1149 .filter_samples = filter_samples,
1150 .config_props = config_props,
1151 .min_perms = AV_PERM_READ, },
1155 .outputs = (const AVFilterPad[]) {
1156 { .name = "default",
1157 .request_frame = request_frame,
1158 .type = AVMEDIA_TYPE_AUDIO, },