From d41e4825e6e02a693661ae7b055b081411e8b1dc Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Tue, 15 May 2018 00:23:44 +0200
Subject: [PATCH] Support audio-only FFmpeg inputs. Somewhat wonky, though.

---
 ffmpeg_capture.cpp  | 161 +++++++++++++++++++++++++-------------------
 mixer.cpp           |  20 ++++--
 mixer.h             |   2 +-
 ref_counted_frame.h |  10 ++-
 4 files changed, 114 insertions(+), 79 deletions(-)

diff --git a/ffmpeg_capture.cpp b/ffmpeg_capture.cpp
index 7bd9278..5f95da2 100644
--- a/ffmpeg_capture.cpp
+++ b/ffmpeg_capture.cpp
@@ -380,29 +380,36 @@ bool FFmpegCapture::play_video(const string &pathname)
 	}
 
 	int video_stream_index = find_stream_index(format_ctx.get(), AVMEDIA_TYPE_VIDEO);
-	if (video_stream_index == -1) {
-		fprintf(stderr, "%s: No video stream found\n", pathname.c_str());
-		return false;
-	}
-
 	int audio_stream_index = find_stream_index(format_ctx.get(), AVMEDIA_TYPE_AUDIO);
 
-	// Open video decoder.
-	const AVCodecParameters *video_codecpar = format_ctx->streams[video_stream_index]->codecpar;
-	AVCodec *video_codec = avcodec_find_decoder(video_codecpar->codec_id);
-	video_timebase = format_ctx->streams[video_stream_index]->time_base;
-	AVCodecContextWithDeleter video_codec_ctx = avcodec_alloc_context3_unique(nullptr);
-	if (avcodec_parameters_to_context(video_codec_ctx.get(), video_codecpar) < 0) {
-		fprintf(stderr, "%s: Cannot fill video codec parameters\n", pathname.c_str());
+	if (video_stream_index == -1 && audio_stream_index == -1) {
+		fprintf(stderr, "%s: No audio nor video stream found\n", pathname.c_str());
 		return false;
 	}
-	if (video_codec == nullptr) {
-		fprintf(stderr, "%s: Cannot find video decoder\n", pathname.c_str());
-		return false;
-	}
-	if (avcodec_open2(video_codec_ctx.get(), video_codec, nullptr) < 0) {
-		fprintf(stderr, "%s: Cannot open video decoder\n", pathname.c_str());
-		return false;
+	if (video_stream_index == -1) {
+		fprintf(stderr, "%s: No video stream found, assuming audio-only.\n", pathname.c_str());
+	}
+	const bool audio_only_stream = (video_stream_index == -1);
+
+	// Open video decoder, if we have video.
+	AVCodecContextWithDeleter video_codec_ctx;
+	if (video_stream_index != -1) {
+		const AVCodecParameters *video_codecpar = format_ctx->streams[video_stream_index]->codecpar;
+		AVCodec *video_codec = avcodec_find_decoder(video_codecpar->codec_id);
+		video_timebase = format_ctx->streams[video_stream_index]->time_base;
+		video_codec_ctx = avcodec_alloc_context3_unique(nullptr);
+		if (avcodec_parameters_to_context(video_codec_ctx.get(), video_codecpar) < 0) {
+			fprintf(stderr, "%s: Cannot fill video codec parameters\n", pathname.c_str());
+			return false;
+		}
+		if (video_codec == nullptr) {
+			fprintf(stderr, "%s: Cannot find video decoder\n", pathname.c_str());
+			return false;
+		}
+		if (avcodec_open2(video_codec_ctx.get(), video_codec, nullptr) < 0) {
+			fprintf(stderr, "%s: Cannot open video decoder\n", pathname.c_str());
+			return false;
+		}
 	}
 	unique_ptr<AVCodecContext, decltype(avcodec_close)*> video_codec_ctx_cleanup(
 		video_codec_ctx.get(), avcodec_close);
@@ -448,7 +455,7 @@ bool FFmpegCapture::play_video(const string &pathname)
 		if (error) {
 			return false;
 		}
-		if (frame == nullptr) {
+		if (frame == nullptr && !(audio_only_stream && audio_frame->len > 0)) {
 			// EOF. Loop back to the start if we can.
 			if (av_seek_frame(format_ctx.get(), /*stream_index=*/-1, /*timestamp=*/0, /*flags=*/0) < 0) {
 				fprintf(stderr, "%s: Rewind failed, not looping.\n", pathname.c_str());
@@ -471,50 +478,60 @@ bool FFmpegCapture::play_video(const string &pathname)
 			continue;
 		}
 
-		VideoFormat video_format = construct_video_format(frame.get(), video_timebase);
-		UniqueFrame video_frame = make_video_frame(frame.get(), pathname, &error);
-		if (error) {
-			return false;
+		VideoFormat video_format;
+		UniqueFrame video_frame;
+		if (!audio_only_stream) {
+			video_format = construct_video_format(frame.get(), video_timebase);
+			video_frame = make_video_frame(frame.get(), pathname, &error);
+			if (error) {
+				return false;
+			}
 		}
-		for ( ;; ) {
+		int64_t frame_pts = audio_only_stream ? audio_pts : frame->pts;
+		AVRational timebase = audio_only_stream ? audio_timebase : video_timebase;
+		for ( ;; ) {  // Try sending the frame in a loop as long as we get interrupted (then break).
 			if (last_pts == 0 && pts_origin == 0) {
-				pts_origin = frame->pts;
+				pts_origin = frame_pts;
 			}
-			next_frame_start = compute_frame_start(frame->pts, pts_origin, video_timebase, start, rate);
-			if (first_frame && last_frame_was_connected) {
-				// If reconnect took more than one second, this is probably a live feed,
-				// and we should reset the resampler. (Or the rate is really, really low,
-				// in which case a reset on the first frame is fine anyway.)
-				if (duration<double>(next_frame_start - last_frame).count() >= 1.0) {
-					last_frame_was_connected = false;
+			next_frame_start = compute_frame_start(frame_pts, pts_origin, timebase, start, rate);
+			if (audio_only_stream) {
+				audio_frame->received_timestamp = next_frame_start;
+			} else {
+				if (first_frame && last_frame_was_connected) {
+					// If reconnect took more than one second, this is probably a live feed,
+					// and we should reset the resampler. (Or the rate is really, really low,
+					// in which case a reset on the first frame is fine anyway.)
+					if (duration<double>(next_frame_start - last_frame).count() >= 1.0) {
+						last_frame_was_connected = false;
+					}
+				}
+				video_frame->received_timestamp = next_frame_start;
+
+				// The easiest way to get all the rate conversions etc. right is to move the
+				// audio PTS into the video PTS timebase and go from there. (We'll get some
+				// rounding issues, but they should not be a big problem.)
+				int64_t audio_pts_as_video_pts = av_rescale_q(audio_pts, audio_timebase, video_timebase);
+				audio_frame->received_timestamp = compute_frame_start(audio_pts_as_video_pts, pts_origin, video_timebase, start, rate);
+
+				if (audio_frame->len != 0) {
+					// The received timestamps in Nageru are measured after we've just received the frame.
+					// However, pts (especially audio pts) is at the _beginning_ of the frame.
+					// If we have locked audio, the distinction doesn't really matter, as pts is
+					// on a relative scale and a fixed offset is fine. But if we don't, we will have
+					// a different number of samples each time, which will cause huge audio jitter
+					// and throw off the resampler.
+					//
+					// In a sense, we should have compensated by adding the frame and audio lengths
+					// to video_frame->received_timestamp and audio_frame->received_timestamp respectively,
+					// but that would mean extra waiting in sleep_until(). All we need is that they
+					// are correct relative to each other, though (and to the other frames we send),
+					// so just align the end of the audio frame, and we're fine.
+					size_t num_samples = (audio_frame->len * 8) / audio_format.bits_per_sample / audio_format.num_channels;
+					double offset = double(num_samples) / OUTPUT_FREQUENCY -
+						double(video_format.frame_rate_den) / video_format.frame_rate_nom;
+					audio_frame->received_timestamp += duration_cast<steady_clock::duration>(duration<double>(offset));
 				}
-			}
-			video_frame->received_timestamp = next_frame_start;
-
-			// The easiest way to get all the rate conversions etc. right is to move the
-			// audio PTS into the video PTS timebase and go from there. (We'll get some
-			// rounding issues, but they should not be a big problem.)
-			int64_t audio_pts_as_video_pts = av_rescale_q(audio_pts, audio_timebase, video_timebase);
-			audio_frame->received_timestamp = compute_frame_start(audio_pts_as_video_pts, pts_origin, video_timebase, start, rate);
-
-			if (audio_frame->len != 0) {
-				// The received timestamps in Nageru are measured after we've just received the frame.
-				// However, pts (especially audio pts) is at the _beginning_ of the frame.
-				// If we have locked audio, the distinction doesn't really matter, as pts is
-				// on a relative scale and a fixed offset is fine. But if we don't, we will have
-				// a different number of samples each time, which will cause huge audio jitter
-				// and throw off the resampler.
-				//
-				// In a sense, we should have compensated by adding the frame and audio lengths
-				// to video_frame->received_timestamp and audio_frame->received_timestamp respectively,
-				// but that would mean extra waiting in sleep_until(). All we need is that they
-				// are correct relative to each other, though (and to the other frames we send),
-				// so just align the end of the audio frame, and we're fine.
-				size_t num_samples = (audio_frame->len * 8) / audio_format.bits_per_sample / audio_format.num_channels;
-				double offset = double(num_samples) / OUTPUT_FREQUENCY -
-					double(video_format.frame_rate_den) / video_format.frame_rate_nom;
-				audio_frame->received_timestamp += duration_cast<steady_clock::duration>(duration<double>(offset));
 			}
 
 			steady_clock::time_point now = steady_clock::now();
@@ -526,7 +543,7 @@ bool FFmpegCapture::play_video(const string &pathname)
 				fprintf(stderr, "%s: Playback %.0f ms behind, resetting time scale\n",
 					pathname.c_str(),
 					1e3 * duration<double>(now - next_frame_start).count());
-				pts_origin = frame->pts;
+				pts_origin = frame_pts;
 				start = next_frame_start = now;
 				timecode += MAX_FPS * 2 + 1;
 			}
@@ -542,7 +559,7 @@ bool FFmpegCapture::play_video(const string &pathname)
 				// audio discontinuity.)
 				timecode += MAX_FPS * 2 + 1;
 			}
-			frame_callback(frame->pts, video_timebase, audio_pts, audio_timebase, timecode++,
+			frame_callback(frame_pts, video_timebase, audio_pts, audio_timebase, timecode++,
 				video_frame.get_and_release(), 0, video_format,
 				audio_frame.get_and_release(), 0, audio_format);
 			first_frame = false;
@@ -571,7 +588,7 @@ bool FFmpegCapture::play_video(const string &pathname)
 				}
 			}
 		}
-		last_pts = frame->pts;
+		last_pts = frame_pts;
 	}
 	return true;
 }
@@ -685,14 +702,18 @@ AVFrameWithDeleter FFmpegCapture::decode_frame(AVFormatContext *format_ctx, AVCo
 			}
 		}
 
-		// Decode video, if we have a frame.
-		int err = avcodec_receive_frame(video_codec_ctx, video_avframe.get());
-		if (err == 0) {
-			frame_finished = true;
-			break;
-		} else if (err != AVERROR(EAGAIN)) {
-			fprintf(stderr, "%s: Cannot receive frame from video codec.\n", pathname.c_str());
-			*error = true;
+		if (video_codec_ctx != nullptr) {
+			// Decode video, if we have a frame.
+			int err = avcodec_receive_frame(video_codec_ctx, video_avframe.get());
+			if (err == 0) {
+				frame_finished = true;
+				break;
+			} else if (err != AVERROR(EAGAIN)) {
+				fprintf(stderr, "%s: Cannot receive frame from video codec.\n", pathname.c_str());
+				*error = true;
+				return AVFrameWithDeleter(nullptr);
+			}
+		} else {
 			return AVFrameWithDeleter(nullptr);
 		}
 	} while (!eof);
diff --git a/mixer.cpp b/mixer.cpp
index deaa8e7..b26e097 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -731,9 +731,6 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
 		}
 	}
 
-	int64_t frame_length = int64_t(TIMEBASE) * video_format.frame_rate_den / video_format.frame_rate_nom;
-	assert(frame_length > 0);
-
 	size_t num_samples = (audio_frame.len > audio_offset) ? (audio_frame.len - audio_offset) / audio_format.num_channels / (audio_format.bits_per_sample / 8) : 0;
 	if (num_samples > OUTPUT_FREQUENCY / 10 && card->type != CardType::FFMPEG_INPUT) {
 		printf("%s: Dropping frame with implausible audio length (len=%d, offset=%d) [timecode=0x%04x video_len=%d video_offset=%d video_format=%x)\n",
@@ -748,6 +745,17 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
 		return;
 	}
 
+	int64_t frame_length;
+	bool audio_only_frame = false;
+	if (video_frame.len - video_offset == 0 && num_samples > 0) {
+		// Audio-only frame (probably from FFmpegCapture).
+		frame_length = int64_t(TIMEBASE) * num_samples / OUTPUT_FREQUENCY;
+		audio_only_frame = true;
+	} else {
+		frame_length = int64_t(TIMEBASE) * video_format.frame_rate_den / video_format.frame_rate_nom;
+	}
+	assert(frame_length > 0);
+
 	int dropped_frames = 0;
 	if (card->last_timecode != -1) {
 		dropped_frames = unwrap_timecode(timecode, card->last_timecode) - card->last_timecode - 1;
 	}
@@ -819,7 +827,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
 	}
 
 	// Still send on the information that we _had_ a frame, even though it's corrupted,
-	// so that pts can go up accordingly.
+	// so that pts can go up accordingly. (This is also used for audio-only frames.)
 	{
 		unique_lock<mutex> lock(card_mutex);
 		CaptureCard::NewFrame new_frame;
@@ -827,6 +835,7 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
 		new_frame.length = frame_length;
 		new_frame.interlaced = false;
 		new_frame.dropped_frames = dropped_frames;
+		new_frame.audio_only = audio_only_frame;
 		new_frame.received_timestamp = video_frame.received_timestamp;
 		card->new_frames.push_back(move(new_frame));
 		card->jitter_history.frame_arrived(video_frame.received_timestamp, frame_length, dropped_frames);
@@ -1032,7 +1041,8 @@ void Mixer::thread_func()
 		if (card_index == master_card_index || !has_new_frame[card_index]) {
 			continue;
 		}
-		if (new_frames[card_index].frame->len == 0) {
+		if (new_frames[card_index].frame->len == 0 &&
+		    !new_frames[card_index].audio_only) {
 			++new_frames[card_index].dropped_frames;
 		}
 		if (new_frames[card_index].dropped_frames > 0) {
diff --git a/mixer.h b/mixer.h
index 32a1ea4..226c91d 100644
--- a/mixer.h
+++ b/mixer.h
@@ -525,7 +525,7 @@ private:
 	struct NewFrame {
 		RefCountedFrame frame;
 		int64_t length;  // In TIMEBASE units.
-		bool interlaced;
+		bool interlaced, audio_only = false;
 		unsigned field;  // Which field (0 or 1) of the frame to use. Always 0 for progressive.
 		std::function<void()> upload_func;  // Needs to be called to actually upload the texture to OpenGL.
 		unsigned dropped_frames = 0;  // Number of dropped frames before this one.
diff --git a/ref_counted_frame.h b/ref_counted_frame.h
index 59a1686..b3a8187 100644
--- a/ref_counted_frame.h
+++ b/ref_counted_frame.h
@@ -51,9 +51,13 @@ public:
 	bmusb::FrameAllocator::Frame get_and_release()
 	{
 		bmusb::FrameAllocator::Frame *ptr = release();
-		bmusb::FrameAllocator::Frame frame = *ptr;
-		delete ptr;
-		return frame;
+		if (ptr == nullptr) {
+			return bmusb::FrameAllocator::Frame();
+		} else {
+			bmusb::FrameAllocator::Frame frame = *ptr;
+			delete ptr;
+			return frame;
+		}
 	}
 };
-- 
2.39.2