From 25326c82bda01dfa1b86fb4f074d7697705239f8 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Thu, 28 Mar 2019 18:49:01 +0100 Subject: [PATCH] Include the raw audio in the MJPEG output. --- nageru/audio_mixer.cpp | 94 ++++++++++++++++++++++++++++++++++++++++ nageru/audio_mixer.h | 4 ++ nageru/mixer.cpp | 37 ++++++++++++++-- nageru/mixer.h | 5 ++- nageru/mjpeg_encoder.cpp | 56 ++++++++++++++++++++++-- nageru/mjpeg_encoder.h | 4 +- 6 files changed, 190 insertions(+), 10 deletions(-) diff --git a/nageru/audio_mixer.cpp b/nageru/audio_mixer.cpp index 9b588d7..360689b 100644 --- a/nageru/audio_mixer.cpp +++ b/nageru/audio_mixer.cpp @@ -52,6 +52,26 @@ void convert_fixed16_to_fp32(float *dst, size_t out_channel, size_t out_num_chan } } +void convert_fixed16_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels, + const uint8_t *src, size_t in_channel, size_t in_num_channels, + size_t num_samples) +{ + assert(in_channel < in_num_channels); + assert(out_channel < out_num_channels); + src += in_channel * 2; + dst += out_channel; + + for (size_t i = 0; i < num_samples; ++i) { + uint32_t s = uint32_t(uint16_t(le16toh(*(int16_t *)src))) << 16; + + // Keep the sign bit in place, repeat the other 15 bits as far as they go. 
+ *dst = s | ((s & 0x7fffffff) >> 15) | ((s & 0x7fffffff) >> 30); + + src += 2 * in_num_channels; + dst += out_num_channels; + } +} + void convert_fixed24_to_fp32(float *dst, size_t out_channel, size_t out_num_channels, const uint8_t *src, size_t in_channel, size_t in_num_channels, size_t num_samples) @@ -73,6 +93,29 @@ void convert_fixed24_to_fp32(float *dst, size_t out_channel, size_t out_num_chan } } +void convert_fixed24_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels, + const uint8_t *src, size_t in_channel, size_t in_num_channels, + size_t num_samples) +{ + assert(in_channel < in_num_channels); + assert(out_channel < out_num_channels); + src += in_channel * 3; + dst += out_channel; + + for (size_t i = 0; i < num_samples; ++i) { + uint32_t s1 = src[0]; + uint32_t s2 = src[1]; + uint32_t s3 = src[2]; + uint32_t s = (s1 << 8) | (s2 << 16) | (s3 << 24); + + // Keep the sign bit in place, repeat the other 23 bits as far as they go. + *dst = s | ((s & 0x7fffffff) >> 23); + + src += 3 * in_num_channels; + dst += out_num_channels; + } +} + void convert_fixed32_to_fp32(float *dst, size_t out_channel, size_t out_num_channels, const uint8_t *src, size_t in_channel, size_t in_num_channels, size_t num_samples) @@ -91,6 +134,25 @@ void convert_fixed32_to_fp32(float *dst, size_t out_channel, size_t out_num_chan } } +// Basically just a reinterleave. 
+void convert_fixed32_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels, + const uint8_t *src, size_t in_channel, size_t in_num_channels, + size_t num_samples) +{ + assert(in_channel < in_num_channels); + assert(out_channel < out_num_channels); + src += in_channel * 4; + dst += out_channel; + + for (size_t i = 0; i < num_samples; ++i) { + int32_t s = le32toh(*(int32_t *)src); + *dst = s; + + src += 4 * in_num_channels; + dst += out_num_channels; + } +} + float find_peak_plain(const float *samples, size_t num_samples) __attribute__((unused)); float find_peak_plain(const float *samples, size_t num_samples) @@ -294,6 +356,38 @@ bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned return true; } +vector<int32_t> convert_audio_to_fixed32(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, unsigned num_channels) +{ + vector<int32_t> audio; + + if (num_channels > audio_format.num_channels) { + audio.resize(num_samples * num_channels, 0); + } else { + audio.resize(num_samples * num_channels); + } + for (unsigned channel_index = 0; channel_index < num_channels && channel_index < audio_format.num_channels; ++channel_index) { + switch (audio_format.bits_per_sample) { + case 0: + assert(num_samples == 0); + break; + case 16: + convert_fixed16_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples); + break; + case 24: + convert_fixed24_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples); + break; + case 32: + convert_fixed32_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples); + break; + default: + fprintf(stderr, "Cannot handle audio with %u bits per sample\n", audio_format.bits_per_sample); + assert(false); + } + } + + return audio; +} + bool AudioMixer::add_silence(DeviceSpec device_spec, unsigned samples_per_frame, unsigned num_frames) {
AudioDevice *device = find_audio_device(device_spec); diff --git a/nageru/audio_mixer.h b/nageru/audio_mixer.h index 1cf4da3..14e7e85 100644 --- a/nageru/audio_mixer.h +++ b/nageru/audio_mixer.h @@ -37,6 +37,10 @@ namespace bmusb { struct AudioFormat; } // namespace bmusb +// Convert the given audio from {16,24,32}-bit M-channel to 32-bit N-channel PCM. +// Assumes little-endian and chunky, signed PCM throughout. +std::vector<int32_t> convert_audio_to_fixed32(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, unsigned num_destination_channels); + enum EQBand { EQ_BAND_BASS = 0, EQ_BAND_MID, diff --git a/nageru/mixer.cpp b/nageru/mixer.cpp index c91857e..5f38a67 100644 --- a/nageru/mixer.cpp +++ b/nageru/mixer.cpp @@ -785,6 +785,34 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode, if (num_samples > 0) { audio_mixer->add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, audio_frame.received_timestamp); + + // Audio for the MJPEG stream. We don't resample; audio that's not in 48 kHz + // just gets dropped for now. + // + // Only bother doing MJPEG encoding if there are any connected clients + // that want the stream. + if (httpd.get_num_connected_multicam_clients() > 0) { + vector<int32_t> converted_samples = convert_audio_to_fixed32(audio_frame.data + audio_offset, num_samples, audio_format, 2); + lock_guard<mutex> lock(card_mutex); + if (card->new_raw_audio.empty()) { + card->new_raw_audio = move(converted_samples); + } else { + // For raw audio, we don't really synchronize audio and video; + // we just put the audio in frame by frame, and if a video frame is + // dropped, we still keep the audio, which means it will be added + // to the beginning of the next frame. It would probably be better + // to move the audio pts earlier to show this, but most players can + // live with some jitter, and in a lot of ways, it's much nicer for + // Futatabi to have all audio locked to a video frame. 
+ card->new_raw_audio.insert(card->new_raw_audio.end(), converted_samples.begin(), converted_samples.end()); + + // Truncate to one second, just to be sure we don't have infinite buildup in case of weirdness. + if (card->new_raw_audio.size() > OUTPUT_FREQUENCY * 2) { + size_t excess_samples = card->new_raw_audio.size() - OUTPUT_FREQUENCY * 2; + card->new_raw_audio.erase(card->new_raw_audio.begin(), card->new_raw_audio.begin() + excess_samples); + } + } + } } // Done with the audio, so release it. @@ -1038,7 +1066,8 @@ void Mixer::thread_func() assert(master_card_index < num_cards + num_video_inputs); } - OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame); + vector<int32_t> raw_audio[MAX_VIDEO_CARDS]; // For MJPEG encoding. + OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame, raw_audio); schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll, output_frame_info.frame_timestamp); stats_dropped_frames += output_frame_info.dropped_frames; @@ -1084,7 +1113,7 @@ void Mixer::thread_func() if (new_frame->frame->data_copy != nullptr) { int mjpeg_card_index = mjpeg_encoder->get_mjpeg_stream_for_card(card_index); if (mjpeg_card_index != -1) { - mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset); + mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset, move(raw_audio[card_index])); } } } @@ -1198,7 +1227,7 @@ pair<string, string> Mixer::get_channel_color_http(unsigned channel_idx) return make_pair(theme->get_channel_color(channel_idx), "text/plain"); } -Mixer::OutputFrameInfo Mixer::get_one_frame_from_each_card(unsigned master_card_index, bool 
master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]) +Mixer::OutputFrameInfo Mixer::get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS], vector<int32_t> raw_audio[MAX_VIDEO_CARDS]) { OutputFrameInfo output_frame_info; start: @@ -1246,6 +1275,8 @@ start: card->new_frames.pop_front(); card->new_frames_changed.notify_all(); } + + raw_audio[card_index] = move(card->new_raw_audio); } if (!master_card_is_output) { diff --git a/nageru/mixer.h b/nageru/mixer.h index b4ed76f..de8b457 100644 --- a/nageru/mixer.h +++ b/nageru/mixer.h @@ -541,9 +541,10 @@ private: }; std::deque<NewFrame> new_frames; std::condition_variable new_frames_changed; // Set whenever new_frames is changed. - QueueLengthPolicy queue_length_policy; // Refers to the "new_frames" queue. + std::vector<int32_t> new_raw_audio; + int last_timecode = -1; // Unwrapped. JitterHistory jitter_history; @@ -578,7 +579,7 @@ private: bool is_preroll; std::chrono::steady_clock::time_point frame_timestamp; }; - OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]); + OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS], std::vector<int32_t> raw_audio[MAX_VIDEO_CARDS]); InputState input_state; diff --git a/nageru/mjpeg_encoder.cpp b/nageru/mjpeg_encoder.cpp index 46bb94c..033f67a 100644 --- a/nageru/mjpeg_encoder.cpp +++ b/nageru/mjpeg_encoder.cpp @@ -120,7 +120,7 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display) // a situation with only one video stream (and possibly one audio stream) // with known width/height, and we don't need the extra functionality it provides. 
avctx.reset(avformat_alloc_context()); - avctx->oformat = av_guess_format("mp4", nullptr, nullptr); + avctx->oformat = av_guess_format("nut", nullptr, nullptr); uint8_t *buf = (uint8_t *)av_malloc(MUX_BUFFER_SIZE); avctx->pb = avio_alloc_context(buf, MUX_BUFFER_SIZE, 1, this, nullptr, nullptr, nullptr); @@ -133,7 +133,11 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display) fprintf(stderr, "avformat_new_stream() failed\n"); abort(); } - stream->time_base = AVRational{ 1, TIMEBASE }; + + // FFmpeg is very picky about having audio at 1/48000 timebase, + // no matter what we write. Even though we'd prefer our usual 1/120000, + // put the video on the same one, so that we can have locked audio. + stream->time_base = AVRational{ 1, OUTPUT_FREQUENCY }; stream->codecpar->codec_type = AVMEDIA_TYPE_VIDEO; stream->codecpar->codec_id = AV_CODEC_ID_MJPEG; @@ -151,6 +155,19 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display) stream->codecpar->chroma_location = AVCHROMA_LOC_LEFT; stream->codecpar->field_order = AV_FIELD_PROGRESSIVE; } + for (unsigned card_idx = 0; card_idx < global_flags.card_to_mjpeg_stream_export.size(); ++card_idx) { + AVStream *stream = avformat_new_stream(avctx.get(), nullptr); + if (stream == nullptr) { + fprintf(stderr, "avformat_new_stream() failed\n"); + abort(); + } + stream->time_base = AVRational{ 1, OUTPUT_FREQUENCY }; + stream->codecpar->codec_type = AVMEDIA_TYPE_AUDIO; + stream->codecpar->codec_id = AV_CODEC_ID_PCM_S32LE; + stream->codecpar->channel_layout = AV_CH_LAYOUT_STEREO; + stream->codecpar->channels = 2; + stream->codecpar->sample_rate = OUTPUT_FREQUENCY; + } AVDictionary *options = NULL; vector<pair<string, string>> opts = MUX_OPTS; @@ -269,7 +286,7 @@ unique_ptr<VADisplayWithCleanup> MJPEGEncoder::try_open_va(const string &va_disp return va_dpy; } -void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset) +void 
MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset, vector<int32_t> audio) { PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)frame->userdata; if (video_format.width == 0 || video_format.height == 0) { @@ -300,7 +317,7 @@ void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFram return; } ++metric_mjpeg_overrun_submitted; - frames_to_be_encoded.push(QueuedFrame{ pts, card_index, frame, video_format, y_offset, cbcr_offset }); + frames_to_be_encoded.push(QueuedFrame{ pts, card_index, frame, video_format, y_offset, cbcr_offset, move(audio) }); any_frames_to_be_encoded.notify_all(); } @@ -341,6 +358,11 @@ void MJPEGEncoder::encoder_thread_func() // Will call back in the receiver thread. encode_jpeg_va(move(qf)); } else { + // Write audio before video, since Futatabi expects it. + if (qf.audio.size() > 0) { + write_audio_packet(qf.pts, qf.card_index, qf.audio); + } + // Encode synchronously, in the same thread. 
vector<uint8_t> jpeg = encode_jpeg_libjpeg(qf); write_mjpeg_packet(qf.pts, qf.card_index, jpeg.data(), jpeg.size()); @@ -364,6 +386,27 @@ void MJPEGEncoder::write_mjpeg_packet(int64_t pts, unsigned card_index, const ui pkt.flags = AV_PKT_FLAG_KEY; AVRational time_base = avctx->streams[pkt.stream_index]->time_base; pkt.pts = pkt.dts = av_rescale_q(pts, AVRational{ 1, TIMEBASE }, time_base); + pkt.duration = 0; + + if (av_write_frame(avctx.get(), &pkt) < 0) { + fprintf(stderr, "av_write_frame() failed\n"); + abort(); + } +} + +void MJPEGEncoder::write_audio_packet(int64_t pts, unsigned card_index, const vector<int32_t> &audio) +{ + AVPacket pkt; + memset(&pkt, 0, sizeof(pkt)); + pkt.buf = nullptr; + pkt.data = reinterpret_cast<uint8_t *>(const_cast<int32_t *>(&audio[0])); + pkt.size = audio.size() * sizeof(audio[0]); + pkt.stream_index = card_index + global_flags.card_to_mjpeg_stream_export.size(); + pkt.flags = AV_PKT_FLAG_KEY; + AVRational time_base = avctx->streams[pkt.stream_index]->time_base; + pkt.pts = pkt.dts = av_rescale_q(pts, AVRational{ 1, TIMEBASE }, time_base); + size_t num_stereo_samples = audio.size() / 2; + pkt.duration = av_rescale_q(num_stereo_samples, AVRational{ 1, OUTPUT_FREQUENCY }, time_base); if (av_write_frame(avctx.get(), &pkt) < 0) { fprintf(stderr, "av_write_frame() failed\n"); abort(); @@ -752,6 +795,11 @@ void MJPEGEncoder::va_receiver_thread_func() frames_encoding.pop(); } + // Write audio before video, since Futatabi expects it. 
+ if (qf.audio.size() > 0) { + write_audio_packet(qf.pts, qf.card_index, qf.audio); + } + VAStatus va_status = vaSyncSurface(va_dpy->va_dpy, qf.resources.surface); CHECK_VASTATUS(va_status, "vaSyncSurface"); diff --git a/nageru/mjpeg_encoder.h b/nageru/mjpeg_encoder.h index 6e0357f..bb783d8 100644 --- a/nageru/mjpeg_encoder.h +++ b/nageru/mjpeg_encoder.h @@ -38,7 +38,7 @@ public: MJPEGEncoder(HTTPD *httpd, const std::string &va_display); ~MJPEGEncoder(); void stop(); - void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset); + void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset, std::vector<int32_t> audio); bool using_vaapi() const { return va_dpy != nullptr; } // Returns -1 for inactive (ie., don't encode frames for this card right now). @@ -104,6 +104,7 @@ private: RefCountedFrame frame; bmusb::VideoFormat video_format; size_t y_offset, cbcr_offset; + std::vector<int32_t> audio; // Only for frames in the process of being encoded by VA-API. VAResources resources; @@ -115,6 +116,7 @@ private: void encode_jpeg_va(QueuedFrame &&qf); std::vector<uint8_t> encode_jpeg_libjpeg(const QueuedFrame &qf); void write_mjpeg_packet(int64_t pts, unsigned card_index, const uint8_t *jpeg, size_t jpeg_size); + void write_audio_packet(int64_t pts, unsigned card_index, const std::vector<int32_t> &audio); void init_jpeg_422(unsigned width, unsigned height, VectorDestinationManager *dest, jpeg_compress_struct *cinfo); std::vector<uint8_t> get_jpeg_header(unsigned width, unsigned height, jpeg_compress_struct *cinfo); -- 2.39.2