Include the raw audio in the MJPEG output.
author     Steinar H. Gunderson <sgunderson@bigfoot.com>
           Thu, 28 Mar 2019 17:49:01 +0000 (18:49 +0100)
committer  Steinar H. Gunderson <sgunderson@bigfoot.com>
           Thu, 28 Mar 2019 17:49:01 +0000 (18:49 +0100)
nageru/audio_mixer.cpp
nageru/audio_mixer.h
nageru/mixer.cpp
nageru/mixer.h
nageru/mjpeg_encoder.cpp
nageru/mjpeg_encoder.h

diff --git a/nageru/audio_mixer.cpp b/nageru/audio_mixer.cpp
index 9b588d704d63303dccd354a40bafcbc7431f6d69..360689b83406b04e6737cb6b7815d4092aca0a90 100644
@@ -52,6 +52,26 @@ void convert_fixed16_to_fp32(float *dst, size_t out_channel, size_t out_num_chan
        }
 }
 
+void convert_fixed16_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels,
+                                const uint8_t *src, size_t in_channel, size_t in_num_channels,
+                                size_t num_samples)
+{
+       assert(in_channel < in_num_channels);
+       assert(out_channel < out_num_channels);
+       src += in_channel * 2;
+       dst += out_channel;
+
+       for (size_t i = 0; i < num_samples; ++i) {
+               uint32_t s = uint32_t(uint16_t(le16toh(*(int16_t *)src))) << 16;
+
+               // Keep the sign bit in place, repeat the other 15 bits as far as they go.
+               *dst = s | ((s & 0x7fffffff) >> 15) | ((s & 0x7fffffff) >> 30);
+
+               src += 2 * in_num_channels;
+               dst += out_num_channels;
+       }
+}
+
 void convert_fixed24_to_fp32(float *dst, size_t out_channel, size_t out_num_channels,
                              const uint8_t *src, size_t in_channel, size_t in_num_channels,
                              size_t num_samples)
@@ -73,6 +93,29 @@ void convert_fixed24_to_fp32(float *dst, size_t out_channel, size_t out_num_chan
        }
 }
 
+void convert_fixed24_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels,
+                                const uint8_t *src, size_t in_channel, size_t in_num_channels,
+                                size_t num_samples)
+{
+       assert(in_channel < in_num_channels);
+       assert(out_channel < out_num_channels);
+       src += in_channel * 3;
+       dst += out_channel;
+
+       for (size_t i = 0; i < num_samples; ++i) {
+               uint32_t s1 = src[0];
+               uint32_t s2 = src[1];
+               uint32_t s3 = src[2];
+               uint32_t s = (s1 << 8) | (s2 << 16) | (s3 << 24);
+
+               // Keep the sign bit in place, repeat the other 23 bits as far as they go.
+               *dst = s | ((s & 0x7fffffff) >> 23);
+
+               src += 3 * in_num_channels;
+               dst += out_num_channels;
+       }
+}
+
 void convert_fixed32_to_fp32(float *dst, size_t out_channel, size_t out_num_channels,
                              const uint8_t *src, size_t in_channel, size_t in_num_channels,
                              size_t num_samples)
@@ -91,6 +134,25 @@ void convert_fixed32_to_fp32(float *dst, size_t out_channel, size_t out_num_chan
        }
 }
 
+// Basically just a reinterleave.
+void convert_fixed32_to_fixed32(int32_t *dst, size_t out_channel, size_t out_num_channels,
+                                const uint8_t *src, size_t in_channel, size_t in_num_channels,
+                                size_t num_samples)
+{
+       assert(in_channel < in_num_channels);
+       assert(out_channel < out_num_channels);
+       src += in_channel * 4;
+       dst += out_channel;
+
+       for (size_t i = 0; i < num_samples; ++i) {
+               int32_t s = le32toh(*(int32_t *)src);
+               *dst = s;
+
+               src += 4 * in_num_channels;
+               dst += out_num_channels;
+       }
+}
+
 float find_peak_plain(const float *samples, size_t num_samples) __attribute__((unused));
 
 float find_peak_plain(const float *samples, size_t num_samples)
@@ -294,6 +356,38 @@ bool AudioMixer::add_audio(DeviceSpec device_spec, const uint8_t *data, unsigned
        return true;
 }
 
+vector<int32_t> convert_audio_to_fixed32(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, unsigned num_channels)
+{
+       vector<int32_t> audio;
+
+       if (num_channels > audio_format.num_channels) {
+               audio.resize(num_samples * num_channels, 0);
+       } else {
+               audio.resize(num_samples * num_channels);
+       }
+       for (unsigned channel_index = 0; channel_index < num_channels && channel_index < audio_format.num_channels; ++channel_index) {
+               switch (audio_format.bits_per_sample) {
+               case 0:
+                       assert(num_samples == 0);
+                       break;
+               case 16:
+                       convert_fixed16_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples);
+                       break;
+               case 24:
+                       convert_fixed24_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples);
+                       break;
+               case 32:
+                       convert_fixed32_to_fixed32(&audio[0], channel_index, num_channels, data, channel_index, audio_format.num_channels, num_samples);
+                       break;
+               default:
+                       fprintf(stderr, "Cannot handle audio with %u bits per sample\n", audio_format.bits_per_sample);
+                       assert(false);
+               }
+       }
+
+       return audio;
+}
+
 bool AudioMixer::add_silence(DeviceSpec device_spec, unsigned samples_per_frame, unsigned num_frames)
 {
        AudioDevice *device = find_audio_device(device_spec);
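
A note on the bit-replication trick in the conversion functions above: shifting alone would leave positive full scale short of INT32_MAX (0x7fff would become 0x7fff0000), so the magnitude bits are repeated downwards to fill the low bits, keeping both full-scale endpoints exact. A minimal standalone sketch (not part of the commit) verifying the endpoints for the 16-bit and 24-bit paths:

#include <cassert>
#include <cstdint>

// Same scaling as convert_fixed16_to_fixed32(): place the sample in the
// top 16 bits, then repeat the 15 magnitude bits twice to fill the rest.
static int32_t scale16(int16_t sample)
{
        uint32_t s = uint32_t(uint16_t(sample)) << 16;
        return int32_t(s | ((s & 0x7fffffff) >> 15) | ((s & 0x7fffffff) >> 30));
}

// Same scaling as convert_fixed24_to_fixed32(), taking a sign-extended
// 24-bit sample.
static int32_t scale24(int32_t sample)
{
        uint32_t s = uint32_t(sample) << 8;
        return int32_t(s | ((s & 0x7fffffff) >> 23));
}

int main()
{
        assert(scale16(0) == 0);
        assert(scale16(0x7fff) == INT32_MAX);    // Positive full scale stays full scale.
        assert(scale16(-0x8000) == INT32_MIN);   // Negative full scale likewise.
        assert(scale24(0x7fffff) == INT32_MAX);
        assert(scale24(-0x800000) == INT32_MIN);
}
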
diff --git a/nageru/audio_mixer.h b/nageru/audio_mixer.h
index 1cf4da349e71d4cc28d7c2ad5b4689f56fd7586d..14e7e85d098065aabc5cd39d70ed5b31a8713041 100644
@@ -37,6 +37,10 @@ namespace bmusb {
 struct AudioFormat;
 }  // namespace bmusb
 
+// Convert the given audio from {16,24,32}-bit M-channel to 32-bit N-channel PCM.
+// Assumes little-endian and chunky, signed PCM throughout.
+std::vector<int32_t> convert_audio_to_fixed32(const uint8_t *data, unsigned num_samples, bmusb::AudioFormat audio_format, unsigned num_destination_channels);
+
 enum EQBand {
        EQ_BAND_BASS = 0,
        EQ_BAND_MID,
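
For reference, a hypothetical call site (the names raw_data and num_samples are illustrative, not from the commit), converting an 8-channel 24-bit capture buffer down to the two channels the MJPEG mux carries:

bmusb::AudioFormat format;
format.bits_per_sample = 24;
format.num_channels = 8;

// Keeps source channels 0 and 1, interleaved; audio.size() == num_samples * 2.
std::vector<int32_t> audio = convert_audio_to_fixed32(raw_data, num_samples, format, 2);

If the destination has more channels than the source, the extra channels stay zero-filled, per the resize() in the implementation above.
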
diff --git a/nageru/mixer.cpp b/nageru/mixer.cpp
index c91857e62fbab09b3677fcebba41402235feabc6..5f38a673581ccbca276faf9a5d9992dafe06d2f5 100644
@@ -785,6 +785,34 @@ void Mixer::bm_frame(unsigned card_index, uint16_t timecode,
 
        if (num_samples > 0) {
                audio_mixer->add_audio(device, audio_frame.data + audio_offset, num_samples, audio_format, audio_frame.received_timestamp);
+
+               // Audio for the MJPEG stream. We don't resample; audio that's not in 48 kHz
+               // just gets dropped for now.
+               //
+               // Only bother doing MJPEG encoding if there are any connected clients
+               // that want the stream.
+               if (httpd.get_num_connected_multicam_clients() > 0) {
+                       vector<int32_t> converted_samples = convert_audio_to_fixed32(audio_frame.data + audio_offset, num_samples, audio_format, 2);
+                       lock_guard<mutex> lock(card_mutex);
+                       if (card->new_raw_audio.empty()) {
+                               card->new_raw_audio = move(converted_samples);
+                       } else {
+                               // For raw audio, we don't really synchronize audio and video;
+                               // we just put the audio in frame by frame, and if a video frame is
+                               // dropped, we still keep the audio, which means it will be added
+                               // to the beginning of the next frame. It would probably be better
+                               // to move the audio pts earlier to show this, but most players can
+                               // live with some jitter, and in a lot of ways, it's much nicer for
+                               // Futatabi to have all audio locked to a video frame.
+                               card->new_raw_audio.insert(card->new_raw_audio.end(), converted_samples.begin(), converted_samples.end());
+
+                               // Truncate to one second, just to be sure we don't have infinite buildup in case of weirdness.
+                               if (card->new_raw_audio.size() > OUTPUT_FREQUENCY * 2) {
+                                       size_t excess_samples = card->new_raw_audio.size() - OUTPUT_FREQUENCY * 2;
+                                       card->new_raw_audio.erase(card->new_raw_audio.begin(), card->new_raw_audio.begin() + excess_samples);
+                               }
+                       }
+               }
        }
 
        // Done with the audio, so release it.
@@ -1038,7 +1066,8 @@ void Mixer::thread_func()
                        assert(master_card_index < num_cards + num_video_inputs);
                }
 
-               OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame);
+               vector<int32_t> raw_audio[MAX_VIDEO_CARDS];  // For MJPEG encoding.
+               OutputFrameInfo output_frame_info = get_one_frame_from_each_card(master_card_index, master_card_is_output, new_frames, has_new_frame, raw_audio);
                schedule_audio_resampling_tasks(output_frame_info.dropped_frames, output_frame_info.num_samples, output_frame_info.frame_duration, output_frame_info.is_preroll, output_frame_info.frame_timestamp);
                stats_dropped_frames += output_frame_info.dropped_frames;
 
@@ -1084,7 +1113,7 @@ void Mixer::thread_func()
                        if (new_frame->frame->data_copy != nullptr) {
                                int mjpeg_card_index = mjpeg_encoder->get_mjpeg_stream_for_card(card_index);
                                if (mjpeg_card_index != -1) {
-                                       mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset);
+                                       mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset, move(raw_audio[card_index]));
                                }
                        }
                }
@@ -1198,7 +1227,7 @@ pair<string, string> Mixer::get_channel_color_http(unsigned channel_idx)
        return make_pair(theme->get_channel_color(channel_idx), "text/plain");
 }
 
-Mixer::OutputFrameInfo Mixer::get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS])
+Mixer::OutputFrameInfo Mixer::get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS], vector<int32_t> raw_audio[MAX_VIDEO_CARDS])
 {
        OutputFrameInfo output_frame_info;
 start:
@@ -1246,6 +1275,8 @@ start:
                        card->new_frames.pop_front();
                        card->new_frames_changed.notify_all();
                }
+
+               raw_audio[card_index] = move(card->new_raw_audio);
        }
 
        if (!master_card_is_output) {
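
The handoff above relies on move semantics: bm_frame() accumulates audio under card_mutex, and the mixer thread steals the whole buffer once per output frame, so nothing is delivered twice. A simplified sketch of the pattern (not the commit's exact types):

#include <cstdint>
#include <mutex>
#include <vector>

std::mutex card_mutex;
std::vector<int32_t> new_raw_audio;  // Per-card accumulation buffer.

// Producer (bm_frame): append whatever arrived with this audio frame.
void queue_raw_audio(std::vector<int32_t> converted)
{
        std::lock_guard<std::mutex> lock(card_mutex);
        if (new_raw_audio.empty()) {
                new_raw_audio = std::move(converted);
        } else {
                new_raw_audio.insert(new_raw_audio.end(), converted.begin(), converted.end());
        }
}

// Consumer (mixer thread): take everything queued so far in one go.
std::vector<int32_t> steal_raw_audio()
{
        std::lock_guard<std::mutex> lock(card_mutex);
        return std::move(new_raw_audio);  // Leaves the buffer empty in practice.
}
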
diff --git a/nageru/mixer.h b/nageru/mixer.h
index b4ed76f4575ff0403b4865a470af8925d548ddd1..de8b45741ef30cf97058d5270f995f45e1680553 100644
@@ -541,9 +541,10 @@ private:
                };
                std::deque<NewFrame> new_frames;
                std::condition_variable new_frames_changed;  // Set whenever new_frames is changed.
-
                QueueLengthPolicy queue_length_policy;  // Refers to the "new_frames" queue.
 
+               std::vector<int32_t> new_raw_audio;
+
                int last_timecode = -1;  // Unwrapped.
 
                JitterHistory jitter_history;
@@ -578,7 +579,7 @@ private:
                bool is_preroll;
                std::chrono::steady_clock::time_point frame_timestamp;
        };
-       OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS]);
+       OutputFrameInfo get_one_frame_from_each_card(unsigned master_card_index, bool master_card_is_output, CaptureCard::NewFrame new_frames[MAX_VIDEO_CARDS], bool has_new_frame[MAX_VIDEO_CARDS], std::vector<int32_t> raw_audio[MAX_VIDEO_CARDS]);
 
        InputState input_state;
 
diff --git a/nageru/mjpeg_encoder.cpp b/nageru/mjpeg_encoder.cpp
index 46bb94c7639112f76b031c96c63acab9560b99ce..033f67afd9e439db3a5289248bed5b67e8f306ba 100644
@@ -120,7 +120,7 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display)
        // a situation with only one video stream (and possibly one audio stream)
        // with known width/height, and we don't need the extra functionality it provides.
        avctx.reset(avformat_alloc_context());
-       avctx->oformat = av_guess_format("mp4", nullptr, nullptr);
+       avctx->oformat = av_guess_format("nut", nullptr, nullptr);
 
        uint8_t *buf = (uint8_t *)av_malloc(MUX_BUFFER_SIZE);
        avctx->pb = avio_alloc_context(buf, MUX_BUFFER_SIZE, 1, this, nullptr, nullptr, nullptr);
@@ -133,7 +133,11 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display)
                        fprintf(stderr, "avformat_new_stream() failed\n");
                        abort();
                }
-               stream->time_base = AVRational{ 1, TIMEBASE };
+
+               // FFmpeg is very picky about having audio at 1/48000 timebase,
+               // no matter what we write. Even though we'd prefer our usual 1/120000,
+               // put the video on the same one, so that we can have locked audio.
+               stream->time_base = AVRational{ 1, OUTPUT_FREQUENCY };
                stream->codecpar->codec_type = AVMEDIA_TYPE_VIDEO;
                stream->codecpar->codec_id = AV_CODEC_ID_MJPEG;
 
@@ -151,6 +155,19 @@ MJPEGEncoder::MJPEGEncoder(HTTPD *httpd, const string &va_display)
                stream->codecpar->chroma_location = AVCHROMA_LOC_LEFT;
                stream->codecpar->field_order = AV_FIELD_PROGRESSIVE;
        }
+       for (unsigned card_idx = 0; card_idx < global_flags.card_to_mjpeg_stream_export.size(); ++card_idx) {
+               AVStream *stream = avformat_new_stream(avctx.get(), nullptr);
+               if (stream == nullptr) {
+                       fprintf(stderr, "avformat_new_stream() failed\n");
+                       abort();
+               }
+               stream->time_base = AVRational{ 1, OUTPUT_FREQUENCY };
+               stream->codecpar->codec_type = AVMEDIA_TYPE_AUDIO;
+               stream->codecpar->codec_id = AV_CODEC_ID_PCM_S32LE;
+               stream->codecpar->channel_layout = AV_CH_LAYOUT_STEREO;
+               stream->codecpar->channels = 2;
+               stream->codecpar->sample_rate = OUTPUT_FREQUENCY;
+       }
 
        AVDictionary *options = NULL;
        vector<pair<string, string>> opts = MUX_OPTS;
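
With this second loop, the mux carries one MJPEG video stream per exported card followed by one PCM stream per exported card, so the audio stream for a given card sits at num_exported_cards + card_index — the layout write_audio_packet() below depends on. A small illustrative helper (hypothetical, not from the commit):

// Stream layout for N = card_to_mjpeg_stream_export.size() exported cards:
//   streams[0 .. N-1]    MJPEG video, one per card
//   streams[N .. 2N-1]   pcm_s32le stereo, one per card
size_t audio_stream_index(size_t card_index, size_t num_exported_cards)
{
        return num_exported_cards + card_index;  // e.g., card 1 of 3 -> stream 4.
}
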
@@ -269,7 +286,7 @@ unique_ptr<VADisplayWithCleanup> MJPEGEncoder::try_open_va(const string &va_disp
        return va_dpy;
 }
 
-void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset)
+void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset, vector<int32_t> audio)
 {
        PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)frame->userdata;
        if (video_format.width == 0 || video_format.height == 0) {
@@ -300,7 +317,7 @@ void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFram
                return;
        }
        ++metric_mjpeg_overrun_submitted;
-       frames_to_be_encoded.push(QueuedFrame{ pts, card_index, frame, video_format, y_offset, cbcr_offset });
+       frames_to_be_encoded.push(QueuedFrame{ pts, card_index, frame, video_format, y_offset, cbcr_offset, move(audio) });
        any_frames_to_be_encoded.notify_all();
 }
 
@@ -341,6 +358,11 @@ void MJPEGEncoder::encoder_thread_func()
                        // Will call back in the receiver thread.
                        encode_jpeg_va(move(qf));
                } else {
+                       // Write audio before video, since Futatabi expects it.
+                       if (qf.audio.size() > 0) {
+                               write_audio_packet(qf.pts, qf.card_index, qf.audio);
+                       }
+
                        // Encode synchronously, in the same thread.
                        vector<uint8_t> jpeg = encode_jpeg_libjpeg(qf);
                        write_mjpeg_packet(qf.pts, qf.card_index, jpeg.data(), jpeg.size());
@@ -364,6 +386,27 @@ void MJPEGEncoder::write_mjpeg_packet(int64_t pts, unsigned card_index, const ui
        pkt.flags = AV_PKT_FLAG_KEY;
        AVRational time_base = avctx->streams[pkt.stream_index]->time_base;
        pkt.pts = pkt.dts = av_rescale_q(pts, AVRational{ 1, TIMEBASE }, time_base);
+       pkt.duration = 0;
+
+       if (av_write_frame(avctx.get(), &pkt) < 0) {
+               fprintf(stderr, "av_write_frame() failed\n");
+               abort();
+       }
+}
+
+void MJPEGEncoder::write_audio_packet(int64_t pts, unsigned card_index, const vector<int32_t> &audio)
+{
+       AVPacket pkt;
+       memset(&pkt, 0, sizeof(pkt));
+       pkt.buf = nullptr;
+       pkt.data = reinterpret_cast<uint8_t *>(const_cast<int32_t *>(&audio[0]));
+       pkt.size = audio.size() * sizeof(audio[0]);
+       pkt.stream_index = card_index + global_flags.card_to_mjpeg_stream_export.size();
+       pkt.flags = AV_PKT_FLAG_KEY;
+       AVRational time_base = avctx->streams[pkt.stream_index]->time_base;
+       pkt.pts = pkt.dts = av_rescale_q(pts, AVRational{ 1, TIMEBASE }, time_base);
+       size_t num_stereo_samples = audio.size() / 2;
+       pkt.duration = av_rescale_q(num_stereo_samples, AVRational{ 1, OUTPUT_FREQUENCY }, time_base);
 
        if (av_write_frame(avctx.get(), &pkt) < 0) {
                fprintf(stderr, "av_write_frame() failed\n");
@@ -752,6 +795,11 @@ void MJPEGEncoder::va_receiver_thread_func()
                        frames_encoding.pop();
                }
 
+               // Write audio before video, since Futatabi expects it.
+               if (qf.audio.size() > 0) {
+                       write_audio_packet(qf.pts, qf.card_index, qf.audio);
+               }
+
                VAStatus va_status = vaSyncSurface(va_dpy->va_dpy, qf.resources.surface);
                CHECK_VASTATUS(va_status, "vaSyncSurface");
 
diff --git a/nageru/mjpeg_encoder.h b/nageru/mjpeg_encoder.h
index 6e0357f0c0f32bd4a862ece103bb74d3bb7b50e5..bb783d83b0c76af83c5637f7bc78bfd296542aac 100644
@@ -38,7 +38,7 @@ public:
        MJPEGEncoder(HTTPD *httpd, const std::string &va_display);
        ~MJPEGEncoder();
        void stop();
-       void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset);
+       void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset, std::vector<int32_t> audio);
        bool using_vaapi() const { return va_dpy != nullptr; }
 
        // Returns -1 for inactive (ie., don't encode frames for this card right now).
@@ -104,6 +104,7 @@ private:
                RefCountedFrame frame;
                bmusb::VideoFormat video_format;
                size_t y_offset, cbcr_offset;
+               std::vector<int32_t> audio;
 
                // Only for frames in the process of being encoded by VA-API.
                VAResources resources;
@@ -115,6 +116,7 @@ private:
        void encode_jpeg_va(QueuedFrame &&qf);
        std::vector<uint8_t> encode_jpeg_libjpeg(const QueuedFrame &qf);
        void write_mjpeg_packet(int64_t pts, unsigned card_index, const uint8_t *jpeg, size_t jpeg_size);
+       void write_audio_packet(int64_t pts, unsigned card_index, const std::vector<int32_t> &audio);
        void init_jpeg_422(unsigned width, unsigned height, VectorDestinationManager *dest, jpeg_compress_struct *cinfo);
        std::vector<uint8_t> get_jpeg_header(unsigned width, unsigned height, jpeg_compress_struct *cinfo);