From a839022c035b3d9387feabc02843c166ac78b469 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sun, 17 Mar 2019 22:53:36 +0100 Subject: [PATCH] When uploading MJPEG data to VA-API, do it directly into the buffer. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Besides the obvious of spending less time copying, this has two positive effects: - The VA-API thread is no longer a choke point; uploading can happen from multiple cores. - With one copy less, we seem to be reducing L3 cache pressure a bit; at some point between five and six 1080p sources, we “fall off a cliff” wrt. the L3 and start thrashing. This doesn't fix the issue, but alleviates it somewhat. All in all, we seem to go down from ~2.6 to ~2.1–2.2 cores used with one 720p channel and five 1080p channels. I haven't tried saturating channels yet to see how many we can actually encode. --- nageru/bmusb | 2 +- nageru/decklink_capture.cpp | 2 +- nageru/mixer.cpp | 14 +++--- nageru/mjpeg_encoder.cpp | 79 ++++++++++++++++++++++++++-------- nageru/mjpeg_encoder.h | 18 ++++++++ nageru/pbo_frame_allocator.cpp | 75 ++++++++++++++++++++++++++++++-- nageru/pbo_frame_allocator.h | 28 ++++++++++++ 7 files changed, 186 insertions(+), 32 deletions(-) diff --git a/nageru/bmusb b/nageru/bmusb index 5163d25..03e3889 160000 --- a/nageru/bmusb +++ b/nageru/bmusb @@ -1 +1 @@ -Subproject commit 5163d25c65c3028090db1aea6587ec2fb4cb823e +Subproject commit 03e38890b599efe6ac906fdb70b43cda63f11d01 diff --git a/nageru/decklink_capture.cpp b/nageru/decklink_capture.cpp index a09aefa..f7016dc 100644 --- a/nageru/decklink_capture.cpp +++ b/nageru/decklink_capture.cpp @@ -252,7 +252,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( assert(stride == width * 2); } - current_video_frame = video_frame_allocator->alloc_frame(); + current_video_frame = video_frame_allocator->create_frame(width, height, stride); if (current_video_frame.data != nullptr) { const uint8_t *src; video_frame->GetBytes((void **)&src); diff --git a/nageru/mixer.cpp b/nageru/mixer.cpp index bad1a5e..e0aa8b1 100644 --- a/nageru/mixer.cpp +++ b/nageru/mixer.cpp @@ -551,7 +551,7 @@ void Mixer::configure_card(unsigned card_index, CaptureInterface *capture, CardT card->capture->set_frame_callback(bind(&Mixer::bm_frame, this, card_index, _1, _2, _3, _4, _5, _6, _7)); if (card->frame_allocator == nullptr) { - card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height)); // 8 MB. + card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height, card_index, mjpeg_encoder.get())); // 8 MB. } card->capture->set_video_frame_allocator(card->frame_allocator.get()); if (card->surface == nullptr) { @@ -1081,12 +1081,12 @@ void Mixer::thread_func() new_frame->upload_func = nullptr; } - // Only bother doing MJPEG encoding if there are any connected clients - // that want the stream. FIXME: We should also stop memcpy-ing if there are none! 
- if (httpd.get_num_connected_multicam_clients() > 0) { - auto stream_it = global_flags.card_to_mjpeg_stream_export.find(card_index); - if (stream_it != global_flags.card_to_mjpeg_stream_export.end()) { - mjpeg_encoder->upload_frame(pts_int, stream_it->second, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset); + if (new_frame->frame->data_copy != nullptr) { + int mjpeg_card_index = mjpeg_encoder->get_mjpeg_stream_for_card(card_index); + if (mjpeg_card_index == -1) { + mjpeg_encoder->finish_frame(new_frame->frame); + } else { + mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset); } } } diff --git a/nageru/mjpeg_encoder.cpp b/nageru/mjpeg_encoder.cpp index 07e302c..9ae018f 100644 --- a/nageru/mjpeg_encoder.cpp +++ b/nageru/mjpeg_encoder.cpp @@ -30,12 +30,6 @@ using namespace std; extern void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_t dst_pitch, size_t height); -#define CHECK_VASTATUS(va_status, func) \ - if (va_status != VA_STATUS_SUCCESS) { \ - fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \ - exit(1); \ - } - // From libjpeg (although it's of course identical between implementations). static const int jpeg_natural_order[DCTSIZE2] = { 0, 1, 8, 16, 9, 2, 3, 10, @@ -294,6 +288,37 @@ void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFram any_frames_to_be_encoded.notify_all(); } +void MJPEGEncoder::finish_frame(RefCountedFrame frame) +{ + PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)frame->userdata; + + if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { + VAResources resources __attribute__((unused)) = move(userdata->va_resources); + ReleaseVAResources release = move(userdata->va_resources_release); + VAImage image = move(userdata->va_image); + + VAStatus va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf); + CHECK_VASTATUS(va_status, "vaUnmapBuffer"); + va_status = vaDestroyImage(va_dpy->va_dpy, image.image_id); + CHECK_VASTATUS(va_status, "vaDestroyImage"); + } +} + +int MJPEGEncoder::get_mjpeg_stream_for_card(unsigned card_index) +{ + // Only bother doing MJPEG encoding if there are any connected clients + // that want the stream. 
+ if (httpd->get_num_connected_multicam_clients() == 0) { + return -1; + } + + auto it = global_flags.card_to_mjpeg_stream_export.find(card_index); + if (it == global_flags.card_to_mjpeg_stream_export.end()) { + return -1; + } + return it->second; +} + void MJPEGEncoder::encoder_thread_func() { pthread_setname_np(pthread_self(), "MJPEG_Encode"); @@ -597,11 +622,20 @@ MJPEGEncoder::VAData MJPEGEncoder::get_va_data_for_resolution(unsigned width, un void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf) { + PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)qf.frame->userdata; unsigned width = qf.video_format.width; unsigned height = qf.video_format.height; - VAResources resources = get_va_resources(width, height); - ReleaseVAResources release(this, resources); + VAResources resources; + ReleaseVAResources release; + if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { + resources = move(userdata->va_resources); + release = move(userdata->va_resources_release); + } else { + assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC); + resources = get_va_resources(width, height); + release = ReleaseVAResources(this, resources); + } VAData va_data = get_va_data_for_resolution(width, height); va_data.pic_param.coded_buf = resources.data_buffer; @@ -627,20 +661,27 @@ void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf) VABufferDestroyer destroy_slice_param(va_dpy->va_dpy, slice_param_buffer); VAImage image; - va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image); - CHECK_VASTATUS(va_status, "vaDeriveImage"); + if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { + // The pixel data is already uploaded by the caller. + image = move(userdata->va_image); + } else { + assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC); - // Upload the pixel data. - uint8_t *surface_p = nullptr; - vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p); + // Upload the pixel data. + va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image); + CHECK_VASTATUS(va_status, "vaDeriveImage"); - size_t field_start_line = qf.video_format.extra_lines_top; // No interlacing support. - size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2; + uint8_t *surface_p = nullptr; + vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p); - { - const uint8_t *src = qf.frame->data_copy + field_start; - uint8_t *dst = (unsigned char *)surface_p + image.offsets[0]; - memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height); + size_t field_start_line = qf.video_format.extra_lines_top; // No interlacing support. 
+ size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2; + + { + const uint8_t *src = qf.frame->data_copy + field_start; + uint8_t *dst = (unsigned char *)surface_p + image.offsets[0]; + memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height); + } } va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf); diff --git a/nageru/mjpeg_encoder.h b/nageru/mjpeg_encoder.h index 8b68294..b7b2043 100644 --- a/nageru/mjpeg_encoder.h +++ b/nageru/mjpeg_encoder.h @@ -27,6 +27,12 @@ struct jpeg_compress_struct; struct VADisplayWithCleanup; struct VectorDestinationManager; +#define CHECK_VASTATUS(va_status, func) \ + if (va_status != VA_STATUS_SUCCESS) { \ + fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \ + exit(1); \ + } + class MJPEGEncoder { public: MJPEGEncoder(HTTPD *httpd, const std::string &va_display); @@ -34,6 +40,16 @@ public: void stop(); void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset); + // If the frame was started (data_copy != nullptr) but will not be finished + // (MJPEG decoding was turned off in the meantime), you'll need to call finish_frame() + // to release any VA-API resources. + void finish_frame(RefCountedFrame frame); + + bool using_vaapi() const { return va_dpy != nullptr; } + + // Returns -1 for inactive (ie., don't encode frames for this card right now). + int get_mjpeg_stream_for_card(unsigned card_index); + private: static constexpr int quality = 90; @@ -153,6 +169,8 @@ private: std::atomic metric_mjpeg_frames_oversized_dropped{0}; std::atomic metric_mjpeg_overrun_dropped{0}; std::atomic metric_mjpeg_overrun_submitted{0}; + + friend class PBOFrameAllocator; // FIXME }; #endif // !defined(_MJPEG_ENCODER_H) diff --git a/nageru/pbo_frame_allocator.cpp b/nageru/pbo_frame_allocator.cpp index 4c1a55b..d0859b3 100644 --- a/nageru/pbo_frame_allocator.cpp +++ b/nageru/pbo_frame_allocator.cpp @@ -8,7 +8,9 @@ #include #include "flags.h" +#include "mjpeg_encoder.h" #include "v210_converter.h" +#include "va_display_with_cleanup.h" using namespace std; @@ -26,8 +28,8 @@ void set_clamp_to_edge() } // namespace -PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits) - : pixel_format(pixel_format), buffer(buffer) +PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, unsigned card_index, MJPEGEncoder *mjpeg_encoder, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits) + : card_index(card_index), mjpeg_encoder(mjpeg_encoder), pixel_format(pixel_format), buffer(buffer) { userdata.reset(new Userdata[num_queued_frames]); for (size_t i = 0; i < num_queued_frames; ++i) { @@ -52,13 +54,13 @@ void PBOFrameAllocator::init_frame(size_t frame_idx, size_t frame_size, GLuint w Frame frame; frame.data = (uint8_t *)glMapBufferRange(buffer, 0, frame_size, permissions | map_bits | GL_MAP_PERSISTENT_BIT); frame.data2 = frame.data + frame_size / 2; - frame.data_copy = new uint8_t[frame_size]; check_error(); frame.size = frame_size; Userdata *ud = &userdata[frame_idx]; frame.userdata = ud; ud->pbo = pbo; ud->pixel_format = pixel_format; + ud->data_copy_malloc = new uint8_t[frame_size]; frame.owner = this; // For 8-bit non-planar Y'CbCr, we ask the driver to split Y' and Cb/Cr 
@@ -217,7 +219,7 @@ PBOFrameAllocator::~PBOFrameAllocator() void PBOFrameAllocator::destroy_frame(Frame *frame) { Userdata *ud = (Userdata *)frame->userdata; - delete[] frame->data_copy; + delete[] ud->data_copy_malloc; GLuint pbo = ud->pbo; glBindBuffer(buffer, pbo); @@ -273,6 +275,71 @@ bmusb::FrameAllocator::Frame PBOFrameAllocator::alloc_frame() } vf.len = 0; vf.overflow = 0; + + if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() && + mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) { + Userdata *ud = (Userdata *)vf.userdata; + vf.data_copy = ud->data_copy_malloc; + ud->data_copy_current_src = Userdata::FROM_MALLOC; + } else { + vf.data_copy = nullptr; + } + + return vf; +} + +bmusb::FrameAllocator::Frame PBOFrameAllocator::create_frame(size_t width, size_t height, size_t stride) +{ + Frame vf; + + { + lock_guard lock(freelist_mutex); + if (freelist.empty()) { + printf("Frame overrun (no more spare PBO frames), dropping frame!\n"); + vf.len = 0; + vf.overflow = 0; + return vf; + } else { + vf = freelist.front(); + freelist.pop(); + } + } + vf.len = 0; + vf.overflow = 0; + + Userdata *userdata = (Userdata *)vf.userdata; + + if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() && + mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) { + VADisplay va_dpy = mjpeg_encoder->va_dpy->va_dpy; + MJPEGEncoder::VAResources resources = mjpeg_encoder->get_va_resources(width, height); + MJPEGEncoder::ReleaseVAResources release(mjpeg_encoder, resources); + + VAImage image; + VAStatus va_status = vaDeriveImage(va_dpy, resources.surface, &image); + CHECK_VASTATUS(va_status, "vaDeriveImage"); + + if (image.pitches[0] == stride) { + userdata->va_resources = move(resources); + userdata->va_resources_release = move(release); + userdata->va_image = move(image); + + va_status = vaMapBuffer(va_dpy, image.buf, (void **)&vf.data_copy); + CHECK_VASTATUS(va_status, "vaMapBuffer"); + vf.data_copy += image.offsets[0]; + userdata->data_copy_current_src = Userdata::FROM_VA_API; + } else { + printf("WARNING: Could not copy directly into VA-API MJPEG buffer for %zu x %zu, since producer and consumer disagreed on stride (%zu != %d).\n", width, height, stride, image.pitches[0]); + vf.data_copy = userdata->data_copy_malloc; + userdata->data_copy_current_src = Userdata::FROM_MALLOC; + + va_status = vaDestroyImage(va_dpy, image.image_id); + CHECK_VASTATUS(va_status, "vaDestroyImage"); + } + } else { + vf.data_copy = nullptr; + } + return vf; } diff --git a/nageru/pbo_frame_allocator.h b/nageru/pbo_frame_allocator.h index ab51f6b..a7ae92e 100644 --- a/nageru/pbo_frame_allocator.h +++ b/nageru/pbo_frame_allocator.h @@ -11,6 +11,9 @@ #include #include "bmusb/bmusb.h" +#include "mjpeg_encoder.h" + +class MJPEGEncoder; // An allocator that allocates straight into OpenGL pinned memory. // Meant for video frames only. 
We use a queue rather than a stack, @@ -22,12 +25,15 @@ public: PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, + unsigned card_index, + MJPEGEncoder *mjpeg_encoder = nullptr, size_t num_queued_frames = 16, GLenum buffer = GL_PIXEL_UNPACK_BUFFER_ARB, GLenum permissions = GL_MAP_WRITE_BIT, GLenum map_bits = GL_MAP_FLUSH_EXPLICIT_BIT); ~PBOFrameAllocator() override; Frame alloc_frame() override; + Frame create_frame(size_t width, size_t height, size_t stride) override; void release_frame(Frame frame) override; struct Userdata { @@ -54,12 +60,34 @@ public: unsigned last_frame_rate_nom, last_frame_rate_den; bool has_last_subtitle = false; std::string last_subtitle; + + // These are the source of the “data_copy” member in Frame, + // used for MJPEG encoding. There are three possibilities: + // + // - MJPEG encoding is not active (at all, or for this specific + // card). Then data_copy is nullptr, and what's in here + // does not matter at all. + // - We can encode directly into VA-API buffers (ie., VA-API + // is active, and nothing strange happened wrt. strides); + // then va_resources, va_resources_release and va_image + // are fetched from MJPEGEncoder at create_frame() and released + // back when the frame is uploaded (or would have been). + // In this case, data_copy points into the mapped VAImage. + // - If not, data_copy points to data_copy_malloc, and is copied + // from there into VA-API buffers (by MJPEGEncoder) if needed. + enum { FROM_MALLOC, FROM_VA_API } data_copy_current_src; + uint8_t *data_copy_malloc; + MJPEGEncoder::VAResources va_resources; + MJPEGEncoder::ReleaseVAResources va_resources_release; + VAImage va_image; }; private: void init_frame(size_t frame_idx, size_t frame_size, GLuint width, GLuint height, GLenum permissions, GLenum map_bits); void destroy_frame(Frame *frame); + unsigned card_index; + MJPEGEncoder *mjpeg_encoder; bmusb::PixelFormat pixel_format; std::mutex freelist_mutex; std::queue freelist; -- 2.39.2
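
The core of the change, pulled out of the diff context: PBOFrameAllocator::create_frame() now tries to derive and map a VAImage for a VA-API surface up front, so the capture thread writes its copy of the pixels straight into the buffer the GPU will encode from; MJPEGEncoder::encode_jpeg_va() then only needs to unmap, and MJPEGEncoder::finish_frame() releases the mapping for frames that never get encoded. The sketch below shows that pattern in isolation. It is a minimal illustration, not code from the patch: the MappedDestination struct and the map_destination()/finish_destination() helpers are invented names, and the VADisplay, surface and malloc fallback buffer are assumed to come from elsewhere (in the patch, the equivalent state lives in MJPEGEncoder and PBOFrameAllocator::Userdata).

#include <va/va.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Same error-handling style as the CHECK_VASTATUS macro the patch moves
// into mjpeg_encoder.h.
#define CHECK_VASTATUS(va_status, func) \
	if (va_status != VA_STATUS_SUCCESS) { \
		fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
		exit(1); \
	}

// Where the capture thread should write its copy of the incoming frame.
// (Illustrative only; the patch keeps the equivalent state in Userdata,
// tagged with FROM_MALLOC/FROM_VA_API.)
struct MappedDestination {
	bool direct = false;     // true = writing straight into the VA-API buffer
	uint8_t *dst = nullptr;  // where the producer should write pixels
	VAImage image{};         // only valid if direct == true
};

// Try to map the surface so the producer can write into it directly.
// Falls back to a plain malloc'ed buffer if the driver's pitch does not
// match the stride the producer will use (the same condition create_frame()
// checks before committing to the zero-copy path).
MappedDestination map_destination(VADisplay va_dpy, VASurfaceID surface,
                                  size_t stride, uint8_t *malloc_fallback)
{
	MappedDestination d;

	VAStatus va_status = vaDeriveImage(va_dpy, surface, &d.image);
	CHECK_VASTATUS(va_status, "vaDeriveImage");

	if (d.image.pitches[0] == stride) {
		void *ptr = nullptr;
		va_status = vaMapBuffer(va_dpy, d.image.buf, &ptr);
		CHECK_VASTATUS(va_status, "vaMapBuffer");
		d.dst = reinterpret_cast<uint8_t *>(ptr) + d.image.offsets[0];
		d.direct = true;
	} else {
		// Pitch mismatch; give the image back and let the producer
		// write into the malloc'ed buffer as before.
		va_status = vaDestroyImage(va_dpy, d.image.image_id);
		CHECK_VASTATUS(va_status, "vaDestroyImage");
		d.dst = malloc_fallback;
	}
	return d;
}

// Must be called once per mapped frame, whether it is encoded or dropped;
// afterwards the surface already holds the pixels and can be fed to the
// JPEG encode without any further copy.
void finish_destination(VADisplay va_dpy, MappedDestination *d)
{
	if (!d->direct) {
		return;
	}
	VAStatus va_status = vaUnmapBuffer(va_dpy, d->image.buf);
	CHECK_VASTATUS(va_status, "vaUnmapBuffer");
	va_status = vaDestroyImage(va_dpy, d->image.image_id);
	CHECK_VASTATUS(va_status, "vaDestroyImage");
}

The stride check is what makes the fallback safe: if the pitch reported by vaDeriveImage() differs from what the producer will write, the frame silently goes back to the old data_copy_malloc + memcpy_with_pitch() path. And since mapping now happens at allocation time rather than encode time, the unmap has to happen even for frames that are never encoded (eg. if MJPEG export is turned off in the meantime), which is exactly what the new MJPEGEncoder::finish_frame() is for.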