From 575f6eb1b052bb1291987753b1a8cccc7f1e3ab3 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Mon, 18 Mar 2019 19:29:38 +0100 Subject: [PATCH] Use vaCreateImage + vaPutImage instead of vaDeriveImage. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Seemingly, this largely fixes the L3 issues I've been seeing, taking CPU usage down from ~2.1–2.2 to ~1.4 cores. A test run with eight full 1080p59.94 inputs demonstrates that it can be done without the GPU keeling over, although there are some issues with VA-API threading. --- nageru/mjpeg_encoder.cpp | 52 +++++++++++++++++++++++----------- nageru/mjpeg_encoder.h | 1 + nageru/pbo_frame_allocator.cpp | 16 +++-------- nageru/pbo_frame_allocator.h | 1 - 4 files changed, 40 insertions(+), 30 deletions(-) diff --git a/nageru/mjpeg_encoder.cpp b/nageru/mjpeg_encoder.cpp index 9ae018f..01173e0 100644 --- a/nageru/mjpeg_encoder.cpp +++ b/nageru/mjpeg_encoder.cpp @@ -28,6 +28,8 @@ extern "C" { using namespace bmusb; using namespace std; +static VAImageFormat uyvy_format; + extern void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_t dst_pitch, size_t height); // From libjpeg (although it's of course identical between implementations). @@ -238,6 +240,7 @@ unique_ptr MJPEGEncoder::try_open_va(const string &va_disp return nullptr; } + // TODO: Unify with the code in Futatabi. int num_formats = vaMaxNumImageFormats(va_dpy->va_dpy); assert(num_formats > 0); @@ -250,6 +253,19 @@ unique_ptr MJPEGEncoder::try_open_va(const string &va_disp return nullptr; } + bool found = false; + for (int i = 0; i < num_formats; ++i) { + if (formats[i].fourcc == VA_FOURCC_UYVY) { + memcpy(&uyvy_format, &formats[i], sizeof(VAImageFormat)); + found = true; + break; + } + } + if (!found) { + if (error != nullptr) *error = "UYVY format not found"; + return nullptr; + } + return va_dpy; } @@ -295,12 +311,9 @@ void MJPEGEncoder::finish_frame(RefCountedFrame frame) if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { VAResources resources __attribute__((unused)) = move(userdata->va_resources); ReleaseVAResources release = move(userdata->va_resources_release); - VAImage image = move(userdata->va_image); - VAStatus va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf); + VAStatus va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf); CHECK_VASTATUS(va_status, "vaUnmapBuffer"); - va_status = vaDestroyImage(va_dpy->va_dpy, image.image_id); - CHECK_VASTATUS(va_status, "vaDestroyImage"); } } @@ -421,6 +434,9 @@ MJPEGEncoder::VAResources MJPEGEncoder::get_va_resources(unsigned width, unsigne va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAEncCodedBufferType, width * height * 3 + 8192, 1, nullptr, &ret.data_buffer); CHECK_VASTATUS(va_status, "vaCreateBuffer"); + va_status = vaCreateImage(va_dpy->va_dpy, &uyvy_format, width, height, &ret.image); + CHECK_VASTATUS(va_status, "vaCreateImage"); + return ret; } @@ -660,34 +676,36 @@ void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf) CHECK_VASTATUS(va_status, "vaCreateBuffer"); VABufferDestroyer destroy_slice_param(va_dpy->va_dpy, slice_param_buffer); - VAImage image; if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { - // The pixel data is already uploaded by the caller. - image = move(userdata->va_image); + va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf); + CHECK_VASTATUS(va_status, "vaUnmapBuffer"); + // The pixel data is already put into the image by the caller. } else { assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC); // Upload the pixel data. - va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image); - CHECK_VASTATUS(va_status, "vaDeriveImage"); - uint8_t *surface_p = nullptr; - vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p); + vaMapBuffer(va_dpy->va_dpy, resources.image.buf, (void **)&surface_p); size_t field_start_line = qf.video_format.extra_lines_top; // No interlacing support. size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2; { const uint8_t *src = qf.frame->data_copy + field_start; - uint8_t *dst = (unsigned char *)surface_p + image.offsets[0]; - memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height); + uint8_t *dst = (unsigned char *)surface_p + resources.image.offsets[0]; + memcpy_with_pitch(dst, src, qf.video_format.width * 2, resources.image.pitches[0], qf.video_format.height); } + + va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf); + CHECK_VASTATUS(va_status, "vaUnmapBuffer"); } - va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf); - CHECK_VASTATUS(va_status, "vaUnmapBuffer"); - va_status = vaDestroyImage(va_dpy->va_dpy, image.image_id); - CHECK_VASTATUS(va_status, "vaDestroyImage"); + // Seemingly vaPutImage() (which triggers a GPU copy) is much nicer to the + // CPU than vaDeriveImage() and copying directly into the GPU's buffers. + // Exactly why is unclear, but it seems to involve L3 cache usage when there + // are many high-res (1080p+) images in play. + va_status = vaPutImage(va_dpy->va_dpy, resources.surface, resources.image.image_id, 0, 0, width, height, 0, 0, width, height); + CHECK_VASTATUS(va_status, "vaPutImage"); // Finally, stick in the JPEG header. VAEncPackedHeaderParameterBuffer header_parm; diff --git a/nageru/mjpeg_encoder.h b/nageru/mjpeg_encoder.h index b7b2043..aee1b9b 100644 --- a/nageru/mjpeg_encoder.h +++ b/nageru/mjpeg_encoder.h @@ -58,6 +58,7 @@ private: VASurfaceID surface; VAContextID context; VABufferID data_buffer; + VAImage image; }; // RAII wrapper to release VAResources on return (even on error). diff --git a/nageru/pbo_frame_allocator.cpp b/nageru/pbo_frame_allocator.cpp index d0859b3..54182a8 100644 --- a/nageru/pbo_frame_allocator.cpp +++ b/nageru/pbo_frame_allocator.cpp @@ -315,26 +315,18 @@ bmusb::FrameAllocator::Frame PBOFrameAllocator::create_frame(size_t width, size_ MJPEGEncoder::VAResources resources = mjpeg_encoder->get_va_resources(width, height); MJPEGEncoder::ReleaseVAResources release(mjpeg_encoder, resources); - VAImage image; - VAStatus va_status = vaDeriveImage(va_dpy, resources.surface, &image); - CHECK_VASTATUS(va_status, "vaDeriveImage"); - - if (image.pitches[0] == stride) { + if (resources.image.pitches[0] == stride) { userdata->va_resources = move(resources); userdata->va_resources_release = move(release); - userdata->va_image = move(image); - va_status = vaMapBuffer(va_dpy, image.buf, (void **)&vf.data_copy); + VAStatus va_status = vaMapBuffer(va_dpy, resources.image.buf, (void **)&vf.data_copy); CHECK_VASTATUS(va_status, "vaMapBuffer"); - vf.data_copy += image.offsets[0]; + vf.data_copy += resources.image.offsets[0]; userdata->data_copy_current_src = Userdata::FROM_VA_API; } else { - printf("WARNING: Could not copy directly into VA-API MJPEG buffer for %zu x %zu, since producer and consumer disagreed on stride (%zu != %d).\n", width, height, stride, image.pitches[0]); + printf("WARNING: Could not copy directly into VA-API MJPEG buffer for %zu x %zu, since producer and consumer disagreed on stride (%zu != %d).\n", width, height, stride, resources.image.pitches[0]); vf.data_copy = userdata->data_copy_malloc; userdata->data_copy_current_src = Userdata::FROM_MALLOC; - - va_status = vaDestroyImage(va_dpy, image.image_id); - CHECK_VASTATUS(va_status, "vaDestroyImage"); } } else { vf.data_copy = nullptr; diff --git a/nageru/pbo_frame_allocator.h b/nageru/pbo_frame_allocator.h index a7ae92e..d7559dc 100644 --- a/nageru/pbo_frame_allocator.h +++ b/nageru/pbo_frame_allocator.h @@ -79,7 +79,6 @@ public: uint8_t *data_copy_malloc; MJPEGEncoder::VAResources va_resources; MJPEGEncoder::ReleaseVAResources va_resources_release; - VAImage va_image; }; private: -- 2.39.2