git.sesse.net Git - nageru/commitdiff
When uploading MJPEG data to VA-API, do it directly into the buffer.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 17 Mar 2019 21:53:36 +0000 (22:53 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Sun, 17 Mar 2019 21:53:36 +0000 (22:53 +0100)
Besides the obvious of spending less time copying, this has two positive effects:

 - The VA-API thread is no longer a choke point; uploading can happen from
   multiple cores.
 - With one copy less, we seem to be reducing L3 cache pressure a bit;
   at some point between five and six 1080p sources, we “fall off a cliff”
   wrt. the L3 and start thrashing. This doesn't fix the issue, but alleviates
   it somewhat.

All in all, we seem to go down from ~2.6 to ~2.1–2.2 cores used with one
720p channel and five 1080p channels. I haven't tried saturating channels
yet to see how many we can actually encode.

nageru/bmusb
nageru/decklink_capture.cpp
nageru/mixer.cpp
nageru/mjpeg_encoder.cpp
nageru/mjpeg_encoder.h
nageru/pbo_frame_allocator.cpp
nageru/pbo_frame_allocator.h

index 5163d25c65c3028090db1aea6587ec2fb4cb823e..03e38890b599efe6ac906fdb70b43cda63f11d01 160000 (submodule)
@@ -1 +1 @@
-Subproject commit 5163d25c65c3028090db1aea6587ec2fb4cb823e
+Subproject commit 03e38890b599efe6ac906fdb70b43cda63f11d01
index a09aefaaec14661bafcc3bedc5555d054fe19038..f7016dce7c07f607455481c9d00a34e2d379ac99 100644 (file)
@@ -252,7 +252,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived(
                        assert(stride == width * 2);
                }
 
-               current_video_frame = video_frame_allocator->alloc_frame();
+               current_video_frame = video_frame_allocator->create_frame(width, height, stride);
                if (current_video_frame.data != nullptr) {
                        const uint8_t *src;
                        video_frame->GetBytes((void **)&src);
index bad1a5e166461737c3463684ccc427038f7c3269..e0aa8b1fe27783c685dde0c142a8be586dcb89cf 100644 (file)
@@ -551,7 +551,7 @@ void Mixer::configure_card(unsigned card_index, CaptureInterface *capture, CardT
 
        card->capture->set_frame_callback(bind(&Mixer::bm_frame, this, card_index, _1, _2, _3, _4, _5, _6, _7));
        if (card->frame_allocator == nullptr) {
-               card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height));  // 8 MB.
+               card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height, card_index, mjpeg_encoder.get()));  // 8 MB.
        }
        card->capture->set_video_frame_allocator(card->frame_allocator.get());
        if (card->surface == nullptr) {
@@ -1081,12 +1081,12 @@ void Mixer::thread_func()
                                new_frame->upload_func = nullptr;
                        }
 
-                       // Only bother doing MJPEG encoding if there are any connected clients
-                       // that want the stream. FIXME: We should also stop memcpy-ing if there are none!
-                       if (httpd.get_num_connected_multicam_clients() > 0) {
-                               auto stream_it = global_flags.card_to_mjpeg_stream_export.find(card_index);
-                               if (stream_it != global_flags.card_to_mjpeg_stream_export.end()) {
-                                       mjpeg_encoder->upload_frame(pts_int, stream_it->second, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset);
+                       if (new_frame->frame->data_copy != nullptr) {
+                               int mjpeg_card_index = mjpeg_encoder->get_mjpeg_stream_for_card(card_index);
+                               if (mjpeg_card_index == -1) {
+                                       mjpeg_encoder->finish_frame(new_frame->frame);
+                               } else {
+                                       mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset);
                                }
                        }
                }
index 07e302c4e93148c2b9a83edfc1034e5465f1e8bc..9ae018f81059584621bb383b3f4d6b30c01df0f4 100644 (file)
@@ -30,12 +30,6 @@ using namespace std;
 
 extern void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_t dst_pitch, size_t height);
 
-#define CHECK_VASTATUS(va_status, func)                                 \
-    if (va_status != VA_STATUS_SUCCESS) {                               \
-        fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
-        exit(1);                                                        \
-    }
-
 // From libjpeg (although it's of course identical between implementations).
 static const int jpeg_natural_order[DCTSIZE2] = {
         0,  1,  8, 16,  9,  2,  3, 10,
@@ -294,6 +288,37 @@ void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFram
        any_frames_to_be_encoded.notify_all();
 }
 
+void MJPEGEncoder::finish_frame(RefCountedFrame frame)
+{
+       PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)frame->userdata;
+
+       if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) {
+               VAResources resources __attribute__((unused)) = move(userdata->va_resources);
+               ReleaseVAResources release = move(userdata->va_resources_release);
+               VAImage image = move(userdata->va_image);
+
+               VAStatus va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf);
+               CHECK_VASTATUS(va_status, "vaUnmapBuffer");
+               va_status = vaDestroyImage(va_dpy->va_dpy, image.image_id);
+               CHECK_VASTATUS(va_status, "vaDestroyImage");
+       }
+}
+
+int MJPEGEncoder::get_mjpeg_stream_for_card(unsigned card_index)
+{
+       // Only bother doing MJPEG encoding if there are any connected clients
+       // that want the stream.
+       if (httpd->get_num_connected_multicam_clients() == 0) {
+               return -1;
+       }
+
+       auto it = global_flags.card_to_mjpeg_stream_export.find(card_index);
+       if (it == global_flags.card_to_mjpeg_stream_export.end()) {
+               return -1;
+       }
+       return it->second;
+}
+
 void MJPEGEncoder::encoder_thread_func()
 {
        pthread_setname_np(pthread_self(), "MJPEG_Encode");
@@ -597,11 +622,20 @@ MJPEGEncoder::VAData MJPEGEncoder::get_va_data_for_resolution(unsigned width, un
 
 void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf)
 {
+       PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)qf.frame->userdata;
        unsigned width = qf.video_format.width;
        unsigned height = qf.video_format.height;
 
-       VAResources resources = get_va_resources(width, height);
-       ReleaseVAResources release(this, resources);
+       VAResources resources;
+       ReleaseVAResources release;
+       if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) {
+               resources = move(userdata->va_resources);
+               release = move(userdata->va_resources_release);
+       } else {
+               assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC);
+               resources = get_va_resources(width, height);
+               release = ReleaseVAResources(this, resources);
+       }
 
        VAData va_data = get_va_data_for_resolution(width, height);
        va_data.pic_param.coded_buf = resources.data_buffer;
@@ -627,20 +661,27 @@ void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf)
        VABufferDestroyer destroy_slice_param(va_dpy->va_dpy, slice_param_buffer);
 
        VAImage image;
-       va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image);
-       CHECK_VASTATUS(va_status, "vaDeriveImage");
+       if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) {
+               // The pixel data is already uploaded by the caller.
+               image = move(userdata->va_image);
+       } else {
+               assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC);
 
-       // Upload the pixel data.
-       uint8_t *surface_p = nullptr;
-       vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p);
+               // Upload the pixel data.
+               va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image);
+               CHECK_VASTATUS(va_status, "vaDeriveImage");
 
-       size_t field_start_line = qf.video_format.extra_lines_top;  // No interlacing support.
-       size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2;
+               uint8_t *surface_p = nullptr;
+               vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p);
 
-       {
-               const uint8_t *src = qf.frame->data_copy + field_start;
-               uint8_t *dst = (unsigned char *)surface_p + image.offsets[0];
-               memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height);
+               size_t field_start_line = qf.video_format.extra_lines_top;  // No interlacing support.
+               size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2;
+
+               {
+                       const uint8_t *src = qf.frame->data_copy + field_start;
+                       uint8_t *dst = (unsigned char *)surface_p + image.offsets[0];
+                       memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height);
+               }
        }
 
        va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf);
index 8b68294a48d9e01dfd0254b213b2770d57e0330d..b7b20431825b3fabffcf5da22f97c6ced4b5bb5d 100644 (file)
@@ -27,6 +27,12 @@ struct jpeg_compress_struct;
 struct VADisplayWithCleanup;
 struct VectorDestinationManager;
 
+#define CHECK_VASTATUS(va_status, func)                                 \
+    if (va_status != VA_STATUS_SUCCESS) {                               \
+        fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+        exit(1);                                                        \
+    }
+
 class MJPEGEncoder {
 public:
        MJPEGEncoder(HTTPD *httpd, const std::string &va_display);
@@ -34,6 +40,16 @@ public:
        void stop();
        void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset);
 
+       // If the frame was started (data_copy != nullptr) but will not be finished
+       // (MJPEG encoding was turned off in the meantime), you'll need to call finish_frame()
+       // to release any VA-API resources.
+       void finish_frame(RefCountedFrame frame);
+
+       bool using_vaapi() const { return va_dpy != nullptr; }
+
+       // Returns -1 for inactive (ie., don't encode frames for this card right now).
+       int get_mjpeg_stream_for_card(unsigned card_index);
+
 private:
        static constexpr int quality = 90;
 
@@ -153,6 +169,8 @@ private:
        std::atomic<int64_t> metric_mjpeg_frames_oversized_dropped{0};
        std::atomic<int64_t> metric_mjpeg_overrun_dropped{0};
        std::atomic<int64_t> metric_mjpeg_overrun_submitted{0};
+
+       friend class PBOFrameAllocator;  // FIXME
 };
 
 #endif  // !defined(_MJPEG_ENCODER_H)
index 4c1a55bcd0d4d47aeaffdff324b29f59780d9fdd..d0859b357d98a313fb40587b2663dad517c7e2e5 100644 (file)
@@ -8,7 +8,9 @@
 #include <cstddef>
 
 #include "flags.h"
+#include "mjpeg_encoder.h"
 #include "v210_converter.h"
+#include "va_display_with_cleanup.h"
 
 using namespace std;
 
@@ -26,8 +28,8 @@ void set_clamp_to_edge()
 
 }  // namespace
 
-PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits)
-        : pixel_format(pixel_format), buffer(buffer)
+PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, unsigned card_index, MJPEGEncoder *mjpeg_encoder, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits)
+        : card_index(card_index), mjpeg_encoder(mjpeg_encoder), pixel_format(pixel_format), buffer(buffer)
 {
        userdata.reset(new Userdata[num_queued_frames]);
        for (size_t i = 0; i < num_queued_frames; ++i) {
@@ -52,13 +54,13 @@ void PBOFrameAllocator::init_frame(size_t frame_idx, size_t frame_size, GLuint w
        Frame frame;
        frame.data = (uint8_t *)glMapBufferRange(buffer, 0, frame_size, permissions | map_bits | GL_MAP_PERSISTENT_BIT);
        frame.data2 = frame.data + frame_size / 2;
-       frame.data_copy = new uint8_t[frame_size];
        check_error();
        frame.size = frame_size;
        Userdata *ud = &userdata[frame_idx];
        frame.userdata = ud;
        ud->pbo = pbo;
        ud->pixel_format = pixel_format;
+       ud->data_copy_malloc = new uint8_t[frame_size];
        frame.owner = this;
 
        // For 8-bit non-planar Y'CbCr, we ask the driver to split Y' and Cb/Cr
@@ -217,7 +219,7 @@ PBOFrameAllocator::~PBOFrameAllocator()
 void PBOFrameAllocator::destroy_frame(Frame *frame)
 {
        Userdata *ud = (Userdata *)frame->userdata;
-       delete[] frame->data_copy;
+       delete[] ud->data_copy_malloc;
 
        GLuint pbo = ud->pbo;
        glBindBuffer(buffer, pbo);
@@ -273,6 +275,71 @@ bmusb::FrameAllocator::Frame PBOFrameAllocator::alloc_frame()
        }
        vf.len = 0;
        vf.overflow = 0;
+
+       if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() &&
+           mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) {
+               Userdata *ud = (Userdata *)vf.userdata;
+               vf.data_copy = ud->data_copy_malloc;
+               ud->data_copy_current_src = Userdata::FROM_MALLOC;
+       } else {
+               vf.data_copy = nullptr;
+       }
+
+       return vf;
+}
+
+bmusb::FrameAllocator::Frame PBOFrameAllocator::create_frame(size_t width, size_t height, size_t stride)
+{
+        Frame vf;
+
+       {
+               lock_guard<mutex> lock(freelist_mutex);
+               if (freelist.empty()) {
+                       printf("Frame overrun (no more spare PBO frames), dropping frame!\n");
+                       vf.len = 0;
+                       vf.overflow = 0;
+                       return vf;
+               } else {
+                       vf = freelist.front();
+                       freelist.pop();
+               }
+       }
+       vf.len = 0;
+       vf.overflow = 0;
+
+       Userdata *userdata = (Userdata *)vf.userdata;
+
+       if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() &&
+           mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) {
+               VADisplay va_dpy = mjpeg_encoder->va_dpy->va_dpy;
+               MJPEGEncoder::VAResources resources = mjpeg_encoder->get_va_resources(width, height);
+               MJPEGEncoder::ReleaseVAResources release(mjpeg_encoder, resources);
+
+               VAImage image;
+               VAStatus va_status = vaDeriveImage(va_dpy, resources.surface, &image);
+               CHECK_VASTATUS(va_status, "vaDeriveImage");
+
+               if (image.pitches[0] == stride) {
+                       userdata->va_resources = move(resources);
+                       userdata->va_resources_release = move(release);
+                       userdata->va_image = move(image);
+
+                       va_status = vaMapBuffer(va_dpy, image.buf, (void **)&vf.data_copy);
+                       CHECK_VASTATUS(va_status, "vaMapBuffer");
+                       vf.data_copy += image.offsets[0];
+                       userdata->data_copy_current_src = Userdata::FROM_VA_API;
+               } else {
+                       printf("WARNING: Could not copy directly into VA-API MJPEG buffer for %zu x %zu, since producer and consumer disagreed on stride (%zu != %d).\n", width, height, stride, image.pitches[0]);
+                       vf.data_copy = userdata->data_copy_malloc;
+                       userdata->data_copy_current_src = Userdata::FROM_MALLOC;
+
+                       va_status = vaDestroyImage(va_dpy, image.image_id);
+                       CHECK_VASTATUS(va_status, "vaDestroyImage");
+               }
+       } else {
+               vf.data_copy = nullptr;
+       }
+
        return vf;
 }
 
index ab51f6bc108839297363dd0ede80860dacfb4481..a7ae92e93a9bc0620af7b8606a143ff1686128ab 100644 (file)
@@ -11,6 +11,9 @@
 #include <movit/ycbcr.h>
 
 #include "bmusb/bmusb.h"
+#include "mjpeg_encoder.h"
+
+class MJPEGEncoder;
 
 // An allocator that allocates straight into OpenGL pinned memory.
 // Meant for video frames only. We use a queue rather than a stack,
@@ -22,12 +25,15 @@ public:
        PBOFrameAllocator(bmusb::PixelFormat pixel_format,
                          size_t frame_size,
                          GLuint width, GLuint height,
+                         unsigned card_index,
+                         MJPEGEncoder *mjpeg_encoder = nullptr,
                          size_t num_queued_frames = 16,
                          GLenum buffer = GL_PIXEL_UNPACK_BUFFER_ARB,
                          GLenum permissions = GL_MAP_WRITE_BIT,
                          GLenum map_bits = GL_MAP_FLUSH_EXPLICIT_BIT);
        ~PBOFrameAllocator() override;
        Frame alloc_frame() override;
+       Frame create_frame(size_t width, size_t height, size_t stride) override;
        void release_frame(Frame frame) override;
 
        struct Userdata {
@@ -54,12 +60,34 @@ public:
                unsigned last_frame_rate_nom, last_frame_rate_den;
                bool has_last_subtitle = false;
                std::string last_subtitle;
+
+               // These are the source of the “data_copy” member in Frame,
+               // used for MJPEG encoding. There are three possibilities:
+               //
+               //  - MJPEG encoding is not active (at all, or for this specific
+               //    card). Then data_copy is nullptr, and what's in here
+               //    does not matter at all.
+               //  - We can encode directly into VA-API buffers (ie., VA-API
+               //    is active, and nothing strange happened wrt. strides);
+               //    then va_resources, va_resources_release and va_image
+               //    are fetched from MJPEGEncoder at create_frame() and released
+               //    back when the frame is uploaded (or would have been).
+               //    In this case, data_copy points into the mapped VAImage.
+               //  - If not, data_copy points to data_copy_malloc, and is copied
+               //    from there into VA-API buffers (by MJPEGEncoder) if needed.
+               enum { FROM_MALLOC, FROM_VA_API } data_copy_current_src;
+               uint8_t *data_copy_malloc;
+               MJPEGEncoder::VAResources va_resources;
+               MJPEGEncoder::ReleaseVAResources va_resources_release;
+               VAImage va_image;
        };
 
 private:
        void init_frame(size_t frame_idx, size_t frame_size, GLuint width, GLuint height, GLenum permissions, GLenum map_bits);
        void destroy_frame(Frame *frame);
 
+       unsigned card_index;
+       MJPEGEncoder *mjpeg_encoder;
        bmusb::PixelFormat pixel_format;
        std::mutex freelist_mutex;
        std::queue<Frame> freelist;