From a839022c035b3d9387feabc02843c166ac78b469 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sun, 17 Mar 2019 22:53:36 +0100 Subject: [PATCH] When uploading MJPEG data to VA-API, do it directly into the buffer. MIME-Version: 1.0 Content-Type: text/plain; charset=utf8 Content-Transfer-Encoding: 8bit Besides the obvious of spending less time copying, this has two positive effects: - The VA-API thread is no longer a choke point; uploading can happen from multiple cores. - With one copy less, we seem to be reducing L3 cache pressure a bit; at some point between five and six 1080p sources, we “fall off a cliff” wrt. the L3 and start thrashing. This doesn't fix the issue, but alleviates it somewhat. All in all, we seem to go down from ~2.6 to ~2.1–2.2 cores used with one 720p channel and five 1080p channels. I haven't tried saturating channels yet to see how many we can actually encode. --- nageru/bmusb | 2 +- nageru/decklink_capture.cpp | 2 +- nageru/mixer.cpp | 14 +++--- nageru/mjpeg_encoder.cpp | 79 ++++++++++++++++++++++++++-------- nageru/mjpeg_encoder.h | 18 ++++++++ nageru/pbo_frame_allocator.cpp | 75 ++++++++++++++++++++++++++++++-- nageru/pbo_frame_allocator.h | 28 ++++++++++++ 7 files changed, 186 insertions(+), 32 deletions(-) diff --git a/nageru/bmusb b/nageru/bmusb index 5163d25..03e3889 160000 --- a/nageru/bmusb +++ b/nageru/bmusb @@ -1 +1 @@ -Subproject commit 5163d25c65c3028090db1aea6587ec2fb4cb823e +Subproject commit 03e38890b599efe6ac906fdb70b43cda63f11d01 diff --git a/nageru/decklink_capture.cpp b/nageru/decklink_capture.cpp index a09aefa..f7016dc 100644 --- a/nageru/decklink_capture.cpp +++ b/nageru/decklink_capture.cpp @@ -252,7 +252,7 @@ HRESULT STDMETHODCALLTYPE DeckLinkCapture::VideoInputFrameArrived( assert(stride == width * 2); } - current_video_frame = video_frame_allocator->alloc_frame(); + current_video_frame = video_frame_allocator->create_frame(width, height, stride); if (current_video_frame.data != nullptr) { const uint8_t *src; video_frame->GetBytes((void **)&src); diff --git a/nageru/mixer.cpp b/nageru/mixer.cpp index bad1a5e..e0aa8b1 100644 --- a/nageru/mixer.cpp +++ b/nageru/mixer.cpp @@ -551,7 +551,7 @@ void Mixer::configure_card(unsigned card_index, CaptureInterface *capture, CardT card->capture->set_frame_callback(bind(&Mixer::bm_frame, this, card_index, _1, _2, _3, _4, _5, _6, _7)); if (card->frame_allocator == nullptr) { - card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height)); // 8 MB. + card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height, card_index, mjpeg_encoder.get())); // 8 MB. } card->capture->set_video_frame_allocator(card->frame_allocator.get()); if (card->surface == nullptr) { @@ -1081,12 +1081,12 @@ void Mixer::thread_func() new_frame->upload_func = nullptr; } - // Only bother doing MJPEG encoding if there are any connected clients - // that want the stream. FIXME: We should also stop memcpy-ing if there are none! 
- if (httpd.get_num_connected_multicam_clients() > 0) { - auto stream_it = global_flags.card_to_mjpeg_stream_export.find(card_index); - if (stream_it != global_flags.card_to_mjpeg_stream_export.end()) { - mjpeg_encoder->upload_frame(pts_int, stream_it->second, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset); + if (new_frame->frame->data_copy != nullptr) { + int mjpeg_card_index = mjpeg_encoder->get_mjpeg_stream_for_card(card_index); + if (mjpeg_card_index == -1) { + mjpeg_encoder->finish_frame(new_frame->frame); + } else { + mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset); } } } diff --git a/nageru/mjpeg_encoder.cpp b/nageru/mjpeg_encoder.cpp index 07e302c..9ae018f 100644 --- a/nageru/mjpeg_encoder.cpp +++ b/nageru/mjpeg_encoder.cpp @@ -30,12 +30,6 @@ using namespace std; extern void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_t dst_pitch, size_t height); -#define CHECK_VASTATUS(va_status, func) \ - if (va_status != VA_STATUS_SUCCESS) { \ - fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \ - exit(1); \ - } - // From libjpeg (although it's of course identical between implementations). static const int jpeg_natural_order[DCTSIZE2] = { 0, 1, 8, 16, 9, 2, 3, 10, @@ -294,6 +288,37 @@ void MJPEGEncoder::upload_frame(int64_t pts, unsigned card_index, RefCountedFram any_frames_to_be_encoded.notify_all(); } +void MJPEGEncoder::finish_frame(RefCountedFrame frame) +{ + PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)frame->userdata; + + if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { + VAResources resources __attribute__((unused)) = move(userdata->va_resources); + ReleaseVAResources release = move(userdata->va_resources_release); + VAImage image = move(userdata->va_image); + + VAStatus va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf); + CHECK_VASTATUS(va_status, "vaUnmapBuffer"); + va_status = vaDestroyImage(va_dpy->va_dpy, image.image_id); + CHECK_VASTATUS(va_status, "vaDestroyImage"); + } +} + +int MJPEGEncoder::get_mjpeg_stream_for_card(unsigned card_index) +{ + // Only bother doing MJPEG encoding if there are any connected clients + // that want the stream. 
+ if (httpd->get_num_connected_multicam_clients() == 0) { + return -1; + } + + auto it = global_flags.card_to_mjpeg_stream_export.find(card_index); + if (it == global_flags.card_to_mjpeg_stream_export.end()) { + return -1; + } + return it->second; +} + void MJPEGEncoder::encoder_thread_func() { pthread_setname_np(pthread_self(), "MJPEG_Encode"); @@ -597,11 +622,20 @@ MJPEGEncoder::VAData MJPEGEncoder::get_va_data_for_resolution(unsigned width, un void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf) { + PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)qf.frame->userdata; unsigned width = qf.video_format.width; unsigned height = qf.video_format.height; - VAResources resources = get_va_resources(width, height); - ReleaseVAResources release(this, resources); + VAResources resources; + ReleaseVAResources release; + if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { + resources = move(userdata->va_resources); + release = move(userdata->va_resources_release); + } else { + assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC); + resources = get_va_resources(width, height); + release = ReleaseVAResources(this, resources); + } VAData va_data = get_va_data_for_resolution(width, height); va_data.pic_param.coded_buf = resources.data_buffer; @@ -627,20 +661,27 @@ void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf) VABufferDestroyer destroy_slice_param(va_dpy->va_dpy, slice_param_buffer); VAImage image; - va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image); - CHECK_VASTATUS(va_status, "vaDeriveImage"); + if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) { + // The pixel data is already uploaded by the caller. + image = move(userdata->va_image); + } else { + assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC); - // Upload the pixel data. - uint8_t *surface_p = nullptr; - vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p); + // Upload the pixel data. + va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image); + CHECK_VASTATUS(va_status, "vaDeriveImage"); - size_t field_start_line = qf.video_format.extra_lines_top; // No interlacing support. - size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2; + uint8_t *surface_p = nullptr; + vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p); - { - const uint8_t *src = qf.frame->data_copy + field_start; - uint8_t *dst = (unsigned char *)surface_p + image.offsets[0]; - memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height); + size_t field_start_line = qf.video_format.extra_lines_top; // No interlacing support. 
+ size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2; + + { + const uint8_t *src = qf.frame->data_copy + field_start; + uint8_t *dst = (unsigned char *)surface_p + image.offsets[0]; + memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height); + } } va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf); diff --git a/nageru/mjpeg_encoder.h b/nageru/mjpeg_encoder.h index 8b68294..b7b2043 100644 --- a/nageru/mjpeg_encoder.h +++ b/nageru/mjpeg_encoder.h @@ -27,6 +27,12 @@ struct jpeg_compress_struct; struct VADisplayWithCleanup; struct VectorDestinationManager; +#define CHECK_VASTATUS(va_status, func) \ + if (va_status != VA_STATUS_SUCCESS) { \ + fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \ + exit(1); \ + } + class MJPEGEncoder { public: MJPEGEncoder(HTTPD *httpd, const std::string &va_display); @@ -34,6 +40,16 @@ public: void stop(); void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset); + // If the frame was started (data_copy != nullptr) but will not be finished + // (MJPEG decoding was turned off in the meantime), you'll need to call finish_frame() + // to release any VA-API resources. + void finish_frame(RefCountedFrame frame); + + bool using_vaapi() const { return va_dpy != nullptr; } + + // Returns -1 for inactive (ie., don't encode frames for this card right now). + int get_mjpeg_stream_for_card(unsigned card_index); + private: static constexpr int quality = 90; @@ -153,6 +169,8 @@ private: std::atomic metric_mjpeg_frames_oversized_dropped{0}; std::atomic metric_mjpeg_overrun_dropped{0}; std::atomic metric_mjpeg_overrun_submitted{0}; + + friend class PBOFrameAllocator; // FIXME }; #endif // !defined(_MJPEG_ENCODER_H) diff --git a/nageru/pbo_frame_allocator.cpp b/nageru/pbo_frame_allocator.cpp index 4c1a55b..d0859b3 100644 --- a/nageru/pbo_frame_allocator.cpp +++ b/nageru/pbo_frame_allocator.cpp @@ -8,7 +8,9 @@ #include #include "flags.h" +#include "mjpeg_encoder.h" #include "v210_converter.h" +#include "va_display_with_cleanup.h" using namespace std; @@ -26,8 +28,8 @@ void set_clamp_to_edge() } // namespace -PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits) - : pixel_format(pixel_format), buffer(buffer) +PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, unsigned card_index, MJPEGEncoder *mjpeg_encoder, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits) + : card_index(card_index), mjpeg_encoder(mjpeg_encoder), pixel_format(pixel_format), buffer(buffer) { userdata.reset(new Userdata[num_queued_frames]); for (size_t i = 0; i < num_queued_frames; ++i) { @@ -52,13 +54,13 @@ void PBOFrameAllocator::init_frame(size_t frame_idx, size_t frame_size, GLuint w Frame frame; frame.data = (uint8_t *)glMapBufferRange(buffer, 0, frame_size, permissions | map_bits | GL_MAP_PERSISTENT_BIT); frame.data2 = frame.data + frame_size / 2; - frame.data_copy = new uint8_t[frame_size]; check_error(); frame.size = frame_size; Userdata *ud = &userdata[frame_idx]; frame.userdata = ud; ud->pbo = pbo; ud->pixel_format = pixel_format; + ud->data_copy_malloc = new uint8_t[frame_size]; frame.owner = this; // For 8-bit non-planar Y'CbCr, we ask the driver to split Y' and Cb/Cr 
@@ -217,7 +219,7 @@ PBOFrameAllocator::~PBOFrameAllocator() void PBOFrameAllocator::destroy_frame(Frame *frame) { Userdata *ud = (Userdata *)frame->userdata; - delete[] frame->data_copy; + delete[] ud->data_copy_malloc; GLuint pbo = ud->pbo; glBindBuffer(buffer, pbo); @@ -273,6 +275,71 @@ bmusb::FrameAllocator::Frame PBOFrameAllocator::alloc_frame() } vf.len = 0; vf.overflow = 0; + + if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() && + mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) { + Userdata *ud = (Userdata *)vf.userdata; + vf.data_copy = ud->data_copy_malloc; + ud->data_copy_current_src = Userdata::FROM_MALLOC; + } else { + vf.data_copy = nullptr; + } + + return vf; +} + +bmusb::FrameAllocator::Frame PBOFrameAllocator::create_frame(size_t width, size_t height, size_t stride) +{ + Frame vf; + + { + lock_guard lock(freelist_mutex); + if (freelist.empty()) { + printf("Frame overrun (no more spare PBO frames), dropping frame!\n"); + vf.len = 0; + vf.overflow = 0; + return vf; + } else { + vf = freelist.front(); + freelist.pop(); + } + } + vf.len = 0; + vf.overflow = 0; + + Userdata *userdata = (Userdata *)vf.userdata; + + if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() && + mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) { + VADisplay va_dpy = mjpeg_encoder->va_dpy->va_dpy; + MJPEGEncoder::VAResources resources = mjpeg_encoder->get_va_resources(width, height); + MJPEGEncoder::ReleaseVAResources release(mjpeg_encoder, resources); + + VAImage image; + VAStatus va_status = vaDeriveImage(va_dpy, resources.surface, &image); + CHECK_VASTATUS(va_status, "vaDeriveImage"); + + if (image.pitches[0] == stride) { + userdata->va_resources = move(resources); + userdata->va_resources_release = move(release); + userdata->va_image = move(image); + + va_status = vaMapBuffer(va_dpy, image.buf, (void **)&vf.data_copy); + CHECK_VASTATUS(va_status, "vaMapBuffer"); + vf.data_copy += image.offsets[0]; + userdata->data_copy_current_src = Userdata::FROM_VA_API; + } else { + printf("WARNING: Could not copy directly into VA-API MJPEG buffer for %zu x %zu, since producer and consumer disagreed on stride (%zu != %d).\n", width, height, stride, image.pitches[0]); + vf.data_copy = userdata->data_copy_malloc; + userdata->data_copy_current_src = Userdata::FROM_MALLOC; + + va_status = vaDestroyImage(va_dpy, image.image_id); + CHECK_VASTATUS(va_status, "vaDestroyImage"); + } + } else { + vf.data_copy = nullptr; + } + return vf; } diff --git a/nageru/pbo_frame_allocator.h b/nageru/pbo_frame_allocator.h index ab51f6b..a7ae92e 100644 --- a/nageru/pbo_frame_allocator.h +++ b/nageru/pbo_frame_allocator.h @@ -11,6 +11,9 @@ #include #include "bmusb/bmusb.h" +#include "mjpeg_encoder.h" + +class MJPEGEncoder; // An allocator that allocates straight into OpenGL pinned memory. // Meant for video frames only. 
We use a queue rather than a stack, @@ -22,12 +25,15 @@ public: PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, + unsigned card_index, + MJPEGEncoder *mjpeg_encoder = nullptr, size_t num_queued_frames = 16, GLenum buffer = GL_PIXEL_UNPACK_BUFFER_ARB, GLenum permissions = GL_MAP_WRITE_BIT, GLenum map_bits = GL_MAP_FLUSH_EXPLICIT_BIT); ~PBOFrameAllocator() override; Frame alloc_frame() override; + Frame create_frame(size_t width, size_t height, size_t stride) override; void release_frame(Frame frame) override; struct Userdata { @@ -54,12 +60,34 @@ public: unsigned last_frame_rate_nom, last_frame_rate_den; bool has_last_subtitle = false; std::string last_subtitle; + + // These are the source of the “data_copy” member in Frame, + // used for MJPEG encoding. There are three possibilities: + // + // - MJPEG encoding is not active (at all, or for this specific + // card). Then data_copy is nullptr, and what's in here + // does not matter at all. + // - We can encode directly into VA-API buffers (ie., VA-API + // is active, and nothing strange happened wrt. strides); + // then va_resources, va_resources_release and va_image + // are fetched from MJPEGEncoder at create_frame() and released + // back when the frame is uploaded (or would have been). + // In this case, data_copy points into the mapped VAImage. + // - If not, data_copy points to data_copy_malloc, and is copied + // from there into VA-API buffers (by MJPEGEncoder) if needed. + enum { FROM_MALLOC, FROM_VA_API } data_copy_current_src; + uint8_t *data_copy_malloc; + MJPEGEncoder::VAResources va_resources; + MJPEGEncoder::ReleaseVAResources va_resources_release; + VAImage va_image; }; private: void init_frame(size_t frame_idx, size_t frame_size, GLuint width, GLuint height, GLenum permissions, GLenum map_bits); void destroy_frame(Frame *frame); + unsigned card_index; + MJPEGEncoder *mjpeg_encoder; bmusb::PixelFormat pixel_format; std::mutex freelist_mutex; std::queue freelist; -- 2.39.2
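
The core of the change, pulled out of the diff context: PBOFrameAllocator::create_frame() now tries to derive and map a VAImage for a VA-API surface up front, so the capture thread writes its copy of the pixels straight into the buffer the GPU will encode from; MJPEGEncoder::encode_jpeg_va() then only needs to unmap, and MJPEGEncoder::finish_frame() releases the mapping for frames that never get encoded. The sketch below shows that pattern in isolation. It is a minimal illustration, not code from the patch: the MappedDestination struct and the map_destination()/finish_destination() helpers are invented names, and the VADisplay, surface and malloc fallback buffer are assumed to come from elsewhere (in the patch, the equivalent state lives in MJPEGEncoder and PBOFrameAllocator::Userdata).

#include <va/va.h>

#include <cstdint>
#include <cstdio>
#include <cstdlib>

// Same error-handling style as the CHECK_VASTATUS macro the patch moves
// into mjpeg_encoder.h.
#define CHECK_VASTATUS(va_status, func) \
	if (va_status != VA_STATUS_SUCCESS) { \
		fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
		exit(1); \
	}

// Where the capture thread should write its copy of the incoming frame.
// (Illustrative only; the patch keeps the equivalent state in Userdata,
// tagged with FROM_MALLOC/FROM_VA_API.)
struct MappedDestination {
	bool direct = false;     // true = writing straight into the VA-API buffer
	uint8_t *dst = nullptr;  // where the producer should write pixels
	VAImage image{};         // only valid if direct == true
};

// Try to map the surface so the producer can write into it directly.
// Falls back to a plain malloc'ed buffer if the driver's pitch does not
// match the stride the producer will use (the same condition create_frame()
// checks before committing to the zero-copy path).
MappedDestination map_destination(VADisplay va_dpy, VASurfaceID surface,
                                  size_t stride, uint8_t *malloc_fallback)
{
	MappedDestination d;

	VAStatus va_status = vaDeriveImage(va_dpy, surface, &d.image);
	CHECK_VASTATUS(va_status, "vaDeriveImage");

	if (d.image.pitches[0] == stride) {
		void *ptr = nullptr;
		va_status = vaMapBuffer(va_dpy, d.image.buf, &ptr);
		CHECK_VASTATUS(va_status, "vaMapBuffer");
		d.dst = reinterpret_cast<uint8_t *>(ptr) + d.image.offsets[0];
		d.direct = true;
	} else {
		// Pitch mismatch; give the image back and let the producer
		// write into the malloc'ed buffer as before.
		va_status = vaDestroyImage(va_dpy, d.image.image_id);
		CHECK_VASTATUS(va_status, "vaDestroyImage");
		d.dst = malloc_fallback;
	}
	return d;
}

// Must be called once per mapped frame, whether it is encoded or dropped;
// afterwards the surface already holds the pixels and can be fed to the
// JPEG encode without any further copy.
void finish_destination(VADisplay va_dpy, MappedDestination *d)
{
	if (!d->direct) {
		return;
	}
	VAStatus va_status = vaUnmapBuffer(va_dpy, d->image.buf);
	CHECK_VASTATUS(va_status, "vaUnmapBuffer");
	va_status = vaDestroyImage(va_dpy, d->image.image_id);
	CHECK_VASTATUS(va_status, "vaDestroyImage");
}

The stride check is what makes the fallback safe: if the pitch reported by vaDeriveImage() differs from what the producer will write, the frame silently goes back to the old data_copy_malloc + memcpy_with_pitch() path. And since mapping now happens at allocation time rather than encode time, the unmap has to happen even for frames that are never encoded (eg. if MJPEG export is turned off in the meantime), which is exactly what the new MJPEGEncoder::finish_frame() is for.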