From 2abf57fbc06f52c04fb2ca1f765459908e688890 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson" <sgunderson@bigfoot.com>
Date: Thu, 16 Mar 2017 19:43:11 +0100
Subject: [PATCH] If not using VA-API zerocopy, don't write extra copy
 textures.

Saves some precious memory bandwidth.
---
 flags.h                  |  1 +
 mixer.cpp                | 78 ++++++++++++++++++++++++++--------------
 quicksync_encoder.cpp    | 41 +++++++++++++--------
 quicksync_encoder.h      |  3 ++
 quicksync_encoder_impl.h |  4 +--
 theme.cpp                |  8 ++++-
 video_encoder.cpp        |  6 ++++
 video_encoder.h          | 15 +++++++-
 8 files changed, 110 insertions(+), 46 deletions(-)

diff --git a/flags.h b/flags.h
index bd962fe..caeaf2b 100644
--- a/flags.h
+++ b/flags.h
@@ -57,6 +57,7 @@ struct Flags {
 	bool ten_bit_input = false;
 	bool ten_bit_output = false;  // Implies x264_video_to_disk == true and x264_bit_depth == 10.
 	int x264_bit_depth = 8;  // Not user-settable.
+	bool use_zerocopy = false;  // Not user-settable.
 };
 extern Flags global_flags;
 
diff --git a/mixer.cpp b/mixer.cpp
index 25fa3e4..0c00d14 100644
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -193,7 +193,6 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
 	movit_texel_subpixel_precision /= 2.0;
 
 	resource_pool.reset(new ResourcePool);
-	theme.reset(new Theme(global_flags.theme_filename, global_flags.theme_dirs, resource_pool.get(), num_cards));
 	for (unsigned i = 0; i < NUM_OUTPUTS; ++i) {
 		output_channel[i].parent = this;
 		output_channel[i].channel = i;
@@ -231,6 +230,9 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
 
 	video_encoder.reset(new VideoEncoder(resource_pool.get(), h264_encoder_surface, global_flags.va_display, global_flags.width, global_flags.height, &httpd, global_disk_space_estimator));
 
+	// Must be instantiated after VideoEncoder has initialized global_flags.use_zerocopy.
+	theme.reset(new Theme(global_flags.theme_filename, global_flags.theme_dirs, resource_pool.get(), num_cards));
+
 	// Start listening for clients only once VideoEncoder has written its header, if any.
 	httpd.start(9095);
 
@@ -1038,29 +1040,47 @@ void Mixer::render_one_frame(int64_t duration)
 	output_ycbcr_format.num_levels = 1 << global_flags.x264_bit_depth;
 	chain->change_ycbcr_output_format(output_ycbcr_format);
 
-	const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE);  // Corresponds to the delay in ResamplingQueue.
-	GLuint y_tex, cbcr_tex;
-	bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, ycbcr_output_coefficients, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
-	assert(got_frame);
-
-	// Render main chain. We take an extra copy of the created outputs,
+	// Render main chain. If we're using zerocopy Quick Sync encoding
+	// (the default case), we take an extra copy of the created outputs,
 	// so that we can display it back to the screen later (it's less memory
 	// bandwidth than writing and reading back an RGBA texture, even at 16-bit).
 	// Ideally, we'd like to avoid taking copies and just use the main textures
-	// for display as well, but if they're used for zero-copy Quick Sync encoding
-	// (the default case), they're just views into VA-API memory and must be
+	// for display as well, but they're just views into VA-API memory and must be
 	// unmapped during encoding, so we can't use them for display, unfortunately.
-	GLuint cbcr_full_tex, cbcr_copy_tex, y_copy_tex;
-	if (global_flags.x264_bit_depth > 8) {
-		cbcr_full_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width, global_flags.height);
-		y_copy_tex = resource_pool->create_2d_texture(GL_R16, global_flags.width, global_flags.height);
-		cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width / 2, global_flags.height / 2);
+	GLuint y_tex, cbcr_full_tex, cbcr_tex;
+	GLuint y_copy_tex, cbcr_copy_tex = 0;
+	GLuint y_display_tex, cbcr_display_tex;
+	GLenum y_type = (global_flags.x264_bit_depth > 8) ? GL_R16 : GL_R8;
+	GLenum cbcr_type = (global_flags.x264_bit_depth > 8) ? GL_RG16 : GL_RG8;
+	const bool is_zerocopy = video_encoder->is_zerocopy();
+	if (is_zerocopy) {
+		cbcr_full_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width, global_flags.height);
+		y_copy_tex = resource_pool->create_2d_texture(y_type, global_flags.width, global_flags.height);
+		cbcr_copy_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width / 2, global_flags.height / 2);
+
+		y_display_tex = y_copy_tex;
+		cbcr_display_tex = cbcr_copy_tex;
+
+		// y_tex and cbcr_tex will be given by VideoEncoder.
 	} else {
-		cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height);
-		y_copy_tex = resource_pool->create_2d_texture(GL_R8, global_flags.width, global_flags.height);
-		cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2);
+		cbcr_full_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width, global_flags.height);
+		y_tex = resource_pool->create_2d_texture(y_type, global_flags.width, global_flags.height);
+		cbcr_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width / 2, global_flags.height / 2);
+
+		y_display_tex = y_tex;
+		cbcr_display_tex = cbcr_tex;
+	}
+
+	const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE);  // Corresponds to the delay in ResamplingQueue.
+	bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, ycbcr_output_coefficients, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
+	assert(got_frame);
+
+	GLuint fbo;
+	if (is_zerocopy) {
+		fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex);
+	} else {
+		fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex);
 	}
-	GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex);
 	check_error();
 	chain->render_to_fbo(fbo, global_flags.width, global_flags.height);
 
@@ -1071,20 +1091,24 @@ void Mixer::render_one_frame(int64_t duration)
 
 	resource_pool->release_fbo(fbo);
 
-	chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
+	if (is_zerocopy) {
+		chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
+	} else {
+		chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex);
+	}
 	if (output_card_index != -1) {
 		cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, ycbcr_output_coefficients, theme_main_chain.input_frames, pts_int, duration);
 	}
 	resource_pool->release_2d_texture(cbcr_full_tex);
 
-	// Set the right state for the Y' and CbCr copies.
+	// Set the right state for the Y' and CbCr textures we use for display.
 	glBindFramebuffer(GL_FRAMEBUFFER, 0);
-	glBindTexture(GL_TEXTURE_2D, y_copy_tex);
+	glBindTexture(GL_TEXTURE_2D, y_display_tex);
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
-	glBindTexture(GL_TEXTURE_2D, cbcr_copy_tex);
+	glBindTexture(GL_TEXTURE_2D, cbcr_display_tex);
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
 	glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
@@ -1092,16 +1116,16 @@ void Mixer::render_one_frame(int64_t duration)
 	RefCountedGLsync fence = video_encoder->end_frame();
 
 	// The live frame pieces the Y'CbCr texture copies back into RGB and displays them.
-	// It owns y_copy_tex and cbcr_copy_tex now.
+	// It owns y_display_tex and cbcr_display_tex now (whichever textures they are).
 	DisplayFrame live_frame;
 	live_frame.chain = display_chain.get();
-	live_frame.setup_chain = [this, y_copy_tex, cbcr_copy_tex]{
-		display_input->set_texture_num(0, y_copy_tex);
-		display_input->set_texture_num(1, cbcr_copy_tex);
+	live_frame.setup_chain = [this, y_display_tex, cbcr_display_tex]{
+		display_input->set_texture_num(0, y_display_tex);
+		display_input->set_texture_num(1, cbcr_display_tex);
 	};
 	live_frame.ready_fence = fence;
 	live_frame.input_frames = {};
-	live_frame.temp_textures = { y_copy_tex, cbcr_copy_tex };
+	live_frame.temp_textures = { y_display_tex, cbcr_display_tex };
 	output_channel[OUTPUT_LIVE].output_frame(live_frame);
 
 	// Set up preview and any additional channels.
diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp
index 635a95a..3a689d3 100644
--- a/quicksync_encoder.cpp
+++ b/quicksync_encoder.cpp
@@ -736,6 +736,7 @@ void QuickSyncEncoderImpl::enable_zerocopy_if_possible()
 	} else {
 		use_zerocopy = true;
 	}
+	global_flags.use_zerocopy = use_zerocopy;
 }
 
 VADisplay QuickSyncEncoderImpl::va_open_display(const string &va_display)
@@ -994,16 +995,7 @@ int QuickSyncEncoderImpl::setup_encode()
 			gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, 1, 1);
 			gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, 1, 1);
 		} else {
-			size_t bytes_per_pixel;
-			if (global_flags.x264_bit_depth > 8) {
-				bytes_per_pixel = 2;
-				gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R16, frame_width, frame_height);
-				gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG16, frame_width / 2, frame_height / 2);
-			} else {
-				bytes_per_pixel = 1;
-				gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, frame_width, frame_height);
-				gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, frame_width / 2, frame_height / 2);
-			}
+			size_t bytes_per_pixel = (global_flags.x264_bit_depth > 8) ? 2 : 1;
 
 			// Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API
 			// buffers, due to potentially differing pitch.
@@ -1516,14 +1508,15 @@ void QuickSyncEncoderImpl::release_gl_resources()
 	}
 
 	for (unsigned i = 0; i < SURFACE_NUM; i++) {
-		if (!use_zerocopy) {
+		if (use_zerocopy) {
+			resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
+			resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
+		} else {
 			glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
 			glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
 			glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
 			glDeleteBuffers(1, &gl_surfaces[i].pbo);
 		}
-		resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
-		resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
 	}
 
 	has_released_gl_resources = true;
@@ -1611,6 +1604,11 @@ void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num)
 	}
 }
 
+bool QuickSyncEncoderImpl::is_zerocopy() const
+{
+	return use_zerocopy;
+}
+
 bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
 	assert(!is_shutdown);
@@ -1634,8 +1632,13 @@ bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaC
 		surface_for_frame[current_storage_frame] = surf;
 	}
 
-	*y_tex = surf->y_tex;
-	*cbcr_tex = surf->cbcr_tex;
+	if (use_zerocopy) {
+		*y_tex = surf->y_tex;
+		*cbcr_tex = surf->cbcr_tex;
+	} else {
+		surf->y_tex = *y_tex;
+		surf->cbcr_tex = *cbcr_tex;
+	}
 
 	if (!global_flags.x264_video_to_disk) {
 		VAStatus va_status = vaDeriveImage(va_dpy, surf->src_surface, &surf->surface_image);
@@ -1727,6 +1730,9 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame()
 		glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, type, BUFFER_OFFSET(surf->cbcr_offset));
 		check_error();
 
+		// We don't own these; the caller does.
+		surf->y_tex = surf->cbcr_tex = 0;
+
 		glBindTexture(GL_TEXTURE_2D, 0);
 		check_error();
 		glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -2080,6 +2086,11 @@ void QuickSyncEncoder::add_audio(int64_t pts, vector<float> audio)
 	impl->add_audio(pts, audio);
 }
 
+bool QuickSyncEncoder::is_zerocopy() const
+{
+	return impl->is_zerocopy();
+}
+
 bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
 	return impl->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
diff --git a/quicksync_encoder.h b/quicksync_encoder.h
index eebabbd..f4e9e0b 100644
--- a/quicksync_encoder.h
+++ b/quicksync_encoder.h
@@ -66,6 +66,9 @@ public:
 
 	void set_stream_mux(Mux *mux);  // Does not take ownership. Must be called unless x264 is used for the stream.
 	void add_audio(int64_t pts, std::vector<float> audio);
+	bool is_zerocopy() const;
+
+	// See VideoEncoder::begin_frame().
 	bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
 	RefCountedGLsync end_frame();
 	void shutdown();  // Blocking. Does not require an OpenGL context.
diff --git a/quicksync_encoder_impl.h b/quicksync_encoder_impl.h
index 1846132..917420c 100644
--- a/quicksync_encoder_impl.h
+++ b/quicksync_encoder_impl.h
@@ -36,6 +36,7 @@ public:
 	QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator);
 	~QuickSyncEncoderImpl();
 	void add_audio(int64_t pts, std::vector<float> audio);
+	bool is_zerocopy() const;
 	bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
 	RefCountedGLsync end_frame();
 	void shutdown();
@@ -68,14 +69,13 @@ private:
 		movit::YCbCrLumaCoefficients ycbcr_coefficients;
 	};
 	struct GLSurface {
-		GLuint y_tex, cbcr_tex;
-
 		// Only if x264_video_to_disk == false.
 		VASurfaceID src_surface, ref_surface;
 		VABufferID coded_buf;
 		VAImage surface_image;
 
 		// Only if use_zerocopy == true (which implies x264_video_to_disk == false).
+		GLuint y_tex, cbcr_tex;
 		EGLImage y_egl_image, cbcr_egl_image;
 
 		// Only if use_zerocopy == false.
diff --git a/theme.cpp b/theme.cpp
index e5002bc..0cc498e 100644
--- a/theme.cpp
+++ b/theme.cpp
@@ -279,7 +279,13 @@ int EffectChain_finalize(lua_State* L)
 		GLenum type = global_flags.x264_bit_depth > 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE;
 
 		chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR, type);
-		chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, type);  // Add a copy where we'll only be using the Y component.
+
+		// If we're using zerocopy video encoding (so the destination
+		// Y texture is owned by VA-API and will be unavailable for
+		// display), add a copy, where we'll only be using the Y component.
+		if (global_flags.use_zerocopy) {
+			chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, type);  // Add a copy where we'll only be using the Y component.
+		}
 		chain->set_dither_bits(global_flags.x264_bit_depth > 8 ? 16 : 8);
 		chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
 	} else {
diff --git a/video_encoder.cpp b/video_encoder.cpp
index b7e36bd..6a4ebf7 100644
--- a/video_encoder.cpp
+++ b/video_encoder.cpp
@@ -140,6 +140,12 @@ void VideoEncoder::add_audio(int64_t pts, std::vector<float> audio)
 	stream_audio_encoder->encode_audio(audio, pts + quicksync_encoder->global_delay());
 }
 
+bool VideoEncoder::is_zerocopy() const
+{
+	lock_guard<mutex> lock(qs_mu);
+	return quicksync_encoder->is_zerocopy();
+}
+
 bool VideoEncoder::begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
 	lock_guard<mutex> lock(qs_mu);
diff --git a/video_encoder.h b/video_encoder.h
index e1518ae..d51399e 100644
--- a/video_encoder.h
+++ b/video_encoder.h
@@ -42,9 +42,22 @@ public:
 
 	void add_audio(int64_t pts, std::vector<float> audio);
 
+	bool is_zerocopy() const;
+
 	// Allocate a frame to render into. The returned two textures
 	// are yours to render into (build them into an FBO).
 	// Call end_frame() when you're done.
+	//
+	// The semantics of y_tex and cbcr_tex depend on is_zerocopy():
+	//
+	//   - If false, they are input parameters, i.e., the caller
+	//     allocates textures. (The contents are not read before
+	//     end_frame() is called.)
+	//   - If true, they are output parameters, i.e., VideoEncoder
+	//     allocates textures and lends them to you for rendering.
+	//     In this case, after end_frame(), you are no longer allowed
+	//     to use the textures; they are torn down and given to the
+	//     H.264 encoder.
 	bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
 
 	// Call after you are done rendering into the frame; at this point,
@@ -65,7 +78,7 @@ private:
 	int write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
 
 	AVOutputFormat *oformat;
-	std::mutex qs_mu;
+	mutable std::mutex qs_mu;
 	std::unique_ptr<QuickSyncEncoder> quicksync_encoder;  // Under <qs_mu>.
 	movit::ResourcePool *resource_pool;
 	QSurface *surface;
-- 
2.39.2