If not using VA-API zerocopy, don't write extra copy textures.

author Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)

committer Steinar H. Gunderson <sgunderson@bigfoot.com>

Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)
diff --git a/flags.h b/flags.h

index bd962fe7e4890e1bcc8944ca305cb6fded6f0cde..caeaf2bc0cf43a2c769d23c8d2121a456a32bef2 100644 (file)
--- a/flags.h
+++ b/flags.h
@@ -57,6 +57,7 @@ struct Flags {
         bool ten_bit_input = false;
         bool ten_bit_output = false;  // Implies x264_video_to_disk == true and x264_bit_depth == 10.
         int x264_bit_depth = 8;  // Not user-settable.
+       bool use_zerocopy = false;  // Not user-settable.
  };
  extern Flags global_flags;
  
diff --git a/mixer.cpp b/mixer.cpp

index 25fa3e40d70723e12e4fa5a54b69628a0f49ed7c..0c00d148144eae9583d65b8e06d32a5fe14f8c71 100644 (file)
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -193,7 +193,6 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
         movit_texel_subpixel_precision /= 2.0;
  
         resource_pool.reset(new ResourcePool);
-       theme.reset(new Theme(global_flags.theme_filename, global_flags.theme_dirs, resource_pool.get(), num_cards));
         for (unsigned i = 0; i < NUM_OUTPUTS; ++i) {
                 output_channel[i].parent = this;
                 output_channel[i].channel = i;
@@ -231,6 +230,9 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
  
         video_encoder.reset(new VideoEncoder(resource_pool.get(), h264_encoder_surface, global_flags.va_display, global_flags.width, global_flags.height, &httpd, global_disk_space_estimator));
  
+       // Must be instantiated after VideoEncoder has initialized global_flags.use_zerocopy.
+       theme.reset(new Theme(global_flags.theme_filename, global_flags.theme_dirs, resource_pool.get(), num_cards));
+
         // Start listening for clients only once VideoEncoder has written its header, if any.
         httpd.start(9095);
  
@@ -1038,29 +1040,47 @@ void Mixer::render_one_frame(int64_t duration)
         output_ycbcr_format.num_levels = 1 << global_flags.x264_bit_depth;
         chain->change_ycbcr_output_format(output_ycbcr_format);
  
-       const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE);  // Corresponds to the delay in ResamplingQueue.
-       GLuint y_tex, cbcr_tex;
-       bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, ycbcr_output_coefficients, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
-       assert(got_frame);
-
-       // Render main chain. We take an extra copy of the created outputs,
+       // Render main chain. If we're using zerocopy Quick Sync encoding
+       // (the default case), we take an extra copy of the created outputs,
         // so that we can display it back to the screen later (it's less memory
         // bandwidth than writing and reading back an RGBA texture, even at 16-bit).
         // Ideally, we'd like to avoid taking copies and just use the main textures
-       // for display as well, but if they're used for zero-copy Quick Sync encoding
-       // (the default case), they're just views into VA-API memory and must be
+       // for display as well, but they're just views into VA-API memory and must be
         // unmapped during encoding, so we can't use them for display, unfortunately.
-       GLuint cbcr_full_tex, cbcr_copy_tex, y_copy_tex;
-       if (global_flags.x264_bit_depth > 8) {
-               cbcr_full_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width, global_flags.height);
-               y_copy_tex = resource_pool->create_2d_texture(GL_R16, global_flags.width, global_flags.height);
-               cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width / 2, global_flags.height / 2);
+       GLuint y_tex, cbcr_full_tex, cbcr_tex;
+       GLuint y_copy_tex, cbcr_copy_tex = 0;
+       GLuint y_display_tex, cbcr_display_tex;
+       GLenum y_type = (global_flags.x264_bit_depth > 8) ? GL_R16 : GL_R8;
+       GLenum cbcr_type = (global_flags.x264_bit_depth > 8) ? GL_RG16 : GL_RG8;
+       const bool is_zerocopy = video_encoder->is_zerocopy();
+       if (is_zerocopy) {
+               cbcr_full_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width, global_flags.height);
+               y_copy_tex = resource_pool->create_2d_texture(y_type, global_flags.width, global_flags.height);
+               cbcr_copy_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width / 2, global_flags.height / 2);
+
+               y_display_tex = y_copy_tex;
+               cbcr_display_tex = cbcr_copy_tex;
+
+               // y_tex and cbcr_tex will be given by VideoEncoder.
         } else {
-               cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height);
-               y_copy_tex = resource_pool->create_2d_texture(GL_R8, global_flags.width, global_flags.height);
-               cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2);
+               cbcr_full_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width, global_flags.height);
+               y_tex = resource_pool->create_2d_texture(y_type, global_flags.width, global_flags.height);
+               cbcr_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width / 2, global_flags.height / 2);
+
+               y_display_tex = y_tex;
+               cbcr_display_tex = cbcr_tex;
+       }
+
+       const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE);  // Corresponds to the delay in ResamplingQueue.
+       bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, ycbcr_output_coefficients, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
+       assert(got_frame);
+
+       GLuint fbo;
+       if (is_zerocopy) {
+               fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex);
+       } else {
+               fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex);
         }
-       GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex);
         check_error();
         chain->render_to_fbo(fbo, global_flags.width, global_flags.height);
  
@@ -1071,20 +1091,24 @@ void Mixer::render_one_frame(int64_t duration)
  
         resource_pool->release_fbo(fbo);
  
-       chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
+       if (is_zerocopy) {
+               chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
+       } else {
+               chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex);
+       }
         if (output_card_index != -1) {
                 cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, ycbcr_output_coefficients, theme_main_chain.input_frames, pts_int, duration);
         }
         resource_pool->release_2d_texture(cbcr_full_tex);
  
-       // Set the right state for the Y' and CbCr copies.
+       // Set the right state for the Y' and CbCr textures we use for display.
         glBindFramebuffer(GL_FRAMEBUFFER, 0);
-       glBindTexture(GL_TEXTURE_2D, y_copy_tex);
+       glBindTexture(GL_TEXTURE_2D, y_display_tex);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
  
-       glBindTexture(GL_TEXTURE_2D, cbcr_copy_tex);
+       glBindTexture(GL_TEXTURE_2D, cbcr_display_tex);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
         glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
@@ -1092,16 +1116,16 @@ void Mixer::render_one_frame(int64_t duration)
         RefCountedGLsync fence = video_encoder->end_frame();
  
         // The live frame pieces the Y'CbCr texture copies back into RGB and displays them.
-       // It owns y_copy_tex and cbcr_copy_tex now.
+       // It owns y_display_tex and cbcr_display_tex now (whichever textures they are).
         DisplayFrame live_frame;
         live_frame.chain = display_chain.get();
-       live_frame.setup_chain = [this, y_copy_tex, cbcr_copy_tex]{
-               display_input->set_texture_num(0, y_copy_tex);
-               display_input->set_texture_num(1, cbcr_copy_tex);
+       live_frame.setup_chain = [this, y_display_tex, cbcr_display_tex]{
+               display_input->set_texture_num(0, y_display_tex);
+               display_input->set_texture_num(1, cbcr_display_tex);
         };
         live_frame.ready_fence = fence;
         live_frame.input_frames = {};
-       live_frame.temp_textures = { y_copy_tex, cbcr_copy_tex };
+       live_frame.temp_textures = { y_display_tex, cbcr_display_tex };
         output_channel[OUTPUT_LIVE].output_frame(live_frame);
  
         // Set up preview and any additional channels.
diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp

index 635a95a5fb297e8a7fb9de7bca63ce9a73ab6d8d..3a689d35b47c2107cfe7220a87040e7e4b775722 100644 (file)
--- a/quicksync_encoder.cpp
+++ b/quicksync_encoder.cpp
@@ -736,6 +736,7 @@ void QuickSyncEncoderImpl::enable_zerocopy_if_possible()
         } else {
                 use_zerocopy = true;
         }
+       global_flags.use_zerocopy = use_zerocopy;
  }
  
  VADisplay QuickSyncEncoderImpl::va_open_display(const string &va_display)
@@ -994,16 +995,7 @@ int QuickSyncEncoderImpl::setup_encode()
                         gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, 1, 1);
                         gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, 1, 1);
                 } else {
-                       size_t bytes_per_pixel;
-                       if (global_flags.x264_bit_depth > 8) {
-                               bytes_per_pixel = 2;
-                               gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R16, frame_width, frame_height);
-                               gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG16, frame_width / 2, frame_height / 2);
-                       } else {
-                               bytes_per_pixel = 1;
-                               gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, frame_width, frame_height);
-                               gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, frame_width / 2, frame_height / 2);
-                       }
+                       size_t bytes_per_pixel = (global_flags.x264_bit_depth > 8) ? 2 : 1;
  
                         // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API
                         // buffers, due to potentially differing pitch.
@@ -1516,14 +1508,15 @@ void QuickSyncEncoderImpl::release_gl_resources()
         }
  
         for (unsigned i = 0; i < SURFACE_NUM; i++) {
-               if (!use_zerocopy) {
+               if (use_zerocopy) {
+                       resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
+                       resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
+               } else {
                         glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
                         glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
                         glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
                         glDeleteBuffers(1, &gl_surfaces[i].pbo);
                 }
-               resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
-               resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
         }
  
         has_released_gl_resources = true;
@@ -1611,6 +1604,11 @@ void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num)
         }
  }
  
+bool QuickSyncEncoderImpl::is_zerocopy() const
+{
+       return use_zerocopy;
+}
+
  bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
  {
         assert(!is_shutdown);
@@ -1634,8 +1632,13 @@ bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaC
                 surface_for_frame[current_storage_frame] = surf;
         }
  
-       *y_tex = surf->y_tex;
-       *cbcr_tex = surf->cbcr_tex;
+       if (use_zerocopy) {
+               *y_tex = surf->y_tex;
+               *cbcr_tex = surf->cbcr_tex;
+       } else {
+               surf->y_tex = *y_tex;
+               surf->cbcr_tex = *cbcr_tex;
+       }
  
         if (!global_flags.x264_video_to_disk) {
                 VAStatus va_status = vaDeriveImage(va_dpy, surf->src_surface, &surf->surface_image);
@@ -1727,6 +1730,9 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame()
                 glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, type, BUFFER_OFFSET(surf->cbcr_offset));
                 check_error();
  
+               // We don't own these; the caller does.
+               surf->y_tex = surf->cbcr_tex = 0;
+
                 glBindTexture(GL_TEXTURE_2D, 0);
                 check_error();
                 glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -2080,6 +2086,11 @@ void QuickSyncEncoder::add_audio(int64_t pts, vector<float> audio)
         impl->add_audio(pts, audio);
  }
  
+bool QuickSyncEncoder::is_zerocopy() const
+{
+       return impl->is_zerocopy();
+}
+
  bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
  {
         return impl->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
diff --git a/quicksync_encoder.h b/quicksync_encoder.h

index eebabbd95c8d52aa25df078fbddd0f83f448ff6a..f4e9e0b57e37c890b44536a24e86babd9d2dd558 100644 (file)
--- a/quicksync_encoder.h
+++ b/quicksync_encoder.h
@@ -66,6 +66,9 @@ public:
  
         void set_stream_mux(Mux *mux);  // Does not take ownership. Must be called unless x264 is used for the stream.
         void add_audio(int64_t pts, std::vector<float> audio);
+       bool is_zerocopy() const;
+
+       // See VideoEncoder::begin_frame().
         bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
         RefCountedGLsync end_frame();
         void shutdown();  // Blocking. Does not require an OpenGL context.
diff --git a/quicksync_encoder_impl.h b/quicksync_encoder_impl.h

index 18461328a6def27b1264e980bcace8c546cf2592..917420ca41af16a81ffdd3668bd6ea38f3fb78cc 100644 (file)
--- a/quicksync_encoder_impl.h
+++ b/quicksync_encoder_impl.h
@@ -36,6 +36,7 @@ public:
         QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator);
         ~QuickSyncEncoderImpl();
         void add_audio(int64_t pts, std::vector<float> audio);
+       bool is_zerocopy() const;
         bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
         RefCountedGLsync end_frame();
         void shutdown();
@@ -68,14 +69,13 @@ private:
                 movit::YCbCrLumaCoefficients ycbcr_coefficients;
         };
         struct GLSurface {
-               GLuint y_tex, cbcr_tex;
-
                 // Only if x264_video_to_disk == false.
                 VASurfaceID src_surface, ref_surface;
                 VABufferID coded_buf;
                 VAImage surface_image;
  
                 // Only if use_zerocopy == true (which implies x264_video_to_disk == false).
+               GLuint y_tex, cbcr_tex;
                 EGLImage y_egl_image, cbcr_egl_image;
  
                 // Only if use_zerocopy == false.
diff --git a/theme.cpp b/theme.cpp

index e5002bca3b0a0499ed73ea654b5f57cf18505dba..0cc498e8ba0a4297be74bb320440f6c75cba2f7f 100644 (file)
--- a/theme.cpp
+++ b/theme.cpp
@@ -279,7 +279,13 @@ int EffectChain_finalize(lua_State* L)
                 GLenum type = global_flags.x264_bit_depth > 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE;
  
                 chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR, type);
-               chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, type);  // Add a copy where we'll only be using the Y component.
+
+               // If we're using zerocopy video encoding (so the destination
+               // Y texture is owned by VA-API and will be unavailable for
+               // display), add a copy, where we'll only be using the Y component.
+               if (global_flags.use_zerocopy) {
+                       chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, type);  // Add a copy where we'll only be using the Y component.
+               }
                 chain->set_dither_bits(global_flags.x264_bit_depth > 8 ? 16 : 8);
                 chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
         } else {
diff --git a/video_encoder.cpp b/video_encoder.cpp

index b7e36bd7620de732364b839125175c093e4da447..6a4ebf7219b04ec7b7bd5c85cdcb0bff4ca60680 100644 (file)
--- a/video_encoder.cpp
+++ b/video_encoder.cpp
@@ -140,6 +140,12 @@ void VideoEncoder::add_audio(int64_t pts, std::vector<float> audio)
         stream_audio_encoder->encode_audio(audio, pts + quicksync_encoder->global_delay());
  }
  
+bool VideoEncoder::is_zerocopy() const
+{
+       lock_guard<mutex> lock(qs_mu);
+       return quicksync_encoder->is_zerocopy();
+}
+
  bool VideoEncoder::begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
  {
         lock_guard<mutex> lock(qs_mu);
diff --git a/video_encoder.h b/video_encoder.h

index e1518aec1fb3067c7bbfa56b24aa51b61448a1da..d51399e17a82c71bcbf31fbe7ba28115fc768b85 100644 (file)
--- a/video_encoder.h
+++ b/video_encoder.h
@@ -42,9 +42,22 @@ public:
  
         void add_audio(int64_t pts, std::vector<float> audio);
  
+       bool is_zerocopy() const;
+
         // Allocate a frame to render into. The returned two textures
         // are yours to render into (build them into an FBO).
         // Call end_frame() when you're done.
+       //
+       // The semantics of y_tex and cbcr_tex depend on is_zerocopy():
+       //
+       //   - If false, they are input parameters, i.e., the caller
+       //     allocates textures. (The contents are not read before
+       //     end_frame() is called.)
+       //   - If true, they are output parameters, i.e., VideoEncoder
+       //     allocates textures and lends them to you for rendering.
+       //     In this case, after end_frame(), you are no longer allowed
+       //     to use the textures; they are torn down and given to the
+       //     H.264 encoder.
         bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
  
         // Call after you are done rendering into the frame; at this point,
@@ -65,7 +78,7 @@ private:
         int write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
  
         AVOutputFormat *oformat;
-       std::mutex qs_mu;
+       mutable std::mutex qs_mu;
         std::unique_ptr<QuickSyncEncoder> quicksync_encoder;  // Under <qs_mu>.
         movit::ResourcePool *resource_pool;
         QSurface *surface;
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)
flags.h		patch \| blob \| history
mixer.cpp		patch \| blob \| history
quicksync_encoder.cpp		patch \| blob \| history
quicksync_encoder.h		patch \| blob \| history
quicksync_encoder_impl.h		patch \| blob \| history
theme.cpp		patch \| blob \| history
video_encoder.cpp		patch \| blob \| history
video_encoder.h		patch \| blob \| history