]> git.sesse.net Git - nageru/commitdiff
If not using VA-API zerocopy, don't write extra copy textures.
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 16 Mar 2017 18:43:11 +0000 (19:43 +0100)
Saves some precious memory bandwidth.

flags.h
mixer.cpp
quicksync_encoder.cpp
quicksync_encoder.h
quicksync_encoder_impl.h
theme.cpp
video_encoder.cpp
video_encoder.h

diff --git a/flags.h b/flags.h
index bd962fe7e4890e1bcc8944ca305cb6fded6f0cde..caeaf2bc0cf43a2c769d23c8d2121a456a32bef2 100644 (file)
--- a/flags.h
+++ b/flags.h
@@ -57,6 +57,7 @@ struct Flags {
        bool ten_bit_input = false;
        bool ten_bit_output = false;  // Implies x264_video_to_disk == true and x264_bit_depth == 10.
        int x264_bit_depth = 8;  // Not user-settable.
+       bool use_zerocopy = false;  // Not user-settable.
 };
 extern Flags global_flags;
 
diff --git a/mixer.cpp b/mixer.cpp
index 25fa3e40d70723e12e4fa5a54b69628a0f49ed7c..0c00d148144eae9583d65b8e06d32a5fe14f8c71 100644 (file)
--- a/mixer.cpp
+++ b/mixer.cpp
@@ -193,7 +193,6 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
        movit_texel_subpixel_precision /= 2.0;
 
        resource_pool.reset(new ResourcePool);
-       theme.reset(new Theme(global_flags.theme_filename, global_flags.theme_dirs, resource_pool.get(), num_cards));
        for (unsigned i = 0; i < NUM_OUTPUTS; ++i) {
                output_channel[i].parent = this;
                output_channel[i].channel = i;
@@ -231,6 +230,9 @@ Mixer::Mixer(const QSurfaceFormat &format, unsigned num_cards)
 
        video_encoder.reset(new VideoEncoder(resource_pool.get(), h264_encoder_surface, global_flags.va_display, global_flags.width, global_flags.height, &httpd, global_disk_space_estimator));
 
+       // Must be instantiated after VideoEncoder has initialized global_flags.use_zerocopy.
+       theme.reset(new Theme(global_flags.theme_filename, global_flags.theme_dirs, resource_pool.get(), num_cards));
+
        // Start listening for clients only once VideoEncoder has written its header, if any.
        httpd.start(9095);
 
@@ -1038,29 +1040,47 @@ void Mixer::render_one_frame(int64_t duration)
        output_ycbcr_format.num_levels = 1 << global_flags.x264_bit_depth;
        chain->change_ycbcr_output_format(output_ycbcr_format);
 
-       const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE);  // Corresponds to the delay in ResamplingQueue.
-       GLuint y_tex, cbcr_tex;
-       bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, ycbcr_output_coefficients, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
-       assert(got_frame);
-
-       // Render main chain. We take an extra copy of the created outputs,
+       // Render main chain. If we're using zerocopy Quick Sync encoding
+       // (the default case), we take an extra copy of the created outputs,
        // so that we can display it back to the screen later (it's less memory
        // bandwidth than writing and reading back an RGBA texture, even at 16-bit).
        // Ideally, we'd like to avoid taking copies and just use the main textures
-       // for display as well, but if they're used for zero-copy Quick Sync encoding
-       // (the default case), they're just views into VA-API memory and must be
+       // for display as well, but they're just views into VA-API memory and must be
        // unmapped during encoding, so we can't use them for display, unfortunately.
-       GLuint cbcr_full_tex, cbcr_copy_tex, y_copy_tex;
-       if (global_flags.x264_bit_depth > 8) {
-               cbcr_full_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width, global_flags.height);
-               y_copy_tex = resource_pool->create_2d_texture(GL_R16, global_flags.width, global_flags.height);
-               cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG16, global_flags.width / 2, global_flags.height / 2);
+       GLuint y_tex, cbcr_full_tex, cbcr_tex;
+       GLuint y_copy_tex, cbcr_copy_tex = 0;
+       GLuint y_display_tex, cbcr_display_tex;
+       GLenum y_type = (global_flags.x264_bit_depth > 8) ? GL_R16 : GL_R8;
+       GLenum cbcr_type = (global_flags.x264_bit_depth > 8) ? GL_RG16 : GL_RG8;
+       const bool is_zerocopy = video_encoder->is_zerocopy();
+       if (is_zerocopy) {
+               cbcr_full_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width, global_flags.height);
+               y_copy_tex = resource_pool->create_2d_texture(y_type, global_flags.width, global_flags.height);
+               cbcr_copy_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width / 2, global_flags.height / 2);
+
+               y_display_tex = y_copy_tex;
+               cbcr_display_tex = cbcr_copy_tex;
+
+               // y_tex and cbcr_tex will be given by VideoEncoder.
        } else {
-               cbcr_full_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width, global_flags.height);
-               y_copy_tex = resource_pool->create_2d_texture(GL_R8, global_flags.width, global_flags.height);
-               cbcr_copy_tex = resource_pool->create_2d_texture(GL_RG8, global_flags.width / 2, global_flags.height / 2);
+               cbcr_full_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width, global_flags.height);
+               y_tex = resource_pool->create_2d_texture(y_type, global_flags.width, global_flags.height);
+               cbcr_tex = resource_pool->create_2d_texture(cbcr_type, global_flags.width / 2, global_flags.height / 2);
+
+               y_display_tex = y_tex;
+               cbcr_display_tex = cbcr_tex;
+       }
+
+       const int64_t av_delay = lrint(global_flags.audio_queue_length_ms * 0.001 * TIMEBASE);  // Corresponds to the delay in ResamplingQueue.
+       bool got_frame = video_encoder->begin_frame(pts_int + av_delay, duration, ycbcr_output_coefficients, theme_main_chain.input_frames, &y_tex, &cbcr_tex);
+       assert(got_frame);
+
+       GLuint fbo;
+       if (is_zerocopy) {
+               fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex);
+       } else {
+               fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex);
        }
-       GLuint fbo = resource_pool->create_fbo(y_tex, cbcr_full_tex, y_copy_tex);
        check_error();
        chain->render_to_fbo(fbo, global_flags.width, global_flags.height);
 
@@ -1071,20 +1091,24 @@ void Mixer::render_one_frame(int64_t duration)
 
        resource_pool->release_fbo(fbo);
 
-       chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
+       if (is_zerocopy) {
+               chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex, cbcr_copy_tex);
+       } else {
+               chroma_subsampler->subsample_chroma(cbcr_full_tex, global_flags.width, global_flags.height, cbcr_tex);
+       }
        if (output_card_index != -1) {
                cards[output_card_index].output->send_frame(y_tex, cbcr_full_tex, ycbcr_output_coefficients, theme_main_chain.input_frames, pts_int, duration);
        }
        resource_pool->release_2d_texture(cbcr_full_tex);
 
-       // Set the right state for the Y' and CbCr copies.
+       // Set the right state for the Y' and CbCr textures we use for display.
        glBindFramebuffer(GL_FRAMEBUFFER, 0);
-       glBindTexture(GL_TEXTURE_2D, y_copy_tex);
+       glBindTexture(GL_TEXTURE_2D, y_display_tex);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
 
-       glBindTexture(GL_TEXTURE_2D, cbcr_copy_tex);
+       glBindTexture(GL_TEXTURE_2D, cbcr_display_tex);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE);
        glTexParameteri(GL_TEXTURE_2D, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE);
@@ -1092,16 +1116,16 @@ void Mixer::render_one_frame(int64_t duration)
        RefCountedGLsync fence = video_encoder->end_frame();
 
        // The live frame pieces the Y'CbCr texture copies back into RGB and displays them.
-       // It owns y_copy_tex and cbcr_copy_tex now.
+       // It owns y_display_tex and cbcr_display_tex now (whichever textures they are).
        DisplayFrame live_frame;
        live_frame.chain = display_chain.get();
-       live_frame.setup_chain = [this, y_copy_tex, cbcr_copy_tex]{
-               display_input->set_texture_num(0, y_copy_tex);
-               display_input->set_texture_num(1, cbcr_copy_tex);
+       live_frame.setup_chain = [this, y_display_tex, cbcr_display_tex]{
+               display_input->set_texture_num(0, y_display_tex);
+               display_input->set_texture_num(1, cbcr_display_tex);
        };
        live_frame.ready_fence = fence;
        live_frame.input_frames = {};
-       live_frame.temp_textures = { y_copy_tex, cbcr_copy_tex };
+       live_frame.temp_textures = { y_display_tex, cbcr_display_tex };
        output_channel[OUTPUT_LIVE].output_frame(live_frame);
 
        // Set up preview and any additional channels.
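diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp
index 635a95a5fb297e8a7fb9de7bca63ce9a73ab6d8d..3a689d35b47c2107cfe7220a87040e7e4b775722 100644 (file)
--- a/quicksync_encoder.cpp
+++ b/quicksync_encoder.cpp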
@@ -736,6 +736,7 @@ void QuickSyncEncoderImpl::enable_zerocopy_if_possible()
        } else {
                use_zerocopy = true;
        }
+       global_flags.use_zerocopy = use_zerocopy;
 }
 
 VADisplay QuickSyncEncoderImpl::va_open_display(const string &va_display)
@@ -994,16 +995,7 @@ int QuickSyncEncoderImpl::setup_encode()
                        gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, 1, 1);
                        gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, 1, 1);
                } else {
-                       size_t bytes_per_pixel;
-                       if (global_flags.x264_bit_depth > 8) {
-                               bytes_per_pixel = 2;
-                               gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R16, frame_width, frame_height);
-                               gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG16, frame_width / 2, frame_height / 2);
-                       } else {
-                               bytes_per_pixel = 1;
-                               gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, frame_width, frame_height);
-                               gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, frame_width / 2, frame_height / 2);
-                       }
+                       size_t bytes_per_pixel = (global_flags.x264_bit_depth > 8) ? 2 : 1;
 
                        // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API
                        // buffers, due to potentially differing pitch.
@@ -1516,14 +1508,15 @@ void QuickSyncEncoderImpl::release_gl_resources()
        }
 
        for (unsigned i = 0; i < SURFACE_NUM; i++) {
-               if (!use_zerocopy) {
+               if (use_zerocopy) {
+                       resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
+                       resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
+               } else {
                        glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
                        glUnmapBuffer(GL_PIXEL_PACK_BUFFER);
                        glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
                        glDeleteBuffers(1, &gl_surfaces[i].pbo);
                }
-               resource_pool->release_2d_texture(gl_surfaces[i].y_tex);
-               resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex);
        }
 
        has_released_gl_resources = true;
@@ -1611,6 +1604,11 @@ void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num)
        }
 }
 
+bool QuickSyncEncoderImpl::is_zerocopy() const
+{
+       return use_zerocopy;
+}
+
 bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
        assert(!is_shutdown);
@@ -1634,8 +1632,13 @@ bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaC
                surface_for_frame[current_storage_frame] = surf;
        }
 
-       *y_tex = surf->y_tex;
-       *cbcr_tex = surf->cbcr_tex;
+       if (use_zerocopy) {
+               *y_tex = surf->y_tex;
+               *cbcr_tex = surf->cbcr_tex;
+       } else {
+               surf->y_tex = *y_tex;
+               surf->cbcr_tex = *cbcr_tex;
+       }
 
        if (!global_flags.x264_video_to_disk) {
                VAStatus va_status = vaDeriveImage(va_dpy, surf->src_surface, &surf->surface_image);
@@ -1727,6 +1730,9 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame()
                glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, type, BUFFER_OFFSET(surf->cbcr_offset));
                check_error();
 
+               // We don't own these; the caller does.
+               surf->y_tex = surf->cbcr_tex = 0;
+
                glBindTexture(GL_TEXTURE_2D, 0);
                check_error();
                glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
@@ -2080,6 +2086,11 @@ void QuickSyncEncoder::add_audio(int64_t pts, vector<float> audio)
        impl->add_audio(pts, audio);
 }
 
+bool QuickSyncEncoder::is_zerocopy() const
+{
+       return impl->is_zerocopy();
+}
+
 bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
        return impl->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
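diff --git a/quicksync_encoder.h b/quicksync_encoder.h
index eebabbd95c8d52aa25df078fbddd0f83f448ff6a..f4e9e0b57e37c890b44536a24e86babd9d2dd558 100644 (file)
--- a/quicksync_encoder.h
+++ b/quicksync_encoder.h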
@@ -66,6 +66,9 @@ public:
 
        void set_stream_mux(Mux *mux);  // Does not take ownership. Must be called unless x264 is used for the stream.
        void add_audio(int64_t pts, std::vector<float> audio);
+       bool is_zerocopy() const;
+
+       // See VideoEncoder::begin_frame().
        bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
        RefCountedGLsync end_frame();
        void shutdown();  // Blocking. Does not require an OpenGL context.
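diff --git a/quicksync_encoder_impl.h b/quicksync_encoder_impl.h
index 18461328a6def27b1264e980bcace8c546cf2592..917420ca41af16a81ffdd3668bd6ea38f3fb78cc 100644 (file)
--- a/quicksync_encoder_impl.h
+++ b/quicksync_encoder_impl.h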
@@ -36,6 +36,7 @@ public:
        QuickSyncEncoderImpl(const std::string &filename, movit::ResourcePool *resource_pool, QSurface *surface, const std::string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator);
        ~QuickSyncEncoderImpl();
        void add_audio(int64_t pts, std::vector<float> audio);
+       bool is_zerocopy() const;
        bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
        RefCountedGLsync end_frame();
        void shutdown();
@@ -68,14 +69,13 @@ private:
                movit::YCbCrLumaCoefficients ycbcr_coefficients;
        };
        struct GLSurface {
-               GLuint y_tex, cbcr_tex;
-
                // Only if x264_video_to_disk == false.
                VASurfaceID src_surface, ref_surface;
                VABufferID coded_buf;
                VAImage surface_image;
 
                // Only if use_zerocopy == true (which implies x264_video_to_disk == false).
+               GLuint y_tex, cbcr_tex;
                EGLImage y_egl_image, cbcr_egl_image;
 
                // Only if use_zerocopy == false.
diff --git a/theme.cpp b/theme.cpp
index e5002bca3b0a0499ed73ea654b5f57cf18505dba..0cc498e8ba0a4297be74bb320440f6c75cba2f7f 100644 (file)
--- a/theme.cpp
+++ b/theme.cpp
@@ -279,7 +279,13 @@ int EffectChain_finalize(lua_State* L)
                GLenum type = global_flags.x264_bit_depth > 8 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE;
 
                chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_SPLIT_Y_AND_CBCR, type);
-               chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, type);  // Add a copy where we'll only be using the Y component.
+
+               // If we're using zerocopy video encoding (so the destination
+               // Y texture is owned by VA-API and will be unavailable for
+               // display), add a copy, where we'll only be using the Y component.
+               if (global_flags.use_zerocopy) {
+                       chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, output_ycbcr_format, YCBCR_OUTPUT_INTERLEAVED, type);  // Add a copy where we'll only be using the Y component.
+               }
                chain->set_dither_bits(global_flags.x264_bit_depth > 8 ? 16 : 8);
                chain->set_output_origin(OUTPUT_ORIGIN_TOP_LEFT);
        } else {
diff --git a/video_encoder.cpp b/video_encoder.cpp
index b7e36bd7620de732364b839125175c093e4da447..6a4ebf7219b04ec7b7bd5c85cdcb0bff4ca60680 100644 (file)
--- a/video_encoder.cpp
+++ b/video_encoder.cpp
@@ -140,6 +140,12 @@ void VideoEncoder::add_audio(int64_t pts, std::vector<float> audio)
        stream_audio_encoder->encode_audio(audio, pts + quicksync_encoder->global_delay());
 }
 
+bool VideoEncoder::is_zerocopy() const
+{
+       lock_guard<mutex> lock(qs_mu);
+       return quicksync_encoder->is_zerocopy();
+}
+
 bool VideoEncoder::begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
        lock_guard<mutex> lock(qs_mu);
diff --git a/video_encoder.h b/video_encoder.h
index e1518aec1fb3067c7bbfa56b24aa51b61448a1da..d51399e17a82c71bcbf31fbe7ba28115fc768b85 100644 (file)
--- a/video_encoder.h
+++ b/video_encoder.h
@@ -42,9 +42,22 @@ public:
 
        void add_audio(int64_t pts, std::vector<float> audio);
 
+       bool is_zerocopy() const;
+
        // Allocate a frame to render into. The returned two textures
        // are yours to render into (build them into an FBO).
        // Call end_frame() when you're done.
+       //
+       // The semantics of y_tex and cbcr_tex depend on is_zerocopy():
+       //
+       //   - If false, they are input parameters, i.e., the caller
+       //     allocates textures. (The contents are not read before
+       //     end_frame() is called.)
+       //   - If true, they are output parameters, i.e., VideoEncoder
+       //     allocates textures and lends them to you for rendering.
+       //     In this case, after end_frame(), you are no longer allowed
+       //     to use the textures; they are torn down and given to the
+       //     H.264 encoder.
        bool begin_frame(int64_t pts, int64_t duration, movit::YCbCrLumaCoefficients ycbcr_coefficients, const std::vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex);
 
        // Call after you are done rendering into the frame; at this point,
@@ -65,7 +78,7 @@ private:
        int write_packet2(uint8_t *buf, int buf_size, AVIODataMarkerType type, int64_t time);
 
        AVOutputFormat *oformat;
-       std::mutex qs_mu;
+       mutable std::mutex qs_mu;
        std::unique_ptr<QuickSyncEncoder> quicksync_encoder;  // Under <qs_mu>.
        movit::ResourcePool *resource_pool;
        QSurface *surface;