Support rendering compute shaders straight to textures (skipping the dummy phase).
author Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 21 Nov 2017 23:04:03 +0000 (00:04 +0100)
committer Steinar H. Gunderson <sgunderson@bigfoot.com>
Tue, 21 Nov 2017 23:05:19 +0000 (00:05 +0100)
There are lots of limitations currently (only one destination,
only GL_RGBA16F), but it's a good start. Curiously enough,
it doesn't really help anything on the deinterlacing benchmark
for my Haswell, but NVIDIA sees ~15% improvement.

effect_chain.cpp
effect_chain.h
test_util.cpp

index a0eef11..4afe415 100644 (file)
@@ -1725,6 +1725,7 @@ void EffectChain::add_dummy_effect_if_needed()
        if (output->effect->is_compute_shader()) {
                Node *dummy = add_node(new IdentityEffect());
                connect_nodes(output, dummy);
+               has_dummy_effect = true;
        }
 }
 
@@ -1820,18 +1821,6 @@ void EffectChain::finalize()
 
 void EffectChain::render_to_fbo(GLuint dest_fbo, unsigned width, unsigned height)
 {
-       assert(finalized);
-
-       // This needs to be set anew, in case we are coming from a different context
-       // from when we initialized.
-       check_error();
-       glDisable(GL_DITHER);
-       check_error();
-
-       const bool final_srgb = glIsEnabled(GL_FRAMEBUFFER_SRGB);
-       check_error();
-       bool current_srgb = final_srgb;
-
        // Save original viewport.
        GLuint x = 0, y = 0;
 
@@ -1844,6 +1833,44 @@ void EffectChain::render_to_fbo(GLuint dest_fbo, unsigned width, unsigned height
                height = viewport[3];
        }
 
+       render(dest_fbo, {}, x, y, width, height);
+}
+
+void EffectChain::render_to_texture(const vector<DestinationTexture> &destinations, unsigned width, unsigned height)
+{
+       assert(finalized);
+       assert(!destinations.empty());
+
+       if (!has_dummy_effect) {
+               // We don't end in a compute shader, so there's nothing specific for us to do.
+               // Create an FBO for this set of textures, and just render to that.
+               GLuint texnums[4] = { 0, 0, 0, 0 };
+               for (unsigned i = 0; i < destinations.size() && i < 4; ++i) {
+                       texnums[i] = destinations[i].texnum;
+               }
+               GLuint dest_fbo = resource_pool->create_fbo(texnums[0], texnums[1], texnums[2], texnums[3]);
+               render(dest_fbo, {}, 0, 0, width, height);
+               resource_pool->release_fbo(dest_fbo);
+       } else {
+               render((GLuint)-1, destinations, 0, 0, width, height);
+       }
+}
+
+void EffectChain::render(GLuint dest_fbo, const vector<DestinationTexture> &destinations, unsigned x, unsigned y, unsigned width, unsigned height)
+{
+       assert(finalized);
+       assert(destinations.size() <= 1);
+
+       // This needs to be set anew, in case we are coming from a different context
+       // from when we initialized.
+       check_error();
+       glDisable(GL_DITHER);
+       check_error();
+
+       const bool final_srgb = glIsEnabled(GL_FRAMEBUFFER_SRGB);
+       check_error();
+       bool current_srgb = final_srgb;
+
        // Basic state.
        check_error();
        glDisable(GL_BLEND);
@@ -1859,7 +1886,29 @@ void EffectChain::render_to_fbo(GLuint dest_fbo, unsigned width, unsigned height
        // since otherwise this turns into an (albeit simple) register allocation problem.
        map<Phase *, GLuint> output_textures;
 
-       for (unsigned phase_num = 0; phase_num < phases.size(); ++phase_num) {
+       size_t num_phases = phases.size();
+       if (destinations.empty()) {
+               assert(dest_fbo != (GLuint)-1);
+       } else {
+               assert(has_dummy_effect);
+               assert(x == 0);
+               assert(y == 0);
+               assert(num_phases >= 2);
+               assert(!phases.back()->is_compute_shader);
+               assert(phases.back()->effects.size() == 1);
+               assert(phases.back()->effects[0]->effect->effect_type_id() == "IdentityEffect");
+
+               // We are rendering to a set of textures, so we can run the compute shader
+               // directly and skip the dummy phase.
+               --num_phases;
+
+               // TODO: Support more than one destination.
+               output_textures[phases[num_phases - 1]] = destinations[0].texnum;
+               assert(destinations[0].format == GL_RGBA16F);
+               assert(destinations[0].texnum != 0);
+       }
+
+       for (unsigned phase_num = 0; phase_num < num_phases; ++phase_num) {
                Phase *phase = phases[phase_num];
 
                if (do_phase_timing) {
@@ -1874,14 +1923,16 @@ void EffectChain::render_to_fbo(GLuint dest_fbo, unsigned width, unsigned height
                        phase->timer_query_objects_running.push_back(timer_query_object);
                }
                bool render_to_texture = true;
-               if (phase_num == phases.size() - 1) {
+               if (phase_num == num_phases - 1) {
                        // Last phase goes to the output the user specified.
-                       glBindFramebuffer(GL_FRAMEBUFFER, dest_fbo);
-                       check_error();
-                       GLenum status = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
-                       assert(status == GL_FRAMEBUFFER_COMPLETE);
-                       glViewport(x, y, width, height);
-                       render_to_texture = false;
+                       if (!phase->is_compute_shader) {
+                               glBindFramebuffer(GL_FRAMEBUFFER, dest_fbo);
+                               check_error();
+                               GLenum status = glCheckFramebufferStatusEXT(GL_FRAMEBUFFER_EXT);
+                               assert(status == GL_FRAMEBUFFER_COMPLETE);
+                               glViewport(x, y, width, height);
+                               render_to_texture = false;
+                       }
                        if (dither_effect != nullptr) {
                                CHECK(dither_effect->set_int("output_width", width));
                                CHECK(dither_effect->set_int("output_height", height));
@@ -1890,6 +1941,7 @@ void EffectChain::render_to_fbo(GLuint dest_fbo, unsigned width, unsigned height
 
                // Enable sRGB rendering for intermediates in case we are
                // rendering to an sRGB format.
+               // TODO: Support this for compute shaders.
                bool needs_srgb = render_to_texture ? true : final_srgb;
                if (needs_srgb && !current_srgb) {
                        glEnable(GL_FRAMEBUFFER_SRGB);
@@ -1907,6 +1959,10 @@ void EffectChain::render_to_fbo(GLuint dest_fbo, unsigned width, unsigned height
                }
        }
 
+       // Take out the destination textures from the list of temporary textures to be freed.
+       if (has_dummy_effect && !destinations.empty()) {
+               output_textures.erase(phases[num_phases - 1]);
+       }
        for (const auto &phase_and_texnum : output_textures) {
                resource_pool->release_2d_texture(phase_and_texnum.second);
        }
index 038fb1c..35931f8 100644 (file)
@@ -389,6 +389,22 @@ public:
        // the current viewport.
        void render_to_fbo(GLuint fbo, unsigned width, unsigned height);
 
+       // Render the effect chain to the given set of textures. This is equivalent
+       // to render_to_fbo() with a freshly created FBO bound to the given textures,
+       // except that it is more efficient if the last phase contains a compute shader.
+       // Thus, prefer this to render_to_fbo() where possible.
+       //
+       // The format must currently be GL_RGBA16F, and only one destination
+       // texture is supported. Both of these restrictions will be lifted in the future.
+       //
+       // All destination textures must be exactly of size <width> x <height>.
+       // width and height cannot be zero.
+       struct DestinationTexture {
+               GLuint texnum;
+               GLenum format;
+       };
+       void render_to_texture(const std::vector<DestinationTexture> &destinations, unsigned width, unsigned height);
+
        Effect *last_added_effect() {
                if (nodes.empty()) {
                        return nullptr;
@@ -457,6 +473,13 @@ private:
        // as the last effect. Also pushes all phases in order onto <phases>.
        Phase *construct_phase(Node *output, std::map<Node *, Phase *> *completed_effects);
 
+       // Do the actual rendering of the chain. If <dest_fbo> is not (GLuint)-1,
+       // renders to that FBO. If <destinations> is non-empty, render to that set
+       // of textures (last phase, save for the dummy phase, must be a compute shader),
+       // with x/y ignored. Having both set is an error.
+       void render(GLuint dest_fbo, const std::vector<DestinationTexture> &destinations,
+                   unsigned x, unsigned y, unsigned width, unsigned height);
+
        // Execute one phase, ie. set up all inputs, effects and outputs, and render the quad.
        void execute_phase(Phase *phase, bool render_to_texture,
                           std::map<Phase *, GLuint> *output_textures,
@@ -539,6 +562,12 @@ private:
        bool finalized;
        GLuint vbo;  // Contains vertex and texture coordinate data.
 
+       // Whether the last effect (which will then be in a phase all by itself)
+       // is a dummy effect that is only added because the last phase uses a compute
+       // shader, which cannot output directly to the backbuffer. This means that
+       // the phase can be skipped if we are _not_ rendering to the backbuffer.
+       bool has_dummy_effect = false;
+
        ResourcePool *resource_pool;
        bool owns_resource_pool;
 
index 096c07a..78122ce 100644 (file)
@@ -248,7 +248,7 @@ void EffectChainTester::internal_run(T *out_data, T *out_data2, T *out_data3, T
                num_outputs = 1;
        }
 
-       GLuint fbo, texnum[4];
+       GLuint texnum[4];
 
        glGenTextures(num_outputs, texnum);
        check_error();
@@ -259,24 +259,12 @@ void EffectChainTester::internal_run(T *out_data, T *out_data2, T *out_data3, T
                check_error();
        }
 
-       glGenFramebuffers(1, &fbo);
-       check_error();
-       glBindFramebuffer(GL_FRAMEBUFFER, fbo);
-       check_error();
+       vector<EffectChain::DestinationTexture> textures;
        for (unsigned i = 0; i < num_outputs; ++i) {
-               glFramebufferTexture2D(
-                       GL_FRAMEBUFFER,
-                       GL_COLOR_ATTACHMENT0 + i,
-                       GL_TEXTURE_2D,
-                       texnum[i],
-                       0);
-               check_error();
+               textures.push_back(EffectChain::DestinationTexture{texnum[i], framebuffer_format});
        }
 
-       GLenum bufs[] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1, GL_COLOR_ATTACHMENT2, GL_COLOR_ATTACHMENT3 };
-       glDrawBuffers(num_outputs, bufs);
-
-       chain.render_to_fbo(fbo, width, height);
+       chain.render_to_texture(textures, width, height);
 
 #ifdef HAVE_BENCHMARK
        // If running benchmarks: Now we've warmed up everything, so let's run the
@@ -285,7 +273,7 @@ void EffectChainTester::internal_run(T *out_data, T *out_data2, T *out_data3, T
                glFinish();
                size_t iters = benchmark_state->max_iterations;
                for (auto _ : *benchmark_state) {
-                       chain.render_to_fbo(fbo, width, height);
+                       chain.render_to_texture(textures, width, height);
                        if (--iters == 0) {
                                glFinish();
                        }
@@ -331,8 +319,6 @@ void EffectChainTester::internal_run(T *out_data, T *out_data2, T *out_data3, T
                }
        }
 
-       glDeleteFramebuffers(1, &fbo);
-       check_error();
        glDeleteTextures(num_outputs, texnum);
        check_error();
 }