]> git.sesse.net Git - nageru/commitdiff
Make a new flag --detailed-timing for microsecond measurements and more.
authorSteinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 2 Aug 2018 15:19:49 +0000 (17:19 +0200)
committerSteinar H. Gunderson <sgunderson@bigfoot.com>
Thu, 2 Aug 2018 15:19:49 +0000 (17:19 +0200)
flow.cpp
gpu_timers.cpp
gpu_timers.h

index bda8eecfc162ff18bd4db86210a79729036204a2..d9c165fb139c2338492bfded5ca004a25e7f49b3 100644 (file)
--- a/flow.cpp
+++ b/flow.cpp
@@ -47,6 +47,7 @@ constexpr unsigned patch_size_pixels = 12;
 float vr_alpha = 1.0f, vr_delta = 0.25f, vr_gamma = 0.25f;
 
 bool enable_timing = true;
+bool detailed_timing = false;
 bool enable_variational_refinement = true;  // Just for debugging.
 bool enable_interpolation = false;
 
@@ -800,7 +801,7 @@ void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex
 class SOR {
 public:
        SOR();
-       void exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations);
+       void exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations, ScopedTimer *sor_timer);
 
 private:
        PersistentFBOSet<1> fbos;
@@ -838,7 +839,7 @@ SOR::SOR()
        uniform_phase = glGetUniformLocation(sor_program, "phase");
 }
 
-void SOR::exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations)
+void SOR::exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations, ScopedTimer *sor_timer)
 {
        glUseProgram(sor_program);
 
@@ -857,14 +858,20 @@ void SOR::exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_te
        fbos.render_to(diff_flow_tex);
 
        for (int i = 0; i < num_iterations; ++i) {
-               glProgramUniform1i(sor_program, uniform_phase, 0);
-               glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-               glTextureBarrier();
-               glProgramUniform1i(sor_program, uniform_phase, 1);
-               glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
-               if (i != num_iterations - 1) {
+               {
+                       ScopedTimer timer("Red pass", sor_timer);
+                       glProgramUniform1i(sor_program, uniform_phase, 0);
+                       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
                        glTextureBarrier();
                }
+               {
+                       ScopedTimer timer("Black pass", sor_timer);
+                       glProgramUniform1i(sor_program, uniform_phase, 1);
+                       glDrawArrays(GL_TRIANGLE_STRIP, 0, 4);
+                       if (i != num_iterations - 1) {
+                               glTextureBarrier();
+                       }
+               }
        }
 }
 
@@ -1190,7 +1197,7 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra
                        // Note that these are to/from the same texture.
                        {
                                ScopedTimer timer("SOR", &varref_timer);
-                               sor.exec(du_dv_tex, equation_tex, smoothness_x_tex, smoothness_y_tex, level_width, level_height, 5);
+                               sor.exec(du_dv_tex, equation_tex, smoothness_x_tex, smoothness_y_tex, level_width, level_height, 5, &timer);
                        }
                }
 
@@ -1974,6 +1981,7 @@ int main(int argc, char **argv)
                { "intensity-relative-weight", required_argument, 0, 'i' },  // delta.
                { "gradient-relative-weight", required_argument, 0, 'g' },  // gamma.
                { "disable-timing", no_argument, 0, 1000 },
+               { "detailed-timing", no_argument, 0, 1003 },
                { "ignore-variational-refinement", no_argument, 0, 1001 },  // Still calculates it, just doesn't apply it.
                { "interpolate", no_argument, 0, 1002 }
        };
@@ -2004,6 +2012,9 @@ int main(int argc, char **argv)
                case 1002:
                        enable_interpolation = true;
                        break;
+               case 1003:
+                       detailed_timing = true;
+                       break;
                default:
                        fprintf(stderr, "Unknown option '%s'\n", argv[option_index]);
                        exit(1);
index b58f1cd50d36530584a62d7f81cec10575792fc0..a72205125b2956bb9d32be441b20c5a249dc36ba 100644 (file)
@@ -23,17 +23,45 @@ pair<GLuint, GLuint> GPUTimers::begin_timer(const string &name, int level)
        return timer.query;
 }
 
+GLint64 find_elapsed(pair<GLuint, GLuint> queries)
+{
+       // NOTE: This makes the CPU wait for the GPU.
+       GLuint64 time_start, time_end;
+       glGetQueryObjectui64v(queries.first, GL_QUERY_RESULT, &time_start);
+       glGetQueryObjectui64v(queries.second, GL_QUERY_RESULT, &time_end);
+       return time_end - time_start;
+}
+
 void GPUTimers::print()
 {
-       for (const Timer &timer : timers) {
-               // NOTE: This makes the CPU wait for the GPU.
-               GLuint64 time_start, time_end;
-               glGetQueryObjectui64v(timer.query.first, GL_QUERY_RESULT, &time_start);
-               glGetQueryObjectui64v(timer.query.second, GL_QUERY_RESULT, &time_end);
-               //fprintf(stderr, "GPU time used = %.1f ms\n", time_elapsed / 1e6);
-               for (int i = 0; i < timer.level * 2; ++i) {
+       for (size_t i = 0; i < timers.size(); ++i) {
+               if (timers[i].level >= 4 && !detailed_timing) {
+                       // In practice, only affects the SOR sub-timers.
+                       continue;
+               }
+
+               GLint64 time_elapsed = find_elapsed(timers[i].query);
+               for (int j = 0; j < timers[i].level * 2; ++j) {
                        fprintf(stderr, " ");
                }
-               fprintf(stderr, "%-30s %4.1f ms\n", timer.name.c_str(), GLint64(time_end - time_start) / 1e6);
+
+               if (detailed_timing) {
+                       // Look for any immediate subtimers, and see if they sum to the large one.
+                       size_t num_subtimers = 0;
+                       GLint64 sum_subtimers = 0;
+                       for (size_t j = i + 1; j < timers.size() && timers[j].level > timers[i].level; ++j) {
+                               if (timers[j].level != timers[i].level + 1) continue;
+                               ++num_subtimers;
+                               sum_subtimers += find_elapsed(timers[j].query);
+                       }
+
+                       if (num_subtimers > 0 && (time_elapsed - sum_subtimers) / 1e6 >= 0.01) {
+                               fprintf(stderr, "%-30s %4.3f ms [%4.3f ms unaccounted for]\n", timers[i].name.c_str(), time_elapsed / 1e6, (time_elapsed - sum_subtimers) / 1e6);
+                       } else {
+                               fprintf(stderr, "%-30s %4.3f ms\n", timers[i].name.c_str(), time_elapsed / 1e6);
+                       }
+               } else {
+                       fprintf(stderr, "%-30s %4.1f ms\n", timers[i].name.c_str(), time_elapsed / 1e6);
+               }
        }
 }
index 34cd625e1f172395a9d3c058c86ade692d6ff64a..91dfed8d929053cf4d079266f2fbf4de25ad33cb 100644 (file)
@@ -8,6 +8,7 @@
 #include <epoxy/gl.h>
 
 extern bool enable_timing;
+extern bool detailed_timing;
 
 class GPUTimers {
 public: