From: Steinar H. Gunderson Date: Thu, 2 Aug 2018 15:19:49 +0000 (+0200) Subject: Make a new flag --detailed-timing for microsecond measurements and more. X-Git-Tag: 1.8.0~76^2~154 X-Git-Url: https://git.sesse.net/?a=commitdiff_plain;h=8b76be9145873ab01a81136c44cf58c3e8cbf90f;hp=d60474a5a6da369c37bd1a218e347a66f9995fa1;p=nageru Make a new flag --detailed-timing for microsecond measurements and more. --- diff --git a/flow.cpp b/flow.cpp index bda8eec..d9c165f 100644 --- a/flow.cpp +++ b/flow.cpp @@ -47,6 +47,7 @@ constexpr unsigned patch_size_pixels = 12; float vr_alpha = 1.0f, vr_delta = 0.25f, vr_gamma = 0.25f; bool enable_timing = true; +bool detailed_timing = false; bool enable_variational_refinement = true; // Just for debugging. bool enable_interpolation = false; @@ -800,7 +801,7 @@ void SetupEquations::exec(GLuint I_x_y_tex, GLuint I_t_tex, GLuint diff_flow_tex class SOR { public: SOR(); - void exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations); + void exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations, ScopedTimer *sor_timer); private: PersistentFBOSet<1> fbos; @@ -838,7 +839,7 @@ SOR::SOR() uniform_phase = glGetUniformLocation(sor_program, "phase"); } -void SOR::exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations) +void SOR::exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_tex, GLuint smoothness_y_tex, int level_width, int level_height, int num_iterations, ScopedTimer *sor_timer) { glUseProgram(sor_program); @@ -857,14 +858,20 @@ void SOR::exec(GLuint diff_flow_tex, GLuint equation_tex, GLuint smoothness_x_te fbos.render_to(diff_flow_tex); for (int i = 0; i < num_iterations; ++i) { - glProgramUniform1i(sor_program, uniform_phase, 0); - glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); - glTextureBarrier(); - glProgramUniform1i(sor_program, uniform_phase, 1); - glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); - if (i != num_iterations - 1) { + { + ScopedTimer timer("Red pass", sor_timer); + glProgramUniform1i(sor_program, uniform_phase, 0); + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); glTextureBarrier(); } + { + ScopedTimer timer("Black pass", sor_timer); + glProgramUniform1i(sor_program, uniform_phase, 1); + glDrawArrays(GL_TRIANGLE_STRIP, 0, 4); + if (i != num_iterations - 1) { + glTextureBarrier(); + } + } } } @@ -1190,7 +1197,7 @@ GLuint DISComputeFlow::exec(GLuint tex0, GLuint tex1, ResizeStrategy resize_stra // Note that these are to/from the same texture. { ScopedTimer timer("SOR", &varref_timer); - sor.exec(du_dv_tex, equation_tex, smoothness_x_tex, smoothness_y_tex, level_width, level_height, 5); + sor.exec(du_dv_tex, equation_tex, smoothness_x_tex, smoothness_y_tex, level_width, level_height, 5, &timer); } } @@ -1974,6 +1981,7 @@ int main(int argc, char **argv) { "intensity-relative-weight", required_argument, 0, 'i' }, // delta. { "gradient-relative-weight", required_argument, 0, 'g' }, // gamma. { "disable-timing", no_argument, 0, 1000 }, + { "detailed-timing", no_argument, 0, 1003 }, { "ignore-variational-refinement", no_argument, 0, 1001 }, // Still calculates it, just doesn't apply it. { "interpolate", no_argument, 0, 1002 } }; @@ -2004,6 +2012,9 @@ int main(int argc, char **argv) case 1002: enable_interpolation = true; break; + case 1003: + detailed_timing = true; + break; default: fprintf(stderr, "Unknown option '%s'\n", argv[option_index]); exit(1); diff --git a/gpu_timers.cpp b/gpu_timers.cpp index b58f1cd..a722051 100644 --- a/gpu_timers.cpp +++ b/gpu_timers.cpp @@ -23,17 +23,45 @@ pair GPUTimers::begin_timer(const string &name, int level) return timer.query; } +GLint64 find_elapsed(pair queries) +{ + // NOTE: This makes the CPU wait for the GPU. + GLuint64 time_start, time_end; + glGetQueryObjectui64v(queries.first, GL_QUERY_RESULT, &time_start); + glGetQueryObjectui64v(queries.second, GL_QUERY_RESULT, &time_end); + return time_end - time_start; +} + void GPUTimers::print() { - for (const Timer &timer : timers) { - // NOTE: This makes the CPU wait for the GPU. - GLuint64 time_start, time_end; - glGetQueryObjectui64v(timer.query.first, GL_QUERY_RESULT, &time_start); - glGetQueryObjectui64v(timer.query.second, GL_QUERY_RESULT, &time_end); - //fprintf(stderr, "GPU time used = %.1f ms\n", time_elapsed / 1e6); - for (int i = 0; i < timer.level * 2; ++i) { + for (size_t i = 0; i < timers.size(); ++i) { + if (timers[i].level >= 4 && !detailed_timing) { + // In practice, only affects the SOR sub-timers. + continue; + } + + GLint64 time_elapsed = find_elapsed(timers[i].query); + for (int j = 0; j < timers[i].level * 2; ++j) { fprintf(stderr, " "); } - fprintf(stderr, "%-30s %4.1f ms\n", timer.name.c_str(), GLint64(time_end - time_start) / 1e6); + + if (detailed_timing) { + // Look for any immediate subtimers, and see if they sum to the large one. + size_t num_subtimers = 0; + GLint64 sum_subtimers = 0; + for (size_t j = i + 1; j < timers.size() && timers[j].level > timers[i].level; ++j) { + if (timers[j].level != timers[i].level + 1) continue; + ++num_subtimers; + sum_subtimers += find_elapsed(timers[j].query); + } + + if (num_subtimers > 0 && (time_elapsed - sum_subtimers) / 1e6 >= 0.01) { + fprintf(stderr, "%-30s %4.3f ms [%4.3f ms unaccounted for]\n", timers[i].name.c_str(), time_elapsed / 1e6, (time_elapsed - sum_subtimers) / 1e6); + } else { + fprintf(stderr, "%-30s %4.3f ms\n", timers[i].name.c_str(), time_elapsed / 1e6); + } + } else { + fprintf(stderr, "%-30s %4.1f ms\n", timers[i].name.c_str(), time_elapsed / 1e6); + } } } diff --git a/gpu_timers.h b/gpu_timers.h index 34cd625..91dfed8 100644 --- a/gpu_timers.h +++ b/gpu_timers.h @@ -8,6 +8,7 @@ #include extern bool enable_timing; +extern bool detailed_timing; class GPUTimers { public: