From adc0df09f7a9dc88a3c0dbad47a21a805e728862 Mon Sep 17 00:00:00 2001 From: "Steinar H. Gunderson" Date: Sun, 8 Mar 2020 15:57:51 +0100 Subject: [PATCH] Change Futatabi frames to be cached as textures instead of in system memory. The JPEGs are now decoded into PBO bounce buffers, which saves a lot of CPU time (copying is asynchronous, and done by the GPU -- plus we avoid an extra copy into a staging buffer). Similarly, keeping the cache in textures allows the driver (if it wants!) to keep it in VRAM, saving repeated uploading if the same frame is used multiple times. CPU usage is down from 1.05 to 0.60 cores on my machine, when not playing. More importantly, the 99th-percentile player queue status is dramatically better. --- futatabi/jpeg_frame.h | 12 +++-- futatabi/jpeg_frame_view.cpp | 78 ++++++++++++++++++++-------- futatabi/jpeg_frame_view.h | 2 +- futatabi/mainwindow.cpp | 1 + futatabi/pbo_pool.cpp | 79 +++++++++++++++++++++++++++++ futatabi/pbo_pool.h | 51 +++++++++++++++++++ futatabi/vaapi_jpeg_decoder.cpp | 29 ++++++++--- futatabi/video_stream.cpp | 90 ++++++++++++++++++++++----------- futatabi/ycbcr_converter.cpp | 12 ++--- meson.build | 2 +- nageru/image_input.cpp | 2 +- nageru/image_input.h | 2 +- shared/meson.build | 5 +- shared/ref_counted_texture.cpp | 25 +++++++++ shared/ref_counted_texture.h | 6 ++- 15 files changed, 319 insertions(+), 77 deletions(-) create mode 100644 futatabi/pbo_pool.cpp create mode 100644 futatabi/pbo_pool.h create mode 100644 shared/ref_counted_texture.cpp diff --git a/futatabi/jpeg_frame.h b/futatabi/jpeg_frame.h index 6fd0d4b..5e94cbb 100644 --- a/futatabi/jpeg_frame.h +++ b/futatabi/jpeg_frame.h @@ -4,15 +4,19 @@ #include #include +#include "shared/ref_counted_gl_sync.h" +#include "shared/ref_counted_texture.h" + struct Frame { bool is_semiplanar = false; - std::unique_ptr<uint8_t[]> y; - std::unique_ptr<uint8_t[]> cb, cr; // For planar. - std::unique_ptr<uint8_t[]> cbcr; // For semiplanar. + RefCountedTexture y; + RefCountedTexture cb, cr; // For planar. + RefCountedTexture cbcr; // For semiplanar. unsigned width, height; unsigned chroma_subsampling_x, chroma_subsampling_y; - unsigned pitch_y, pitch_chroma; std::string exif_data; + RefCountedGLsync uploaded_ui_thread; + RefCountedGLsync uploaded_interpolation; }; #endif // !defined(_JPEG_FRAME_H) diff --git a/futatabi/jpeg_frame_view.cpp b/futatabi/jpeg_frame_view.cpp index ebcf509..85c708d 100644 --- a/futatabi/jpeg_frame_view.cpp +++ b/futatabi/jpeg_frame_view.cpp @@ -4,6 +4,8 @@ #include "flags.h" #include "jpeg_destroyer.h" #include "jpeglib_error_wrapper.h" +#include "pbo_pool.h" +#include "shared/context.h" #include "shared/metrics.h" #include "shared/post_to_main_thread.h" #include "video_stream.h" @@ -159,15 +161,15 @@ shared_ptr<Frame> decode_jpeg(const string &jpeg) unsigned luma_width_blocks = mcu_width_blocks * dinfo.comp_info[0].h_samp_factor; unsigned chroma_width_blocks = mcu_width_blocks * dinfo.comp_info[1].h_samp_factor; - unsigned luma_height_blocks = mcu_height_blocks * dinfo.comp_info[0].v_samp_factor; - unsigned chroma_height_blocks = mcu_height_blocks * dinfo.comp_info[1].v_samp_factor; - // TODO: Decode into a PBO.
- frame->y.reset(new uint8_t[luma_width_blocks * luma_height_blocks * DCTSIZE2]); - frame->cb.reset(new uint8_t[chroma_width_blocks * chroma_height_blocks * DCTSIZE2]); - frame->cr.reset(new uint8_t[chroma_width_blocks * chroma_height_blocks * DCTSIZE2]); - frame->pitch_y = luma_width_blocks * DCTSIZE; - frame->pitch_chroma = chroma_width_blocks * DCTSIZE; + PBO pbo = global_pbo_pool->alloc_pbo(); + size_t cb_offset = dinfo.image_width * dinfo.image_height; + size_t cr_offset = cb_offset + (dinfo.image_width / 2) * dinfo.image_height; + uint8_t *y_pix = pbo.ptr; + uint8_t *cb_pix = pbo.ptr + cb_offset; + uint8_t *cr_pix = pbo.ptr + cr_offset; + unsigned pitch_y = luma_width_blocks * DCTSIZE; + unsigned pitch_chroma = chroma_width_blocks * DCTSIZE * 2; if (dinfo.marker_list != nullptr && dinfo.marker_list->marker == JPEG_APP0 + 1 && @@ -177,15 +179,15 @@ shared_ptr decode_jpeg(const string &jpeg) dinfo.marker_list->data_length); } - if (!error_mgr.run([&dinfo, &frame, v_mcu_size, mcu_height_blocks] { + if (!error_mgr.run([&dinfo, &y_pix, &cb_pix, &cr_pix, pitch_y, pitch_chroma, v_mcu_size, mcu_height_blocks] { JSAMPROW yptr[v_mcu_size], cbptr[v_mcu_size], crptr[v_mcu_size]; JSAMPARRAY data[3] = { yptr, cbptr, crptr }; for (unsigned y = 0; y < mcu_height_blocks; ++y) { // NOTE: The last elements of cbptr/crptr will be unused for vertically subsampled chroma. for (unsigned yy = 0; yy < v_mcu_size; ++yy) { - yptr[yy] = frame->y.get() + (y * DCTSIZE * dinfo.max_v_samp_factor + yy) * frame->pitch_y; - cbptr[yy] = frame->cb.get() + (y * DCTSIZE * dinfo.comp_info[1].v_samp_factor + yy) * frame->pitch_chroma; - crptr[yy] = frame->cr.get() + (y * DCTSIZE * dinfo.comp_info[1].v_samp_factor + yy) * frame->pitch_chroma; + yptr[yy] = y_pix + (y * DCTSIZE * dinfo.max_v_samp_factor + yy) * pitch_y; + cbptr[yy] = cb_pix + (y * DCTSIZE * dinfo.comp_info[1].v_samp_factor + yy) * pitch_chroma; + crptr[yy] = cr_pix + (y * DCTSIZE * dinfo.comp_info[1].v_samp_factor + yy) * pitch_chroma; } jpeg_read_raw_data(&dinfo, data, v_mcu_size); @@ -196,6 +198,20 @@ shared_ptr decode_jpeg(const string &jpeg) return get_black_frame(); } + // FIXME: what about resolutions that are not divisible by the block factor? 
+ glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo.pbo); + frame->y = create_texture_2d(frame->width, frame->height, GL_R8, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0)); + frame->cb = create_texture_2d(frame->width / 2, frame->height, GL_R8, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(cb_offset)); + frame->cr = create_texture_2d(frame->width / 2, frame->height, GL_R8, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(cr_offset)); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + glFlushMappedNamedBufferRange(pbo.pbo, 0, dinfo.image_width * dinfo.image_height * 2); + glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); + pbo.upload_done = RefCountedGLsync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0); + frame->uploaded_ui_thread = pbo.upload_done; + frame->uploaded_interpolation = pbo.upload_done; + global_pbo_pool->release_pbo(move(pbo)); + ++metric_jpeg_software_decode_frames; steady_clock::time_point stop = steady_clock::now(); metric_jpeg_decode_time_seconds.count_event(duration(stop - start).count()); @@ -276,6 +292,13 @@ void JPEGFrameView::jpeg_decoder_thread_func() size_t num_decoded = 0, num_dropped = 0; pthread_setname_np(pthread_self(), "JPEGDecoder"); + QSurface *surface = create_surface(); + QOpenGLContext *context = create_context(surface); + bool ok = make_current(context, surface); + if (!ok) { + fprintf(stderr, "Video stream couldn't get an OpenGL context\n"); + abort(); + } while (!should_quit.load()) { PendingDecode decode; CacheMissBehavior cache_miss_behavior = DECODE_IF_NOT_IN_CACHE; @@ -392,6 +415,8 @@ void JPEGFrameView::setFrame(shared_ptr frame) void JPEGFrameView::initializeGL() { + init_pbo_pool(); + glDisable(GL_BLEND); glDisable(GL_DEPTH_TEST); check_error(); @@ -436,6 +461,14 @@ void JPEGFrameView::paintGL() ++metric_jpeg_displayed_frames; displayed_this_frame = true; } + if (current_frame->uploaded_ui_thread != nullptr) { + glWaitSync(current_frame->uploaded_ui_thread.get(), /*flags=*/0, GL_TIMEOUT_IGNORED); + current_frame->uploaded_ui_thread.reset(); + } + if (current_secondary_frame != nullptr && current_secondary_frame->uploaded_ui_thread != nullptr) { + glWaitSync(current_secondary_frame->uploaded_ui_thread.get(), /*flags=*/0, GL_TIMEOUT_IGNORED); + current_secondary_frame->uploaded_ui_thread.reset(); + } check_error(); current_chain->render_to_screen(); @@ -527,16 +560,17 @@ shared_ptr get_black_frame() static shared_ptr black_frame; static once_flag flag; call_once(flag, [] { - black_frame.reset(new Frame); - black_frame->y.reset(new uint8_t[global_flags.width * global_flags.height]); - black_frame->cb.reset(new uint8_t[(global_flags.width / 2) * (global_flags.height / 2)]); - black_frame->cr.reset(new uint8_t[(global_flags.width / 2) * (global_flags.height / 2)]); - black_frame->width = global_flags.width; - black_frame->height = global_flags.height; - black_frame->chroma_subsampling_x = 2; - black_frame->chroma_subsampling_y = 2; - black_frame->pitch_y = global_flags.width; - black_frame->pitch_chroma = global_flags.width / 2; + // Not really black, but whatever. 
:-) + uint8_t black[] = { 0, 0, 0, 255 }; + RefCountedTexture black_tex = create_texture_2d(1, 1, GL_RGBA8, GL_RGBA, GL_UNSIGNED_BYTE, black); + + black_frame->y = black_tex; + black_frame->cb = black_tex; + black_frame->cr = move(black_tex); + black_frame->width = 1; + black_frame->height = 1; + black_frame->chroma_subsampling_x = 1; + black_frame->chroma_subsampling_y = 1; }); ++metric_jpeg_software_fail_frames; return black_frame; diff --git a/futatabi/jpeg_frame_view.h b/futatabi/jpeg_frame_view.h index 3f92e4c..693ea9b 100644 --- a/futatabi/jpeg_frame_view.h +++ b/futatabi/jpeg_frame_view.h @@ -63,7 +63,7 @@ private: movit::EffectChain *current_chain = nullptr; // Owned by ycbcr_converter. bool displayed_this_frame = false; // Owned by the UI frame. - std::shared_ptr current_frame; // So that we hold on to the pixels. + std::shared_ptr current_frame; // So that we hold on to the textures. std::shared_ptr current_secondary_frame; // Same. int overlay_base_width = 16, overlay_base_height = 16; diff --git a/futatabi/mainwindow.cpp b/futatabi/mainwindow.cpp index 49581e5..0bc11d8 100644 --- a/futatabi/mainwindow.cpp +++ b/futatabi/mainwindow.cpp @@ -7,6 +7,7 @@ #include "player.h" #include "futatabi_midi_mapping.pb.h" #include "midi_mapping_dialog.h" +#include "pbo_pool.h" #include "shared/aboutdialog.h" #include "shared/disk_space_estimator.h" #include "shared/post_to_main_thread.h" diff --git a/futatabi/pbo_pool.cpp b/futatabi/pbo_pool.cpp new file mode 100644 index 0000000..1933b31 --- /dev/null +++ b/futatabi/pbo_pool.cpp @@ -0,0 +1,79 @@ +#include "pbo_pool.h" + +#include +#include + +#include + +using namespace std; +using namespace std::chrono; + +once_flag global_pbo_pool_inited; +PBOPool *global_pbo_pool = nullptr; + +void init_pbo_pool() +{ + call_once(global_pbo_pool_inited, []{ + global_pbo_pool = new PBOPool; + }); +} + +PBOPool::PBOPool(size_t pbo_size, size_t num_pbos, GLenum buffer, GLenum permissions, GLenum map_bits) + : pbo_size(pbo_size), buffer(buffer), permissions(permissions), map_bits(map_bits) +{ + for (size_t i = 0; i < num_pbos; ++i) { + freelist.push(create_pbo()); + } +} + +PBO PBOPool::alloc_pbo() +{ + PBO pbo; + bool found_pbo = false; + { + lock_guard lock(freelist_mutex); + if (!freelist.empty()) { + pbo = move(freelist.front()); + freelist.pop(); + found_pbo = true; + } + } + + if (!found_pbo) { + fprintf(stderr, "WARNING: Out of PBOs for texture upload, creating a new one\n"); + pbo = create_pbo(); + } + if (pbo.upload_done != nullptr) { + if (glClientWaitSync(pbo.upload_done.get(), 0, 0) == GL_TIMEOUT_EXPIRED) { + steady_clock::time_point start = steady_clock::now(); + glClientWaitSync(pbo.upload_done.get(), /*flags=*/0, GL_TIMEOUT_IGNORED); + steady_clock::time_point stop = steady_clock::now(); + + fprintf(stderr, "WARNING: PBO was not ready after previous upload, had to wait %.1f ms before reusing\n", + 1e3 * duration(stop - start).count()); + } + pbo.upload_done.reset(); + } + + return pbo; +} + +void PBOPool::release_pbo(PBO pbo) +{ + lock_guard lock(freelist_mutex); + freelist.push(move(pbo)); +} + +PBO PBOPool::create_pbo() +{ + PBO pbo; + + glCreateBuffers(1, &pbo.pbo); + check_error(); + glNamedBufferStorage(pbo.pbo, pbo_size, nullptr, permissions | GL_MAP_PERSISTENT_BIT); + check_error(); + pbo.ptr = (uint8_t *)glMapNamedBufferRange(pbo.pbo, 0, pbo_size, permissions | map_bits | GL_MAP_PERSISTENT_BIT); + check_error(); + + return pbo; +} diff --git a/futatabi/pbo_pool.h b/futatabi/pbo_pool.h new file mode 100644 index 0000000..20c61e4 --- 
/dev/null +++ b/futatabi/pbo_pool.h @@ -0,0 +1,51 @@ +#ifndef _PBO_POOL_H +#define _PBO_POOL_H 1 + +// Keeps a pool of persistently mapped PBOs around that can be used as staging +// buffers for texture uploads. (Uploading from a PBO is asynchronous and done +// by the GPU, so assuming we don't need an extra copy into the PBO, this is a +// significant win over uploading from regular malloc-ed RAM.) +// +// Unlike Nageru's PBOFrameAllocator, these are not connected to +// a given frame, since we can have thousands of frames in the cache +// at any given time. Thus, we need to have separate fences for each PBO +// to know that the upload is done. + +#include +#include + +#include + +#include "shared/ref_counted_gl_sync.h" + +struct PBO { + GLuint pbo; + uint8_t *ptr; // Mapped memory. + RefCountedGLsync upload_done; +}; + +class PBOPool { +public: + PBOPool(size_t pbo_size = 8 << 20, // 8 MB, large enough for 1080p 4:2:2. + size_t num_pbos = 8, + GLenum buffer = GL_PIXEL_UNPACK_BUFFER_ARB, + GLenum permissions = GL_MAP_WRITE_BIT, + GLenum map_bits = GL_MAP_FLUSH_EXPLICIT_BIT); + + PBO alloc_pbo(); + void release_pbo(PBO pbo); // Set a fence on upload_done if the PBO may still be in use. + +private: + PBO create_pbo(); + + std::mutex freelist_mutex; + std::queue freelist; + + size_t pbo_size; + GLenum buffer, permissions, map_bits; +}; + +extern PBOPool *global_pbo_pool; +void init_pbo_pool(); // Idempotent. + +#endif // !defined(_PBO_POOL_H) diff --git a/futatabi/vaapi_jpeg_decoder.cpp b/futatabi/vaapi_jpeg_decoder.cpp index f34654d..758d974 100644 --- a/futatabi/vaapi_jpeg_decoder.cpp +++ b/futatabi/vaapi_jpeg_decoder.cpp @@ -3,6 +3,7 @@ #include "jpeg_destroyer.h" #include "jpeg_frame.h" #include "jpeglib_error_wrapper.h" +#include "pbo_pool.h" #include "shared/memcpy_interleaved.h" #include @@ -22,6 +23,8 @@ #include #include +#define BUFFER_OFFSET(i) ((char *)nullptr + (i)) + using namespace std; static unique_ptr va_dpy; @@ -549,24 +552,38 @@ shared_ptr decode_jpeg_vaapi(const string &jpeg) #else // Convert Y'CbCr to separate Y' and CbCr. 
frame->is_semiplanar = true; - frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]); - frame->cbcr.reset(new uint8_t[dinfo.image_width * dinfo.image_height]); + + PBO pbo = global_pbo_pool->alloc_pbo(); + size_t cbcr_offset = dinfo.image_width * dinfo.image_height; + uint8_t *y_pix = pbo.ptr; + uint8_t *cbcr_pix = pbo.ptr + cbcr_offset; + const uint8_t *src = (const uint8_t *)mapped + resources.image.offsets[0]; if (resources.image.pitches[0] == dinfo.image_width * 2) { - memcpy_interleaved(frame->cbcr.get(), frame->y.get(), src, dinfo.image_width * dinfo.image_height * 2); + memcpy_interleaved(cbcr_pix, y_pix, src, dinfo.image_width * dinfo.image_height * 2); } else { for (unsigned y = 0; y < dinfo.image_height; ++y) { - memcpy_interleaved(frame->cbcr.get() + y * dinfo.image_width, frame->y.get() + y * dinfo.image_width, + memcpy_interleaved(cbcr_pix + y * dinfo.image_width, y_pix + y * dinfo.image_width, src + y * resources.image.pitches[0], dinfo.image_width * 2); } } + + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo.pbo); + frame->y = create_texture_2d(dinfo.image_width, dinfo.image_height, GL_R8, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(0)); + frame->cbcr = create_texture_2d(dinfo.image_width / 2, dinfo.image_height, GL_RG8, GL_RG, GL_UNSIGNED_BYTE, BUFFER_OFFSET(cbcr_offset)); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0); + + glFlushMappedNamedBufferRange(pbo.pbo, 0, dinfo.image_width * dinfo.image_height * 2); + glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); + pbo.upload_done = RefCountedGLsync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0); + frame->uploaded_ui_thread = pbo.upload_done; + frame->uploaded_interpolation = pbo.upload_done; + global_pbo_pool->release_pbo(move(pbo)); #endif frame->width = dinfo.image_width; frame->height = dinfo.image_height; frame->chroma_subsampling_x = 2; frame->chroma_subsampling_y = 1; - frame->pitch_y = dinfo.image_width; - frame->pitch_chroma = dinfo.image_width / 2; if (dinfo.marker_list != nullptr && dinfo.marker_list->marker == JPEG_APP0 + 1 && diff --git a/futatabi/video_stream.cpp b/futatabi/video_stream.cpp index 5a36801..591ee7e 100644 --- a/futatabi/video_stream.cpp +++ b/futatabi/video_stream.cpp @@ -11,6 +11,7 @@ extern "C" { #include "flow.h" #include "jpeg_frame_view.h" #include "movit/util.h" +#include "pbo_pool.h" #include "player.h" #include "shared/context.h" #include "shared/httpd.h" @@ -37,6 +38,14 @@ Summary metric_interpolation_latency_seconds; Summary metric_fade_fence_wait_time_seconds; Summary metric_interpolation_fence_wait_time_seconds; +void wait_for_upload(shared_ptr &frame) +{ + if (frame->uploaded_interpolation != nullptr) { + glWaitSync(frame->uploaded_interpolation.get(), /*flags=*/0, GL_TIMEOUT_IGNORED); + frame->uploaded_interpolation.reset(); + } +} + } // namespace extern HTTPD *global_httpd; @@ -152,6 +161,16 @@ string encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const uint8_t return move(dest.dest); } +string encode_jpeg_from_pbo(void *contents, unsigned width, unsigned height, const string exif_data) +{ + unsigned chroma_width = width / 2; + + const uint8_t *y = (const uint8_t *)contents; + const uint8_t *cb = (const uint8_t *)contents + width * height; + const uint8_t *cr = (const uint8_t *)contents + width * height + chroma_width * height; + return encode_jpeg(y, cb, cr, width, height, move(exif_data)); +} + VideoStream::VideoStream(AVFormatContext *file_avctx) : avctx(file_avctx), output_fast_forward(file_avctx != nullptr) { @@ -430,6 +449,8 @@ void 
VideoStream::schedule_faded_frame(steady_clock::time_point local_pts, int64 shared_ptr frame1 = decode_jpeg_with_cache(frame1_spec, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode); shared_ptr frame2 = decode_jpeg_with_cache(frame2_spec, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode); + wait_for_upload(frame1); + wait_for_upload(frame2); ycbcr_semiplanar_converter->prepare_chain_for_fade(frame1, frame2, fade_alpha)->render_to_fbo(resources->fade_fbo, global_flags.width, global_flags.height); @@ -517,6 +538,7 @@ void VideoStream::schedule_interpolated_frame(steady_clock::time_point local_pts FrameOnDisk frame_spec = frame_no == 1 ? frame2 : frame1; bool did_decode; shared_ptr frame = decode_jpeg_with_cache(frame_spec, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode); + wait_for_upload(frame); ycbcr_converter->prepare_chain_for_conversion(frame)->render_to_fbo(resources->input_fbos[frame_no], global_flags.width, global_flags.height); if (frame_no == 1) { qf.exif_data = frame->exif_data; // Use the white point from the last frame. @@ -557,6 +579,7 @@ void VideoStream::schedule_interpolated_frame(steady_clock::time_point local_pts // Now decode the image we are fading against. bool did_decode; shared_ptr frame2 = decode_jpeg_with_cache(secondary_frame, DECODE_IF_NOT_IN_CACHE, &frame_reader, &did_decode); + wait_for_upload(frame2); // Then fade against it, putting it into the fade Y' and CbCr textures. RGBTriplet neutral_color = get_neutral_color(qf.exif_data); @@ -645,31 +668,27 @@ void VideoStream::schedule_silence(steady_clock::time_point local_pts, int64_t o namespace { -shared_ptr frame_from_pbo(void *contents, size_t width, size_t height) +RefCountedTexture clone_r8_texture(GLuint src_tex, unsigned width, unsigned height) { - size_t chroma_width = width / 2; - - const uint8_t *y = (const uint8_t *)contents; - const uint8_t *cb = (const uint8_t *)contents + width * height; - const uint8_t *cr = (const uint8_t *)contents + width * height + chroma_width * height; + GLuint tex; + glCreateTextures(GL_TEXTURE_2D, 1, &tex); + check_error(); + glTextureStorage2D(tex, 1, GL_R8, width, height); + check_error(); + glCopyImageSubData(src_tex, GL_TEXTURE_2D, 0, 0, 0, 0, + tex, GL_TEXTURE_2D, 0, 0, 0, 0, + width, height, 1); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); - shared_ptr frame(new Frame); - frame->y.reset(new uint8_t[width * height]); - frame->cb.reset(new uint8_t[chroma_width * height]); - frame->cr.reset(new uint8_t[chroma_width * height]); - for (unsigned yy = 0; yy < height; ++yy) { - memcpy(frame->y.get() + width * yy, y + width * yy, width); - memcpy(frame->cb.get() + chroma_width * yy, cb + chroma_width * yy, chroma_width); - memcpy(frame->cr.get() + chroma_width * yy, cr + chroma_width * yy, chroma_width); - } - frame->is_semiplanar = false; - frame->width = width; - frame->height = height; - frame->chroma_subsampling_x = 2; - frame->chroma_subsampling_y = 1; - frame->pitch_y = width; - frame->pitch_chroma = chroma_width; - return frame; + return RefCountedTexture(new GLuint(tex), TextureDeleter()); } } // namespace @@ -685,6 +704,8 @@ void VideoStream::encode_thread_func() abort(); } + init_pbo_pool(); + while (!should_quit) { QueuedFrame qf; { @@ -751,11 +772,8 @@ void 
VideoStream::encode_thread_func() metric_fade_fence_wait_time_seconds.count_event(duration(stop - start).count()); metric_fade_latency_seconds.count_event(duration(stop - qf.fence_created).count()); - shared_ptr frame = frame_from_pbo(qf.resources->pbo_contents, global_flags.width, global_flags.height); - assert(frame->exif_data.empty()); - // Now JPEG encode it, and send it on to the stream. - string jpeg = encode_jpeg(frame->y.get(), frame->cb.get(), frame->cr.get(), global_flags.width, global_flags.height, /*exif_data=*/""); + string jpeg = encode_jpeg_from_pbo(qf.resources->pbo_contents, global_flags.width, global_flags.height, /*exif_data=*/""); AVPacket pkt; av_init_packet(&pkt); @@ -775,13 +793,25 @@ void VideoStream::encode_thread_func() metric_interpolation_latency_seconds.count_event(duration(stop - qf.fence_created).count()); // Send it on to display. - shared_ptr frame = frame_from_pbo(qf.resources->pbo_contents, global_flags.width, global_flags.height); if (qf.display_decoded_func != nullptr) { - qf.display_decoded_func(frame); + shared_ptr frame(new Frame); + if (qf.type == QueuedFrame::FADED_INTERPOLATED) { + frame->y = clone_r8_texture(qf.resources->fade_y_output_tex, global_flags.width, global_flags.height); + } else { + frame->y = clone_r8_texture(qf.output_tex, global_flags.width, global_flags.height); + } + frame->cb = clone_r8_texture(qf.resources->cb_tex, global_flags.width / 2, global_flags.height); + frame->cr = clone_r8_texture(qf.resources->cr_tex, global_flags.width / 2, global_flags.height); + frame->width = global_flags.width; + frame->height = global_flags.height; + frame->chroma_subsampling_x = 2; + frame->chroma_subsampling_y = 1; + frame->uploaded_ui_thread = RefCountedGLsync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0); + qf.display_decoded_func(move(frame)); } // Now JPEG encode it, and send it on to the stream. 
- string jpeg = encode_jpeg(frame->y.get(), frame->cb.get(), frame->cr.get(), global_flags.width, global_flags.height, move(qf.exif_data)); + string jpeg = encode_jpeg_from_pbo(qf.resources->pbo_contents, global_flags.width, global_flags.height, move(qf.exif_data)); if (qf.flow_tex != 0) { compute_flow->release_texture(qf.flow_tex); } diff --git a/futatabi/ycbcr_converter.cpp b/futatabi/ycbcr_converter.cpp index 2d2f32f..0edd7e6 100644 --- a/futatabi/ycbcr_converter.cpp +++ b/futatabi/ycbcr_converter.cpp @@ -194,15 +194,11 @@ void setup_input_for_frame(shared_ptr frame, const YCbCrFormat &ycbcr_for input->set_width(frame->width); input->set_height(frame->height); - input->set_pixel_data(0, frame->y.get()); - input->set_pitch(0, frame->pitch_y); + input->set_texture_num(0, *frame->y); if (frame->is_semiplanar) { - input->set_pixel_data(1, frame->cbcr.get()); - input->set_pitch(1, frame->pitch_chroma); + input->set_texture_num(1, *frame->cbcr); } else { - input->set_pixel_data(1, frame->cb.get()); - input->set_pixel_data(2, frame->cr.get()); - input->set_pitch(1, frame->pitch_chroma); - input->set_pitch(2, frame->pitch_chroma); + input->set_texture_num(1, *frame->cb); + input->set_texture_num(2, *frame->cr); } } diff --git a/meson.build b/meson.build index d705c7f..76b4b59 100644 --- a/meson.build +++ b/meson.build @@ -290,7 +290,7 @@ futatabi_srcs += ['futatabi/main.cpp', 'futatabi/player.cpp', 'futatabi/video_st futatabi_srcs += ['futatabi/vaapi_jpeg_decoder.cpp', 'futatabi/db.cpp', 'futatabi/ycbcr_converter.cpp', 'futatabi/flags.cpp'] futatabi_srcs += ['futatabi/mainwindow.cpp', 'futatabi/jpeg_frame_view.cpp', 'futatabi/clip_list.cpp', 'futatabi/frame_on_disk.cpp'] futatabi_srcs += ['futatabi/export.cpp', 'futatabi/midi_mapper.cpp', 'futatabi/midi_mapping_dialog.cpp'] -futatabi_srcs += ['futatabi/exif_parser.cpp'] +futatabi_srcs += ['futatabi/exif_parser.cpp', 'futatabi/pbo_pool.cpp'] futatabi_srcs += moc_files futatabi_srcs += proto_generated diff --git a/nageru/image_input.cpp b/nageru/image_input.cpp index 6a2c5ab..afb87c5 100644 --- a/nageru/image_input.cpp +++ b/nageru/image_input.cpp @@ -228,7 +228,7 @@ shared_ptr ImageInput::load_image_raw(const string &pat glBindTexture(GL_TEXTURE_2D, 0); check_error(); - shared_ptr image(new Image{unsigned(frame->width), unsigned(frame->height), RefCountedTexture(new GLuint(tex)), last_modified}); + shared_ptr image(new Image{unsigned(frame->width), unsigned(frame->height), UniqueTexture(new GLuint(tex)), last_modified}); return image; } diff --git a/nageru/image_input.h b/nageru/image_input.h index babf5f5..7b712ab 100644 --- a/nageru/image_input.h +++ b/nageru/image_input.h @@ -27,7 +27,7 @@ public: // NOTE: You will need to call start_update_thread() yourself, once per program. struct Image { unsigned width, height; - RefCountedTexture tex; + UniqueTexture tex; timespec last_modified; }; static std::shared_ptr load_image(const std::string &filename, const std::string &pathname); diff --git a/shared/meson.build b/shared/meson.build index 5653528..c7ef06c 100644 --- a/shared/meson.build +++ b/shared/meson.build @@ -3,6 +3,7 @@ shared_qt5deps = dependency('qt5', modules: ['Core', 'Gui', 'Widgets', 'OpenGL'] libmicrohttpddep = dependency('libmicrohttpd') protobufdep = dependency('protobuf') alsadep = dependency('alsa') +movitdep = dependency('movit') # Preprocess Qt as needed. 
qt_files = qt5.preprocess( @@ -18,14 +19,14 @@ proto_generated = gen.process(['midi_mapping.proto']) protobuf_lib = static_library('protobufs', proto_generated, dependencies: [protobufdep]) protobuf_hdrs = declare_dependency(sources: proto_generated) -srcs = ['memcpy_interleaved.cpp', 'metacube2.cpp', 'ffmpeg_raii.cpp', 'mux.cpp', 'metrics.cpp', 'context.cpp', 'httpd.cpp', 'disk_space_estimator.cpp', 'read_file.cpp', 'text_proto.cpp', 'midi_device.cpp'] +srcs = ['memcpy_interleaved.cpp', 'metacube2.cpp', 'ffmpeg_raii.cpp', 'mux.cpp', 'metrics.cpp', 'context.cpp', 'httpd.cpp', 'disk_space_estimator.cpp', 'read_file.cpp', 'text_proto.cpp', 'midi_device.cpp', 'ref_counted_texture.cpp'] srcs += proto_generated # Qt objects. srcs += qt_files srcs += ['aboutdialog.cpp'] -shared = static_library('shared', srcs, include_directories: top_include, dependencies: [shared_qt5deps, libmicrohttpddep, protobufdep, alsadep]) +shared = static_library('shared', srcs, include_directories: top_include, dependencies: [shared_qt5deps, libmicrohttpddep, protobufdep, alsadep, movitdep]) shareddep = declare_dependency( sources: proto_generated, include_directories: top_include, diff --git a/shared/ref_counted_texture.cpp b/shared/ref_counted_texture.cpp new file mode 100644 index 0000000..d10b0dc --- /dev/null +++ b/shared/ref_counted_texture.cpp @@ -0,0 +1,25 @@ +#include "ref_counted_texture.h" + +#include +#include + +RefCountedTexture create_texture_2d(GLuint width, GLuint height, GLenum internal_format, GLenum format, GLenum type, const GLvoid *pixels) +{ + GLuint tex; + glCreateTextures(GL_TEXTURE_2D, 1, &tex); + check_error(); + glTextureStorage2D(tex, 1, internal_format, width, height); + check_error(); + glTextureSubImage2D(tex, 0, 0, 0, width, height, format, type, pixels); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_MIN_FILTER, GL_LINEAR); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_MAG_FILTER, GL_LINEAR); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_WRAP_S, GL_CLAMP_TO_EDGE); + check_error(); + glTextureParameteri(tex, GL_TEXTURE_WRAP_T, GL_CLAMP_TO_EDGE); + check_error(); + + return RefCountedTexture(new GLuint(tex), TextureDeleter()); +} diff --git a/shared/ref_counted_texture.h b/shared/ref_counted_texture.h index 20d0e5a..240bf86 100644 --- a/shared/ref_counted_texture.h +++ b/shared/ref_counted_texture.h @@ -14,6 +14,10 @@ struct TextureDeleter { } }; -typedef std::unique_ptr RefCountedTexture; +typedef std::unique_ptr UniqueTexture; +typedef std::shared_ptr RefCountedTexture; + +// TODO: consider mipmaps. +RefCountedTexture create_texture_2d(GLuint width, GLuint height, GLenum internal_format, GLenum format, GLenum type, const GLvoid *pixels); #endif // !defined(_REF_COUNTED_TEXTURE) -- 2.39.2
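
For reference, the core upload pattern the patch builds on (a persistently mapped PBO used as a staging buffer, an explicit flush, a texture upload sourced from the PBO, and a fence so that other threads and shared contexts know when the texture is ready) boils down to roughly the following standalone sketch. This is illustrative only, not code from the patch: it assumes libepoxy for the GL entry points (as the rest of the tree does), skips error checking, and the helper name upload_gray8_frame() is made up.

#include <epoxy/gl.h>

#include <cstdint>
#include <cstring>

struct UploadedTexture {
	GLuint tex;
	GLsync upload_done;  // Consumers wait on this before sampling tex.
};

// Uploads a single 8-bit gray plane through a persistently mapped PBO.
UploadedTexture upload_gray8_frame(const uint8_t *pixels, unsigned width, unsigned height)
{
	const size_t size = size_t(width) * height;

	// Persistently mapped, write-only staging buffer; in the patch these come
	// from PBOPool instead of being created per frame.
	GLuint pbo;
	glCreateBuffers(1, &pbo);
	glNamedBufferStorage(pbo, size, nullptr, GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
	uint8_t *ptr = (uint8_t *)glMapNamedBufferRange(
		pbo, 0, size,
		GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_FLUSH_EXPLICIT_BIT);

	// The CPU (e.g., the JPEG decoder) writes straight into the mapped range...
	memcpy(ptr, pixels, size);
	// ...and, since the mapping uses explicit flushing, tells GL which bytes changed.
	glFlushMappedNamedBufferRange(pbo, 0, size);

	// The actual texture upload is a GPU-side copy out of the PBO; the last
	// argument is a byte offset into the bound PIXEL_UNPACK buffer, not a pointer.
	GLuint tex;
	glCreateTextures(GL_TEXTURE_2D, 1, &tex);
	glTextureStorage2D(tex, 1, GL_R8, width, height);
	glTextureParameteri(tex, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, pbo);
	glTextureSubImage2D(tex, 0, 0, 0, width, height, GL_RED, GL_UNSIGNED_BYTE, nullptr);
	glBindBuffer(GL_PIXEL_UNPACK_BUFFER, 0);

	// The fence is what lets the decoder thread hand the texture to the UI or
	// interpolation context without waiting for the copy itself to finish.
	GLsync fence = glFenceSync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
	return UploadedTexture{ tex, fence };
}

A consumer on another shared context issues glWaitSync(t.upload_done, /*flags=*/0, GL_TIMEOUT_IGNORED) before its first draw call that samples t.tex, which is what paintGL() and wait_for_upload() do with uploaded_ui_thread and uploaded_interpolation in the patch.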
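
A note on the two kinds of fence waits the patch uses: the render paths (paintGL(), wait_for_upload()) call glWaitSync, which only queues the wait in the GPU command stream and returns immediately, while PBOPool::alloc_pbo() first polls with a zero-timeout glClientWaitSync and only falls back to a blocking client-side wait (plus a warning) if the previous upload from that PBO has not finished. Purely as an illustration of that distinction, the two helpers below are hypothetical and not part of the patch.

#include <epoxy/gl.h>

// GPU-side wait: the driver inserts the wait into this context's command stream,
// so the calling thread never blocks. Safe to issue right before draw calls that
// sample a freshly uploaded texture.
void wait_on_gpu(GLsync fence)
{
	glWaitSync(fence, /*flags=*/0, GL_TIMEOUT_IGNORED);
}

// CPU-side poll: with a zero timeout, glClientWaitSync never blocks; it simply
// reports whether the fence has already signaled. This is how a pool can detect
// that a staging buffer is still busy without stalling the calling thread.
bool fence_has_signaled(GLsync fence)
{
	GLenum ret = glClientWaitSync(fence, /*flags=*/0, /*timeout=*/0);
	return ret == GL_ALREADY_SIGNALED || ret == GL_CONDITION_SATISFIED;
}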