From cf158af1c2219bd9f5a9bc531fb3c1133d327b45 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Sat, 29 Sep 2018 16:37:14 +0200
Subject: [PATCH] Decode 4:2:2 JPEGs via VA-API if available.

---
 Makefile               |   3 +-
 jpeg_frame.h           |  16 ++
 jpeg_frame_view.cpp    |  81 +++---
 jpeg_frame_view.h      |  15 +-
 main.cpp               |   3 +
 memcpy_interleaved.cpp | 136 ++++++++++
 memcpy_interleaved.h   |  11 +
 vaapi_jpeg_decoder.cpp | 546 +++++++++++++++++++++++++++++++++++++++++
 vaapi_jpeg_decoder.h   |  27 ++
 video_stream.cpp       |  74 +++---
 video_stream.h         |   6 +-
 11 files changed, 851 insertions(+), 67 deletions(-)
 create mode 100644 jpeg_frame.h
 create mode 100644 memcpy_interleaved.cpp
 create mode 100644 memcpy_interleaved.h
 create mode 100644 vaapi_jpeg_decoder.cpp
 create mode 100644 vaapi_jpeg_decoder.h

diff --git a/Makefile b/Makefile
index a0075c5..a15bb6b 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ PKG_MODULES := Qt5Core Qt5Gui Qt5Widgets Qt5OpenGLExtensions Qt5OpenGL Qt5PrintS
 CXXFLAGS ?= -O2 -g -Wall  # Will be overridden by environment.
 CXXFLAGS += -fPIC $(shell pkg-config --cflags $(PKG_MODULES)) -DMOVIT_SHADER_DIR=\"$(shell pkg-config --variable=shaderdir movit)\" -pthread
 
-LDLIBS=$(shell pkg-config --libs $(PKG_MODULES)) -pthread -lavformat -lavcodec -lavutil -lswscale -lGL
+LDLIBS=$(shell pkg-config --libs $(PKG_MODULES)) -pthread -lavformat -lavcodec -lavutil -lswscale -lGL -lva -lva-drm -lva-x11 -lX11
 
 # Qt objects
 OBJS_WITH_MOC = mainwindow.o jpeg_frame_view.o clip_list.o
@@ -14,6 +14,7 @@ OBJS += $(OBJS_WITH_MOC:.o=.moc.o)
 OBJS += flow.o gpu_timers.o
 
 OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o chroma_subsampler.o
+OBJS += vaapi_jpeg_decoder.o memcpy_interleaved.o
 
 %.o: %.cpp
 	$(CXX) -MMD -MP $(CPPFLAGS) $(CXXFLAGS) -o $@ -c $<
diff --git a/jpeg_frame.h b/jpeg_frame.h
new file mode 100644
index 0000000..eb73e13
--- /dev/null
+++ b/jpeg_frame.h
@@ -0,0 +1,16 @@
+#ifndef _JPEG_FRAME_H
+#define _JPEG_FRAME_H 1
+
+#include <memory>
+
+struct Frame {
+	bool is_semiplanar = false;
+	std::unique_ptr<uint8_t[]> y;
+	std::unique_ptr<uint8_t[]> cb, cr;  // For planar.
+	std::unique_ptr<uint8_t[]> cbcr;  // For semiplanar.
+	unsigned width, height;
+	unsigned chroma_subsampling_x, chroma_subsampling_y;
+	unsigned pitch_y, pitch_chroma;
+};
+
+#endif  // !defined(_JPEG_FRAME_H)
diff --git a/jpeg_frame_view.cpp b/jpeg_frame_view.cpp
index 73030ff..ef1cded 100644
--- a/jpeg_frame_view.cpp
+++ b/jpeg_frame_view.cpp
@@ -20,6 +20,7 @@
 #include "defs.h"
 #include "post_to_main_thread.h"
+#include "vaapi_jpeg_decoder.h"
 #include "video_stream.h"
 
 using namespace movit;
@@ -50,10 +51,18 @@
 deque<pair<JPEGID, JPEGFrameView *>> pending_decodes;  // Under cache_mu.
 atomic<size_t> event_counter{0};
 extern QGLWidget *global_share_widget;
 
-// TODO: Decode using VA-API if available.
shared_ptr decode_jpeg(const string &filename) { - shared_ptr frame(new Frame); + shared_ptr frame; + if (vaapi_jpeg_decoding_usable) { + frame = decode_jpeg_vaapi(filename); + if (frame != nullptr) { + return frame; + } + fprintf(stderr, "VA-API hardware decoding failed; falling back to software.\n"); + } + + frame.reset(new Frame); jpeg_decompress_struct dinfo; jpeg_error_mgr jerr; @@ -298,10 +307,10 @@ void JPEGFrameView::initializeGL() std::thread(&jpeg_decoder_thread).detach(); }); - chain.reset(new EffectChain(1280, 720, resource_pool)); - ImageFormat image_format; - image_format.color_space = COLORSPACE_sRGB; - image_format.gamma_curve = GAMMA_sRGB; + ImageFormat inout_format; + inout_format.color_space = COLORSPACE_sRGB; + inout_format.gamma_curve = GAMMA_sRGB; + ycbcr_format.luma_coefficients = YCBCR_REC_709; ycbcr_format.full_range = false; ycbcr_format.num_levels = 256; @@ -311,22 +320,23 @@ void JPEGFrameView::initializeGL() ycbcr_format.cb_y_position = 0.5f; // Irrelevant. ycbcr_format.cr_x_position = 0.0f; ycbcr_format.cr_y_position = 0.5f; - ycbcr_input = (movit::YCbCrInput *)chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720)); - ImageFormat inout_format; - inout_format.color_space = COLORSPACE_sRGB; - inout_format.gamma_curve = GAMMA_sRGB; + // Planar Y'CbCr decoding chain. + planar_chain.reset(new EffectChain(1280, 720, resource_pool)); + ycbcr_planar_input = (movit::YCbCrInput *)planar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR)); + planar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); + planar_chain->set_dither_bits(8); + planar_chain->finalize(); - check_error(); - chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); - check_error(); - chain->set_dither_bits(8); - check_error(); - chain->finalize(); - check_error(); + // Semiplanar Y'CbCr decoding chain (for images coming from VA-API). 
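+	// ("Semiplanar" here means one full-resolution Y' plane plus a single
+	// interleaved CbCr plane -- the layout the VA-API decoder produces when
+	// it splits its UYVY readback -- so this chain takes
+	// YCBCR_INPUT_SPLIT_Y_AND_CBCR instead of YCBCR_INPUT_PLANAR.)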
+ semiplanar_chain.reset(new EffectChain(1280, 720, resource_pool)); + ycbcr_semiplanar_input = (movit::YCbCrInput *)semiplanar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); + semiplanar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); + semiplanar_chain->set_dither_bits(8); + semiplanar_chain->finalize(); overlay_chain.reset(new EffectChain(overlay_base_width, overlay_base_height, resource_pool)); - overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(image_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height)); + overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(inout_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height)); overlay_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); overlay_chain->finalize(); @@ -353,7 +363,11 @@ void JPEGFrameView::paintGL() } check_error(); - chain->render_to_screen(); + if (current_frame->is_semiplanar) { + semiplanar_chain->render_to_screen(); + } else { + planar_chain->render_to_screen(); + } if (overlay_image != nullptr) { if (overlay_input_needs_refresh) { @@ -372,15 +386,26 @@ void JPEGFrameView::setDecodedFrame(std::shared_ptr frame) current_frame = frame; ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x; ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y; - ycbcr_input->change_ycbcr_format(ycbcr_format); - ycbcr_input->set_width(frame->width); - ycbcr_input->set_height(frame->height); - ycbcr_input->set_pixel_data(0, frame->y.get()); - ycbcr_input->set_pixel_data(1, frame->cb.get()); - ycbcr_input->set_pixel_data(2, frame->cr.get()); - ycbcr_input->set_pitch(0, frame->pitch_y); - ycbcr_input->set_pitch(1, frame->pitch_chroma); - ycbcr_input->set_pitch(2, frame->pitch_chroma); + + if (frame->is_semiplanar) { + ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format); + ycbcr_semiplanar_input->set_width(frame->width); + ycbcr_semiplanar_input->set_height(frame->height); + ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get()); + ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get()); + ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y); + ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma); + } else { + ycbcr_planar_input->change_ycbcr_format(ycbcr_format); + ycbcr_planar_input->set_width(frame->width); + ycbcr_planar_input->set_height(frame->height); + ycbcr_planar_input->set_pixel_data(0, frame->y.get()); + ycbcr_planar_input->set_pixel_data(1, frame->cb.get()); + ycbcr_planar_input->set_pixel_data(2, frame->cr.get()); + ycbcr_planar_input->set_pitch(0, frame->pitch_y); + ycbcr_planar_input->set_pitch(1, frame->pitch_chroma); + ycbcr_planar_input->set_pitch(2, frame->pitch_chroma); + } update(); }); } diff --git a/jpeg_frame_view.h b/jpeg_frame_view.h index 8b2c93c..7c41b78 100644 --- a/jpeg_frame_view.h +++ b/jpeg_frame_view.h @@ -12,17 +12,13 @@ #include +#include "jpeg_frame.h" + struct JPEGID { unsigned stream_idx; int64_t pts; bool interpolated; }; -struct Frame { - std::unique_ptr y, cb, cr; - unsigned width, height; - unsigned chroma_subsampling_x, chroma_subsampling_y; - unsigned pitch_y, pitch_chroma; -}; enum CacheMissBehavior { DECODE_IF_NOT_IN_CACHE, RETURN_NULLPTR_IF_NOT_IN_CACHE @@ -60,11 +56,14 @@ private: // The stream index of the latest frame we displayed. 
 	unsigned current_stream_idx = 0;
 
-	std::unique_ptr<movit::EffectChain> chain;
+	std::unique_ptr<movit::EffectChain> planar_chain;
 	std::shared_ptr<Frame> current_frame;  // So that we hold on to the pixels.
-	movit::YCbCrInput *ycbcr_input;
+	movit::YCbCrInput *ycbcr_planar_input;
 	movit::YCbCrFormat ycbcr_format;
 
+	std::unique_ptr<movit::EffectChain> semiplanar_chain;
+	movit::YCbCrInput *ycbcr_semiplanar_input;
+
 	static constexpr int overlay_base_width = 16, overlay_base_height = 16;
 	int overlay_width = overlay_base_width, overlay_height = overlay_base_height;
 	std::unique_ptr<QImage> overlay_image;  // If nullptr, no overlay.
diff --git a/main.cpp b/main.cpp
index dcb6f10..9bb666a 100644
--- a/main.cpp
+++ b/main.cpp
@@ -30,6 +30,7 @@ extern "C" {
 #include "ref_counted_gl_sync.h"
 #include "timebase.h"
 #include "ui_mainwindow.h"
+#include "vaapi_jpeg_decoder.h"
 
 using namespace std;
 using namespace std::chrono;
@@ -97,6 +98,8 @@ int main(int argc, char **argv)
 
 	thread(record_thread_func).detach();
 
+	init_jpeg_vaapi();
+
 	return app.exec();
 }
diff --git a/memcpy_interleaved.cpp b/memcpy_interleaved.cpp
new file mode 100644
index 0000000..9a41cdd
--- /dev/null
+++ b/memcpy_interleaved.cpp
@@ -0,0 +1,136 @@
+#include <algorithm>
+#include <assert.h>
+#include <stdint.h>
+#if __SSE2__
+#include <immintrin.h>
+#endif
+
+using namespace std;
+
+// TODO: Support stride.
+void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+	assert(n % 2 == 0);
+	uint8_t *dptr1 = dest1;
+	uint8_t *dptr2 = dest2;
+
+	for (size_t i = 0; i < n; i += 2) {
+		*dptr1++ = *src++;
+		*dptr2++ = *src++;
+	}
+}
+
+#ifdef __SSE2__
+
+// Returns the number of bytes consumed.
+size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+	const uint8_t *limit = src + n;
+	size_t consumed = 0;
+
+	// Align end to 32 bytes.
+	limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+	if (src >= limit) {
+		return 0;
+	}
+
+	// Process [0,31] bytes, such that start gets aligned to 32 bytes.
+	const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
+	if (aligned_src != src) {
+		size_t n2 = aligned_src - src;
+		memcpy_interleaved_slow(dest1, dest2, src, n2);
+		dest1 += n2 / 2;
+		dest2 += n2 / 2;
+		if (n2 % 2) {
+			swap(dest1, dest2);
+		}
+		src = aligned_src;
+		consumed += n2;
+	}
+
+	// Make the length a multiple of 64.
+	if (((limit - src) % 64) != 0) {
+		limit -= 32;
+	}
+	assert(((limit - src) % 64) == 0);
+
+#if __AVX2__
+	const __m256i *__restrict in = (const __m256i *)src;
+	__m256i *__restrict out1 = (__m256i *)dest1;
+	__m256i *__restrict out2 = (__m256i *)dest2;
+
+	__m256i shuffle_cw = _mm256_set_epi8(
+		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+	while (in < (const __m256i *)limit) {
+		// Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
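+		// Each iteration consumes 64 interleaved source bytes and writes 32
+		// deinterleaved bytes to each destination; the stream loads depend on
+		// the 32-byte alignment of src established above.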
+ __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh + __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp + + data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh + data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop + + data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh + data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop + + __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000); + __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001); + + _mm256_storeu_si256(out1, lo); + _mm256_storeu_si256(out2, hi); + + in += 2; + ++out1; + ++out2; + consumed += 64; + } +#else + const __m128i * __restrict in = (const __m128i *)src; + __m128i * __restrict out1 = (__m128i *)dest1; + __m128i * __restrict out2 = (__m128i *)dest2; + + __m128i mask_lower_byte = _mm_set1_epi16(0x00ff); + while (in < (const __m128i *)limit) { + __m128i data1 = _mm_load_si128(in); + __m128i data2 = _mm_load_si128(in + 1); + __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte); + __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte); + __m128i data1_hi = _mm_srli_epi16(data1, 8); + __m128i data2_hi = _mm_srli_epi16(data2, 8); + __m128i lo = _mm_packus_epi16(data1_lo, data2_lo); + _mm_storeu_si128(out1, lo); + __m128i hi = _mm_packus_epi16(data1_hi, data2_hi); + _mm_storeu_si128(out2, hi); + + in += 2; + ++out1; + ++out2; + consumed += 32; + } +#endif + + return consumed; +} + +#endif // defined(__SSE2__) + +void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) +{ +#ifdef __SSE2__ + size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n); + src += consumed; + dest1 += consumed / 2; + dest2 += consumed / 2; + if (consumed % 2) { + swap(dest1, dest2); + } + n -= consumed; + + if (n > 0) { + memcpy_interleaved_slow(dest1, dest2, src, n); + } +#else + memcpy_interleaved_slow(dest1, dest2, src, n); +#endif +} diff --git a/memcpy_interleaved.h b/memcpy_interleaved.h new file mode 100644 index 0000000..a7f8994 --- /dev/null +++ b/memcpy_interleaved.h @@ -0,0 +1,11 @@ +#ifndef _MEMCPY_INTERLEAVED_H +#define _MEMCPY_INTERLEAVED_H 1 + +#include +#include + +// Copies every other byte from src to dest1 and dest2. +// TODO: Support stride. 
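+//
+// n is the total number of source bytes and must be even; each destination
+// receives n/2 bytes. For example, splitting packed UYVY into interleaved
+// chroma and luma looks like
+//
+//   memcpy_interleaved(cbcr, y, uyvy, width * height * 2);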
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n);
+
+#endif  // !defined(_MEMCPY_INTERLEAVED_H)
diff --git a/vaapi_jpeg_decoder.cpp b/vaapi_jpeg_decoder.cpp
new file mode 100644
index 0000000..b6f9c50
--- /dev/null
+++ b/vaapi_jpeg_decoder.cpp
@@ -0,0 +1,546 @@
+#include "vaapi_jpeg_decoder.h"
+
+#include <X11/Xlib.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <glob.h>
+#include <jpeglib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <va/va.h>
+#include <va/va_drm.h>
+#include <va/va_x11.h>
+
+#include <list>
+#include <mutex>
+#include <string>
+
+#include "jpeg_frame.h"
+#include "memcpy_interleaved.h"
+
+using namespace std;
+
+static unique_ptr<VADisplayWithCleanup> va_dpy;
+static VAConfigID config_id;
+static VAImageFormat uyvy_format;
+bool vaapi_jpeg_decoding_usable = false;
+
+struct VAResources {
+	unsigned width, height;
+	VASurfaceID surface;
+	VAContextID context;
+	VAImage image;
+};
+static list<VAResources> va_resources_freelist;
+static mutex va_resources_mutex;
+
+#define CHECK_VASTATUS(va_status, func) \
+	if (va_status != VA_STATUS_SUCCESS) { \
+		fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+		exit(1); \
+	}
+
+#define CHECK_VASTATUS_RET(va_status, func) \
+	if (va_status != VA_STATUS_SUCCESS) { \
+		fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+		return nullptr; \
+	}
+
+VAResources get_va_resources(unsigned width, unsigned height)
+{
+	{
+		lock_guard<mutex> lock(va_resources_mutex);
+		for (auto it = va_resources_freelist.begin(); it != va_resources_freelist.end(); ++it) {
+			if (it->width == width && it->height == height) {
+				VAResources ret = *it;
+				va_resources_freelist.erase(it);
+				return ret;
+			}
+		}
+	}
+
+	VAResources ret;
+
+	ret.width = width;
+	ret.height = height;
+
+	VAStatus va_status = vaCreateSurfaces(va_dpy->va_dpy, VA_RT_FORMAT_YUV422,
+		width, height,
+		&ret.surface, 1, nullptr, 0);
+	CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+	va_status = vaCreateContext(va_dpy->va_dpy, config_id, width, height, 0, &ret.surface, 1, &ret.context);
+	CHECK_VASTATUS(va_status, "vaCreateContext");
+
+	va_status = vaCreateImage(va_dpy->va_dpy, &uyvy_format, width, height, &ret.image);
+	CHECK_VASTATUS(va_status, "vaCreateImage");
+
+	return ret;
+}
+
+void release_va_resources(VAResources resources)
+{
+	lock_guard<mutex> lock(va_resources_mutex);
+	if (va_resources_freelist.size() > 10) {
+		auto it = va_resources_freelist.end();
+		--it;
+
+		VAStatus va_status = vaDestroyImage(va_dpy->va_dpy, it->image.image_id);
+		CHECK_VASTATUS(va_status, "vaDestroyImage");
+
+		va_status = vaDestroyContext(va_dpy->va_dpy, it->context);
+		CHECK_VASTATUS(va_status, "vaDestroyContext");
+
+		va_status = vaDestroySurfaces(va_dpy->va_dpy, &it->surface, 1);
+		CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+
+		va_resources_freelist.erase(it);
+	}
+
+	va_resources_freelist.push_front(resources);
+}
+
+// RAII wrapper to release VAResources on return (even on error).
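+//
+// Usage sketch:
+//
+//   VAResources resources = get_va_resources(width, height);
+//   ReleaseVAResources release(resources);
+//   ...
+//   release.commit();  // Only if ownership is handed off elsewhere.
+//
+// If commit() is never called, the resources are returned to the freelist
+// when the wrapper goes out of scope.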
+class ReleaseVAResources {
+public:
+	ReleaseVAResources(const VAResources &resources)
+		: resources(resources) {}
+	~ReleaseVAResources()
+	{
+		if (!committed) {
+			release_va_resources(resources);
+		}
+	}
+
+	void commit() { committed = true; }
+
+private:
+	const VAResources &resources;
+	bool committed = false;
+};
+
+VADisplayWithCleanup::~VADisplayWithCleanup()
+{
+	if (va_dpy != nullptr) {
+		vaTerminate(va_dpy);
+	}
+	if (x11_display != nullptr) {
+		XCloseDisplay(x11_display);
+	}
+	if (drm_fd != -1) {
+		close(drm_fd);
+	}
+}
+
+unique_ptr<VADisplayWithCleanup> va_open_display(const string &va_display)
+{
+	if (va_display.empty() || va_display[0] != '/') {  // An X display.
+		Display *x11_display = XOpenDisplay(va_display.empty() ? nullptr : va_display.c_str());
+		if (x11_display == nullptr) {
+			fprintf(stderr, "error: can't connect to X server!\n");
+			return nullptr;
+		}
+
+		unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+		ret->x11_display = x11_display;
+		ret->va_dpy = vaGetDisplay(x11_display);
+		if (ret->va_dpy == nullptr) {
+			return nullptr;
+		}
+		return ret;
+	} else {  // A DRM node on the filesystem (e.g. /dev/dri/renderD128).
+		int drm_fd = open(va_display.c_str(), O_RDWR);
+		if (drm_fd == -1) {
+			perror(va_display.c_str());
+			return nullptr;
+		}
+		unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+		ret->drm_fd = drm_fd;
+		ret->va_dpy = vaGetDisplayDRM(drm_fd);
+		if (ret->va_dpy == nullptr) {
+			return nullptr;
+		}
+		return ret;
+	}
+}
+
+unique_ptr<VADisplayWithCleanup> try_open_va(const string &va_display, string *error)
+{
+	unique_ptr<VADisplayWithCleanup> va_dpy = va_open_display(va_display);
+	if (va_dpy == nullptr) {
+		if (error) *error = "Opening VA display failed";
+		return nullptr;
+	}
+	int major_ver, minor_ver;
+	VAStatus va_status = vaInitialize(va_dpy->va_dpy, &major_ver, &minor_ver);
+	if (va_status != VA_STATUS_SUCCESS) {
+		char buf[256];
+		snprintf(buf, sizeof(buf), "vaInitialize() failed with status %d\n", va_status);
+		if (error != nullptr) *error = buf;
+		return nullptr;
+	}
+
+	int num_entrypoints = vaMaxNumEntrypoints(va_dpy->va_dpy);
+	unique_ptr<VAEntrypoint[]> entrypoints(new VAEntrypoint[num_entrypoints]);
+	if (entrypoints == nullptr) {
+		if (error != nullptr) *error = "Failed to allocate memory for VA entry points";
+		return nullptr;
+	}
+
+	vaQueryConfigEntrypoints(va_dpy->va_dpy, VAProfileJPEGBaseline, entrypoints.get(), &num_entrypoints);
+	for (int slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) {
+		if (entrypoints[slice_entrypoint] != VAEntrypointVLD) {
+			continue;
+		}
+
+		// We found a usable decode, so return it.
+		return va_dpy;
+	}
+
+	if (error != nullptr) *error = "Can't find VAEntrypointVLD for the JPEG profile";
+	return nullptr;
+}
+
+string get_usable_va_display()
+{
+	// Reduce the amount of chatter while probing,
+	// unless the user has specified otherwise.
+	bool need_env_reset = false;
+	if (getenv("LIBVA_MESSAGING_LEVEL") == nullptr) {
+		setenv("LIBVA_MESSAGING_LEVEL", "0", true);
+		need_env_reset = true;
+	}
+
+	// First try the default (i.e., whatever $DISPLAY is set to).
+	unique_ptr<VADisplayWithCleanup> va_dpy = try_open_va("", nullptr);
+	if (va_dpy != nullptr) {
+		if (need_env_reset) {
+			unsetenv("LIBVA_MESSAGING_LEVEL");
+		}
+		return "";
+	}
+
+	fprintf(stderr, "The X11 display did not expose a VA-API JPEG decoder.\n");
+
+	// Try all /dev/dri/render* in turn. TODO: Accept /dev/dri/card*, too?
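+	// Each render node is probed with try_open_va(); the first one that
+	// advertises VAProfileJPEGBaseline with a VLD entry point is chosen.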
+ glob_t g; + int err = glob("/dev/dri/renderD*", 0, nullptr, &g); + if (err != 0) { + fprintf(stderr, "Couldn't list render nodes (%s) when trying to autodetect a replacement.\n", strerror(errno)); + } else { + for (size_t i = 0; i < g.gl_pathc; ++i) { + string path = g.gl_pathv[i]; + va_dpy = try_open_va(path, nullptr); + if (va_dpy != nullptr) { + fprintf(stderr, "Autodetected %s as a suitable replacement; using it.\n", + path.c_str()); + globfree(&g); + if (need_env_reset) { + unsetenv("LIBVA_MESSAGING_LEVEL"); + } + return path; + } + } + } + + fprintf(stderr, "No suitable VA-API JPEG decoders were found in /dev/dri; giving up.\n"); + fprintf(stderr, "Note that if you are using an Intel CPU with an external GPU,\n"); + fprintf(stderr, "you may need to enable the integrated Intel GPU in your BIOS\n"); + fprintf(stderr, "to expose Quick Sync.\n"); + return "none"; +} + +void init_jpeg_vaapi() +{ + string dpy = get_usable_va_display(); + if (dpy == "none") { + return; + } + + va_dpy = try_open_va(dpy, nullptr); + if (va_dpy == nullptr) { + return; + } + + VAConfigAttrib attr = { VAConfigAttribRTFormat, VA_RT_FORMAT_YUV422 }; + + VAStatus va_status = vaCreateConfig(va_dpy->va_dpy, VAProfileJPEGBaseline, VAEntrypointVLD, + &attr, 1, &config_id); + CHECK_VASTATUS(va_status, "vaCreateConfig"); + + int num_formats = vaMaxNumImageFormats(va_dpy->va_dpy); + assert(num_formats > 0); + + unique_ptr formats(new VAImageFormat[num_formats]); + va_status = vaQueryImageFormats(va_dpy->va_dpy, formats.get(), &num_formats); + CHECK_VASTATUS(va_status, "vaQueryImageFormats"); + + bool found = false; + for (int i = 0; i < num_formats; ++i) { + // Seemingly VA_FOURCC_422H is no good for vaGetImage(). :-/ + if (formats[i].fourcc == VA_FOURCC_UYVY) { + memcpy(&uyvy_format, &formats[i], sizeof(VAImageFormat)); + found = true; + break; + } + } + if (!found) { + return; + } + + fprintf(stderr, "VA-API JPEG decoding initialized.\n"); + vaapi_jpeg_decoding_usable = true; +} + +shared_ptr decode_jpeg_vaapi(const string &filename) +{ + jpeg_decompress_struct dinfo; + jpeg_error_mgr jerr; + dinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&dinfo); + + FILE *fp = fopen(filename.c_str(), "rb"); + if (fp == nullptr) { + perror(filename.c_str()); + exit(1); + } + jpeg_stdio_src(&dinfo, fp); + + jpeg_read_header(&dinfo, true); + + // Read the data that comes after the header. VA-API will destuff and all for us. + std::string str((const char *)dinfo.src->next_input_byte, dinfo.src->bytes_in_buffer); + while (!feof(fp)) { + char buf[4096]; + size_t ret = fread(buf, 1, sizeof(buf), fp); + str.append(buf, ret); + } + fclose(fp); + + if (dinfo.num_components != 3) { + fprintf(stderr, "Not a color JPEG. (%d components, Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n", + dinfo.num_components, + dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor, + dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor, + dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor); + return nullptr; + } + if (dinfo.comp_info[0].h_samp_factor != 2 || + dinfo.comp_info[0].v_samp_factor != 2 || + dinfo.comp_info[1].h_samp_factor != 1 || + dinfo.comp_info[1].v_samp_factor != 2 || + dinfo.comp_info[2].h_samp_factor != 1 || + dinfo.comp_info[2].v_samp_factor != 2) { + fprintf(stderr, "Not 4:2:2. 
(Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n", + dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor, + dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor, + dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor); + return nullptr; + } + + // Picture parameters. + VAPictureParameterBufferJPEGBaseline pic_param; + memset(&pic_param, 0, sizeof(pic_param)); + pic_param.picture_width = dinfo.image_width; + pic_param.picture_height = dinfo.image_height; + for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) { + const jpeg_component_info *comp = &dinfo.comp_info[component_idx]; + pic_param.components[component_idx].component_id = comp->component_id; + pic_param.components[component_idx].h_sampling_factor = comp->h_samp_factor; + pic_param.components[component_idx].v_sampling_factor = comp->v_samp_factor; + pic_param.components[component_idx].quantiser_table_selector = comp->quant_tbl_no; + } + pic_param.num_components = dinfo.num_components; + pic_param.color_space = 0; // YUV. + pic_param.rotation = VA_ROTATION_NONE; + + VABufferID pic_param_buffer; + VAStatus va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAPictureParameterBufferType, sizeof(pic_param), 1, &pic_param, &pic_param_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // Quantization matrices. + VAIQMatrixBufferJPEGBaseline iq; + memset(&iq, 0, sizeof(iq)); + + for (int quant_tbl_idx = 0; quant_tbl_idx < min(4, NUM_QUANT_TBLS); ++quant_tbl_idx) { + const JQUANT_TBL *qtbl = dinfo.quant_tbl_ptrs[quant_tbl_idx]; + if (qtbl == nullptr) { + iq.load_quantiser_table[quant_tbl_idx] = 0; + } else { + iq.load_quantiser_table[quant_tbl_idx] = 1; + for (int i = 0; i < 64; ++i) { + if (qtbl->quantval[i] > 255) { + fprintf(stderr, "Baseline JPEG only!\n"); + return nullptr; + } + iq.quantiser_table[quant_tbl_idx][i] = qtbl->quantval[i]; + } + } + } + + VABufferID iq_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAIQMatrixBufferType, sizeof(iq), 1, &iq, &iq_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // Huffman tables (arithmetic is not supported). + VAHuffmanTableBufferJPEGBaseline huff; + memset(&huff, 0, sizeof(huff)); + + for (int huff_tbl_idx = 0; huff_tbl_idx < min(2, NUM_HUFF_TBLS); ++huff_tbl_idx) { + const JHUFF_TBL *ac_hufftbl = dinfo.ac_huff_tbl_ptrs[huff_tbl_idx]; + const JHUFF_TBL *dc_hufftbl = dinfo.dc_huff_tbl_ptrs[huff_tbl_idx]; + if (ac_hufftbl == nullptr) { + assert(dc_hufftbl == nullptr); + huff.load_huffman_table[huff_tbl_idx] = 0; + } else { + assert(dc_hufftbl != nullptr); + huff.load_huffman_table[huff_tbl_idx] = 1; + + for (int i = 0; i < 16; ++i) { + huff.huffman_table[huff_tbl_idx].num_dc_codes[i] = dc_hufftbl->bits[i + 1]; + } + for (int i = 0; i < 12; ++i) { + huff.huffman_table[huff_tbl_idx].dc_values[i] = dc_hufftbl->huffval[i]; + } + for (int i = 0; i < 16; ++i) { + huff.huffman_table[huff_tbl_idx].num_ac_codes[i] = ac_hufftbl->bits[i + 1]; + } + for (int i = 0; i < 162; ++i) { + huff.huffman_table[huff_tbl_idx].ac_values[i] = ac_hufftbl->huffval[i]; + } + } + } + + VABufferID huff_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAHuffmanTableBufferType, sizeof(huff), 1, &huff, &huff_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // Slice parameters (metadata about the slice). 
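+	// The entire entropy-coded segment after the header is passed on as a
+	// single slice; the per-component table selectors come straight from
+	// libjpeg's parsed headers.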
+ VASliceParameterBufferJPEGBaseline parms; + memset(&parms, 0, sizeof(parms)); + parms.slice_data_size = str.size(); + parms.slice_data_offset = 0; + parms.slice_data_flag = VA_SLICE_DATA_FLAG_ALL; + parms.slice_horizontal_position = 0; + parms.slice_vertical_position = 0; + for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) { + const jpeg_component_info *comp = &dinfo.comp_info[component_idx]; + parms.components[component_idx].component_selector = comp->component_id; + parms.components[component_idx].dc_table_selector = comp->dc_tbl_no; + parms.components[component_idx].ac_table_selector = comp->ac_tbl_no; + if (parms.components[component_idx].dc_table_selector > 1 || + parms.components[component_idx].ac_table_selector > 1) { + fprintf(stderr, "Uses too many Huffman tables\n"); + return nullptr; + } + } + parms.num_components = dinfo.num_components; + parms.restart_interval = dinfo.restart_interval; + int horiz_mcus = (dinfo.image_width + (DCTSIZE * 2) - 1) / (DCTSIZE * 2); + int vert_mcus = (dinfo.image_height + DCTSIZE - 1) / DCTSIZE; + parms.num_mcus = horiz_mcus * vert_mcus; + + VABufferID slice_param_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceParameterBufferType, sizeof(parms), 1, &parms, &slice_param_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // The actual data. + VABufferID data_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceDataBufferType, str.size(), 1, &str[0], &data_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + VAResources resources = get_va_resources(dinfo.image_width, dinfo.image_height); + ReleaseVAResources release(resources); + + va_status = vaBeginPicture(va_dpy->va_dpy, resources.context, resources.surface); + CHECK_VASTATUS_RET(va_status, "vaBeginPicture"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &pic_param_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(pic_param)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &iq_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(iq)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &huff_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(huff)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &slice_param_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(slice_param)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &data_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(data)"); + va_status = vaEndPicture(va_dpy->va_dpy, resources.context); + CHECK_VASTATUS_RET(va_status, "vaEndPicture"); + + // vaDeriveImage() works, but the resulting image seems to live in + // uncached memory, which makes copying data out from it very, very slow. + // Thanks to FFmpeg for the observation that you can vaGetImage() the + // surface onto your own image (although then, it can't be planar, which + // is unfortunate for us). 
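+	//
+	// The #if 0 branch below is the vaDeriveImage() variant, kept for
+	// reference; the live branch reads back a packed UYVY image with
+	// vaGetImage() and then splits it on the CPU using memcpy_interleaved().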
+#if 0
+	VAImage image;
+	va_status = vaDeriveImage(va_dpy->va_dpy, surf, &image);
+	CHECK_VASTATUS_RET(va_status, "vaDeriveImage");
+#else
+	va_status = vaSyncSurface(va_dpy->va_dpy, resources.surface);
+	CHECK_VASTATUS_RET(va_status, "vaSyncSurface");
+
+	va_status = vaGetImage(va_dpy->va_dpy, resources.surface, 0, 0, dinfo.image_width, dinfo.image_height, resources.image.image_id);
+	CHECK_VASTATUS_RET(va_status, "vaGetImage");
+#endif
+
+	void *mapped;
+	va_status = vaMapBuffer(va_dpy->va_dpy, resources.image.buf, &mapped);
+	CHECK_VASTATUS_RET(va_status, "vaMapBuffer");
+
+	shared_ptr<Frame> frame(new Frame);
+#if 0
+	// 4:2:2 planar (for vaDeriveImage).
+	frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+	frame->cb.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+	frame->cr.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+	for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+		uint8_t *dptr;
+		size_t width;
+		if (component_idx == 0) {
+			dptr = frame->y.get();
+			width = dinfo.image_width;
+		} else if (component_idx == 1) {
+			dptr = frame->cb.get();
+			width = dinfo.image_width / 2;
+		} else if (component_idx == 2) {
+			dptr = frame->cr.get();
+			width = dinfo.image_width / 2;
+		} else {
+			assert(false);
+		}
+		const uint8_t *sptr = (const uint8_t *)mapped + image.offsets[component_idx];
+		size_t spitch = image.pitches[component_idx];
+		for (size_t y = 0; y < dinfo.image_height; ++y) {
+			memcpy(dptr + y * width, sptr + y * spitch, width);
+		}
+	}
+#else
+	// Convert Y'CbCr to separate Y' and CbCr.
+	frame->is_semiplanar = true;
+	frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+	frame->cbcr.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+	const uint8_t *src = (const uint8_t *)mapped + resources.image.offsets[0];
+	if (resources.image.pitches[0] == dinfo.image_width * 2) {
+		memcpy_interleaved(frame->cbcr.get(), frame->y.get(), src, dinfo.image_width * dinfo.image_height * 2);
+	} else {
+		for (unsigned y = 0; y < dinfo.image_height; ++y) {
+			memcpy_interleaved(frame->cbcr.get() + y * dinfo.image_width, frame->y.get() + y * dinfo.image_width,
+				src + y * resources.image.pitches[0], dinfo.image_width * 2);
+		}
+	}
+#endif
+	frame->width = dinfo.image_width;
+	frame->height = dinfo.image_height;
+	frame->chroma_subsampling_x = 2;
+	frame->chroma_subsampling_y = 1;
+	frame->pitch_y = dinfo.image_width;
+	frame->pitch_chroma = dinfo.image_width / 2;
+
+	va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf);
+	CHECK_VASTATUS_RET(va_status, "vaUnmapBuffer");
+
+	return frame;
+}
diff --git a/vaapi_jpeg_decoder.h b/vaapi_jpeg_decoder.h
new file mode 100644
index 0000000..4ab957e
--- /dev/null
+++ b/vaapi_jpeg_decoder.h
@@ -0,0 +1,27 @@
+#ifndef _VAAPI_JPEG_DECODER_H
+#define _VAAPI_JPEG_DECODER_H 1
+
+#include <X11/Xlib.h>
+#include <va/va.h>
+
+#include <memory>
+#include <string>
+
+struct Frame;
+
+struct VADisplayWithCleanup {
+	~VADisplayWithCleanup();
+
+	VADisplay va_dpy;
+	Display *x11_display = nullptr;
+	int drm_fd = -1;
+};
+std::unique_ptr<VADisplayWithCleanup> va_open_display(const std::string &va_display);  // Can return nullptr on failure.
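+
+// Probes the default X11 display first, then each /dev/dri/renderD* node.
+// Returns "" for the default display, a device path for a render node,
+// or "none" if no usable VA-API JPEG decoder was found.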
+std::string get_usable_va_display(); + +void init_jpeg_vaapi(); +std::shared_ptr decode_jpeg_vaapi(const std::string &filename); + +extern bool vaapi_jpeg_decoding_usable; + +#endif // !defined(_VAAPI_JPEG_DECODER_H) diff --git a/video_stream.cpp b/video_stream.cpp index a733d46..aa93dc7 100644 --- a/video_stream.cpp +++ b/video_stream.cpp @@ -149,11 +149,11 @@ vector encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const VideoStream::VideoStream() { using namespace movit; - // TODO: deduplicate code against JPEGFrameView? - ycbcr_convert_chain.reset(new EffectChain(1280, 720)); - ImageFormat image_format; - image_format.color_space = COLORSPACE_sRGB; - image_format.gamma_curve = GAMMA_sRGB; + + ImageFormat inout_format; + inout_format.color_space = COLORSPACE_sRGB; + inout_format.gamma_curve = GAMMA_sRGB; + ycbcr_format.luma_coefficients = YCBCR_REC_709; ycbcr_format.full_range = true; // JPEG. ycbcr_format.num_levels = 256; @@ -163,28 +163,33 @@ VideoStream::VideoStream() ycbcr_format.cb_y_position = 0.5f; // Irrelevant. ycbcr_format.cr_x_position = 0.0f; ycbcr_format.cr_y_position = 0.5f; - ycbcr_input = (movit::YCbCrInput *)ycbcr_convert_chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720)); YCbCrFormat ycbcr_output_format = ycbcr_format; ycbcr_output_format.chroma_subsampling_x = 1; - ImageFormat inout_format; - inout_format.color_space = COLORSPACE_sRGB; - inout_format.gamma_curve = GAMMA_sRGB; + // TODO: deduplicate code against JPEGFrameView? + ycbcr_planar_convert_chain.reset(new EffectChain(1280, 720)); + ycbcr_planar_input = (movit::YCbCrInput *)ycbcr_planar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR)); - check_error(); + // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the + // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way + // of getting the gray data into a layered texture. + ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_planar_convert_chain->set_dither_bits(8); + ycbcr_planar_convert_chain->finalize(); + + // Same, for semiplanar inputs. + ycbcr_semiplanar_convert_chain.reset(new EffectChain(1280, 720)); + ycbcr_semiplanar_input = (movit::YCbCrInput *)ycbcr_semiplanar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way // of getting the gray data into a layered texture. 
- ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); - check_error(); - ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); - check_error(); - ycbcr_convert_chain->set_dither_bits(8); - check_error(); - ycbcr_convert_chain->finalize(); - check_error(); + ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_semiplanar_convert_chain->set_dither_bits(8); + ycbcr_semiplanar_convert_chain->finalize(); GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots], cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots]; glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, input_tex); @@ -325,16 +330,28 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea shared_ptr frame = decode_jpeg_with_cache(jpeg_id, DECODE_IF_NOT_IN_CACHE, &did_decode); ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x; ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y; - ycbcr_input->change_ycbcr_format(ycbcr_format); - ycbcr_input->set_width(frame->width); - ycbcr_input->set_height(frame->height); - ycbcr_input->set_pixel_data(0, frame->y.get()); - ycbcr_input->set_pixel_data(1, frame->cb.get()); - ycbcr_input->set_pixel_data(2, frame->cr.get()); - ycbcr_input->set_pitch(0, frame->pitch_y); - ycbcr_input->set_pitch(1, frame->pitch_chroma); - ycbcr_input->set_pitch(2, frame->pitch_chroma); - ycbcr_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720); + + if (frame->is_semiplanar) { + ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format); + ycbcr_semiplanar_input->set_width(frame->width); + ycbcr_semiplanar_input->set_height(frame->height); + ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get()); + ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get()); + ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y); + ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma); + ycbcr_semiplanar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720); + } else { + ycbcr_planar_input->change_ycbcr_format(ycbcr_format); + ycbcr_planar_input->set_width(frame->width); + ycbcr_planar_input->set_height(frame->height); + ycbcr_planar_input->set_pixel_data(0, frame->y.get()); + ycbcr_planar_input->set_pixel_data(1, frame->cb.get()); + ycbcr_planar_input->set_pixel_data(2, frame->cr.get()); + ycbcr_planar_input->set_pitch(0, frame->pitch_y); + ycbcr_planar_input->set_pitch(1, frame->pitch_chroma); + ycbcr_planar_input->set_pitch(2, frame->pitch_chroma); + ycbcr_planar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720); + } } glGenerateTextureMipmap(resources.input_tex); @@ -426,6 +443,7 @@ void VideoStream::encode_thread_func() memcpy(frame->cb.get() + 640 * yy, cb + 640 * (719 - yy), 640); memcpy(frame->cr.get() + 640 * yy, cr + 640 * (719 - yy), 640); } + frame->is_semiplanar = false; frame->width = 1280; frame->height = 720; frame->chroma_subsampling_x = 2; diff --git a/video_stream.h b/video_stream.h index 2d1e8f1..b86d271 100644 --- a/video_stream.h +++ b/video_stream.h @@ -82,9 +82,11 @@ private: // Effectively only converts from 4:2:2 to 4:4:4. // TODO: Have a separate version with ResampleEffect, for scaling? 
-	std::unique_ptr<movit::EffectChain> ycbcr_convert_chain;
+	std::unique_ptr<movit::EffectChain> ycbcr_planar_convert_chain;
+	std::unique_ptr<movit::EffectChain> ycbcr_semiplanar_convert_chain;
 
-	movit::YCbCrInput *ycbcr_input;
+	movit::YCbCrInput *ycbcr_planar_input;
+	movit::YCbCrInput *ycbcr_semiplanar_input;
 	movit::YCbCrFormat ycbcr_format;
 
 	// Frame interpolation.
-- 
2.39.2