#include "defs.h"
#include "post_to_main_thread.h"
+#include "vaapi_jpeg_decoder.h"
#include "video_stream.h"
using namespace movit;
atomic<size_t> event_counter{0};
extern QGLWidget *global_share_widget;
-// TODO: Decode using VA-API if available.
shared_ptr<Frame> decode_jpeg(const string &filename)
{
- shared_ptr<Frame> frame(new Frame);
+ shared_ptr<Frame> frame;
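+ // Try hardware decoding first if VA-API was initialized successfully; fall back to libjpeg below.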
+ if (vaapi_jpeg_decoding_usable) {
+ frame = decode_jpeg_vaapi(filename);
+ if (frame != nullptr) {
+ return frame;
+ }
+ fprintf(stderr, "VA-API hardware decoding failed; falling back to software.\n");
+ }
+
+ frame.reset(new Frame);
jpeg_decompress_struct dinfo;
jpeg_error_mgr jerr;
std::thread(&jpeg_decoder_thread).detach();
});
- chain.reset(new EffectChain(1280, 720, resource_pool));
- ImageFormat image_format;
- image_format.color_space = COLORSPACE_sRGB;
- image_format.gamma_curve = GAMMA_sRGB;
+ ImageFormat inout_format;
+ inout_format.color_space = COLORSPACE_sRGB;
+ inout_format.gamma_curve = GAMMA_sRGB;
+
ycbcr_format.luma_coefficients = YCBCR_REC_709;
ycbcr_format.full_range = false;
ycbcr_format.num_levels = 256;
ycbcr_format.cb_y_position = 0.5f; // Irrelevant.
ycbcr_format.cr_x_position = 0.0f;
ycbcr_format.cr_y_position = 0.5f;
- ycbcr_input = (movit::YCbCrInput *)chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720));
- ImageFormat inout_format;
- inout_format.color_space = COLORSPACE_sRGB;
- inout_format.gamma_curve = GAMMA_sRGB;
+ // Planar Y'CbCr decoding chain.
+ planar_chain.reset(new EffectChain(1280, 720, resource_pool));
+ ycbcr_planar_input = (movit::YCbCrInput *)planar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR));
+ planar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
+ planar_chain->set_dither_bits(8);
+ planar_chain->finalize();
- check_error();
- chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
- check_error();
- chain->set_dither_bits(8);
- check_error();
- chain->finalize();
- check_error();
+ // Semiplanar Y'CbCr decoding chain (for images coming from VA-API).
+ semiplanar_chain.reset(new EffectChain(1280, 720, resource_pool));
+ ycbcr_semiplanar_input = (movit::YCbCrInput *)semiplanar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
+ semiplanar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
+ semiplanar_chain->set_dither_bits(8);
+ semiplanar_chain->finalize();
overlay_chain.reset(new EffectChain(overlay_base_width, overlay_base_height, resource_pool));
- overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(image_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height));
+ overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(inout_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height));
overlay_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
overlay_chain->finalize();
}
check_error();
- chain->render_to_screen();
+ if (current_frame->is_semiplanar) {
+ semiplanar_chain->render_to_screen();
+ } else {
+ planar_chain->render_to_screen();
+ }
if (overlay_image != nullptr) {
if (overlay_input_needs_refresh) {
current_frame = frame;
ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x;
ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y;
- ycbcr_input->change_ycbcr_format(ycbcr_format);
- ycbcr_input->set_width(frame->width);
- ycbcr_input->set_height(frame->height);
- ycbcr_input->set_pixel_data(0, frame->y.get());
- ycbcr_input->set_pixel_data(1, frame->cb.get());
- ycbcr_input->set_pixel_data(2, frame->cr.get());
- ycbcr_input->set_pitch(0, frame->pitch_y);
- ycbcr_input->set_pitch(1, frame->pitch_chroma);
- ycbcr_input->set_pitch(2, frame->pitch_chroma);
+
+ if (frame->is_semiplanar) {
+ ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format);
+ ycbcr_semiplanar_input->set_width(frame->width);
+ ycbcr_semiplanar_input->set_height(frame->height);
+ ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get());
+ ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get());
+ ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y);
+ ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma);
+ } else {
+ ycbcr_planar_input->change_ycbcr_format(ycbcr_format);
+ ycbcr_planar_input->set_width(frame->width);
+ ycbcr_planar_input->set_height(frame->height);
+ ycbcr_planar_input->set_pixel_data(0, frame->y.get());
+ ycbcr_planar_input->set_pixel_data(1, frame->cb.get());
+ ycbcr_planar_input->set_pixel_data(2, frame->cr.get());
+ ycbcr_planar_input->set_pitch(0, frame->pitch_y);
+ ycbcr_planar_input->set_pitch(1, frame->pitch_chroma);
+ ycbcr_planar_input->set_pitch(2, frame->pitch_chroma);
+ }
update();
});
}
--- /dev/null
+#include <cstdint>
+#include <algorithm>
+#include <assert.h>
+#if __SSE2__
+#include <immintrin.h>
+#endif
+
+using namespace std;
+
+// TODO: Support stride.
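+// De-interleave n bytes from src: even-indexed bytes go to dest1, odd-indexed bytes to dest2.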
+void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+ assert(n % 2 == 0);
+ uint8_t *dptr1 = dest1;
+ uint8_t *dptr2 = dest2;
+
+ for (size_t i = 0; i < n; i += 2) {
+ *dptr1++ = *src++;
+ *dptr2++ = *src++;
+ }
+}
+
+#ifdef __SSE2__
+
+// Returns the number of bytes consumed.
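+// De-interleaves the aligned middle of the buffer with SSE2 (or AVX2 if available);
+// the unaligned prefix goes through the slow path, and whatever remains at the end
+// is left for the caller.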
+size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+ const uint8_t *limit = src + n;
+ size_t consumed = 0;
+
+ // Align end to 32 bytes.
+ limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+ if (src >= limit) {
+ return 0;
+ }
+
+ // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+ const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
+ if (aligned_src != src) {
+ size_t n2 = aligned_src - src;
+ memcpy_interleaved_slow(dest1, dest2, src, n2);
+ dest1 += n2 / 2;
+ dest2 += n2 / 2;
+ if (n2 % 2) {
+ swap(dest1, dest2);
+ }
+ src = aligned_src;
+ consumed += n2;
+ }
+
+ // Make the length a multiple of 64.
+ if (((limit - src) % 64) != 0) {
+ limit -= 32;
+ }
+ assert(((limit - src) % 64) == 0);
+
+#if __AVX2__
+ const __m256i * __restrict in = (const __m256i *)src;
+ __m256i * __restrict out1 = (__m256i *)dest1;
+ __m256i * __restrict out2 = (__m256i *)dest2;
+
+ __m256i shuffle_cw = _mm256_set_epi8(
+ 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+ 15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+ while (in < (const __m256i *)limit) {
+ // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+ __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh
+ __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp
+
+ data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh
+ data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop
+
+ data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh
+ data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop
+
+ __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+ __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+ _mm256_storeu_si256(out1, lo);
+ _mm256_storeu_si256(out2, hi);
+
+ in += 2;
+ ++out1;
+ ++out2;
+ consumed += 64;
+ }
+#else
+ const __m128i * __restrict in = (const __m128i *)src;
+ __m128i * __restrict out1 = (__m128i *)dest1;
+ __m128i * __restrict out2 = (__m128i *)dest2;
+
+ __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
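+ // For each pair of 16-byte vectors: the mask keeps the even (dest1) bytes in the low half
+ // of each 16-bit lane, the shift moves the odd (dest2) bytes there, and packus then
+ // gathers 16 bytes of each stream into a single store.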
+ while (in < (const __m128i *)limit) {
+ __m128i data1 = _mm_load_si128(in);
+ __m128i data2 = _mm_load_si128(in + 1);
+ __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+ __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+ __m128i data1_hi = _mm_srli_epi16(data1, 8);
+ __m128i data2_hi = _mm_srli_epi16(data2, 8);
+ __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+ _mm_storeu_si128(out1, lo);
+ __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+ _mm_storeu_si128(out2, hi);
+
+ in += 2;
+ ++out1;
+ ++out2;
+ consumed += 32;
+ }
+#endif
+
+ return consumed;
+}
+
+#endif // defined(__SSE2__)
+
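+// Split an interleaved buffer (e.g. UYVY) into its two component streams,
+// using the SIMD fast path where possible.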
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+#ifdef __SSE2__
+ size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n);
+ src += consumed;
+ dest1 += consumed / 2;
+ dest2 += consumed / 2;
+ if (consumed % 2) {
+ swap(dest1, dest2);
+ }
+ n -= consumed;
+
+ if (n > 0) {
+ memcpy_interleaved_slow(dest1, dest2, src, n);
+ }
+#else
+ memcpy_interleaved_slow(dest1, dest2, src, n);
+#endif
+}
--- /dev/null
+#include "vaapi_jpeg_decoder.h"
+
+#include <X11/Xlib.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <glob.h>
+#include <jpeglib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <va/va.h>
+#include <va/va_drm.h>
+#include <va/va_x11.h>
+
+#include <list>
+#include <mutex>
+#include <string>
+
+#include "jpeg_frame.h"
+#include "memcpy_interleaved.h"
+
+using namespace std;
+
+static unique_ptr<VADisplayWithCleanup> va_dpy;
+static VAConfigID config_id;
+static VAImageFormat uyvy_format;
+bool vaapi_jpeg_decoding_usable = false;
+
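+// A decoding surface, context and image for a given resolution,
+// kept on a small freelist so they can be reused between frames.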
+struct VAResources {
+ unsigned width, height;
+ VASurfaceID surface;
+ VAContextID context;
+ VAImage image;
+};
+static list<VAResources> va_resources_freelist;
+static mutex va_resources_mutex;
+
+#define CHECK_VASTATUS(va_status, func) \
+ if (va_status != VA_STATUS_SUCCESS) { \
+ fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+ exit(1); \
+ }
+
+#define CHECK_VASTATUS_RET(va_status, func) \
+ if (va_status != VA_STATUS_SUCCESS) { \
+ fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+ return nullptr; \
+ }
+
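+// Get VA resources for the given resolution, reusing an entry from the freelist
+// if possible; otherwise, allocate a new surface, context and image.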
+VAResources get_va_resources(unsigned width, unsigned height)
+{
+ {
+ lock_guard<mutex> lock(va_resources_mutex);
+ for (auto it = va_resources_freelist.begin(); it != va_resources_freelist.end(); ++it) {
+ if (it->width == width && it->height == height) {
+ VAResources ret = *it;
+ va_resources_freelist.erase(it);
+ return ret;
+ }
+ }
+ }
+
+ VAResources ret;
+
+ ret.width = width;
+ ret.height = height;
+
+ VAStatus va_status = vaCreateSurfaces(va_dpy->va_dpy, VA_RT_FORMAT_YUV422,
+ width, height,
+ &ret.surface, 1, nullptr, 0);
+ CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+ va_status = vaCreateContext(va_dpy->va_dpy, config_id, width, height, 0, &ret.surface, 1, &ret.context);
+ CHECK_VASTATUS(va_status, "vaCreateContext");
+
+ va_status = vaCreateImage(va_dpy->va_dpy, &uyvy_format, width, height, &ret.image);
+ CHECK_VASTATUS(va_status, "vaCreateImage");
+
+ return ret;
+}
+
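+// Put the resources back on the freelist, destroying the oldest entry
+// if the list has grown past ten elements.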
+void release_va_resources(VAResources resources)
+{
+ lock_guard<mutex> lock(va_resources_mutex);
+ if (va_resources_freelist.size() > 10) {
+ auto it = va_resources_freelist.end();
+ --it;
+
+ VAStatus va_status = vaDestroyImage(va_dpy->va_dpy, it->image.image_id);
+ CHECK_VASTATUS(va_status, "vaDestroyImage");
+
+ va_status = vaDestroyContext(va_dpy->va_dpy, it->context);
+ CHECK_VASTATUS(va_status, "vaDestroyContext");
+
+ va_status = vaDestroySurfaces(va_dpy->va_dpy, &it->surface, 1);
+ CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+
+ va_resources_freelist.erase(it);
+ }
+
+ va_resources_freelist.push_front(resources);
+}
+
+// RAII wrapper to release VAResources on return (even on error).
+class ReleaseVAResources {
+public:
+ ReleaseVAResources(const VAResources &resources)
+ : resources(resources) {}
+ ~ReleaseVAResources() {
+ if (!committed) {
+ release_va_resources(resources);
+ }
+ }
+
+ void commit() { committed = true; }
+
+private:
+ const VAResources &resources;
+ bool committed = false;
+};
+
+VADisplayWithCleanup::~VADisplayWithCleanup()
+{
+ if (va_dpy != nullptr) {
+ vaTerminate(va_dpy);
+ }
+ if (x11_display != nullptr) {
+ XCloseDisplay(x11_display);
+ }
+ if (drm_fd != -1) {
+ close(drm_fd);
+ }
+}
+
+unique_ptr<VADisplayWithCleanup> va_open_display(const string &va_display)
+{
+ if (va_display.empty() || va_display[0] != '/') { // An X display.
+ Display *x11_display = XOpenDisplay(va_display.empty() ? nullptr : va_display.c_str());
+ if (x11_display == nullptr) {
+ fprintf(stderr, "error: can't connect to X server!\n");
+ return nullptr;
+ }
+
+ unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+ ret->x11_display = x11_display;
+ ret->va_dpy = vaGetDisplay(x11_display);
+ if (ret->va_dpy == nullptr) {
+ return nullptr;
+ }
+ return ret;
+ } else { // A DRM node on the filesystem (e.g. /dev/dri/renderD128).
+ int drm_fd = open(va_display.c_str(), O_RDWR);
+ if (drm_fd == -1) {
+ perror(va_display.c_str());
+ return nullptr;
+ }
+ unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+ ret->drm_fd = drm_fd;
+ ret->va_dpy = vaGetDisplayDRM(drm_fd);
+ if (ret->va_dpy == nullptr) {
+ return nullptr;
+ }
+ return ret;
+ }
+}
+
+unique_ptr<VADisplayWithCleanup> try_open_va(const string &va_display, string *error)
+{
+ unique_ptr<VADisplayWithCleanup> va_dpy = va_open_display(va_display);
+ if (va_dpy == nullptr) {
+ if (error) *error = "Opening VA display failed";
+ return nullptr;
+ }
+ int major_ver, minor_ver;
+ VAStatus va_status = vaInitialize(va_dpy->va_dpy, &major_ver, &minor_ver);
+ if (va_status != VA_STATUS_SUCCESS) {
+ char buf[256];
+ snprintf(buf, sizeof(buf), "vaInitialize() failed with status %d\n", va_status);
+ if (error != nullptr) *error = buf;
+ return nullptr;
+ }
+
+ int num_entrypoints = vaMaxNumEntrypoints(va_dpy->va_dpy);
+ unique_ptr<VAEntrypoint[]> entrypoints(new VAEntrypoint[num_entrypoints]);
+ if (entrypoints == nullptr) {
+ if (error != nullptr) *error = "Failed to allocate memory for VA entry points";
+ return nullptr;
+ }
+
+ vaQueryConfigEntrypoints(va_dpy->va_dpy, VAProfileJPEGBaseline, entrypoints.get(), &num_entrypoints);
+ for (int slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) {
+ if (entrypoints[slice_entrypoint] != VAEntrypointVLD) {
+ continue;
+ }
+
+ // We found a usable decode entrypoint, so return the display.
+ return va_dpy;
+ }
+
+ if (error != nullptr) *error = "Can't find VAEntrypointVLD for the JPEG profile";
+ return nullptr;
+}
+
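+// Find a display usable for VA-API JPEG decoding: the empty string means the default
+// X11 display, a path means a DRM render node, and "none" means nothing usable was found.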
+string get_usable_va_display()
+{
+ // Reduce the amount of chatter while probing,
+ // unless the user has specified otherwise.
+ bool need_env_reset = false;
+ if (getenv("LIBVA_MESSAGING_LEVEL") == nullptr) {
+ setenv("LIBVA_MESSAGING_LEVEL", "0", true);
+ need_env_reset = true;
+ }
+
+ // First try the default (i.e., whatever $DISPLAY is set to).
+ unique_ptr<VADisplayWithCleanup> va_dpy = try_open_va("", nullptr);
+ if (va_dpy != nullptr) {
+ if (need_env_reset) {
+ unsetenv("LIBVA_MESSAGING_LEVEL");
+ }
+ return "";
+ }
+
+ fprintf(stderr, "The X11 display did not expose a VA-API JPEG decoder.\n");
+
+ // Try all /dev/dri/render* in turn. TODO: Accept /dev/dri/card*, too?
+ glob_t g;
+ int err = glob("/dev/dri/renderD*", 0, nullptr, &g);
+ if (err != 0) {
+ fprintf(stderr, "Couldn't list render nodes (%s) when trying to autodetect a replacement.\n", strerror(errno));
+ } else {
+ for (size_t i = 0; i < g.gl_pathc; ++i) {
+ string path = g.gl_pathv[i];
+ va_dpy = try_open_va(path, nullptr);
+ if (va_dpy != nullptr) {
+ fprintf(stderr, "Autodetected %s as a suitable replacement; using it.\n",
+ path.c_str());
+ globfree(&g);
+ if (need_env_reset) {
+ unsetenv("LIBVA_MESSAGING_LEVEL");
+ }
+ return path;
+ }
+ }
+ }
+
+ fprintf(stderr, "No suitable VA-API JPEG decoders were found in /dev/dri; giving up.\n");
+ fprintf(stderr, "Note that if you are using an Intel CPU with an external GPU,\n");
+ fprintf(stderr, "you may need to enable the integrated Intel GPU in your BIOS\n");
+ fprintf(stderr, "to expose Quick Sync.\n");
+ return "none";
+}
+
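+// Initialize VA-API JPEG decoding (display, decode config and output image format);
+// sets vaapi_jpeg_decoding_usable if everything succeeds.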
+void init_jpeg_vaapi()
+{
+ string dpy = get_usable_va_display();
+ if (dpy == "none") {
+ return;
+ }
+
+ va_dpy = try_open_va(dpy, nullptr);
+ if (va_dpy == nullptr) {
+ return;
+ }
+
+ VAConfigAttrib attr = { VAConfigAttribRTFormat, VA_RT_FORMAT_YUV422 };
+
+ VAStatus va_status = vaCreateConfig(va_dpy->va_dpy, VAProfileJPEGBaseline, VAEntrypointVLD,
+ &attr, 1, &config_id);
+ CHECK_VASTATUS(va_status, "vaCreateConfig");
+
+ int num_formats = vaMaxNumImageFormats(va_dpy->va_dpy);
+ assert(num_formats > 0);
+
+ unique_ptr<VAImageFormat[]> formats(new VAImageFormat[num_formats]);
+ va_status = vaQueryImageFormats(va_dpy->va_dpy, formats.get(), &num_formats);
+ CHECK_VASTATUS(va_status, "vaQueryImageFormats");
+
+ bool found = false;
+ for (int i = 0; i < num_formats; ++i) {
+ // Seemingly VA_FOURCC_422H is no good for vaGetImage(). :-/
+ if (formats[i].fourcc == VA_FOURCC_UYVY) {
+ memcpy(&uyvy_format, &formats[i], sizeof(VAImageFormat));
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ return;
+ }
+
+ fprintf(stderr, "VA-API JPEG decoding initialized.\n");
+ vaapi_jpeg_decoding_usable = true;
+}
+
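+// Decode the given JPEG file via VA-API into a semiplanar Frame (Y' plane plus
+// interleaved CbCr plane). Returns nullptr if the decode fails, so that the caller
+// can fall back to software decoding.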
+shared_ptr<Frame> decode_jpeg_vaapi(const string &filename)
+{
+ jpeg_decompress_struct dinfo;
+ jpeg_error_mgr jerr;
+ dinfo.err = jpeg_std_error(&jerr);
+ jpeg_create_decompress(&dinfo);
+
+ FILE *fp = fopen(filename.c_str(), "rb");
+ if (fp == nullptr) {
+ perror(filename.c_str());
+ exit(1);
+ }
+ jpeg_stdio_src(&dinfo, fp);
+
+ jpeg_read_header(&dinfo, true);
+
+ // Read the data that comes after the header; VA-API will take care of the destuffing for us.
+ std::string str((const char *)dinfo.src->next_input_byte, dinfo.src->bytes_in_buffer);
+ while (!feof(fp)) {
+ char buf[4096];
+ size_t ret = fread(buf, 1, sizeof(buf), fp);
+ str.append(buf, ret);
+ }
+ fclose(fp);
+
+ if (dinfo.num_components != 3) {
+ fprintf(stderr, "Not a color JPEG. (%d components, Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+ dinfo.num_components,
+ dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+ dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+ dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+ return nullptr;
+ }
+ if (dinfo.comp_info[0].h_samp_factor != 2 ||
+ dinfo.comp_info[0].v_samp_factor != 2 ||
+ dinfo.comp_info[1].h_samp_factor != 1 ||
+ dinfo.comp_info[1].v_samp_factor != 2 ||
+ dinfo.comp_info[2].h_samp_factor != 1 ||
+ dinfo.comp_info[2].v_samp_factor != 2) {
+ fprintf(stderr, "Not 4:2:2. (Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+ dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+ dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+ dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+ return nullptr;
+ }
+
+ // Picture parameters.
+ VAPictureParameterBufferJPEGBaseline pic_param;
+ memset(&pic_param, 0, sizeof(pic_param));
+ pic_param.picture_width = dinfo.image_width;
+ pic_param.picture_height = dinfo.image_height;
+ for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+ const jpeg_component_info *comp = &dinfo.comp_info[component_idx];
+ pic_param.components[component_idx].component_id = comp->component_id;
+ pic_param.components[component_idx].h_sampling_factor = comp->h_samp_factor;
+ pic_param.components[component_idx].v_sampling_factor = comp->v_samp_factor;
+ pic_param.components[component_idx].quantiser_table_selector = comp->quant_tbl_no;
+ }
+ pic_param.num_components = dinfo.num_components;
+ pic_param.color_space = 0; // YUV.
+ pic_param.rotation = VA_ROTATION_NONE;
+
+ VABufferID pic_param_buffer;
+ VAStatus va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAPictureParameterBufferType, sizeof(pic_param), 1, &pic_param, &pic_param_buffer);
+ CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+ // Quantization matrices.
+ VAIQMatrixBufferJPEGBaseline iq;
+ memset(&iq, 0, sizeof(iq));
+
+ for (int quant_tbl_idx = 0; quant_tbl_idx < min(4, NUM_QUANT_TBLS); ++quant_tbl_idx) {
+ const JQUANT_TBL *qtbl = dinfo.quant_tbl_ptrs[quant_tbl_idx];
+ if (qtbl == nullptr) {
+ iq.load_quantiser_table[quant_tbl_idx] = 0;
+ } else {
+ iq.load_quantiser_table[quant_tbl_idx] = 1;
+ for (int i = 0; i < 64; ++i) {
+ if (qtbl->quantval[i] > 255) {
+ fprintf(stderr, "Baseline JPEG only!\n");
+ return nullptr;
+ }
+ iq.quantiser_table[quant_tbl_idx][i] = qtbl->quantval[i];
+ }
+ }
+ }
+
+ VABufferID iq_buffer;
+ va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAIQMatrixBufferType, sizeof(iq), 1, &iq, &iq_buffer);
+ CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+ // Huffman tables (arithmetic is not supported).
+ VAHuffmanTableBufferJPEGBaseline huff;
+ memset(&huff, 0, sizeof(huff));
+
+ for (int huff_tbl_idx = 0; huff_tbl_idx < min(2, NUM_HUFF_TBLS); ++huff_tbl_idx) {
+ const JHUFF_TBL *ac_hufftbl = dinfo.ac_huff_tbl_ptrs[huff_tbl_idx];
+ const JHUFF_TBL *dc_hufftbl = dinfo.dc_huff_tbl_ptrs[huff_tbl_idx];
+ if (ac_hufftbl == nullptr) {
+ assert(dc_hufftbl == nullptr);
+ huff.load_huffman_table[huff_tbl_idx] = 0;
+ } else {
+ assert(dc_hufftbl != nullptr);
+ huff.load_huffman_table[huff_tbl_idx] = 1;
+
+ for (int i = 0; i < 16; ++i) {
+ huff.huffman_table[huff_tbl_idx].num_dc_codes[i] = dc_hufftbl->bits[i + 1];
+ }
+ for (int i = 0; i < 12; ++i) {
+ huff.huffman_table[huff_tbl_idx].dc_values[i] = dc_hufftbl->huffval[i];
+ }
+ for (int i = 0; i < 16; ++i) {
+ huff.huffman_table[huff_tbl_idx].num_ac_codes[i] = ac_hufftbl->bits[i + 1];
+ }
+ for (int i = 0; i < 162; ++i) {
+ huff.huffman_table[huff_tbl_idx].ac_values[i] = ac_hufftbl->huffval[i];
+ }
+ }
+ }
+
+ VABufferID huff_buffer;
+ va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAHuffmanTableBufferType, sizeof(huff), 1, &huff, &huff_buffer);
+ CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+ // Slice parameters (metadata about the slice).
+ VASliceParameterBufferJPEGBaseline parms;
+ memset(&parms, 0, sizeof(parms));
+ parms.slice_data_size = str.size();
+ parms.slice_data_offset = 0;
+ parms.slice_data_flag = VA_SLICE_DATA_FLAG_ALL;
+ parms.slice_horizontal_position = 0;
+ parms.slice_vertical_position = 0;
+ for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+ const jpeg_component_info *comp = &dinfo.comp_info[component_idx];
+ parms.components[component_idx].component_selector = comp->component_id;
+ parms.components[component_idx].dc_table_selector = comp->dc_tbl_no;
+ parms.components[component_idx].ac_table_selector = comp->ac_tbl_no;
+ if (parms.components[component_idx].dc_table_selector > 1 ||
+ parms.components[component_idx].ac_table_selector > 1) {
+ fprintf(stderr, "Uses too many Huffman tables\n");
+ return nullptr;
+ }
+ }
+ parms.num_components = dinfo.num_components;
+ parms.restart_interval = dinfo.restart_interval;
+ int horiz_mcus = (dinfo.image_width + (DCTSIZE * 2) - 1) / (DCTSIZE * 2);
+ int vert_mcus = (dinfo.image_height + DCTSIZE - 1) / DCTSIZE;
+ parms.num_mcus = horiz_mcus * vert_mcus;
+
+ VABufferID slice_param_buffer;
+ va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceParameterBufferType, sizeof(parms), 1, &parms, &slice_param_buffer);
+ CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+ // The actual data.
+ VABufferID data_buffer;
+ va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceDataBufferType, str.size(), 1, &str[0], &data_buffer);
+ CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+ VAResources resources = get_va_resources(dinfo.image_width, dinfo.image_height);
+ ReleaseVAResources release(resources);
+
+ va_status = vaBeginPicture(va_dpy->va_dpy, resources.context, resources.surface);
+ CHECK_VASTATUS_RET(va_status, "vaBeginPicture");
+ va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &pic_param_buffer, 1);
+ CHECK_VASTATUS_RET(va_status, "vaRenderPicture(pic_param)");
+ va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &iq_buffer, 1);
+ CHECK_VASTATUS_RET(va_status, "vaRenderPicture(iq)");
+ va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &huff_buffer, 1);
+ CHECK_VASTATUS_RET(va_status, "vaRenderPicture(huff)");
+ va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &slice_param_buffer, 1);
+ CHECK_VASTATUS_RET(va_status, "vaRenderPicture(slice_param)");
+ va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &data_buffer, 1);
+ CHECK_VASTATUS_RET(va_status, "vaRenderPicture(data)");
+ va_status = vaEndPicture(va_dpy->va_dpy, resources.context);
+ CHECK_VASTATUS_RET(va_status, "vaEndPicture");
+
+ // vaDeriveImage() works, but the resulting image seems to live in
+ // uncached memory, which makes copying data out from it very, very slow.
+ // Thanks to FFmpeg for the observation that you can vaGetImage() the
+ // surface onto your own image (although then, it can't be planar, which
+ // is unfortunate for us).
+#if 0
+ VAImage image;
+ va_status = vaDeriveImage(va_dpy->va_dpy, surf, &image);
+ CHECK_VASTATUS_RET(va_status, "vaDeriveImage");
+#else
+ va_status = vaSyncSurface(va_dpy->va_dpy, resources.surface);
+ CHECK_VASTATUS_RET(va_status, "vaSyncSurface");
+
+ va_status = vaGetImage(va_dpy->va_dpy, resources.surface, 0, 0, dinfo.image_width, dinfo.image_height, resources.image.image_id);
+ CHECK_VASTATUS_RET(va_status, "vaGetImage");
+#endif
+
+ void *mapped;
+ va_status = vaMapBuffer(va_dpy->va_dpy, resources.image.buf, &mapped);
+ CHECK_VASTATUS_RET(va_status, "vaMapBuffer");
+
+ shared_ptr<Frame> frame(new Frame);
+#if 0
+ // 4:2:2 planar (for vaDeriveImage).
+ frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+ frame->cb.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+ frame->cr.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+ for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+ uint8_t *dptr;
+ size_t width;
+ if (component_idx == 0) {
+ dptr = frame->y.get();
+ width = dinfo.image_width;
+ } else if (component_idx == 1) {
+ dptr = frame->cb.get();
+ width = dinfo.image_width / 2;
+ } else if (component_idx == 2) {
+ dptr = frame->cr.get();
+ width = dinfo.image_width / 2;
+ } else {
+ assert(false);
+ }
+ const uint8_t *sptr = (const uint8_t *)mapped + image.offsets[component_idx];
+ size_t spitch = image.pitches[component_idx];
+ for (size_t y = 0; y < dinfo.image_height; ++y) {
+ memcpy(dptr + y * width, sptr + y * spitch, width);
+ }
+ }
+#else
+ // Split the interleaved UYVY image into a separate Y' plane and an interleaved CbCr plane (semiplanar).
+ frame->is_semiplanar = true;
+ frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+ frame->cbcr.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+ const uint8_t *src = (const uint8_t *)mapped + resources.image.offsets[0];
+ if (resources.image.pitches[0] == dinfo.image_width * 2) {
+ memcpy_interleaved(frame->cbcr.get(), frame->y.get(), src, dinfo.image_width * dinfo.image_height * 2);
+ } else {
+ for (unsigned y = 0; y < dinfo.image_height; ++y) {
+ memcpy_interleaved(frame->cbcr.get() + y * dinfo.image_width, frame->y.get() + y * dinfo.image_width,
+ src + y * resources.image.pitches[0], dinfo.image_width * 2);
+ }
+ }
+#endif
+ frame->width = dinfo.image_width;
+ frame->height = dinfo.image_height;
+ frame->chroma_subsampling_x = 2;
+ frame->chroma_subsampling_y = 1;
+ frame->pitch_y = dinfo.image_width;
+ frame->pitch_chroma = dinfo.image_width / 2;
+
+ va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf);
+ CHECK_VASTATUS_RET(va_status, "vaUnmapBuffer");
+
+ return frame;
+}
VideoStream::VideoStream()
{
using namespace movit;
- // TODO: deduplicate code against JPEGFrameView?
- ycbcr_convert_chain.reset(new EffectChain(1280, 720));
- ImageFormat image_format;
- image_format.color_space = COLORSPACE_sRGB;
- image_format.gamma_curve = GAMMA_sRGB;
+
+ ImageFormat inout_format;
+ inout_format.color_space = COLORSPACE_sRGB;
+ inout_format.gamma_curve = GAMMA_sRGB;
+
ycbcr_format.luma_coefficients = YCBCR_REC_709;
ycbcr_format.full_range = true; // JPEG.
ycbcr_format.num_levels = 256;
ycbcr_format.cb_y_position = 0.5f; // Irrelevant.
ycbcr_format.cr_x_position = 0.0f;
ycbcr_format.cr_y_position = 0.5f;
- ycbcr_input = (movit::YCbCrInput *)ycbcr_convert_chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720));
YCbCrFormat ycbcr_output_format = ycbcr_format;
ycbcr_output_format.chroma_subsampling_x = 1;
- ImageFormat inout_format;
- inout_format.color_space = COLORSPACE_sRGB;
- inout_format.gamma_curve = GAMMA_sRGB;
+ // TODO: deduplicate code against JPEGFrameView?
+ ycbcr_planar_convert_chain.reset(new EffectChain(1280, 720));
+ ycbcr_planar_input = (movit::YCbCrInput *)ycbcr_planar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR));
- check_error();
+ // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the
+ // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way
+ // of getting the gray data into a layered texture.
+ ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+ ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+ ycbcr_planar_convert_chain->set_dither_bits(8);
+ ycbcr_planar_convert_chain->finalize();
+
+ // Same, for semiplanar inputs.
+ ycbcr_semiplanar_convert_chain.reset(new EffectChain(1280, 720));
+ ycbcr_semiplanar_input = (movit::YCbCrInput *)ycbcr_semiplanar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
// One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the
// Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way
// of getting the gray data into a layered texture.
- ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
- check_error();
- ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
- check_error();
- ycbcr_convert_chain->set_dither_bits(8);
- check_error();
- ycbcr_convert_chain->finalize();
- check_error();
+ ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+ ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+ ycbcr_semiplanar_convert_chain->set_dither_bits(8);
+ ycbcr_semiplanar_convert_chain->finalize();
GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots], cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots];
glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, input_tex);
shared_ptr<Frame> frame = decode_jpeg_with_cache(jpeg_id, DECODE_IF_NOT_IN_CACHE, &did_decode);
ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x;
ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y;
- ycbcr_input->change_ycbcr_format(ycbcr_format);
- ycbcr_input->set_width(frame->width);
- ycbcr_input->set_height(frame->height);
- ycbcr_input->set_pixel_data(0, frame->y.get());
- ycbcr_input->set_pixel_data(1, frame->cb.get());
- ycbcr_input->set_pixel_data(2, frame->cr.get());
- ycbcr_input->set_pitch(0, frame->pitch_y);
- ycbcr_input->set_pitch(1, frame->pitch_chroma);
- ycbcr_input->set_pitch(2, frame->pitch_chroma);
- ycbcr_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720);
+
+ if (frame->is_semiplanar) {
+ ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format);
+ ycbcr_semiplanar_input->set_width(frame->width);
+ ycbcr_semiplanar_input->set_height(frame->height);
+ ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get());
+ ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get());
+ ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y);
+ ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma);
+ ycbcr_semiplanar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720);
+ } else {
+ ycbcr_planar_input->change_ycbcr_format(ycbcr_format);
+ ycbcr_planar_input->set_width(frame->width);
+ ycbcr_planar_input->set_height(frame->height);
+ ycbcr_planar_input->set_pixel_data(0, frame->y.get());
+ ycbcr_planar_input->set_pixel_data(1, frame->cb.get());
+ ycbcr_planar_input->set_pixel_data(2, frame->cr.get());
+ ycbcr_planar_input->set_pitch(0, frame->pitch_y);
+ ycbcr_planar_input->set_pitch(1, frame->pitch_chroma);
+ ycbcr_planar_input->set_pitch(2, frame->pitch_chroma);
+ ycbcr_planar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720);
+ }
}
glGenerateTextureMipmap(resources.input_tex);
memcpy(frame->cb.get() + 640 * yy, cb + 640 * (719 - yy), 640);
memcpy(frame->cr.get() + 640 * yy, cr + 640 * (719 - yy), 640);
}
+ frame->is_semiplanar = false;
frame->width = 1280;
frame->height = 720;
frame->chroma_subsampling_x = 2;