From cf158af1c2219bd9f5a9bc531fb3c1133d327b45 Mon Sep 17 00:00:00 2001
From: "Steinar H. Gunderson"
Date: Sat, 29 Sep 2018 16:37:14 +0200
Subject: [PATCH] Decode 4:2:2 JPEGs via VA-API if available.

---
 Makefile               |   3 +-
 jpeg_frame.h           |  16 ++
 jpeg_frame_view.cpp    |  81 +++---
 jpeg_frame_view.h      |  15 +-
 main.cpp               |   3 +
 memcpy_interleaved.cpp | 136 ++++++++++
 memcpy_interleaved.h   |  11 +
 vaapi_jpeg_decoder.cpp | 546 +++++++++++++++++++++++++++++++++++++++++
 vaapi_jpeg_decoder.h   |  27 ++
 video_stream.cpp       |  74 +++---
 video_stream.h         |   6 +-
 11 files changed, 851 insertions(+), 67 deletions(-)
 create mode 100644 jpeg_frame.h
 create mode 100644 memcpy_interleaved.cpp
 create mode 100644 memcpy_interleaved.h
 create mode 100644 vaapi_jpeg_decoder.cpp
 create mode 100644 vaapi_jpeg_decoder.h

diff --git a/Makefile b/Makefile
index a0075c5..a15bb6b 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ PKG_MODULES := Qt5Core Qt5Gui Qt5Widgets Qt5OpenGLExtensions Qt5OpenGL Qt5PrintS
 CXXFLAGS ?= -O2 -g -Wall  # Will be overridden by environment.
 CXXFLAGS += -fPIC $(shell pkg-config --cflags $(PKG_MODULES)) -DMOVIT_SHADER_DIR=\"$(shell pkg-config --variable=shaderdir movit)\" -pthread
 
-LDLIBS=$(shell pkg-config --libs $(PKG_MODULES)) -pthread -lavformat -lavcodec -lavutil -lswscale -lGL
+LDLIBS=$(shell pkg-config --libs $(PKG_MODULES)) -pthread -lavformat -lavcodec -lavutil -lswscale -lGL -lva -lva-drm -lva-x11 -lX11
 
 # Qt objects
 OBJS_WITH_MOC = mainwindow.o jpeg_frame_view.o clip_list.o
@@ -14,6 +14,7 @@ OBJS += $(OBJS_WITH_MOC:.o=.moc.o)
 OBJS += flow.o gpu_timers.o
 
 OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o chroma_subsampler.o
+OBJS += vaapi_jpeg_decoder.o memcpy_interleaved.o
 
 %.o: %.cpp
 	$(CXX) -MMD -MP $(CPPFLAGS) $(CXXFLAGS) -o $@ -c $<
diff --git a/jpeg_frame.h b/jpeg_frame.h
new file mode 100644
index 0000000..eb73e13
--- /dev/null
+++ b/jpeg_frame.h
@@ -0,0 +1,16 @@
+#ifndef _JPEG_FRAME_H
+#define _JPEG_FRAME_H 1
+
+#include <memory>
+
+struct Frame {
+	bool is_semiplanar = false;
+	std::unique_ptr<uint8_t[]> y;
+	std::unique_ptr<uint8_t[]> cb, cr;  // For planar.
+	std::unique_ptr<uint8_t[]> cbcr;  // For semiplanar.
+	unsigned width, height;
+	unsigned chroma_subsampling_x, chroma_subsampling_y;
+	unsigned pitch_y, pitch_chroma;
+};
+
+#endif  // !defined(_JPEG_FRAME_H)
diff --git a/jpeg_frame_view.cpp b/jpeg_frame_view.cpp
index 73030ff..ef1cded 100644
--- a/jpeg_frame_view.cpp
+++ b/jpeg_frame_view.cpp
@@ -20,6 +20,7 @@
 #include "defs.h"
 #include "post_to_main_thread.h"
+#include "vaapi_jpeg_decoder.h"
 #include "video_stream.h"
 
 using namespace movit;
@@ -50,10 +51,18 @@
 deque<pair<JPEGID, JPEGFrameView *>> pending_decodes;  // Under cache_mu.
 atomic<size_t> event_counter{0};
 extern QGLWidget *global_share_widget;
 
-// TODO: Decode using VA-API if available.
shared_ptr decode_jpeg(const string &filename) { - shared_ptr frame(new Frame); + shared_ptr frame; + if (vaapi_jpeg_decoding_usable) { + frame = decode_jpeg_vaapi(filename); + if (frame != nullptr) { + return frame; + } + fprintf(stderr, "VA-API hardware decoding failed; falling back to software.\n"); + } + + frame.reset(new Frame); jpeg_decompress_struct dinfo; jpeg_error_mgr jerr; @@ -298,10 +307,10 @@ void JPEGFrameView::initializeGL() std::thread(&jpeg_decoder_thread).detach(); }); - chain.reset(new EffectChain(1280, 720, resource_pool)); - ImageFormat image_format; - image_format.color_space = COLORSPACE_sRGB; - image_format.gamma_curve = GAMMA_sRGB; + ImageFormat inout_format; + inout_format.color_space = COLORSPACE_sRGB; + inout_format.gamma_curve = GAMMA_sRGB; + ycbcr_format.luma_coefficients = YCBCR_REC_709; ycbcr_format.full_range = false; ycbcr_format.num_levels = 256; @@ -311,22 +320,23 @@ void JPEGFrameView::initializeGL() ycbcr_format.cb_y_position = 0.5f; // Irrelevant. ycbcr_format.cr_x_position = 0.0f; ycbcr_format.cr_y_position = 0.5f; - ycbcr_input = (movit::YCbCrInput *)chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720)); - ImageFormat inout_format; - inout_format.color_space = COLORSPACE_sRGB; - inout_format.gamma_curve = GAMMA_sRGB; + // Planar Y'CbCr decoding chain. + planar_chain.reset(new EffectChain(1280, 720, resource_pool)); + ycbcr_planar_input = (movit::YCbCrInput *)planar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR)); + planar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); + planar_chain->set_dither_bits(8); + planar_chain->finalize(); - check_error(); - chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); - check_error(); - chain->set_dither_bits(8); - check_error(); - chain->finalize(); - check_error(); + // Semiplanar Y'CbCr decoding chain (for images coming from VA-API). 
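+	// ("Semiplanar" here means one full-resolution Y' plane plus a single
+	// interleaved CbCr plane -- the layout the VA-API decoder produces when
+	// it splits its UYVY readback -- so this chain takes
+	// YCBCR_INPUT_SPLIT_Y_AND_CBCR instead of YCBCR_INPUT_PLANAR.)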
+ semiplanar_chain.reset(new EffectChain(1280, 720, resource_pool)); + ycbcr_semiplanar_input = (movit::YCbCrInput *)semiplanar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); + semiplanar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); + semiplanar_chain->set_dither_bits(8); + semiplanar_chain->finalize(); overlay_chain.reset(new EffectChain(overlay_base_width, overlay_base_height, resource_pool)); - overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(image_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height)); + overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(inout_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height)); overlay_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED); overlay_chain->finalize(); @@ -353,7 +363,11 @@ void JPEGFrameView::paintGL() } check_error(); - chain->render_to_screen(); + if (current_frame->is_semiplanar) { + semiplanar_chain->render_to_screen(); + } else { + planar_chain->render_to_screen(); + } if (overlay_image != nullptr) { if (overlay_input_needs_refresh) { @@ -372,15 +386,26 @@ void JPEGFrameView::setDecodedFrame(std::shared_ptr frame) current_frame = frame; ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x; ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y; - ycbcr_input->change_ycbcr_format(ycbcr_format); - ycbcr_input->set_width(frame->width); - ycbcr_input->set_height(frame->height); - ycbcr_input->set_pixel_data(0, frame->y.get()); - ycbcr_input->set_pixel_data(1, frame->cb.get()); - ycbcr_input->set_pixel_data(2, frame->cr.get()); - ycbcr_input->set_pitch(0, frame->pitch_y); - ycbcr_input->set_pitch(1, frame->pitch_chroma); - ycbcr_input->set_pitch(2, frame->pitch_chroma); + + if (frame->is_semiplanar) { + ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format); + ycbcr_semiplanar_input->set_width(frame->width); + ycbcr_semiplanar_input->set_height(frame->height); + ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get()); + ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get()); + ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y); + ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma); + } else { + ycbcr_planar_input->change_ycbcr_format(ycbcr_format); + ycbcr_planar_input->set_width(frame->width); + ycbcr_planar_input->set_height(frame->height); + ycbcr_planar_input->set_pixel_data(0, frame->y.get()); + ycbcr_planar_input->set_pixel_data(1, frame->cb.get()); + ycbcr_planar_input->set_pixel_data(2, frame->cr.get()); + ycbcr_planar_input->set_pitch(0, frame->pitch_y); + ycbcr_planar_input->set_pitch(1, frame->pitch_chroma); + ycbcr_planar_input->set_pitch(2, frame->pitch_chroma); + } update(); }); } diff --git a/jpeg_frame_view.h b/jpeg_frame_view.h index 8b2c93c..7c41b78 100644 --- a/jpeg_frame_view.h +++ b/jpeg_frame_view.h @@ -12,17 +12,13 @@ #include +#include "jpeg_frame.h" + struct JPEGID { unsigned stream_idx; int64_t pts; bool interpolated; }; -struct Frame { - std::unique_ptr y, cb, cr; - unsigned width, height; - unsigned chroma_subsampling_x, chroma_subsampling_y; - unsigned pitch_y, pitch_chroma; -}; enum CacheMissBehavior { DECODE_IF_NOT_IN_CACHE, RETURN_NULLPTR_IF_NOT_IN_CACHE @@ -60,11 +56,14 @@ private: // The stream index of the latest frame we displayed. 
 	unsigned current_stream_idx = 0;
 
-	std::unique_ptr<movit::EffectChain> chain;
+	std::unique_ptr<movit::EffectChain> planar_chain;
 	std::shared_ptr<Frame> current_frame;  // So that we hold on to the pixels.
-	movit::YCbCrInput *ycbcr_input;
+	movit::YCbCrInput *ycbcr_planar_input;
 	movit::YCbCrFormat ycbcr_format;
 
+	std::unique_ptr<movit::EffectChain> semiplanar_chain;
+	movit::YCbCrInput *ycbcr_semiplanar_input;
+
 	static constexpr int overlay_base_width = 16, overlay_base_height = 16;
 	int overlay_width = overlay_base_width, overlay_height = overlay_base_height;
 	std::unique_ptr<QImage> overlay_image;  // If nullptr, no overlay.
diff --git a/main.cpp b/main.cpp
index dcb6f10..9bb666a 100644
--- a/main.cpp
+++ b/main.cpp
@@ -30,6 +30,7 @@ extern "C" {
 #include "ref_counted_gl_sync.h"
 #include "timebase.h"
 #include "ui_mainwindow.h"
+#include "vaapi_jpeg_decoder.h"
 
 using namespace std;
 using namespace std::chrono;
@@ -97,6 +98,8 @@ int main(int argc, char **argv)
 
 	thread(record_thread_func).detach();
 
+	init_jpeg_vaapi();
+
 	return app.exec();
 }
diff --git a/memcpy_interleaved.cpp b/memcpy_interleaved.cpp
new file mode 100644
index 0000000..9a41cdd
--- /dev/null
+++ b/memcpy_interleaved.cpp
@@ -0,0 +1,136 @@
+#include <algorithm>
+#include <assert.h>
+#include <stdint.h>
+#if __SSE2__
+#include <immintrin.h>
+#endif
+
+using namespace std;
+
+// TODO: Support stride.
+void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+	assert(n % 2 == 0);
+	uint8_t *dptr1 = dest1;
+	uint8_t *dptr2 = dest2;
+
+	for (size_t i = 0; i < n; i += 2) {
+		*dptr1++ = *src++;
+		*dptr2++ = *src++;
+	}
+}
+
+#ifdef __SSE2__
+
+// Returns the number of bytes consumed.
+size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+	const uint8_t *limit = src + n;
+	size_t consumed = 0;
+
+	// Align end to 32 bytes.
+	limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+	if (src >= limit) {
+		return 0;
+	}
+
+	// Process [0,31] bytes, such that start gets aligned to 32 bytes.
+	const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
+	if (aligned_src != src) {
+		size_t n2 = aligned_src - src;
+		memcpy_interleaved_slow(dest1, dest2, src, n2);
+		dest1 += n2 / 2;
+		dest2 += n2 / 2;
+		if (n2 % 2) {
+			swap(dest1, dest2);
+		}
+		src = aligned_src;
+		consumed += n2;
+	}
+
+	// Make the length a multiple of 64.
+	if (((limit - src) % 64) != 0) {
+		limit -= 32;
+	}
+	assert(((limit - src) % 64) == 0);
+
+#if __AVX2__
+	const __m256i *__restrict in = (const __m256i *)src;
+	__m256i *__restrict out1 = (__m256i *)dest1;
+	__m256i *__restrict out2 = (__m256i *)dest2;
+
+	__m256i shuffle_cw = _mm256_set_epi8(
+		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+		15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+	while (in < (const __m256i *)limit) {
+		// Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
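+		// Each iteration consumes 64 interleaved source bytes and writes 32
+		// deinterleaved bytes to each destination; the stream loads depend on
+		// the 32-byte alignment of src established above.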
+ __m256i data1 = _mm256_stream_load_si256(in); // AaBbCcDd EeFfGgHh + __m256i data2 = _mm256_stream_load_si256(in + 1); // IiJjKkLl MmNnOoPp + + data1 = _mm256_shuffle_epi8(data1, shuffle_cw); // ABCDabcd EFGHefgh + data2 = _mm256_shuffle_epi8(data2, shuffle_cw); // IJKLijkl MNOPmnop + + data1 = _mm256_permute4x64_epi64(data1, 0b11011000); // ABCDEFGH abcdefgh + data2 = _mm256_permute4x64_epi64(data2, 0b11011000); // IJKLMNOP ijklmnop + + __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000); + __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001); + + _mm256_storeu_si256(out1, lo); + _mm256_storeu_si256(out2, hi); + + in += 2; + ++out1; + ++out2; + consumed += 64; + } +#else + const __m128i * __restrict in = (const __m128i *)src; + __m128i * __restrict out1 = (__m128i *)dest1; + __m128i * __restrict out2 = (__m128i *)dest2; + + __m128i mask_lower_byte = _mm_set1_epi16(0x00ff); + while (in < (const __m128i *)limit) { + __m128i data1 = _mm_load_si128(in); + __m128i data2 = _mm_load_si128(in + 1); + __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte); + __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte); + __m128i data1_hi = _mm_srli_epi16(data1, 8); + __m128i data2_hi = _mm_srli_epi16(data2, 8); + __m128i lo = _mm_packus_epi16(data1_lo, data2_lo); + _mm_storeu_si128(out1, lo); + __m128i hi = _mm_packus_epi16(data1_hi, data2_hi); + _mm_storeu_si128(out2, hi); + + in += 2; + ++out1; + ++out2; + consumed += 32; + } +#endif + + return consumed; +} + +#endif // defined(__SSE2__) + +void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n) +{ +#ifdef __SSE2__ + size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n); + src += consumed; + dest1 += consumed / 2; + dest2 += consumed / 2; + if (consumed % 2) { + swap(dest1, dest2); + } + n -= consumed; + + if (n > 0) { + memcpy_interleaved_slow(dest1, dest2, src, n); + } +#else + memcpy_interleaved_slow(dest1, dest2, src, n); +#endif +} diff --git a/memcpy_interleaved.h b/memcpy_interleaved.h new file mode 100644 index 0000000..a7f8994 --- /dev/null +++ b/memcpy_interleaved.h @@ -0,0 +1,11 @@ +#ifndef _MEMCPY_INTERLEAVED_H +#define _MEMCPY_INTERLEAVED_H 1 + +#include +#include + +// Copies every other byte from src to dest1 and dest2. +// TODO: Support stride. 
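+//
+// n is the total number of source bytes and must be even; each destination
+// receives n/2 bytes. For example, splitting packed UYVY into interleaved
+// chroma and luma looks like
+//
+//   memcpy_interleaved(cbcr, y, uyvy, width * height * 2);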
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n);
+
+#endif  // !defined(_MEMCPY_INTERLEAVED_H)
diff --git a/vaapi_jpeg_decoder.cpp b/vaapi_jpeg_decoder.cpp
new file mode 100644
index 0000000..b6f9c50
--- /dev/null
+++ b/vaapi_jpeg_decoder.cpp
@@ -0,0 +1,546 @@
+#include "vaapi_jpeg_decoder.h"
+
+#include <X11/Xlib.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <glob.h>
+#include <jpeglib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <va/va.h>
+#include <va/va_drm.h>
+#include <va/va_x11.h>
+
+#include <list>
+#include <mutex>
+#include <string>
+
+#include "jpeg_frame.h"
+#include "memcpy_interleaved.h"
+
+using namespace std;
+
+static unique_ptr<VADisplayWithCleanup> va_dpy;
+static VAConfigID config_id;
+static VAImageFormat uyvy_format;
+bool vaapi_jpeg_decoding_usable = false;
+
+struct VAResources {
+	unsigned width, height;
+	VASurfaceID surface;
+	VAContextID context;
+	VAImage image;
+};
+static list<VAResources> va_resources_freelist;
+static mutex va_resources_mutex;
+
+#define CHECK_VASTATUS(va_status, func) \
+	if (va_status != VA_STATUS_SUCCESS) { \
+		fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+		exit(1); \
+	}
+
+#define CHECK_VASTATUS_RET(va_status, func) \
+	if (va_status != VA_STATUS_SUCCESS) { \
+		fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+		return nullptr; \
+	}
+
+VAResources get_va_resources(unsigned width, unsigned height)
+{
+	{
+		lock_guard<mutex> lock(va_resources_mutex);
+		for (auto it = va_resources_freelist.begin(); it != va_resources_freelist.end(); ++it) {
+			if (it->width == width && it->height == height) {
+				VAResources ret = *it;
+				va_resources_freelist.erase(it);
+				return ret;
+			}
+		}
+	}
+
+	VAResources ret;
+
+	ret.width = width;
+	ret.height = height;
+
+	VAStatus va_status = vaCreateSurfaces(va_dpy->va_dpy, VA_RT_FORMAT_YUV422,
+		width, height,
+		&ret.surface, 1, nullptr, 0);
+	CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+	va_status = vaCreateContext(va_dpy->va_dpy, config_id, width, height, 0, &ret.surface, 1, &ret.context);
+	CHECK_VASTATUS(va_status, "vaCreateContext");
+
+	va_status = vaCreateImage(va_dpy->va_dpy, &uyvy_format, width, height, &ret.image);
+	CHECK_VASTATUS(va_status, "vaCreateImage");
+
+	return ret;
+}
+
+void release_va_resources(VAResources resources)
+{
+	lock_guard<mutex> lock(va_resources_mutex);
+	if (va_resources_freelist.size() > 10) {
+		auto it = va_resources_freelist.end();
+		--it;
+
+		VAStatus va_status = vaDestroyImage(va_dpy->va_dpy, it->image.image_id);
+		CHECK_VASTATUS(va_status, "vaDestroyImage");
+
+		va_status = vaDestroyContext(va_dpy->va_dpy, it->context);
+		CHECK_VASTATUS(va_status, "vaDestroyContext");
+
+		va_status = vaDestroySurfaces(va_dpy->va_dpy, &it->surface, 1);
+		CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+
+		va_resources_freelist.erase(it);
+	}
+
+	va_resources_freelist.push_front(resources);
+}
+
+// RAII wrapper to release VAResources on return (even on error).
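+//
+// Usage sketch:
+//
+//   VAResources resources = get_va_resources(width, height);
+//   ReleaseVAResources release(resources);
+//   ...
+//   release.commit();  // Only if ownership is handed off elsewhere.
+//
+// If commit() is never called, the resources are returned to the freelist
+// when the wrapper goes out of scope.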
+class ReleaseVAResources {
+public:
+	ReleaseVAResources(const VAResources &resources)
+		: resources(resources) {}
+	~ReleaseVAResources()
+	{
+		if (!committed) {
+			release_va_resources(resources);
+		}
+	}
+
+	void commit() { committed = true; }
+
+private:
+	const VAResources &resources;
+	bool committed = false;
+};
+
+VADisplayWithCleanup::~VADisplayWithCleanup()
+{
+	if (va_dpy != nullptr) {
+		vaTerminate(va_dpy);
+	}
+	if (x11_display != nullptr) {
+		XCloseDisplay(x11_display);
+	}
+	if (drm_fd != -1) {
+		close(drm_fd);
+	}
+}
+
+unique_ptr<VADisplayWithCleanup> va_open_display(const string &va_display)
+{
+	if (va_display.empty() || va_display[0] != '/') {  // An X display.
+		Display *x11_display = XOpenDisplay(va_display.empty() ? nullptr : va_display.c_str());
+		if (x11_display == nullptr) {
+			fprintf(stderr, "error: can't connect to X server!\n");
+			return nullptr;
+		}
+
+		unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+		ret->x11_display = x11_display;
+		ret->va_dpy = vaGetDisplay(x11_display);
+		if (ret->va_dpy == nullptr) {
+			return nullptr;
+		}
+		return ret;
+	} else {  // A DRM node on the filesystem (e.g. /dev/dri/renderD128).
+		int drm_fd = open(va_display.c_str(), O_RDWR);
+		if (drm_fd == -1) {
+			perror(va_display.c_str());
+			return nullptr;
+		}
+		unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+		ret->drm_fd = drm_fd;
+		ret->va_dpy = vaGetDisplayDRM(drm_fd);
+		if (ret->va_dpy == nullptr) {
+			return nullptr;
+		}
+		return ret;
+	}
+}
+
+unique_ptr<VADisplayWithCleanup> try_open_va(const string &va_display, string *error)
+{
+	unique_ptr<VADisplayWithCleanup> va_dpy = va_open_display(va_display);
+	if (va_dpy == nullptr) {
+		if (error) *error = "Opening VA display failed";
+		return nullptr;
+	}
+	int major_ver, minor_ver;
+	VAStatus va_status = vaInitialize(va_dpy->va_dpy, &major_ver, &minor_ver);
+	if (va_status != VA_STATUS_SUCCESS) {
+		char buf[256];
+		snprintf(buf, sizeof(buf), "vaInitialize() failed with status %d\n", va_status);
+		if (error != nullptr) *error = buf;
+		return nullptr;
+	}
+
+	int num_entrypoints = vaMaxNumEntrypoints(va_dpy->va_dpy);
+	unique_ptr<VAEntrypoint[]> entrypoints(new VAEntrypoint[num_entrypoints]);
+	if (entrypoints == nullptr) {
+		if (error != nullptr) *error = "Failed to allocate memory for VA entry points";
+		return nullptr;
+	}
+
+	vaQueryConfigEntrypoints(va_dpy->va_dpy, VAProfileJPEGBaseline, entrypoints.get(), &num_entrypoints);
+	for (int slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) {
+		if (entrypoints[slice_entrypoint] != VAEntrypointVLD) {
+			continue;
+		}
+
+		// We found a usable decode, so return it.
+		return va_dpy;
+	}
+
+	if (error != nullptr) *error = "Can't find VAEntrypointVLD for the JPEG profile";
+	return nullptr;
+}
+
+string get_usable_va_display()
+{
+	// Reduce the amount of chatter while probing,
+	// unless the user has specified otherwise.
+	bool need_env_reset = false;
+	if (getenv("LIBVA_MESSAGING_LEVEL") == nullptr) {
+		setenv("LIBVA_MESSAGING_LEVEL", "0", true);
+		need_env_reset = true;
+	}
+
+	// First try the default (i.e., whatever $DISPLAY is set to).
+	unique_ptr<VADisplayWithCleanup> va_dpy = try_open_va("", nullptr);
+	if (va_dpy != nullptr) {
+		if (need_env_reset) {
+			unsetenv("LIBVA_MESSAGING_LEVEL");
+		}
+		return "";
+	}
+
+	fprintf(stderr, "The X11 display did not expose a VA-API JPEG decoder.\n");
+
+	// Try all /dev/dri/render* in turn. TODO: Accept /dev/dri/card*, too?
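+	// Each render node is probed with try_open_va(); the first one that
+	// advertises VAProfileJPEGBaseline with a VLD entry point is chosen.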
+ glob_t g; + int err = glob("/dev/dri/renderD*", 0, nullptr, &g); + if (err != 0) { + fprintf(stderr, "Couldn't list render nodes (%s) when trying to autodetect a replacement.\n", strerror(errno)); + } else { + for (size_t i = 0; i < g.gl_pathc; ++i) { + string path = g.gl_pathv[i]; + va_dpy = try_open_va(path, nullptr); + if (va_dpy != nullptr) { + fprintf(stderr, "Autodetected %s as a suitable replacement; using it.\n", + path.c_str()); + globfree(&g); + if (need_env_reset) { + unsetenv("LIBVA_MESSAGING_LEVEL"); + } + return path; + } + } + } + + fprintf(stderr, "No suitable VA-API JPEG decoders were found in /dev/dri; giving up.\n"); + fprintf(stderr, "Note that if you are using an Intel CPU with an external GPU,\n"); + fprintf(stderr, "you may need to enable the integrated Intel GPU in your BIOS\n"); + fprintf(stderr, "to expose Quick Sync.\n"); + return "none"; +} + +void init_jpeg_vaapi() +{ + string dpy = get_usable_va_display(); + if (dpy == "none") { + return; + } + + va_dpy = try_open_va(dpy, nullptr); + if (va_dpy == nullptr) { + return; + } + + VAConfigAttrib attr = { VAConfigAttribRTFormat, VA_RT_FORMAT_YUV422 }; + + VAStatus va_status = vaCreateConfig(va_dpy->va_dpy, VAProfileJPEGBaseline, VAEntrypointVLD, + &attr, 1, &config_id); + CHECK_VASTATUS(va_status, "vaCreateConfig"); + + int num_formats = vaMaxNumImageFormats(va_dpy->va_dpy); + assert(num_formats > 0); + + unique_ptr formats(new VAImageFormat[num_formats]); + va_status = vaQueryImageFormats(va_dpy->va_dpy, formats.get(), &num_formats); + CHECK_VASTATUS(va_status, "vaQueryImageFormats"); + + bool found = false; + for (int i = 0; i < num_formats; ++i) { + // Seemingly VA_FOURCC_422H is no good for vaGetImage(). :-/ + if (formats[i].fourcc == VA_FOURCC_UYVY) { + memcpy(&uyvy_format, &formats[i], sizeof(VAImageFormat)); + found = true; + break; + } + } + if (!found) { + return; + } + + fprintf(stderr, "VA-API JPEG decoding initialized.\n"); + vaapi_jpeg_decoding_usable = true; +} + +shared_ptr decode_jpeg_vaapi(const string &filename) +{ + jpeg_decompress_struct dinfo; + jpeg_error_mgr jerr; + dinfo.err = jpeg_std_error(&jerr); + jpeg_create_decompress(&dinfo); + + FILE *fp = fopen(filename.c_str(), "rb"); + if (fp == nullptr) { + perror(filename.c_str()); + exit(1); + } + jpeg_stdio_src(&dinfo, fp); + + jpeg_read_header(&dinfo, true); + + // Read the data that comes after the header. VA-API will destuff and all for us. + std::string str((const char *)dinfo.src->next_input_byte, dinfo.src->bytes_in_buffer); + while (!feof(fp)) { + char buf[4096]; + size_t ret = fread(buf, 1, sizeof(buf), fp); + str.append(buf, ret); + } + fclose(fp); + + if (dinfo.num_components != 3) { + fprintf(stderr, "Not a color JPEG. (%d components, Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n", + dinfo.num_components, + dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor, + dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor, + dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor); + return nullptr; + } + if (dinfo.comp_info[0].h_samp_factor != 2 || + dinfo.comp_info[0].v_samp_factor != 2 || + dinfo.comp_info[1].h_samp_factor != 1 || + dinfo.comp_info[1].v_samp_factor != 2 || + dinfo.comp_info[2].h_samp_factor != 1 || + dinfo.comp_info[2].v_samp_factor != 2) { + fprintf(stderr, "Not 4:2:2. 
(Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n", + dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor, + dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor, + dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor); + return nullptr; + } + + // Picture parameters. + VAPictureParameterBufferJPEGBaseline pic_param; + memset(&pic_param, 0, sizeof(pic_param)); + pic_param.picture_width = dinfo.image_width; + pic_param.picture_height = dinfo.image_height; + for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) { + const jpeg_component_info *comp = &dinfo.comp_info[component_idx]; + pic_param.components[component_idx].component_id = comp->component_id; + pic_param.components[component_idx].h_sampling_factor = comp->h_samp_factor; + pic_param.components[component_idx].v_sampling_factor = comp->v_samp_factor; + pic_param.components[component_idx].quantiser_table_selector = comp->quant_tbl_no; + } + pic_param.num_components = dinfo.num_components; + pic_param.color_space = 0; // YUV. + pic_param.rotation = VA_ROTATION_NONE; + + VABufferID pic_param_buffer; + VAStatus va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAPictureParameterBufferType, sizeof(pic_param), 1, &pic_param, &pic_param_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // Quantization matrices. + VAIQMatrixBufferJPEGBaseline iq; + memset(&iq, 0, sizeof(iq)); + + for (int quant_tbl_idx = 0; quant_tbl_idx < min(4, NUM_QUANT_TBLS); ++quant_tbl_idx) { + const JQUANT_TBL *qtbl = dinfo.quant_tbl_ptrs[quant_tbl_idx]; + if (qtbl == nullptr) { + iq.load_quantiser_table[quant_tbl_idx] = 0; + } else { + iq.load_quantiser_table[quant_tbl_idx] = 1; + for (int i = 0; i < 64; ++i) { + if (qtbl->quantval[i] > 255) { + fprintf(stderr, "Baseline JPEG only!\n"); + return nullptr; + } + iq.quantiser_table[quant_tbl_idx][i] = qtbl->quantval[i]; + } + } + } + + VABufferID iq_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAIQMatrixBufferType, sizeof(iq), 1, &iq, &iq_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // Huffman tables (arithmetic is not supported). + VAHuffmanTableBufferJPEGBaseline huff; + memset(&huff, 0, sizeof(huff)); + + for (int huff_tbl_idx = 0; huff_tbl_idx < min(2, NUM_HUFF_TBLS); ++huff_tbl_idx) { + const JHUFF_TBL *ac_hufftbl = dinfo.ac_huff_tbl_ptrs[huff_tbl_idx]; + const JHUFF_TBL *dc_hufftbl = dinfo.dc_huff_tbl_ptrs[huff_tbl_idx]; + if (ac_hufftbl == nullptr) { + assert(dc_hufftbl == nullptr); + huff.load_huffman_table[huff_tbl_idx] = 0; + } else { + assert(dc_hufftbl != nullptr); + huff.load_huffman_table[huff_tbl_idx] = 1; + + for (int i = 0; i < 16; ++i) { + huff.huffman_table[huff_tbl_idx].num_dc_codes[i] = dc_hufftbl->bits[i + 1]; + } + for (int i = 0; i < 12; ++i) { + huff.huffman_table[huff_tbl_idx].dc_values[i] = dc_hufftbl->huffval[i]; + } + for (int i = 0; i < 16; ++i) { + huff.huffman_table[huff_tbl_idx].num_ac_codes[i] = ac_hufftbl->bits[i + 1]; + } + for (int i = 0; i < 162; ++i) { + huff.huffman_table[huff_tbl_idx].ac_values[i] = ac_hufftbl->huffval[i]; + } + } + } + + VABufferID huff_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAHuffmanTableBufferType, sizeof(huff), 1, &huff, &huff_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // Slice parameters (metadata about the slice). 
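+	// The entire entropy-coded segment after the header is passed on as a
+	// single slice; the per-component table selectors come straight from
+	// libjpeg's parsed headers.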
+ VASliceParameterBufferJPEGBaseline parms; + memset(&parms, 0, sizeof(parms)); + parms.slice_data_size = str.size(); + parms.slice_data_offset = 0; + parms.slice_data_flag = VA_SLICE_DATA_FLAG_ALL; + parms.slice_horizontal_position = 0; + parms.slice_vertical_position = 0; + for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) { + const jpeg_component_info *comp = &dinfo.comp_info[component_idx]; + parms.components[component_idx].component_selector = comp->component_id; + parms.components[component_idx].dc_table_selector = comp->dc_tbl_no; + parms.components[component_idx].ac_table_selector = comp->ac_tbl_no; + if (parms.components[component_idx].dc_table_selector > 1 || + parms.components[component_idx].ac_table_selector > 1) { + fprintf(stderr, "Uses too many Huffman tables\n"); + return nullptr; + } + } + parms.num_components = dinfo.num_components; + parms.restart_interval = dinfo.restart_interval; + int horiz_mcus = (dinfo.image_width + (DCTSIZE * 2) - 1) / (DCTSIZE * 2); + int vert_mcus = (dinfo.image_height + DCTSIZE - 1) / DCTSIZE; + parms.num_mcus = horiz_mcus * vert_mcus; + + VABufferID slice_param_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceParameterBufferType, sizeof(parms), 1, &parms, &slice_param_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + // The actual data. + VABufferID data_buffer; + va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceDataBufferType, str.size(), 1, &str[0], &data_buffer); + CHECK_VASTATUS_RET(va_status, "vaCreateBuffer"); + + VAResources resources = get_va_resources(dinfo.image_width, dinfo.image_height); + ReleaseVAResources release(resources); + + va_status = vaBeginPicture(va_dpy->va_dpy, resources.context, resources.surface); + CHECK_VASTATUS_RET(va_status, "vaBeginPicture"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &pic_param_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(pic_param)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &iq_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(iq)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &huff_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(huff)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &slice_param_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(slice_param)"); + va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &data_buffer, 1); + CHECK_VASTATUS_RET(va_status, "vaRenderPicture(data)"); + va_status = vaEndPicture(va_dpy->va_dpy, resources.context); + CHECK_VASTATUS_RET(va_status, "vaEndPicture"); + + // vaDeriveImage() works, but the resulting image seems to live in + // uncached memory, which makes copying data out from it very, very slow. + // Thanks to FFmpeg for the observation that you can vaGetImage() the + // surface onto your own image (although then, it can't be planar, which + // is unfortunate for us). 
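+	//
+	// The #if 0 branch below is the vaDeriveImage() variant, kept for
+	// reference; the live branch reads back a packed UYVY image with
+	// vaGetImage() and then splits it on the CPU using memcpy_interleaved().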
+#if 0
+	VAImage image;
+	va_status = vaDeriveImage(va_dpy->va_dpy, surf, &image);
+	CHECK_VASTATUS_RET(va_status, "vaDeriveImage");
+#else
+	va_status = vaSyncSurface(va_dpy->va_dpy, resources.surface);
+	CHECK_VASTATUS_RET(va_status, "vaSyncSurface");
+
+	va_status = vaGetImage(va_dpy->va_dpy, resources.surface, 0, 0, dinfo.image_width, dinfo.image_height, resources.image.image_id);
+	CHECK_VASTATUS_RET(va_status, "vaGetImage");
+#endif
+
+	void *mapped;
+	va_status = vaMapBuffer(va_dpy->va_dpy, resources.image.buf, &mapped);
+	CHECK_VASTATUS_RET(va_status, "vaMapBuffer");
+
+	shared_ptr<Frame> frame(new Frame);
+#if 0
+	// 4:2:2 planar (for vaDeriveImage).
+	frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+	frame->cb.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+	frame->cr.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+	for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+		uint8_t *dptr;
+		size_t width;
+		if (component_idx == 0) {
+			dptr = frame->y.get();
+			width = dinfo.image_width;
+		} else if (component_idx == 1) {
+			dptr = frame->cb.get();
+			width = dinfo.image_width / 2;
+		} else if (component_idx == 2) {
+			dptr = frame->cr.get();
+			width = dinfo.image_width / 2;
+		} else {
+			assert(false);
+		}
+		const uint8_t *sptr = (const uint8_t *)mapped + image.offsets[component_idx];
+		size_t spitch = image.pitches[component_idx];
+		for (size_t y = 0; y < dinfo.image_height; ++y) {
+			memcpy(dptr + y * width, sptr + y * spitch, width);
+		}
+	}
+#else
+	// Convert Y'CbCr to separate Y' and CbCr.
+	frame->is_semiplanar = true;
+	frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+	frame->cbcr.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+	const uint8_t *src = (const uint8_t *)mapped + resources.image.offsets[0];
+	if (resources.image.pitches[0] == dinfo.image_width * 2) {
+		memcpy_interleaved(frame->cbcr.get(), frame->y.get(), src, dinfo.image_width * dinfo.image_height * 2);
+	} else {
+		for (unsigned y = 0; y < dinfo.image_height; ++y) {
+			memcpy_interleaved(frame->cbcr.get() + y * dinfo.image_width, frame->y.get() + y * dinfo.image_width,
+				src + y * resources.image.pitches[0], dinfo.image_width * 2);
+		}
+	}
+#endif
+	frame->width = dinfo.image_width;
+	frame->height = dinfo.image_height;
+	frame->chroma_subsampling_x = 2;
+	frame->chroma_subsampling_y = 1;
+	frame->pitch_y = dinfo.image_width;
+	frame->pitch_chroma = dinfo.image_width / 2;
+
+	va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf);
+	CHECK_VASTATUS_RET(va_status, "vaUnmapBuffer");
+
+	return frame;
+}
diff --git a/vaapi_jpeg_decoder.h b/vaapi_jpeg_decoder.h
new file mode 100644
index 0000000..4ab957e
--- /dev/null
+++ b/vaapi_jpeg_decoder.h
@@ -0,0 +1,27 @@
+#ifndef _VAAPI_JPEG_DECODER_H
+#define _VAAPI_JPEG_DECODER_H 1
+
+#include <X11/Xlib.h>
+#include <va/va.h>
+
+#include <memory>
+#include <string>
+
+struct Frame;
+
+struct VADisplayWithCleanup {
+	~VADisplayWithCleanup();
+
+	VADisplay va_dpy;
+	Display *x11_display = nullptr;
+	int drm_fd = -1;
+};
+std::unique_ptr<VADisplayWithCleanup> va_open_display(const std::string &va_display);  // Can return nullptr on failure.
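+
+// Probes the default X11 display first, then each /dev/dri/renderD* node.
+// Returns "" for the default display, a device path for a render node,
+// or "none" if no usable VA-API JPEG decoder was found.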
+std::string get_usable_va_display(); + +void init_jpeg_vaapi(); +std::shared_ptr decode_jpeg_vaapi(const std::string &filename); + +extern bool vaapi_jpeg_decoding_usable; + +#endif // !defined(_VAAPI_JPEG_DECODER_H) diff --git a/video_stream.cpp b/video_stream.cpp index a733d46..aa93dc7 100644 --- a/video_stream.cpp +++ b/video_stream.cpp @@ -149,11 +149,11 @@ vector encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const VideoStream::VideoStream() { using namespace movit; - // TODO: deduplicate code against JPEGFrameView? - ycbcr_convert_chain.reset(new EffectChain(1280, 720)); - ImageFormat image_format; - image_format.color_space = COLORSPACE_sRGB; - image_format.gamma_curve = GAMMA_sRGB; + + ImageFormat inout_format; + inout_format.color_space = COLORSPACE_sRGB; + inout_format.gamma_curve = GAMMA_sRGB; + ycbcr_format.luma_coefficients = YCBCR_REC_709; ycbcr_format.full_range = true; // JPEG. ycbcr_format.num_levels = 256; @@ -163,28 +163,33 @@ VideoStream::VideoStream() ycbcr_format.cb_y_position = 0.5f; // Irrelevant. ycbcr_format.cr_x_position = 0.0f; ycbcr_format.cr_y_position = 0.5f; - ycbcr_input = (movit::YCbCrInput *)ycbcr_convert_chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720)); YCbCrFormat ycbcr_output_format = ycbcr_format; ycbcr_output_format.chroma_subsampling_x = 1; - ImageFormat inout_format; - inout_format.color_space = COLORSPACE_sRGB; - inout_format.gamma_curve = GAMMA_sRGB; + // TODO: deduplicate code against JPEGFrameView? + ycbcr_planar_convert_chain.reset(new EffectChain(1280, 720)); + ycbcr_planar_input = (movit::YCbCrInput *)ycbcr_planar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR)); - check_error(); + // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the + // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way + // of getting the gray data into a layered texture. + ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_planar_convert_chain->set_dither_bits(8); + ycbcr_planar_convert_chain->finalize(); + + // Same, for semiplanar inputs. + ycbcr_semiplanar_convert_chain.reset(new EffectChain(1280, 720)); + ycbcr_semiplanar_input = (movit::YCbCrInput *)ycbcr_semiplanar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR)); // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way // of getting the gray data into a layered texture. 
- ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); - check_error(); - ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); - check_error(); - ycbcr_convert_chain->set_dither_bits(8); - check_error(); - ycbcr_convert_chain->finalize(); - check_error(); + ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format); + ycbcr_semiplanar_convert_chain->set_dither_bits(8); + ycbcr_semiplanar_convert_chain->finalize(); GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots], cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots]; glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, input_tex); @@ -325,16 +330,28 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea shared_ptr frame = decode_jpeg_with_cache(jpeg_id, DECODE_IF_NOT_IN_CACHE, &did_decode); ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x; ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y; - ycbcr_input->change_ycbcr_format(ycbcr_format); - ycbcr_input->set_width(frame->width); - ycbcr_input->set_height(frame->height); - ycbcr_input->set_pixel_data(0, frame->y.get()); - ycbcr_input->set_pixel_data(1, frame->cb.get()); - ycbcr_input->set_pixel_data(2, frame->cr.get()); - ycbcr_input->set_pitch(0, frame->pitch_y); - ycbcr_input->set_pitch(1, frame->pitch_chroma); - ycbcr_input->set_pitch(2, frame->pitch_chroma); - ycbcr_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720); + + if (frame->is_semiplanar) { + ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format); + ycbcr_semiplanar_input->set_width(frame->width); + ycbcr_semiplanar_input->set_height(frame->height); + ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get()); + ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get()); + ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y); + ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma); + ycbcr_semiplanar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720); + } else { + ycbcr_planar_input->change_ycbcr_format(ycbcr_format); + ycbcr_planar_input->set_width(frame->width); + ycbcr_planar_input->set_height(frame->height); + ycbcr_planar_input->set_pixel_data(0, frame->y.get()); + ycbcr_planar_input->set_pixel_data(1, frame->cb.get()); + ycbcr_planar_input->set_pixel_data(2, frame->cr.get()); + ycbcr_planar_input->set_pitch(0, frame->pitch_y); + ycbcr_planar_input->set_pitch(1, frame->pitch_chroma); + ycbcr_planar_input->set_pitch(2, frame->pitch_chroma); + ycbcr_planar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720); + } } glGenerateTextureMipmap(resources.input_tex); @@ -426,6 +443,7 @@ void VideoStream::encode_thread_func() memcpy(frame->cb.get() + 640 * yy, cb + 640 * (719 - yy), 640); memcpy(frame->cr.get() + 640 * yy, cr + 640 * (719 - yy), 640); } + frame->is_semiplanar = false; frame->width = 1280; frame->height = 720; frame->chroma_subsampling_x = 2; diff --git a/video_stream.h b/video_stream.h index 2d1e8f1..b86d271 100644 --- a/video_stream.h +++ b/video_stream.h @@ -82,9 +82,11 @@ private: // Effectively only converts from 4:2:2 to 4:4:4. // TODO: Have a separate version with ResampleEffect, for scaling? 
-	std::unique_ptr<movit::EffectChain> ycbcr_convert_chain;
+	std::unique_ptr<movit::EffectChain> ycbcr_planar_convert_chain;
+	std::unique_ptr<movit::EffectChain> ycbcr_semiplanar_convert_chain;
 
-	movit::YCbCrInput *ycbcr_input;
+	movit::YCbCrInput *ycbcr_planar_input;
+	movit::YCbCrInput *ycbcr_semiplanar_input;
 	movit::YCbCrFormat ycbcr_format;
 
 	// Frame interpolation.
-- 
2.39.2