git.sesse.net Git - nageru/commitdiff
Decode 4:2:2 JPEGs via VA-API if available.
author	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Sat, 29 Sep 2018 14:37:14 +0000 (16:37 +0200)
committer	Steinar H. Gunderson <sgunderson@bigfoot.com>
	Sat, 29 Sep 2018 19:05:06 +0000 (21:05 +0200)
Makefile
jpeg_frame.h [new file with mode: 0644]
jpeg_frame_view.cpp
jpeg_frame_view.h
main.cpp
memcpy_interleaved.cpp [new file with mode: 0644]
memcpy_interleaved.h [new file with mode: 0644]
vaapi_jpeg_decoder.cpp [new file with mode: 0644]
vaapi_jpeg_decoder.h [new file with mode: 0644]
video_stream.cpp
video_stream.h

diff --git a/Makefile b/Makefile
index a0075c505be6fce88e82293ebff415b0c4451d76..a15bb6bee1a8360337e666d4cdfc34f7e19cddaa 100644
--- a/Makefile
+++ b/Makefile
@@ -3,7 +3,7 @@ PKG_MODULES := Qt5Core Qt5Gui Qt5Widgets Qt5OpenGLExtensions Qt5OpenGL Qt5PrintS
 CXXFLAGS ?= -O2 -g -Wall  # Will be overridden by environment.
 CXXFLAGS += -fPIC $(shell pkg-config --cflags $(PKG_MODULES)) -DMOVIT_SHADER_DIR=\"$(shell pkg-config --variable=shaderdir movit)\" -pthread
 
-LDLIBS=$(shell pkg-config --libs $(PKG_MODULES)) -pthread -lavformat -lavcodec -lavutil -lswscale -lGL
+LDLIBS=$(shell pkg-config --libs $(PKG_MODULES)) -pthread -lavformat -lavcodec -lavutil -lswscale -lGL -lva -lva-drm -lva-x11 -lX11
 
 # Qt objects
 OBJS_WITH_MOC = mainwindow.o jpeg_frame_view.o clip_list.o
@@ -14,6 +14,7 @@ OBJS += $(OBJS_WITH_MOC:.o=.moc.o)
 OBJS += flow.o gpu_timers.o
 
 OBJS += ffmpeg_raii.o main.o player.o httpd.o mux.o metacube2.o video_stream.o context.o chroma_subsampler.o
+OBJS += vaapi_jpeg_decoder.o memcpy_interleaved.o
 
 %.o: %.cpp
        $(CXX) -MMD -MP $(CPPFLAGS) $(CXXFLAGS) -o $@ -c $<
diff --git a/jpeg_frame.h b/jpeg_frame.h
new file mode 100644
index 0000000..eb73e13
--- /dev/null
+++ b/jpeg_frame.h
@@ -0,0 +1,17 @@
+#ifndef _JPEG_FRAME_H
+#define _JPEG_FRAME_H 1
+
+#include <cstdint>
+#include <memory>
+
+struct Frame {
+       bool is_semiplanar = false;
+       std::unique_ptr<uint8_t[]> y;
+       std::unique_ptr<uint8_t[]> cb, cr; // For planar.
+       std::unique_ptr<uint8_t[]> cbcr;  // For semiplanar.
+       unsigned width, height;
+       unsigned chroma_subsampling_x, chroma_subsampling_y;
+       unsigned pitch_y, pitch_chroma;
+};
+
+#endif   // !defined(_JPEG_FRAME_H)
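For context, a minimal sketch (not part of the commit) of the per-plane byte sizes these fields imply, matching how the VA-API path below fills them in for 4:2:2 output (chroma_subsampling_x = 2, chroma_subsampling_y = 1, pitch_chroma = width / 2, cbcr holding interleaved Cb/Cr pairs):

#include <cstddef>

#include "jpeg_frame.h"

// Hypothetical helpers; the names are illustrative, not from the source.
size_t luma_bytes(const Frame &f)
{
	return size_t(f.pitch_y) * f.height;
}

size_t chroma_bytes(const Frame &f)
{
	size_t rows = f.height / f.chroma_subsampling_y;
	// Semiplanar: one plane of interleaved Cb/Cr pairs (two bytes per chroma
	// sample); planar: each of the separate Cb and Cr planes is this size.
	return f.is_semiplanar ? size_t(f.pitch_chroma) * rows * 2
	                       : size_t(f.pitch_chroma) * rows;
}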
diff --git a/jpeg_frame_view.cpp b/jpeg_frame_view.cpp
index 73030ffa2ce02a2b53f5ae23eef3f4b77eb66d22..ef1cded69d9dc98fcf4d26864c6ac3eef76bd667 100644
--- a/jpeg_frame_view.cpp
+++ b/jpeg_frame_view.cpp
@@ -20,6 +20,7 @@
 
 #include "defs.h"
 #include "post_to_main_thread.h"
+#include "vaapi_jpeg_decoder.h"
 #include "video_stream.h"
 
 using namespace movit;
@@ -50,10 +51,18 @@ deque<pair<JPEGID, JPEGFrameView *>> pending_decodes;  // Under cache_mu.
 atomic<size_t> event_counter{0};
 extern QGLWidget *global_share_widget;
 
-// TODO: Decode using VA-API if available.
 shared_ptr<Frame> decode_jpeg(const string &filename)
 {
-       shared_ptr<Frame> frame(new Frame);
+       shared_ptr<Frame> frame;
+       if (vaapi_jpeg_decoding_usable) {
+               frame = decode_jpeg_vaapi(filename);
+               if (frame != nullptr) {
+                       return frame;
+               }
+               fprintf(stderr, "VA-API hardware decoding failed; falling back to software.\n");
+       }
+
+       frame.reset(new Frame);
 
        jpeg_decompress_struct dinfo;
        jpeg_error_mgr jerr;
@@ -298,10 +307,10 @@ void JPEGFrameView::initializeGL()
                std::thread(&jpeg_decoder_thread).detach();
        });
 
-       chain.reset(new EffectChain(1280, 720, resource_pool));
-       ImageFormat image_format;
-       image_format.color_space = COLORSPACE_sRGB;
-       image_format.gamma_curve = GAMMA_sRGB;
+       ImageFormat inout_format;
+       inout_format.color_space = COLORSPACE_sRGB;
+       inout_format.gamma_curve = GAMMA_sRGB;
+
        ycbcr_format.luma_coefficients = YCBCR_REC_709;
        ycbcr_format.full_range = false;
        ycbcr_format.num_levels = 256;
@@ -311,22 +320,23 @@ void JPEGFrameView::initializeGL()
        ycbcr_format.cb_y_position = 0.5f;  // Irrelevant.
        ycbcr_format.cr_x_position = 0.0f;
        ycbcr_format.cr_y_position = 0.5f;
-       ycbcr_input = (movit::YCbCrInput *)chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720));
 
-       ImageFormat inout_format;
-        inout_format.color_space = COLORSPACE_sRGB;
-        inout_format.gamma_curve = GAMMA_sRGB;
+       // Planar Y'CbCr decoding chain.
+       planar_chain.reset(new EffectChain(1280, 720, resource_pool));
+       ycbcr_planar_input = (movit::YCbCrInput *)planar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR));
+       planar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
+       planar_chain->set_dither_bits(8);
+       planar_chain->finalize();
 
-       check_error();
-       chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
-       check_error();
-       chain->set_dither_bits(8);
-       check_error();
-       chain->finalize();
-       check_error();
+       // Semiplanar Y'CbCr decoding chain (for images coming from VA-API).
+       semiplanar_chain.reset(new EffectChain(1280, 720, resource_pool));
+       ycbcr_semiplanar_input = (movit::YCbCrInput *)semiplanar_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
+       semiplanar_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
+       semiplanar_chain->set_dither_bits(8);
+       semiplanar_chain->finalize();
 
        overlay_chain.reset(new EffectChain(overlay_base_width, overlay_base_height, resource_pool));
-       overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(image_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height));
+       overlay_input = (movit::FlatInput *)overlay_chain->add_input(new FlatInput(inout_format, FORMAT_GRAYSCALE, GL_UNSIGNED_BYTE, overlay_base_width, overlay_base_height));
 
        overlay_chain->add_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED);
        overlay_chain->finalize();
@@ -353,7 +363,11 @@ void JPEGFrameView::paintGL()
        }
 
        check_error();
-       chain->render_to_screen();
+       if (current_frame->is_semiplanar) {
+               semiplanar_chain->render_to_screen();
+       } else {
+               planar_chain->render_to_screen();
+       }
 
        if (overlay_image != nullptr) {
                if (overlay_input_needs_refresh) {
@@ -372,15 +386,26 @@ void JPEGFrameView::setDecodedFrame(std::shared_ptr<Frame> frame)
                current_frame = frame;
                ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x;
                ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y;
-               ycbcr_input->change_ycbcr_format(ycbcr_format);
-               ycbcr_input->set_width(frame->width);
-               ycbcr_input->set_height(frame->height);
-               ycbcr_input->set_pixel_data(0, frame->y.get());
-               ycbcr_input->set_pixel_data(1, frame->cb.get());
-               ycbcr_input->set_pixel_data(2, frame->cr.get());
-               ycbcr_input->set_pitch(0, frame->pitch_y);
-               ycbcr_input->set_pitch(1, frame->pitch_chroma);
-               ycbcr_input->set_pitch(2, frame->pitch_chroma);
+
+               if (frame->is_semiplanar) {
+                       ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format);
+                       ycbcr_semiplanar_input->set_width(frame->width);
+                       ycbcr_semiplanar_input->set_height(frame->height);
+                       ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get());
+                       ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get());
+                       ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y);
+                       ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma);
+               } else {
+                       ycbcr_planar_input->change_ycbcr_format(ycbcr_format);
+                       ycbcr_planar_input->set_width(frame->width);
+                       ycbcr_planar_input->set_height(frame->height);
+                       ycbcr_planar_input->set_pixel_data(0, frame->y.get());
+                       ycbcr_planar_input->set_pixel_data(1, frame->cb.get());
+                       ycbcr_planar_input->set_pixel_data(2, frame->cr.get());
+                       ycbcr_planar_input->set_pitch(0, frame->pitch_y);
+                       ycbcr_planar_input->set_pitch(1, frame->pitch_chroma);
+                       ycbcr_planar_input->set_pitch(2, frame->pitch_chroma);
+               }
                update();
        });
 }
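A movit YCbCrInput's layout (planar vs. split Y/CbCr) is fixed at construction, and a chain cannot be restructured after finalize(); that is why the commit keeps one pre-finalized chain per layout and selects between them per frame. A condensed sketch of that dispatch (not in the commit; types as declared in jpeg_frame_view.h below):

#include <movit/effect_chain.h>

#include "jpeg_frame.h"

// Sketch: pick the pre-finalized chain matching the frame's memory layout.
// Semiplanar frames come from the VA-API decoder (Y' plus interleaved CbCr);
// planar frames come from the libjpeg software fallback (separate Y', Cb, Cr).
movit::EffectChain *chain_for_frame(const Frame &frame,
                                    movit::EffectChain *planar_chain,
                                    movit::EffectChain *semiplanar_chain)
{
	return frame.is_semiplanar ? semiplanar_chain : planar_chain;
}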
diff --git a/jpeg_frame_view.h b/jpeg_frame_view.h
index 8b2c93c8d5dbe245b19c4d80d38f108e2247e1c6..7c41b78b384e3fe863e57828ebafcce25e42d597 100644
--- a/jpeg_frame_view.h
+++ b/jpeg_frame_view.h
 
 #include <memory>
 
+#include "jpeg_frame.h"
+
 struct JPEGID {
        unsigned stream_idx;
        int64_t pts;
        bool interpolated;
 };
-struct Frame {
-       std::unique_ptr<uint8_t[]> y, cb, cr;
-       unsigned width, height;
-       unsigned chroma_subsampling_x, chroma_subsampling_y;
-       unsigned pitch_y, pitch_chroma;
-};
 enum CacheMissBehavior {
        DECODE_IF_NOT_IN_CACHE,
        RETURN_NULLPTR_IF_NOT_IN_CACHE
@@ -60,11 +56,14 @@ private:
        // The stream index of the latest frame we displayed.
        unsigned current_stream_idx = 0;
 
-       std::unique_ptr<movit::EffectChain> chain;
+       std::unique_ptr<movit::EffectChain> planar_chain;
        std::shared_ptr<Frame> current_frame;  // So that we hold on to the pixels.
-       movit::YCbCrInput *ycbcr_input;
+       movit::YCbCrInput *ycbcr_planar_input;
        movit::YCbCrFormat ycbcr_format;
 
+       std::unique_ptr<movit::EffectChain> semiplanar_chain;
+       movit::YCbCrInput *ycbcr_semiplanar_input;
+
        static constexpr int overlay_base_width = 16, overlay_base_height = 16;
        int overlay_width = overlay_base_width, overlay_height = overlay_base_height;
        std::unique_ptr<QImage> overlay_image;  // If nullptr, no overlay.
diff --git a/main.cpp b/main.cpp
index dcb6f10674e280a70d8e19b394b03f1503538e2b..9bb666a317984e29deaecfea227377086a8987fc 100644
--- a/main.cpp
+++ b/main.cpp
@@ -30,6 +30,7 @@ extern "C" {
 #include "ref_counted_gl_sync.h"
 #include "timebase.h"
 #include "ui_mainwindow.h"
+#include "vaapi_jpeg_decoder.h"
 
 using namespace std;
 using namespace std::chrono;
@@ -97,6 +98,8 @@ int main(int argc, char **argv)
 
        thread(record_thread_func).detach();
 
+       init_jpeg_vaapi();
+
        return app.exec();
 }
 
diff --git a/memcpy_interleaved.cpp b/memcpy_interleaved.cpp
new file mode 100644
index 0000000..9a41cdd
--- /dev/null
+++ b/memcpy_interleaved.cpp
@@ -0,0 +1,137 @@
+#include <cstdint>
+#include <algorithm>
+#include <assert.h>
+#ifdef __SSE2__
+#include <immintrin.h>
+#endif
+
+using namespace std;
+
+// TODO: Support stride.
+void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       assert(n % 2 == 0);
+       uint8_t *dptr1 = dest1;
+       uint8_t *dptr2 = dest2;
+
+       for (size_t i = 0; i < n; i += 2) {
+               *dptr1++ = *src++;
+               *dptr2++ = *src++;
+       }
+}
+
+#ifdef __SSE2__
+
+// Returns the number of bytes consumed.
+size_t memcpy_interleaved_fastpath(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+       const uint8_t *limit = src + n;
+       size_t consumed = 0;
+
+       // Align end to 32 bytes.
+       limit = (const uint8_t *)(intptr_t(limit) & ~31);
+
+       if (src >= limit) {
+               return 0;
+       }
+
+       // Process [0,31] bytes, such that start gets aligned to 32 bytes.
+       const uint8_t *aligned_src = (const uint8_t *)(intptr_t(src + 31) & ~31);
+       if (aligned_src != src) {
+               size_t n2 = aligned_src - src;
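+               // Note: n2 must be even here (i.e., src evenly aligned); the slow path asserts an even length.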
+               memcpy_interleaved_slow(dest1, dest2, src, n2);
+               dest1 += n2 / 2;
+               dest2 += n2 / 2;
+               if (n2 % 2) {
+                       swap(dest1, dest2);
+               }
+               src = aligned_src;
+               consumed += n2;
+       }
+
+       // Make the length a multiple of 64.
+       if (((limit - src) % 64) != 0) {
+               limit -= 32;
+       }
+       assert(((limit - src) % 64) == 0);
+
+#if __AVX2__
+       const __m256i * __restrict in = (const __m256i *)src;
+       __m256i * __restrict out1 = (__m256i *)dest1;
+       __m256i * __restrict out2 = (__m256i *)dest2;
+
+       __m256i shuffle_cw = _mm256_set_epi8(
+               15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0,
+               15, 13, 11, 9, 7, 5, 3, 1, 14, 12, 10, 8, 6, 4, 2, 0);
+       while (in < (const __m256i *)limit) {
+               // Note: For brevity, comments show lanes as if they were 2x64-bit (they're actually 2x128).
+               __m256i data1 = _mm256_stream_load_si256(in);         // AaBbCcDd EeFfGgHh
+               __m256i data2 = _mm256_stream_load_si256(in + 1);     // IiJjKkLl MmNnOoPp
+
+               data1 = _mm256_shuffle_epi8(data1, shuffle_cw);       // ABCDabcd EFGHefgh
+               data2 = _mm256_shuffle_epi8(data2, shuffle_cw);       // IJKLijkl MNOPmnop
+       
+               data1 = _mm256_permute4x64_epi64(data1, 0b11011000);  // ABCDEFGH abcdefgh
+               data2 = _mm256_permute4x64_epi64(data2, 0b11011000);  // IJKLMNOP ijklmnop
+
+               __m256i lo = _mm256_permute2x128_si256(data1, data2, 0b00100000);
+               __m256i hi = _mm256_permute2x128_si256(data1, data2, 0b00110001);
+
+               _mm256_storeu_si256(out1, lo);
+               _mm256_storeu_si256(out2, hi);
+
+               in += 2;
+               ++out1;
+               ++out2;
+               consumed += 64;
+       }
+#else
+       const __m128i * __restrict in = (const __m128i *)src;
+       __m128i * __restrict out1 = (__m128i *)dest1;
+       __m128i * __restrict out2 = (__m128i *)dest2;
+
+       __m128i mask_lower_byte = _mm_set1_epi16(0x00ff);
+       while (in < (const __m128i *)limit) {
+               __m128i data1 = _mm_load_si128(in);
+               __m128i data2 = _mm_load_si128(in + 1);
+               __m128i data1_lo = _mm_and_si128(data1, mask_lower_byte);
+               __m128i data2_lo = _mm_and_si128(data2, mask_lower_byte);
+               __m128i data1_hi = _mm_srli_epi16(data1, 8);
+               __m128i data2_hi = _mm_srli_epi16(data2, 8);
+               __m128i lo = _mm_packus_epi16(data1_lo, data2_lo);
+               _mm_storeu_si128(out1, lo);
+               __m128i hi = _mm_packus_epi16(data1_hi, data2_hi);
+               _mm_storeu_si128(out2, hi);
+
+               in += 2;
+               ++out1;
+               ++out2;
+               consumed += 32;
+       }
+#endif
+
+       return consumed;
+}
+
+#endif  // defined(__SSE2__)
+
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n)
+{
+#ifdef __SSE2__
+       size_t consumed = memcpy_interleaved_fastpath(dest1, dest2, src, n);
+       src += consumed;
+       dest1 += consumed / 2;
+       dest2 += consumed / 2;
+       if (consumed % 2) {
+               swap(dest1, dest2);
+       }
+       n -= consumed;
+
+       if (n > 0) {
+               memcpy_interleaved_slow(dest1, dest2, src, n);
+       }
+#else
+       memcpy_interleaved_slow(dest1, dest2, src, n);
+#endif
+}
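A quick self-check (not part of the commit) that the SIMD fast path agrees with the scalar version across lengths and source alignments. Offsets and lengths are kept even, since memcpy_interleaved_slow() asserts an even length and an odd source alignment would hand it an odd prologue:

#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <vector>

#include "memcpy_interleaved.h"

// Has external linkage in memcpy_interleaved.cpp; redeclared here for the test.
void memcpy_interleaved_slow(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n);

int main()
{
	std::vector<uint8_t> src(4096);
	for (size_t i = 0; i < src.size(); ++i) {
		src[i] = uint8_t(i * 7 + 3);
	}
	for (size_t ofs = 0; ofs < 64; ofs += 2) {  // Even offsets only; see above.
		for (size_t n = 0; n + ofs <= src.size(); n += 202) {  // Even lengths only.
			std::vector<uint8_t> a1(n / 2), a2(n / 2), b1(n / 2), b2(n / 2);
			memcpy_interleaved(a1.data(), a2.data(), src.data() + ofs, n);
			memcpy_interleaved_slow(b1.data(), b2.data(), src.data() + ofs, n);
			if (a1 != b1 || a2 != b2) {
				fprintf(stderr, "mismatch at ofs=%zu, n=%zu\n", ofs, n);
				return 1;
			}
		}
	}
	printf("OK\n");
	return 0;
}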
diff --git a/memcpy_interleaved.h b/memcpy_interleaved.h
new file mode 100644
index 0000000..a7f8994
--- /dev/null
+++ b/memcpy_interleaved.h
@@ -0,0 +1,11 @@
+#ifndef _MEMCPY_INTERLEAVED_H
+#define _MEMCPY_INTERLEAVED_H 1
+
+#include <stddef.h>
+#include <stdint.h>
+
+// Copies every other byte from src to dest1 and dest2.
+// TODO: Support stride.
+void memcpy_interleaved(uint8_t *dest1, uint8_t *dest2, const uint8_t *src, size_t n);
+
+#endif  // !defined(_MEMCPY_INTERLEAVED_H)
diff --git a/vaapi_jpeg_decoder.cpp b/vaapi_jpeg_decoder.cpp
new file mode 100644
index 0000000..b6f9c50
--- /dev/null
+++ b/vaapi_jpeg_decoder.cpp
@@ -0,0 +1,546 @@
+#include "vaapi_jpeg_decoder.h"
+
+#include <X11/Xlib.h>
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <glob.h>
+#include <jpeglib.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <va/va.h>
+#include <va/va_drm.h>
+#include <va/va_x11.h>
+
+#include <list>
+#include <mutex>
+#include <string>
+
+#include "jpeg_frame.h"
+#include "memcpy_interleaved.h"
+
+using namespace std;
+
+static unique_ptr<VADisplayWithCleanup> va_dpy;
+static VAConfigID config_id;
+static VAImageFormat uyvy_format;
+bool vaapi_jpeg_decoding_usable = false;
+       
+struct VAResources {
+       unsigned width, height;
+       VASurfaceID surface;
+       VAContextID context;
+       VAImage image;
+};
+static list<VAResources> va_resources_freelist;
+static mutex va_resources_mutex;
+
+#define CHECK_VASTATUS(va_status, func)                                 \
+    if (va_status != VA_STATUS_SUCCESS) {                               \
+        fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+        exit(1);                                                        \
+    }
+
+#define CHECK_VASTATUS_RET(va_status, func)                             \
+    if (va_status != VA_STATUS_SUCCESS) {                               \
+        fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+        return nullptr;                                                 \
+    }
+
+VAResources get_va_resources(unsigned width, unsigned height)
+{
+       {
+               lock_guard<mutex> lock(va_resources_mutex);
+               for (auto it = va_resources_freelist.begin(); it != va_resources_freelist.end(); ++it) {
+                       if (it->width == width && it->height == height) {
+                               VAResources ret = *it;
+                               va_resources_freelist.erase(it);
+                               return ret;
+                       }
+               }
+       }
+
+       VAResources ret;
+
+       ret.width = width;
+       ret.height = height;
+
+       VAStatus va_status = vaCreateSurfaces(va_dpy->va_dpy, VA_RT_FORMAT_YUV422,
+               width, height,
+               &ret.surface, 1, nullptr, 0);
+       CHECK_VASTATUS(va_status, "vaCreateSurfaces");
+
+       va_status = vaCreateContext(va_dpy->va_dpy, config_id, width, height, 0, &ret.surface, 1, &ret.context);
+       CHECK_VASTATUS(va_status, "vaCreateContext");
+
+       va_status = vaCreateImage(va_dpy->va_dpy, &uyvy_format, width, height, &ret.image);
+       CHECK_VASTATUS(va_status, "vaCreateImage");
+
+       return ret;
+}
+
+void release_va_resources(VAResources resources)
+{
+       lock_guard<mutex> lock(va_resources_mutex);
+       if (va_resources_freelist.size() > 10) {
+               auto it = va_resources_freelist.end();
+               --it;
+
+               VAStatus va_status = vaDestroyImage(va_dpy->va_dpy, it->image.image_id);
+               CHECK_VASTATUS(va_status, "vaDestroyImage");
+
+               va_status = vaDestroyContext(va_dpy->va_dpy, it->context);
+               CHECK_VASTATUS(va_status, "vaDestroyContext");
+
+               va_status = vaDestroySurfaces(va_dpy->va_dpy, &it->surface, 1);
+               CHECK_VASTATUS(va_status, "vaDestroySurfaces");
+
+               va_resources_freelist.erase(it);
+       }
+
+       va_resources_freelist.push_front(resources);
+}
+
+// RAII wrapper to release VAResources on return (even on error).
+class ReleaseVAResources {
+public:
+       ReleaseVAResources(const VAResources &resources)
+               : resources(resources) {}
+       ~ReleaseVAResources() {
+               if (!committed) {
+                       release_va_resources(resources);
+               }
+       }
+
+       void commit() { committed = true; }
+
+private:
+       const VAResources &resources;
+       bool committed = false;
+};
+
+VADisplayWithCleanup::~VADisplayWithCleanup()
+{
+       if (va_dpy != nullptr) {
+               vaTerminate(va_dpy);
+       }
+       if (x11_display != nullptr) {
+               XCloseDisplay(x11_display);
+       }
+       if (drm_fd != -1) {
+               close(drm_fd);
+       }
+}
+
+unique_ptr<VADisplayWithCleanup> va_open_display(const string &va_display)
+{
+       if (va_display.empty() || va_display[0] != '/') {  // An X display.
+               Display *x11_display = XOpenDisplay(va_display.empty() ? nullptr : va_display.c_str());
+               if (x11_display == nullptr) {
+                       fprintf(stderr, "error: can't connect to X server!\n");
+                       return nullptr;
+               }
+
+               unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+               ret->x11_display = x11_display;
+               ret->va_dpy = vaGetDisplay(x11_display);
+               if (ret->va_dpy == nullptr) {
+                       return nullptr;
+               }
+               return ret;
+       } else {  // A DRM node on the filesystem (e.g. /dev/dri/renderD128).
+               int drm_fd = open(va_display.c_str(), O_RDWR);
+               if (drm_fd == -1) {
+                       perror(va_display.c_str());
+                       return nullptr;
+               }
+               unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+               ret->drm_fd = drm_fd;
+               ret->va_dpy = vaGetDisplayDRM(drm_fd);
+               if (ret->va_dpy == nullptr) {
+                       return nullptr;
+               }
+               return ret;
+       }
+}
+
+unique_ptr<VADisplayWithCleanup> try_open_va(const string &va_display, string *error)
+{
+       unique_ptr<VADisplayWithCleanup> va_dpy = va_open_display(va_display);
+       if (va_dpy == nullptr) {
+               if (error) *error = "Opening VA display failed";
+               return nullptr;
+       }
+       int major_ver, minor_ver;
+       VAStatus va_status = vaInitialize(va_dpy->va_dpy, &major_ver, &minor_ver);
+       if (va_status != VA_STATUS_SUCCESS) {
+               char buf[256];
+               snprintf(buf, sizeof(buf), "vaInitialize() failed with status %d\n", va_status);
+               if (error != nullptr) *error = buf;
+               return nullptr;
+       }
+
+       int num_entrypoints = vaMaxNumEntrypoints(va_dpy->va_dpy);
+       unique_ptr<VAEntrypoint[]> entrypoints(new VAEntrypoint[num_entrypoints]);
+       if (entrypoints == nullptr) {
+               if (error != nullptr) *error = "Failed to allocate memory for VA entry points";
+               return nullptr;
+       }
+
+       vaQueryConfigEntrypoints(va_dpy->va_dpy, VAProfileJPEGBaseline, entrypoints.get(), &num_entrypoints);
+       for (int slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) {
+               if (entrypoints[slice_entrypoint] != VAEntrypointVLD) {
+                       continue;
+               }
+
+               // We found a usable decoder, so return it.
+               return va_dpy;
+       }
+
+       if (error != nullptr) *error = "Can't find VAEntrypointVLD for the JPEG profile";
+       return nullptr;
+}
+
+string get_usable_va_display()
+{
+       // Reduce the amount of chatter while probing,
+       // unless the user has specified otherwise.
+       bool need_env_reset = false;
+       if (getenv("LIBVA_MESSAGING_LEVEL") == nullptr) {
+               setenv("LIBVA_MESSAGING_LEVEL", "0", true);
+               need_env_reset = true;
+       }
+
+       // First try the default (i.e., whatever $DISPLAY is set to).
+       unique_ptr<VADisplayWithCleanup> va_dpy = try_open_va("", nullptr);
+       if (va_dpy != nullptr) {
+               if (need_env_reset) {
+                       unsetenv("LIBVA_MESSAGING_LEVEL");
+               }
+               return "";
+       }
+
+       fprintf(stderr, "The X11 display did not expose a VA-API JPEG decoder.\n");
+
+       // Try all /dev/dri/render* in turn. TODO: Accept /dev/dri/card*, too?
+       glob_t g;
+       int err = glob("/dev/dri/renderD*", 0, nullptr, &g);
+       if (err != 0) {
+               fprintf(stderr, "Couldn't list render nodes (%s) when trying to autodetect a replacement.\n", strerror(errno));
+       } else {
+               for (size_t i = 0; i < g.gl_pathc; ++i) {
+                       string path = g.gl_pathv[i];
+                       va_dpy = try_open_va(path, nullptr);
+                       if (va_dpy != nullptr) {
+                               fprintf(stderr, "Autodetected %s as a suitable replacement; using it.\n",
+                                       path.c_str());
+                               globfree(&g);
+                               if (need_env_reset) {
+                                       unsetenv("LIBVA_MESSAGING_LEVEL");
+                               }
+                               return path;
+                       }
+               }
+       }
+
+       fprintf(stderr, "No suitable VA-API JPEG decoders were found in /dev/dri; giving up.\n");
+       fprintf(stderr, "Note that if you are using an Intel CPU with an external GPU,\n");
+       fprintf(stderr, "you may need to enable the integrated Intel GPU in your BIOS\n");
+       fprintf(stderr, "to expose Quick Sync.\n");
+       return "none";
+}
+
+void init_jpeg_vaapi()
+{
+       string dpy = get_usable_va_display();
+       if (dpy == "none") {
+               return;
+       }
+
+       va_dpy = try_open_va(dpy, nullptr);
+       if (va_dpy == nullptr) {
+               return;
+       }
+
+       VAConfigAttrib attr = { VAConfigAttribRTFormat, VA_RT_FORMAT_YUV422 };
+
+       VAStatus va_status = vaCreateConfig(va_dpy->va_dpy, VAProfileJPEGBaseline, VAEntrypointVLD,
+               &attr, 1, &config_id);
+       CHECK_VASTATUS(va_status, "vaCreateConfig");
+
+       int num_formats = vaMaxNumImageFormats(va_dpy->va_dpy);
+       assert(num_formats > 0);
+
+       unique_ptr<VAImageFormat[]> formats(new VAImageFormat[num_formats]);
+       va_status = vaQueryImageFormats(va_dpy->va_dpy, formats.get(), &num_formats);
+       CHECK_VASTATUS(va_status, "vaQueryImageFormats");
+
+       bool found = false;
+       for (int i = 0; i < num_formats; ++i) {
+               // Seemingly VA_FOURCC_422H is no good for vaGetImage(). :-/
+               if (formats[i].fourcc == VA_FOURCC_UYVY) {
+                       memcpy(&uyvy_format, &formats[i], sizeof(VAImageFormat));
+                       found = true;
+                       break;
+               }
+       }
+       if (!found) {
+               return;
+       }
+
+       fprintf(stderr, "VA-API JPEG decoding initialized.\n");
+       vaapi_jpeg_decoding_usable = true;
+}
+
+shared_ptr<Frame> decode_jpeg_vaapi(const string &filename)
+{
+       jpeg_decompress_struct dinfo;
+       jpeg_error_mgr jerr;
+       dinfo.err = jpeg_std_error(&jerr);
+       jpeg_create_decompress(&dinfo);
+
+       FILE *fp = fopen(filename.c_str(), "rb");
+       if (fp == nullptr) {
+               perror(filename.c_str());
+               exit(1);
+       }
+       jpeg_stdio_src(&dinfo, fp);
+
+       jpeg_read_header(&dinfo, true);
+
+       // Read the data that comes after the header. VA-API will do the destuffing for us.
+       std::string str((const char *)dinfo.src->next_input_byte, dinfo.src->bytes_in_buffer);
+       while (!feof(fp)) {
+               char buf[4096];
+               size_t ret = fread(buf, 1, sizeof(buf), fp);
+               str.append(buf, ret);
+       }
+       fclose(fp);
+
+       if (dinfo.num_components != 3) {
+               fprintf(stderr, "Not a color JPEG. (%d components, Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+                       dinfo.num_components,
+                       dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+                       dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+                       dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+               return nullptr;
+       }
+       if (dinfo.comp_info[0].h_samp_factor != 2 ||
+           dinfo.comp_info[0].v_samp_factor != 2 ||
+           dinfo.comp_info[1].h_samp_factor != 1 ||
+           dinfo.comp_info[1].v_samp_factor != 2 ||
+           dinfo.comp_info[2].h_samp_factor != 1 ||
+           dinfo.comp_info[2].v_samp_factor != 2) {
+               fprintf(stderr, "Not 4:2:2. (Y=%dx%d, Cb=%dx%d, Cr=%dx%d)\n",
+                       dinfo.comp_info[0].h_samp_factor, dinfo.comp_info[0].v_samp_factor,
+                       dinfo.comp_info[1].h_samp_factor, dinfo.comp_info[1].v_samp_factor,
+                       dinfo.comp_info[2].h_samp_factor, dinfo.comp_info[2].v_samp_factor);
+               return nullptr;
+       }
+
+       // Picture parameters.
+       VAPictureParameterBufferJPEGBaseline pic_param;
+       memset(&pic_param, 0, sizeof(pic_param));
+       pic_param.picture_width = dinfo.image_width;
+       pic_param.picture_height = dinfo.image_height;
+       for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+               const jpeg_component_info *comp = &dinfo.comp_info[component_idx];
+               pic_param.components[component_idx].component_id = comp->component_id;
+               pic_param.components[component_idx].h_sampling_factor = comp->h_samp_factor;
+               pic_param.components[component_idx].v_sampling_factor = comp->v_samp_factor;
+               pic_param.components[component_idx].quantiser_table_selector = comp->quant_tbl_no;
+       }
+       pic_param.num_components = dinfo.num_components;
+       pic_param.color_space = 0;  // YUV.
+       pic_param.rotation = VA_ROTATION_NONE;
+
+       VABufferID pic_param_buffer;
+       VAStatus va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAPictureParameterBufferType, sizeof(pic_param), 1, &pic_param, &pic_param_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+       // Quantization matrices.
+       VAIQMatrixBufferJPEGBaseline iq;
+       memset(&iq, 0, sizeof(iq));
+
+       for (int quant_tbl_idx = 0; quant_tbl_idx < min(4, NUM_QUANT_TBLS); ++quant_tbl_idx) {
+               const JQUANT_TBL *qtbl = dinfo.quant_tbl_ptrs[quant_tbl_idx];
+               if (qtbl == nullptr) {
+                       iq.load_quantiser_table[quant_tbl_idx] = 0;
+               } else {
+                       iq.load_quantiser_table[quant_tbl_idx] = 1;
+                       for (int i = 0; i < 64; ++i) {
+                               if (qtbl->quantval[i] > 255) {
+                                       fprintf(stderr, "Baseline JPEG only!\n");
+                                       return nullptr;
+                               }
+                               iq.quantiser_table[quant_tbl_idx][i] = qtbl->quantval[i];
+                       }
+               }
+       }
+
+       VABufferID iq_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAIQMatrixBufferType, sizeof(iq), 1, &iq, &iq_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+       // Huffman tables (arithmetic is not supported).
+       VAHuffmanTableBufferJPEGBaseline huff;
+       memset(&huff, 0, sizeof(huff));
+
+       for (int huff_tbl_idx = 0; huff_tbl_idx < min(2, NUM_HUFF_TBLS); ++huff_tbl_idx) {
+               const JHUFF_TBL *ac_hufftbl = dinfo.ac_huff_tbl_ptrs[huff_tbl_idx];
+               const JHUFF_TBL *dc_hufftbl = dinfo.dc_huff_tbl_ptrs[huff_tbl_idx];
+               if (ac_hufftbl == nullptr) {
+                       assert(dc_hufftbl == nullptr);
+                       huff.load_huffman_table[huff_tbl_idx] = 0;
+               } else {
+                       assert(dc_hufftbl != nullptr);
+                       huff.load_huffman_table[huff_tbl_idx] = 1;
+
+                       for (int i = 0; i < 16; ++i) {
+                               huff.huffman_table[huff_tbl_idx].num_dc_codes[i] = dc_hufftbl->bits[i + 1];
+                       }
+                       for (int i = 0; i < 12; ++i) {
+                               huff.huffman_table[huff_tbl_idx].dc_values[i] = dc_hufftbl->huffval[i];
+                       }
+                       for (int i = 0; i < 16; ++i) {
+                               huff.huffman_table[huff_tbl_idx].num_ac_codes[i] = ac_hufftbl->bits[i + 1];
+                       }
+                       for (int i = 0; i < 162; ++i) {
+                               huff.huffman_table[huff_tbl_idx].ac_values[i] = ac_hufftbl->huffval[i];
+                       }
+               }
+       }
+
+       VABufferID huff_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VAHuffmanTableBufferType, sizeof(huff), 1, &huff, &huff_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+       // Slice parameters (metadata about the slice).
+       VASliceParameterBufferJPEGBaseline parms;
+       memset(&parms, 0, sizeof(parms));
+       parms.slice_data_size = str.size();
+       parms.slice_data_offset = 0;
+       parms.slice_data_flag = VA_SLICE_DATA_FLAG_ALL; 
+       parms.slice_horizontal_position = 0;
+       parms.slice_vertical_position = 0;
+       for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+               const jpeg_component_info *comp = &dinfo.comp_info[component_idx];
+               parms.components[component_idx].component_selector = comp->component_id;
+               parms.components[component_idx].dc_table_selector = comp->dc_tbl_no;
+               parms.components[component_idx].ac_table_selector = comp->ac_tbl_no;
+               if (parms.components[component_idx].dc_table_selector > 1 ||
+                   parms.components[component_idx].ac_table_selector > 1) {
+                       fprintf(stderr, "Uses too many Huffman tables\n");
+                       return nullptr;
+               }
+       }
+       parms.num_components = dinfo.num_components;
+       parms.restart_interval = dinfo.restart_interval;
+       int horiz_mcus = (dinfo.image_width + (DCTSIZE * 2) - 1) / (DCTSIZE * 2);
+       int vert_mcus = (dinfo.image_height + DCTSIZE - 1) / DCTSIZE;
+       parms.num_mcus = horiz_mcus * vert_mcus;
+
+       VABufferID slice_param_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceParameterBufferType, sizeof(parms), 1, &parms, &slice_param_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+       // The actual data.
+       VABufferID data_buffer;
+       va_status = vaCreateBuffer(va_dpy->va_dpy, config_id, VASliceDataBufferType, str.size(), 1, &str[0], &data_buffer);
+       CHECK_VASTATUS_RET(va_status, "vaCreateBuffer");
+
+       VAResources resources = get_va_resources(dinfo.image_width, dinfo.image_height);
+       ReleaseVAResources release(resources);
+
+       va_status = vaBeginPicture(va_dpy->va_dpy, resources.context, resources.surface);
+       CHECK_VASTATUS_RET(va_status, "vaBeginPicture");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &pic_param_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(pic_param)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &iq_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(iq)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &huff_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(huff)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &slice_param_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(slice_param)");
+       va_status = vaRenderPicture(va_dpy->va_dpy, resources.context, &data_buffer, 1);
+       CHECK_VASTATUS_RET(va_status, "vaRenderPicture(data)");
+       va_status = vaEndPicture(va_dpy->va_dpy, resources.context);
+       CHECK_VASTATUS_RET(va_status, "vaEndPicture");
+
+       // vaDeriveImage() works, but the resulting image seems to live in
+       // uncached memory, which makes copying data out from it very, very slow.
+       // Thanks to FFmpeg for the observation that you can vaGetImage() the
+       // surface onto your own image (although then, it can't be planar, which
+       // is unfortunate for us).
+#if 0
+       VAImage image;
+       va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image);
+       CHECK_VASTATUS_RET(va_status, "vaDeriveImage");
+#else
+       va_status = vaSyncSurface(va_dpy->va_dpy, resources.surface);
+       CHECK_VASTATUS_RET(va_status, "vaSyncSurface");
+
+       va_status = vaGetImage(va_dpy->va_dpy, resources.surface, 0, 0, dinfo.image_width, dinfo.image_height, resources.image.image_id);
+       CHECK_VASTATUS_RET(va_status, "vaGetImage");
+#endif
+
+       void *mapped;
+       va_status = vaMapBuffer(va_dpy->va_dpy, resources.image.buf, &mapped);
+       CHECK_VASTATUS_RET(va_status, "vaMapBuffer");
+
+       shared_ptr<Frame> frame(new Frame);
+#if 0
+       // 4:2:2 planar (for vaDeriveImage).
+       frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+       frame->cb.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+       frame->cr.reset(new uint8_t[(dinfo.image_width / 2) * dinfo.image_height]);
+       for (int component_idx = 0; component_idx < dinfo.num_components; ++component_idx) {
+               uint8_t *dptr;
+               size_t width;
+               if (component_idx == 0) {
+                       dptr = frame->y.get();
+                       width = dinfo.image_width;
+               } else if (component_idx == 1) {
+                       dptr = frame->cb.get();
+                       width = dinfo.image_width / 2;
+               } else if (component_idx == 2) {
+                       dptr = frame->cr.get();
+                       width = dinfo.image_width / 2;
+               } else {
+                       assert(false);
+               }
+               const uint8_t *sptr = (const uint8_t *)mapped + image.offsets[component_idx];
+               size_t spitch = image.pitches[component_idx];
+               for (size_t y = 0; y < dinfo.image_height; ++y) {
+                       memcpy(dptr + y * width, sptr + y * spitch, width);
+               }
+       }
+#else
+       // Split the interleaved UYVY data into separate Y' and interleaved CbCr (semiplanar) planes.
+       frame->is_semiplanar = true;
+       frame->y.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+       frame->cbcr.reset(new uint8_t[dinfo.image_width * dinfo.image_height]);
+       const uint8_t *src = (const uint8_t *)mapped + resources.image.offsets[0];
+       if (resources.image.pitches[0] == dinfo.image_width * 2) {
+               memcpy_interleaved(frame->cbcr.get(), frame->y.get(), src, dinfo.image_width * dinfo.image_height * 2);
+       } else {
+               for (unsigned y = 0; y < dinfo.image_height; ++y) {
+                       memcpy_interleaved(frame->cbcr.get() + y * dinfo.image_width, frame->y.get() + y * dinfo.image_width,
+                                          src + y * resources.image.pitches[0], dinfo.image_width * 2);
+               }
+       }
+#endif
+       frame->width = dinfo.image_width;
+       frame->height = dinfo.image_height;
+       frame->chroma_subsampling_x = 2;
+       frame->chroma_subsampling_y = 1;
+       frame->pitch_y = dinfo.image_width;
+       frame->pitch_chroma = dinfo.image_width / 2;
+
+       va_status = vaUnmapBuffer(va_dpy->va_dpy, resources.image.buf);
+       CHECK_VASTATUS_RET(va_status, "vaUnmapBuffer");
+
+       return frame;
+}
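To make the UYVY-to-semiplanar step above concrete (illustration only, not part of the commit): vaGetImage() hands back packed UYVY, in which the even bytes are chroma (Cb, Cr, Cb, Cr, ...) and the odd bytes are luma, so one memcpy_interleaved() per image (or per row, when the pitch differs from width * 2) splits it into the two planes Frame expects:

#include <cstdint>
#include <cstdio>

#include "memcpy_interleaved.h"

int main()
{
	// One 4-pixel UYVY row: U0 Y0 V0 Y1 U1 Y2 V1 Y3.
	// (alignas(32) keeps this tiny input on the scalar path.)
	alignas(32) static const uint8_t uyvy[8] = { 0x80, 0x10, 0x7f, 0x20, 0x81, 0x30, 0x7e, 0x40 };
	uint8_t cbcr[4], y[4];
	memcpy_interleaved(cbcr, y, uyvy, sizeof(uyvy));
	// cbcr now holds { 0x80, 0x7f, 0x81, 0x7e } (interleaved Cb/Cr pairs),
	// and y holds { 0x10, 0x20, 0x30, 0x40 }.
	for (int i = 0; i < 4; ++i) {
		printf("y[%d] = 0x%02x, cbcr[%d] = 0x%02x\n", i, y[i], i, cbcr[i]);
	}
	return 0;
}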
diff --git a/vaapi_jpeg_decoder.h b/vaapi_jpeg_decoder.h
new file mode 100644
index 0000000..4ab957e
--- /dev/null
+++ b/vaapi_jpeg_decoder.h
@@ -0,0 +1,27 @@
+#ifndef _VAAPI_JPEG_DECODER_H
+#define _VAAPI_JPEG_DECODER_H 1
+
+#include <X11/Xlib.h>
+#include <va/va.h>
+
+#include <string>
+#include <memory>
+
+struct Frame;
+  
+struct VADisplayWithCleanup {
+       ~VADisplayWithCleanup();
+
+       VADisplay va_dpy;
+       Display *x11_display = nullptr;
+       int drm_fd = -1;
+};
+std::unique_ptr<VADisplayWithCleanup> va_open_display(const std::string &va_display);  // Can return nullptr on failure.
+std::string get_usable_va_display();
+
+void init_jpeg_vaapi();
+std::shared_ptr<Frame> decode_jpeg_vaapi(const std::string &filename);
+
+extern bool vaapi_jpeg_decoding_usable;
+  
+#endif  // !defined(_VAAPI_JPEG_DECODER_H)
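Putting the new API together: a hypothetical standalone driver (not part of the commit) that exercises the probe-then-decode flow, assuming a VA-API-capable device is present:

#include <cstdio>
#include <memory>

#include "jpeg_frame.h"
#include "vaapi_jpeg_decoder.h"

int main(int argc, char **argv)
{
	if (argc < 2) {
		fprintf(stderr, "usage: %s FILE.jpg\n", argv[0]);
		return 1;
	}
	init_jpeg_vaapi();  // Probes $DISPLAY first, then /dev/dri/renderD*.
	if (!vaapi_jpeg_decoding_usable) {
		fprintf(stderr, "No VA-API JPEG decoder found; a real caller would fall back to libjpeg here.\n");
		return 1;
	}
	std::shared_ptr<Frame> frame = decode_jpeg_vaapi(argv[1]);
	if (frame == nullptr) {
		fprintf(stderr, "Hardware decode failed (e.g., not a baseline 4:2:2 JPEG).\n");
		return 1;
	}
	printf("Decoded %ux%u image (semiplanar=%d).\n", frame->width, frame->height,
	       int(frame->is_semiplanar));
	return 0;
}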
diff --git a/video_stream.cpp b/video_stream.cpp
index a733d46e6820b3fe32150724cd0eb92aef94cc4c..aa93dc7a0a02499a8e4555bb655adae6bfa1ad0a 100644
--- a/video_stream.cpp
+++ b/video_stream.cpp
@@ -149,11 +149,11 @@ vector<uint8_t> encode_jpeg(const uint8_t *y_data, const uint8_t *cb_data, const
 VideoStream::VideoStream()
 {
        using namespace movit;
-       // TODO: deduplicate code against JPEGFrameView?
-       ycbcr_convert_chain.reset(new EffectChain(1280, 720));
-       ImageFormat image_format;
-       image_format.color_space = COLORSPACE_sRGB;
-       image_format.gamma_curve = GAMMA_sRGB;
+
+       ImageFormat inout_format;
+       inout_format.color_space = COLORSPACE_sRGB;
+       inout_format.gamma_curve = GAMMA_sRGB;
+
        ycbcr_format.luma_coefficients = YCBCR_REC_709;
        ycbcr_format.full_range = true;  // JPEG.
        ycbcr_format.num_levels = 256;
@@ -163,28 +163,33 @@ VideoStream::VideoStream()
        ycbcr_format.cb_y_position = 0.5f;  // Irrelevant.
        ycbcr_format.cr_x_position = 0.0f;
        ycbcr_format.cr_y_position = 0.5f;
-       ycbcr_input = (movit::YCbCrInput *)ycbcr_convert_chain->add_input(new YCbCrInput(image_format, ycbcr_format, 1280, 720));
 
        YCbCrFormat ycbcr_output_format = ycbcr_format;
        ycbcr_output_format.chroma_subsampling_x = 1;
 
-       ImageFormat inout_format;
-       inout_format.color_space = COLORSPACE_sRGB;
-       inout_format.gamma_curve = GAMMA_sRGB;
+       // TODO: deduplicate code against JPEGFrameView?
+       ycbcr_planar_convert_chain.reset(new EffectChain(1280, 720));
+       ycbcr_planar_input = (movit::YCbCrInput *)ycbcr_planar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_PLANAR));
 
-       check_error();
+       // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the
+       // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way
+       // of getting the gray data into a layered texture.
+       ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+       ycbcr_planar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+       ycbcr_planar_convert_chain->set_dither_bits(8);
+       ycbcr_planar_convert_chain->finalize();
+
+       // Same, for semiplanar inputs.
+       ycbcr_semiplanar_convert_chain.reset(new EffectChain(1280, 720));
+       ycbcr_semiplanar_input = (movit::YCbCrInput *)ycbcr_semiplanar_convert_chain->add_input(new YCbCrInput(inout_format, ycbcr_format, 1280, 720, YCBCR_INPUT_SPLIT_Y_AND_CBCR));
 
        // One full Y'CbCr texture (for interpolation), one that's just Y (throwing away the
        // Cb and Cr channels). The second copy is sort of redundant, but it's the easiest way
        // of getting the gray data into a layered texture.
-       ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
-       check_error();
-       ycbcr_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
-       check_error();
-       ycbcr_convert_chain->set_dither_bits(8);
-       check_error();
-       ycbcr_convert_chain->finalize();
-       check_error();
+       ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+       ycbcr_semiplanar_convert_chain->add_ycbcr_output(inout_format, OUTPUT_ALPHA_FORMAT_POSTMULTIPLIED, ycbcr_output_format);
+       ycbcr_semiplanar_convert_chain->set_dither_bits(8);
+       ycbcr_semiplanar_convert_chain->finalize();
 
        GLuint input_tex[num_interpolate_slots], gray_tex[num_interpolate_slots], cb_tex[num_interpolate_slots], cr_tex[num_interpolate_slots];
        glCreateTextures(GL_TEXTURE_2D_ARRAY, 10, input_tex);
@@ -325,16 +330,28 @@ void VideoStream::schedule_interpolated_frame(int64_t output_pts, unsigned strea
                shared_ptr<Frame> frame = decode_jpeg_with_cache(jpeg_id, DECODE_IF_NOT_IN_CACHE, &did_decode);
                ycbcr_format.chroma_subsampling_x = frame->chroma_subsampling_x;
                ycbcr_format.chroma_subsampling_y = frame->chroma_subsampling_y;
-               ycbcr_input->change_ycbcr_format(ycbcr_format);
-               ycbcr_input->set_width(frame->width);
-               ycbcr_input->set_height(frame->height);
-               ycbcr_input->set_pixel_data(0, frame->y.get());
-               ycbcr_input->set_pixel_data(1, frame->cb.get());
-               ycbcr_input->set_pixel_data(2, frame->cr.get());
-               ycbcr_input->set_pitch(0, frame->pitch_y);
-               ycbcr_input->set_pitch(1, frame->pitch_chroma);
-               ycbcr_input->set_pitch(2, frame->pitch_chroma);
-               ycbcr_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720);
+
+               if (frame->is_semiplanar) {
+                       ycbcr_semiplanar_input->change_ycbcr_format(ycbcr_format);
+                       ycbcr_semiplanar_input->set_width(frame->width);
+                       ycbcr_semiplanar_input->set_height(frame->height);
+                       ycbcr_semiplanar_input->set_pixel_data(0, frame->y.get());
+                       ycbcr_semiplanar_input->set_pixel_data(1, frame->cbcr.get());
+                       ycbcr_semiplanar_input->set_pitch(0, frame->pitch_y);
+                       ycbcr_semiplanar_input->set_pitch(1, frame->pitch_chroma);
+                       ycbcr_semiplanar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720);
+               } else {
+                       ycbcr_planar_input->change_ycbcr_format(ycbcr_format);
+                       ycbcr_planar_input->set_width(frame->width);
+                       ycbcr_planar_input->set_height(frame->height);
+                       ycbcr_planar_input->set_pixel_data(0, frame->y.get());
+                       ycbcr_planar_input->set_pixel_data(1, frame->cb.get());
+                       ycbcr_planar_input->set_pixel_data(2, frame->cr.get());
+                       ycbcr_planar_input->set_pitch(0, frame->pitch_y);
+                       ycbcr_planar_input->set_pitch(1, frame->pitch_chroma);
+                       ycbcr_planar_input->set_pitch(2, frame->pitch_chroma);
+                       ycbcr_planar_convert_chain->render_to_fbo(resources.input_fbos[frame_no], 1280, 720);
+               }
        }
 
        glGenerateTextureMipmap(resources.input_tex);
@@ -426,6 +443,7 @@ void VideoStream::encode_thread_func()
                                memcpy(frame->cb.get() + 640 * yy, cb + 640 * (719 - yy), 640);
                                memcpy(frame->cr.get() + 640 * yy, cr + 640 * (719 - yy), 640);
                        }
+                       frame->is_semiplanar = false;
                        frame->width = 1280;
                        frame->height = 720;
                        frame->chroma_subsampling_x = 2;
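A note on the twin add_ycbcr_output() calls above: each finalized chain renders to two color attachments at once, and the input_fbos point those at layers of the GL_TEXTURE_2D_ARRAY textures so that interpolation can sample both the full Y'CbCr image and a luma-only copy from one layered texture. A sketch of how such an FBO could be assembled (assumptions: GL 4.5 DSA entry points and the epoxy loader; the actual input_fbos construction is not shown in this diff):

#include <epoxy/gl.h>

// Hypothetical helper: attach one layer of each of two GL_TEXTURE_2D_ARRAY
// textures as the two color outputs of a single FBO, matching the two
// add_ycbcr_output() calls (attachment 0 = full Y'CbCr, attachment 1 = gray).
GLuint make_dual_output_fbo(GLuint full_tex, GLuint gray_tex, GLint layer)
{
	GLuint fbo;
	glCreateFramebuffers(1, &fbo);
	glNamedFramebufferTextureLayer(fbo, GL_COLOR_ATTACHMENT0, full_tex, /*level=*/0, layer);
	glNamedFramebufferTextureLayer(fbo, GL_COLOR_ATTACHMENT1, gray_tex, /*level=*/0, layer);
	const GLenum bufs[] = { GL_COLOR_ATTACHMENT0, GL_COLOR_ATTACHMENT1 };
	glNamedFramebufferDrawBuffers(fbo, 2, bufs);
	return fbo;
}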
diff --git a/video_stream.h b/video_stream.h
index 2d1e8f12ba90e866a6196881dd570e142f6de388..b86d271e0c473e7ec98874abdec639970ddc31df 100644
--- a/video_stream.h
+++ b/video_stream.h
@@ -82,9 +82,11 @@ private:
 
        // Effectively only converts from 4:2:2 to 4:4:4.
        // TODO: Have a separate version with ResampleEffect, for scaling?
-       std::unique_ptr<movit::EffectChain> ycbcr_convert_chain;
+       std::unique_ptr<movit::EffectChain> ycbcr_planar_convert_chain;
+       std::unique_ptr<movit::EffectChain> ycbcr_semiplanar_convert_chain;
 
-       movit::YCbCrInput *ycbcr_input;
+       movit::YCbCrInput *ycbcr_planar_input;
+       movit::YCbCrInput *ycbcr_semiplanar_input;
        movit::YCbCrFormat ycbcr_format;
 
        // Frame interpolation.