git.sesse.net Git - nageru/blobdiff - h264encode.cpp
Fix an issue in the H264 encoder where a surface could be rendered to while it was...
[nageru] / h264encode.cpp
index d5c9e8c9e70a905cb4bef7ee53144e3c1aa91836..6a402f45765e4932f1c0dcaf57208554d19f6969 100644 (file)
@@ -1,6 +1,7 @@
 //#include "sysdeps.h"
 #include "h264encode.h"
 
+#include <movit/util.h>
 #include <EGL/eglplatform.h>
 #include <X11/X.h>
 #include <X11/Xlib.h>
@@ -18,7 +19,9 @@ extern "C" {
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <fcntl.h>
 #include <va/va.h>
+#include <va/va_drm.h>
 #include <va/va_drmcommon.h>
 #include <va/va_enc_h264.h>
 #include <va/va_x11.h>
@@ -49,6 +52,8 @@ class QSurface;
         exit(1);                                                        \
     }
 
+#define BUFFER_OFFSET(i) ((char *)NULL + (i))
+
 //#include "loadsurface.h"
 
 #define NAL_REF_IDC_NONE        0
@@ -79,6 +84,8 @@ class QSurface;
    
 #define BITSTREAM_ALLOCATE_STEPPING     4096
 #define SURFACE_NUM 16 /* 16 surfaces for source YUV */
+#define MAX_NUM_REF1 16 // Seemingly a hardware-fixed value, not related to SURFACE_NUM
+#define MAX_NUM_REF2 32 // Seemingly a hardware-fixed value, not related to SURFACE_NUM
 
 static constexpr unsigned int MaxFrameNum = (2<<16);
 static constexpr unsigned int MaxPicOrderCntLsb = (2<<8);
@@ -108,9 +115,9 @@ using namespace std;
 
 class H264EncoderImpl {
 public:
-       H264EncoderImpl(QSurface *surface, int width, int height, HTTPD *httpd);
+       H264EncoderImpl(QSurface *surface, const string &va_display, int width, int height, HTTPD *httpd);
        ~H264EncoderImpl();
-       void add_audio(int64_t pts, vector<float> audio);  // Needs to come before end_frame() of same pts.
+       void add_audio(int64_t pts, vector<float> audio);
        bool begin_frame(GLuint *y_tex, GLuint *cbcr_tex);
        void end_frame(RefCountedGLsync fence, int64_t pts, const vector<RefCountedFrame> &input_frames);
        void shutdown();
@@ -147,9 +154,9 @@ private:
        void slice_header(bitstream *bs);
        int build_packed_seq_buffer(unsigned char **header_buffer);
        int build_packed_slice_buffer(unsigned char **header_buffer);
-       int init_va();
+       int init_va(const string &va_display);
        int deinit_va();
-       VADisplay va_open_display(void);
+       VADisplay va_open_display(const string &va_display);
        void va_close_display(VADisplay va_dpy);
        int setup_encode();
        int release_encode();
@@ -157,6 +164,8 @@ private:
        int update_RefPicList(int frame_type);
 
        bool is_shutdown = false;
+       bool use_zerocopy;
+       int drm_fd = -1;
 
        thread encode_thread, storage_thread;
 
@@ -179,8 +188,7 @@ private:
        AVCodecContext *context_audio;
        HTTPD *httpd;
 
-       Display *x11_display;
-       Window x11_window;
+       Display *x11_display = nullptr;
 
        // Encoder parameters
        VADisplay va_dpy;
@@ -194,7 +202,14 @@ private:
 
                VAImage surface_image;
                GLuint y_tex, cbcr_tex;
+
+               // Only if use_zerocopy == true.
                EGLImage y_egl_image, cbcr_egl_image;
+
+               // Only if use_zerocopy == false.
+               GLuint pbo;
+               uint8_t *y_ptr, *cbcr_ptr;
+               size_t y_offset, cbcr_offset;
        };
        GLSurface gl_surfaces[SURFACE_NUM];
 
@@ -204,7 +219,7 @@ private:
        VAEncPictureParameterBufferH264 pic_param;
        VAEncSliceParameterBufferH264 slice_param;
        VAPictureH264 CurrentCurrPic;
-       VAPictureH264 ReferenceFrames[16], RefPicList0_P[32], RefPicList0_B[32], RefPicList1_B[32];
+       VAPictureH264 ReferenceFrames[MAX_NUM_REF1], RefPicList0_P[MAX_NUM_REF2], RefPicList0_B[MAX_NUM_REF2], RefPicList1_B[MAX_NUM_REF2];
 
        // Static quality settings.
        static constexpr unsigned int frame_bitrate = 15000000 / 60;  // Doesn't really matter; only initial_qp does.
@@ -293,7 +308,11 @@ bitstream_put_ui(bitstream *bs, unsigned int val, int size_in_bits)
         bs->buffer[pos] = (bs->buffer[pos] << size_in_bits | val);
     } else {
         size_in_bits -= bit_left;
-        bs->buffer[pos] = (bs->buffer[pos] << bit_left) | (val >> size_in_bits);
+        if (bit_left >= 32) {
+            bs->buffer[pos] = (val >> size_in_bits);
+        } else {
+            bs->buffer[pos] = (bs->buffer[pos] << bit_left) | (val >> size_in_bits);
+        }
         bs->buffer[pos] = va_swap32(bs->buffer[pos]);
 
         if (pos + 1 == bs->max_size_in_dword) {
@@ -827,31 +846,47 @@ static const char *rc_to_string(int rc_mode)
     }
 }
 
-VADisplay H264EncoderImpl::va_open_display(void)
+VADisplay H264EncoderImpl::va_open_display(const string &va_display)  // Opens the VA-API display selected by va_display and records in use_zerocopy which path was chosen; returns NULL if the X connection or the device file cannot be opened.
 {
-    x11_display = XOpenDisplay(NULL);
-    if (!x11_display) {
-        fprintf(stderr, "error: can't connect to X server!\n");
-        return NULL;
-    }
-    return vaGetDisplay(x11_display);
+       if (va_display.empty()) {  // Empty string: let Xlib pick the display (XOpenDisplay(NULL)).
+               x11_display = XOpenDisplay(NULL);
+               if (!x11_display) {
+                       fprintf(stderr, "error: can't connect to X server!\n");
+                       return NULL;
+               }
+               use_zerocopy = true;  // X11-backed displays take the zero-copy path.
+               return vaGetDisplay(x11_display);
+       } else if (va_display[0] != '/') {  // Anything not starting with '/' is passed to Xlib as an X display name.
+               x11_display = XOpenDisplay(va_display.c_str());
+               if (!x11_display) {
+                       fprintf(stderr, "error: can't connect to X server!\n");
+                       return NULL;
+               }
+               use_zerocopy = true;
+               return vaGetDisplay(x11_display);
+       } else {  // A leading '/' means a filesystem path, opened read/write and handed to vaGetDisplayDRM().
+               drm_fd = open(va_display.c_str(), O_RDWR);
+               if (drm_fd == -1) {
+                       perror(va_display.c_str());  // Prints the path followed by the errno description.
+                       return NULL;
+               }
+               use_zerocopy = false;  // DRM-backed displays take the non-zero-copy path.
+               return vaGetDisplayDRM(drm_fd);
+       }
 }
 
 void H264EncoderImpl::va_close_display(VADisplay va_dpy)  // Releases whichever of the X11 connection and the DRM file descriptor is currently held; the va_dpy argument is not used in this body.
 {
-    if (!x11_display)
-        return;
-
-    if (x11_window) {
-        XUnmapWindow(x11_display, x11_window);
-        XDestroyWindow(x11_display, x11_window);
-        x11_window = None;
-    }
-    XCloseDisplay(x11_display);
-    x11_display = NULL;
+       if (x11_display) {  // Set only when va_open_display() went through XOpenDisplay().
+               XCloseDisplay(x11_display);
+               x11_display = nullptr;  // Cleared so that a repeated call skips this branch.
+       }
+       if (drm_fd != -1) {  // -1 is the "no DRM device open" value.
+               close(drm_fd);  // NOTE(review): drm_fd is not reset to -1 here, so a second call would close the same fd number again.
+       }
 }
 
-int H264EncoderImpl::init_va()
+int H264EncoderImpl::init_va(const string &va_display)
 {
     VAProfile profile_list[]={VAProfileH264High, VAProfileH264Main, VAProfileH264Baseline, VAProfileH264ConstrainedBaseline};
     VAEntrypoint *entrypoints;
@@ -861,7 +896,7 @@ int H264EncoderImpl::init_va()
     VAStatus va_status;
     unsigned int i;
 
-    va_dpy = va_open_display();
+    va_dpy = va_open_display(va_display);
     va_status = vaInitialize(va_dpy, &major_ver, &minor_ver);
     CHECK_VASTATUS(va_status, "vaInitialize");
 
@@ -890,7 +925,9 @@ int H264EncoderImpl::init_va()
     }
     
     if (support_encode == 0) {
-        printf("Can't find VAEntrypointEncSlice for H264 profiles\n");
+        printf("Can't find VAEntrypointEncSlice for H264 profiles. If you are using a non-Intel GPU\n");
+        printf("but have one in your system, try launching Nageru with --va-display /dev/dri/renderD128\n");
+        printf("to use VA-API against DRM instead of X11.\n");
         exit(1);
     } else {
         switch (h264_profile) {
@@ -1060,6 +1097,28 @@ int H264EncoderImpl::setup_encode()
     for (i = 0; i < SURFACE_NUM; i++) {
         glGenTextures(1, &gl_surfaces[i].y_tex);
         glGenTextures(1, &gl_surfaces[i].cbcr_tex);
+
+        if (!use_zerocopy) {
+            // Create Y image.
+            glBindTexture(GL_TEXTURE_2D, gl_surfaces[i].y_tex);
+            glTexStorage2D(GL_TEXTURE_2D, 1, GL_R8, frame_width, frame_height);
+
+            // Create CbCr image.
+            glBindTexture(GL_TEXTURE_2D, gl_surfaces[i].cbcr_tex);
+            glTexStorage2D(GL_TEXTURE_2D, 1, GL_RG8, frame_width / 2, frame_height / 2);
+
+            // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API
+            // buffers, due to potentially differing pitch.
+            glGenBuffers(1, &gl_surfaces[i].pbo);
+            glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
+            glBufferStorage(GL_PIXEL_PACK_BUFFER, frame_width * frame_height * 2, nullptr, GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT);
+            uint8_t *ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, frame_width * frame_height * 2, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT);
+            gl_surfaces[i].y_offset = 0;
+            gl_surfaces[i].cbcr_offset = frame_width * frame_height;
+            gl_surfaces[i].y_ptr = ptr + gl_surfaces[i].y_offset;
+            gl_surfaces[i].cbcr_ptr = ptr + gl_surfaces[i].cbcr_offset;
+            glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+        }
     }
 
     for (i = 0; i < SURFACE_NUM; i++) {
@@ -1241,7 +1300,7 @@ int H264EncoderImpl::render_picture(int frame_type, int display_frame_num, int g
     CurrentCurrPic = pic_param.CurrPic;
 
     memcpy(pic_param.ReferenceFrames, ReferenceFrames, numShortTerm*sizeof(VAPictureH264));
-    for (i = numShortTerm; i < SURFACE_NUM; i++) {
+    for (i = numShortTerm; i < MAX_NUM_REF1; i++) {
         pic_param.ReferenceFrames[i].picture_id = VA_INVALID_SURFACE;
         pic_param.ReferenceFrames[i].flags = VA_PICTURE_H264_INVALID;
     }
@@ -1391,7 +1450,7 @@ int H264EncoderImpl::render_slice(int encoding_frame_num, int display_frame_num,
         int refpiclist0_max = h264_maxref & 0xffff;
         memcpy(slice_param.RefPicList0, RefPicList0_P, refpiclist0_max*sizeof(VAPictureH264));
 
-        for (i = refpiclist0_max; i < 32; i++) {
+        for (i = refpiclist0_max; i < MAX_NUM_REF2; i++) {
             slice_param.RefPicList0[i].picture_id = VA_INVALID_SURFACE;
             slice_param.RefPicList0[i].flags = VA_PICTURE_H264_INVALID;
         }
@@ -1400,13 +1459,13 @@ int H264EncoderImpl::render_slice(int encoding_frame_num, int display_frame_num,
         int refpiclist1_max = (h264_maxref >> 16) & 0xffff;
 
         memcpy(slice_param.RefPicList0, RefPicList0_B, refpiclist0_max*sizeof(VAPictureH264));
-        for (i = refpiclist0_max; i < 32; i++) {
+        for (i = refpiclist0_max; i < MAX_NUM_REF2; i++) {
             slice_param.RefPicList0[i].picture_id = VA_INVALID_SURFACE;
             slice_param.RefPicList0[i].flags = VA_PICTURE_H264_INVALID;
         }
 
         memcpy(slice_param.RefPicList1, RefPicList1_B, refpiclist1_max*sizeof(VAPictureH264));
-        for (i = refpiclist1_max; i < 32; i++) {
+        for (i = refpiclist1_max; i < MAX_NUM_REF2; i++) {
             slice_param.RefPicList1[i].picture_id = VA_INVALID_SURFACE;
             slice_param.RefPicList1[i].flags = VA_PICTURE_H264_INVALID;
         }
@@ -1440,7 +1499,7 @@ void H264EncoderImpl::save_codeddata(storage_task task)
 
     string data;
 
-    const int64_t global_delay = (ip_period - 1) * (TIMEBASE / MAX_FPS);  // So we never get negative dts.
+    const int64_t global_delay = int64_t(ip_period - 1) * (TIMEBASE / MAX_FPS);  // So we never get negative dts.
 
     va_status = vaMapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf, (void **)(&buf_list));
     CHECK_VASTATUS(va_status, "vaMapBuffer");
@@ -1481,7 +1540,7 @@ void H264EncoderImpl::save_codeddata(storage_task task)
              pending_audio_frames.erase(it); 
         }
 
-        AVFrame *frame = avcodec_alloc_frame();
+        AVFrame *frame = av_frame_alloc();
         frame->nb_samples = audio.size() / 2;
         frame->format = AV_SAMPLE_FMT_S32;
         frame->channel_layout = AV_CH_LAYOUT_STEREO;
@@ -1513,7 +1572,7 @@ void H264EncoderImpl::save_codeddata(storage_task task)
             httpd->add_packet(pkt, audio_pts + global_delay, audio_pts + global_delay);
         }
         // TODO: Delayed frames.
-        avcodec_free_frame(&frame);
+        av_frame_unref(frame);
         av_free_packet(&pkt);
         if (audio_pts == task.pts) break;
     }
@@ -1544,7 +1603,6 @@ void H264EncoderImpl::storage_task_enqueue(storage_task task)
 {
        unique_lock<mutex> lock(storage_task_queue_mutex);
        storage_task_queue.push(move(task));
-       srcsurface_status[task.display_order % SURFACE_NUM] = SRC_SURFACE_IN_ENCODING;
        storage_task_queue_changed.notify_all();
 }
 
@@ -1578,18 +1636,25 @@ void H264EncoderImpl::storage_task_thread()
 
 int H264EncoderImpl::release_encode()  // Destroys the per-slot VA-API and OpenGL objects, then the VA context and config; always returns 0.
 {
-    int i;
-    
-    for (i = 0; i < SURFACE_NUM; i++) {
-        vaDestroyBuffer(va_dpy, gl_surfaces[i].coded_buf);
-        vaDestroySurfaces(va_dpy, &gl_surfaces[i].src_surface, 1);
-        vaDestroySurfaces(va_dpy, &gl_surfaces[i].ref_surface, 1);
-    }
-    
-    vaDestroyContext(va_dpy, context_id);
-    vaDestroyConfig(va_dpy, config_id);
+       for (unsigned i = 0; i < SURFACE_NUM; i++) {
+               vaDestroyBuffer(va_dpy, gl_surfaces[i].coded_buf);
+               vaDestroySurfaces(va_dpy, &gl_surfaces[i].src_surface, 1);
+               vaDestroySurfaces(va_dpy, &gl_surfaces[i].ref_surface, 1);
+
+               if (!use_zerocopy) {  // The PBO is only created on the non-zero-copy path.
+                       glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo);
+                       glUnmapBuffer(GL_PIXEL_PACK_BUFFER);  // Drops the mapping made with glMapBufferRange() in setup_encode().
+                       glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+                       glDeleteBuffers(1, &gl_surfaces[i].pbo);
+               }
+               glDeleteTextures(1, &gl_surfaces[i].y_tex);  // The textures are generated in both modes, so they are deleted unconditionally.
+               glDeleteTextures(1, &gl_surfaces[i].cbcr_tex);
+       }
 
-    return 0;
+       vaDestroyContext(va_dpy, context_id);
+       vaDestroyConfig(va_dpy, config_id);
+
+       return 0;
 }
 
 int H264EncoderImpl::deinit_va()
@@ -1602,7 +1667,7 @@ int H264EncoderImpl::deinit_va()
 }
 
 
-H264EncoderImpl::H264EncoderImpl(QSurface *surface, int width, int height, HTTPD *httpd)
+H264EncoderImpl::H264EncoderImpl(QSurface *surface, const string &va_display, int width, int height, HTTPD *httpd)
        : current_storage_frame(0), surface(surface), httpd(httpd)
 {
        AVCodec *codec_audio = avcodec_find_encoder(AUDIO_OUTPUT_CODEC);
@@ -1625,7 +1690,7 @@ H264EncoderImpl::H264EncoderImpl(QSurface *surface, int width, int height, HTTPD
 
        //print_input();
 
-       init_va();
+       init_va(va_display);
        setup_encode();
 
        // No frames are ready yet.
@@ -1661,7 +1726,12 @@ bool H264EncoderImpl::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
        {
                // Wait until this frame slot is done encoding.
                unique_lock<mutex> lock(storage_task_queue_mutex);
+               if (srcsurface_status[current_storage_frame % SURFACE_NUM] != SRC_SURFACE_FREE) {
+                       fprintf(stderr, "Warning: Slot %d (for frame %d) is still encoding, rendering has to wait for H.264 encoder\n",
+                               current_storage_frame % SURFACE_NUM, current_storage_frame);
+               }
                storage_task_queue_changed.wait(lock, [this]{ return storage_thread_should_quit || (srcsurface_status[current_storage_frame % SURFACE_NUM] == SRC_SURFACE_FREE); });
+               srcsurface_status[current_storage_frame % SURFACE_NUM] = SRC_SURFACE_IN_ENCODING;
                if (storage_thread_should_quit) return false;
        }
 
@@ -1670,52 +1740,53 @@ bool H264EncoderImpl::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
        *y_tex = surf->y_tex;
        *cbcr_tex = surf->cbcr_tex;
 
-       VASurfaceID surface = surf->src_surface;
-        VAStatus va_status = vaDeriveImage(va_dpy, surface, &surf->surface_image);
-        CHECK_VASTATUS(va_status, "vaDeriveImage");
-
-       VABufferInfo buf_info;
-       buf_info.mem_type = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;  // or VA_SURFACE_ATTRIB_MEM_TYPE_KERNEL_DRM?
-       va_status = vaAcquireBufferHandle(va_dpy, surf->surface_image.buf, &buf_info);
-        CHECK_VASTATUS(va_status, "vaAcquireBufferHandle");
-
-       // Create Y image.
-       surf->y_egl_image = EGL_NO_IMAGE_KHR;
-       EGLint y_attribs[] = {
-               EGL_WIDTH, frame_width,
-               EGL_HEIGHT, frame_height,
-               EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('R', '8', ' ', ' '),
-               EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
-               EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[0]),
-               EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[0]),
-               EGL_NONE
-       };
-
-       surf->y_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, y_attribs);
-       assert(surf->y_egl_image != EGL_NO_IMAGE_KHR);
-
-       // Associate Y image to a texture.
-       glBindTexture(GL_TEXTURE_2D, *y_tex);
-       glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->y_egl_image);
-
-       // Create CbCr image.
-       surf->cbcr_egl_image = EGL_NO_IMAGE_KHR;
-       EGLint cbcr_attribs[] = {
-               EGL_WIDTH, frame_width,
-               EGL_HEIGHT, frame_height,
-               EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('G', 'R', '8', '8'),
-               EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
-               EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[1]),
-               EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[1]),
-               EGL_NONE
-       };
-
-       surf->cbcr_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, cbcr_attribs);
-       assert(surf->cbcr_egl_image != EGL_NO_IMAGE_KHR);
-
-       // Associate CbCr image to a texture.
-       glBindTexture(GL_TEXTURE_2D, *cbcr_tex);
-       glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image);
+       VAStatus va_status = vaDeriveImage(va_dpy, surf->src_surface, &surf->surface_image);
+       CHECK_VASTATUS(va_status, "vaDeriveImage");
+
+       if (use_zerocopy) {
+               VABufferInfo buf_info;
+               buf_info.mem_type = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME;  // or VA_SURFACE_ATTRIB_MEM_TYPE_KERNEL_DRM?
+               va_status = vaAcquireBufferHandle(va_dpy, surf->surface_image.buf, &buf_info);
+               CHECK_VASTATUS(va_status, "vaAcquireBufferHandle");
+
+               // Create Y image.
+               surf->y_egl_image = EGL_NO_IMAGE_KHR;
+               EGLint y_attribs[] = {
+                       EGL_WIDTH, frame_width,
+                       EGL_HEIGHT, frame_height,
+                       EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('R', '8', ' ', ' '),
+                       EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
+                       EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[0]),
+                       EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[0]),
+                       EGL_NONE
+               };
+
+               surf->y_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, y_attribs);
+               assert(surf->y_egl_image != EGL_NO_IMAGE_KHR);
+
+               // Associate Y image to a texture.
+               glBindTexture(GL_TEXTURE_2D, *y_tex);
+               glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->y_egl_image);
+
+               // Create CbCr image.
+               surf->cbcr_egl_image = EGL_NO_IMAGE_KHR;
+               EGLint cbcr_attribs[] = {
+                       EGL_WIDTH, frame_width,
+                       EGL_HEIGHT, frame_height,
+                       EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('G', 'R', '8', '8'),
+                       EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle),
+                       EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[1]),
+                       EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[1]),
+                       EGL_NONE
+               };
+
+               surf->cbcr_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, cbcr_attribs);
+               assert(surf->cbcr_egl_image != EGL_NO_IMAGE_KHR);
+
+               // Associate CbCr image to a texture.
+               glBindTexture(GL_TEXTURE_2D, *cbcr_tex);
+               glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image);
+       }
 
        return true;
 }
@@ -1733,6 +1804,37 @@ void H264EncoderImpl::add_audio(int64_t pts, vector<float> audio)
 void H264EncoderImpl::end_frame(RefCountedGLsync fence, int64_t pts, const vector<RefCountedFrame> &input_frames)
 {
        assert(!is_shutdown);
+
+       if (!use_zerocopy) {
+               GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM];
+
+               glPixelStorei(GL_PACK_ROW_LENGTH, 0);
+               check_error();
+
+               glBindBuffer(GL_PIXEL_PACK_BUFFER, surf->pbo);
+               check_error();
+
+               glBindTexture(GL_TEXTURE_2D, surf->y_tex);
+               check_error();
+               glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->y_offset));
+               check_error();
+
+               glBindTexture(GL_TEXTURE_2D, surf->cbcr_tex);
+               check_error();
+               glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->cbcr_offset));
+               check_error();
+
+               glBindTexture(GL_TEXTURE_2D, 0);
+               check_error();
+               glBindBuffer(GL_PIXEL_PACK_BUFFER, 0);
+               check_error();
+
+               glMemoryBarrier(GL_TEXTURE_UPDATE_BARRIER_BIT | GL_CLIENT_MAPPED_BUFFER_BARRIER_BIT);
+               check_error();
+               fence = RefCountedGLsync(GL_SYNC_GPU_COMMANDS_COMPLETE, /*flags=*/0);
+               check_error();
+       }
+
        {
                unique_lock<mutex> lock(frame_queue_mutex);
                pending_video_frames[current_storage_frame] = PendingFrame{ fence, input_frames, pts };
@@ -1832,28 +1934,65 @@ void H264EncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num, int g
        }
 }
 
+namespace {
+
+void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_t dst_pitch, size_t height)  // Copies height rows of src_width bytes each from the tightly packed src into dst, whose rows start dst_pitch bytes apart.
+{
+       if (src_width == dst_pitch) {  // Both layouts are identical, so one bulk copy covers every row.
+               memcpy(dst, src, src_width * height);
+       } else {  // Otherwise copy row by row; only src_width bytes of each destination row are written.
+               for (size_t y = 0; y < height; ++y) {
+                       const uint8_t *sptr = src + y * src_width;  // Source rows are back to back.
+                       uint8_t *dptr = dst + y * dst_pitch;  // Destination rows are spaced by the pitch.
+                       memcpy(dptr, sptr, src_width);
+               }
+       }
+}
+
+}  // namespace
+
 void H264EncoderImpl::encode_frame(H264EncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
                                    int frame_type, int64_t pts, int64_t dts)
 {
        // Wait for the GPU to be done with the frame.
-       glClientWaitSync(frame.fence.get(), 0, 0);
+       GLenum sync_status;
+       do {
+               sync_status = glClientWaitSync(frame.fence.get(), 0, 1000000000);
+               check_error();
+       } while (sync_status == GL_TIMEOUT_EXPIRED);
+       assert(sync_status != GL_WAIT_FAILED);
 
        // Release back any input frames we needed to render this frame.
        frame.input_frames.clear();
 
-       // Unmap the image.
        GLSurface *surf = &gl_surfaces[display_frame_num % SURFACE_NUM];
-       eglDestroyImageKHR(eglGetCurrentDisplay(), surf->y_egl_image);
-       eglDestroyImageKHR(eglGetCurrentDisplay(), surf->cbcr_egl_image);
-       VAStatus va_status = vaReleaseBufferHandle(va_dpy, surf->surface_image.buf);
-       CHECK_VASTATUS(va_status, "vaReleaseBufferHandle");
+       VAStatus va_status;
+
+       if (use_zerocopy) {
+               eglDestroyImageKHR(eglGetCurrentDisplay(), surf->y_egl_image);
+               eglDestroyImageKHR(eglGetCurrentDisplay(), surf->cbcr_egl_image);
+               va_status = vaReleaseBufferHandle(va_dpy, surf->surface_image.buf);
+               CHECK_VASTATUS(va_status, "vaReleaseBufferHandle");
+       } else {
+               unsigned char *surface_p = nullptr;
+               vaMapBuffer(va_dpy, surf->surface_image.buf, (void **)&surface_p);
+
+               unsigned char *va_y_ptr = (unsigned char *)surface_p + surf->surface_image.offsets[0];
+               memcpy_with_pitch(va_y_ptr, surf->y_ptr, frame_width, surf->surface_image.pitches[0], frame_height);
+
+               unsigned char *va_cbcr_ptr = (unsigned char *)surface_p + surf->surface_image.offsets[1];
+               memcpy_with_pitch(va_cbcr_ptr, surf->cbcr_ptr, (frame_width / 2) * sizeof(uint16_t), surf->surface_image.pitches[1], frame_height / 2);
+
+               va_status = vaUnmapBuffer(va_dpy, surf->surface_image.buf);
+               CHECK_VASTATUS(va_status, "vaUnmapBuffer");
+       }
+
        va_status = vaDestroyImage(va_dpy, surf->surface_image.image_id);
        CHECK_VASTATUS(va_status, "vaDestroyImage");
 
-       VASurfaceID surface = surf->src_surface;
-
        // Schedule the frame for encoding.
-       va_status = vaBeginPicture(va_dpy, context_id, surface);
+       VASurfaceID va_surface = surf->src_surface;
+       va_status = vaBeginPicture(va_dpy, context_id, va_surface);
        CHECK_VASTATUS(va_status, "vaBeginPicture");
 
        if (frame_type == FRAME_IDR) {
@@ -1885,8 +2024,8 @@ void H264EncoderImpl::encode_frame(H264EncoderImpl::PendingFrame frame, int enco
 }
 
 // Proxy object.
-H264Encoder::H264Encoder(QSurface *surface, int width, int height, HTTPD *httpd)
-       : impl(new H264EncoderImpl(surface, width, height, httpd)) {}
+H264Encoder::H264Encoder(QSurface *surface, const string &va_display, int width, int height, HTTPD *httpd)  // Does nothing itself beyond constructing the implementation object.
+       : impl(new H264EncoderImpl(surface, va_display, width, height, httpd)) {}  // All arguments are forwarded unchanged, in the same order.
 
 // Must be defined here, after the H264EncoderImpl class definition, because the unique_ptr<> destructor needs to know the impl.
 H264Encoder::~H264Encoder() {}