X-Git-Url: https://git.sesse.net/?p=nageru;a=blobdiff_plain;f=quicksync_encoder.cpp;h=f664157b3d4ff67158a537ac24eb9c734a60c751;hp=41ace54263d1d235e8a02c4a905ac9fc34e8f3c0;hb=4e3c52ba57c4552a969e71ccdefd9941ce8d6290;hpb=3f34da3ebb9a6fd1ed267f9186d17433321a9214 diff --git a/quicksync_encoder.cpp b/quicksync_encoder.cpp index 41ace54..f664157 100644 --- a/quicksync_encoder.cpp +++ b/quicksync_encoder.cpp @@ -1,52 +1,89 @@ -//#include "sysdeps.h" #include "quicksync_encoder.h" +#include +#include // Must be above the Xlib includes. #include + #include -#include #include #include #include -#include +#include +#include +#include #include #include #include -#include +#include #include #include #include #include #include #include +#include #include +#include #include +#include #include #include #include #include +#include #include #include #include +extern "C" { + +#include +#include +#include +#include + +} // namespace + #include "audio_encoder.h" #include "context.h" #include "defs.h" +#include "disk_space_estimator.h" +#include "ffmpeg_raii.h" #include "flags.h" #include "mux.h" +#include "print_latency.h" +#include "quicksync_encoder_impl.h" +#include "ref_counted_frame.h" #include "timebase.h" #include "x264_encoder.h" +using namespace movit; using namespace std; +using namespace std::chrono; +using namespace std::placeholders; class QOpenGLContext; class QSurface; +namespace { + +// These need to survive several QuickSyncEncoderImpl instances, +// so they are outside. +once_flag quick_sync_metrics_inited; +LatencyHistogram mixer_latency_histogram, qs_latency_histogram; +MuxMetrics current_file_mux_metrics, total_mux_metrics; +std::atomic metric_current_file_start_time_seconds{0.0 / 0.0}; +std::atomic metric_quick_sync_stalled_frames{0}; + +} // namespace + #define CHECK_VASTATUS(va_status, func) \ if (va_status != VA_STATUS_SUCCESS) { \ fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \ exit(1); \ } +#undef BUFFER_OFFSET #define BUFFER_OFFSET(i) ((char *)NULL + (i)) //#include "loadsurface.h" @@ -78,268 +115,14 @@ class QSurface; #define PROFILE_IDC_HIGH 100 #define BITSTREAM_ALLOCATE_STEPPING 4096 -#define SURFACE_NUM 16 /* 16 surfaces for source YUV */ -#define MAX_NUM_REF1 16 // Seemingly a hardware-fixed value, not related to SURFACE_NUM -#define MAX_NUM_REF2 32 // Seemingly a hardware-fixed value, not related to SURFACE_NUM static constexpr unsigned int MaxFrameNum = (2<<16); static constexpr unsigned int MaxPicOrderCntLsb = (2<<8); static constexpr unsigned int Log2MaxFrameNum = 16; static constexpr unsigned int Log2MaxPicOrderCntLsb = 8; -static constexpr int rc_default_modes[] = { // Priority list of modes. - VA_RC_VBR, - VA_RC_CQP, - VA_RC_VBR_CONSTRAINED, - VA_RC_CBR, - VA_RC_VCM, - VA_RC_NONE, -}; - -/* thread to save coded data */ -#define SRC_SURFACE_FREE 0 -#define SRC_SURFACE_IN_ENCODING 1 - -struct __bitstream { - unsigned int *buffer; - int bit_offset; - int max_size_in_dword; -}; -typedef struct __bitstream bitstream; using namespace std; -// H.264 video comes out in encoding order (e.g. with two B-frames: -// 0, 3, 1, 2, 6, 4, 5, etc.), but uncompressed video needs to -// come in the right order. Since we do everything, including waiting -// for the frames to come out of OpenGL, in encoding order, we need -// a reordering buffer for uncompressed frames so that they come out -// correctly. 
We go the super-lazy way of not making it understand
-// anything about the true order (which introduces some extra latency,
-// though); we know that for N B-frames we need at most (N-1) frames
-// in the reorder buffer, and can just sort on that.
-//
-// The class also deals with keeping a freelist as needed.
-class FrameReorderer {
-public:
-	FrameReorderer(unsigned queue_length, int width, int height);
-
-	struct Frame {
-		int64_t pts, duration;
-		uint8_t *data;
-
-		// Invert to get the smallest pts first.
-		bool operator< (const Frame &other) const { return pts > other.pts; }
-	};
-
-	// Returns the next frame to insert with its pts, if any. Otherwise -1 and nullptr.
-	// Does _not_ take ownership of data; a copy is taken if needed.
-	// The returned pointer is valid until the next call to reorder_frame, or destruction.
-	// As a special case, if queue_length == 0, will just return pts and data (no reordering needed).
-	Frame reorder_frame(int64_t pts, int64_t duration, uint8_t *data);
-
-	// The same as reorder_frame, but without inserting anything. Used to empty the queue.
-	Frame get_first_frame();
-
-	bool empty() const { return frames.empty(); }
-
-private:
-	unsigned queue_length;
-	int width, height;
-
-	priority_queue<Frame> frames;
-	stack<uint8_t *> freelist;  // Includes the last value returned from reorder_frame.
-
-	// Owns all the pointers. Normally, freelist and frames could do this themselves,
-	// except priority_queue doesn't work well with movable-only types.
-	vector<unique_ptr<uint8_t[]>> owner;
-};
-
-FrameReorderer::FrameReorderer(unsigned queue_length, int width, int height)
-	: queue_length(queue_length), width(width), height(height)
-{
-	for (unsigned i = 0; i < queue_length; ++i) {
-		owner.emplace_back(new uint8_t[width * height * 2]);
-		freelist.push(owner.back().get());
-	}
-}
-
-FrameReorderer::Frame FrameReorderer::reorder_frame(int64_t pts, int64_t duration, uint8_t *data)
-{
-	if (queue_length == 0) {
-		return Frame{pts, duration, data};
-	}
-
-	assert(!freelist.empty());
-	uint8_t *storage = freelist.top();
-	freelist.pop();
-	memcpy(storage, data, width * height * 2);
-	frames.push(Frame{pts, duration, storage});
-
-	if (frames.size() >= queue_length) {
-		return get_first_frame();
-	} else {
-		return Frame{-1, -1, nullptr};
-	}
-}
-
-FrameReorderer::Frame FrameReorderer::get_first_frame()
-{
-	assert(!frames.empty());
-	Frame storage = frames.top();
-	frames.pop();
-	freelist.push(storage.data);
-	return storage;
-}
-
-class QuickSyncEncoderImpl {
-public:
-	QuickSyncEncoderImpl(const std::string &filename, QSurface *surface, const string &va_display, int width, int height, Mux *stream_mux, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder);
-	~QuickSyncEncoderImpl();
-	void add_audio(int64_t pts, vector<float> audio);
-	bool begin_frame(GLuint *y_tex, GLuint *cbcr_tex);
-	RefCountedGLsync end_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames);
-	void shutdown();
-
-private:
-	struct storage_task {
-		unsigned long long display_order;
-		int frame_type;
-		vector<float> audio;
-		int64_t pts, dts, duration;
-	};
-	struct PendingFrame {
-		RefCountedGLsync fence;
-		vector<RefCountedFrame> input_frames;
-		int64_t pts, duration;
-	};
-
-	// So we never get negative dts.
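-	// (With the default ip_period of 3, a frame can be held back as a
-	// B-frame for up to two frame periods before it is encoded, so its dts
-	// must trail its pts by that much; shifting every dts by
-	// (ip_period - 1) frame durations at the maximum frame rate is a
-	// conservative way to keep all of them nonnegative.)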
-	int64_t global_delay() const {
-		return int64_t(ip_period - 1) * (TIMEBASE / MAX_FPS);
-	}
-
-	void open_output_file(const std::string &filename);
-	void encode_thread_func();
-	void encode_remaining_frames_as_p(int encoding_frame_num, int gop_start_display_frame_num, int64_t last_dts);
-	void add_packet_for_uncompressed_frame(int64_t pts, int64_t duration, const uint8_t *data);
-	void encode_frame(PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
-	                  int frame_type, int64_t pts, int64_t dts, int64_t duration);
-	void storage_task_thread();
-	void encode_remaining_audio();
-	void storage_task_enqueue(storage_task task);
-	void save_codeddata(storage_task task);
-	int render_packedsequence();
-	int render_packedpicture();
-	void render_packedslice();
-	int render_sequence();
-	int render_picture(int frame_type, int display_frame_num, int gop_start_display_frame_num);
-	void sps_rbsp(bitstream *bs);
-	void pps_rbsp(bitstream *bs);
-	int build_packed_pic_buffer(unsigned char **header_buffer);
-	int render_slice(int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num, int frame_type);
-	void slice_header(bitstream *bs);
-	int build_packed_seq_buffer(unsigned char **header_buffer);
-	int build_packed_slice_buffer(unsigned char **header_buffer);
-	int init_va(const string &va_display);
-	int deinit_va();
-	void enable_zerocopy_if_possible();
-	VADisplay va_open_display(const string &va_display);
-	void va_close_display(VADisplay va_dpy);
-	int setup_encode();
-	int release_encode();
-	void update_ReferenceFrames(int frame_type);
-	int update_RefPicList(int frame_type);
-
-	bool is_shutdown = false;
-	bool use_zerocopy;
-	int drm_fd = -1;
-
-	thread encode_thread, storage_thread;
-
-	mutex storage_task_queue_mutex;
-	condition_variable storage_task_queue_changed;
-	int srcsurface_status[SURFACE_NUM];  // protected by storage_task_queue_mutex
-	queue<storage_task> storage_task_queue;  // protected by storage_task_queue_mutex
-	bool storage_thread_should_quit = false;  // protected by storage_task_queue_mutex
-
-	mutex frame_queue_mutex;
-	condition_variable frame_queue_nonempty;
-	bool encode_thread_should_quit = false;  // under frame_queue_mutex
-
-	int current_storage_frame;
-
-	map<int, PendingFrame> pending_video_frames;  // under frame_queue_mutex
-	map<int64_t, vector<float>> pending_audio_frames;  // under frame_queue_mutex
-	QSurface *surface;
-
-	unique_ptr<AudioEncoder> file_audio_encoder;
-	AudioEncoder *stream_audio_encoder;
-
-	unique_ptr<FrameReorderer> reorderer;
-	X264Encoder *x264_encoder;  // nullptr if not using x264.
-
-	Mux* stream_mux;  // To HTTP.
-	unique_ptr<Mux> file_mux;  // To local disk.
-
-	Display *x11_display = nullptr;
-
-	// Encoder parameters
-	VADisplay va_dpy;
-	VAProfile h264_profile = (VAProfile)~0;
-	VAConfigAttrib config_attrib[VAConfigAttribTypeMax];
-	int config_attrib_num = 0, enc_packed_header_idx;
-
-	struct GLSurface {
-		VASurfaceID src_surface, ref_surface;
-		VABufferID coded_buf;
-
-		VAImage surface_image;
-		GLuint y_tex, cbcr_tex;
-
-		// Only if use_zerocopy == true.
-		EGLImage y_egl_image, cbcr_egl_image;
-
-		// Only if use_zerocopy == false.
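-		// (i.e., the readback path: each frame is copied from its texture
-		// into this persistently mapped PBO, and the encoder then reads the
-		// Y and CbCr planes through y_ptr/cbcr_ptr.)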
- GLuint pbo; - uint8_t *y_ptr, *cbcr_ptr; - size_t y_offset, cbcr_offset; - }; - GLSurface gl_surfaces[SURFACE_NUM]; - - VAConfigID config_id; - VAContextID context_id; - VAEncSequenceParameterBufferH264 seq_param; - VAEncPictureParameterBufferH264 pic_param; - VAEncSliceParameterBufferH264 slice_param; - VAPictureH264 CurrentCurrPic; - VAPictureH264 ReferenceFrames[MAX_NUM_REF1], RefPicList0_P[MAX_NUM_REF2], RefPicList0_B[MAX_NUM_REF2], RefPicList1_B[MAX_NUM_REF2]; - - // Static quality settings. - static constexpr unsigned int frame_bitrate = 15000000 / 60; // Doesn't really matter; only initial_qp does. - static constexpr unsigned int num_ref_frames = 2; - static constexpr int initial_qp = 15; - static constexpr int minimal_qp = 0; - static constexpr int intra_period = 30; - static constexpr int intra_idr_period = MAX_FPS; // About a second; more at lower frame rates. Not ideal. - - // Quality settings that are meant to be static, but might be overridden - // by the profile. - int constraint_set_flag = 0; - int h264_packedheader = 0; /* support pack header? */ - int h264_maxref = (1<<16|1); - int h264_entropy_mode = 1; /* cabac */ - int ip_period = 3; - - int rc_mode = -1; - unsigned int current_frame_num = 0; - unsigned int numShortTerm = 0; - - int frame_width; - int frame_height; - int frame_width_mbaligned; - int frame_height_mbaligned; -}; - // Supposedly vaRenderPicture() is supposed to destroy the buffer implicitly, // but if we don't delete it here, we get leaks. The GStreamer implementation // does the same. @@ -484,7 +267,7 @@ static void nal_header(bitstream *bs, int nal_ref_idc, int nal_unit_type) bitstream_put_ui(bs, nal_unit_type, 5); } -void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs) +void QuickSyncEncoderImpl::sps_rbsp(YCbCrLumaCoefficients ycbcr_coefficients, bitstream *bs) { int profile_idc = PROFILE_IDC_BASELINE; @@ -544,6 +327,7 @@ void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs) if ( false ) { bitstream_put_ui(bs, 0, 1); /* vui_parameters_present_flag */ } else { + // See H.264 annex E for the definition of this header. 
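+	// What follows signals BT.709 colour primaries, an sRGB transfer
+	// function, and matrix coefficients matching whichever set of Y'CbCr
+	// coefficients (Rec. 601 or Rec. 709) the frame was converted with.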
bitstream_put_ui(bs, 1, 1); /* vui_parameters_present_flag */ bitstream_put_ui(bs, 0, 1); /* aspect_ratio_info_present_flag */ bitstream_put_ui(bs, 0, 1); /* overscan_info_present_flag */ @@ -554,8 +338,13 @@ void QuickSyncEncoderImpl::sps_rbsp(bitstream *bs) bitstream_put_ui(bs, 1, 1); /* colour_description_present_flag */ { bitstream_put_ui(bs, 1, 8); /* colour_primaries (1 = BT.709) */ - bitstream_put_ui(bs, 2, 8); /* transfer_characteristics (2 = unspecified, since we use sRGB) */ - bitstream_put_ui(bs, 6, 8); /* matrix_coefficients (6 = BT.601/SMPTE 170M) */ + bitstream_put_ui(bs, 13, 8); /* transfer_characteristics (13 = sRGB) */ + if (ycbcr_coefficients == YCBCR_REC_709) { + bitstream_put_ui(bs, 1, 8); /* matrix_coefficients (1 = BT.709) */ + } else { + assert(ycbcr_coefficients == YCBCR_REC_601); + bitstream_put_ui(bs, 6, 8); /* matrix_coefficients (6 = BT.601/SMPTE 170M) */ + } } } bitstream_put_ui(bs, 0, 1); /* chroma_loc_info_present_flag */ @@ -735,14 +524,14 @@ int QuickSyncEncoderImpl::build_packed_pic_buffer(unsigned char **header_buffer) } int -QuickSyncEncoderImpl::build_packed_seq_buffer(unsigned char **header_buffer) +QuickSyncEncoderImpl::build_packed_seq_buffer(YCbCrLumaCoefficients ycbcr_coefficients, unsigned char **header_buffer) { bitstream bs; bitstream_start(&bs); nal_start_code_prefix(&bs); nal_header(&bs, NAL_REF_IDC_HIGH, NAL_SPS); - sps_rbsp(&bs); + sps_rbsp(ycbcr_coefficients, &bs); bitstream_end(&bs); *header_buffer = (unsigned char *)bs.buffer; @@ -919,29 +708,12 @@ void encoding2display_order( } -static const char *rc_to_string(int rc_mode) -{ - switch (rc_mode) { - case VA_RC_NONE: - return "NONE"; - case VA_RC_CBR: - return "CBR"; - case VA_RC_VBR: - return "VBR"; - case VA_RC_VCM: - return "VCM"; - case VA_RC_CQP: - return "CQP"; - case VA_RC_VBR_CONSTRAINED: - return "VBR_CONSTRAINED"; - default: - return "Unknown"; - } -} - void QuickSyncEncoderImpl::enable_zerocopy_if_possible() { - if (global_flags.uncompressed_video_to_http) { + if (global_flags.x264_video_to_disk) { + // Quick Sync is entirely disabled. + use_zerocopy = false; + } else if (global_flags.uncompressed_video_to_http) { fprintf(stderr, "Disabling zerocopy H.264 encoding due to --http-uncompressed-video.\n"); use_zerocopy = false; } else if (global_flags.x264_video_to_http) { @@ -950,125 +722,139 @@ void QuickSyncEncoderImpl::enable_zerocopy_if_possible() } else { use_zerocopy = true; } + global_flags.use_zerocopy = use_zerocopy; } -VADisplay QuickSyncEncoderImpl::va_open_display(const string &va_display) +VADisplayWithCleanup::~VADisplayWithCleanup() { - if (va_display.empty()) { - x11_display = XOpenDisplay(NULL); - if (!x11_display) { + if (va_dpy != nullptr) { + vaTerminate(va_dpy); + } + if (x11_display != nullptr) { + XCloseDisplay(x11_display); + } + if (drm_fd != -1) { + close(drm_fd); + } +} + +unique_ptr va_open_display(const string &va_display) +{ + if (va_display.empty() || va_display[0] != '/') { // An X display. + Display *x11_display = XOpenDisplay(va_display.empty() ? 
nullptr : va_display.c_str());
+	if (x11_display == nullptr) {
 		fprintf(stderr, "error: can't connect to X server!\n");
-		return NULL;
+		return nullptr;
 	}
-	enable_zerocopy_if_possible();
-	return vaGetDisplay(x11_display);
-	} else if (va_display[0] != '/') {
-	x11_display = XOpenDisplay(va_display.c_str());
-	if (!x11_display) {
-		fprintf(stderr, "error: can't connect to X server!\n");
-		return NULL;
+
+	unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+	ret->x11_display = x11_display;
+	ret->can_use_zerocopy = true;
+	ret->va_dpy = vaGetDisplay(x11_display);
+	if (ret->va_dpy == nullptr) {
+		return nullptr;
 	}
-	enable_zerocopy_if_possible();
-	return vaGetDisplay(x11_display);
-	} else {
-	drm_fd = open(va_display.c_str(), O_RDWR);
+	return ret;
+	} else {  // A DRM node on the filesystem (e.g. /dev/dri/renderD128).
+	int drm_fd = open(va_display.c_str(), O_RDWR);
 	if (drm_fd == -1) {
 		perror(va_display.c_str());
 		return NULL;
 	}
-	use_zerocopy = false;
-	return vaGetDisplayDRM(drm_fd);
+	unique_ptr<VADisplayWithCleanup> ret(new VADisplayWithCleanup);
+	ret->drm_fd = drm_fd;
+	ret->can_use_zerocopy = false;
+	ret->va_dpy = vaGetDisplayDRM(drm_fd);
+	if (ret->va_dpy == nullptr) {
+		return nullptr;
+	}
+	return ret;
 	}
 }
 
-void QuickSyncEncoderImpl::va_close_display(VADisplay va_dpy)
+unique_ptr<VADisplayWithCleanup> try_open_va(const string &va_display, VAProfile *h264_profile, string *error)
 {
-	if (x11_display) {
-		XCloseDisplay(x11_display);
-		x11_display = nullptr;
+	unique_ptr<VADisplayWithCleanup> va_dpy = va_open_display(va_display);
+	if (va_dpy == nullptr) {
+		if (error) *error = "Opening VA display failed";
+		return nullptr;
 	}
-	if (drm_fd != -1) {
-		close(drm_fd);
+	int major_ver, minor_ver;
+	VAStatus va_status = vaInitialize(va_dpy->va_dpy, &major_ver, &minor_ver);
+	if (va_status != VA_STATUS_SUCCESS) {
+		char buf[256];
+		snprintf(buf, sizeof(buf), "vaInitialize() failed with status %d\n", va_status);
+		if (error != nullptr) *error = buf;
+		return nullptr;
+	}
+
+	int num_entrypoints = vaMaxNumEntrypoints(va_dpy->va_dpy);
+	unique_ptr<VAEntrypoint[]> entrypoints(new VAEntrypoint[num_entrypoints]);
+	if (entrypoints == nullptr) {
+		if (error != nullptr) *error = "Failed to allocate memory for VA entry points";
+		return nullptr;
 	}
+
+	// Try the profiles from highest to lowest until we find one that can be encoded.
+	constexpr VAProfile profile_list[] = { VAProfileH264High, VAProfileH264Main, VAProfileH264ConstrainedBaseline };
+	for (unsigned i = 0; i < sizeof(profile_list) / sizeof(profile_list[0]); ++i) {
+		vaQueryConfigEntrypoints(va_dpy->va_dpy, profile_list[i], entrypoints.get(), &num_entrypoints);
+		for (int slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) {
+			if (entrypoints[slice_entrypoint] != VAEntrypointEncSlice) {
+				continue;
+			}
+
+			// We found a usable encoder, so return it.
+ if (h264_profile != nullptr) { + *h264_profile = profile_list[i]; + } + return va_dpy; + } + } + + if (error != nullptr) *error = "Can't find VAEntrypointEncSlice for H264 profiles"; + return nullptr; } int QuickSyncEncoderImpl::init_va(const string &va_display) { - VAProfile profile_list[]={VAProfileH264High, VAProfileH264Main, VAProfileH264Baseline, VAProfileH264ConstrainedBaseline}; - VAEntrypoint *entrypoints; - int num_entrypoints, slice_entrypoint; - int support_encode = 0; - int major_ver, minor_ver; - VAStatus va_status; - unsigned int i; - - va_dpy = va_open_display(va_display); - va_status = vaInitialize(va_dpy, &major_ver, &minor_ver); - CHECK_VASTATUS(va_status, "vaInitialize"); - - num_entrypoints = vaMaxNumEntrypoints(va_dpy); - entrypoints = (VAEntrypoint *)malloc(num_entrypoints * sizeof(*entrypoints)); - if (!entrypoints) { - fprintf(stderr, "error: failed to initialize VA entrypoints array\n"); + string error; + va_dpy = try_open_va(va_display, &h264_profile, &error); + if (va_dpy == nullptr) { + fprintf(stderr, "error: %s\n", error.c_str()); exit(1); } - - /* use the highest profile */ - for (i = 0; i < sizeof(profile_list)/sizeof(profile_list[0]); i++) { - if ((h264_profile != ~0) && h264_profile != profile_list[i]) - continue; - - h264_profile = profile_list[i]; - vaQueryConfigEntrypoints(va_dpy, h264_profile, entrypoints, &num_entrypoints); - for (slice_entrypoint = 0; slice_entrypoint < num_entrypoints; slice_entrypoint++) { - if (entrypoints[slice_entrypoint] == VAEntrypointEncSlice) { - support_encode = 1; - break; - } - } - if (support_encode == 1) - break; + if (!va_dpy->can_use_zerocopy) { + use_zerocopy = false; } - if (support_encode == 0) { - printf("Can't find VAEntrypointEncSlice for H264 profiles. If you are using a non-Intel GPU\n"); - printf("but have one in your system, try launching Nageru with --va-display /dev/dri/renderD128\n"); - printf("to use VA-API against DRM instead of X11.\n"); - exit(1); - } else { - switch (h264_profile) { - case VAProfileH264Baseline: - ip_period = 1; - constraint_set_flag |= (1 << 0); /* Annex A.2.1 */ - h264_entropy_mode = 0; - break; - case VAProfileH264ConstrainedBaseline: - constraint_set_flag |= (1 << 0 | 1 << 1); /* Annex A.2.2 */ - ip_period = 1; - break; - - case VAProfileH264Main: - constraint_set_flag |= (1 << 1); /* Annex A.2.2 */ - break; - - case VAProfileH264High: - constraint_set_flag |= (1 << 3); /* Annex A.2.4 */ - break; - default: - h264_profile = VAProfileH264Baseline; - ip_period = 1; - constraint_set_flag |= (1 << 0); /* Annex A.2.1 */ - break; - } + switch (h264_profile) { + case VAProfileH264ConstrainedBaseline: + constraint_set_flag |= (1 << 0 | 1 << 1); /* Annex A.2.2 */ + ip_period = 1; + break; + + case VAProfileH264Main: + constraint_set_flag |= (1 << 1); /* Annex A.2.2 */ + break; + + case VAProfileH264High: + constraint_set_flag |= (1 << 3); /* Annex A.2.4 */ + break; + default: + h264_profile = VAProfileH264ConstrainedBaseline; + ip_period = 1; + constraint_set_flag |= (1 << 0); /* Annex A.2.1 */ + break; } VAConfigAttrib attrib[VAConfigAttribTypeMax]; /* find out the format for the render target, and rate control mode */ - for (i = 0; i < VAConfigAttribTypeMax; i++) + for (unsigned i = 0; i < VAConfigAttribTypeMax; i++) attrib[i].type = (VAConfigAttribType)i; - va_status = vaGetConfigAttributes(va_dpy, h264_profile, VAEntrypointEncSlice, + VAStatus va_status = vaGetConfigAttributes(va_dpy->va_dpy, h264_profile, VAEntrypointEncSlice, &attrib[0], VAConfigAttribTypeMax); 
CHECK_VASTATUS(va_status, "vaGetConfigAttributes"); /* check the interested configattrib */ @@ -1082,23 +868,13 @@ int QuickSyncEncoderImpl::init_va(const string &va_display) } if (attrib[VAConfigAttribRateControl].value != VA_ATTRIB_NOT_SUPPORTED) { - int tmp = attrib[VAConfigAttribRateControl].value; - - if (rc_mode == -1 || !(rc_mode & tmp)) { - if (rc_mode != -1) { - printf("Warning: Don't support the specified RateControl mode: %s!!!, switch to ", rc_to_string(rc_mode)); - } - - for (i = 0; i < sizeof(rc_default_modes) / sizeof(rc_default_modes[0]); i++) { - if (rc_default_modes[i] & tmp) { - rc_mode = rc_default_modes[i]; - break; - } - } + if (!(attrib[VAConfigAttribRateControl].value & VA_RC_CQP)) { + fprintf(stderr, "ERROR: VA-API encoder does not support CQP mode.\n"); + exit(1); } config_attrib[config_attrib_num].type = VAConfigAttribRateControl; - config_attrib[config_attrib_num].value = rc_mode; + config_attrib[config_attrib_num].value = VA_RC_CQP; config_attrib_num++; } @@ -1140,99 +916,92 @@ int QuickSyncEncoderImpl::init_va(const string &va_display) h264_maxref = attrib[VAConfigAttribEncMaxRefFrames].value; } - free(entrypoints); return 0; } int QuickSyncEncoderImpl::setup_encode() { - VAStatus va_status; - VASurfaceID *tmp_surfaceid; - int codedbuf_size, i; - static VASurfaceID src_surface[SURFACE_NUM]; - static VASurfaceID ref_surface[SURFACE_NUM]; - - va_status = vaCreateConfig(va_dpy, h264_profile, VAEntrypointEncSlice, - &config_attrib[0], config_attrib_num, &config_id); - CHECK_VASTATUS(va_status, "vaCreateConfig"); - - /* create source surfaces */ - va_status = vaCreateSurfaces(va_dpy, - VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned, - &src_surface[0], SURFACE_NUM, - NULL, 0); - CHECK_VASTATUS(va_status, "vaCreateSurfaces"); - - /* create reference surfaces */ - va_status = vaCreateSurfaces(va_dpy, - VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned, - &ref_surface[0], SURFACE_NUM, - NULL, 0); - CHECK_VASTATUS(va_status, "vaCreateSurfaces"); - - tmp_surfaceid = (VASurfaceID *)calloc(2 * SURFACE_NUM, sizeof(VASurfaceID)); - memcpy(tmp_surfaceid, src_surface, SURFACE_NUM * sizeof(VASurfaceID)); - memcpy(tmp_surfaceid + SURFACE_NUM, ref_surface, SURFACE_NUM * sizeof(VASurfaceID)); - - /* Create a context for this encode pipe */ - va_status = vaCreateContext(va_dpy, config_id, - frame_width_mbaligned, frame_height_mbaligned, - VA_PROGRESSIVE, - tmp_surfaceid, 2 * SURFACE_NUM, - &context_id); - CHECK_VASTATUS(va_status, "vaCreateContext"); - free(tmp_surfaceid); - - codedbuf_size = (frame_width_mbaligned * frame_height_mbaligned * 400) / (16*16); - - for (i = 0; i < SURFACE_NUM; i++) { - /* create coded buffer once for all - * other VA buffers which won't be used again after vaRenderPicture. 
- * so APP can always vaCreateBuffer for every frame - * but coded buffer need to be mapped and accessed after vaRenderPicture/vaEndPicture - * so VA won't maintain the coded buffer - */ - va_status = vaCreateBuffer(va_dpy, context_id, VAEncCodedBufferType, - codedbuf_size, 1, NULL, &gl_surfaces[i].coded_buf); - CHECK_VASTATUS(va_status, "vaCreateBuffer"); - } + if (!global_flags.x264_video_to_disk) { + VAStatus va_status; + VASurfaceID *tmp_surfaceid; + int codedbuf_size; + VASurfaceID src_surface[SURFACE_NUM]; + VASurfaceID ref_surface[SURFACE_NUM]; + + va_status = vaCreateConfig(va_dpy->va_dpy, h264_profile, VAEntrypointEncSlice, + &config_attrib[0], config_attrib_num, &config_id); + CHECK_VASTATUS(va_status, "vaCreateConfig"); + + /* create source surfaces */ + va_status = vaCreateSurfaces(va_dpy->va_dpy, + VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned, + &src_surface[0], SURFACE_NUM, + NULL, 0); + CHECK_VASTATUS(va_status, "vaCreateSurfaces"); + + /* create reference surfaces */ + va_status = vaCreateSurfaces(va_dpy->va_dpy, + VA_RT_FORMAT_YUV420, frame_width_mbaligned, frame_height_mbaligned, + &ref_surface[0], SURFACE_NUM, + NULL, 0); + CHECK_VASTATUS(va_status, "vaCreateSurfaces"); + + tmp_surfaceid = (VASurfaceID *)calloc(2 * SURFACE_NUM, sizeof(VASurfaceID)); + memcpy(tmp_surfaceid, src_surface, SURFACE_NUM * sizeof(VASurfaceID)); + memcpy(tmp_surfaceid + SURFACE_NUM, ref_surface, SURFACE_NUM * sizeof(VASurfaceID)); + + for (int i = 0; i < SURFACE_NUM; i++) { + gl_surfaces[i].src_surface = src_surface[i]; + gl_surfaces[i].ref_surface = ref_surface[i]; + } - /* create OpenGL objects */ - //glGenFramebuffers(SURFACE_NUM, fbos); - - for (i = 0; i < SURFACE_NUM; i++) { - glGenTextures(1, &gl_surfaces[i].y_tex); - glGenTextures(1, &gl_surfaces[i].cbcr_tex); - - if (!use_zerocopy) { - // Create Y image. - glBindTexture(GL_TEXTURE_2D, gl_surfaces[i].y_tex); - glTexStorage2D(GL_TEXTURE_2D, 1, GL_R8, frame_width, frame_height); - - // Create CbCr image. - glBindTexture(GL_TEXTURE_2D, gl_surfaces[i].cbcr_tex); - glTexStorage2D(GL_TEXTURE_2D, 1, GL_RG8, frame_width / 2, frame_height / 2); - - // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API - // buffers, due to potentially differing pitch. - glGenBuffers(1, &gl_surfaces[i].pbo); - glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo); - glBufferStorage(GL_PIXEL_PACK_BUFFER, frame_width * frame_height * 2, nullptr, GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT); - uint8_t *ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, frame_width * frame_height * 2, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); - gl_surfaces[i].y_offset = 0; - gl_surfaces[i].cbcr_offset = frame_width * frame_height; - gl_surfaces[i].y_ptr = ptr + gl_surfaces[i].y_offset; - gl_surfaces[i].cbcr_ptr = ptr + gl_surfaces[i].cbcr_offset; - glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); - } - } + /* Create a context for this encode pipe */ + va_status = vaCreateContext(va_dpy->va_dpy, config_id, + frame_width_mbaligned, frame_height_mbaligned, + VA_PROGRESSIVE, + tmp_surfaceid, 2 * SURFACE_NUM, + &context_id); + CHECK_VASTATUS(va_status, "vaCreateContext"); + free(tmp_surfaceid); + + codedbuf_size = (frame_width_mbaligned * frame_height_mbaligned * 400) / (16*16); + + for (int i = 0; i < SURFACE_NUM; i++) { + /* create coded buffer once for all + * other VA buffers which won't be used again after vaRenderPicture. 
+ * so APP can always vaCreateBuffer for every frame + * but coded buffer need to be mapped and accessed after vaRenderPicture/vaEndPicture + * so VA won't maintain the coded buffer + */ + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncCodedBufferType, + codedbuf_size, 1, NULL, &gl_surfaces[i].coded_buf); + CHECK_VASTATUS(va_status, "vaCreateBuffer"); + } + } - for (i = 0; i < SURFACE_NUM; i++) { - gl_surfaces[i].src_surface = src_surface[i]; - gl_surfaces[i].ref_surface = ref_surface[i]; - } - - return 0; + /* create OpenGL objects */ + for (int i = 0; i < SURFACE_NUM; i++) { + if (use_zerocopy) { + gl_surfaces[i].y_tex = resource_pool->create_2d_texture(GL_R8, 1, 1); + gl_surfaces[i].cbcr_tex = resource_pool->create_2d_texture(GL_RG8, 1, 1); + } else { + size_t bytes_per_pixel = (global_flags.x264_bit_depth > 8) ? 2 : 1; + + // Generate a PBO to read into. It doesn't necessarily fit 1:1 with the VA-API + // buffers, due to potentially differing pitch. + glGenBuffers(1, &gl_surfaces[i].pbo); + glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo); + glBufferStorage(GL_PIXEL_PACK_BUFFER, frame_width * frame_height * 2 * bytes_per_pixel, nullptr, GL_MAP_READ_BIT | GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT); + uint8_t *ptr = (uint8_t *)glMapBufferRange(GL_PIXEL_PACK_BUFFER, 0, frame_width * frame_height * 2 * bytes_per_pixel, GL_MAP_READ_BIT | GL_MAP_PERSISTENT_BIT); + gl_surfaces[i].y_offset = 0; + gl_surfaces[i].cbcr_offset = frame_width * frame_height * bytes_per_pixel; + gl_surfaces[i].y_ptr = ptr + gl_surfaces[i].y_offset; + gl_surfaces[i].cbcr_ptr = ptr + gl_surfaces[i].cbcr_offset; + glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); + } + } + + return 0; } // Given a list like 1 9 3 0 2 8 4 and a pivot element 3, will produce @@ -1246,51 +1015,65 @@ static void sort_two(T *begin, T *end, const T &pivot, const C &less_than) sort(middle, end, less_than); } -void QuickSyncEncoderImpl::update_ReferenceFrames(int frame_type) +void QuickSyncEncoderImpl::update_ReferenceFrames(int current_display_frame, int frame_type) { - int i; - if (frame_type == FRAME_B) return; + pic_param.CurrPic.frame_idx = current_ref_frame_num; + CurrentCurrPic.flags = VA_PICTURE_H264_SHORT_TERM_REFERENCE; - numShortTerm++; - if (numShortTerm > num_ref_frames) - numShortTerm = num_ref_frames; - for (i=numShortTerm-1; i>0; i--) - ReferenceFrames[i] = ReferenceFrames[i-1]; - ReferenceFrames[0] = CurrentCurrPic; + unique_lock lock(storage_task_queue_mutex); + + // Insert the new frame at the start of the reference queue. + reference_frames.push_front(ReferenceFrame{ CurrentCurrPic, current_display_frame }); + + if (reference_frames.size() > num_ref_frames) + { + // The back frame frame is no longer in use as a reference. + int display_frame_num = reference_frames.back().display_number; + assert(surface_for_frame.count(display_frame_num)); + release_gl_surface(display_frame_num); + reference_frames.pop_back(); + } + + // Mark this frame in use as a reference. 
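+	// (The backing surface must stay allocated for as long as later frames
+	// can encode against it; the matching release happens when the frame
+	// falls off the back of the reference queue above, or when an IDR
+	// frame starts a new GOP and the whole list is dropped.)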
+ assert(surface_for_frame.count(current_display_frame)); + ++surface_for_frame[current_display_frame]->refcount; - current_frame_num++; - if (current_frame_num > MaxFrameNum) - current_frame_num = 0; + current_ref_frame_num++; + if (current_ref_frame_num > MaxFrameNum) + current_ref_frame_num = 0; } -int QuickSyncEncoderImpl::update_RefPicList(int frame_type) +void QuickSyncEncoderImpl::update_RefPicList_P(VAPictureH264 RefPicList0_P[MAX_NUM_REF2]) { const auto descending_by_frame_idx = [](const VAPictureH264 &a, const VAPictureH264 &b) { return a.frame_idx > b.frame_idx; }; + + for (size_t i = 0; i < reference_frames.size(); ++i) { + RefPicList0_P[i] = reference_frames[i].pic; + } + sort(&RefPicList0_P[0], &RefPicList0_P[reference_frames.size()], descending_by_frame_idx); +} + +void QuickSyncEncoderImpl::update_RefPicList_B(VAPictureH264 RefPicList0_B[MAX_NUM_REF2], VAPictureH264 RefPicList1_B[MAX_NUM_REF2]) +{ const auto ascending_by_top_field_order_cnt = [](const VAPictureH264 &a, const VAPictureH264 &b) { return a.TopFieldOrderCnt < b.TopFieldOrderCnt; }; const auto descending_by_top_field_order_cnt = [](const VAPictureH264 &a, const VAPictureH264 &b) { return a.TopFieldOrderCnt > b.TopFieldOrderCnt; }; - - if (frame_type == FRAME_P) { - memcpy(RefPicList0_P, ReferenceFrames, numShortTerm * sizeof(VAPictureH264)); - sort(&RefPicList0_P[0], &RefPicList0_P[numShortTerm], descending_by_frame_idx); - } else if (frame_type == FRAME_B) { - memcpy(RefPicList0_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264)); - sort_two(&RefPicList0_B[0], &RefPicList0_B[numShortTerm], CurrentCurrPic, ascending_by_top_field_order_cnt); - memcpy(RefPicList1_B, ReferenceFrames, numShortTerm * sizeof(VAPictureH264)); - sort_two(&RefPicList1_B[0], &RefPicList1_B[numShortTerm], CurrentCurrPic, descending_by_top_field_order_cnt); + for (size_t i = 0; i < reference_frames.size(); ++i) { + RefPicList0_B[i] = reference_frames[i].pic; + RefPicList1_B[i] = reference_frames[i].pic; } - - return 0; + sort_two(&RefPicList0_B[0], &RefPicList0_B[reference_frames.size()], CurrentCurrPic, ascending_by_top_field_order_cnt); + sort_two(&RefPicList1_B[0], &RefPicList1_B[reference_frames.size()], CurrentCurrPic, descending_by_top_field_order_cnt); } @@ -1329,18 +1112,18 @@ int QuickSyncEncoderImpl::render_sequence() seq_param.frame_crop_bottom_offset = (frame_height_mbaligned - frame_height)/2; } - va_status = vaCreateBuffer(va_dpy, context_id, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncSequenceParameterBufferType, sizeof(seq_param), 1, &seq_param, &seq_param_buf); CHECK_VASTATUS(va_status, "vaCreateBuffer"); - va_status = vaCreateBuffer(va_dpy, context_id, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncMiscParameterBufferType, sizeof(VAEncMiscParameterBuffer) + sizeof(VAEncMiscParameterRateControl), 1, NULL, &rc_param_buf); CHECK_VASTATUS(va_status, "vaCreateBuffer"); - vaMapBuffer(va_dpy, rc_param_buf, (void **)&misc_param); + vaMapBuffer(va_dpy->va_dpy, rc_param_buf, (void **)&misc_param); misc_param->type = VAEncMiscParameterTypeRateControl; misc_rate_ctrl = (VAEncMiscParameterRateControl *)misc_param->data; memset(misc_rate_ctrl, 0, sizeof(*misc_rate_ctrl)); @@ -1350,12 +1133,12 @@ int QuickSyncEncoderImpl::render_sequence() misc_rate_ctrl->initial_qp = initial_qp; misc_rate_ctrl->min_qp = minimal_qp; misc_rate_ctrl->basic_unit_size = 0; - vaUnmapBuffer(va_dpy, rc_param_buf); + vaUnmapBuffer(va_dpy->va_dpy, rc_param_buf); render_id[0] = seq_param_buf; render_id[1] = rc_param_buf; - 
render_picture_and_delete(va_dpy, context_id, &render_id[0], 2); + render_picture_and_delete(va_dpy->va_dpy, context_id, &render_id[0], 2); return 0; } @@ -1392,21 +1175,23 @@ static int calc_poc(int pic_order_cnt_lsb, int frame_type) return TopFieldOrderCnt; } -int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num, int gop_start_display_frame_num) +int QuickSyncEncoderImpl::render_picture(GLSurface *surf, int frame_type, int display_frame_num, int gop_start_display_frame_num) { VABufferID pic_param_buf; VAStatus va_status; - int i = 0; + size_t i = 0; - pic_param.CurrPic.picture_id = gl_surfaces[display_frame_num % SURFACE_NUM].ref_surface; - pic_param.CurrPic.frame_idx = current_frame_num; + pic_param.CurrPic.picture_id = surf->ref_surface; + pic_param.CurrPic.frame_idx = current_ref_frame_num; pic_param.CurrPic.flags = 0; pic_param.CurrPic.TopFieldOrderCnt = calc_poc((display_frame_num - gop_start_display_frame_num) % MaxPicOrderCntLsb, frame_type); pic_param.CurrPic.BottomFieldOrderCnt = pic_param.CurrPic.TopFieldOrderCnt; CurrentCurrPic = pic_param.CurrPic; - memcpy(pic_param.ReferenceFrames, ReferenceFrames, numShortTerm*sizeof(VAPictureH264)); - for (i = numShortTerm; i < MAX_NUM_REF1; i++) { + for (i = 0; i < reference_frames.size(); i++) { + pic_param.ReferenceFrames[i] = reference_frames[i].pic; + } + for (i = reference_frames.size(); i < MAX_NUM_REF1; i++) { pic_param.ReferenceFrames[i].picture_id = VA_INVALID_SURFACE; pic_param.ReferenceFrames[i].flags = VA_PICTURE_H264_INVALID; } @@ -1415,21 +1200,21 @@ int QuickSyncEncoderImpl::render_picture(int frame_type, int display_frame_num, pic_param.pic_fields.bits.reference_pic_flag = (frame_type != FRAME_B); pic_param.pic_fields.bits.entropy_coding_mode_flag = h264_entropy_mode; pic_param.pic_fields.bits.deblocking_filter_control_present_flag = 1; - pic_param.frame_num = current_frame_num; - pic_param.coded_buf = gl_surfaces[display_frame_num % SURFACE_NUM].coded_buf; + pic_param.frame_num = current_ref_frame_num; // FIXME: is this correct? 
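+	// (frame_num in H.264 advances with reference frames rather than with
+	// display order, and update_ReferenceFrames() bumps current_ref_frame_num
+	// for every non-B frame, so the two should normally stay in step.)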
+ pic_param.coded_buf = surf->coded_buf; pic_param.last_picture = false; // FIXME pic_param.pic_init_qp = initial_qp; - va_status = vaCreateBuffer(va_dpy, context_id, VAEncPictureParameterBufferType, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncPictureParameterBufferType, sizeof(pic_param), 1, &pic_param, &pic_param_buf); CHECK_VASTATUS(va_status, "vaCreateBuffer"); - render_picture_and_delete(va_dpy, context_id, &pic_param_buf, 1); + render_picture_and_delete(va_dpy->va_dpy, context_id, &pic_param_buf, 1); return 0; } -int QuickSyncEncoderImpl::render_packedsequence() +int QuickSyncEncoderImpl::render_packedsequence(YCbCrLumaCoefficients ycbcr_coefficients) { VAEncPackedHeaderParameterBuffer packedheader_param_buffer; VABufferID packedseq_para_bufid, packedseq_data_bufid, render_id[2]; @@ -1437,20 +1222,20 @@ int QuickSyncEncoderImpl::render_packedsequence() unsigned char *packedseq_buffer = NULL; VAStatus va_status; - length_in_bits = build_packed_seq_buffer(&packedseq_buffer); + length_in_bits = build_packed_seq_buffer(ycbcr_coefficients, &packedseq_buffer); packedheader_param_buffer.type = VAEncPackedHeaderSequence; packedheader_param_buffer.bit_length = length_in_bits; /*length_in_bits*/ packedheader_param_buffer.has_emulation_bytes = 0; - va_status = vaCreateBuffer(va_dpy, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncPackedHeaderParameterBufferType, sizeof(packedheader_param_buffer), 1, &packedheader_param_buffer, &packedseq_para_bufid); CHECK_VASTATUS(va_status, "vaCreateBuffer"); - va_status = vaCreateBuffer(va_dpy, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncPackedHeaderDataBufferType, (length_in_bits + 7) / 8, 1, packedseq_buffer, @@ -1459,7 +1244,7 @@ int QuickSyncEncoderImpl::render_packedsequence() render_id[0] = packedseq_para_bufid; render_id[1] = packedseq_data_bufid; - render_picture_and_delete(va_dpy, context_id, render_id, 2); + render_picture_and_delete(va_dpy->va_dpy, context_id, render_id, 2); free(packedseq_buffer); @@ -1480,14 +1265,14 @@ int QuickSyncEncoderImpl::render_packedpicture() packedheader_param_buffer.bit_length = length_in_bits; packedheader_param_buffer.has_emulation_bytes = 0; - va_status = vaCreateBuffer(va_dpy, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncPackedHeaderParameterBufferType, sizeof(packedheader_param_buffer), 1, &packedheader_param_buffer, &packedpic_para_bufid); CHECK_VASTATUS(va_status, "vaCreateBuffer"); - va_status = vaCreateBuffer(va_dpy, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncPackedHeaderDataBufferType, (length_in_bits + 7) / 8, 1, packedpic_buffer, @@ -1496,7 +1281,7 @@ int QuickSyncEncoderImpl::render_packedpicture() render_id[0] = packedpic_para_bufid; render_id[1] = packedpic_data_bufid; - render_picture_and_delete(va_dpy, context_id, render_id, 2); + render_picture_and_delete(va_dpy->va_dpy, context_id, render_id, 2); free(packedpic_buffer); @@ -1516,14 +1301,14 @@ void QuickSyncEncoderImpl::render_packedslice() packedheader_param_buffer.bit_length = length_in_bits; packedheader_param_buffer.has_emulation_bytes = 0; - va_status = vaCreateBuffer(va_dpy, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncPackedHeaderParameterBufferType, sizeof(packedheader_param_buffer), 1, &packedheader_param_buffer, &packedslice_para_bufid); CHECK_VASTATUS(va_status, "vaCreateBuffer"); - va_status = vaCreateBuffer(va_dpy, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncPackedHeaderDataBufferType, (length_in_bits + 7) / 8, 1, 
packedslice_buffer, @@ -1532,7 +1317,7 @@ void QuickSyncEncoderImpl::render_packedslice() render_id[0] = packedslice_para_bufid; render_id[1] = packedslice_data_bufid; - render_picture_and_delete(va_dpy, context_id, render_id, 2); + render_picture_and_delete(va_dpy->va_dpy, context_id, render_id, 2); free(packedslice_buffer); } @@ -1543,8 +1328,6 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame VAStatus va_status; int i; - update_RefPicList(frame_type); - /* one frame, one slice */ slice_param.macroblock_address = 0; slice_param.num_macroblocks = frame_width_mbaligned * frame_height_mbaligned/(16*16); /* Measured by MB */ @@ -1553,6 +1336,9 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame if (encoding_frame_num != 0) ++slice_param.idr_pic_id; } else if (frame_type == FRAME_P) { + VAPictureH264 RefPicList0_P[MAX_NUM_REF2]; + update_RefPicList_P(RefPicList0_P); + int refpiclist0_max = h264_maxref & 0xffff; memcpy(slice_param.RefPicList0, RefPicList0_P, refpiclist0_max*sizeof(VAPictureH264)); @@ -1561,6 +1347,9 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame slice_param.RefPicList0[i].flags = VA_PICTURE_H264_INVALID; } } else if (frame_type == FRAME_B) { + VAPictureH264 RefPicList0_B[MAX_NUM_REF2], RefPicList1_B[MAX_NUM_REF2]; + update_RefPicList_B(RefPicList0_B, RefPicList1_B); + int refpiclist0_max = h264_maxref & 0xffff; int refpiclist1_max = (h264_maxref >> 16) & 0xffff; @@ -1587,31 +1376,35 @@ int QuickSyncEncoderImpl::render_slice(int encoding_frame_num, int display_frame config_attrib[enc_packed_header_idx].value & VA_ENC_PACKED_HEADER_SLICE) render_packedslice(); - va_status = vaCreateBuffer(va_dpy, context_id, VAEncSliceParameterBufferType, + va_status = vaCreateBuffer(va_dpy->va_dpy, context_id, VAEncSliceParameterBufferType, sizeof(slice_param), 1, &slice_param, &slice_param_buf); CHECK_VASTATUS(va_status, "vaCreateBuffer"); - render_picture_and_delete(va_dpy, context_id, &slice_param_buf, 1); + render_picture_and_delete(va_dpy->va_dpy, context_id, &slice_param_buf, 1); return 0; } -void QuickSyncEncoderImpl::save_codeddata(storage_task task) +void QuickSyncEncoderImpl::save_codeddata(GLSurface *surf, storage_task task) { VACodedBufferSegment *buf_list = NULL; VAStatus va_status; string data; - va_status = vaMapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf, (void **)(&buf_list)); + va_status = vaMapBuffer(va_dpy->va_dpy, surf->coded_buf, (void **)(&buf_list)); CHECK_VASTATUS(va_status, "vaMapBuffer"); while (buf_list != NULL) { data.append(reinterpret_cast(buf_list->buf), buf_list->size); buf_list = (VACodedBufferSegment *) buf_list->next; } - vaUnmapBuffer(va_dpy, gl_surfaces[task.display_order % SURFACE_NUM].coded_buf); + vaUnmapBuffer(va_dpy->va_dpy, surf->coded_buf); + + static int frameno = 0; + print_latency("Current Quick Sync latency (video inputs → disk mux):", + task.received_ts, (task.frame_type == FRAME_B), &frameno, &qs_latency_histogram); { // Add video. @@ -1635,26 +1428,6 @@ void QuickSyncEncoderImpl::save_codeddata(storage_task task) stream_mux->add_packet(pkt, task.pts + global_delay(), task.dts + global_delay()); } } - // Encode and add all audio frames up to and including the pts of this video frame. 
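-	// (This pull-based coupling of audio to video is what the patch removes:
-	// in the new code, add_audio() hands samples directly to the file audio
-	// encoder, so the storage thread no longer has to wait for audio.)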
- for ( ;; ) { - int64_t audio_pts; - vector audio; - { - unique_lock lock(frame_queue_mutex); - frame_queue_nonempty.wait(lock, [this]{ return storage_thread_should_quit || !pending_audio_frames.empty(); }); - if (storage_thread_should_quit && pending_audio_frames.empty()) return; - auto it = pending_audio_frames.begin(); - if (it->first > task.pts) break; - audio_pts = it->first; - audio = move(it->second); - pending_audio_frames.erase(it); - } - - file_audio_encoder->encode_audio(audio, audio_pts + global_delay()); - stream_audio_encoder->encode_audio(audio, audio_pts + global_delay()); - - if (audio_pts == task.pts) break; - } } @@ -1668,8 +1441,10 @@ void QuickSyncEncoderImpl::storage_task_enqueue(storage_task task) void QuickSyncEncoderImpl::storage_task_thread() { + pthread_setname_np(pthread_self(), "QS_Storage"); for ( ;; ) { storage_task current; + GLSurface *surf; { // wait until there's an encoded frame unique_lock lock(storage_task_queue_mutex); @@ -1677,63 +1452,70 @@ void QuickSyncEncoderImpl::storage_task_thread() if (storage_thread_should_quit && storage_task_queue.empty()) return; current = move(storage_task_queue.front()); storage_task_queue.pop(); + surf = surface_for_frame[current.display_order]; + assert(surf != nullptr); } VAStatus va_status; + + size_t display_order = current.display_order; + vector ref_display_frame_numbers = move(current.ref_display_frame_numbers); // waits for data, then saves it to disk. - va_status = vaSyncSurface(va_dpy, gl_surfaces[current.display_order % SURFACE_NUM].src_surface); + va_status = vaSyncSurface(va_dpy->va_dpy, surf->src_surface); CHECK_VASTATUS(va_status, "vaSyncSurface"); - save_codeddata(move(current)); + save_codeddata(surf, move(current)); + // Unlock the frame, and all its references. 
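+		// (Each frame bumped the refcount of every surface it encoded
+		// against; only after the coded data has been read back can those
+		// surfaces be recycled, which is what the releases below do.)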
{ unique_lock lock(storage_task_queue_mutex); - srcsurface_status[current.display_order % SURFACE_NUM] = SRC_SURFACE_FREE; - storage_task_queue_changed.notify_all(); + release_gl_surface(display_order); + + for (size_t frame_num : ref_display_frame_numbers) { + release_gl_surface(frame_num); + } } } } -int QuickSyncEncoderImpl::release_encode() +void QuickSyncEncoderImpl::release_encode() { for (unsigned i = 0; i < SURFACE_NUM; i++) { - vaDestroyBuffer(va_dpy, gl_surfaces[i].coded_buf); - vaDestroySurfaces(va_dpy, &gl_surfaces[i].src_surface, 1); - vaDestroySurfaces(va_dpy, &gl_surfaces[i].ref_surface, 1); + vaDestroyBuffer(va_dpy->va_dpy, gl_surfaces[i].coded_buf); + vaDestroySurfaces(va_dpy->va_dpy, &gl_surfaces[i].src_surface, 1); + vaDestroySurfaces(va_dpy->va_dpy, &gl_surfaces[i].ref_surface, 1); + } - if (!use_zerocopy) { + vaDestroyContext(va_dpy->va_dpy, context_id); + vaDestroyConfig(va_dpy->va_dpy, config_id); +} + +void QuickSyncEncoderImpl::release_gl_resources() +{ + assert(is_shutdown); + if (has_released_gl_resources) { + return; + } + + for (unsigned i = 0; i < SURFACE_NUM; i++) { + if (use_zerocopy) { + resource_pool->release_2d_texture(gl_surfaces[i].y_tex); + resource_pool->release_2d_texture(gl_surfaces[i].cbcr_tex); + } else { glBindBuffer(GL_PIXEL_PACK_BUFFER, gl_surfaces[i].pbo); glUnmapBuffer(GL_PIXEL_PACK_BUFFER); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); glDeleteBuffers(1, &gl_surfaces[i].pbo); } - glDeleteTextures(1, &gl_surfaces[i].y_tex); - glDeleteTextures(1, &gl_surfaces[i].cbcr_tex); } - vaDestroyContext(va_dpy, context_id); - vaDestroyConfig(va_dpy, config_id); - - return 0; + has_released_gl_resources = true; } -int QuickSyncEncoderImpl::deinit_va() -{ - vaTerminate(va_dpy); - - va_close_display(va_dpy); - - return 0; -} - -namespace { - -} // namespace - -QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, QSurface *surface, const string &va_display, int width, int height, Mux *stream_mux, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder) - : current_storage_frame(0), surface(surface), stream_audio_encoder(stream_audio_encoder), x264_encoder(x264_encoder), stream_mux(stream_mux), frame_width(width), frame_height(height) +QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator) + : current_storage_frame(0), resource_pool(resource_pool), surface(surface), x264_encoder(x264_encoder), frame_width(width), frame_height(height), disk_space_estimator(disk_space_estimator) { - file_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE)); + file_audio_encoder.reset(new AudioEncoder(AUDIO_OUTPUT_CODEC_NAME, DEFAULT_AUDIO_OUTPUT_BIT_RATE, oformat)); open_output_file(filename); file_audio_encoder->add_mux(file_mux.get()); @@ -1742,30 +1524,36 @@ QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, QSurface //print_input(); - if (global_flags.uncompressed_video_to_http || - global_flags.x264_video_to_http) { - reorderer.reset(new FrameReorderer(ip_period - 1, frame_width, frame_height)); - } - if (global_flags.x264_video_to_http) { + if (global_flags.x264_video_to_http || global_flags.x264_video_to_disk) { assert(x264_encoder != nullptr); } else { assert(x264_encoder == nullptr); } - init_va(va_display); + enable_zerocopy_if_possible(); + if (!global_flags.x264_video_to_disk) { + 
init_va(va_display); + } setup_encode(); - // No frames are ready yet. - memset(srcsurface_status, SRC_SURFACE_FREE, sizeof(srcsurface_status)); - - memset(&seq_param, 0, sizeof(seq_param)); - memset(&pic_param, 0, sizeof(pic_param)); - memset(&slice_param, 0, sizeof(slice_param)); + if (!global_flags.x264_video_to_disk) { + memset(&seq_param, 0, sizeof(seq_param)); + memset(&pic_param, 0, sizeof(pic_param)); + memset(&slice_param, 0, sizeof(slice_param)); + } + + call_once(quick_sync_metrics_inited, [](){ + mixer_latency_histogram.init("mixer"); + qs_latency_histogram.init("quick_sync"); + current_file_mux_metrics.init({{ "destination", "current_file" }}); + total_mux_metrics.init({{ "destination", "files_total" }}); + global_metrics.add("current_file_start_time_seconds", &metric_current_file_start_time_seconds, Metrics::TYPE_GAUGE); + global_metrics.add("quick_sync_stalled_frames", &metric_quick_sync_stalled_frames); + }); storage_thread = thread(&QuickSyncEncoderImpl::storage_task_thread, this); encode_thread = thread([this]{ - //SDL_GL_MakeCurrent(window, context); QOpenGLContext *context = create_context(this->surface); eglBindAPI(EGL_OPENGL_API); if (!make_current(context, this->surface)) { @@ -1774,101 +1562,149 @@ QuickSyncEncoderImpl::QuickSyncEncoderImpl(const std::string &filename, QSurface exit(1); } encode_thread_func(); + delete_context(context); }); } QuickSyncEncoderImpl::~QuickSyncEncoderImpl() { shutdown(); + release_gl_resources(); +} + +QuickSyncEncoderImpl::GLSurface *QuickSyncEncoderImpl::allocate_gl_surface() +{ + for (unsigned i = 0; i < SURFACE_NUM; ++i) { + if (gl_surfaces[i].refcount == 0) { + ++gl_surfaces[i].refcount; + return &gl_surfaces[i]; + } + } + return nullptr; } -bool QuickSyncEncoderImpl::begin_frame(GLuint *y_tex, GLuint *cbcr_tex) +void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num) +{ + assert(surface_for_frame.count(display_frame_num)); + QuickSyncEncoderImpl::GLSurface *surf = surface_for_frame[display_frame_num]; + if (--surf->refcount == 0) { + assert(surface_for_frame.count(display_frame_num)); + surface_for_frame.erase(display_frame_num); + storage_task_queue_changed.notify_all(); + } +} + +bool QuickSyncEncoderImpl::is_zerocopy() const +{ + return use_zerocopy; +} + +bool QuickSyncEncoderImpl::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector &input_frames, GLuint *y_tex, GLuint *cbcr_tex) { assert(!is_shutdown); + GLSurface *surf = nullptr; { // Wait until this frame slot is done encoding. 
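		// (There are SURFACE_NUM surfaces in the pool; rendering stalls here
		// only if the H.264 encoder has fallen an entire pool behind, which
		// shows up in metric_quick_sync_stalled_frames below.)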
unique_lock lock(storage_task_queue_mutex); - if (srcsurface_status[current_storage_frame % SURFACE_NUM] != SRC_SURFACE_FREE) { - fprintf(stderr, "Warning: Slot %d (for frame %d) is still encoding, rendering has to wait for H.264 encoder\n", - current_storage_frame % SURFACE_NUM, current_storage_frame); + surf = allocate_gl_surface(); + if (surf == nullptr) { + fprintf(stderr, "Warning: No free slots for frame %d, rendering has to wait for H.264 encoder\n", + current_storage_frame); + ++metric_quick_sync_stalled_frames; + storage_task_queue_changed.wait(lock, [this, &surf]{ + if (storage_thread_should_quit) + return true; + surf = allocate_gl_surface(); + return surf != nullptr; + }); } - storage_task_queue_changed.wait(lock, [this]{ return storage_thread_should_quit || (srcsurface_status[current_storage_frame % SURFACE_NUM] == SRC_SURFACE_FREE); }); - srcsurface_status[current_storage_frame % SURFACE_NUM] = SRC_SURFACE_IN_ENCODING; if (storage_thread_should_quit) return false; + assert(surf != nullptr); + surface_for_frame[current_storage_frame] = surf; } - //*fbo = fbos[current_storage_frame % SURFACE_NUM]; - GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM]; - *y_tex = surf->y_tex; - *cbcr_tex = surf->cbcr_tex; - - VAStatus va_status = vaDeriveImage(va_dpy, surf->src_surface, &surf->surface_image); - CHECK_VASTATUS(va_status, "vaDeriveImage"); - if (use_zerocopy) { - VABufferInfo buf_info; - buf_info.mem_type = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME; // or VA_SURFACE_ATTRIB_MEM_TYPE_KERNEL_DRM? - va_status = vaAcquireBufferHandle(va_dpy, surf->surface_image.buf, &buf_info); - CHECK_VASTATUS(va_status, "vaAcquireBufferHandle"); - - // Create Y image. - surf->y_egl_image = EGL_NO_IMAGE_KHR; - EGLint y_attribs[] = { - EGL_WIDTH, frame_width, - EGL_HEIGHT, frame_height, - EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('R', '8', ' ', ' '), - EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle), - EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[0]), - EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[0]), - EGL_NONE - }; - - surf->y_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, y_attribs); - assert(surf->y_egl_image != EGL_NO_IMAGE_KHR); - - // Associate Y image to a texture. - glBindTexture(GL_TEXTURE_2D, *y_tex); - glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->y_egl_image); - - // Create CbCr image. - surf->cbcr_egl_image = EGL_NO_IMAGE_KHR; - EGLint cbcr_attribs[] = { - EGL_WIDTH, frame_width, - EGL_HEIGHT, frame_height, - EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('G', 'R', '8', '8'), - EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle), - EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[1]), - EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[1]), - EGL_NONE - }; - - surf->cbcr_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, cbcr_attribs); - assert(surf->cbcr_egl_image != EGL_NO_IMAGE_KHR); - - // Associate CbCr image to a texture. 
- glBindTexture(GL_TEXTURE_2D, *cbcr_tex); - glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image); + *y_tex = surf->y_tex; + *cbcr_tex = surf->cbcr_tex; + } else { + surf->y_tex = *y_tex; + surf->cbcr_tex = *cbcr_tex; + } + + if (!global_flags.x264_video_to_disk) { + VAStatus va_status = vaDeriveImage(va_dpy->va_dpy, surf->src_surface, &surf->surface_image); + CHECK_VASTATUS(va_status, "vaDeriveImage"); + + if (use_zerocopy) { + VABufferInfo buf_info; + buf_info.mem_type = VA_SURFACE_ATTRIB_MEM_TYPE_DRM_PRIME; // or VA_SURFACE_ATTRIB_MEM_TYPE_KERNEL_DRM? + va_status = vaAcquireBufferHandle(va_dpy->va_dpy, surf->surface_image.buf, &buf_info); + CHECK_VASTATUS(va_status, "vaAcquireBufferHandle"); + + // Create Y image. + surf->y_egl_image = EGL_NO_IMAGE_KHR; + EGLint y_attribs[] = { + EGL_WIDTH, frame_width, + EGL_HEIGHT, frame_height, + EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('R', '8', ' ', ' '), + EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle), + EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[0]), + EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[0]), + EGL_NONE + }; + + surf->y_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, y_attribs); + assert(surf->y_egl_image != EGL_NO_IMAGE_KHR); + + // Associate Y image to a texture. + glBindTexture(GL_TEXTURE_2D, *y_tex); + glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->y_egl_image); + + // Create CbCr image. + surf->cbcr_egl_image = EGL_NO_IMAGE_KHR; + EGLint cbcr_attribs[] = { + EGL_WIDTH, frame_width / 2, + EGL_HEIGHT, frame_height / 2, + EGL_LINUX_DRM_FOURCC_EXT, fourcc_code('G', 'R', '8', '8'), + EGL_DMA_BUF_PLANE0_FD_EXT, EGLint(buf_info.handle), + EGL_DMA_BUF_PLANE0_OFFSET_EXT, EGLint(surf->surface_image.offsets[1]), + EGL_DMA_BUF_PLANE0_PITCH_EXT, EGLint(surf->surface_image.pitches[1]), + EGL_NONE + }; + + surf->cbcr_egl_image = eglCreateImageKHR(eglGetCurrentDisplay(), EGL_NO_CONTEXT, EGL_LINUX_DMA_BUF_EXT, NULL, cbcr_attribs); + assert(surf->cbcr_egl_image != EGL_NO_IMAGE_KHR); + + // Associate CbCr image to a texture. + glBindTexture(GL_TEXTURE_2D, *cbcr_tex); + glEGLImageTargetTexture2DOES(GL_TEXTURE_2D, surf->cbcr_egl_image); + } } + current_video_frame = PendingFrame{ {}, input_frames, pts, duration, ycbcr_coefficients }; + return true; } void QuickSyncEncoderImpl::add_audio(int64_t pts, vector audio) { + lock_guard lock(file_audio_encoder_mutex); assert(!is_shutdown); - { - unique_lock lock(frame_queue_mutex); - pending_audio_frames[pts] = move(audio); - } - frame_queue_nonempty.notify_all(); + file_audio_encoder->encode_audio(audio, pts + global_delay()); } -RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration, const vector &input_frames) +RefCountedGLsync QuickSyncEncoderImpl::end_frame() { assert(!is_shutdown); if (!use_zerocopy) { - GLSurface *surf = &gl_surfaces[current_storage_frame % SURFACE_NUM]; + GLenum type = global_flags.x264_bit_depth > 8 ? 
GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE; + GLSurface *surf; + { + unique_lock lock(storage_task_queue_mutex); + surf = surface_for_frame[current_storage_frame]; + assert(surf != nullptr); + } glPixelStorei(GL_PACK_ROW_LENGTH, 0); check_error(); @@ -1878,14 +1714,17 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration, glBindTexture(GL_TEXTURE_2D, surf->y_tex); check_error(); - glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->y_offset)); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RED, type, BUFFER_OFFSET(surf->y_offset)); check_error(); glBindTexture(GL_TEXTURE_2D, surf->cbcr_tex); check_error(); - glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, GL_UNSIGNED_BYTE, BUFFER_OFFSET(surf->cbcr_offset)); + glGetTexImage(GL_TEXTURE_2D, 0, GL_RG, type, BUFFER_OFFSET(surf->cbcr_offset)); check_error(); + // We don't own these; the caller does. + surf->y_tex = surf->cbcr_tex = 0; + glBindTexture(GL_TEXTURE_2D, 0); check_error(); glBindBuffer(GL_PIXEL_PACK_BUFFER, 0); @@ -1902,7 +1741,8 @@ RefCountedGLsync QuickSyncEncoderImpl::end_frame(int64_t pts, int64_t duration, { unique_lock lock(frame_queue_mutex); - pending_video_frames[current_storage_frame] = PendingFrame{ fence, input_frames, pts, duration }; + current_video_frame.fence = fence; + pending_video_frames.push(move(current_video_frame)); ++current_storage_frame; } frame_queue_nonempty.notify_all(); @@ -1928,14 +1768,26 @@ void QuickSyncEncoderImpl::shutdown() storage_task_queue_changed.notify_all(); } storage_thread.join(); - encode_remaining_audio(); - release_encode(); - deinit_va(); - file_mux.reset(); + // Encode any leftover audio in the queues, and also any delayed frames. + { + lock_guard lock(file_audio_encoder_mutex); + file_audio_encoder->encode_last_audio(); + } + + if (!global_flags.x264_video_to_disk) { + release_encode(); + va_dpy.reset(); + } is_shutdown = true; } +void QuickSyncEncoderImpl::close_file() +{ + file_mux.reset(); + metric_current_file_start_time_seconds = 0.0 / 0.0; +} + void QuickSyncEncoderImpl::open_output_file(const std::string &filename) { AVFormatContext *avctx = avformat_alloc_context(); @@ -1951,106 +1803,127 @@ void QuickSyncEncoderImpl::open_output_file(const std::string &filename) exit(1); } - file_mux.reset(new Mux(avctx, frame_width, frame_height, Mux::CODEC_H264, file_audio_encoder->get_codec(), TIMEBASE, DEFAULT_AUDIO_OUTPUT_BIT_RATE, nullptr)); + string video_extradata; // FIXME: See other comment about global headers. + if (global_flags.x264_video_to_disk) { + video_extradata = x264_encoder->get_global_headers(); + } + + current_file_mux_metrics.reset(); + + { + lock_guard lock(file_audio_encoder_mutex); + AVCodecParametersWithDeleter audio_codecpar = file_audio_encoder->get_codec_parameters(); + file_mux.reset(new Mux(avctx, frame_width, frame_height, Mux::CODEC_H264, video_extradata, audio_codecpar.get(), TIMEBASE, + std::bind(&DiskSpaceEstimator::report_write, disk_space_estimator, filename, _1), + Mux::WRITE_BACKGROUND, + { ¤t_file_mux_metrics, &total_mux_metrics })); + } + metric_current_file_start_time_seconds = get_timestamp_for_metrics(); + + if (global_flags.x264_video_to_disk) { + x264_encoder->add_mux(file_mux.get()); + } } void QuickSyncEncoderImpl::encode_thread_func() { + pthread_setname_np(pthread_self(), "QS_Encode"); + int64_t last_dts = -1; int gop_start_display_frame_num = 0; - for (int encoding_frame_num = 0; ; ++encoding_frame_num) { + for (int display_frame_num = 0; ; ++display_frame_num) { + // Wait for the frame to be in the queue. 
 void QuickSyncEncoderImpl::encode_thread_func()
 {
+	pthread_setname_np(pthread_self(), "QS_Encode");
+
 	int64_t last_dts = -1;
 	int gop_start_display_frame_num = 0;
-	for (int encoding_frame_num = 0; ; ++encoding_frame_num) {
+	for (int display_frame_num = 0; ; ++display_frame_num) {
+		// Wait for the frame to be in the queue. Note that this only means
+		// we started rendering it.
 		PendingFrame frame;
-		int pts_lag;
-		int frame_type, display_frame_num;
-		encoding2display_order(encoding_frame_num, intra_period, intra_idr_period, ip_period,
-		                       &display_frame_num, &frame_type, &pts_lag);
-		if (frame_type == FRAME_IDR) {
-			numShortTerm = 0;
-			current_frame_num = 0;
-			gop_start_display_frame_num = display_frame_num;
-		}
-
 		{
 			unique_lock<mutex> lock(frame_queue_mutex);
-			frame_queue_nonempty.wait(lock, [this, display_frame_num]{
-				return encode_thread_should_quit || pending_video_frames.count(display_frame_num) != 0;
+			frame_queue_nonempty.wait(lock, [this]{
+				return encode_thread_should_quit || !pending_video_frames.empty();
 			});
-			if (encode_thread_should_quit && pending_video_frames.count(display_frame_num) == 0) {
-				// We have queued frames that were supposed to be B-frames,
-				// but will be no P-frame to encode them against. Encode them all
-				// as P-frames instead. Note that this happens under the mutex,
+			if (encode_thread_should_quit && pending_video_frames.empty()) {
+				// We may have queued frames left in the reorder buffer
+				// that were supposed to be B-frames, but have no P-frame
+				// to be encoded against. If so, encode them all as
+				// P-frames instead. Note that this happens under the mutex,
 				// but nobody else uses it at this point, since we're shutting down,
 				// so there's no contention.
-				encode_remaining_frames_as_p(encoding_frame_num, gop_start_display_frame_num, last_dts);
+				encode_remaining_frames_as_p(quicksync_encoding_frame_num, gop_start_display_frame_num, last_dts);
 				return;
 			} else {
-				frame = move(pending_video_frames[display_frame_num]);
-				pending_video_frames.erase(display_frame_num);
+				frame = move(pending_video_frames.front());
+				pending_video_frames.pop();
 			}
 		}
 
-		// Determine the dts of this frame.
-		int64_t dts;
-		if (pts_lag == -1) {
-			assert(last_dts != -1);
-			dts = last_dts + (TIMEBASE / MAX_FPS);
-		} else {
-			dts = frame.pts - pts_lag;
+		// Pass the frame on to x264 (or uncompressed to HTTP) as needed.
+		// Note that this implicitly waits for the frame to be done rendering.
+		pass_frame(frame, display_frame_num, frame.pts, frame.duration);
+
+		if (global_flags.x264_video_to_disk) {
+			unique_lock<mutex> lock(storage_task_queue_mutex);
+			release_gl_surface(display_frame_num);
+			continue;
 		}
-		last_dts = dts;
 
-		encode_frame(frame, encoding_frame_num, display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration);
+		reorder_buffer[display_frame_num] = move(frame);
+
+		// Now encode as many QuickSync frames as we can using the frames we have available.
+		// (It could be zero, or it could be multiple.) FIXME: make a function.
+		for ( ;; ) {
+			int pts_lag;
+			int frame_type, quicksync_display_frame_num;
+			encoding2display_order(quicksync_encoding_frame_num, intra_period, intra_idr_period, ip_period,
+			                       &quicksync_display_frame_num, &frame_type, &pts_lag);
+			if (!reorder_buffer.count(quicksync_display_frame_num)) {
+				break;
+			}
+			frame = move(reorder_buffer[quicksync_display_frame_num]);
+			reorder_buffer.erase(quicksync_display_frame_num);
+
+			if (frame_type == FRAME_IDR) {
+				// Release any reference frames from the previous GOP.
+				{
+					unique_lock<mutex> lock(storage_task_queue_mutex);
+					for (const ReferenceFrame &frame : reference_frames) {
+						release_gl_surface(frame.display_number);
+					}
+				}
+				reference_frames.clear();
+				current_ref_frame_num = 0;
+				gop_start_display_frame_num = quicksync_display_frame_num;
+			}
+
+			// Determine the dts of this frame.
+			int64_t dts;
+			if (pts_lag == -1) {
+				assert(last_dts != -1);
+				dts = last_dts + (TIMEBASE / MAX_FPS);
+			} else {
+				dts = frame.pts - pts_lag;
+			}
+			last_dts = dts;
+
+			encode_frame(frame, quicksync_encoding_frame_num, quicksync_display_frame_num, gop_start_display_frame_num, frame_type, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
+			++quicksync_encoding_frame_num;
+		}
 	}
 }
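To make the draining loop concrete: with ip_period == 3 (two B-frames between anchor frames) and, for the sake of the example, an IDR only at frame 0, encoding2display_order() produces an order like this (a worked illustration, not code from the patch):

	// quicksync_encoding_frame_num:  0    1    2    3    4    5    6
	// quicksync_display_frame_num:   0    3    1    2    6    4    5
	// frame_type:                    IDR  P    B    B    P    B    B

So once display frame 3 has been rendered, reorder_buffer holds display frames {1, 2, 3}, and a single pass of the inner loop emits P3, then B1 and B2; after display frame 4 arrives, it emits nothing, since frame 6 is not available yet. The dts follows along: frames reported with pts_lag == -1 simply advance dts by one nominal frame duration (TIMEBASE / MAX_FPS), while the others take dts = pts - pts_lag, keeping dts monotonically increasing.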
 void QuickSyncEncoderImpl::encode_remaining_frames_as_p(int encoding_frame_num, int gop_start_display_frame_num, int64_t last_dts)
 {
-	if (pending_video_frames.empty()) {
+	if (reorder_buffer.empty()) {
 		return;
 	}
 
-	for (auto &pending_frame : pending_video_frames) {
+	for (auto &pending_frame : reorder_buffer) {
 		int display_frame_num = pending_frame.first;
 		assert(display_frame_num > 0);
 		PendingFrame frame = move(pending_frame.second);
 		int64_t dts = last_dts + (TIMEBASE / MAX_FPS);
 		printf("Finalizing encode: Encoding leftover frame %d as P-frame instead of B-frame.\n", display_frame_num);
-		encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration);
+		encode_frame(frame, encoding_frame_num++, display_frame_num, gop_start_display_frame_num, FRAME_P, frame.pts, dts, frame.duration, frame.ycbcr_coefficients);
 		last_dts = dts;
 	}
-
-	if (global_flags.uncompressed_video_to_http ||
-	    global_flags.x264_video_to_http) {
-		// Add frames left in reorderer.
-		while (!reorderer->empty()) {
-			FrameReorderer::Frame output_frame = reorderer->get_first_frame();
-			if (global_flags.uncompressed_video_to_http) {
-				add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
-			} else {
-				assert(global_flags.x264_video_to_http);
-				x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data);
-			}
-		}
-	}
-}
-
-void QuickSyncEncoderImpl::encode_remaining_audio()
-{
-	// This really ought to be empty by now, but just to be sure...
-	for (auto &pending_frame : pending_audio_frames) {
-		int64_t audio_pts = pending_frame.first;
-		vector<float> audio = move(pending_frame.second);
-
-		file_audio_encoder->encode_audio(audio, audio_pts + global_delay());
-		if (stream_audio_encoder) {
-			stream_audio_encoder->encode_audio(audio, audio_pts + global_delay());
-		}
-	}
-	pending_audio_frames.clear();
-
-	// Encode any leftover audio in the queues, and also any delayed frames.
-	// Note: stream_audio_encoder is not owned by us, so don't call encode_last_audio().
-	file_audio_encoder->encode_last_audio();
 }
 
 void QuickSyncEncoderImpl::add_packet_for_uncompressed_frame(int64_t pts, int64_t duration, const uint8_t *data)
@@ -2083,31 +1956,64 @@ void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_
 
 }  // namespace
 
-void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
-                                        int frame_type, int64_t pts, int64_t dts, int64_t duration)
+void QuickSyncEncoderImpl::pass_frame(QuickSyncEncoderImpl::PendingFrame frame, int display_frame_num, int64_t pts, int64_t duration)
 {
 	// Wait for the GPU to be done with the frame.
 	GLenum sync_status;
 	do {
-		sync_status = glClientWaitSync(frame.fence.get(), 0, 1000000000);
+		sync_status = glClientWaitSync(frame.fence.get(), 0, 0);
 		check_error();
+		if (sync_status == GL_TIMEOUT_EXPIRED) {
+			// NVIDIA likes to busy-wait; yield instead.
+			this_thread::sleep_for(milliseconds(1));
+		}
 	} while (sync_status == GL_TIMEOUT_EXPIRED);
 	assert(sync_status != GL_WAIT_FAILED);
 
+	ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
+	static int frameno = 0;
+	print_latency("Current mixer latency (video inputs → ready for encode):",
+		received_ts, false, &frameno, &mixer_latency_histogram);
+
 	// Release back any input frames we needed to render this frame.
 	frame.input_frames.clear();
 
-	GLSurface *surf = &gl_surfaces[display_frame_num % SURFACE_NUM];
+	GLSurface *surf;
+	{
+		unique_lock<mutex> lock(storage_task_queue_mutex);
+		surf = surface_for_frame[display_frame_num];
+		assert(surf != nullptr);
+	}
+
+	uint8_t *data = reinterpret_cast<uint8_t *>(surf->y_ptr);
+	if (global_flags.uncompressed_video_to_http) {
+		add_packet_for_uncompressed_frame(pts, duration, data);
+	} else if (global_flags.x264_video_to_http || global_flags.x264_video_to_disk) {
+		x264_encoder->add_frame(pts, duration, frame.ycbcr_coefficients, data, received_ts);
+	}
+}
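memcpy_with_pitch(), referenced in the hunk header above and used by encode_frame() below, copies a tightly packed plane into a buffer whose rows have a (possibly larger) pitch. Its body falls between the hunks shown here; a minimal sketch of such a pitched copy (details of the real helper may differ):

	// Sketch of a pitched plane copy: each of `height` source rows of
	// `src_width` bytes lands at a stride of `dst_pitch` bytes in dst.
	void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_t dst_pitch, size_t height)
	{
		if (src_width == dst_pitch) {
			memcpy(dst, src, src_width * height);  // Rows are contiguous; one block.
		} else {
			for (size_t y = 0; y < height; ++y) {
				memcpy(dst + y * dst_pitch, src + y * src_width, src_width);
			}
		}
	}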
+
+void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame, int encoding_frame_num, int display_frame_num, int gop_start_display_frame_num,
+                                        int frame_type, int64_t pts, int64_t dts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients)
+{
+	const ReceivedTimestamps received_ts = find_received_timestamp(frame.input_frames);
+
+	GLSurface *surf;
+	{
+		unique_lock<mutex> lock(storage_task_queue_mutex);
+		surf = surface_for_frame[display_frame_num];
+		assert(surf != nullptr);
+	}
+
 	VAStatus va_status;
 
 	if (use_zerocopy) {
 		eglDestroyImageKHR(eglGetCurrentDisplay(), surf->y_egl_image);
 		eglDestroyImageKHR(eglGetCurrentDisplay(), surf->cbcr_egl_image);
-		va_status = vaReleaseBufferHandle(va_dpy, surf->surface_image.buf);
+		va_status = vaReleaseBufferHandle(va_dpy->va_dpy, surf->surface_image.buf);
 		CHECK_VASTATUS(va_status, "vaReleaseBufferHandle");
 	} else {
+		// Upload the frame to VA-API.
 		unsigned char *surface_p = nullptr;
-		vaMapBuffer(va_dpy, surf->surface_image.buf, (void **)&surface_p);
+		vaMapBuffer(va_dpy->va_dpy, surf->surface_image.buf, (void **)&surface_p);
 
 		unsigned char *va_y_ptr = (unsigned char *)surface_p + surf->surface_image.offsets[0];
 		memcpy_with_pitch(va_y_ptr, surf->y_ptr, frame_width, surf->surface_image.pitches[0], frame_height);
@@ -2115,49 +2021,56 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 		unsigned char *va_cbcr_ptr = (unsigned char *)surface_p + surf->surface_image.offsets[1];
 		memcpy_with_pitch(va_cbcr_ptr, surf->cbcr_ptr, (frame_width / 2) * sizeof(uint16_t), surf->surface_image.pitches[1], frame_height / 2);
 
-		va_status = vaUnmapBuffer(va_dpy, surf->surface_image.buf);
+		va_status = vaUnmapBuffer(va_dpy->va_dpy, surf->surface_image.buf);
 		CHECK_VASTATUS(va_status, "vaUnmapBuffer");
-
-		if (global_flags.uncompressed_video_to_http ||
-		    global_flags.x264_video_to_http) {
-			// Add uncompressed video. (Note that pts == dts here.)
-			// Delay needs to match audio.
-			FrameReorderer::Frame output_frame = reorderer->reorder_frame(pts + global_delay(), duration, reinterpret_cast<uint8_t *>(surf->y_ptr));
-			if (output_frame.data != nullptr) {
-				if (global_flags.uncompressed_video_to_http) {
-					add_packet_for_uncompressed_frame(output_frame.pts, output_frame.duration, output_frame.data);
-				} else {
-					assert(global_flags.x264_video_to_http);
-					x264_encoder->add_frame(output_frame.pts, output_frame.duration, output_frame.data);
-				}
-			}
-		}
 	}
 
-	va_status = vaDestroyImage(va_dpy, surf->surface_image.image_id);
+	va_status = vaDestroyImage(va_dpy->va_dpy, surf->surface_image.image_id);
 	CHECK_VASTATUS(va_status, "vaDestroyImage");
 
 	// Schedule the frame for encoding.
 	VASurfaceID va_surface = surf->src_surface;
-	va_status = vaBeginPicture(va_dpy, context_id, va_surface);
+	va_status = vaBeginPicture(va_dpy->va_dpy, context_id, va_surface);
 	CHECK_VASTATUS(va_status, "vaBeginPicture");
 
 	if (frame_type == FRAME_IDR) {
+		// FIXME: If the mux wants global headers, we should not put the
+		// SPS/PPS before each IDR frame, but rather put it into the
+		// codec extradata (formatted differently?).
+		//
+		// NOTE: If we change ycbcr_coefficients, it will not take effect
+		// before the next IDR frame. This is acceptable, as it should only
+		// happen on a mode change, which is rare.
 		render_sequence();
-		render_picture(frame_type, display_frame_num, gop_start_display_frame_num);
+		render_picture(surf, frame_type, display_frame_num, gop_start_display_frame_num);
 		if (h264_packedheader) {
-			render_packedsequence();
+			render_packedsequence(ycbcr_coefficients);
 			render_packedpicture();
 		}
 	} else {
 		//render_sequence();
-		render_picture(frame_type, display_frame_num, gop_start_display_frame_num);
+		render_picture(surf, frame_type, display_frame_num, gop_start_display_frame_num);
 	}
 
 	render_slice(encoding_frame_num, display_frame_num, gop_start_display_frame_num, frame_type);
 
-	va_status = vaEndPicture(va_dpy, context_id);
+	va_status = vaEndPicture(va_dpy->va_dpy, context_id);
 	CHECK_VASTATUS(va_status, "vaEndPicture");
 
+	update_ReferenceFrames(display_frame_num, frame_type);
+
+	vector<size_t> ref_display_frame_numbers;
+
+	// Lock the references for this frame; otherwise, they could be
+	// rendered to before this frame is done encoding.
+	{
+		unique_lock<mutex> lock(storage_task_queue_mutex);
+		for (const ReferenceFrame &frame : reference_frames) {
+			assert(surface_for_frame.count(frame.display_number));
+			++surface_for_frame[frame.display_number]->refcount;
+			ref_display_frame_numbers.push_back(frame.display_number);
		}
+	}
+
 	// so now the data is done encoding (well, async job kicked off)...
 	// we send that to the storage thread
 	storage_task tmp;
@@ -2166,14 +2079,15 @@ void QuickSyncEncoderImpl::encode_frame(QuickSyncEncoderImpl::PendingFrame frame
 	tmp.pts = pts;
 	tmp.dts = dts;
 	tmp.duration = duration;
+	tmp.ycbcr_coefficients = ycbcr_coefficients;
+	tmp.received_ts = received_ts;
+	tmp.ref_display_frame_numbers = move(ref_display_frame_numbers);
 	storage_task_enqueue(move(tmp));
-
-	update_ReferenceFrames(frame_type);
 }
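Each reference surface locked above is released again once the storage thread has finished with the encoded frame, by calling release_gl_surface() for every entry of ref_display_frame_numbers (plus the frame's own surface). The body of release_gl_surface() lies outside this diff; a hypothetical sketch of the refcount scheme, with surface_freelist and surface_freelist_changed as assumed member names:

	// Hypothetical sketch; the real release_gl_surface() is not part of this diff.
	// Must be called with storage_task_queue_mutex held.
	void QuickSyncEncoderImpl::release_gl_surface(size_t display_frame_num)
	{
		GLSurface *surf = surface_for_frame[display_frame_num];
		assert(surf->refcount > 0);
		if (--surf->refcount == 0) {
			// No in-flight frame references this surface anymore;
			// hand it back for begin_frame() to reuse.
			surface_for_frame.erase(display_frame_num);
			surface_freelist.push_back(surf);
			surface_freelist_changed.notify_all();
		}
	}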
 // Proxy object.
-QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, QSurface *surface, const string &va_display, int width, int height, Mux *stream_mux, AudioEncoder *stream_audio_encoder, X264Encoder *x264_encoder)
-	: impl(new QuickSyncEncoderImpl(filename, surface, va_display, width, height, stream_mux, stream_audio_encoder, x264_encoder)) {}
+QuickSyncEncoder::QuickSyncEncoder(const std::string &filename, ResourcePool *resource_pool, QSurface *surface, const string &va_display, int width, int height, AVOutputFormat *oformat, X264Encoder *x264_encoder, DiskSpaceEstimator *disk_space_estimator)
+	: impl(new QuickSyncEncoderImpl(filename, resource_pool, surface, va_display, width, height, oformat, x264_encoder, disk_space_estimator)) {}
 
 // Must be defined here because unique_ptr<> destructor needs to know the impl.
 QuickSyncEncoder::~QuickSyncEncoder() {}
@@ -2183,17 +2097,73 @@ void QuickSyncEncoder::add_audio(int64_t pts, vector<float> audio)
 	impl->add_audio(pts, audio);
 }
 
-bool QuickSyncEncoder::begin_frame(GLuint *y_tex, GLuint *cbcr_tex)
+bool QuickSyncEncoder::is_zerocopy() const
 {
-	return impl->begin_frame(y_tex, cbcr_tex);
+	return impl->is_zerocopy();
 }
 
-RefCountedGLsync QuickSyncEncoder::end_frame(int64_t pts, int64_t duration, const vector<RefCountedFrame> &input_frames)
+bool QuickSyncEncoder::begin_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients ycbcr_coefficients, const vector<RefCountedFrame> &input_frames, GLuint *y_tex, GLuint *cbcr_tex)
 {
-	return impl->end_frame(pts, duration, input_frames);
+	return impl->begin_frame(pts, duration, ycbcr_coefficients, input_frames, y_tex, cbcr_tex);
+}
+
+RefCountedGLsync QuickSyncEncoder::end_frame()
+{
+	return impl->end_frame();
 }
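Taken together, the proxy gives callers a small per-frame protocol. A hypothetical caller sketch (the real driver is the mixer code, not shown here; create_2d_texture() is assumed to be Movit's ResourcePool API):

	GLuint y_tex, cbcr_tex;
	if (!quicksync->is_zerocopy()) {
		// In the non-zerocopy case, the caller owns and supplies the textures;
		// in the zerocopy case, begin_frame() overwrites them with the
		// textures aliasing the VA-API surface.
		y_tex = resource_pool->create_2d_texture(GL_R8, width, height);
		cbcr_tex = resource_pool->create_2d_texture(GL_RG8, width / 2, height / 2);
	}
	quicksync->begin_frame(pts, duration, ycbcr_coefficients, input_frames, &y_tex, &cbcr_tex);
	// ... render the Y and CbCr planes into y_tex/cbcr_tex ...
	RefCountedGLsync fence = quicksync->end_frame();  // Fences the GL work, then queues for encoding.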
 void QuickSyncEncoder::shutdown()
 {
 	impl->shutdown();
 }
+
+void QuickSyncEncoder::close_file()
+{
+	impl->close_file();
+}
+
+void QuickSyncEncoder::set_stream_mux(Mux *mux)
+{
+	impl->set_stream_mux(mux);
+}
+
+int64_t QuickSyncEncoder::global_delay() const {
+	return impl->global_delay();
+}
+
+string QuickSyncEncoder::get_usable_va_display()
+{
+	// First try the default (ie., whatever $DISPLAY is set to).
+	unique_ptr<VADisplayWithCleanup> va_dpy = try_open_va("", nullptr, nullptr);
+	if (va_dpy != nullptr) {
+		return "";
+	}
+
+	fprintf(stderr, "No --va-display was given, and the X11 display did not expose a VA-API H.264 encoder.\n");
+
+	// Try all /dev/dri/render* in turn. TODO: Accept /dev/dri/card*, too?
+	glob_t g;
+	int err = glob("/dev/dri/renderD*", 0, nullptr, &g);
+	if (err != 0) {
+		fprintf(stderr, "Couldn't list render nodes (%s) when trying to autodetect a replacement.\n", strerror(errno));
+	} else {
+		for (size_t i = 0; i < g.gl_pathc; ++i) {
+			string path = g.gl_pathv[i];
+			va_dpy = try_open_va(path, nullptr, nullptr);
+			if (va_dpy != nullptr) {
+				fprintf(stderr, "Autodetected %s as a suitable replacement; using it.\n",
+					path.c_str());
+				globfree(&g);
+				return path;
+			}
+		}
+	}
+
+	fprintf(stderr, "No suitable VA-API H.264 encoders were found in /dev/dri; giving up.\n");
+	fprintf(stderr, "Note that if you are using an Intel CPU with an external GPU,\n");
+	fprintf(stderr, "you may need to enable the integrated Intel GPU in your BIOS\n");
+	fprintf(stderr, "to expose Quick Sync. Alternatively, you can use --record-x264-video\n");
+	fprintf(stderr, "to use software instead of hardware H.264 encoding, at the expense\n");
+	fprintf(stderr, "of increased CPU usage and possibly bit rate.\n");
+	exit(1);
+}
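Since get_usable_va_display() touches no instance state, it can run during startup flag handling, before any encoder exists. A hypothetical call site (the real flag parsing lives elsewhere; global_flags.va_display is an assumed name):

	// Hypothetical: resolve an unset --va-display before constructing the encoder.
	if (global_flags.va_display.empty()) {
		global_flags.va_display = QuickSyncEncoder::get_usable_va_display();
	}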