Besides the obvious benefit of spending less time copying, this has two positive effects:
- The VA-API thread is no longer a choke point; uploading can happen from
multiple cores.
- With one copy less, we seem to be reducing L3 cache pressure a bit;
at some point between five and six 1080p sources, we “fall off a cliff”
wrt. the L3 and start thrashing. This doesn't fix the issue, but alleviates
it somewhat.
All in all, we seem to go down from ~2.6 to ~2.1–2.2 cores used with one
720p channel and five 1080p channels. I haven't tried saturating channels
yet to see how many we can actually encode.
-Subproject commit 5163d25c65c3028090db1aea6587ec2fb4cb823e
+Subproject commit 03e38890b599efe6ac906fdb70b43cda63f11d01
assert(stride == width * 2);
}
- current_video_frame = video_frame_allocator->alloc_frame();
+ current_video_frame = video_frame_allocator->create_frame(width, height, stride);
if (current_video_frame.data != nullptr) {
const uint8_t *src;
video_frame->GetBytes((void **)&src);
card->capture->set_frame_callback(bind(&Mixer::bm_frame, this, card_index, _1, _2, _3, _4, _5, _6, _7));
if (card->frame_allocator == nullptr) {
- card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height)); // 8 MB.
+ card->frame_allocator.reset(new PBOFrameAllocator(pixel_format, 8 << 20, global_flags.width, global_flags.height, card_index, mjpeg_encoder.get())); // 8 MB.
}
card->capture->set_video_frame_allocator(card->frame_allocator.get());
if (card->surface == nullptr) {
new_frame->upload_func = nullptr;
}
- // Only bother doing MJPEG encoding if there are any connected clients
- // that want the stream. FIXME: We should also stop memcpy-ing if there are none!
- if (httpd.get_num_connected_multicam_clients() > 0) {
- auto stream_it = global_flags.card_to_mjpeg_stream_export.find(card_index);
- if (stream_it != global_flags.card_to_mjpeg_stream_export.end()) {
- mjpeg_encoder->upload_frame(pts_int, stream_it->second, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset);
+ if (new_frame->frame->data_copy != nullptr) {
+ int mjpeg_card_index = mjpeg_encoder->get_mjpeg_stream_for_card(card_index);
+ if (mjpeg_card_index == -1) {
+ mjpeg_encoder->finish_frame(new_frame->frame);
+ } else {
+ mjpeg_encoder->upload_frame(pts_int, mjpeg_card_index, new_frame->frame, new_frame->video_format, new_frame->y_offset, new_frame->cbcr_offset);
}
}
}
extern void memcpy_with_pitch(uint8_t *dst, const uint8_t *src, size_t src_width, size_t dst_pitch, size_t height);
-#define CHECK_VASTATUS(va_status, func) \
- if (va_status != VA_STATUS_SUCCESS) { \
- fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
- exit(1); \
- }
-
// From libjpeg (although it's of course identical between implementations).
static const int jpeg_natural_order[DCTSIZE2] = {
0, 1, 8, 16, 9, 2, 3, 10,
any_frames_to_be_encoded.notify_all();
}
+// Release the VA-API state bound to a frame that was set up for direct
+// VA-API upload in PBOFrameAllocator::create_frame() (data_copy pointing
+// into a mapped VAImage), but that will never go through upload_frame()
+// — e.g. because MJPEG export for the card was switched off between
+// allocation and delivery.
+void MJPEGEncoder::finish_frame(RefCountedFrame frame)
+{
+	PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)frame->userdata;
+
+	if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) {
+		// Take ownership back from the frame's userdata; when “release”
+		// goes out of scope at the end of this block, it gives the
+		// resources back (RAII).
+		VAResources resources __attribute__((unused)) = move(userdata->va_resources);
+		ReleaseVAResources release = move(userdata->va_resources_release);
+		VAImage image = move(userdata->va_image);
+
+		// Undo the vaMapBuffer()/vaDeriveImage() done in create_frame().
+		VAStatus va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf);
+		CHECK_VASTATUS(va_status, "vaUnmapBuffer");
+		va_status = vaDestroyImage(va_dpy->va_dpy, image.image_id);
+		CHECK_VASTATUS(va_status, "vaDestroyImage");
+	}
+	// FROM_MALLOC frames hold no VA-API state, so there is nothing to do.
+}
+
+// Map a capture card index to the MJPEG stream its frames should be
+// exported as, or return -1 if no encoding should happen for this card
+// right now (so callers can skip the copy/upload work entirely).
+int MJPEGEncoder::get_mjpeg_stream_for_card(unsigned card_index)
+{
+	// Only bother doing MJPEG encoding if there are any connected clients
+	// that want the stream.
+	if (httpd->get_num_connected_multicam_clients() == 0) {
+		return -1;
+	}
+
+	// The card must also be explicitly mapped to an export stream.
+	auto it = global_flags.card_to_mjpeg_stream_export.find(card_index);
+	if (it == global_flags.card_to_mjpeg_stream_export.end()) {
+		return -1;
+	}
+	return it->second;
+}
+
void MJPEGEncoder::encoder_thread_func()
{
pthread_setname_np(pthread_self(), "MJPEG_Encode");
void MJPEGEncoder::encode_jpeg_va(QueuedFrame &&qf)
{
+ PBOFrameAllocator::Userdata *userdata = (PBOFrameAllocator::Userdata *)qf.frame->userdata;
unsigned width = qf.video_format.width;
unsigned height = qf.video_format.height;
- VAResources resources = get_va_resources(width, height);
- ReleaseVAResources release(this, resources);
+ VAResources resources;
+ ReleaseVAResources release;
+ if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) {
+ resources = move(userdata->va_resources);
+ release = move(userdata->va_resources_release);
+ } else {
+ assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC);
+ resources = get_va_resources(width, height);
+ release = ReleaseVAResources(this, resources);
+ }
VAData va_data = get_va_data_for_resolution(width, height);
va_data.pic_param.coded_buf = resources.data_buffer;
VABufferDestroyer destroy_slice_param(va_dpy->va_dpy, slice_param_buffer);
VAImage image;
- va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image);
- CHECK_VASTATUS(va_status, "vaDeriveImage");
+ if (userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_VA_API) {
+ // The pixel data is already uploaded by the caller.
+ image = move(userdata->va_image);
+ } else {
+ assert(userdata->data_copy_current_src == PBOFrameAllocator::Userdata::FROM_MALLOC);
- // Upload the pixel data.
- uint8_t *surface_p = nullptr;
- vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p);
+ // Upload the pixel data.
+ va_status = vaDeriveImage(va_dpy->va_dpy, resources.surface, &image);
+ CHECK_VASTATUS(va_status, "vaDeriveImage");
- size_t field_start_line = qf.video_format.extra_lines_top; // No interlacing support.
- size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2;
+ uint8_t *surface_p = nullptr;
+ vaMapBuffer(va_dpy->va_dpy, image.buf, (void **)&surface_p);
- {
- const uint8_t *src = qf.frame->data_copy + field_start;
- uint8_t *dst = (unsigned char *)surface_p + image.offsets[0];
- memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height);
+ size_t field_start_line = qf.video_format.extra_lines_top; // No interlacing support.
+ size_t field_start = qf.cbcr_offset * 2 + qf.video_format.width * field_start_line * 2;
+
+ {
+ const uint8_t *src = qf.frame->data_copy + field_start;
+ uint8_t *dst = (unsigned char *)surface_p + image.offsets[0];
+ memcpy_with_pitch(dst, src, qf.video_format.width * 2, image.pitches[0], qf.video_format.height);
+ }
}
va_status = vaUnmapBuffer(va_dpy->va_dpy, image.buf);
struct VADisplayWithCleanup;
struct VectorDestinationManager;
+// Abort with a diagnostic if a VA-API call failed; “func” names the failed
+// call for the error message. Kept as a macro (not a function) so that
+// __func__ and __LINE__ refer to the call site.
+#define CHECK_VASTATUS(va_status, func) \
+	if (va_status != VA_STATUS_SUCCESS) { \
+		fprintf(stderr, "%s:%d (%s) failed with %d\n", __func__, __LINE__, func, va_status); \
+		exit(1); \
+	}
+
class MJPEGEncoder {
public:
MJPEGEncoder(HTTPD *httpd, const std::string &va_display);
void stop();
void upload_frame(int64_t pts, unsigned card_index, RefCountedFrame frame, const bmusb::VideoFormat &video_format, size_t y_offset, size_t cbcr_offset);
+ // If the frame was started (data_copy != nullptr) but will not be finished
+	// (MJPEG encoding was turned off in the meantime), you'll need to call finish_frame()
+ // to release any VA-API resources.
+ void finish_frame(RefCountedFrame frame);
+
+ bool using_vaapi() const { return va_dpy != nullptr; }
+
+ // Returns -1 for inactive (ie., don't encode frames for this card right now).
+ int get_mjpeg_stream_for_card(unsigned card_index);
+
private:
static constexpr int quality = 90;
std::atomic<int64_t> metric_mjpeg_frames_oversized_dropped{0};
std::atomic<int64_t> metric_mjpeg_overrun_dropped{0};
std::atomic<int64_t> metric_mjpeg_overrun_submitted{0};
+
+ friend class PBOFrameAllocator; // FIXME
};
#endif // !defined(_MJPEG_ENCODER_H)
#include <cstddef>
#include "flags.h"
+#include "mjpeg_encoder.h"
#include "v210_converter.h"
+#include "va_display_with_cleanup.h"
using namespace std;
} // namespace
-PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits)
- : pixel_format(pixel_format), buffer(buffer)
+PBOFrameAllocator::PBOFrameAllocator(bmusb::PixelFormat pixel_format, size_t frame_size, GLuint width, GLuint height, unsigned card_index, MJPEGEncoder *mjpeg_encoder, size_t num_queued_frames, GLenum buffer, GLenum permissions, GLenum map_bits)
+ : card_index(card_index), mjpeg_encoder(mjpeg_encoder), pixel_format(pixel_format), buffer(buffer)
{
userdata.reset(new Userdata[num_queued_frames]);
for (size_t i = 0; i < num_queued_frames; ++i) {
Frame frame;
frame.data = (uint8_t *)glMapBufferRange(buffer, 0, frame_size, permissions | map_bits | GL_MAP_PERSISTENT_BIT);
frame.data2 = frame.data + frame_size / 2;
- frame.data_copy = new uint8_t[frame_size];
check_error();
frame.size = frame_size;
Userdata *ud = &userdata[frame_idx];
frame.userdata = ud;
ud->pbo = pbo;
ud->pixel_format = pixel_format;
+ ud->data_copy_malloc = new uint8_t[frame_size];
frame.owner = this;
// For 8-bit non-planar Y'CbCr, we ask the driver to split Y' and Cb/Cr
void PBOFrameAllocator::destroy_frame(Frame *frame)
{
Userdata *ud = (Userdata *)frame->userdata;
- delete[] frame->data_copy;
+ delete[] ud->data_copy_malloc;
GLuint pbo = ud->pbo;
glBindBuffer(buffer, pbo);
}
vf.len = 0;
vf.overflow = 0;
+
+ if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() &&
+ mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) {
+ Userdata *ud = (Userdata *)vf.userdata;
+ vf.data_copy = ud->data_copy_malloc;
+ ud->data_copy_current_src = Userdata::FROM_MALLOC;
+ } else {
+ vf.data_copy = nullptr;
+ }
+
+ return vf;
+}
+
+// Like alloc_frame(), but for callers that know the frame geometry
+// (width/height/stride) up front. If MJPEG export is active for this
+// card and VA-API is in use, point data_copy directly into a mapped
+// VA-API image, so that the capture thread writes the MJPEG copy
+// straight into the encode buffer — saving one memcpy on the VA-API
+// thread (see MJPEGEncoder::encode_jpeg_va()).
+bmusb::FrameAllocator::Frame PBOFrameAllocator::create_frame(size_t width, size_t height, size_t stride)
+{
+	Frame vf;
+
+	{
+		lock_guard<mutex> lock(freelist_mutex);
+		if (freelist.empty()) {
+			printf("Frame overrun (no more spare PBO frames), dropping frame!\n");
+			vf.len = 0;
+			vf.overflow = 0;
+			return vf;
+		} else {
+			vf = freelist.front();
+			freelist.pop();
+		}
+	}
+	vf.len = 0;
+	vf.overflow = 0;
+
+	Userdata *userdata = (Userdata *)vf.userdata;
+
+	if (mjpeg_encoder != nullptr && mjpeg_encoder->using_vaapi() &&
+	    mjpeg_encoder->get_mjpeg_stream_for_card(card_index) != -1) {
+		VADisplay va_dpy = mjpeg_encoder->va_dpy->va_dpy;
+		MJPEGEncoder::VAResources resources = mjpeg_encoder->get_va_resources(width, height);
+		MJPEGEncoder::ReleaseVAResources release(mjpeg_encoder, resources);
+
+		VAImage image;
+		VAStatus va_status = vaDeriveImage(va_dpy, resources.surface, &image);
+		CHECK_VASTATUS(va_status, "vaDeriveImage");
+
+		// We can only write directly if the producer's stride matches
+		// the derived image's; otherwise, fall back to staging in RAM.
+		if (image.pitches[0] == stride) {
+			// The frame now owns the VA resources; MJPEGEncoder takes
+			// them back in encode_jpeg_va() or finish_frame(). “release”
+			// is moved-from here, so it does nothing at scope exit.
+			userdata->va_resources = move(resources);
+			userdata->va_resources_release = move(release);
+			userdata->va_image = move(image);
+
+			va_status = vaMapBuffer(va_dpy, image.buf, (void **)&vf.data_copy);
+			CHECK_VASTATUS(va_status, "vaMapBuffer");
+			vf.data_copy += image.offsets[0];
+			userdata->data_copy_current_src = Userdata::FROM_VA_API;
+		} else {
+			printf("WARNING: Could not copy directly into VA-API MJPEG buffer for %zu x %zu, since producer and consumer disagreed on stride (%zu != %d).\n", width, height, stride, image.pitches[0]);
+			// Stage in the malloc-ed buffer instead; MJPEGEncoder will
+			// memcpy it into a VA-API surface later. Undo the derive,
+			// and let “release” hand back the resources at scope exit.
+			vf.data_copy = userdata->data_copy_malloc;
+			userdata->data_copy_current_src = Userdata::FROM_MALLOC;
+
+			va_status = vaDestroyImage(va_dpy, image.image_id);
+			CHECK_VASTATUS(va_status, "vaDestroyImage");
+		}
+	} else {
+		// MJPEG encoding inactive for this card; no copy needed at all.
+		vf.data_copy = nullptr;
+	}
+
	return vf;
}
#include <movit/ycbcr.h>
#include "bmusb/bmusb.h"
+#include "mjpeg_encoder.h"
+
+class MJPEGEncoder;
// An allocator that allocates straight into OpenGL pinned memory.
// Meant for video frames only. We use a queue rather than a stack,
PBOFrameAllocator(bmusb::PixelFormat pixel_format,
size_t frame_size,
GLuint width, GLuint height,
+ unsigned card_index,
+ MJPEGEncoder *mjpeg_encoder = nullptr,
size_t num_queued_frames = 16,
GLenum buffer = GL_PIXEL_UNPACK_BUFFER_ARB,
GLenum permissions = GL_MAP_WRITE_BIT,
GLenum map_bits = GL_MAP_FLUSH_EXPLICIT_BIT);
~PBOFrameAllocator() override;
Frame alloc_frame() override;
+ Frame create_frame(size_t width, size_t height, size_t stride) override;
void release_frame(Frame frame) override;
struct Userdata {
unsigned last_frame_rate_nom, last_frame_rate_den;
bool has_last_subtitle = false;
std::string last_subtitle;
+
+ // These are the source of the “data_copy” member in Frame,
+ // used for MJPEG encoding. There are three possibilities:
+ //
+ // - MJPEG encoding is not active (at all, or for this specific
+ // card). Then data_copy is nullptr, and what's in here
+ // does not matter at all.
+ // - We can encode directly into VA-API buffers (ie., VA-API
+ // is active, and nothing strange happened wrt. strides);
+ // then va_resources, va_resources_release and va_image
+ // are fetched from MJPEGEncoder at create_frame() and released
+ // back when the frame is uploaded (or would have been).
+ // In this case, data_copy points into the mapped VAImage.
+ // - If not, data_copy points to data_copy_malloc, and is copied
+ // from there into VA-API buffers (by MJPEGEncoder) if needed.
+ enum { FROM_MALLOC, FROM_VA_API } data_copy_current_src;
+ uint8_t *data_copy_malloc;
+ MJPEGEncoder::VAResources va_resources;
+ MJPEGEncoder::ReleaseVAResources va_resources_release;
+ VAImage va_image;
};
private:
void init_frame(size_t frame_idx, size_t frame_size, GLuint width, GLuint height, GLenum permissions, GLenum map_bits);
void destroy_frame(Frame *frame);
+ unsigned card_index;
+ MJPEGEncoder *mjpeg_encoder;
bmusb::PixelFormat pixel_format;
std::mutex freelist_mutex;
std::queue<Frame> freelist;