Add support for 10-bit AV1 encoding.

[nageru] / nageru / av1_encoder.cpp
diff --git a/nageru/av1_encoder.cpp b/nageru/av1_encoder.cpp

index a0dbdd1a8e17a4e3297a42fa32c67af2bf312ec9..568cd5f9acc3673c425f393fd145416b6e8782e8 100644 (file)
--- a/nageru/av1_encoder.cpp
+++ b/nageru/av1_encoder.cpp
@@ -63,7 +63,7 @@ AV1Encoder::AV1Encoder(const AVOutputFormat *oformat)
                         av1_latency_histogram.init("av1");
                 });
  
-       const size_t bytes_per_pixel = 1;  // TODO: 10-bit support.
+       const size_t bytes_per_pixel = global_flags.av1_bit_depth > 8 ? 2 : 1;
         frame_pool.reset(new uint8_t[global_flags.width * global_flags.height * 2 * bytes_per_pixel * AV1_QUEUE_LENGTH]);
         for (unsigned i = 0; i < AV1_QUEUE_LENGTH; ++i) {
                 free_frames.push(frame_pool.get() + i * (global_flags.width * global_flags.height * 2 * bytes_per_pixel));
@@ -102,8 +102,9 @@ void AV1Encoder::add_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients
  
         // Since we're copying anyway, we can unpack from NV12 to fully planar on the fly.
         // SVT-AV1 makes its own copy, though, and it would have been nice to avoid the
-       // double-copy.
-       size_t bytes_per_pixel = 1;  // TODO: 10-bit support.
+       // double-copy (and also perhaps let the GPU do the 10-bit compression SVT-AV1
+       // wants, instead of doing it on the CPU).
+       const size_t bytes_per_pixel = global_flags.av1_bit_depth > 8 ? 2 : 1;
         size_t frame_size = global_flags.width * global_flags.height * bytes_per_pixel;
         assert(global_flags.width % 2 == 0);
         assert(global_flags.height % 2 == 0);
@@ -111,7 +112,14 @@ void AV1Encoder::add_frame(int64_t pts, int64_t duration, YCbCrLumaCoefficients
         uint8_t *cb = y + frame_size;
         uint8_t *cr = cb + frame_size / 4;
         memcpy(y, data, frame_size);
-       memcpy_interleaved(cb, cr, data + frame_size, frame_size / 2);
+       if (global_flags.av1_bit_depth == 8) {
+               memcpy_interleaved(cb, cr, data + frame_size, frame_size / 2);
+       } else {
+               const uint16_t *src = reinterpret_cast<const uint16_t *>(data + frame_size);
+               uint16_t *cb16 = reinterpret_cast<uint16_t *>(cb);
+               uint16_t *cr16 = reinterpret_cast<uint16_t *>(cr);
+               memcpy_interleaved_word(cb16, cr16, src, frame_size / 4);
+       }
  
         {
                 lock_guard<mutex> lock(mu);
@@ -136,7 +144,7 @@ void AV1Encoder::init_av1()
         config.source_height = global_flags.height;
         config.frame_rate_numerator = global_flags.av1_fps_num;
         config.frame_rate_denominator = global_flags.av1_fps_den;
-       config.encoder_bit_depth = 8;  // TODO: 10-bit support.
+       config.encoder_bit_depth = global_flags.av1_bit_depth;
         config.rate_control_mode = 2;  // CBR.
         config.pred_structure = 1;  // PRED_LOW_DELAY_B (needed for CBR).
         config.target_bit_rate = global_flags.av1_bitrate * 1000;
@@ -273,23 +281,25 @@ void AV1Encoder::encoder_thread_func()
  void AV1Encoder::encode_frame(AV1Encoder::QueuedFrame qf)
  {
         if (qf.data) {
+               const size_t bytes_per_pixel = global_flags.av1_bit_depth > 8 ? 2 : 1;
+
                 EbSvtIOFormat pic;
                 pic.luma = qf.data;     
-               pic.cb = pic.luma + global_flags.width * global_flags.height;
-               pic.cr = pic.cb + global_flags.width * global_flags.height / 4;
-               pic.y_stride = global_flags.width;
-               pic.cb_stride = global_flags.width / 2;
-               pic.cr_stride = global_flags.width / 2;
+               pic.cb = pic.luma + global_flags.width * global_flags.height * bytes_per_pixel;
+               pic.cr = pic.cb + (global_flags.width * global_flags.height / 4) * bytes_per_pixel;
+               pic.y_stride = global_flags.width;  // In pixels, so no bytes_per_pixel.
+               pic.cb_stride = global_flags.width / 2;  // Likewise.
+               pic.cr_stride = global_flags.width / 2;  // Likewise.
                 pic.width = global_flags.width;
                 pic.height = global_flags.height;
                 pic.origin_x = 0;
                 pic.origin_y = 0;
                 pic.color_fmt = EB_YUV420;
-               pic.bit_depth = EB_EIGHT_BIT;  // TODO: 10-bit.
+               pic.bit_depth = global_flags.av1_bit_depth > 8 ? EB_TEN_BIT : EB_EIGHT_BIT;
  
                 EbBufferHeaderType hdr;
                 hdr.p_buffer      = reinterpret_cast<uint8_t *>(&pic);
-               hdr.n_alloc_len   = global_flags.width * global_flags.height * 3 / 2;  // TODO: 10-bit.
+               hdr.n_alloc_len   = (global_flags.width * global_flags.height * 3 / 2) * bytes_per_pixel;
                 hdr.n_filled_len  = hdr.n_alloc_len;
                 hdr.n_tick_count  = 0;
                 hdr.p_app_private = reinterpret_cast<void *>(intptr_t(qf.duration));