git.sesse.net Git - ffmpeg/blob - libavcodec/proresenc_kostya.c

   1 /*
   2  * Apple ProRes encoder
   3  *
   4  * Copyright (c) 2012 Konstantin Shishkov
   5  *
   6  * This encoder appears to be based on Anatoliy Wassermans considering
   7  * similarities in the bugs.
   8  *
   9  * This file is part of Libav.
  10  *
  11  * Libav is free software; you can redistribute it and/or
  12  * modify it under the terms of the GNU Lesser General Public
  13  * License as published by the Free Software Foundation; either
  14  * version 2.1 of the License, or (at your option) any later version.
  15  *
  16  * Libav is distributed in the hope that it will be useful,
  17  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  18  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  19  * Lesser General Public License for more details.
  20  *
  21  * You should have received a copy of the GNU Lesser General Public
  22  * License along with Libav; if not, write to the Free Software
  23  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  24  */
  25
  26 #include "libavutil/opt.h"
  27 #include "avcodec.h"
  28 #include "put_bits.h"
  29 #include "bytestream.h"
  30 #include "internal.h"
  31 #include "proresdsp.h"
  32 #include "proresdata.h"
  33
/* Chroma format codes. encode_init() stores one of them in
 * ProresContext.chroma_factor and encode_frame() writes it, shifted left
 * by 6, into the frame flags byte. */
#define CFACTOR_Y422 2
#define CFACTOR_Y444 3

/* Upper bound on macroblocks per slice; sizes the per-plane block buffers. */
#define MAX_MBS_PER_SLICE 8

#define MAX_PLANES 3 // should be increased to 4 when there's PIX_FMT_YUV444AP10

/* Encoder profiles; the value is used as an index into prores_profile_info[]. */
enum {
    PRORES_PROFILE_PROXY = 0,
    PRORES_PROFILE_LT,
    PRORES_PROFILE_STANDARD,
    PRORES_PROFILE_HQ,
};

/* Quantisation matrix selectors; the value is used as an index into
 * prores_quant_matrices[]. */
enum {
    QUANT_MAT_PROXY = 0,
    QUANT_MAT_LT,
    QUANT_MAT_STANDARD,
    QUANT_MAT_HQ,
    QUANT_MAT_DEFAULT,
};
  55
/*
 * Base quantisation matrices, one 64-entry table per QUANT_MAT_* selector.
 * The selected table is multiplied by the slice quantiser to build the
 * effective matrix and, unless QUANT_MAT_DEFAULT is selected, is also written
 * into the frame header (once for luma, once for chroma).
 */
static const uint8_t prores_quant_matrices[][64] = {
    { // proxy
         4,  7,  9, 11, 13, 14, 15, 63,
         7,  7, 11, 12, 14, 15, 63, 63,
         9, 11, 13, 14, 15, 63, 63, 63,
        11, 11, 13, 14, 63, 63, 63, 63,
        11, 13, 14, 63, 63, 63, 63, 63,
        13, 14, 63, 63, 63, 63, 63, 63,
        13, 63, 63, 63, 63, 63, 63, 63,
        63, 63, 63, 63, 63, 63, 63, 63,
    },
    { // LT
         4,  5,  6,  7,  9, 11, 13, 15,
         5,  5,  7,  8, 11, 13, 15, 17,
         6,  7,  9, 11, 13, 15, 15, 17,
         7,  7,  9, 11, 13, 15, 17, 19,
         7,  9, 11, 13, 14, 16, 19, 23,
         9, 11, 13, 14, 16, 19, 23, 29,
         9, 11, 13, 15, 17, 21, 28, 35,
        11, 13, 16, 17, 21, 28, 35, 41,
    },
    { // standard
         4,  4,  5,  5,  6,  7,  7,  9,
         4,  4,  5,  6,  7,  7,  9,  9,
         5,  5,  6,  7,  7,  9,  9, 10,
         5,  5,  6,  7,  7,  9,  9, 10,
         5,  6,  7,  7,  8,  9, 10, 12,
         6,  7,  7,  8,  9, 10, 12, 15,
         6,  7,  7,  9, 10, 11, 14, 17,
         7,  7,  9, 10, 11, 14, 17, 21,
    },
    { // high quality
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  5,
         4,  4,  4,  4,  4,  4,  5,  5,
         4,  4,  4,  4,  4,  5,  5,  6,
         4,  4,  4,  4,  5,  5,  6,  7,
         4,  4,  4,  4,  5,  6,  7,  7,
    },
    { // codec default
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,
    },
};
 108
/*
 * Macroblock-count thresholds for a set of reference picture sizes.
 * The table has the same length as prores_profile.br_tab.
 * NOTE(review): presumably the first limit that fits the picture selects the
 * matching br_tab column; the consumer is not visible in this part of the
 * file, so confirm against encode_init().
 */
#define NUM_MB_LIMITS 4
static const int prores_mb_limits[NUM_MB_LIMITS] = {
    1620, // up to 720x576
    2700, // up to 960x720
    6075, // up to 1440x1080
    9216, // up to 2048x1152
};
 116
/*
 * Static description of each encoder profile, indexed by PRORES_PROFILE_*.
 */
static const struct prores_profile {
    const char *full_name;              // human-readable profile name
    uint32_t    tag;                    // four-character code of the profile
    int         min_quant;              // lowest quantiser tried by the rate control search
    int         max_quant;              // highest regular quantiser tried by the search
    int         br_tab[NUM_MB_LIMITS];  // one entry per prores_mb_limits[] threshold
    int         quant;                  // QUANT_MAT_* matrix used when none is forced
} prores_profile_info[4] = {
    {
        .full_name = "proxy",
        .tag       = MKTAG('a', 'p', 'c', 'o'),
        .min_quant = 4,
        .max_quant = 8,
        .br_tab    = { 300, 242, 220, 194 },
        .quant     = QUANT_MAT_PROXY,
    },
    {
        .full_name = "LT",
        .tag       = MKTAG('a', 'p', 'c', 's'),
        .min_quant = 1,
        .max_quant = 9,
        .br_tab    = { 720, 560, 490, 440 },
        .quant     = QUANT_MAT_LT,
    },
    {
        .full_name = "standard",
        .tag       = MKTAG('a', 'p', 'c', 'n'),
        .min_quant = 1,
        .max_quant = 6,
        .br_tab    = { 1050, 808, 710, 632 },
        .quant     = QUANT_MAT_STANDARD,
    },
    {
        .full_name = "high quality",
        .tag       = MKTAG('a', 'p', 'c', 'h'),
        .min_quant = 1,
        .max_quant = 6,
        .br_tab    = { 1566, 1216, 1070, 950 },
        .quant     = QUANT_MAT_HQ,
    }
// for 4444 profile bitrate numbers are { 2350, 1828, 1600, 1425 }
};
 159
 160 #define TRELLIS_WIDTH 16
 161 #define SCORE_LIMIT   INT_MAX / 2
 162
/* One node of the per-row rate control trellis (see find_slice_quant()). */
struct TrellisNode {
    int prev_node; // index of the best predecessor node, -1 while unset
    int quant;     // quantiser this node stands for
    int bits;      // bits accumulated along the best path ending here
    int score;     // error accumulated along the best path ending here
};

/* Number of quantisers with a precomputed matrix in ProresContext.quants;
 * larger quantisers get their matrix built on the fly. */
#define MAX_STORED_Q 16
 171
/* Scratch state used by one worker of the quantiser search
 * (find_quant_thread() picks the entry by thread number). */
typedef struct ProresThreadData {
    DECLARE_ALIGNED(16, DCTELEM, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; // DCT coefficients of the current slice, per plane
    DECLARE_ALIGNED(16, uint16_t, emu_buf)[16 * 16];                              // edge emulation buffer for partial macroblocks
    int16_t custom_q[64];       // matrix built for quantisers >= MAX_STORED_Q
    struct TrellisNode *nodes;  // trellis nodes, TRELLIS_WIDTH per slice
} ProresThreadData;
 178
/* Private encoder context (avctx->priv_data). */
typedef struct ProresContext {
    AVClass *class;
    DECLARE_ALIGNED(16, DCTELEM, blocks)[MAX_PLANES][64 * 4 * MAX_MBS_PER_SLICE]; // coefficient buffer used by encode_slice()
    DECLARE_ALIGNED(16, uint16_t, emu_buf)[16*16];                                // edge emulation buffer used by encode_slice()
    int16_t quants[MAX_STORED_Q][64]; // scaled matrices, indexed by quantiser
    int16_t custom_q[64];             // matrix built for quantisers >= MAX_STORED_Q
    const uint8_t *quant_mat;         // selected base matrix from prores_quant_matrices

    ProresDSPContext dsp;
    ScanTable  scantable;

    int mb_width, mb_height;          // picture size in 16x16 macroblocks
    int mbs_per_slice;                // nominal slice width in macroblocks (power of two)
    int num_chroma_blocks, chroma_factor;
    int slices_width;                 // slices per macroblock row
    int num_slices;                   // mb_height * slices_width
    int num_planes;
    int bits_per_mb;                  // bit budget per macroblock used by the quantiser search
    int force_quant;                  // non-zero: fixed quantiser, search is skipped

    char *vendor;                     // 4-character vendor ID written to the frame header
    int quant_sel;                    // QUANT_MAT_* selector, -1 selects the profile's matrix

    int frame_size;                   // size estimate used to allocate the output packet

    int profile;
    const struct prores_profile *profile_info;

    int *slice_q;                     // chosen quantiser per slice, row-major

    ProresThreadData *tdata;          // per-thread scratch data
} ProresContext;
 211
 212 static void get_slice_data(ProresContext *ctx, const uint16_t *src,
 213                            int linesize, int x, int y, int w, int h,
 214                            DCTELEM *blocks, uint16_t *emu_buf,
 215                            int mbs_per_slice, int blocks_per_mb, int is_chroma)
 216 {
 217     const uint16_t *esrc;
 218     const int mb_width = 4 * blocks_per_mb;
 219     int elinesize;
 220     int i, j, k;
 221
 222     for (i = 0; i < mbs_per_slice; i++, src += mb_width) {
 223         if (x >= w) {
 224             memset(blocks, 0, 64 * (mbs_per_slice - i) * blocks_per_mb
 225                               * sizeof(*blocks));
 226             return;
 227         }
 228         if (x + mb_width <= w && y + 16 <= h) {
 229             esrc      = src;
 230             elinesize = linesize;
 231         } else {
 232             int bw, bh, pix;
 233
 234             esrc      = emu_buf;
 235             elinesize = 16 * sizeof(*emu_buf);
 236
 237             bw = FFMIN(w - x, mb_width);
 238             bh = FFMIN(h - y, 16);
 239
 240             for (j = 0; j < bh; j++) {
 241                 memcpy(emu_buf + j * 16,
 242                        (const uint8_t*)src + j * linesize,
 243                        bw * sizeof(*src));
 244                 pix = emu_buf[j * 16 + bw - 1];
 245                 for (k = bw; k < mb_width; k++)
 246                     emu_buf[j * 16 + k] = pix;
 247             }
 248             for (; j < 16; j++)
 249                 memcpy(emu_buf + j * 16,
 250                        emu_buf + (bh - 1) * 16,
 251                        mb_width * sizeof(*emu_buf));
 252         }
 253         if (!is_chroma) {
 254             ctx->dsp.fdct(esrc, elinesize, blocks);
 255             blocks += 64;
 256             if (blocks_per_mb > 2) {
 257                 ctx->dsp.fdct(src + 8, linesize, blocks);
 258                 blocks += 64;
 259             }
 260             ctx->dsp.fdct(src + linesize * 4, linesize, blocks);
 261             blocks += 64;
 262             if (blocks_per_mb > 2) {
 263                 ctx->dsp.fdct(src + linesize * 4 + 8, linesize, blocks);
 264                 blocks += 64;
 265             }
 266         } else {
 267             ctx->dsp.fdct(esrc, elinesize, blocks);
 268             blocks += 64;
 269             ctx->dsp.fdct(src + linesize * 4, linesize, blocks);
 270             blocks += 64;
 271             if (blocks_per_mb > 2) {
 272                 ctx->dsp.fdct(src + 8, linesize, blocks);
 273                 blocks += 64;
 274                 ctx->dsp.fdct(src + linesize * 4 + 8, linesize, blocks);
 275                 blocks += 64;
 276             }
 277         }
 278
 279         x += mb_width;
 280     }
 281 }
 282
 283 /**
 284  * Write an unsigned rice/exp golomb codeword.
 285  */
 286 static inline void encode_vlc_codeword(PutBitContext *pb, unsigned codebook, int val)
 287 {
 288     unsigned int rice_order, exp_order, switch_bits, switch_val;
 289     int exponent;
 290
 291     /* number of prefix bits to switch between Rice and expGolomb */
 292     switch_bits = (codebook & 3) + 1;
 293     rice_order  =  codebook >> 5;       /* rice code order */
 294     exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
 295
 296     switch_val  = switch_bits << rice_order;
 297
 298     if (val >= switch_val) {
 299         val -= switch_val - (1 << exp_order);
 300         exponent = av_log2(val);
 301
 302         put_bits(pb, exponent - exp_order + switch_bits, 0);
 303         put_bits(pb, exponent + 1, val);
 304     } else {
 305         exponent = val >> rice_order;
 306
 307         if (exponent)
 308             put_bits(pb, exponent, 0);
 309         put_bits(pb, 1, 1);
 310         if (rice_order)
 311             put_sbits(pb, rice_order, val);
 312     }
 313 }
 314
/* Sign mask of a 32-bit int: -1 for negative values, 0 otherwise.
 * NOTE(review): relies on arithmetic right shift of negative values, which
 * is implementation-defined in C. */
#define GET_SIGN(x)  ((x) >> 31)
/* Fold a signed value into a non-negative code:
 * x >= 0 maps to 2 * x, x < 0 maps to -2 * x - 1. */
#define MAKE_CODE(x) (((x) << 1) ^ GET_SIGN(x))
 317
 318 static void encode_dcs(PutBitContext *pb, DCTELEM *blocks,
 319                        int blocks_per_slice, int scale)
 320 {
 321     int i;
 322     int codebook = 3, code, dc, prev_dc, delta, sign, new_sign;
 323
 324     prev_dc = (blocks[0] - 0x4000) / scale;
 325     encode_vlc_codeword(pb, FIRST_DC_CB, MAKE_CODE(prev_dc));
 326     sign     = 0;
 327     codebook = 3;
 328     blocks  += 64;
 329
 330     for (i = 1; i < blocks_per_slice; i++, blocks += 64) {
 331         dc       = (blocks[0] - 0x4000) / scale;
 332         delta    = dc - prev_dc;
 333         new_sign = GET_SIGN(delta);
 334         delta    = (delta ^ sign) - sign;
 335         code     = MAKE_CODE(delta);
 336         encode_vlc_codeword(pb, ff_prores_dc_codebook[codebook], code);
 337         codebook = (code + (code & 1)) >> 1;
 338         codebook = FFMIN(codebook, 3);
 339         sign     = new_sign;
 340         prev_dc  = dc;
 341     }
 342 }
 343
 344 static void encode_acs(PutBitContext *pb, DCTELEM *blocks,
 345                        int blocks_per_slice,
 346                        int plane_size_factor,
 347                        const uint8_t *scan, const int16_t *qmat)
 348 {
 349     int idx, i;
 350     int run, level, run_cb, lev_cb;
 351     int max_coeffs, abs_level;
 352
 353     max_coeffs = blocks_per_slice << 6;
 354     run_cb     = ff_prores_run_to_cb_index[4];
 355     lev_cb     = ff_prores_lev_to_cb_index[2];
 356     run        = 0;
 357
 358     for (i = 1; i < 64; i++) {
 359         for (idx = scan[i]; idx < max_coeffs; idx += 64) {
 360             level = blocks[idx] / qmat[scan[i]];
 361             if (level) {
 362                 abs_level = FFABS(level);
 363                 encode_vlc_codeword(pb, ff_prores_ac_codebook[run_cb], run);
 364                 encode_vlc_codeword(pb, ff_prores_ac_codebook[lev_cb],
 365                                     abs_level - 1);
 366                 put_sbits(pb, 1, GET_SIGN(level));
 367
 368                 run_cb = ff_prores_run_to_cb_index[FFMIN(run, 15)];
 369                 lev_cb = ff_prores_lev_to_cb_index[FFMIN(abs_level, 9)];
 370                 run    = 0;
 371             } else {
 372                 run++;
 373             }
 374         }
 375     }
 376 }
 377
 378 static int encode_slice_plane(ProresContext *ctx, PutBitContext *pb,
 379                               const uint16_t *src, int linesize,
 380                               int mbs_per_slice, DCTELEM *blocks,
 381                               int blocks_per_mb, int plane_size_factor,
 382                               const int16_t *qmat)
 383 {
 384     int blocks_per_slice, saved_pos;
 385
 386     saved_pos = put_bits_count(pb);
 387     blocks_per_slice = mbs_per_slice * blocks_per_mb;
 388
 389     encode_dcs(pb, blocks, blocks_per_slice, qmat[0]);
 390     encode_acs(pb, blocks, blocks_per_slice, plane_size_factor,
 391                ctx->scantable.permutated, qmat);
 392     flush_put_bits(pb);
 393
 394     return (put_bits_count(pb) - saved_pos) >> 3;
 395 }
 396
 397 static int encode_slice(AVCodecContext *avctx, const AVFrame *pic,
 398                         PutBitContext *pb,
 399                         int sizes[4], int x, int y, int quant,
 400                         int mbs_per_slice)
 401 {
 402     ProresContext *ctx = avctx->priv_data;
 403     int i, xp, yp;
 404     int total_size = 0;
 405     const uint16_t *src;
 406     int slice_width_factor = av_log2(mbs_per_slice);
 407     int num_cblocks, pwidth;
 408     int plane_factor, is_chroma;
 409     uint16_t *qmat;
 410
 411     if (ctx->force_quant) {
 412         qmat = ctx->quants[0];
 413     } else if (quant < MAX_STORED_Q) {
 414         qmat = ctx->quants[quant];
 415     } else {
 416         qmat = ctx->custom_q;
 417         for (i = 0; i < 64; i++)
 418             qmat[i] = ctx->quant_mat[i] * quant;
 419     }
 420
 421     for (i = 0; i < ctx->num_planes; i++) {
 422         is_chroma    = (i == 1 || i == 2);
 423         plane_factor = slice_width_factor + 2;
 424         if (is_chroma)
 425             plane_factor += ctx->chroma_factor - 3;
 426         if (!is_chroma || ctx->chroma_factor == CFACTOR_Y444) {
 427             xp          = x << 4;
 428             yp          = y << 4;
 429             num_cblocks = 4;
 430             pwidth      = avctx->width;
 431         } else {
 432             xp          = x << 3;
 433             yp          = y << 4;
 434             num_cblocks = 2;
 435             pwidth      = avctx->width >> 1;
 436         }
 437         src = (const uint16_t*)(pic->data[i] + yp * pic->linesize[i]) + xp;
 438
 439         get_slice_data(ctx, src, pic->linesize[i], xp, yp,
 440                        pwidth, avctx->height, ctx->blocks[0], ctx->emu_buf,
 441                        mbs_per_slice, num_cblocks, is_chroma);
 442         sizes[i] = encode_slice_plane(ctx, pb, src, pic->linesize[i],
 443                                       mbs_per_slice, ctx->blocks[0],
 444                                       num_cblocks, plane_factor,
 445                                       qmat);
 446         total_size += sizes[i];
 447     }
 448     return total_size;
 449 }
 450
 451 static inline int estimate_vlc(unsigned codebook, int val)
 452 {
 453     unsigned int rice_order, exp_order, switch_bits, switch_val;
 454     int exponent;
 455
 456     /* number of prefix bits to switch between Rice and expGolomb */
 457     switch_bits = (codebook & 3) + 1;
 458     rice_order  =  codebook >> 5;       /* rice code order */
 459     exp_order   = (codebook >> 2) & 7;  /* exp golomb code order */
 460
 461     switch_val  = switch_bits << rice_order;
 462
 463     if (val >= switch_val) {
 464         val -= switch_val - (1 << exp_order);
 465         exponent = av_log2(val);
 466
 467         return exponent * 2 - exp_order + switch_bits + 1;
 468     } else {
 469         return (val >> rice_order) + rice_order + 1;
 470     }
 471 }
 472
 473 static int estimate_dcs(int *error, DCTELEM *blocks, int blocks_per_slice,
 474                         int scale)
 475 {
 476     int i;
 477     int codebook = 3, code, dc, prev_dc, delta, sign, new_sign;
 478     int bits;
 479
 480     prev_dc  = (blocks[0] - 0x4000) / scale;
 481     bits     = estimate_vlc(FIRST_DC_CB, MAKE_CODE(prev_dc));
 482     sign     = 0;
 483     codebook = 3;
 484     blocks  += 64;
 485     *error  += FFABS(blocks[0] - 0x4000) % scale;
 486
 487     for (i = 1; i < blocks_per_slice; i++, blocks += 64) {
 488         dc       = (blocks[0] - 0x4000) / scale;
 489         *error  += FFABS(blocks[0] - 0x4000) % scale;
 490         delta    = dc - prev_dc;
 491         new_sign = GET_SIGN(delta);
 492         delta    = (delta ^ sign) - sign;
 493         code     = MAKE_CODE(delta);
 494         bits    += estimate_vlc(ff_prores_dc_codebook[codebook], code);
 495         codebook = (code + (code & 1)) >> 1;
 496         codebook = FFMIN(codebook, 3);
 497         sign     = new_sign;
 498         prev_dc  = dc;
 499     }
 500
 501     return bits;
 502 }
 503
 504 static int estimate_acs(int *error, DCTELEM *blocks, int blocks_per_slice,
 505                         int plane_size_factor,
 506                         const uint8_t *scan, const int16_t *qmat)
 507 {
 508     int idx, i;
 509     int run, level, run_cb, lev_cb;
 510     int max_coeffs, abs_level;
 511     int bits = 0;
 512
 513     max_coeffs = blocks_per_slice << 6;
 514     run_cb     = ff_prores_run_to_cb_index[4];
 515     lev_cb     = ff_prores_lev_to_cb_index[2];
 516     run        = 0;
 517
 518     for (i = 1; i < 64; i++) {
 519         for (idx = scan[i]; idx < max_coeffs; idx += 64) {
 520             level   = blocks[idx] / qmat[scan[i]];
 521             *error += FFABS(blocks[idx]) % qmat[scan[i]];
 522             if (level) {
 523                 abs_level = FFABS(level);
 524                 bits += estimate_vlc(ff_prores_ac_codebook[run_cb], run);
 525                 bits += estimate_vlc(ff_prores_ac_codebook[lev_cb],
 526                                      abs_level - 1) + 1;
 527
 528                 run_cb = ff_prores_run_to_cb_index[FFMIN(run, 15)];
 529                 lev_cb = ff_prores_lev_to_cb_index[FFMIN(abs_level, 9)];
 530                 run    = 0;
 531             } else {
 532                 run++;
 533             }
 534         }
 535     }
 536
 537     return bits;
 538 }
 539
 540 static int estimate_slice_plane(ProresContext *ctx, int *error, int plane,
 541                                 const uint16_t *src, int linesize,
 542                                 int mbs_per_slice,
 543                                 int blocks_per_mb, int plane_size_factor,
 544                                 const int16_t *qmat, ProresThreadData *td)
 545 {
 546     int blocks_per_slice;
 547     int bits;
 548
 549     blocks_per_slice = mbs_per_slice * blocks_per_mb;
 550
 551     bits  = estimate_dcs(error, td->blocks[plane], blocks_per_slice, qmat[0]);
 552     bits += estimate_acs(error, td->blocks[plane], blocks_per_slice,
 553                          plane_size_factor, ctx->scantable.permutated, qmat);
 554
 555     return FFALIGN(bits, 8);
 556 }
 557
 558 static int find_slice_quant(AVCodecContext *avctx, const AVFrame *pic,
 559                             int trellis_node, int x, int y, int mbs_per_slice,
 560                             ProresThreadData *td)
 561 {
 562     ProresContext *ctx = avctx->priv_data;
 563     int i, q, pq, xp, yp;
 564     const uint16_t *src;
 565     int slice_width_factor = av_log2(mbs_per_slice);
 566     int num_cblocks[MAX_PLANES], pwidth;
 567     int plane_factor[MAX_PLANES], is_chroma[MAX_PLANES];
 568     const int min_quant = ctx->profile_info->min_quant;
 569     const int max_quant = ctx->profile_info->max_quant;
 570     int error, bits, bits_limit;
 571     int mbs, prev, cur, new_score;
 572     int slice_bits[TRELLIS_WIDTH], slice_score[TRELLIS_WIDTH];
 573     int overquant;
 574     uint16_t *qmat;
 575
 576     mbs = x + mbs_per_slice;
 577
 578     for (i = 0; i < ctx->num_planes; i++) {
 579         is_chroma[i]    = (i == 1 || i == 2);
 580         plane_factor[i] = slice_width_factor + 2;
 581         if (is_chroma[i])
 582             plane_factor[i] += ctx->chroma_factor - 3;
 583         if (!is_chroma[i] || ctx->chroma_factor == CFACTOR_Y444) {
 584             xp             = x << 4;
 585             yp             = y << 4;
 586             num_cblocks[i] = 4;
 587             pwidth         = avctx->width;
 588         } else {
 589             xp             = x << 3;
 590             yp             = y << 4;
 591             num_cblocks[i] = 2;
 592             pwidth         = avctx->width >> 1;
 593         }
 594         src = (const uint16_t*)(pic->data[i] + yp * pic->linesize[i]) + xp;
 595
 596         get_slice_data(ctx, src, pic->linesize[i], xp, yp,
 597                        pwidth, avctx->height, td->blocks[i], td->emu_buf,
 598                        mbs_per_slice, num_cblocks[i], is_chroma[i]);
 599     }
 600
 601     for (q = min_quant; q < max_quant + 2; q++) {
 602         td->nodes[trellis_node + q].prev_node = -1;
 603         td->nodes[trellis_node + q].quant     = q;
 604     }
 605
 606     // todo: maybe perform coarser quantising to fit into frame size when needed
 607     for (q = min_quant; q <= max_quant; q++) {
 608         bits  = 0;
 609         error = 0;
 610         for (i = 0; i < ctx->num_planes; i++) {
 611             bits += estimate_slice_plane(ctx, &error, i,
 612                                          src, pic->linesize[i],
 613                                          mbs_per_slice,
 614                                          num_cblocks[i], plane_factor[i],
 615                                          ctx->quants[q], td);
 616         }
 617         if (bits > 65000 * 8) {
 618             error = SCORE_LIMIT;
 619             break;
 620         }
 621         slice_bits[q]  = bits;
 622         slice_score[q] = error;
 623     }
 624     if (slice_bits[max_quant] <= ctx->bits_per_mb * mbs_per_slice) {
 625         slice_bits[max_quant + 1]  = slice_bits[max_quant];
 626         slice_score[max_quant + 1] = slice_score[max_quant] + 1;
 627         overquant = max_quant;
 628     } else {
 629         for (q = max_quant + 1; q < 128; q++) {
 630             bits  = 0;
 631             error = 0;
 632             if (q < MAX_STORED_Q) {
 633                 qmat = ctx->quants[q];
 634             } else {
 635                 qmat = td->custom_q;
 636                 for (i = 0; i < 64; i++)
 637                     qmat[i] = ctx->quant_mat[i] * q;
 638             }
 639             for (i = 0; i < ctx->num_planes; i++) {
 640                 bits += estimate_slice_plane(ctx, &error, i,
 641                                              src, pic->linesize[i],
 642                                              mbs_per_slice,
 643                                              num_cblocks[i], plane_factor[i],
 644                                              qmat, td);
 645             }
 646             if (bits <= ctx->bits_per_mb * mbs_per_slice)
 647                 break;
 648         }
 649
 650         slice_bits[max_quant + 1]  = bits;
 651         slice_score[max_quant + 1] = error;
 652         overquant = q;
 653     }
 654     td->nodes[trellis_node + max_quant + 1].quant = overquant;
 655
 656     bits_limit = mbs * ctx->bits_per_mb;
 657     for (pq = min_quant; pq < max_quant + 2; pq++) {
 658         prev = trellis_node - TRELLIS_WIDTH + pq;
 659
 660         for (q = min_quant; q < max_quant + 2; q++) {
 661             cur = trellis_node + q;
 662
 663             bits  = td->nodes[prev].bits + slice_bits[q];
 664             error = slice_score[q];
 665             if (bits > bits_limit)
 666                 error = SCORE_LIMIT;
 667
 668             if (td->nodes[prev].score < SCORE_LIMIT && error < SCORE_LIMIT)
 669                 new_score = td->nodes[prev].score + error;
 670             else
 671                 new_score = SCORE_LIMIT;
 672             if (td->nodes[cur].prev_node == -1 ||
 673                 td->nodes[cur].score >= new_score) {
 674
 675                 td->nodes[cur].bits      = bits;
 676                 td->nodes[cur].score     = new_score;
 677                 td->nodes[cur].prev_node = prev;
 678             }
 679         }
 680     }
 681
 682     error = td->nodes[trellis_node + min_quant].score;
 683     pq    = trellis_node + min_quant;
 684     for (q = min_quant + 1; q < max_quant + 2; q++) {
 685         if (td->nodes[trellis_node + q].score <= error) {
 686             error = td->nodes[trellis_node + q].score;
 687             pq    = trellis_node + q;
 688         }
 689     }
 690
 691     return pq;
 692 }
 693
 694 static int find_quant_thread(AVCodecContext *avctx, void *arg,
 695                              int jobnr, int threadnr)
 696 {
 697     ProresContext *ctx = avctx->priv_data;
 698     ProresThreadData *td = ctx->tdata + threadnr;
 699     int mbs_per_slice = ctx->mbs_per_slice;
 700     int x, y = jobnr, mb, q = 0;
 701
 702     for (x = mb = 0; x < ctx->mb_width; x += mbs_per_slice, mb++) {
 703         while (ctx->mb_width - x < mbs_per_slice)
 704             mbs_per_slice >>= 1;
 705         q = find_slice_quant(avctx, avctx->coded_frame,
 706                              (mb + 1) * TRELLIS_WIDTH, x, y,
 707                              mbs_per_slice, td);
 708     }
 709
 710     for (x = ctx->slices_width - 1; x >= 0; x--) {
 711         ctx->slice_q[x + y * ctx->slices_width] = td->nodes[q].quant;
 712         q = td->nodes[q].prev_node;
 713     }
 714
 715     return 0;
 716 }
 717
 718 static int encode_frame(AVCodecContext *avctx, AVPacket *pkt,
 719                         const AVFrame *pic, int *got_packet)
 720 {
 721     ProresContext *ctx = avctx->priv_data;
 722     uint8_t *orig_buf, *buf, *slice_hdr, *slice_sizes, *tmp;
 723     uint8_t *picture_size_pos;
 724     PutBitContext pb;
 725     int x, y, i, mb, q = 0;
 726     int sizes[4] = { 0 };
 727     int slice_hdr_size = 2 + 2 * (ctx->num_planes - 1);
 728     int frame_size, picture_size, slice_size;
 729     int mbs_per_slice = ctx->mbs_per_slice;
 730     int pkt_size, ret;
 731
 732     *avctx->coded_frame           = *pic;
 733     avctx->coded_frame->pict_type = AV_PICTURE_TYPE_I;
 734     avctx->coded_frame->key_frame = 1;
 735
 736     pkt_size = ctx->frame_size + FF_MIN_BUFFER_SIZE;
 737
 738     if ((ret = ff_alloc_packet2(avctx, pkt, pkt_size)) < 0)
 739         return ret;
 740
 741     orig_buf = pkt->data;
 742
 743     // frame atom
 744     orig_buf += 4;                              // frame size
 745     bytestream_put_be32  (&orig_buf, FRAME_ID); // frame container ID
 746     buf = orig_buf;
 747
 748     // frame header
 749     tmp = buf;
 750     buf += 2;                                   // frame header size will be stored here
 751     bytestream_put_be16  (&buf, 0);             // version 1
 752     bytestream_put_buffer(&buf, ctx->vendor, 4);
 753     bytestream_put_be16  (&buf, avctx->width);
 754     bytestream_put_be16  (&buf, avctx->height);
 755     bytestream_put_byte  (&buf, ctx->chroma_factor << 6); // frame flags
 756     bytestream_put_byte  (&buf, 0);             // reserved
 757     bytestream_put_byte  (&buf, avctx->color_primaries);
 758     bytestream_put_byte  (&buf, avctx->color_trc);
 759     bytestream_put_byte  (&buf, avctx->colorspace);
 760     bytestream_put_byte  (&buf, 0x40);          // source format and alpha information
 761     bytestream_put_byte  (&buf, 0);             // reserved
 762     if (ctx->quant_sel != QUANT_MAT_DEFAULT) {
 763         bytestream_put_byte  (&buf, 0x03);      // matrix flags - both matrices are present
 764         // luma quantisation matrix
 765         for (i = 0; i < 64; i++)
 766             bytestream_put_byte(&buf, ctx->quant_mat[i]);
 767         // chroma quantisation matrix
 768         for (i = 0; i < 64; i++)
 769             bytestream_put_byte(&buf, ctx->quant_mat[i]);
 770     } else {
 771         bytestream_put_byte  (&buf, 0x00);      // matrix flags - default matrices are used
 772     }
 773     bytestream_put_be16  (&tmp, buf - orig_buf); // write back frame header size
 774
 775     // picture header
 776     picture_size_pos = buf + 1;
 777     bytestream_put_byte  (&buf, 0x40);          // picture header size (in bits)
 778     buf += 4;                                   // picture data size will be stored here
 779     bytestream_put_be16  (&buf, ctx->num_slices); // total number of slices
 780     bytestream_put_byte  (&buf, av_log2(ctx->mbs_per_slice) << 4); // slice width and height in MBs
 781
 782     // seek table - will be filled during slice encoding
 783     slice_sizes = buf;
 784     buf += ctx->num_slices * 2;
 785
 786     // slices
 787     if (!ctx->force_quant) {
 788         ret = avctx->execute2(avctx, find_quant_thread, NULL, NULL,
 789                               ctx->mb_height);
 790         if (ret)
 791             return ret;
 792     }
 793
 794     for (y = 0; y < ctx->mb_height; y++) {
 795         mbs_per_slice = ctx->mbs_per_slice;
 796         for (x = mb = 0; x < ctx->mb_width; x += mbs_per_slice, mb++) {
 797             q = ctx->force_quant ? ctx->force_quant
 798                                  : ctx->slice_q[mb + y * ctx->slices_width];
 799
 800             while (ctx->mb_width - x < mbs_per_slice)
 801                 mbs_per_slice >>= 1;
 802
 803             bytestream_put_byte(&buf, slice_hdr_size << 3);
 804             slice_hdr = buf;
 805             buf += slice_hdr_size - 1;
 806             init_put_bits(&pb, buf, (pkt_size - (buf - orig_buf)) * 8);
 807             encode_slice(avctx, pic, &pb, sizes, x, y, q, mbs_per_slice);
 808
 809             bytestream_put_byte(&slice_hdr, q);
 810             slice_size = slice_hdr_size + sizes[ctx->num_planes - 1];
 811             for (i = 0; i < ctx->num_planes - 1; i++) {
 812                 bytestream_put_be16(&slice_hdr, sizes[i]);
 813                 slice_size += sizes[i];
 814             }
 815             bytestream_put_be16(&slice_sizes, slice_size);
 816             buf += slice_size - slice_hdr_size;
 817         }
 818     }
 819
 820     orig_buf -= 8;
 821     frame_size = buf - orig_buf;
 822     picture_size = buf - picture_size_pos - 6;
 823     bytestream_put_be32(&orig_buf, frame_size);
 824     bytestream_put_be32(&picture_size_pos, picture_size);
 825
 826     pkt->size   = frame_size;
 827     pkt->flags |= AV_PKT_FLAG_KEY;
 828     *got_packet = 1;
 829
 830     return 0;
 831 }
 832
 833 static av_cold int encode_close(AVCodecContext *avctx)
 834 {
 835     ProresContext *ctx = avctx->priv_data;
 836     int i;
 837
 838     if (avctx->coded_frame->data[0])
 839         avctx->release_buffer(avctx, avctx->coded_frame);
 840
 841     av_freep(&avctx->coded_frame);
 842
 843     if (ctx->tdata) {
 844         for (i = 0; i < avctx->thread_count; i++)
 845             av_free(ctx->tdata[i].nodes);
 846     }
 847     av_freep(&ctx->tdata);
 848     av_freep(&ctx->slice_q);
 849
 850     return 0;
 851 }
 852
 853 static av_cold int encode_init(AVCodecContext *avctx)
 854 {
 855     ProresContext *ctx = avctx->priv_data;
 856     int mps;
 857     int i, j;
 858     int min_quant, max_quant;
 859
 860     avctx->bits_per_raw_sample = 10;
 861     avctx->coded_frame = avcodec_alloc_frame();
 862     if (!avctx->coded_frame)
 863         return AVERROR(ENOMEM);
 864
 865     ff_proresdsp_init(&ctx->dsp, avctx);
 866     ff_init_scantable(ctx->dsp.dct_permutation, &ctx->scantable,
 867                       ff_prores_progressive_scan);
 868
 869     mps = ctx->mbs_per_slice;
 870     if (mps & (mps - 1)) {
 871         av_log(avctx, AV_LOG_ERROR,
 872                "there should be an integer power of two MBs per slice\n");
 873         return AVERROR(EINVAL);
 874     }
 875
 876     ctx->chroma_factor = avctx->pix_fmt == PIX_FMT_YUV422P10
 877                          ? CFACTOR_Y422
 878                          : CFACTOR_Y444;
 879     ctx->profile_info  = prores_profile_info + ctx->profile;
 880     ctx->num_planes    = 3;
 881
 882     ctx->mb_width      = FFALIGN(avctx->width,  16) >> 4;
 883     ctx->mb_height     = FFALIGN(avctx->height, 16) >> 4;
 884     ctx->slices_width  = ctx->mb_width / mps;
 885     ctx->slices_width += av_popcount(ctx->mb_width - ctx->slices_width * mps);
 886     ctx->num_slices    = ctx->mb_height * ctx->slices_width;
 887
 888     if (ctx->quant_sel == -1)
 889         ctx->quant_mat = prores_quant_matrices[ctx->profile_info->quant];
 890     else
 891         ctx->quant_mat = prores_quant_matrices[ctx->quant_sel];
 892
 893     if (strlen(ctx->vendor) != 4) {
 894         av_log(avctx, AV_LOG_ERROR, "vendor ID should be 4 bytes\n");
 895         return AVERROR_INVALIDDATA;
 896     }
 897
 898     ctx->force_quant = avctx->global_quality / FF_QP2LAMBDA;
 899     if (!ctx->force_quant) {
 900         if (!ctx->bits_per_mb) {
 901             for (i = 0; i < NUM_MB_LIMITS - 1; i++)
 902                 if (prores_mb_limits[i] >= ctx->mb_width * ctx->mb_height)
 903                     break;
 904             ctx->bits_per_mb   = ctx->profile_info->br_tab[i];
 905         } else if (ctx->bits_per_mb < 128) {
 906             av_log(avctx, AV_LOG_ERROR, "too few bits per MB, please set at least 128\n");
 907             return AVERROR_INVALIDDATA;
 908         }
 909
 910         min_quant = ctx->profile_info->min_quant;
 911         max_quant = ctx->profile_info->max_quant;
 912         for (i = min_quant; i < MAX_STORED_Q; i++) {
 913             for (j = 0; j < 64; j++)
 914                 ctx->quants[i][j] = ctx->quant_mat[j] * i;
 915         }
 916
 917         ctx->slice_q = av_malloc(ctx->num_slices * sizeof(*ctx->slice_q));
 918         if (!ctx->slice_q) {
 919             encode_close(avctx);
 920             return AVERROR(ENOMEM);
 921         }
 922
 923         ctx->tdata = av_mallocz(avctx->thread_count * sizeof(*ctx->tdata));
 924         if (!ctx->tdata) {
 925             encode_close(avctx);
 926             return AVERROR(ENOMEM);
 927         }
 928
 929         for (j = 0; j < avctx->thread_count; j++) {
 930             ctx->tdata[j].nodes = av_malloc((ctx->slices_width + 1)
 931                                             * TRELLIS_WIDTH
 932                                             * sizeof(*ctx->tdata->nodes));
 933             if (!ctx->tdata[j].nodes) {
 934                 encode_close(avctx);
 935                 return AVERROR(ENOMEM);
 936             }
 937             for (i = min_quant; i < max_quant + 2; i++) {
 938                 ctx->tdata[j].nodes[i].prev_node = -1;
 939                 ctx->tdata[j].nodes[i].bits      = 0;
 940                 ctx->tdata[j].nodes[i].score     = 0;
 941             }
 942         }
 943     } else {
 944         int ls = 0;
 945
 946         if (ctx->force_quant > 64) {
 947             av_log(avctx, AV_LOG_ERROR, "too large quantiser, maximum is 64\n");
 948             return AVERROR_INVALIDDATA;
 949         }
 950
 951         for (j = 0; j < 64; j++) {
 952             ctx->quants[0][j] = ctx->quant_mat[j] * ctx->force_quant;
 953             ls += av_log2((1 << 11)  / ctx->quants[0][j]) * 2 + 1;
 954         }
 955
 956         ctx->bits_per_mb = ls * 8;
 957         if (ctx->chroma_factor == CFACTOR_Y444)
 958             ctx->bits_per_mb += ls * 4;
 959         if (ctx->num_planes == 4)
 960             ctx->bits_per_mb += ls * 4;
 961     }
 962
 963     ctx->frame_size = ctx->num_slices * (2 + 2 * ctx->num_planes
 964                                          + (2 * mps * ctx->bits_per_mb) / 8)
 965                       + 200;
 966
 967     avctx->codec_tag   = ctx->profile_info->tag;
 968
 969     av_log(avctx, AV_LOG_DEBUG, "profile %d, %d slices, %d bits per MB\n",
 970            ctx->profile, ctx->num_slices, ctx->bits_per_mb);
 971     av_log(avctx, AV_LOG_DEBUG, "estimated frame size %d\n",
 972            ctx->frame_size);
 973
 974     return 0;
 975 }
 976
 977 #define OFFSET(x) offsetof(ProresContext, x)
 978 #define VE     AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 979
 980 static const AVOption options[] = {
 981     { "mbs_per_slice", "macroblocks per slice", OFFSET(mbs_per_slice),
 982         AV_OPT_TYPE_INT, { 8 }, 1, MAX_MBS_PER_SLICE, VE },
 983     { "profile",       NULL, OFFSET(profile), AV_OPT_TYPE_INT,
 984         { PRORES_PROFILE_STANDARD },
 985         PRORES_PROFILE_PROXY, PRORES_PROFILE_HQ, VE, "profile" },
 986     { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { PRORES_PROFILE_PROXY },
 987         0, 0, VE, "profile" },
 988     { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { PRORES_PROFILE_LT },
 989         0, 0, VE, "profile" },
 990     { "standard",      NULL, 0, AV_OPT_TYPE_CONST, { PRORES_PROFILE_STANDARD },
 991         0, 0, VE, "profile" },
 992     { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { PRORES_PROFILE_HQ },
 993         0, 0, VE, "profile" },
 994     { "vendor", "vendor ID", OFFSET(vendor),
 995         AV_OPT_TYPE_STRING, { .str = "Lavc" }, CHAR_MIN, CHAR_MAX, VE },
 996     { "bits_per_mb", "desired bits per macroblock", OFFSET(bits_per_mb),
 997         AV_OPT_TYPE_INT, { 0 }, 0, 8192, VE },
 998     { "quant_mat", "quantiser matrix", OFFSET(quant_sel), AV_OPT_TYPE_INT,
 999         { -1 }, -1, QUANT_MAT_DEFAULT, VE, "quant_mat" },
1000     { "auto",          NULL, 0, AV_OPT_TYPE_CONST, { -1 },
1001         0, 0, VE, "quant_mat" },
1002     { "proxy",         NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_PROXY },
1003         0, 0, VE, "quant_mat" },
1004     { "lt",            NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_LT },
1005         0, 0, VE, "quant_mat" },
1006     { "standard",      NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_STANDARD },
1007         0, 0, VE, "quant_mat" },
1008     { "hq",            NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_HQ },
1009         0, 0, VE, "quant_mat" },
1010     { "default",       NULL, 0, AV_OPT_TYPE_CONST, { QUANT_MAT_DEFAULT },
1011         0, 0, VE, "quant_mat" },
1012     { NULL }
1013 };
1014
1015 static const AVClass proresenc_class = {
1016     .class_name = "ProRes encoder",
1017     .item_name  = av_default_item_name,
1018     .option     = options,
1019     .version    = LIBAVUTIL_VERSION_INT,
1020 };
1021
1022 AVCodec ff_prores_kostya_encoder = {
1023     .name           = "prores_kostya",
1024     .type           = AVMEDIA_TYPE_VIDEO,
1025     .id             = CODEC_ID_PRORES,
1026     .priv_data_size = sizeof(ProresContext),
1027     .init           = encode_init,
1028     .close          = encode_close,
1029     .encode2        = encode_frame,
1030     .capabilities   = CODEC_CAP_SLICE_THREADS,
1031     .long_name      = NULL_IF_CONFIG_SMALL("Apple ProRes (iCodec Pro)"),
1032     .pix_fmts       = (const enum PixelFormat[]) {
1033                           PIX_FMT_YUV422P10, PIX_FMT_YUV444P10, PIX_FMT_NONE
1034                       },
1035     .priv_class     = &proresenc_class,
1036 };