/*
 * VP9 compatible video decoder
 *
 * Copyright (C) 2013 Ronald S. Bultje <rsbultje gmail com>
 * Copyright (C) 2013 Clément Bœsch <u pkh me>
 *
 * This file is part of FFmpeg.
 *
 * FFmpeg is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation; either
 * version 2.1 of the License, or (at your option) any later version.
 *
 * FFmpeg is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with FFmpeg; if not, write to the Free Software
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
 */
33 #include "libavutil/avassert.h"
35 #define VP9_SYNCCODE 0x498342
// Per-frame state: side data that must stay alive as long as the picture.
// NOTE(review): this extract is elided — the ThreadFrame member and the
// closing braces of these typedefs are not visible here; confirm against
// the full file.
72 typedef struct VP9Frame {
74 AVBufferRef *extradata;        // single buffer backing segmentation_map + mv
75 uint8_t *segmentation_map;     // points into extradata->data (see vp9_alloc_frame)
76 struct VP9mvrefPair *mv;       // points into extradata->data + sz
// Fragment of the loop-filter mask struct (VP9Filter); its typedef header
// is elided from this extract. Index meanings are documented inline.
81 uint8_t /* bit=col */ mask[2 /* 0=y, 1=uv */][2 /* 0=col, 1=row */]
82 [8 /* rows */][4 /* 0=16, 1=8, 2=4, 3=inner4 */];
// Per-block decode state filled while parsing one coding block.
85 typedef struct VP9Block {
86 uint8_t seg_id, intra, comp, ref[2], mode[4], uvmode, skip;
87 enum FilterMode filter;
88 VP56mv mv[4 /* b_idx */][2 /* ref */];     // up to 4 sub-blocks, 2 refs each
90 enum TxfmMode tx, uvtx;
92 enum BlockPartition bp;
// Top-level decoder context. NOTE(review): many members are elided from
// this extract (bitreader, range coders, prob/counts sub-struct headers,
// frames[] array, etc.) — the comments below describe only what is visible.
95 typedef struct VP9Context {
102 VP9Block *b_base, *b;                  // block array base + current block
103 int pass, uses_2pass, last_uses_2pass; // 2-pass (frame-threaded) decode state
104 int row, row7, col, col7;              // current position in 8x8 units; *7 = &7
106 ptrdiff_t y_stride, uv_stride;
// frame-header derived flags
110 uint8_t keyframe, last_keyframe;
112 uint8_t use_last_frame_mvs;
118 uint8_t refreshrefmask;
119 uint8_t highprecisionmvs;
120 enum FilterMode filtermode;
121 uint8_t allowcompinter;                // set when refs have mixed sign bias
124 uint8_t parallelmode;
128 uint8_t varcompref[2];                 // variable compound-ref indices
129 ThreadFrame refs[8], next_refs[8];     // reference frame slots
138 uint8_t mblim_lut[64];                 // loopfilter limit LUT (cached per sharpness)
146 int8_t ydc_qdelta, uvdc_qdelta, uvac_qdelta;
151 uint8_t absolute_vals;
157 uint8_t skip_enabled;
// tiling layout (sub-struct header elided in this extract)
166 unsigned log2_tile_cols, log2_tile_rows;
167 unsigned tile_cols, tile_rows;
168 unsigned tile_row_start, tile_row_end, tile_col_start, tile_col_end;
170 unsigned sb_cols, sb_rows, rows, cols; // sizes in 64x64 superblocks / 8x8 blocks
// probability tables; the two coef[] declarations below belong to different
// nested structs (prob_ctx vs. prob) whose headers are elided here.
173 uint8_t coef[4][2][2][6][6][3];
177 uint8_t coef[4][2][2][6][6][11];
// symbol counts for backward adaptation (sub-struct header elided)
182 unsigned y_mode[4][10];
183 unsigned uv_mode[10][10];
184 unsigned filter[4][3];
185 unsigned mv_mode[7][4];
186 unsigned intra[4][2];
188 unsigned single_ref[5][2][2];
189 unsigned comp_ref[5][2];
190 unsigned tx32p[2][4];
191 unsigned tx16p[2][3];
194 unsigned mv_joint[4];
197 unsigned classes[11];
199 unsigned bits[10][2];
200 unsigned class0_fp[2][4];
202 unsigned class0_hp[2];
205 unsigned partition[4][4][4];
206 unsigned coef[4][2][2][6][6][3];
207 unsigned eob[4][2][2][6][6][2];
209 enum TxfmMode txfmmode;
210 enum CompPredMode comppredmode;
212 // contextual (left/above) cache
213 DECLARE_ALIGNED(16, uint8_t, left_y_nnz_ctx)[16];
214 DECLARE_ALIGNED(16, uint8_t, left_mode_ctx)[16];
215 DECLARE_ALIGNED(16, VP56mv, left_mv_ctx)[16][2];
216 DECLARE_ALIGNED(8, uint8_t, left_uv_nnz_ctx)[2][8];
217 DECLARE_ALIGNED(8, uint8_t, left_partition_ctx)[8];
218 DECLARE_ALIGNED(8, uint8_t, left_skip_ctx)[8];
219 DECLARE_ALIGNED(8, uint8_t, left_txfm_ctx)[8];
220 DECLARE_ALIGNED(8, uint8_t, left_segpred_ctx)[8];
221 DECLARE_ALIGNED(8, uint8_t, left_intra_ctx)[8];
222 DECLARE_ALIGNED(8, uint8_t, left_comp_ctx)[8];
223 DECLARE_ALIGNED(8, uint8_t, left_ref_ctx)[8];
224 DECLARE_ALIGNED(8, uint8_t, left_filter_ctx)[8];
// "above" context arrays live in one allocation carved up by update_size()
225 uint8_t *above_partition_ctx;
226 uint8_t *above_mode_ctx;
227 // FIXME maybe merge some of the below in a flags field?
228 uint8_t *above_y_nnz_ctx;
229 uint8_t *above_uv_nnz_ctx[2];
230 uint8_t *above_skip_ctx; // 1bit
231 uint8_t *above_txfm_ctx; // 2bit
232 uint8_t *above_segpred_ctx; // 1bit
233 uint8_t *above_intra_ctx; // 1bit
234 uint8_t *above_comp_ctx; // 1bit
235 uint8_t *above_ref_ctx; // 2bit
236 uint8_t *above_filter_ctx;
237 VP56mv (*above_mv_ctx)[2];
// whole-frame cache
240 uint8_t *intra_pred_data[3];
241 struct VP9Filter *lflvl;
242 DECLARE_ALIGNED(32, uint8_t, edge_emu_buffer)[71*80];
244 // block reconstruction intermediates
245 int block_alloc_using_2pass;
246 int16_t *block_base, *block, *uvblock_base[2], *uvblock[2];
247 uint8_t *eob_base, *uveob_base[2], *eob, *uveob[2];
248 struct { int x, y; } min_mv, max_mv; // MV clamping range for current block
249 DECLARE_ALIGNED(32, uint8_t, tmp_y)[64*64];
250 DECLARE_ALIGNED(32, uint8_t, tmp_uv)[2][32*32];
// Block width/height lookup per block size. First plane appears to be in
// 4-px units (BS_64x64 -> {16,16}), second in 8-px units — TODO confirm
// the exact unit semantics against the full file; the grouping braces
// between the two planes are elided from this extract.
253 static const uint8_t bwh_tab[2][N_BS_SIZES][2] = {
255 { 16, 16 }, { 16, 8 }, { 8, 16 }, { 8, 8 }, { 8, 4 }, { 4, 8 },
256 { 4, 4 }, { 4, 2 }, { 2, 4 }, { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 },
258 { 8, 8 }, { 8, 4 }, { 4, 8 }, { 4, 4 }, { 4, 2 }, { 2, 4 },
259 { 2, 2 }, { 2, 1 }, { 1, 2 }, { 1, 1 }, { 1, 1 }, { 1, 1 }, { 1, 1 },
// Allocate a frame buffer plus its side data (segmentation map and MV pair
// array) in one AVBufferRef. On extradata allocation failure the frame
// buffer is released again before returning ENOMEM.
// NOTE(review): local declarations (ret, sz), early-return bodies and the
// final return are elided from this extract.
263 static int vp9_alloc_frame(AVCodecContext *ctx, VP9Frame *f)
265 VP9Context *s = ctx->priv_data;
268 if ((ret = ff_thread_get_buffer(ctx, &f->tf, AV_GET_BUFFER_FLAG_REF)) < 0)
// sz = one byte of segmentation map per 8x8 block (64 per superblock)
270 sz = 64 * s->sb_cols * s->sb_rows;
271 if (!(f->extradata = av_buffer_allocz(sz * (1 + sizeof(struct VP9mvrefPair))))) {
272 ff_thread_release_buffer(ctx, &f->tf);
273 return AVERROR(ENOMEM);
// carve the single buffer: [segmentation_map (sz bytes)][mv pairs]
276 f->segmentation_map = f->extradata->data;
277 f->mv = (struct VP9mvrefPair *) (f->extradata->data + sz);
279 // retain segmentation map if it doesn't update
280 if (s->segmentation.enabled && !s->segmentation.update_map &&
281 !s->intraonly && !s->keyframe) {
282 memcpy(f->segmentation_map, s->frames[LAST_FRAME].segmentation_map, sz);
288 static void vp9_unref_frame(AVCodecContext *ctx, VP9Frame *f)
290 ff_thread_release_buffer(ctx, &f->tf);
291 av_buffer_unref(&f->extradata);
// Make dst a new reference to src: ref the ThreadFrame and the extradata
// buffer, then mirror the raw side-data pointers. If the extradata ref
// fails, the already-taken frame ref is released via vp9_unref_frame.
// NOTE(review): the `int res;` declaration, the `return res;` branch and
// the trailing `dst->mv = ...; return 0;` lines are elided here.
294 static int vp9_ref_frame(AVCodecContext *ctx, VP9Frame *dst, VP9Frame *src)
298 if ((res = ff_thread_ref_frame(&dst->tf, &src->tf)) < 0) {
300 } else if (!(dst->extradata = av_buffer_ref(src->extradata))) {
301 vp9_unref_frame(ctx, dst);
302 return AVERROR(ENOMEM);
// pointers alias src's buffer; lifetime is guaranteed by dst->extradata ref
305 dst->segmentation_map = src->segmentation_map;
// (Re)compute frame geometry and (re)allocate all per-frame-width context
// arrays as one contiguous allocation, carved up by the assign() macro.
// Returns 0 on success, AVERROR(ENOMEM) on allocation failure.
// NOTE(review): the early-return for unchanged size, ff_set_dimensions
// call, `uint8_t *p;` declaration, NULL check after av_malloc and the
// #undef/return at the end are elided from this extract.
311 static int update_size(AVCodecContext *ctx, int w, int h)
313 VP9Context *s = ctx->priv_data;
316 av_assert0(w > 0 && h > 0);
// nothing to do if the buffers exist and the size is unchanged
318 if (s->intra_pred_data[0] && w == ctx->width && h == ctx->height)
// geometry: superblocks are 64x64, blocks are 8x8
323 s->sb_cols = (w + 63) >> 6;
324 s->sb_rows = (h + 63) >> 6;
325 s->cols = (w + 7) >> 3;
326 s->rows = (h + 7) >> 3;
// carve `n` bytes-per-superblock-column out of the single allocation below
328 #define assign(var, type, n) var = (type) p; p += s->sb_cols * (n) * sizeof(*var)
329 av_freep(&s->intra_pred_data[0]);
// 240 = sum of the per-sb-col byte counts assigned below (excluding lflvl/mv)
330 p = av_malloc(s->sb_cols * (240 + sizeof(*s->lflvl) + 16 * sizeof(*s->above_mv_ctx)));
332 return AVERROR(ENOMEM);
333 assign(s->intra_pred_data[0], uint8_t *, 64);
334 assign(s->intra_pred_data[1], uint8_t *, 32);
335 assign(s->intra_pred_data[2], uint8_t *, 32);
336 assign(s->above_y_nnz_ctx, uint8_t *, 16);
337 assign(s->above_mode_ctx, uint8_t *, 16);
338 assign(s->above_mv_ctx, VP56mv(*)[2], 16);
339 assign(s->above_partition_ctx, uint8_t *, 8);
340 assign(s->above_skip_ctx, uint8_t *, 8);
341 assign(s->above_txfm_ctx, uint8_t *, 8);
342 assign(s->above_uv_nnz_ctx[0], uint8_t *, 8);
343 assign(s->above_uv_nnz_ctx[1], uint8_t *, 8);
344 assign(s->above_segpred_ctx, uint8_t *, 8);
345 assign(s->above_intra_ctx, uint8_t *, 8);
346 assign(s->above_comp_ctx, uint8_t *, 8);
347 assign(s->above_ref_ctx, uint8_t *, 8);
348 assign(s->above_filter_ctx, uint8_t *, 8);
349 assign(s->lflvl, struct VP9Filter *, 1);
352 // these will be re-allocated a little later
353 av_freep(&s->b_base);
354 av_freep(&s->block_base);
// (Re)allocate the block-reconstruction buffers. Two layouts exist:
// whole-frame sized (2-pass / frame-threaded decode keeps every sb's
// coefficients) vs. a single superblock's worth (1-pass reuses one
// scratch area). NOTE(review): the early return, av_free(s->b_base),
// the if/else wrapping the two allocation paths and the final return
// are elided from this extract.
359 static int update_block_buffers(AVCodecContext *ctx)
361 VP9Context *s = ctx->priv_data;
// already allocated with the right 1-/2-pass layout: keep it
363 if (s->b_base && s->block_base && s->block_alloc_using_2pass == s->uses_2pass)
367 av_free(s->block_base);
// 2-pass path: per-frame buffers, indexed by superblock
369 int sbs = s->sb_cols * s->sb_rows;
371 s->b_base = av_malloc(sizeof(VP9Block) * s->cols * s->rows);
// 64*64 luma + 2*32*32 chroma coeffs (= *3/2 as int16) + eob bytes per sb
372 s->block_base = av_mallocz((64 * 64 + 128) * sbs * 3);
373 if (!s->b_base || !s->block_base)
374 return AVERROR(ENOMEM);
375 s->uvblock_base[0] = s->block_base + sbs * 64 * 64;
376 s->uvblock_base[1] = s->uvblock_base[0] + sbs * 32 * 32;
377 s->eob_base = (uint8_t *) (s->uvblock_base[1] + sbs * 32 * 32);
378 s->uveob_base[0] = s->eob_base + 256 * sbs;
379 s->uveob_base[1] = s->uveob_base[0] + 64 * sbs;
// 1-pass path: one block / one superblock of scratch, same carving
381 s->b_base = av_malloc(sizeof(VP9Block));
382 s->block_base = av_mallocz((64 * 64 + 128) * 3);
383 if (!s->b_base || !s->block_base)
384 return AVERROR(ENOMEM);
385 s->uvblock_base[0] = s->block_base + 64 * 64;
386 s->uvblock_base[1] = s->uvblock_base[0] + 32 * 32;
387 s->eob_base = (uint8_t *) (s->uvblock_base[1] + 32 * 32);
388 s->uveob_base[0] = s->eob_base + 256;
389 s->uveob_base[1] = s->uveob_base[0] + 64;
// remember which layout we allocated so the check above works next time
391 s->block_alloc_using_2pass = s->uses_2pass;
396 // for some reason the sign bit is at the end, not the start, of a bit sequence
397 static av_always_inline int get_sbits_inv(GetBitContext *gb, int n)
399 int v = get_bits(gb, n);
400 return get_bits1(gb) ? -v : v;
403 static av_always_inline int inv_recenter_nonneg(int v, int m)
405 return v > 2 * m ? v : v & 1 ? m - ((v + 1) >> 1) : m + (v >> 1);
408 // differential forward probability updates
// Decode a coded probability delta and apply it to the current probability
// p (range [1,255]), returning the updated probability. The delta index d
// is VLC-coded in four ranges of increasing precision, then mapped through
// inv_map_table[] and recentered around p with inv_recenter_nonneg().
// NOTE(review): the `int d;` declaration and the closing braces of the
// VLC if/else ladder are elided from this extract.
409 static int update_prob(VP56RangeCoder *c, int p)
411 static const int inv_map_table[254] = {
412 7, 20, 33, 46, 59, 72, 85, 98, 111, 124, 137, 150, 163, 176,
413 189, 202, 215, 228, 241, 254, 1, 2, 3, 4, 5, 6, 8, 9,
414 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 22, 23, 24,
415 25, 26, 27, 28, 29, 30, 31, 32, 34, 35, 36, 37, 38, 39,
416 40, 41, 42, 43, 44, 45, 47, 48, 49, 50, 51, 52, 53, 54,
417 55, 56, 57, 58, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
418 70, 71, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
419 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 99, 100,
420 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 112, 113, 114, 115,
421 116, 117, 118, 119, 120, 121, 122, 123, 125, 126, 127, 128, 129, 130,
422 131, 132, 133, 134, 135, 136, 138, 139, 140, 141, 142, 143, 144, 145,
423 146, 147, 148, 149, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160,
424 161, 162, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175,
425 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 190, 191,
426 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 203, 204, 205, 206,
427 207, 208, 209, 210, 211, 212, 213, 214, 216, 217, 218, 219, 220, 221,
428 222, 223, 224, 225, 226, 227, 229, 230, 231, 232, 233, 234, 235, 236,
429 237, 238, 239, 240, 242, 243, 244, 245, 246, 247, 248, 249, 250, 251,
434 /* This code is trying to do a differential probability update. For a
435 * current probability A in the range [1, 255], the difference to a new
436 * probability of any value can be expressed differentially as 1-A,255-A
437 * where some part of this (absolute range) exists both in positive as
438 * well as the negative part, whereas another part only exists in one
439 * half. We're trying to code this shared part differentially, i.e.
440 * times two where the value of the lowest bit specifies the sign, and
441 * the single part is then coded on top of this. This absolute difference
442 * then again has a value of [0,254], but a bigger value in this range
443 * indicates that we're further away from the original value A, so we
444 * can code this as a VLC code, since higher values are increasingly
445 * unlikely. The first 20 values in inv_map_table[] allow 'cheap, rough'
446 * updates vs. the 'fine, exact' updates further down the range, which
447 * adds one extra dimension to this differential update model. */
// VLC ladder: 4-bit, 4-bit+16, 5-bit+32, then 7-bit with a sign refinement
449 if (!vp8_rac_get(c)) {
450 d = vp8_rac_get_uint(c, 4) + 0;
451 } else if (!vp8_rac_get(c)) {
452 d = vp8_rac_get_uint(c, 4) + 16;
453 } else if (!vp8_rac_get(c)) {
454 d = vp8_rac_get_uint(c, 5) + 32;
456 d = vp8_rac_get_uint(c, 7);
458 d = (d << 1) - 65 + vp8_rac_get(c);
// mirror around 128 so the recentering always works on the smaller side
462 return p <= 128 ? 1 + inv_recenter_nonneg(inv_map_table[d], p - 1) :
463 255 - inv_recenter_nonneg(inv_map_table[d], 255 - p);
// Parse the VP9 frame header: the bit-exact uncompressed header first,
// then the range-coded ("compressed") header containing the forward
// probability updates. Returns the total header size in bytes on success,
// or a negative AVERROR. *ref receives the ref index for show-existing
// frames. NOTE(review): this extract is heavily elided — function braces,
// several else-branches, error `return res;` lines and loop closers are
// missing; comments below only describe visible lines.
466 static int decode_frame_header(AVCodecContext *ctx,
467 const uint8_t *data, int size, int *ref)
469 VP9Context *s = ctx->priv_data;
470 int c, i, j, k, l, m, n, w, h, max, size2, res, sharp;
472 const uint8_t *data2;
/* general header */
475 if ((res = init_get_bits8(&s->gb, data, size)) < 0) {
476 av_log(ctx, AV_LOG_ERROR, "Failed to initialize bitstream reader\n");
479 if (get_bits(&s->gb, 2) != 0x2) { // frame marker
480 av_log(ctx, AV_LOG_ERROR, "Invalid frame marker\n");
481 return AVERROR_INVALIDDATA;
483 s->profile = get_bits1(&s->gb);
484 if (get_bits1(&s->gb)) { // reserved bit
485 av_log(ctx, AV_LOG_ERROR, "Reserved bit should be zero\n");
486 return AVERROR_INVALIDDATA;
// show-existing-frame: just report which ref slot to display
488 if (get_bits1(&s->gb)) {
489 *ref = get_bits(&s->gb, 3);
492 s->last_uses_2pass = s->uses_2pass;
493 s->last_keyframe = s->keyframe;
494 s->keyframe = !get_bits1(&s->gb);
495 last_invisible = s->invisible;
496 s->invisible = !get_bits1(&s->gb);
497 s->errorres = get_bits1(&s->gb);
498 s->use_last_frame_mvs = !s->errorres && !last_invisible;
/* keyframe path: sync code, colorspace, size */
500 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
501 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
502 return AVERROR_INVALIDDATA;
504 s->colorspace = get_bits(&s->gb, 3);
505 if (s->colorspace == 7) { // RGB = profile 1
506 av_log(ctx, AV_LOG_ERROR, "RGB not supported in profile 0\n");
507 return AVERROR_INVALIDDATA;
509 s->fullrange = get_bits1(&s->gb);
510 // for profile 1, here follows the subsampling bits
511 s->refreshrefmask = 0xff;
512 w = get_bits(&s->gb, 16) + 1;
513 h = get_bits(&s->gb, 16) + 1;
514 if (get_bits1(&s->gb)) // display size
515 skip_bits(&s->gb, 32);
/* inter-frame path */
517 s->intraonly = s->invisible ? get_bits1(&s->gb) : 0;
518 s->resetctx = s->errorres ? 0 : get_bits(&s->gb, 2);
// intra-only sub-path: sync code + explicit size again
520 if (get_bits_long(&s->gb, 24) != VP9_SYNCCODE) { // synccode
521 av_log(ctx, AV_LOG_ERROR, "Invalid sync code\n");
522 return AVERROR_INVALIDDATA;
524 s->refreshrefmask = get_bits(&s->gb, 8);
525 w = get_bits(&s->gb, 16) + 1;
526 h = get_bits(&s->gb, 16) + 1;
527 if (get_bits1(&s->gb)) // display size
528 skip_bits(&s->gb, 32);
// regular inter frame: 3 active refs with per-ref sign bias
530 s->refreshrefmask = get_bits(&s->gb, 8);
531 s->refidx[0] = get_bits(&s->gb, 3);
532 s->signbias[0] = get_bits1(&s->gb);
533 s->refidx[1] = get_bits(&s->gb, 3);
534 s->signbias[1] = get_bits1(&s->gb);
535 s->refidx[2] = get_bits(&s->gb, 3);
536 s->signbias[2] = get_bits1(&s->gb);
537 if (!s->refs[s->refidx[0]].f->data[0] ||
538 !s->refs[s->refidx[1]].f->data[0] ||
539 !s->refs[s->refidx[2]].f->data[0]) {
540 av_log(ctx, AV_LOG_ERROR, "Not all references are available\n");
541 return AVERROR_INVALIDDATA;
// size either copied from one of the refs or coded explicitly
543 if (get_bits1(&s->gb)) {
544 w = s->refs[s->refidx[0]].f->width;
545 h = s->refs[s->refidx[0]].f->height;
546 } else if (get_bits1(&s->gb)) {
547 w = s->refs[s->refidx[1]].f->width;
548 h = s->refs[s->refidx[1]].f->height;
549 } else if (get_bits1(&s->gb)) {
550 w = s->refs[s->refidx[2]].f->width;
551 h = s->refs[s->refidx[2]].f->height;
553 w = get_bits(&s->gb, 16) + 1;
554 h = get_bits(&s->gb, 16) + 1;
556 // Note that in this code, "CUR_FRAME" is actually before we
557 // have formally allocated a frame, and thus actually represents
559 s->use_last_frame_mvs &= s->frames[CUR_FRAME].tf.f->width == w &&
560 s->frames[CUR_FRAME].tf.f->height == h;
561 if (get_bits1(&s->gb)) // display size
562 skip_bits(&s->gb, 32);
563 s->highprecisionmvs = get_bits1(&s->gb);
564 s->filtermode = get_bits1(&s->gb) ? FILTER_SWITCHABLE :
// compound prediction is only allowed when the refs' sign biases differ
566 s->allowcompinter = s->signbias[0] != s->signbias[1] ||
567 s->signbias[0] != s->signbias[2];
568 if (s->allowcompinter) {
569 if (s->signbias[0] == s->signbias[1]) {
571 s->varcompref[0] = 0;
572 s->varcompref[1] = 1;
573 } else if (s->signbias[0] == s->signbias[2]) {
575 s->varcompref[0] = 0;
576 s->varcompref[1] = 2;
579 s->varcompref[0] = 1;
580 s->varcompref[1] = 2;
585 s->refreshctx = s->errorres ? 0 : get_bits1(&s->gb);
586 s->parallelmode = s->errorres ? 1 : get_bits1(&s->gb);
587 s->framectxid = c = get_bits(&s->gb, 2);
589 /* loopfilter header data */
590 s->filter.level = get_bits(&s->gb, 6);
591 sharp = get_bits(&s->gb, 3);
592 // if sharpness changed, reinit lim/mblim LUTs. if it didn't change, keep
593 // the old cache values since they are still valid
594 if (s->filter.sharpness != sharp)
595 memset(s->filter.lim_lut, 0, sizeof(s->filter.lim_lut));
596 s->filter.sharpness = sharp;
597 if ((s->lf_delta.enabled = get_bits1(&s->gb))) {
598 if (get_bits1(&s->gb)) {
599 for (i = 0; i < 4; i++)
600 if (get_bits1(&s->gb))
601 s->lf_delta.ref[i] = get_sbits_inv(&s->gb, 6);
602 for (i = 0; i < 2; i++)
603 if (get_bits1(&s->gb))
604 s->lf_delta.mode[i] = get_sbits_inv(&s->gb, 6);
607 memset(&s->lf_delta, 0, sizeof(s->lf_delta));
610 /* quantization header data */
611 s->yac_qi = get_bits(&s->gb, 8);
612 s->ydc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
613 s->uvdc_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
614 s->uvac_qdelta = get_bits1(&s->gb) ? get_sbits_inv(&s->gb, 4) : 0;
615 s->lossless = s->yac_qi == 0 && s->ydc_qdelta == 0 &&
616 s->uvdc_qdelta == 0 && s->uvac_qdelta == 0;
618 /* segmentation header info */
619 if ((s->segmentation.enabled = get_bits1(&s->gb))) {
620 if ((s->segmentation.update_map = get_bits1(&s->gb))) {
621 for (i = 0; i < 7; i++)
622 s->prob.seg[i] = get_bits1(&s->gb) ?
623 get_bits(&s->gb, 8) : 255;
624 if ((s->segmentation.temporal = get_bits1(&s->gb))) {
625 for (i = 0; i < 3; i++)
626 s->prob.segpred[i] = get_bits1(&s->gb) ?
627 get_bits(&s->gb, 8) : 255;
// re-using the previous segmap is invalid across a size change
630 if ((!s->segmentation.update_map || s->segmentation.temporal) &&
631 (w != s->frames[CUR_FRAME].tf.f->width ||
632 h != s->frames[CUR_FRAME].tf.f->height)) {
633 av_log(ctx, AV_LOG_ERROR,
634 "Reference segmap (temp=%d,update=%d) enabled on size-change!\n",
635 s->segmentation.temporal, s->segmentation.update_map);
636 return AVERROR_INVALIDDATA;
// per-segment feature data (quant / loopfilter / ref / skip overrides)
639 if (get_bits1(&s->gb)) {
640 s->segmentation.absolute_vals = get_bits1(&s->gb);
641 for (i = 0; i < 8; i++) {
642 if ((s->segmentation.feat[i].q_enabled = get_bits1(&s->gb)))
643 s->segmentation.feat[i].q_val = get_sbits_inv(&s->gb, 8);
644 if ((s->segmentation.feat[i].lf_enabled = get_bits1(&s->gb)))
645 s->segmentation.feat[i].lf_val = get_sbits_inv(&s->gb, 6);
646 if ((s->segmentation.feat[i].ref_enabled = get_bits1(&s->gb)))
647 s->segmentation.feat[i].ref_val = get_bits(&s->gb, 2);
648 s->segmentation.feat[i].skip_enabled = get_bits1(&s->gb);
652 s->segmentation.feat[0].q_enabled = 0;
653 s->segmentation.feat[0].lf_enabled = 0;
654 s->segmentation.feat[0].skip_enabled = 0;
655 s->segmentation.feat[0].ref_enabled = 0;
658 // set qmul[] based on Y/UV, AC/DC and segmentation Q idx deltas
659 for (i = 0; i < (s->segmentation.enabled ? 8 : 1); i++) {
660 int qyac, qydc, quvac, quvdc, lflvl, sh;
662 if (s->segmentation.feat[i].q_enabled) {
663 if (s->segmentation.absolute_vals)
664 qyac = s->segmentation.feat[i].q_val;
666 qyac = s->yac_qi + s->segmentation.feat[i].q_val;
670 qydc = av_clip_uintp2(qyac + s->ydc_qdelta, 8);
671 quvdc = av_clip_uintp2(qyac + s->uvdc_qdelta, 8);
672 quvac = av_clip_uintp2(qyac + s->uvac_qdelta, 8);
673 qyac = av_clip_uintp2(qyac, 8);
675 s->segmentation.feat[i].qmul[0][0] = vp9_dc_qlookup[qydc];
676 s->segmentation.feat[i].qmul[0][1] = vp9_ac_qlookup[qyac];
677 s->segmentation.feat[i].qmul[1][0] = vp9_dc_qlookup[quvdc];
678 s->segmentation.feat[i].qmul[1][1] = vp9_ac_qlookup[quvac];
680 sh = s->filter.level >= 32;
681 if (s->segmentation.feat[i].lf_enabled) {
682 if (s->segmentation.absolute_vals)
683 lflvl = s->segmentation.feat[i].lf_val;
685 lflvl = s->filter.level + s->segmentation.feat[i].lf_val;
687 lflvl = s->filter.level;
689 s->segmentation.feat[i].lflvl[0][0] =
690 s->segmentation.feat[i].lflvl[0][1] =
691 av_clip_uintp2(lflvl + (s->lf_delta.ref[0] << sh), 6);
692 for (j = 1; j < 4; j++) {
693 s->segmentation.feat[i].lflvl[j][0] =
694 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
695 s->lf_delta.mode[0]) << sh), 6);
696 s->segmentation.feat[i].lflvl[j][1] =
697 av_clip_uintp2(lflvl + ((s->lf_delta.ref[j] +
698 s->lf_delta.mode[1]) << sh), 6);
/* tiling info */
703 if ((res = update_size(ctx, w, h)) < 0) {
704 av_log(ctx, AV_LOG_ERROR, "Failed to initialize decoder for %dx%d\n", w, h);
// min/max log2_tile_cols are derived from the superblock width
707 for (s->tiling.log2_tile_cols = 0;
708 (s->sb_cols >> s->tiling.log2_tile_cols) > 64;
709 s->tiling.log2_tile_cols++) ;
710 for (max = 0; (s->sb_cols >> max) >= 4; max++) ;
711 max = FFMAX(0, max - 1);
712 while (max > s->tiling.log2_tile_cols) {
713 if (get_bits1(&s->gb))
714 s->tiling.log2_tile_cols++;
718 s->tiling.log2_tile_rows = decode012(&s->gb);
719 s->tiling.tile_rows = 1 << s->tiling.log2_tile_rows;
720 if (s->tiling.tile_cols != (1 << s->tiling.log2_tile_cols)) {
721 s->tiling.tile_cols = 1 << s->tiling.log2_tile_cols;
// one range coder per tile column
722 s->c_b = av_fast_realloc(s->c_b, &s->c_b_size,
723 sizeof(VP56RangeCoder) * s->tiling.tile_cols);
725 av_log(ctx, AV_LOG_ERROR, "Ran out of memory during range coder init\n");
726 return AVERROR(ENOMEM);
// keyframe/error-resilient/intra-only frames reset all probability contexts
730 if (s->keyframe || s->errorres || s->intraonly) {
731 s->prob_ctx[0].p = s->prob_ctx[1].p = s->prob_ctx[2].p =
732 s->prob_ctx[3].p = vp9_default_probs;
733 memcpy(s->prob_ctx[0].coef, vp9_default_coef_probs,
734 sizeof(vp9_default_coef_probs));
735 memcpy(s->prob_ctx[1].coef, vp9_default_coef_probs,
736 sizeof(vp9_default_coef_probs));
737 memcpy(s->prob_ctx[2].coef, vp9_default_coef_probs,
738 sizeof(vp9_default_coef_probs));
739 memcpy(s->prob_ctx[3].coef, vp9_default_coef_probs,
740 sizeof(vp9_default_coef_probs));
743 // next 16 bits is size of the rest of the header (arith-coded)
744 size2 = get_bits(&s->gb, 16);
745 data2 = align_get_bits(&s->gb);
746 if (size2 > size - (data2 - data)) {
747 av_log(ctx, AV_LOG_ERROR, "Invalid compressed header size\n");
748 return AVERROR_INVALIDDATA;
750 ff_vp56_init_range_decoder(&s->c, data2, size2);
751 if (vp56_rac_get_prob_branchy(&s->c, 128)) { // marker bit
752 av_log(ctx, AV_LOG_ERROR, "Marker bit was set\n");
753 return AVERROR_INVALIDDATA;
// reset adaptation counters (coef/eob only for intra frames)
756 if (s->keyframe || s->intraonly) {
757 memset(s->counts.coef, 0, sizeof(s->counts.coef) + sizeof(s->counts.eob));
759 memset(&s->counts, 0, sizeof(s->counts));
761 // FIXME is it faster to not copy here, but do it down in the fw updates
762 // as explicit copies if the fw update is missing (and skip the copy upon
764 s->prob.p = s->prob_ctx[c].p;
/* txfm updates */
768 s->txfmmode = TX_4X4;
770 s->txfmmode = vp8_rac_get_uint(&s->c, 2);
771 if (s->txfmmode == 3)
772 s->txfmmode += vp8_rac_get(&s->c);
774 if (s->txfmmode == TX_SWITCHABLE) {
775 for (i = 0; i < 2; i++)
776 if (vp56_rac_get_prob_branchy(&s->c, 252))
777 s->prob.p.tx8p[i] = update_prob(&s->c, s->prob.p.tx8p[i]);
778 for (i = 0; i < 2; i++)
779 for (j = 0; j < 2; j++)
780 if (vp56_rac_get_prob_branchy(&s->c, 252))
781 s->prob.p.tx16p[i][j] =
782 update_prob(&s->c, s->prob.p.tx16p[i][j]);
783 for (i = 0; i < 2; i++)
784 for (j = 0; j < 3; j++)
785 if (vp56_rac_get_prob_branchy(&s->c, 252))
786 s->prob.p.tx32p[i][j] =
787 update_prob(&s->c, s->prob.p.tx32p[i][j]);
/* coef probability updates, per txfm size */
792 for (i = 0; i < 4; i++) {
793 uint8_t (*ref)[2][6][6][3] = s->prob_ctx[c].coef[i];
794 if (vp8_rac_get(&s->c)) {
795 for (j = 0; j < 2; j++)
796 for (k = 0; k < 2; k++)
797 for (l = 0; l < 6; l++)
798 for (m = 0; m < 6; m++) {
799 uint8_t *p = s->prob.coef[i][j][k][l][m];
800 uint8_t *r = ref[j][k][l][m];
801 if (m >= 3 && l == 0) // dc only has 3 pt
803 for (n = 0; n < 3; n++) {
804 if (vp56_rac_get_prob_branchy(&s->c, 252)) {
805 p[n] = update_prob(&s->c, r[n]);
// no update flag: copy the reference probs unchanged
813 for (j = 0; j < 2; j++)
814 for (k = 0; k < 2; k++)
815 for (l = 0; l < 6; l++)
816 for (m = 0; m < 6; m++) {
817 uint8_t *p = s->prob.coef[i][j][k][l][m];
818 uint8_t *r = ref[j][k][l][m];
819 if (m > 3 && l == 0) // dc only has 3 pt
825 if (s->txfmmode == i)
/* mode updates */
830 for (i = 0; i < 3; i++)
831 if (vp56_rac_get_prob_branchy(&s->c, 252))
832 s->prob.p.skip[i] = update_prob(&s->c, s->prob.p.skip[i]);
833 if (!s->keyframe && !s->intraonly) {
834 for (i = 0; i < 7; i++)
835 for (j = 0; j < 3; j++)
836 if (vp56_rac_get_prob_branchy(&s->c, 252))
837 s->prob.p.mv_mode[i][j] =
838 update_prob(&s->c, s->prob.p.mv_mode[i][j]);
840 if (s->filtermode == FILTER_SWITCHABLE)
841 for (i = 0; i < 4; i++)
842 for (j = 0; j < 2; j++)
843 if (vp56_rac_get_prob_branchy(&s->c, 252))
844 s->prob.p.filter[i][j] =
845 update_prob(&s->c, s->prob.p.filter[i][j]);
847 for (i = 0; i < 4; i++)
848 if (vp56_rac_get_prob_branchy(&s->c, 252))
849 s->prob.p.intra[i] = update_prob(&s->c, s->prob.p.intra[i]);
851 if (s->allowcompinter) {
852 s->comppredmode = vp8_rac_get(&s->c);
854 s->comppredmode += vp8_rac_get(&s->c);
855 if (s->comppredmode == PRED_SWITCHABLE)
856 for (i = 0; i < 5; i++)
857 if (vp56_rac_get_prob_branchy(&s->c, 252))
859 update_prob(&s->c, s->prob.p.comp[i]);
861 s->comppredmode = PRED_SINGLEREF;
864 if (s->comppredmode != PRED_COMPREF) {
865 for (i = 0; i < 5; i++) {
866 if (vp56_rac_get_prob_branchy(&s->c, 252))
867 s->prob.p.single_ref[i][0] =
868 update_prob(&s->c, s->prob.p.single_ref[i][0]);
869 if (vp56_rac_get_prob_branchy(&s->c, 252))
870 s->prob.p.single_ref[i][1] =
871 update_prob(&s->c, s->prob.p.single_ref[i][1]);
875 if (s->comppredmode != PRED_SINGLEREF) {
876 for (i = 0; i < 5; i++)
877 if (vp56_rac_get_prob_branchy(&s->c, 252))
878 s->prob.p.comp_ref[i] =
879 update_prob(&s->c, s->prob.p.comp_ref[i]);
882 for (i = 0; i < 4; i++)
883 for (j = 0; j < 9; j++)
884 if (vp56_rac_get_prob_branchy(&s->c, 252))
885 s->prob.p.y_mode[i][j] =
886 update_prob(&s->c, s->prob.p.y_mode[i][j]);
888 for (i = 0; i < 4; i++)
889 for (j = 0; j < 4; j++)
890 for (k = 0; k < 3; k++)
891 if (vp56_rac_get_prob_branchy(&s->c, 252))
892 s->prob.p.partition[3 - i][j][k] =
893 update_prob(&s->c, s->prob.p.partition[3 - i][j][k]);
895 // mv fields don't use the update_prob subexp model for some reason
896 for (i = 0; i < 3; i++)
897 if (vp56_rac_get_prob_branchy(&s->c, 252))
898 s->prob.p.mv_joint[i] = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
900 for (i = 0; i < 2; i++) {
901 if (vp56_rac_get_prob_branchy(&s->c, 252))
902 s->prob.p.mv_comp[i].sign = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
904 for (j = 0; j < 10; j++)
905 if (vp56_rac_get_prob_branchy(&s->c, 252))
906 s->prob.p.mv_comp[i].classes[j] =
907 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
909 if (vp56_rac_get_prob_branchy(&s->c, 252))
910 s->prob.p.mv_comp[i].class0 = (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
912 for (j = 0; j < 10; j++)
913 if (vp56_rac_get_prob_branchy(&s->c, 252))
914 s->prob.p.mv_comp[i].bits[j] =
915 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
918 for (i = 0; i < 2; i++) {
919 for (j = 0; j < 2; j++)
920 for (k = 0; k < 3; k++)
921 if (vp56_rac_get_prob_branchy(&s->c, 252))
922 s->prob.p.mv_comp[i].class0_fp[j][k] =
923 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
925 for (j = 0; j < 3; j++)
926 if (vp56_rac_get_prob_branchy(&s->c, 252))
927 s->prob.p.mv_comp[i].fp[j] =
928 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
931 if (s->highprecisionmvs) {
932 for (i = 0; i < 2; i++) {
933 if (vp56_rac_get_prob_branchy(&s->c, 252))
934 s->prob.p.mv_comp[i].class0_hp =
935 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
937 if (vp56_rac_get_prob_branchy(&s->c, 252))
938 s->prob.p.mv_comp[i].hp =
939 (vp8_rac_get_uint(&s->c, 7) << 1) | 1;
// total header size = uncompressed part + compressed part
944 return (data2 - data) + size2;
// Clamp a motion vector into the legal range for the current block
// (s->min_mv / s->max_mv). NOTE(review): the signature's continuation
// line (presumably `VP9Context *s)`) and the function braces are elided
// from this extract — confirm against the full file.
947 static av_always_inline void clamp_mv(VP56mv *dst, const VP56mv *src,
950 dst->x = av_clip(src->x, s->min_mv.x, s->max_mv.x);
951 dst->y = av_clip(src->y, s->min_mv.y, s->max_mv.y);
// Build the MV prediction for the current block: scan sub-block MVs,
// spatial neighbours (same ref, then different ref with optional sign
// flip), and the co-located MV in the previous frame. The RETURN_* macros
// return from the function as soon as `idx` distinct candidates are seen.
// NOTE(review): this extract is heavily elided — macro bodies, the
// VP9Block *b declaration, loop initialisation of i, and several closing
// braces are missing; comments describe only visible lines.
954 static void find_ref_mvs(VP9Context *s,
955 VP56mv *pmv, int ref, int z, int idx, int sb)
// candidate neighbour offsets (col,row in 8x8 units) per block size
957 static const int8_t mv_ref_blk_off[N_BS_SIZES][8][2] = {
958 [BS_64x64] = {{ 3, -1 }, { -1, 3 }, { 4, -1 }, { -1, 4 },
959 { -1, -1 }, { 0, -1 }, { -1, 0 }, { 6, -1 }},
960 [BS_64x32] = {{ 0, -1 }, { -1, 0 }, { 4, -1 }, { -1, 2 },
961 { -1, -1 }, { 0, -3 }, { -3, 0 }, { 2, -1 }},
962 [BS_32x64] = {{ -1, 0 }, { 0, -1 }, { -1, 4 }, { 2, -1 },
963 { -1, -1 }, { -3, 0 }, { 0, -3 }, { -1, 2 }},
964 [BS_32x32] = {{ 1, -1 }, { -1, 1 }, { 2, -1 }, { -1, 2 },
965 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
966 [BS_32x16] = {{ 0, -1 }, { -1, 0 }, { 2, -1 }, { -1, -1 },
967 { -1, 1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
968 [BS_16x32] = {{ -1, 0 }, { 0, -1 }, { -1, 2 }, { -1, -1 },
969 { 1, -1 }, { -3, 0 }, { 0, -3 }, { -3, -3 }},
970 [BS_16x16] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, 1 },
971 { -1, -1 }, { 0, -3 }, { -3, 0 }, { -3, -3 }},
972 [BS_16x8] = {{ 0, -1 }, { -1, 0 }, { 1, -1 }, { -1, -1 },
973 { 0, -2 }, { -2, 0 }, { -2, -1 }, { -1, -2 }},
974 [BS_8x16] = {{ -1, 0 }, { 0, -1 }, { -1, 1 }, { -1, -1 },
975 { -2, 0 }, { 0, -2 }, { -1, -2 }, { -2, -1 }},
976 [BS_8x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
977 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
978 [BS_8x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
979 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
980 [BS_4x8] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
981 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
982 [BS_4x4] = {{ 0, -1 }, { -1, 0 }, { -1, -1 }, { 0, -2 },
983 { -2, 0 }, { -1, -2 }, { -2, -1 }, { -2, -2 }},
986 int row = s->row, col = s->col, row7 = s->row7;
987 const int8_t (*p)[2] = mv_ref_blk_off[b->bs];
988 #define INVALID_MV 0x80008000U
989 uint32_t mem = INVALID_MV;
// return an already-decoded sub-block MV without clamping
992 #define RETURN_DIRECT_MV(mv) \
994 uint32_t m = AV_RN32A(&mv); \
998 } else if (mem == INVALID_MV) { \
1000 } else if (m != mem) { \
// sub-block MVs of the current block take priority for b_idx > 0
1007 if (sb == 2 || sb == 1) {
1008 RETURN_DIRECT_MV(b->mv[0][z]);
1009 } else if (sb == 3) {
1010 RETURN_DIRECT_MV(b->mv[2][z]);
1011 RETURN_DIRECT_MV(b->mv[1][z]);
1012 RETURN_DIRECT_MV(b->mv[0][z]);
// return a clamped candidate MV; tracks the first match in `mem`
1015 #define RETURN_MV(mv) \
1020 clamp_mv(&tmp, &mv, s); \
1021 m = AV_RN32A(&tmp); \
1025 } else if (mem == INVALID_MV) { \
1027 } else if (m != mem) { \
1032 uint32_t m = AV_RN32A(&mv); \
1034 clamp_mv(pmv, &mv, s); \
1036 } else if (mem == INVALID_MV) { \
1038 } else if (m != mem) { \
1039 clamp_mv(pmv, &mv, s); \
// immediate above neighbour (same reference frame)
1046 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[(row - 1) * s->sb_cols * 8 + col];
1047 if (mv->ref[0] == ref) {
1048 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][0]);
1049 } else if (mv->ref[1] == ref) {
1050 RETURN_MV(s->above_mv_ctx[2 * col + (sb & 1)][1]);
// immediate left neighbour (same reference frame, same tile)
1053 if (col > s->tiling.tile_col_start) {
1054 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[row * s->sb_cols * 8 + col - 1];
1055 if (mv->ref[0] == ref) {
1056 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][0]);
1057 } else if (mv->ref[1] == ref) {
1058 RETURN_MV(s->left_mv_ctx[2 * row7 + (sb >> 1)][1]);
1066 // previously coded MVs in this neighbourhood, using same reference frame
1067 for (; i < 8; i++) {
1068 int c = p[i][0] + col, r = p[i][1] + row;
1070 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1071 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1073 if (mv->ref[0] == ref) {
1074 RETURN_MV(mv->mv[0]);
1075 } else if (mv->ref[1] == ref) {
1076 RETURN_MV(mv->mv[1]);
1081 // MV at this position in previous frame, using same reference frame
1082 if (s->use_last_frame_mvs) {
1083 struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
// in frame-threaded (non-2-pass) mode wait for the last frame's row
1085 if (!s->last_uses_2pass)
1086 ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1087 if (mv->ref[0] == ref) {
1088 RETURN_MV(mv->mv[0]);
1089 } else if (mv->ref[1] == ref) {
1090 RETURN_MV(mv->mv[1]);
// like RETURN_MV, but negates the MV when ref sign biases differ
1094 #define RETURN_SCALE_MV(mv, scale) \
1097 VP56mv mv_temp = { -mv.x, -mv.y }; \
1098 RETURN_MV(mv_temp); \
1104 // previously coded MVs in this neighbourhood, using different reference frame
1105 for (i = 0; i < 8; i++) {
1106 int c = p[i][0] + col, r = p[i][1] + row;
1108 if (c >= s->tiling.tile_col_start && c < s->cols && r >= 0 && r < s->rows) {
1109 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[r * s->sb_cols * 8 + c];
1111 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1112 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1114 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1115 // BUG - libvpx has this condition regardless of whether
1116 // we used the first ref MV and pre-scaling
1117 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1118 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1123 // MV at this position in previous frame, using different reference frame
1124 if (s->use_last_frame_mvs) {
1125 struct VP9mvrefPair *mv = &s->frames[LAST_FRAME].mv[row * s->sb_cols * 8 + col];
1127 // no need to await_progress, because we already did that above
1128 if (mv->ref[0] != ref && mv->ref[0] >= 0) {
1129 RETURN_SCALE_MV(mv->mv[0], s->signbias[mv->ref[0]] != s->signbias[ref]);
1131 if (mv->ref[1] != ref && mv->ref[1] >= 0 &&
1132 // BUG - libvpx has this condition regardless of whether
1133 // we used the first ref MV and pre-scaling
1134 AV_RN32A(&mv->mv[0]) != AV_RN32A(&mv->mv[1])) {
1135 RETURN_SCALE_MV(mv->mv[1], s->signbias[mv->ref[1]] != s->signbias[ref]);
1142 #undef RETURN_SCALE_MV
// Decode one motion-vector component (idx 0 = row/y, idx 1 = col/x) from
// the range coder, bumping the per-frame adaptation counters for every
// symbol read; `hp` selects whether the extra high-precision bit is coded.
// Returns the signed component value (sign applied at the end).
// NOTE(review): this file is a decimated excerpt - several original lines
// (the class0-vs-classes branch headers, closing braces) are missing
// between the numbered lines below, so the visible control flow is partial.
1145 static av_always_inline int read_mv_component(VP9Context *s, int idx, int hp)
// sign bit, then the magnitude class from the mv-class tree
1147 int bit, sign = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].sign);
1148 int n, c = vp8_rac_get_tree(&s->c, vp9_mv_class_tree,
1149 s->prob.p.mv_comp[idx].classes);
1151 s->counts.mv_comp[idx].sign[sign]++;
1152 s->counts.mv_comp[idx].classes[c]++;
// higher classes: read c raw magnitude bits ...
1156 for (n = 0, m = 0; m < c; m++) {
1157 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].bits[m]);
1159 s->counts.mv_comp[idx].bits[m][bit]++;
// ... then the 2-bit fractional (quarter-pel) part ...
1162 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree, s->prob.p.mv_comp[idx].fp);
1164 s->counts.mv_comp[idx].fp[bit]++;
// ... then the high-precision (eighth-pel) bit when hp is enabled
1166 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].hp);
1167 s->counts.mv_comp[idx].hp[bit]++;
1171 // bug in libvpx - we count for bw entropy purposes even if the
1173 s->counts.mv_comp[idx].hp[1]++;
// class0 path: single magnitude bit plus class0-specific fp/hp symbols
1177 n = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0);
1178 s->counts.mv_comp[idx].class0[n]++;
1179 bit = vp8_rac_get_tree(&s->c, vp9_mv_fp_tree,
1180 s->prob.p.mv_comp[idx].class0_fp[n]);
1181 s->counts.mv_comp[idx].class0_fp[n][bit]++;
1182 n = (n << 3) | (bit << 1);
1184 bit = vp56_rac_get_prob(&s->c, s->prob.p.mv_comp[idx].class0_hp);
1185 s->counts.mv_comp[idx].class0_hp[bit]++;
1189 // bug in libvpx - we count for bw entropy purposes even if the
1191 s->counts.mv_comp[idx].class0_hp[1]++;
// magnitude is n+1; apply the sign read first
1195 return sign ? -(n + 1) : (n + 1);
// Produce the final motion vector pair mv[0]/mv[1] for one (sub)block:
// ZEROMV clears them; otherwise the predictor comes from find_ref_mvs()
// and, for NEWMV, a residual decoded via read_mv_component() is added.
// `sb` is the sub-block index (-1 for whole-block); the second reference
// (mv[1]) is handled with the same steps when the block is compound.
// NOTE(review): decimated excerpt - the ZEROMV body, clamping code and
// several closing braces fall in the missing line ranges.
1198 static void fill_mv(VP9Context *s,
1199 VP56mv *mv, int mode, int sb)
1203 if (mode == ZEROMV) {
1208 // FIXME cache this value and reuse for other subblocks
// predictor for the first reference
1209 find_ref_mvs(s, &mv[0], b->ref[0], 0, mode == NEARMV,
1210 mode == NEWMV ? -1 : sb);
1211 // FIXME maybe move this code into find_ref_mvs()
// hp is only usable for small vectors (|x|,|y| < 64) - matches libvpx
1212 if ((mode == NEWMV || sb == -1) &&
1213 !(hp = s->highprecisionmvs && abs(mv[0].x) < 64 && abs(mv[0].y) < 64)) {
1227 if (mode == NEWMV) {
// joint symbol says which of y/x carry a non-zero residual
1228 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1229 s->prob.p.mv_joint);
1231 s->counts.mv_joint[j]++;
1232 if (j >= MV_JOINT_V)
1233 mv[0].y += read_mv_component(s, 0, hp);
1235 mv[0].x += read_mv_component(s, 1, hp);
// same procedure for the second reference of a compound block
1239 // FIXME cache this value and reuse for other subblocks
1240 find_ref_mvs(s, &mv[1], b->ref[1], 1, mode == NEARMV,
1241 mode == NEWMV ? -1 : sb);
1242 if ((mode == NEWMV || sb == -1) &&
1243 !(hp = s->highprecisionmvs && abs(mv[1].x) < 64 && abs(mv[1].y) < 64)) {
1257 if (mode == NEWMV) {
1258 enum MVJoint j = vp8_rac_get_tree(&s->c, vp9_mv_joint_tree,
1259 s->prob.p.mv_joint);
1261 s->counts.mv_joint[j]++;
1262 if (j >= MV_JOINT_V)
1263 mv[1].y += read_mv_component(s, 0, hp);
1265 mv[1].x += read_mv_component(s, 1, hp);
// Fill a w x h region of a byte context plane (row pitch `stride`) with
// the value v, using progressively wider stores (16/32/64-bit splats of
// the replicated byte) for the common power-of-two widths.
// NOTE(review): decimated excerpt - the switch/loop scaffolding that
// selects between these store widths is in the missing line ranges.
1271 static av_always_inline void setctx_2d(uint8_t *ptr, int w, int h,
1272 ptrdiff_t stride, int v)
1282 int v16 = v * 0x0101;
1290 uint32_t v32 = v * 0x01010101;
1299 uint64_t v64 = v * 0x0101010101010101ULL;
// fallback path presumably for !HAVE_FAST_64BIT - two 32-bit stores
1305 uint32_t v32 = v * 0x01010101;
1308 AV_WN32A(ptr + 4, v32);
// Decode all per-block mode information for the current block (s->row,
// s->col): segment id, skip flag, intra/inter decision, transform size,
// intra prediction modes or (for inter) references / interpolation filter
// / motion vectors - then propagate the results into the above/left
// context arrays and the frame-wide segmentation/mv buffers used by
// later blocks and the next frame.
// NOTE(review): decimated excerpt - else-branches, braces and some
// assignments fall in the missing original line ranges throughout.
1317 static void decode_mode(AVCodecContext *ctx)
// partition context templates indexed by block size
1319 static const uint8_t left_ctx[N_BS_SIZES] = {
1320 0x0, 0x8, 0x0, 0x8, 0xc, 0x8, 0xc, 0xe, 0xc, 0xe, 0xf, 0xe, 0xf
1322 static const uint8_t above_ctx[N_BS_SIZES] = {
1323 0x0, 0x0, 0x8, 0x8, 0x8, 0xc, 0xc, 0xc, 0xe, 0xe, 0xe, 0xf, 0xf
// largest transform size allowed for each block size
1325 static const uint8_t max_tx_for_bl_bp[N_BS_SIZES] = {
1326 TX_32X32, TX_32X32, TX_32X32, TX_32X32, TX_16X16, TX_16X16,
1327 TX_16X16, TX_8X8, TX_8X8, TX_8X8, TX_4X4, TX_4X4, TX_4X4
1329 VP9Context *s = ctx->priv_data;
1331 int row = s->row, col = s->col, row7 = s->row7;
1332 enum TxfmMode max_tx = max_tx_for_bl_bp[b->bs];
// block extent in 8x8 units, clipped to the frame edge
1333 int w4 = FFMIN(s->cols - col, bwh_tab[1][b->bs][0]);
1334 int h4 = FFMIN(s->rows - row, bwh_tab[1][b->bs][1]), y;
1335 int have_a = row > 0, have_l = col > s->tiling.tile_col_start;
1336 int vref, filter_id;
// --- segment id: explicit, temporally predicted, or from the map ---
1338 if (!s->segmentation.enabled) {
1340 } else if (s->keyframe || s->intraonly) {
1341 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree, s->prob.seg)
;
1342 } else if (!s->segmentation.update_map ||
1343 (s->segmentation.temporal &&
1344 vp56_rac_get_prob_branchy(&s->c,
1345 s->prob.segpred[s->above_segpred_ctx[col] +
1346 s->left_segpred_ctx[row7]]))) {
// temporal prediction: take the minimum seg id over the co-located
// area of the previous frame's segmentation map
1348 uint8_t *refsegmap = s->frames[LAST_FRAME].segmentation_map;
1350 if (!s->last_uses_2pass)
1351 ff_thread_await_progress(&s->frames[LAST_FRAME].tf, row >> 3, 0);
1352 for (y = 0; y < h4; y++)
1353 for (x = 0; x < w4; x++)
1354 pred = FFMIN(pred, refsegmap[(y + row) * 8 * s->sb_cols + x + col]);
1355 av_assert1(pred < 8);
1358 memset(&s->above_segpred_ctx[col], 1, w4);
1359 memset(&s->left_segpred_ctx[row7], 1, h4);
1361 b->seg_id = vp8_rac_get_tree(&s->c, vp9_segmentation_tree,
1364 memset(&s->above_segpred_ctx[col], 0, w4);
1365 memset(&s->left_segpred_ctx[row7], 0, h4);
// store the decoded seg id into this frame's map for later use
1367 if (s->segmentation.enabled &&
1368 (s->segmentation.update_map || s->keyframe || s->intraonly)) {
1369 setctx_2d(&s->frames[CUR_FRAME].segmentation_map[row * 8 * s->sb_cols + col],
1370 w4, h4, 8 * s->sb_cols, b->seg_id);
// --- skip flag: forced by segment feature, or coded with a/l context ---
1373 b->skip = s->segmentation.enabled &&
1374 s->segmentation.feat[b->seg_id].skip_enabled;
1376 int c = s->left_skip_ctx[row7] + s->above_skip_ctx[col];
1377 b->skip = vp56_rac_get_prob(&s->c, s->prob.p.skip[c]);
1378 s->counts.skip[c][b->skip]++;
// --- intra/inter decision ---
1381 if (s->keyframe || s->intraonly) {
1383 } else if (s->segmentation.feat[b->seg_id].ref_enabled) {
// segment pins the reference: ref_val 0 means intra
1384 b->intra = !s->segmentation.feat[b->seg_id].ref_val;
1388 if (have_a && have_l) {
1389 c = s->above_intra_ctx[col] + s->left_intra_ctx[row7];
1392 c = have_a ? 2 * s->above_intra_ctx[col] :
1393 have_l ? 2 * s->left_intra_ctx[row7] : 0;
1395 bit = vp56_rac_get_prob(&s->c, s->prob.p.intra[c]);
1396 s->counts.intra[c][bit]++;
// --- transform size (only coded when the block has coefficients) ---
1400 if ((b->intra || !b->skip) && s->txfmmode == TX_SWITCHABLE) {
1404 c = (s->above_skip_ctx[col] ? max_tx :
1405 s->above_txfm_ctx[col]) +
1406 (s->left_skip_ctx[row7] ? max_tx :
1407 s->left_txfm_ctx[row7]) > max_tx;
1409 c = s->above_skip_ctx[col] ? 1 :
1410 (s->above_txfm_ctx[col] * 2 > max_tx);
1412 } else if (have_l) {
1413 c = s->left_skip_ctx[row7] ? 1 :
1414 (s->left_txfm_ctx[row7] * 2 > max_tx);
// unary-coded tx size, capped by max_tx (tx32p/tx16p/tx8p trees)
1420 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][0]);
1422 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][1]);
1424 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx32p[c][2]);
1426 s->counts.tx32p[c][b->tx]++;
1429 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][0]);
1431 b->tx += vp56_rac_get_prob(&s->c, s->prob.p.tx16p[c][1]);
1432 s->counts.tx16p[c][b->tx]++;
1435 b->tx = vp56_rac_get_prob(&s->c, s->prob.p.tx8p[c]);
1436 s->counts.tx8p[c][b->tx]++;
1443 b->tx = FFMIN(max_tx, s->txfmmode);
// --- keyframe/intra-only: intra modes from the fixed kf probabilities ---
1446 if (s->keyframe || s->intraonly) {
1447 uint8_t *a = &s->above_mode_ctx[col * 2];
1448 uint8_t *l = &s->left_mode_ctx[(row7) << 1];
1451 if (b->bs > BS_8x8) {
1452 // FIXME the memory storage intermediates here aren't really
1453 // necessary, they're just there to make the code slightly
// sub-8x8: up to 4 separate modes, each conditioned on a/l neighbours
1455 b->mode[0] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1456 vp9_default_kf_ymode_probs[a[0]][l[0]]);
1457 if (b->bs != BS_8x4) {
1458 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1459 vp9_default_kf_ymode_probs[a[1]][b->mode[0]]);
1460 l[0] = a[1] = b->mode[1];
1462 l[0] = a[1] = b->mode[1] = b->mode[0];
1464 if (b->bs != BS_4x8) {
1465 b->mode[2] = a[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1466 vp9_default_kf_ymode_probs[a[0]][l[1]]);
1467 if (b->bs != BS_8x4) {
1468 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1469 vp9_default_kf_ymode_probs[a[1]][b->mode[2]]);
1470 l[1] = a[1] = b->mode[3];
1472 l[1] = a[1] = b->mode[3] = b->mode[2];
1475 b->mode[2] = b->mode[0];
1476 l[1] = a[1] = b->mode[3] = b->mode[1];
// >= 8x8: one mode for the whole block
1479 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1480 vp9_default_kf_ymode_probs[*a][*l]);
1481 b->mode[3] = b->mode[2] = b->mode[1] = b->mode[0];
1482 // FIXME this can probably be optimized
1483 memset(a, b->mode[0], bwh_tab[0][b->bs][0]);
1484 memset(l, b->mode[0], bwh_tab[0][b->bs][1]);
1486 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1487 vp9_default_kf_uvmode_probs[b->mode[3]]);
// --- inter frame but intra block: adaptive y_mode probabilities ---
1488 } else if (b->intra) {
1490 if (b->bs > BS_8x8) {
1491 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1492 s->prob.p.y_mode[0]);
1493 s->counts.y_mode[0][b->mode[0]]++;
1494 if (b->bs != BS_8x4) {
1495 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1496 s->prob.p.y_mode[0]);
1497 s->counts.y_mode[0][b->mode[1]]++;
1499 b->mode[1] = b->mode[0];
1501 if (b->bs != BS_4x8) {
1502 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1503 s->prob.p.y_mode[0]);
1504 s->counts.y_mode[0][b->mode[2]]++;
1505 if (b->bs != BS_8x4) {
1506 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1507 s->prob.p.y_mode[0]);
1508 s->counts.y_mode[0][b->mode[3]]++;
1510 b->mode[3] = b->mode[2];
1513 b->mode[2] = b->mode[0];
1514 b->mode[3] = b->mode[1];
// block-size group selects which y_mode probability set is used
1517 static const uint8_t size_group[10] = {
1518 3, 3, 3, 3, 2, 2, 2, 1, 1, 1
1520 int sz = size_group[b->bs];
1522 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1523 s->prob.p.y_mode[sz]);
1524 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
1525 s->counts.y_mode[sz][b->mode[3]]++;
1527 b->uvmode = vp8_rac_get_tree(&s->c, vp9_intramode_tree,
1528 s->prob.p.uv_mode[b->mode[3]]);
1529 s->counts.uv_mode[b->mode[3]][b->uvmode]++;
// --- inter block: references, filter, modes, motion vectors ---
// context for the inter-mode symbol from the a/l neighbour modes
1531 static const uint8_t inter_mode_ctx_lut[14][14] = {
1532 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1533 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1534 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1535 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1536 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1537 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1538 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1539 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1540 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1541 { 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5 },
1542 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1543 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 2, 2, 1, 3 },
1544 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 1, 1, 0, 3 },
1545 { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 3, 3, 3, 4 },
1548 if (s->segmentation.feat[b->seg_id].ref_enabled) {
1549 av_assert2(s->segmentation.feat[b->seg_id].ref_val != 0);
1551 b->ref[0] = s->segmentation.feat[b->seg_id].ref_val - 1;
1553 // read comp_pred flag
1554 if (s->comppredmode != PRED_SWITCHABLE) {
1555 b->comp = s->comppredmode == PRED_COMPREF;
1559 // FIXME add intra as ref=0xff (or -1) to make these easier?
1562 if (s->above_comp_ctx[col] && s->left_comp_ctx[row7]) {
1564 } else if (s->above_comp_ctx[col]) {
1565 c = 2 + (s->left_intra_ctx[row7] ||
1566 s->left_ref_ctx[row7] == s->fixcompref);
1567 } else if (s->left_comp_ctx[row7]) {
1568 c = 2 + (s->above_intra_ctx[col] ||
1569 s->above_ref_ctx[col] == s->fixcompref);
1571 c = (!s->above_intra_ctx[col] &&
1572 s->above_ref_ctx[col] == s->fixcompref) ^
1573 (!s->left_intra_ctx[row7] &&
// NOTE(review): `row & 7` here vs `row7` everywhere else - presumably
// identical (row7 looks like row & 7), but verify against the full file
1574 s->left_ref_ctx[row & 7] == s->fixcompref);
1577 c = s->above_comp_ctx[col] ? 3 :
1578 (!s->above_intra_ctx[col] && s->above_ref_ctx[col] == s->fixcompref);
1580 } else if (have_l) {
1581 c = s->left_comp_ctx[row7] ? 3 :
1582 (!s->left_intra_ctx[row7] && s->left_ref_ctx[row7] == s->fixcompref);
1586 b->comp = vp56_rac_get_prob(&s->c, s->prob.p.comp[c]);
1587 s->counts.comp[c][b->comp]++;
1590 // read actual references
1591 // FIXME probably cache a few variables here to prevent repetitive
1592 // memory accesses below
1593 if (b->comp) /* two references */ {
1594 int fix_idx = s->signbias[s->fixcompref], var_idx = !fix_idx, c, bit;
1596 b->ref[fix_idx] = s->fixcompref;
1597 // FIXME can this codeblob be replaced by some sort of LUT?
// context derivation for the variable (non-fixed) compound reference
1600 if (s->above_intra_ctx[col]) {
1601 if (s->left_intra_ctx[row7]) {
1604 c = 1 + 2 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1606 } else if (s->left_intra_ctx[row7]) {
1607 c = 1 + 2 * (s->above_ref_ctx[col] != s->varcompref[1]);
1609 int refl = s->left_ref_ctx[row7], refa = s->above_ref_ctx[col];
1611 if (refl == refa && refa == s->varcompref[1]) {
1613 } else if (!s->left_comp_ctx[row7] && !s->above_comp_ctx[col]) {
1614 if ((refa == s->fixcompref && refl == s->varcompref[0]) ||
1615 (refl == s->fixcompref && refa == s->varcompref[0])) {
1618 c = (refa == refl) ? 3 : 1;
1620 } else if (!s->left_comp_ctx[row7]) {
1621 if (refa == s->varcompref[1] && refl != s->varcompref[1]) {
1624 c = (refl == s->varcompref[1] &&
1625 refa != s->varcompref[1]) ? 2 : 4;
1627 } else if (!s->above_comp_ctx[col]) {
1628 if (refl == s->varcompref[1] && refa != s->varcompref[1]) {
1631 c = (refa == s->varcompref[1] &&
1632 refl != s->varcompref[1]) ? 2 : 4;
1635 c = (refl == refa) ? 4 : 2;
1639 if (s->above_intra_ctx[col]) {
1641 } else if (s->above_comp_ctx[col]) {
1642 c = 4 * (s->above_ref_ctx[col] != s->varcompref[1]);
1644 c = 3 * (s->above_ref_ctx[col] != s->varcompref[1]);
1647 } else if (have_l) {
1648 if (s->left_intra_ctx[row7]) {
1650 } else if (s->left_comp_ctx[row7]) {
1651 c = 4 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1653 c = 3 * (s->left_ref_ctx[row7] != s->varcompref[1]);
1658 bit = vp56_rac_get_prob(&s->c, s->prob.p.comp_ref[c]);
1659 b->ref[var_idx] = s->varcompref[bit];
1660 s->counts.comp_ref[c][bit]++;
1661 } else /* single reference */ {
// first bit: LAST vs GOLDEN/ALTREF
1664 if (have_a && !s->above_intra_ctx[col]) {
1665 if (have_l && !s->left_intra_ctx[row7]) {
1666 if (s->left_comp_ctx[row7]) {
1667 if (s->above_comp_ctx[col]) {
1668 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7] ||
1669 !s->above_ref_ctx[col]);
1671 c = (3 * !s->above_ref_ctx[col]) +
1672 (!s->fixcompref || !s->left_ref_ctx[row7]);
1674 } else if (s->above_comp_ctx[col]) {
1675 c = (3 * !s->left_ref_ctx[row7]) +
1676 (!s->fixcompref || !s->above_ref_ctx[col]);
1678 c = 2 * !s->left_ref_ctx[row7] + 2 * !s->above_ref_ctx[col];
1680 } else if (s->above_intra_ctx[col]) {
1682 } else if (s->above_comp_ctx[col]) {
1683 c = 1 + (!s->fixcompref || !s->above_ref_ctx[col]);
1685 c = 4 * (!s->above_ref_ctx[col]);
1687 } else if (have_l && !s->left_intra_ctx[row7]) {
// NOTE(review): this re-test of left_intra_ctx can never be true inside
// this branch (it was just checked false); matches upstream code
1688 if (s->left_intra_ctx[row7]) {
1690 } else if (s->left_comp_ctx[row7]) {
1691 c = 1 + (!s->fixcompref || !s->left_ref_ctx[row7]);
1693 c = 4 * (!s->left_ref_ctx[row7]);
1698 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][0]);
1699 s->counts.single_ref[c][0][bit]++;
// second bit: GOLDEN vs ALTREF
1703 // FIXME can this codeblob be replaced by some sort of LUT?
1706 if (s->left_intra_ctx[row7]) {
1707 if (s->above_intra_ctx[col]) {
1709 } else if (s->above_comp_ctx[col]) {
1710 c = 1 + 2 * (s->fixcompref == 1 ||
1711 s->above_ref_ctx[col] == 1);
1712 } else if (!s->above_ref_ctx[col]) {
1715 c = 4 * (s->above_ref_ctx[col] == 1);
1717 } else if (s->above_intra_ctx[col]) {
1718 if (s->left_intra_ctx[row7]) {
1720 } else if (s->left_comp_ctx[row7]) {
1721 c = 1 + 2 * (s->fixcompref == 1 ||
1722 s->left_ref_ctx[row7] == 1);
1723 } else if (!s->left_ref_ctx[row7]) {
1726 c = 4 * (s->left_ref_ctx[row7] == 1);
1728 } else if (s->above_comp_ctx[col]) {
1729 if (s->left_comp_ctx[row7]) {
1730 if (s->left_ref_ctx[row7] == s->above_ref_ctx[col]) {
1731 c = 3 * (s->fixcompref == 1 ||
1732 s->left_ref_ctx[row7] == 1);
1736 } else if (!s->left_ref_ctx[row7]) {
1737 c = 1 + 2 * (s->fixcompref == 1 ||
1738 s->above_ref_ctx[col] == 1);
1740 c = 3 * (s->left_ref_ctx[row7] == 1) +
1741 (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1743 } else if (s->left_comp_ctx[row7]) {
1744 if (!s->above_ref_ctx[col]) {
1745 c = 1 + 2 * (s->fixcompref == 1 ||
1746 s->left_ref_ctx[row7] == 1);
1748 c = 3 * (s->above_ref_ctx[col] == 1) +
1749 (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1751 } else if (!s->above_ref_ctx[col]) {
1752 if (!s->left_ref_ctx[row7]) {
1755 c = 4 * (s->left_ref_ctx[row7] == 1);
1757 } else if (!s->left_ref_ctx[row7]) {
1758 c = 4 * (s->above_ref_ctx[col] == 1);
1760 c = 2 * (s->left_ref_ctx[row7] == 1) +
1761 2 * (s->above_ref_ctx[col] == 1);
1764 if (s->above_intra_ctx[col] ||
1765 (!s->above_comp_ctx[col] && !s->above_ref_ctx[col])) {
1767 } else if (s->above_comp_ctx[col]) {
1768 c = 3 * (s->fixcompref == 1 || s->above_ref_ctx[col] == 1);
1770 c = 4 * (s->above_ref_ctx[col] == 1);
1773 } else if (have_l) {
1774 if (s->left_intra_ctx[row7] ||
1775 (!s->left_comp_ctx[row7] && !s->left_ref_ctx[row7])) {
1777 } else if (s->left_comp_ctx[row7]) {
1778 c = 3 * (s->fixcompref == 1 || s->left_ref_ctx[row7] == 1);
1780 c = 4 * (s->left_ref_ctx[row7] == 1);
1785 bit = vp56_rac_get_prob(&s->c, s->prob.p.single_ref[c][1]);
1786 s->counts.single_ref[c][1][bit]++;
1787 b->ref[0] = 1 + bit;
// --- inter modes and motion vectors ---
1792 if (b->bs <= BS_8x8) {
1793 if (s->segmentation.feat[b->seg_id].skip_enabled) {
// segment-level skip forces ZEROMV
1794 b->mode[0] = b->mode[1] = b->mode[2] = b->mode[3] = ZEROMV;
1796 static const uint8_t off[10] = {
1797 3, 0, 0, 1, 0, 0, 0, 0, 0, 0
1800 // FIXME this needs to use the LUT tables from find_ref_mvs
1801 // because not all are -1,0/0,-1
1802 int c = inter_mode_ctx_lut[s->above_mode_ctx[col + off[b->bs]]]
1803 [s->left_mode_ctx[row7 + off[b->bs]]];
1805 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1806 s->prob.p.mv_mode[c]);
1807 b->mode[1] = b->mode[2] = b->mode[3] = b->mode[0];
// mv_mode counts are indexed from NEARESTMV (= 10), hence the -10
1808 s->counts.mv_mode[c][b->mode[0] - 10]++;
// --- interpolation filter ---
1812 if (s->filtermode == FILTER_SWITCHABLE) {
1815 if (have_a && s->above_mode_ctx[col] >= NEARESTMV) {
1816 if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1817 c = s->above_filter_ctx[col] == s->left_filter_ctx[row7] ?
1818 s->left_filter_ctx[row7] : 3;
1820 c = s->above_filter_ctx[col];
1822 } else if (have_l && s->left_mode_ctx[row7] >= NEARESTMV) {
1823 c = s->left_filter_ctx[row7];
1828 filter_id = vp8_rac_get_tree(&s->c, vp9_filter_tree,
1829 s->prob.p.filter[c]);
1830 s->counts.filter[c][filter_id]++;
1831 b->filter = vp9_filter_lut[filter_id];
1833 b->filter = s->filtermode;
// sub-8x8: per-sub-block mode + mv; otherwise one mode/mv for the block
1836 if (b->bs > BS_8x8) {
1837 int c = inter_mode_ctx_lut[s->above_mode_ctx[col]][s->left_mode_ctx[row7]];
1839 b->mode[0] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1840 s->prob.p.mv_mode[c]);
1841 s->counts.mv_mode[c][b->mode[0] - 10]++;
1842 fill_mv(s, b->mv[0], b->mode[0], 0);
1844 if (b->bs != BS_8x4) {
1845 b->mode[1] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1846 s->prob.p.mv_mode[c]);
1847 s->counts.mv_mode[c][b->mode[1] - 10]++;
1848 fill_mv(s, b->mv[1], b->mode[1], 1);
1850 b->mode[1] = b->mode[0];
1851 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1852 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1855 if (b->bs != BS_4x8) {
1856 b->mode[2] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1857 s->prob.p.mv_mode[c]);
1858 s->counts.mv_mode[c][b->mode[2] - 10]++;
1859 fill_mv(s, b->mv[2], b->mode[2], 2);
1861 if (b->bs != BS_8x4) {
1862 b->mode[3] = vp8_rac_get_tree(&s->c, vp9_inter_mode_tree,
1863 s->prob.p.mv_mode[c]);
1864 s->counts.mv_mode[c][b->mode[3] - 10]++;
1865 fill_mv(s, b->mv[3], b->mode[3], 3);
1867 b->mode[3] = b->mode[2];
1868 AV_COPY32(&b->mv[3][0], &b->mv[2][0]);
1869 AV_COPY32(&b->mv[3][1], &b->mv[2][1]);
1872 b->mode[2] = b->mode[0];
1873 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1874 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1875 b->mode[3] = b->mode[1];
1876 AV_COPY32(&b->mv[3][0], &b->mv[1][0]);
1877 AV_COPY32(&b->mv[3][1], &b->mv[1][1]);
// whole-block mv, replicated into all four sub-block slots
1880 fill_mv(s, b->mv[0], b->mode[0], -1);
1881 AV_COPY32(&b->mv[1][0], &b->mv[0][0]);
1882 AV_COPY32(&b->mv[2][0], &b->mv[0][0]);
1883 AV_COPY32(&b->mv[3][0], &b->mv[0][0]);
1884 AV_COPY32(&b->mv[1][1], &b->mv[0][1]);
1885 AV_COPY32(&b->mv[2][1], &b->mv[0][1]);
1886 AV_COPY32(&b->mv[3][1], &b->mv[0][1]);
// reference id to splat into the a/l ref context arrays below
1889 vref = b->ref[b->comp ? s->signbias[s->varcompref[0]] : 0];
// --- context propagation: splat per-block values into a/l arrays ---
// n-byte splat via the widest aligned store available (64-bit variant;
// the second definition below is presumably the !HAVE_FAST_64BIT one)
1893 #define SPLAT_CTX(var, val, n) \
1895 case 1: var = val; break; \
1896 case 2: AV_WN16A(&var, val * 0x0101); break; \
1897 case 4: AV_WN32A(&var, val * 0x01010101); break; \
1898 case 8: AV_WN64A(&var, val * 0x0101010101010101ULL); break; \
1900 uint64_t v64 = val * 0x0101010101010101ULL; \
1901 AV_WN64A( &var, v64); \
1902 AV_WN64A(&((uint8_t *) &var)[8], v64); \
1907 #define SPLAT_CTX(var, val, n) \
1909 case 1: var = val; break; \
1910 case 2: AV_WN16A(&var, val * 0x0101); break; \
1911 case 4: AV_WN32A(&var, val * 0x01010101); break; \
1913 uint32_t v32 = val * 0x01010101; \
1914 AV_WN32A( &var, v32); \
1915 AV_WN32A(&((uint8_t *) &var)[4], v32); \
1919 uint32_t v32 = val * 0x01010101; \
1920 AV_WN32A( &var, v32); \
1921 AV_WN32A(&((uint8_t *) &var)[4], v32); \
1922 AV_WN32A(&((uint8_t *) &var)[8], v32); \
1923 AV_WN32A(&((uint8_t *) &var)[12], v32); \
// width in 8x8 units selects how many context bytes to write
1929 switch (bwh_tab[1][b->bs][0]) {
1930 #define SET_CTXS(dir, off, n) \
1932 SPLAT_CTX(s->dir##_skip_ctx[off], b->skip, n); \
1933 SPLAT_CTX(s->dir##_txfm_ctx[off], b->tx, n); \
1934 SPLAT_CTX(s->dir##_partition_ctx[off], dir##_ctx[b->bs], n); \
1935 if (!s->keyframe && !s->intraonly) { \
1936 SPLAT_CTX(s->dir##_intra_ctx[off], b->intra, n); \
1937 SPLAT_CTX(s->dir##_comp_ctx[off], b->comp, n); \
1938 SPLAT_CTX(s->dir##_mode_ctx[off], b->mode[3], n); \
1940 SPLAT_CTX(s->dir##_ref_ctx[off], vref, n); \
1941 if (s->filtermode == FILTER_SWITCHABLE) { \
1942 SPLAT_CTX(s->dir##_filter_ctx[off], filter_id, n); \
1947 case 1: SET_CTXS(above, col, 1); break;
1948 case 2: SET_CTXS(above, col, 2); break;
1949 case 4: SET_CTXS(above, col, 4); break;
1950 case 8: SET_CTXS(above, col, 8); break;
1952 switch (bwh_tab[1][b->bs][1]) {
1953 case 1: SET_CTXS(left, row7, 1); break;
1954 case 2: SET_CTXS(left, row7, 2); break;
1955 case 4: SET_CTXS(left, row7, 4); break;
1956 case 8: SET_CTXS(left, row7, 8); break;
// --- store mvs into the a/l mv context (for later find_ref_mvs calls) ---
1961 if (!s->keyframe && !s->intraonly) {
1962 if (b->bs > BS_8x8) {
1963 int mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1965 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][0], &b->mv[1][0]);
1966 AV_COPY32(&s->left_mv_ctx[row7 * 2 + 0][1], &b->mv[1][1]);
1967 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][0], mv0);
1968 AV_WN32A(&s->left_mv_ctx[row7 * 2 + 1][1], mv1);
1969 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][0], &b->mv[2][0]);
1970 AV_COPY32(&s->above_mv_ctx[col * 2 + 0][1], &b->mv[2][1]);
1971 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][0], mv0);
1972 AV_WN32A(&s->above_mv_ctx[col * 2 + 1][1], mv1);
1974 int n, mv0 = AV_RN32A(&b->mv[3][0]), mv1 = AV_RN32A(&b->mv[3][1]);
1976 for (n = 0; n < w4 * 2; n++) {
1977 AV_WN32A(&s->above_mv_ctx[col * 2 + n][0], mv0);
1978 AV_WN32A(&s->above_mv_ctx[col * 2 + n][1], mv1);
1980 for (n = 0; n < h4 * 2; n++) {
1981 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][0], mv0);
1982 AV_WN32A(&s->left_mv_ctx[row7 * 2 + n][1], mv1);
// --- store refs/mvs into the frame-wide mv buffer (used by next frame) ---
1988 for (y = 0; y < h4; y++) {
1989 int x, o = (row + y) * s->sb_cols * 8 + col;
1990 struct VP9mvrefPair *mv = &s->frames[CUR_FRAME].mv[o];
1993 for (x = 0; x < w4; x++) {
1997 } else if (b->comp) {
1998 for (x = 0; x < w4; x++) {
1999 mv[x].ref[0] = b->ref[0];
2000 mv[x].ref[1] = b->ref[1];
2001 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2002 AV_COPY32(&mv[x].mv[1], &b->mv[3][1]);
2005 for (x = 0; x < w4; x++) {
2006 mv[x].ref[0] = b->ref[0];
2008 AV_COPY32(&mv[x].mv[0], &b->mv[3][0]);
2014 // FIXME merge cnt/eob arguments?
// Core coefficient decoder for one transform block: walks the scan order,
// decoding eob / zero / one symbols plus the higher-magnitude categories
// (with fixed extra-bit probabilities), dequantizing into coef[] and
// updating the adaptation counters cnt/eob.  `nnz` is the neighbour
// non-zero context, `nb` the per-position neighbour table, `qmul` the
// {dc, ac} dequant factors (halved for 32x32 via is_tx32x32).
// Returns a value derived from the decode (presumably the eob/coef count -
// the returning lines are in the missing ranges of this excerpt).
2015 static av_always_inline int
2016 decode_coeffs_b_generic(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2017 int is_tx32x32, unsigned (*cnt)[6][3],
2018 unsigned (*eob)[6][2], uint8_t (*p)[6][11],
2019 int nnz, const int16_t *scan, const int16_t (*nb)[2],
2020 const int16_t *band_counts, const int16_t *qmul)
2022 int i = 0, band = 0, band_left = band_counts[band];
2023 uint8_t *tp = p[0][nnz];
// per-position magnitude cache used to derive the next nnz context
2024 uint8_t cache[1024];
2029 val = vp56_rac_get_prob_branchy(c, tp[0]); // eob
2030 eob[band][nnz][val]++;
2035 if (!vp56_rac_get_prob_branchy(c, tp[1])) { // zero
2036 cnt[band][nnz][0]++;
2038 band_left = band_counts[++band];
// context for next position: average of the two neighbours' cached levels
2040 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2042 if (++i == n_coeffs)
2043 break; //invalid input; blocks should end with EOB
2048 if (!vp56_rac_get_prob_branchy(c, tp[2])) { // one
2049 cnt[band][nnz][1]++;
2053 // fill in p[3-10] (model fill) - only once per frame for each pos
2055 memcpy(&tp[3], vp9_model_pareto8[tp[2]], 8);
2057 cnt[band][nnz][2]++;
// magnitude categories: 2-4, cat1/2 (5-10), then cat3..cat6 below,
// each coded with fixed extra-bit probabilities as in libvpx
2058 if (!vp56_rac_get_prob_branchy(c, tp[3])) { // 2, 3, 4
2059 if (!vp56_rac_get_prob_branchy(c, tp[4])) {
2060 cache[rc] = val = 2;
2062 val = 3 + vp56_rac_get_prob(c, tp[5]);
2065 } else if (!vp56_rac_get_prob_branchy(c, tp[6])) { // cat1/2
2067 if (!vp56_rac_get_prob_branchy(c, tp[7])) {
2068 val = 5 + vp56_rac_get_prob(c, 159);
2070 val = 7 + (vp56_rac_get_prob(c, 165) << 1);
2071 val += vp56_rac_get_prob(c, 145);
2075 if (!vp56_rac_get_prob_branchy(c, tp[8])) {
2076 if (!vp56_rac_get_prob_branchy(c, tp[9])) {
2077 val = 11 + (vp56_rac_get_prob(c, 173) << 2);
2078 val += (vp56_rac_get_prob(c, 148) << 1);
2079 val += vp56_rac_get_prob(c, 140);
2081 val = 19 + (vp56_rac_get_prob(c, 176) << 3);
2082 val += (vp56_rac_get_prob(c, 155) << 2);
2083 val += (vp56_rac_get_prob(c, 140) << 1);
2084 val += vp56_rac_get_prob(c, 135);
2086 } else if (!vp56_rac_get_prob_branchy(c, tp[10])) {
2087 val = 35 + (vp56_rac_get_prob(c, 180) << 4);
2088 val += (vp56_rac_get_prob(c, 157) << 3);
2089 val += (vp56_rac_get_prob(c, 141) << 2);
2090 val += (vp56_rac_get_prob(c, 134) << 1);
2091 val += vp56_rac_get_prob(c, 130);
// cat6: 14 extra bits, MSB first
2093 val = 67 + (vp56_rac_get_prob(c, 254) << 13);
2094 val += (vp56_rac_get_prob(c, 254) << 12);
2095 val += (vp56_rac_get_prob(c, 254) << 11);
2096 val += (vp56_rac_get_prob(c, 252) << 10);
2097 val += (vp56_rac_get_prob(c, 249) << 9);
2098 val += (vp56_rac_get_prob(c, 243) << 8);
2099 val += (vp56_rac_get_prob(c, 230) << 7);
2100 val += (vp56_rac_get_prob(c, 196) << 6);
2101 val += (vp56_rac_get_prob(c, 177) << 5);
2102 val += (vp56_rac_get_prob(c, 153) << 4);
2103 val += (vp56_rac_get_prob(c, 140) << 3);
2104 val += (vp56_rac_get_prob(c, 133) << 2);
2105 val += (vp56_rac_get_prob(c, 130) << 1);
2106 val += vp56_rac_get_prob(c, 129);
2111 band_left = band_counts[++band];
// sign + dequant; 32x32 blocks use half the dequant value
2113 coef[rc] = ((vp8_rac_get(c) ? -val : val) * qmul[!!i]) / 2;
2115 coef[rc] = (vp8_rac_get(c) ? -val : val) * qmul[!!i];
2116 nnz = (1 + cache[nb[i][0]] + cache[nb[i][1]]) >> 1;
2118 } while (++i < n_coeffs);
// Non-32x32 entry point: forwards to decode_coeffs_b_generic() with
// is_tx32x32 = 0, so the compiler can specialize the dequant path.
2123 static int decode_coeffs_b(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2124 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2125 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2126 const int16_t (*nb)[2], const int16_t *band_counts,
2127 const int16_t *qmul)
2129 return decode_coeffs_b_generic(c, coef, n_coeffs, 0, cnt, eob, p,
2130 nnz, scan, nb, band_counts, qmul);
// 32x32 entry point: forwards to decode_coeffs_b_generic() with
// is_tx32x32 = 1 (dequantized values are halved in that path).
2133 static int decode_coeffs_b32(VP56RangeCoder *c, int16_t *coef, int n_coeffs,
2134 unsigned (*cnt)[6][3], unsigned (*eob)[6][2],
2135 uint8_t (*p)[6][11], int nnz, const int16_t *scan,
2136 const int16_t (*nb)[2], const int16_t *band_counts,
2137 const int16_t *qmul)
2139 return decode_coeffs_b_generic(c, coef, n_coeffs, 1, cnt, eob, p,
2140 nnz, scan, nb, band_counts, qmul);
</decode_coeffs_b_generic
// Decode all residual coefficients for the current block: the luma plane
// first (per-tx-unit, scan/neighbour tables chosen by the intra txfm
// type), then both chroma planes with the DCT_DCT tables.  The a/l
// non-zero context bytes are merged down (MERGE_CTX) before decoding at
// larger tx sizes and splatted back out (SPLAT_CTX) afterwards.
// NOTE(review): decimated excerpt - the switch statements driving the
// MERGE/DECODE/SPLAT macro invocations are partly in missing line ranges.
2143 static void decode_coeffs(AVCodecContext *ctx)
2145 VP9Context *s = ctx->priv_data;
2147 int row = s->row, col = s->col;
// probability/count tables for the luma plane, keyed by tx size and
// intra/inter
2148 uint8_t (*p)[6][11] = s->prob.coef[b->tx][0 /* y */][!b->intra];
2149 unsigned (*c)[6][3] = s->counts.coef[b->tx][0 /* y */][!b->intra];
2150 unsigned (*e)[6][2] = s->counts.eob[b->tx][0 /* y */][!b->intra];
// block extent in 4x4 units, clipped to the frame edge
2151 int w4 = bwh_tab[1][b->bs][0] << 1, h4 = bwh_tab[1][b->bs][1] << 1;
2152 int end_x = FFMIN(2 * (s->cols - col), w4);
2153 int end_y = FFMIN(2 * (s->rows - row), h4);
2154 int n, pl, x, y, res;
2155 int16_t (*qmul)[2] = s->segmentation.feat[b->seg_id].qmul;
// lossless mode selects the WHT scan tables (offset by 4)
2156 int tx = 4 * s->lossless + b->tx;
2157 const int16_t * const *yscans = vp9_scans[tx];
2158 const int16_t (* const *ynbs)[2] = vp9_scans_nb[tx];
2159 const int16_t *uvscan = vp9_scans[b->uvtx][DCT_DCT];
2160 const int16_t (*uvnb)[2] = vp9_scans_nb[b->uvtx][DCT_DCT];
2161 uint8_t *a = &s->above_y_nnz_ctx[col * 2];
2162 uint8_t *l = &s->left_y_nnz_ctx[(row & 7) << 1];
// coefficient band sizes per tx size (last entry = remainder of block)
2163 static const int16_t band_counts[4][8] = {
2164 { 1, 2, 3, 4, 3, 16 - 13 },
2165 { 1, 2, 3, 4, 11, 64 - 21 },
2166 { 1, 2, 3, 4, 11, 256 - 21 },
2167 { 1, 2, 3, 4, 11, 1024 - 21 },
2169 const int16_t *y_band_counts = band_counts[b->tx];
2170 const int16_t *uv_band_counts = band_counts[b->uvtx];
// collapse `step` context bytes into one boolean per tx unit
2172 #define MERGE(la, end, step, rd) \
2173 for (n = 0; n < end; n += step) \
2174 la[n] = !!rd(&la[n])
2175 #define MERGE_CTX(step, rd) \
2177 MERGE(l, end_y, step, rd); \
2178 MERGE(a, end_x, step, rd); \
// decode every luma tx unit; `v` picks the b32 variant for 32x32
2181 #define DECODE_Y_COEF_LOOP(step, mode_index, v) \
2182 for (n = 0, y = 0; y < end_y; y += step) { \
2183 for (x = 0; x < end_x; x += step, n += step * step) { \
2184 enum TxfmType txtp = vp9_intra_txfm_type[b->mode[mode_index]]; \
2185 res = decode_coeffs_b##v(&s->c, s->block + 16 * n, 16 * step * step, \
2186 c, e, p, a[x] + l[y], yscans[txtp], \
2187 ynbs[txtp], y_band_counts, qmul[0]); \
2188 a[x] = l[y] = !!res; \
2190 AV_WN16A(&s->eob[n], res); \
// re-expand the merged context bytes back to per-4x4 granularity
2197 #define SPLAT(la, end, step, cond) \
2199 for (n = 1; n < end; n += step) \
2200 la[n] = la[n - 1]; \
2201 } else if (step == 4) { \
2203 for (n = 0; n < end; n += step) \
2204 AV_WN32A(&la[n], la[n] * 0x01010101); \
2206 for (n = 0; n < end; n += step) \
2207 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 3)); \
2209 } else /* step == 8 */ { \
2211 if (HAVE_FAST_64BIT) { \
2212 for (n = 0; n < end; n += step) \
2213 AV_WN64A(&la[n], la[n] * 0x0101010101010101ULL); \
2215 for (n = 0; n < end; n += step) { \
2216 uint32_t v32 = la[n] * 0x01010101; \
2217 AV_WN32A(&la[n], v32); \
2218 AV_WN32A(&la[n + 4], v32); \
2222 for (n = 0; n < end; n += step) \
2223 memset(&la[n + 1], la[n], FFMIN(end - n - 1, 7)); \
2226 #define SPLAT_CTX(step) \
2228 SPLAT(a, end_x, step, end_x == w4); \
2229 SPLAT(l, end_y, step, end_y == h4); \
// luma: per-tx-size dispatch (TX_4X4 keys the scan by sub-block mode)
2235 DECODE_Y_COEF_LOOP(1, b->bs > BS_8x8 ? n : 0,);
2238 MERGE_CTX(2, AV_RN16A);
2239 DECODE_Y_COEF_LOOP(2, 0,);
2243 MERGE_CTX(4, AV_RN32A);
2244 DECODE_Y_COEF_LOOP(4, 0,);
2248 MERGE_CTX(8, AV_RN64A);
2249 DECODE_Y_COEF_LOOP(8, 0, 32);
// chroma: same structure, DCT_DCT scan, half-resolution extents
2254 #define DECODE_UV_COEF_LOOP(step) \
2255 for (n = 0, y = 0; y < end_y; y += step) { \
2256 for (x = 0; x < end_x; x += step, n += step * step) { \
2257 res = decode_coeffs_b(&s->c, s->uvblock[pl] + 16 * n, \
2258 16 * step * step, c, e, p, a[x] + l[y], \
2259 uvscan, uvnb, uv_band_counts, qmul[1]); \
2260 a[x] = l[y] = !!res; \
2262 AV_WN16A(&s->uveob[pl][n], res); \
2264 s->uveob[pl][n] = res; \
2269 p = s->prob.coef[b->uvtx][1 /* uv */][!b->intra];
2270 c = s->counts.coef[b->uvtx][1 /* uv */][!b->intra];
2271 e = s->counts.eob[b->uvtx][1 /* uv */][!b->intra];
2276 for (pl = 0; pl < 2; pl++) {
2277 a = &s->above_uv_nnz_ctx[pl][col];
2278 l = &s->left_uv_nnz_ctx[pl][row & 7];
2281 DECODE_UV_COEF_LOOP(1);
2284 MERGE_CTX(2, AV_RN16A);
2285 DECODE_UV_COEF_LOOP(2);
2289 MERGE_CTX(4, AV_RN32A);
2290 DECODE_UV_COEF_LOOP(4);
2294 MERGE_CTX(8, AV_RN64A);
2295 // a 64x64 (max) uv block can ever only contain 1 tx32x32 block
2296 // so there is no need to loop
2297 res = decode_coeffs_b32(&s->c, s->uvblock[pl],
2298 1024, c, e, p, a[0] + l[0],
2299 uvscan, uvnb, uv_band_counts, qmul[1]);
2300 a[0] = l[0] = !!res;
2301 AV_WN16A(&s->uveob[pl][0], res);
// Prepare the edge-pixel arrays used as intra-prediction input for one
// transform block: the "top" row (written through *a) and the "left"
// column (written into l). When a neighbour the mode needs (top, left,
// topleft, topright) is unavailable, the mode is remapped to a fixed-DC
// variant via mode_conv[], or the missing pixels are synthesized by
// replication / constant fill (127 or 129). Returns the (possibly
// remapped) prediction mode to actually use.
// NOTE(review): p selects the plane (0 = luma, >0 = chroma) — inferred
// from the `<< !p` scaling below; confirm against the full signature.
2308 static av_always_inline int check_intra_mode(VP9Context *s, int mode, uint8_t **a,
2309 uint8_t *dst_edge, ptrdiff_t stride_edge,
2310 uint8_t *dst_inner, ptrdiff_t stride_inner,
2311 uint8_t *l, int col, int x, int w,
2312 int row, int y, enum TxfmMode tx,
    // Neighbour availability: top exists unless we are in the very first
    // sub-row of the frame; left exists unless we sit on the tile's left
    // column edge (tiles cannot predict across their left boundary).
2315     int have_top = row > 0 || y > 0;
2316     int have_left = col > s->tiling.tile_col_start || x > 0;
2317     int have_right = x < w - 1;
    // Remap table: for each of the 10 coded intra modes, pick the mode to
    // actually run given which neighbours exist. Missing edges map to the
    // constant-value DC modes (DC_127/DC_128/DC_129, LEFT/TOP_DC).
2318     static const uint8_t mode_conv[10][2 /* have_left */][2 /* have_top */] = {
2319         [VERT_PRED] = { { DC_127_PRED, VERT_PRED },
2320 { DC_127_PRED, VERT_PRED } },
2321 [HOR_PRED] = { { DC_129_PRED, DC_129_PRED },
2322 { HOR_PRED, HOR_PRED } },
2323 [DC_PRED] = { { DC_128_PRED, TOP_DC_PRED },
2324 { LEFT_DC_PRED, DC_PRED } },
2325 [DIAG_DOWN_LEFT_PRED] = { { DC_127_PRED, DIAG_DOWN_LEFT_PRED },
2326 { DC_127_PRED, DIAG_DOWN_LEFT_PRED } },
2327 [DIAG_DOWN_RIGHT_PRED] = { { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED },
2328 { DIAG_DOWN_RIGHT_PRED, DIAG_DOWN_RIGHT_PRED } },
2329 [VERT_RIGHT_PRED] = { { VERT_RIGHT_PRED, VERT_RIGHT_PRED },
2330 { VERT_RIGHT_PRED, VERT_RIGHT_PRED } },
2331 [HOR_DOWN_PRED] = { { HOR_DOWN_PRED, HOR_DOWN_PRED },
2332 { HOR_DOWN_PRED, HOR_DOWN_PRED } },
2333 [VERT_LEFT_PRED] = { { DC_127_PRED, VERT_LEFT_PRED },
2334 { DC_127_PRED, VERT_LEFT_PRED } },
2335 [HOR_UP_PRED] = { { DC_129_PRED, DC_129_PRED },
2336 { HOR_UP_PRED, HOR_UP_PRED } },
2337 [TM_VP8_PRED] = { { DC_129_PRED, VERT_PRED },
2338 { HOR_PRED, TM_VP8_PRED } },
    // Per-mode bitfield describing which edge arrays the predictor reads,
    // so we only build (and pad) the edges that are actually needed.
2340     static const struct {
2341 uint8_t needs_left:1;
2342 uint8_t needs_top:1;
2343 uint8_t needs_topleft:1;
2344 uint8_t needs_topright:1;
2345 } edges[N_INTRA_PRED_MODES] = {
2346 [VERT_PRED] = { .needs_top = 1 },
2347 [HOR_PRED] = { .needs_left = 1 },
2348 [DC_PRED] = { .needs_top = 1, .needs_left = 1 },
2349 [DIAG_DOWN_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2350 [DIAG_DOWN_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2351 [VERT_RIGHT_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2352 [HOR_DOWN_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2353 [VERT_LEFT_PRED] = { .needs_top = 1, .needs_topright = 1 },
2354 [HOR_UP_PRED] = { .needs_left = 1 },
2355 [TM_VP8_PRED] = { .needs_left = 1, .needs_top = 1, .needs_topleft = 1 },
2356 [LEFT_DC_PRED] = { .needs_left = 1 },
2357 [TOP_DC_PRED] = { .needs_top = 1 },
2358 [DC_128_PRED] = { 0 },
2359 [DC_127_PRED] = { 0 },
2360 [DC_129_PRED] = { 0 }
2363     av_assert2(mode >= 0 && mode < 10);
2364     mode = mode_conv[mode][have_left][have_top];
    // --- top edge: copy n_px_need pixels into *a, padding on the right by
    // replicating the last valid pixel, or constant-fill 127 when no top
    // row exists at all.
2365     if (edges[mode].needs_top) {
2366 uint8_t *top, *topleft;
2367 int n_px_need = 4 << tx, n_px_have = (((s->cols - col) << !p) - x) * 4;
2368 int n_px_need_tr = 0;
2370 if (tx == TX_4X4 && edges[mode].needs_topright && have_right)
2373 // if top of sb64-row, use s->intra_pred_data[] instead of
2374 // dst[-stride] for intra prediction (it contains pre- instead of
2375 // post-loopfilter data)
2377 top = !(row & 7) && !y ?
2378 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2379 y == 0 ? &dst_edge[-stride_edge] : &dst_inner[-stride_inner];
2381 topleft = !(row & 7) && !y ?
2382 s->intra_pred_data[p] + col * (8 >> !!p) + x * 4 :
2383 y == 0 || x == 0 ? &dst_edge[-stride_edge] :
2384 &dst_inner[-stride_inner];
2388 (!edges[mode].needs_topleft || (have_left && top == topleft)) &&
2389 (tx != TX_4X4 || !edges[mode].needs_topright || have_right) &&
2390 n_px_need + n_px_need_tr <= n_px_have) {
2394 if (n_px_need <= n_px_have) {
2395 memcpy(*a, top, n_px_need);
2397 memcpy(*a, top, n_px_have);
    // pad right side of the top edge by repeating the last valid pixel
2398 memset(&(*a)[n_px_have], (*a)[n_px_have - 1],
2399 n_px_need - n_px_have);
2402 memset(*a, 127, n_px_need);
2404 if (edges[mode].needs_topleft) {
2405 if (have_left && have_top) {
2406 (*a)[-1] = topleft[-1];
2408 (*a)[-1] = have_top ? 129 : 127;
    // topright (only meaningful for 4x4 transforms): copy if present,
    // else replicate the rightmost top pixel
2411 if (tx == TX_4X4 && edges[mode].needs_topright) {
2412 if (have_top && have_right &&
2413 n_px_need + n_px_need_tr <= n_px_have) {
2414 memcpy(&(*a)[4], &top[4], 4);
2416 memset(&(*a)[4], (*a)[3], 4);
    // --- left edge: gather the column dst[-1] (stored bottom-up in l[]),
    // padding the unavailable bottom part, or constant-fill 129 when no
    // left column exists.
2421     if (edges[mode].needs_left) {
2423 int n_px_need = 4 << tx, i, n_px_have = (((s->rows - row) << !p) - y) * 4;
2424 uint8_t *dst = x == 0 ? dst_edge : dst_inner;
2425 ptrdiff_t stride = x == 0 ? stride_edge : stride_inner;
2427 if (n_px_need <= n_px_have) {
2428 for (i = 0; i < n_px_need; i++)
2429 l[n_px_need - 1 - i] = dst[i * stride - 1];
2431 for (i = 0; i < n_px_have; i++)
2432 l[n_px_need - 1 - i] = dst[i * stride - 1];
2433 memset(l, l[n_px_need - n_px_have], n_px_need - n_px_have);
2436 memset(l, 129, 4 << tx);
// Reconstruct one intra-coded block: for the luma plane and then both
// chroma planes, iterate over the block in transform-size steps, build the
// prediction edges (check_intra_mode), run the DSP intra predictor, and add
// the inverse-transformed residual (itxfm_add) where the block is not
// skipped. y_off/uv_off are byte offsets of this block into the current
// frame's planes; prediction edges are read from the frame buffer (ptr_r)
// while output is written to s->dst[] (which may be a temporary buffer).
2443 static void intra_recon(AVCodecContext *ctx, ptrdiff_t y_off, ptrdiff_t uv_off)
2445     VP9Context *s = ctx->priv_data;
2447     int row = s->row, col = s->col;
    // w4/h4: block size in 4px units; step1d: transform size in 4px units;
    // step: number of 4x4 units covered by one transform block
2448     int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2449     int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
    // clip iteration to the visible part of the frame
2450     int end_x = FFMIN(2 * (s->cols - col), w4);
2451     int end_y = FFMIN(2 * (s->rows - row), h4);
    // lossless selects the WHT variants at itxfm_add[4..]
2452     int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2453     int uvstep1d = 1 << b->uvtx, p;
2454     uint8_t *dst = s->dst[0], *dst_r = s->frames[CUR_FRAME].tf.f->data[0] + y_off;
    // scratch buffers for the top (a_buf) and left (l) prediction edges
2455     LOCAL_ALIGNED_32(uint8_t, a_buf, [64]);
2456     LOCAL_ALIGNED_32(uint8_t, l, [32]);
    // ---- luma ----
2458     for (n = 0, y = 0; y < end_y; y += step1d) {
2459 uint8_t *ptr = dst, *ptr_r = dst_r;
2460 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d,
2461 ptr_r += 4 * step1d, n += step) {
    // sub-8x8 blocks with 4x4 tx carry one mode per 4x4 sub-block
2462 int mode = b->mode[b->bs > BS_8x8 && b->tx == TX_4X4 ?
2464 uint8_t *a = &a_buf[32];
2465 enum TxfmType txtp = vp9_intra_txfm_type[mode];
    // eob==0 for skipped blocks -> no residual added
2466 int eob = b->skip ? 0 : b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2468 mode = check_intra_mode(s, mode, &a, ptr_r,
2469 s->frames[CUR_FRAME].tf.f->linesize[0],
2470 ptr, s->y_stride, l,
2471 col, x, w4, row, y, b->tx, 0);
2472 s->dsp.intra_pred[b->tx][mode](ptr, s->y_stride, l, a);
2474 s->dsp.itxfm_add[tx][txtp](ptr, s->y_stride,
2475 s->block + 16 * n, eob);
2477 dst_r += 4 * step1d * s->frames[CUR_FRAME].tf.f->linesize[0];
2478 dst += 4 * step1d * s->y_stride;
    // ---- chroma: same loop structure per plane, with the single uvmode
    // and DCT_DCT residual transform ----
2485     step = 1 << (b->uvtx * 2);
2486     for (p = 0; p < 2; p++) {
2487 dst = s->dst[1 + p];
2488 dst_r = s->frames[CUR_FRAME].tf.f->data[1 + p] + uv_off;
2489 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2490 uint8_t *ptr = dst, *ptr_r = dst_r;
2491 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d,
2492 ptr_r += 4 * uvstep1d, n += step) {
2493 int mode = b->uvmode;
2494 uint8_t *a = &a_buf[16];
2495 int eob = b->skip ? 0 : b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2497 mode = check_intra_mode(s, mode, &a, ptr_r,
2498 s->frames[CUR_FRAME].tf.f->linesize[1],
2499 ptr, s->uv_stride, l,
2500 col, x, w4, row, y, b->uvtx, p + 1);
2501 s->dsp.intra_pred[b->uvtx][mode](ptr, s->uv_stride, l, a);
2503 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2504 s->uvblock[p] + 16 * n, eob);
2506 dst_r += 4 * uvstep1d * s->frames[CUR_FRAME].tf.f->linesize[1];
2507 dst += 4 * uvstep1d * s->uv_stride;
// Motion compensation of one luma block from one reference direction.
// (y,x) is the block position in the reference in pixels, mv the motion
// vector in 1/8-pel units (mx/my are passed to the MC function << 1, i.e.
// as 1/16-pel-style fractional indices). For frame threading it waits
// until the reference frame has decoded enough rows, and it falls back to
// emulated_edge_mc when the subpel filter footprint (3 pixels before,
// 4 after, when the corresponding mv component is fractional) would read
// outside the reference picture.
2512 static av_always_inline void mc_luma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2513 uint8_t *dst, ptrdiff_t dst_stride,
2514 const uint8_t *ref, ptrdiff_t ref_stride,
2515 ThreadFrame *ref_frame,
2516 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2517 int bw, int bh, int w, int h)
2519     int mx = mv->x, my = mv->y, th;
2523     ref += y * ref_stride + x;
2526     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2527     // we use +7 because the last 7 pixels of each sbrow can be changed in
2528     // the longest loopfilter of the next sbrow
    // progress is tracked in 64-px superblock rows (>> 6)
2529     th = (y + bh + 4 * !!my + 7) >> 6;
2530     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    // !!mx / !!my: the 3/4-pixel margins are only needed on axes with a
    // fractional mv component
2531     if (x < !!mx * 3 || y < !!my * 3 ||
2532 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2533 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2534 ref - !!my * 3 * ref_stride - !!mx * 3,
2536 bw + !!mx * 7, bh + !!my * 7,
2537 x - !!mx * 3, y - !!my * 3, w, h);
    // 80 is the edge_emu_buffer stride used for the padded copy
2538 ref = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2541     mc[!!mx][!!my](dst, dst_stride, ref, ref_stride, bh, mx << 1, my << 1);
// Chroma counterpart of mc_luma_dir: motion-compensates both the U and V
// planes of one block from one reference direction. mx/my are used
// directly (chroma subsampling halves the coordinate space, so the mv is
// already at the right fractional precision), and the progress threshold
// uses >> 5 since y is in chroma rows (half the luma resolution of the
// 64-px sbrow tracked by ff_thread_report_progress).
2544 static av_always_inline void mc_chroma_dir(VP9Context *s, vp9_mc_func (*mc)[2],
2545 uint8_t *dst_u, uint8_t *dst_v,
2546 ptrdiff_t dst_stride,
2547 const uint8_t *ref_u, ptrdiff_t src_stride_u,
2548 const uint8_t *ref_v, ptrdiff_t src_stride_v,
2549 ThreadFrame *ref_frame,
2550 ptrdiff_t y, ptrdiff_t x, const VP56mv *mv,
2551 int bw, int bh, int w, int h)
2553     int mx = mv->x, my = mv->y, th;
2557     ref_u += y * src_stride_u + x;
2558     ref_v += y * src_stride_v + x;
2561     // FIXME bilinear filter only needs 0/1 pixels, not 3/4
2562     // we use +7 because the last 7 pixels of each sbrow can be changed in
2563     // the longest loopfilter of the next sbrow
2564     th = (y + bh + 4 * !!my + 7) >> 5;
2565     ff_thread_await_progress(ref_frame, FFMAX(th, 0), 0);
    // out-of-picture filter footprint -> pad via emulated_edge_mc, done
    // separately for the U and V planes (shared edge_emu_buffer, stride 80)
2566     if (x < !!mx * 3 || y < !!my * 3 ||
2567 x + !!mx * 4 > w - bw || y + !!my * 4 > h - bh) {
2568 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2569 ref_u - !!my * 3 * src_stride_u - !!mx * 3,
2571 bw + !!mx * 7, bh + !!my * 7,
2572 x - !!mx * 3, y - !!my * 3, w, h);
2573 ref_u = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2574 mc[!!mx][!!my](dst_u, dst_stride, ref_u, 80, bh, mx, my);
2576 s->vdsp.emulated_edge_mc(s->edge_emu_buffer,
2577 ref_v - !!my * 3 * src_stride_v - !!mx * 3,
2579 bw + !!mx * 7, bh + !!my * 7,
2580 x - !!mx * 3, y - !!my * 3, w, h);
2581 ref_v = s->edge_emu_buffer + !!my * 3 * 80 + !!mx * 3;
2582 mc[!!mx][!!my](dst_v, dst_stride, ref_v, 80, bh, mx, my);
2584 mc[!!mx][!!my](dst_u, dst_stride, ref_u, src_stride_u, bh, mx, my);
2585 mc[!!mx][!!my](dst_v, dst_stride, ref_v, src_stride_v, bh, mx, my);
// Reconstruct one inter-coded block: luma motion compensation (split into
// 8x4 / 4x8 / 4x4 sub-block calls with per-sub-block mvs when bs > BS_8x8,
// a single call otherwise), chroma MC with an averaged mv for sub-8x8
// blocks, optionally repeated for the second reference (compound
// prediction, index [1]), and finally the residual inverse transforms
// added on top when the block is not skipped.
2589 static void inter_recon(AVCodecContext *ctx)
    // log2 block width used to index s->dsp.mc[]; [0] = luma, [1] = chroma
2591     static const uint8_t bwlog_tab[2][N_BS_SIZES] = {
2592 { 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4 },
2593 { 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 4 },
2595     VP9Context *s = ctx->priv_data;
2597     int row = s->row, col = s->col;
2598     ThreadFrame *tref1 = &s->refs[s->refidx[b->ref[0]]], *tref2;
2599     AVFrame *ref1 = tref1->f, *ref2;
2600     int w1 = ref1->width, h1 = ref1->height, w2, h2;
2601     ptrdiff_t ls_y = s->y_stride, ls_uv = s->uv_stride;
2604     tref2 = &s->refs[s->refidx[b->ref[1]]];
    // ---- luma MC ----
2611     if (b->bs > BS_8x8) {
    // sub-8x8: one MC call per sub-block, each with its own mv
2612 if (b->bs == BS_8x4) {
2613 mc_luma_dir(s, s->dsp.mc[3][b->filter][0], s->dst[0], ls_y,
2614 ref1->data[0], ref1->linesize[0], tref1,
2615 row << 3, col << 3, &b->mv[0][0], 8, 4, w1, h1);
2616 mc_luma_dir(s, s->dsp.mc[3][b->filter][0],
2617 s->dst[0] + 4 * ls_y, ls_y,
2618 ref1->data[0], ref1->linesize[0], tref1,
2619 (row << 3) + 4, col << 3, &b->mv[2][0], 8, 4, w1, h1);
    // second reference (compound)
2622 mc_luma_dir(s, s->dsp.mc[3][b->filter][1], s->dst[0], ls_y,
2623 ref2->data[0], ref2->linesize[0], tref2,
2624 row << 3, col << 3, &b->mv[0][1], 8, 4, w2, h2);
2625 mc_luma_dir(s, s->dsp.mc[3][b->filter][1],
2626 s->dst[0] + 4 * ls_y, ls_y,
2627 ref2->data[0], ref2->linesize[0], tref2,
2628 (row << 3) + 4, col << 3, &b->mv[2][1], 8, 4, w2, h2);
2630 } else if (b->bs == BS_4x8) {
2631 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2632 ref1->data[0], ref1->linesize[0], tref1,
2633 row << 3, col << 3, &b->mv[0][0], 4, 8, w1, h1);
2634 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2635 ref1->data[0], ref1->linesize[0], tref1,
2636 row << 3, (col << 3) + 4, &b->mv[1][0], 4, 8, w1, h1);
2639 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2640 ref2->data[0], ref2->linesize[0], tref2,
2641 row << 3, col << 3, &b->mv[0][1], 4, 8, w2, h2);
2642 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2643 ref2->data[0], ref2->linesize[0], tref2,
2644 row << 3, (col << 3) + 4, &b->mv[1][1], 4, 8, w2, h2);
2647 av_assert2(b->bs == BS_4x4);
2649 // FIXME if two horizontally adjacent blocks have the same MV,
2650 // do a w8 instead of a w4 call
2651 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0], ls_y,
2652 ref1->data[0], ref1->linesize[0], tref1,
2653 row << 3, col << 3, &b->mv[0][0], 4, 4, w1, h1);
2654 mc_luma_dir(s, s->dsp.mc[4][b->filter][0], s->dst[0] + 4, ls_y,
2655 ref1->data[0], ref1->linesize[0], tref1,
2656 row << 3, (col << 3) + 4, &b->mv[1][0], 4, 4, w1, h1);
2657 mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2658 s->dst[0] + 4 * ls_y, ls_y,
2659 ref1->data[0], ref1->linesize[0], tref1,
2660 (row << 3) + 4, col << 3, &b->mv[2][0], 4, 4, w1, h1);
2661 mc_luma_dir(s, s->dsp.mc[4][b->filter][0],
2662 s->dst[0] + 4 * ls_y + 4, ls_y,
2663 ref1->data[0], ref1->linesize[0], tref1,
2664 (row << 3) + 4, (col << 3) + 4, &b->mv[3][0], 4, 4, w1, h1);
2667 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0], ls_y,
2668 ref2->data[0], ref2->linesize[0], tref2,
2669 row << 3, col << 3, &b->mv[0][1], 4, 4, w2, h2);
2670 mc_luma_dir(s, s->dsp.mc[4][b->filter][1], s->dst[0] + 4, ls_y,
2671 ref2->data[0], ref2->linesize[0], tref2,
2672 row << 3, (col << 3) + 4, &b->mv[1][1], 4, 4, w2, h2);
2673 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2674 s->dst[0] + 4 * ls_y, ls_y,
2675 ref2->data[0], ref2->linesize[0], tref2,
2676 (row << 3) + 4, col << 3, &b->mv[2][1], 4, 4, w2, h2);
2677 mc_luma_dir(s, s->dsp.mc[4][b->filter][1],
2678 s->dst[0] + 4 * ls_y + 4, ls_y,
2679 ref2->data[0], ref2->linesize[0], tref2,
2680 (row << 3) + 4, (col << 3) + 4, &b->mv[3][1], 4, 4, w2, h2);
    // 8x8 or larger: one MC call per reference
2684 int bwl = bwlog_tab[0][b->bs];
2685 int bw = bwh_tab[0][b->bs][0] * 4, bh = bwh_tab[0][b->bs][1] * 4;
2687 mc_luma_dir(s, s->dsp.mc[bwl][b->filter][0], s->dst[0], ls_y,
2688 ref1->data[0], ref1->linesize[0], tref1,
2689 row << 3, col << 3, &b->mv[0][0],bw, bh, w1, h1);
2692 mc_luma_dir(s, s->dsp.mc[bwl][b->filter][1], s->dst[0], ls_y,
2693 ref2->data[0], ref2->linesize[0], tref2,
2694 row << 3, col << 3, &b->mv[0][1], bw, bh, w2, h2);
    // ---- chroma MC: for sub-8x8 blocks, use the rounded average of the
    // four sub-block mvs ----
2699 int bwl = bwlog_tab[1][b->bs];
2700 int bw = bwh_tab[1][b->bs][0] * 4, bh = bwh_tab[1][b->bs][1] * 4;
2709 if (b->bs > BS_8x8) {
2710 mvuv.x = ROUNDED_DIV(b->mv[0][0].x + b->mv[1][0].x + b->mv[2][0].x + b->mv[3][0].x, 4);
2711 mvuv.y = ROUNDED_DIV(b->mv[0][0].y + b->mv[1][0].y + b->mv[2][0].y + b->mv[3][0].y, 4);
2716 mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][0],
2717 s->dst[1], s->dst[2], ls_uv,
2718 ref1->data[1], ref1->linesize[1],
2719 ref1->data[2], ref1->linesize[2], tref1,
2720 row << 2, col << 2, &mvuv, bw, bh, w1, h1);
2723 if (b->bs > BS_8x8) {
2724 mvuv.x = ROUNDED_DIV(b->mv[0][1].x + b->mv[1][1].x + b->mv[2][1].x + b->mv[3][1].x, 4);
2725 mvuv.y = ROUNDED_DIV(b->mv[0][1].y + b->mv[1][1].y + b->mv[2][1].y + b->mv[3][1].y, 4);
2729 mc_chroma_dir(s, s->dsp.mc[bwl][b->filter][1],
2730 s->dst[1], s->dst[2], ls_uv,
2731 ref2->data[1], ref2->linesize[1],
2732 ref2->data[2], ref2->linesize[2], tref2,
2733 row << 2, col << 2, &mvuv, bw, bh, w2, h2);
    // ---- residual: add inverse-transformed coefficients on top of the
    // prediction (same loop layout as intra_recon, but prediction is
    // already done, so only itxfm_add is called) ----
2738 /* mostly copied intra_reconn() */
2740 int w4 = bwh_tab[1][b->bs][0] << 1, step1d = 1 << b->tx, n;
2741 int h4 = bwh_tab[1][b->bs][1] << 1, x, y, step = 1 << (b->tx * 2);
2742 int end_x = FFMIN(2 * (s->cols - col), w4);
2743 int end_y = FFMIN(2 * (s->rows - row), h4);
2744 int tx = 4 * s->lossless + b->tx, uvtx = b->uvtx + 4 * s->lossless;
2745 int uvstep1d = 1 << b->uvtx, p;
2746 uint8_t *dst = s->dst[0];
2749 for (n = 0, y = 0; y < end_y; y += step1d) {
2751 for (x = 0; x < end_x; x += step1d, ptr += 4 * step1d, n += step) {
2752 int eob = b->tx > TX_8X8 ? AV_RN16A(&s->eob[n]) : s->eob[n];
2755 s->dsp.itxfm_add[tx][DCT_DCT](ptr, s->y_stride,
2756 s->block + 16 * n, eob);
2758 dst += 4 * s->y_stride * step1d;
2764 step = 1 << (b->uvtx * 2);
2765 for (p = 0; p < 2; p++) {
2766 dst = s->dst[p + 1];
2767 for (n = 0, y = 0; y < end_y; y += uvstep1d) {
2769 for (x = 0; x < end_x; x += uvstep1d, ptr += 4 * uvstep1d, n += step) {
2770 int eob = b->uvtx > TX_8X8 ? AV_RN16A(&s->uveob[p][n]) : s->uveob[p][n];
2773 s->dsp.itxfm_add[uvtx][DCT_DCT](ptr, s->uv_stride,
2774 s->uvblock[p] + 16 * n, eob);
2776 dst += 4 * uvstep1d * s->uv_stride;
// Record which 4/8/16-pixel edges of the current block need loopfiltering
// by OR-ing bits into lflvl->mask[is_uv][col/row][y][filter-size]. The bit
// position encodes the column within the 64x64 superblock; masks are built
// separately for column edges (vertical filtering) and row edges
// (horizontal filtering), and separately per transform size so the filter
// width can be chosen later. skip_inter suppresses the interior (non-block-
// boundary) edges of skipped inter blocks.
2782 static av_always_inline void mask_edges(struct VP9Filter *lflvl, int is_uv,
2783 int row_and_7, int col_and_7,
2784 int w, int h, int col_end, int row_end,
2785 enum TxfmMode tx, int skip_inter)
2787     // FIXME I'm pretty sure all loops can be replaced by a single LUT if
2788     // we make VP9Filter.mask uint64_t (i.e. row/col all single variable)
2789     // and make the LUT 5-indexed (bl, bp, is_uv, tx and row/col), and then
2790     // use row_and_7/col_and_7 as shifts (1*col_and_7+8*row_and_7)
2792     // the intended behaviour of the vp9 loopfilter is to work on 8-pixel
2793     // edges. This means that for UV, we work on two subsampled blocks at
2794     // a time, and we only use the topleft block's mode information to set
2795     // things like block strength. Thus, for any block size smaller than
2796     // 16x16, ignore the odd portion of the block.
2797     if (tx == TX_4X4 && is_uv) {
    // --- 4x4 transforms on a non-skipped block: every interior 4px edge
    // is filtered; m_col is the bit-run covering the block's columns.
2812     if (tx == TX_4X4 && !skip_inter) {
2813 int t = 1 << col_and_7, m_col = (t << w) - t, y;
2814 int m_col_odd = (t << (w - 1)) - t;
2816 // on 32-px edges, use the 8-px wide loopfilter; else, use 4-px wide
2818 int m_row_8 = m_col & 0x01, m_row_4 = m_col - m_row_8;
2820 for (y = row_and_7; y < h + row_and_7; y++) {
2821 int col_mask_id = 2 - !(y & 7);
2823 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2824 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2825 // for odd lines, if the odd col is not being filtered,
2826 // skip odd row also:
2833 // if a/c are even row/col and b/d are odd, and d is skipped,
2834 // e.g. right edge of size-66x66.webm, then skip b also (bug)
2835 if ((col_end & 1) && (y & 1)) {
2836 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col_odd;
2838 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col;
2842 int m_row_8 = m_col & 0x11, m_row_4 = m_col - m_row_8;
2844 for (y = row_and_7; y < h + row_and_7; y++) {
2845 int col_mask_id = 2 - !(y & 3);
2847 lflvl->mask[is_uv][0][y][1] |= m_row_8; // row edge
2848 lflvl->mask[is_uv][0][y][2] |= m_row_4;
2849 lflvl->mask[is_uv][1][y][col_mask_id] |= m_col; // col edge
2850 lflvl->mask[is_uv][0][y][3] |= m_col;
2851 lflvl->mask[is_uv][1][y][3] |= m_col;
    // --- larger transforms (and skipped blocks): only the block/transform
    // boundaries get edges; mask_id picks the filter-size bucket.
2855     int y, t = 1 << col_and_7, m_col = (t << w) - t;
2858 int mask_id = (tx == TX_8X8);
2859 int l2 = tx + is_uv - 1, step1d = 1 << l2;
    // per-l2 pattern of which columns carry a transform edge
2860 static const unsigned masks[4] = { 0xff, 0x55, 0x11, 0x01 };
2861 int m_row = m_col & masks[l2];
2863 // at odd UV col/row edges tx16/tx32 loopfilter edges, force
2864 // 8wd loopfilter to prevent going off the visible edge.
2865 if (is_uv && tx > TX_8X8 && (w ^ (w - 1)) == 1) {
2866 int m_row_16 = ((t << (w - 1)) - t) & masks[l2];
2867 int m_row_8 = m_row - m_row_16;
2869 for (y = row_and_7; y < h + row_and_7; y++) {
2870 lflvl->mask[is_uv][0][y][0] |= m_row_16;
2871 lflvl->mask[is_uv][0][y][1] |= m_row_8;
2874 for (y = row_and_7; y < h + row_and_7; y++)
2875 lflvl->mask[is_uv][0][y][mask_id] |= m_row;
2878 if (is_uv && tx > TX_8X8 && (h ^ (h - 1)) == 1) {
2879 for (y = row_and_7; y < h + row_and_7 - 1; y += step1d)
2880 lflvl->mask[is_uv][1][y][0] |= m_col;
2881 if (y - row_and_7 == h - 1)
2882 lflvl->mask[is_uv][1][y][1] |= m_col;
2884 for (y = row_and_7; y < h + row_and_7; y += step1d)
2885 lflvl->mask[is_uv][1][y][mask_id] |= m_col;
2887 } else if (tx != TX_4X4) {
    // skipped inter block with tx >= 8x8: only outer block boundaries
2890 mask_id = (tx == TX_8X8) || (is_uv && h == 1);
2891 lflvl->mask[is_uv][1][row_and_7][mask_id] |= m_col;
2892 mask_id = (tx == TX_8X8) || (is_uv && w == 1);
2893 for (y = row_and_7; y < h + row_and_7; y++)
2894 lflvl->mask[is_uv][0][y][mask_id] |= t;
    // skipped inter block with tx 4x4: boundary-only edges, split into
    // 8px-aligned (t8) and 4px (t4) buckets
2896 int t8 = t & 0x01, t4 = t - t8;
2898 for (y = row_and_7; y < h + row_and_7; y++) {
2899 lflvl->mask[is_uv][0][y][2] |= t4;
2900 lflvl->mask[is_uv][0][y][1] |= t8;
2902 lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 7)] |= m_col;
2904 int t8 = t & 0x11, t4 = t - t8;
2906 for (y = row_and_7; y < h + row_and_7; y++) {
2907 lflvl->mask[is_uv][0][y][2] |= t4;
2908 lflvl->mask[is_uv][0][y][1] |= t8;
2910 lflvl->mask[is_uv][1][row_and_7][2 - !(row_and_7 & 3)] |= m_col;
// Decode and reconstruct one coding block at (row, col) with size
// bs = bl*3+bp: set up mv clamping bounds, derive the chroma transform
// size, reset skipped-block contexts, reconstruct (intra or inter) into
// either the frame buffer or a temporary buffer when the block overhangs
// the picture/stride (emu), copy temporaries back, and finally compute the
// loopfilter level and edge masks for this block.
2915 static void decode_b(AVCodecContext *ctx, int row, int col,
2916 struct VP9Filter *lflvl, ptrdiff_t yoff, ptrdiff_t uvoff,
2917 enum BlockLevel bl, enum BlockPartition bp)
2919     VP9Context *s = ctx->priv_data;
2921     enum BlockSize bs = bl * 3 + bp;
2922     int w4 = bwh_tab[1][bs][0], h4 = bwh_tab[1][bs][1], lvl;
2924     AVFrame *f = s->frames[CUR_FRAME].tf.f;
    // clamp motion vectors to 128px (1/8-pel -> *64) beyond the frame edge
2930     s->min_mv.x = -(128 + col * 64);
2931     s->min_mv.y = -(128 + row * 64);
2932     s->max_mv.x = 128 + (s->cols - col - w4) * 64;
2933     s->max_mv.y = 128 + (s->rows - row - h4) * 64;
    // chroma tx is one step smaller when the luma tx spans the full
    // (subsampled) block dimension
2939     b->uvtx = b->tx - (w4 * 2 == (1 << b->tx) || h4 * 2 == (1 << b->tx));
    // Zero the nnz (non-zero-coefficient) contexts for skipped blocks,
    // using the widest aligned store that covers the block.
2946 #define SPLAT_ZERO_CTX(v, n) \
2948     case 1:  v = 0;          break; \
2949     case 2:  AV_ZERO16(&v);  break; \
2950     case 4:  AV_ZERO32(&v);  break; \
2951     case 8:  AV_ZERO64(&v);  break; \
2952     case 16: AV_ZERO128(&v); break; \
2954 #define SPLAT_ZERO_YUV(dir, var, off, n) \
2956     SPLAT_ZERO_CTX(s->dir##_y_##var[off * 2], n * 2); \
2957     SPLAT_ZERO_CTX(s->dir##_uv_##var[0][off], n); \
2958     SPLAT_ZERO_CTX(s->dir##_uv_##var[1][off], n); \
2962     case 1: SPLAT_ZERO_YUV(above, nnz_ctx, col, 1); break;
2963     case 2: SPLAT_ZERO_YUV(above, nnz_ctx, col, 2); break;
2964     case 4: SPLAT_ZERO_YUV(above, nnz_ctx, col, 4); break;
2965     case 8: SPLAT_ZERO_YUV(above, nnz_ctx, col, 8); break;
2968     case 1: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 1); break;
2969     case 2: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 2); break;
2970     case 4: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 4); break;
2971     case 8: SPLAT_ZERO_YUV(left, nnz_ctx, row7, 8); break;
    // advance the per-block coefficient/eob buffers past this block
2976     s->block += w4 * h4 * 64;
2977     s->uvblock[0] += w4 * h4 * 16;
2978     s->uvblock[1] += w4 * h4 * 16;
2979     s->eob += 4 * w4 * h4;
2980     s->uveob[0] += w4 * h4;
2981     s->uveob[1] += w4 * h4;
2987     // emulated overhangs if the stride of the target buffer can't hold. This
2988     // allows to support emu-edge and so on even if we have large block
    // emu[0]/emu[1]: reconstruct luma/chroma into s->tmp_* instead of the
    // frame when the block would overrun the stride or the picture bottom
2990     emu[0] = (col + w4) * 8 > f->linesize[0] ||
2991 (row + h4) > s->rows;
2992     emu[1] = (col + w4) * 4 > f->linesize[1] ||
2993 (row + h4) > s->rows;
2995 s->dst[0] = s->tmp_y;
2998 s->dst[0] = f->data[0] + yoff;
2999 s->y_stride = f->linesize[0];
3002 s->dst[1] = s->tmp_uv[0];
3003 s->dst[2] = s->tmp_uv[1];
3006 s->dst[1] = f->data[1] + uvoff;
3007 s->dst[2] = f->data[2] + uvoff;
3008 s->uv_stride = f->linesize[1];
3011 intra_recon(ctx, yoff, uvoff);
    // copy the temporarily-reconstructed luma back into the frame in the
    // widest copy units that fit (mc[n][0][0][0][0] is a plain copy)
3016 int w = FFMIN(s->cols - col, w4) * 8, h = FFMIN(s->rows - row, h4) * 8, n, o = 0;
3018 for (n = 0; o < w; n++) {
3023 s->dsp.mc[n][0][0][0][0](f->data[0] + yoff + o, f->linesize[0],
3024 s->tmp_y + o, 64, h, 0, 0);
    // same copy-back for both chroma planes (tmp stride 32)
3030 int w = FFMIN(s->cols - col, w4) * 4, h = FFMIN(s->rows - row, h4) * 4, n, o = 0;
3032 for (n = 1; o < w; n++) {
3037 s->dsp.mc[n][0][0][0][0](f->data[1] + uvoff + o, f->linesize[1],
3038 s->tmp_uv[0] + o, 32, h, 0, 0);
3039 s->dsp.mc[n][0][0][0][0](f->data[2] + uvoff + o, f->linesize[2],
3040 s->tmp_uv[1] + o, 32, h, 0, 0);
3046     // pick filter level and find edges to apply filter to
3047     if (s->filter.level &&
3048 (lvl = s->segmentation.feat[b->seg_id].lflvl[b->intra ? 0 : b->ref[0] + 1]
3049 [b->mode[3] != ZEROMV]) > 0) {
3050 int x_end = FFMIN(s->cols - col, w4), y_end = FFMIN(s->rows - row, h4);
3051 int skip_inter = !b->intra && b->skip, col7 = s->col7, row7 = s->row7;
3053 setctx_2d(&lflvl->level[row7 * 8 + col7], w4, h4, 8, lvl);
3054 mask_edges(lflvl, 0, row7, col7, x_end, y_end, 0, 0, b->tx, skip_inter);
3055 mask_edges(lflvl, 1, row7, col7, x_end, y_end,
3056 s->cols & 1 && col + w4 >= s->cols ? s->cols & 7 : 0,
3057 s->rows & 1 && row + h4 >= s->rows ? s->rows & 7 : 0,
3058 b->uvtx, skip_inter);
    // lazily fill the limit/mblim LUT entries for this filter level
3060 if (!s->filter.lim_lut[lvl]) {
3061 int sharp = s->filter.sharpness;
3065 limit >>= (sharp + 3) >> 2;
3066 limit = FFMIN(limit, 9 - sharp);
3068 limit = FFMAX(limit, 1);
3070 s->filter.lim_lut[lvl] = limit;
3071 s->filter.mblim_lut[lvl] = 2 * (lvl + 2) + limit;
    // advance coefficient pointers (second-pass / non-coeff path)
3077 s->block += w4 * h4 * 64;
3078 s->uvblock[0] += w4 * h4 * 16;
3079 s->uvblock[1] += w4 * h4 * 16;
3080 s->eob += 4 * w4 * h4;
3081 s->uveob[0] += w4 * h4;
3082 s->uveob[1] += w4 * h4;
// Recursively decode a superblock subtree at level bl: read the partition
// symbol from the arithmetic coder (context c derived from the above/left
// partition contexts), dispatch to decode_b for leaf blocks or recurse
// into the four quadrants for PARTITION_SPLIT. Near the right/bottom frame
// edge, only the partition choices that fit are coded, so reduced
// probability reads (p[1]/p[2]) are used instead of the full tree.
3086 static void decode_sb(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3087 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3089     VP9Context *s = ctx->priv_data;
3090     int c = ((s->above_partition_ctx[col] >> (3 - bl)) & 1) |
3091 (((s->left_partition_ctx[row & 0x7] >> (3 - bl)) & 1) << 1);
3092     const uint8_t *p = s->keyframe ? vp9_default_kf_partition_probs[bl][c] :
3093 s->prob.p.partition[bl][c];
3094     enum BlockPartition bp;
    // hbs: half block size at this level, in 8px block units
3095     ptrdiff_t hbs = 4 >> bl;
3096     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3097     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3100 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3101 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3102     } else if (col + hbs < s->cols) { // FIXME why not <=?
3103 if (row + hbs < s->rows) { // FIXME why not <=?
    // fully inside the frame: full partition tree is coded
3104 bp = vp8_rac_get_tree(&s->c, vp9_partition_tree, p);
3106 case PARTITION_NONE:
3107 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3110 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3111 yoff  += hbs * 8 * y_stride;
3112 uvoff += hbs * 4 * uv_stride;
3113 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, bl, bp);
3116 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
3119 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, bl, bp);
3121 case PARTITION_SPLIT:
    // recurse into the four quadrants
3122 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3123 decode_sb(ctx, row, col + hbs, lflvl,
3124 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3125 yoff  += hbs * 8 * y_stride;
3126 uvoff += hbs * 4 * uv_stride;
3127 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3128 decode_sb(ctx, row + hbs, col + hbs, lflvl,
3129 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
    // bottom edge: only split vs. horizontal partitions are possible
3134 } else if (vp56_rac_get_prob_branchy(&s->c, p[1])) {
3135 bp = PARTITION_SPLIT;
3136 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3137 decode_sb(ctx, row, col + hbs, lflvl,
3138 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3141 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
    // right edge: only split vs. vertical partitions are possible
3143     } else if (row + hbs < s->rows) { // FIXME why not <=?
3144 if (vp56_rac_get_prob_branchy(&s->c, p[2])) {
3145 bp = PARTITION_SPLIT;
3146 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3147 yoff  += hbs * 8 * y_stride;
3148 uvoff += hbs * 4 * uv_stride;
3149 decode_sb(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3152 decode_b(ctx, row, col, lflvl, yoff, uvoff, bl, bp);
    // bottom-right corner: split is the only possibility
3155 bp = PARTITION_SPLIT;
3156 decode_sb(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
    // update adaptation counts for this partition decision
3158     s->counts.partition[bl][c][bp]++;
// Second-pass variant of decode_sb: instead of reading partition symbols
// from the bitstream, replay the block level/partition (b->bl, b->bp)
// stored during the first pass, walking the tree in the same order and
// calling decode_b for each leaf.
3161 static void decode_sb_mem(AVCodecContext *ctx, int row, int col, struct VP9Filter *lflvl,
3162 ptrdiff_t yoff, ptrdiff_t uvoff, enum BlockLevel bl)
3164     VP9Context *s = ctx->priv_data;
3166     ptrdiff_t hbs = 4 >> bl;
3167     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3168     ptrdiff_t y_stride = f->linesize[0], uv_stride = f->linesize[1];
3171 av_assert2(b->bl == BL_8X8);
3172 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3173     } else if (s->b->bl == bl) {
    // leaf at this level: replay the stored partition
3174 decode_b(ctx, row, col, lflvl, yoff, uvoff, b->bl, b->bp);
3175 if (b->bp == PARTITION_H && row + hbs < s->rows) {
3176 yoff  += hbs * 8 * y_stride;
3177 uvoff += hbs * 4 * uv_stride;
3178 decode_b(ctx, row + hbs, col, lflvl, yoff, uvoff, b->bl, b->bp);
3179 } else if (b->bp == PARTITION_V && col + hbs < s->cols) {
3182 decode_b(ctx, row, col + hbs, lflvl, yoff, uvoff, b->bl, b->bp);
    // split: recurse into the quadrants that lie inside the frame
3185 decode_sb_mem(ctx, row, col, lflvl, yoff, uvoff, bl + 1);
3186 if (col + hbs < s->cols) { // FIXME why not <=?
3187 if (row + hbs < s->rows) {
3188 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff + 8 * hbs,
3189 uvoff + 4 * hbs, bl + 1);
3190 yoff  += hbs * 8 * y_stride;
3191 uvoff += hbs * 4 * uv_stride;
3192 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
3193 decode_sb_mem(ctx, row + hbs, col + hbs, lflvl,
3194 yoff + 8 * hbs, uvoff + 4 * hbs, bl + 1);
3198 decode_sb_mem(ctx, row, col + hbs, lflvl, yoff, uvoff, bl + 1);
3200 } else if (row + hbs < s->rows) {
3201 yoff  += hbs * 8 * y_stride;
3202 uvoff += hbs * 4 * uv_stride;
3203 decode_sb_mem(ctx, row + hbs, col, lflvl, yoff, uvoff, bl + 1);
// Apply the in-loop deblocking filter to one 64x64 superblock, using the
// edge masks and per-8x8-block levels collected in lflvl. For each of the
// Y plane and both chroma planes it filters column (vertical) edges first,
// then row (horizontal) edges. Mask index [3] marks interior 4px edges;
// indices [0]/[1]/[2] select 16/8/4-wide filters. Adjacent edges sharing a
// filter call are merged via loop_filter_16 / loop_filter_mix2, packing a
// second level's E/I thresholds into bits 8..15.
3208 static void loopfilter_sb(AVCodecContext *ctx, struct VP9Filter *lflvl,
3209 int row, int col, ptrdiff_t yoff, ptrdiff_t uvoff)
3211     VP9Context *s = ctx->priv_data;
3212     AVFrame *f = s->frames[CUR_FRAME].tf.f;
3213     uint8_t *dst = f->data[0] + yoff, *lvl = lflvl->level;
3214     ptrdiff_t ls_y = f->linesize[0], ls_uv = f->linesize[1];
3217     // FIXME in how far can we interleave the v/h loopfilter calls? E.g.
3218     // if you think of them as acting on a 8x8 block max, we can interleave
3219     // each v/h within the single x loop, but that only works if we work on
3220     // 8 pixel blocks, and we won't always do that (we want at least 16px
3221     // to use SSE2 optimizations, perhaps 32 for AVX2)
3223     // filter edges between columns, Y plane (e.g. block1 | block2)
3224     for (y = 0; y < 8; y += 2, dst += 16 * ls_y, lvl += 16) {
3225 uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[0][0][y];
3226 uint8_t *hmask2 = lflvl->mask[0][0][y + 1];
3227 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2], hm13 = hmask1[3];
3228 unsigned hm2 = hmask2[1] | hmask2[2], hm23 = hmask2[3];
3229 unsigned hm = hm1 | hm2 | hm13 | hm23;
    // walk the mask bit by bit; each bit is one 8px column
3231 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 8, l++) {
3233 int L = *l, H = L >> 4;
3234 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3237 if (hmask1[0] & x) {
3238 if (hmask2[0] & x) {
    // two vertically adjacent 16-wide edges at the same level
    // -> single 16px-high call
3239 av_assert2(l[8] == L);
3240 s->dsp.loop_filter_16[0](ptr, ls_y, E, I, H);
3242 s->dsp.loop_filter_8[2][0](ptr, ls_y, E, I, H);
3244 } else if (hm2 & x) {
    // two different levels stacked -> mix2 with the second
    // level's thresholds in the high byte
3247 E |= s->filter.mblim_lut[L] << 8;
3248 I |= s->filter.lim_lut[L] << 8;
3249 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3251 [0](ptr, ls_y, E, I, H);
3253 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3254 [0](ptr, ls_y, E, I, H);
3257 } else if (hm2 & x) {
3258 int L = l[8], H = L >> 4;
3259 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3262 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3263 [0](ptr + 8 * ls_y, ls_y, E, I, H);
    // interior 4px column edges (mask[3]) at x+4
3267 int L = *l, H = L >> 4;
3268 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3273 E |= s->filter.mblim_lut[L] << 8;
3274 I |= s->filter.lim_lut[L] << 8;
3275 s->dsp.loop_filter_mix2[0][0][0](ptr + 4, ls_y, E, I, H);
3277 s->dsp.loop_filter_8[0][0](ptr + 4, ls_y, E, I, H);
3279 } else if (hm23 & x) {
3280 int L = l[8], H = L >> 4;
3281 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3283 s->dsp.loop_filter_8[0][0](ptr + 8 * ls_y + 4, ls_y, E, I, H);
3289     //                                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
3291     dst = f->data[0] + yoff;
    // filter edges between rows, Y plane; two horizontally adjacent edges
    // can be merged into one call (hence x <<= 2, l += 2)
3293     for (y = 0; y < 8; y++, dst += 8 * ls_y, lvl += 8) {
3294 uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[0][1][y];
3295 unsigned vm = vmask[0] | vmask[1] | vmask[2], vm3 = vmask[3];
3297 for (x = 1; vm & ~(x - 1); x <<= 2, ptr += 16, l += 2) {
3300 int L = *l, H = L >> 4;
3301 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3304 if (vmask[0] & (x << 1)) {
3305 av_assert2(l[1] == L);
3306 s->dsp.loop_filter_16[1](ptr, ls_y, E, I, H);
3308 s->dsp.loop_filter_8[2][1](ptr, ls_y, E, I, H);
3310 } else if (vm & (x << 1)) {
3313 E |= s->filter.mblim_lut[L] << 8;
3314 I |= s->filter.lim_lut[L] << 8;
3315 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3316 [!!(vmask[1] & (x << 1))]
3317 [1](ptr, ls_y, E, I, H);
3319 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3320 [1](ptr, ls_y, E, I, H);
3322 } else if (vm & (x << 1)) {
3323 int L = l[1], H = L >> 4;
3324 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3326 s->dsp.loop_filter_8[!!(vmask[1] & (x << 1))]
3327 [1](ptr + 8, ls_y, E, I, H);
    // interior 4px row edges (mask[3]) at +4 rows
3331 int L = *l, H = L >> 4;
3332 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3334 if (vm3 & (x << 1)) {
3337 E |= s->filter.mblim_lut[L] << 8;
3338 I |= s->filter.lim_lut[L] << 8;
3339 s->dsp.loop_filter_mix2[0][0][1](ptr + ls_y * 4, ls_y, E, I, H);
3341 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4, ls_y, E, I, H);
3343 } else if (vm3 & (x << 1)) {
3344 int L = l[1], H = L >> 4;
3345 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3347 s->dsp.loop_filter_8[0][1](ptr + ls_y * 4 + 8, ls_y, E, I, H);
3352     // same principle but for U/V planes
3353     for (p = 0; p < 2; p++) {
3355 dst = f->data[1 + p] + uvoff;
    // chroma column edges: masks/levels are at half resolution, so step
    // y by 4 and levels by 16 per mask row pair
3356 for (y = 0; y < 8; y += 4, dst += 16 * ls_uv, lvl += 32) {
3357 uint8_t *ptr = dst, *l = lvl, *hmask1 = lflvl->mask[1][0][y];
3358 uint8_t *hmask2 = lflvl->mask[1][0][y + 2];
3359 unsigned hm1 = hmask1[0] | hmask1[1] | hmask1[2];
3360 unsigned hm2 = hmask2[1] | hmask2[2], hm = hm1 | hm2;
3362 for (x = 1; hm & ~(x - 1); x <<= 1, ptr += 4) {
3365 int L = *l, H = L >> 4;
3366 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3368 if (hmask1[0] & x) {
3369 if (hmask2[0] & x) {
3370 av_assert2(l[16] == L);
3371 s->dsp.loop_filter_16[0](ptr, ls_uv, E, I, H);
3373 s->dsp.loop_filter_8[2][0](ptr, ls_uv, E, I, H);
3375 } else if (hm2 & x) {
3378 E |= s->filter.mblim_lut[L] << 8;
3379 I |= s->filter.lim_lut[L] << 8;
3380 s->dsp.loop_filter_mix2[!!(hmask1[1] & x)]
3382 [0](ptr, ls_uv, E, I, H);
3384 s->dsp.loop_filter_8[!!(hmask1[1] & x)]
3385 [0](ptr, ls_uv, E, I, H);
3387 } else if (hm2 & x) {
3388 int L = l[16], H = L >> 4;
3389 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3391 s->dsp.loop_filter_8[!!(hmask2[1] & x)]
3392 [0](ptr + 8 * ls_uv, ls_uv, E, I, H);
3400 dst = f->data[1 + p] + uvoff;
    // chroma row edges; up to four horizontally adjacent edges share the
    // level stride (x <<= 4, l += 4)
3401 for (y = 0; y < 8; y++, dst += 4 * ls_uv) {
3402 uint8_t *ptr = dst, *l = lvl, *vmask = lflvl->mask[1][1][y];
3403 unsigned vm = vmask[0] | vmask[1] | vmask[2];
3405 for (x = 1; vm & ~(x - 1); x <<= 4, ptr += 16, l += 4) {
3408 int L = *l, H = L >> 4;
3409 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3412 if (vmask[0] & (x << 2)) {
3413 av_assert2(l[2] == L);
3414 s->dsp.loop_filter_16[1](ptr, ls_uv, E, I, H);
3416 s->dsp.loop_filter_8[2][1](ptr, ls_uv, E, I, H);
3418 } else if (vm & (x << 2)) {
3421 E |= s->filter.mblim_lut[L] << 8;
3422 I |= s->filter.lim_lut[L] << 8;
3423 s->dsp.loop_filter_mix2[!!(vmask[1] & x)]
3424 [!!(vmask[1] & (x << 2))]
3425 [1](ptr, ls_uv, E, I, H);
3427 s->dsp.loop_filter_8[!!(vmask[1] & x)]
3428 [1](ptr, ls_uv, E, I, H);
3430 } else if (vm & (x << 2)) {
3431 int L = l[2], H = L >> 4;
3432 int E = s->filter.mblim_lut[L], I = s->filter.lim_lut[L];
3434 s->dsp.loop_filter_8[!!(vmask[1] & (x << 2))]
3435 [1](ptr + 8, ls_uv, E, I, H);
/**
 * Compute the pixel range [*start, *end) covered by tile number idx out of
 * 2^log2_n tiles spread over n superblocks (one superblock = 8 units,
 * hence the << 3).
 */
static void set_tile_offset(int *start, int *end, int idx, int log2_n, int n)
{
    int first_sb = (idx * n) >> log2_n;
    int next_sb  = ((idx + 1) * n) >> log2_n;

    /* clamp to the superblock count before scaling to pixels */
    if (first_sb > n)
        first_sb = n;
    if (next_sb > n)
        next_sb = n;

    *start = first_sb << 3;
    *end   = next_sb  << 3;
}
3453 static av_always_inline void adapt_prob(uint8_t *p, unsigned ct0, unsigned ct1,
3454 int max_count, int update_factor)
3456 unsigned ct = ct0 + ct1, p2, p1;
3462 p2 = ((ct0 << 8) + (ct >> 1)) / ct;
3463 p2 = av_clip(p2, 1, 255);
3464 ct = FFMIN(ct, max_count);
3465 update_factor = FASTDIV(update_factor * ct, max_count);
3467 // (p1 * (256 - update_factor) + p2 * update_factor + 128) >> 8
3468 *p = p1 + (((p2 - p1) * update_factor + 128) >> 8);
/**
 * End-of-frame backward probability adaptation: fold this frame's symbol
 * counts (s->counts.*) into the frame-context probability set
 * s->prob_ctx[s->framectxid], via adapt_prob().
 *
 * NOTE(review): this excerpt has lost interstitial lines (opening/closing
 * braces, the local loop-variable declarations, and some `break;`/`return;`
 * statements). Comments only were added; code is left exactly as found.
 */
static void adapt_probs(VP9Context *s)
    prob_context *p = &s->prob_ctx[s->framectxid].p;
    // adapt faster (112/256) when the previous context is less trustworthy:
    // keyframe, intra-only frame, or previous frame was not a keyframe chain
    int uf = (s->keyframe || s->intraonly || !s->last_keyframe) ? 112 : 128;

    // coefficient probabilities, indexed [txsz][plane][inter][band][ctx]
    for (i = 0; i < 4; i++)
        for (j = 0; j < 2; j++)
            for (k = 0; k < 2; k++)
                for (l = 0; l < 6; l++)
                    for (m = 0; m < 6; m++) {
                        uint8_t *pp = s->prob_ctx[s->framectxid].coef[i][j][k][l][m];
                        unsigned *e = s->counts.eob[i][j][k][l][m];
                        unsigned *c = s->counts.coef[i][j][k][l][m];

                        if (l == 0 && m >= 3) // dc only has 3 pt
                        // NOTE(review): the `break;` body of the if above
                        // appears to be missing from this excerpt
                        adapt_prob(&pp[0], e[0], e[1], 24, uf);
                        adapt_prob(&pp[1], c[0], c[1] + c[2], 24, uf);
                        adapt_prob(&pp[2], c[1], c[2], 24, uf);

    // keyframes/intra-only frames carry no inter statistics: just copy the
    // skip/tx probabilities forward (presumably followed by an early return
    // lost in this excerpt — TODO confirm)
    if (s->keyframe || s->intraonly) {
        memcpy(p->skip, s->prob.p.skip, sizeof(p->skip));
        memcpy(p->tx32p, s->prob.p.tx32p, sizeof(p->tx32p));
        memcpy(p->tx16p, s->prob.p.tx16p, sizeof(p->tx16p));
        memcpy(p->tx8p, s->prob.p.tx8p, sizeof(p->tx8p));

    // skip flag
    for (i = 0; i < 3; i++)
        adapt_prob(&p->skip[i], s->counts.skip[i][0], s->counts.skip[i][1], 20, 128);

    // intra/inter flag
    for (i = 0; i < 4; i++)
        adapt_prob(&p->intra[i], s->counts.intra[i][0], s->counts.intra[i][1], 20, 128);

    // comppred flag
    if (s->comppredmode == PRED_SWITCHABLE) {
        for (i = 0; i < 5; i++)
            adapt_prob(&p->comp[i], s->counts.comp[i][0], s->counts.comp[i][1], 20, 128);

    // reference frames (compound prediction)
    if (s->comppredmode != PRED_SINGLEREF) {
        for (i = 0; i < 5; i++)
            adapt_prob(&p->comp_ref[i], s->counts.comp_ref[i][0],
                       s->counts.comp_ref[i][1], 20, 128);

    // reference frames (single prediction)
    if (s->comppredmode != PRED_COMPREF) {
        for (i = 0; i < 5; i++) {
            uint8_t *pp = p->single_ref[i];
            unsigned (*c)[2] = s->counts.single_ref[i];

            adapt_prob(&pp[0], c[0][0], c[0][1], 20, 128);
            adapt_prob(&pp[1], c[1][0], c[1][1], 20, 128);

    // block partitioning
    for (i = 0; i < 4; i++)
        for (j = 0; j < 4; j++) {
            uint8_t *pp = p->partition[i][j];
            unsigned *c = s->counts.partition[i][j];

            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
            adapt_prob(&pp[2], c[2], c[3], 20, 128);

    // tx size selection (tree probabilities per block-size class)
    if (s->txfmmode == TX_SWITCHABLE) {
        for (i = 0; i < 2; i++) {
            unsigned *c16 = s->counts.tx16p[i], *c32 = s->counts.tx32p[i];

            adapt_prob(&p->tx8p[i], s->counts.tx8p[i][0], s->counts.tx8p[i][1], 20, 128);
            adapt_prob(&p->tx16p[i][0], c16[0], c16[1] + c16[2], 20, 128);
            adapt_prob(&p->tx16p[i][1], c16[1], c16[2], 20, 128);
            adapt_prob(&p->tx32p[i][0], c32[0], c32[1] + c32[2] + c32[3], 20, 128);
            adapt_prob(&p->tx32p[i][1], c32[1], c32[2] + c32[3], 20, 128);
            adapt_prob(&p->tx32p[i][2], c32[2], c32[3], 20, 128);

    // interpolation filter
    if (s->filtermode == FILTER_SWITCHABLE) {
        for (i = 0; i < 4; i++) {
            uint8_t *pp = p->filter[i];
            unsigned *c = s->counts.filter[i];

            adapt_prob(&pp[0], c[0], c[1] + c[2], 20, 128);
            adapt_prob(&pp[1], c[1], c[2], 20, 128);

    // inter mode tree (ZEROMV / NEARESTMV / NEARMV / NEWMV ordering per tree)
    for (i = 0; i < 7; i++) {
        uint8_t *pp = p->mv_mode[i];
        unsigned *c = s->counts.mv_mode[i];

        adapt_prob(&pp[0], c[2], c[1] + c[0] + c[3], 20, 128);
        adapt_prob(&pp[1], c[0], c[1] + c[3], 20, 128);
        adapt_prob(&pp[2], c[1], c[3], 20, 128);

        // mv joint distribution (zero/h-only/v-only/both)
        uint8_t *pp = p->mv_joint;
        unsigned *c = s->counts.mv_joint;

        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
        adapt_prob(&pp[2], c[2], c[3], 20, 128);

    // mv components (horizontal i=0 / vertical i=1)
    for (i = 0; i < 2; i++) {
        unsigned *c, (*c2)[2], sum;

        adapt_prob(&p->mv_comp[i].sign, s->counts.mv_comp[i].sign[0],
                   s->counts.mv_comp[i].sign[1], 20, 128);

        // magnitude class tree: peel counts off 'sum' as we descend
        pp = p->mv_comp[i].classes;
        c = s->counts.mv_comp[i].classes;
        sum = c[1] + c[2] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9] + c[10];
        adapt_prob(&pp[0], c[0], sum, 20, 128);
        adapt_prob(&pp[1], c[1], sum, 20, 128);
        adapt_prob(&pp[2], c[2] + c[3], sum, 20, 128);
        adapt_prob(&pp[3], c[2], c[3], 20, 128);
        adapt_prob(&pp[4], c[4] + c[5], sum, 20, 128);
        adapt_prob(&pp[5], c[4], c[5], 20, 128);
        adapt_prob(&pp[6], c[6], sum, 20, 128);
        adapt_prob(&pp[7], c[7] + c[8], c[9] + c[10], 20, 128);
        adapt_prob(&pp[8], c[7], c[8], 20, 128);
        adapt_prob(&pp[9], c[9], c[10], 20, 128);
        // NOTE(review): the `sum -= ...` updates between the calls above
        // appear to be missing from this excerpt

        adapt_prob(&p->mv_comp[i].class0, s->counts.mv_comp[i].class0[0],
                   s->counts.mv_comp[i].class0[1], 20, 128);
        pp = p->mv_comp[i].bits;
        c2 = s->counts.mv_comp[i].bits;
        for (j = 0; j < 10; j++)
            adapt_prob(&pp[j], c2[j][0], c2[j][1], 20, 128);

        // fractional-pel bits, class0 and general
        for (j = 0; j < 2; j++) {
            pp = p->mv_comp[i].class0_fp[j];
            c = s->counts.mv_comp[i].class0_fp[j];
            adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
            adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
            adapt_prob(&pp[2], c[2], c[3], 20, 128);
        pp = p->mv_comp[i].fp;
        c = s->counts.mv_comp[i].fp;
        adapt_prob(&pp[0], c[0], c[1] + c[2] + c[3], 20, 128);
        adapt_prob(&pp[1], c[1], c[2] + c[3], 20, 128);
        adapt_prob(&pp[2], c[2], c[3], 20, 128);

        // high-precision (1/8-pel) bits are only coded when enabled
        if (s->highprecisionmvs) {
            adapt_prob(&p->mv_comp[i].class0_hp, s->counts.mv_comp[i].class0_hp[0],
                       s->counts.mv_comp[i].class0_hp[1], 20, 128);
            adapt_prob(&p->mv_comp[i].hp, s->counts.mv_comp[i].hp[0],
                       s->counts.mv_comp[i].hp[1], 20, 128);

    // y intra mode tree (sum is whittled down as modes are accounted for)
    for (i = 0; i < 4; i++) {
        uint8_t *pp = p->y_mode[i];
        unsigned *c = s->counts.y_mode[i], sum, s2;

        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
        sum -= c[TM_VP8_PRED];
        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
        sum -= c[VERT_PRED];
        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
        adapt_prob(&pp[3], s2, sum, 20, 128);
        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
        sum -= c[DIAG_DOWN_LEFT_PRED];
        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
        sum -= c[VERT_LEFT_PRED];
        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);

    // uv intra mode tree — same traversal as y, per y-mode context
    for (i = 0; i < 10; i++) {
        uint8_t *pp = p->uv_mode[i];
        unsigned *c = s->counts.uv_mode[i], sum, s2;

        sum = c[0] + c[1] + c[3] + c[4] + c[5] + c[6] + c[7] + c[8] + c[9];
        adapt_prob(&pp[0], c[DC_PRED], sum, 20, 128);
        sum -= c[TM_VP8_PRED];
        adapt_prob(&pp[1], c[TM_VP8_PRED], sum, 20, 128);
        sum -= c[VERT_PRED];
        adapt_prob(&pp[2], c[VERT_PRED], sum, 20, 128);
        s2 = c[HOR_PRED] + c[DIAG_DOWN_RIGHT_PRED] + c[VERT_RIGHT_PRED];
        adapt_prob(&pp[3], s2, sum, 20, 128);
        adapt_prob(&pp[4], c[HOR_PRED], s2, 20, 128);
        adapt_prob(&pp[5], c[DIAG_DOWN_RIGHT_PRED], c[VERT_RIGHT_PRED], 20, 128);
        sum -= c[DIAG_DOWN_LEFT_PRED];
        adapt_prob(&pp[6], c[DIAG_DOWN_LEFT_PRED], sum, 20, 128);
        sum -= c[VERT_LEFT_PRED];
        adapt_prob(&pp[7], c[VERT_LEFT_PRED], sum, 20, 128);
        adapt_prob(&pp[8], c[HOR_DOWN_PRED], c[HOR_UP_PRED], 20, 128);
3693 static void free_buffers(VP9Context *s)
3695 av_freep(&s->intra_pred_data[0]);
3696 av_freep(&s->b_base);
3697 av_freep(&s->block_base);
/**
 * Codec close callback: drop all internal frame references and free the
 * AVFrame shells for the two work frames and the 8+8 reference slots.
 *
 * NOTE(review): the tail of this function (presumably a free_buffers(s) call
 * and the return statement) appears truncated in this excerpt; code left
 * as found, comments only added.
 */
static av_cold int vp9_decode_free(AVCodecContext *ctx)
    VP9Context *s = ctx->priv_data;

    // current/last work frames: unref pixel data if held, then free the shell
    for (i = 0; i < 2; i++) {
        if (s->frames[i].tf.f->data[0])
            vp9_unref_frame(ctx, &s->frames[i]);
        av_frame_free(&s->frames[i].tf.f);
    // reference slots, both the active set and the pending "next" set
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->refs[i]);
        av_frame_free(&s->refs[i].f);
        if (s->next_refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        av_frame_free(&s->next_refs[i].f);
/**
 * Decode one VP9 packet: parse the frame header, decode all tiles
 * (optionally in two passes for frame threading), run the loop filter
 * per superblock row, and rotate the reference-frame slots.
 *
 * On success *frame may receive either a directly-shown reference frame
 * (header-only "show existing frame" packets, res == 0 path) or the newly
 * decoded frame when it is visible.
 *
 * NOTE(review): this excerpt has lost many interstitial lines (braces,
 * declarations such as `AVFrame *f;`, `do {`, error-path returns, and
 * else-branches). Comments only were added; code is left exactly as found.
 */
static int vp9_decode_frame(AVCodecContext *ctx, void *frame,
                            int *got_frame, AVPacket *pkt)
    const uint8_t *data = pkt->data;
    int size = pkt->size;
    VP9Context *s = ctx->priv_data;
    int res, tile_row, tile_col, i, ref, row, col;
    ptrdiff_t yoff, uvoff, ls_y, ls_uv;

    // parse uncompressed + compressed header; res == 0 means "show an
    // existing reference frame" with no coded data following
    if ((res = decode_frame_header(ctx, data, size, &ref)) < 0) {
    } else if (res == 0) {
        if (!s->refs[ref].f->data[0]) {
            av_log(ctx, AV_LOG_ERROR, "Requested reference %d not available\n", ref);
            return AVERROR_INVALIDDATA;
        if ((res = av_frame_ref(frame, s->refs[ref].f)) < 0)

    // rotate work frames: current becomes last (unless keyframe), allocate
    // a fresh current frame
    if (s->frames[LAST_FRAME].tf.f->data[0])
        vp9_unref_frame(ctx, &s->frames[LAST_FRAME]);
    if (!s->keyframe && s->frames[CUR_FRAME].tf.f->data[0] &&
        (res = vp9_ref_frame(ctx, &s->frames[LAST_FRAME], &s->frames[CUR_FRAME])) < 0)
    if (s->frames[CUR_FRAME].tf.f->data[0])
        vp9_unref_frame(ctx, &s->frames[CUR_FRAME]);
    if ((res = vp9_alloc_frame(ctx, &s->frames[CUR_FRAME])) < 0)
    f = s->frames[CUR_FRAME].tf.f;
    f->key_frame = s->keyframe;
    f->pict_type = s->keyframe ? AV_PICTURE_TYPE_I : AV_PICTURE_TYPE_P;
    ls_y = f->linesize[0];
    ls_uv = f->linesize[1];

    // set up next_refs according to the refresh mask: refreshed slots point
    // at the new frame, the rest keep their previous contents
    for (i = 0; i < 8; i++) {
        if (s->next_refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->next_refs[i]);
        if (s->refreshrefmask & (1 << i)) {
            res = ff_thread_ref_frame(&s->next_refs[i], &s->frames[CUR_FRAME].tf);
            res = ff_thread_ref_frame(&s->next_refs[i], &s->refs[i]);

    // main tile decode loop
    // reset the above-row contexts for the whole frame width
    memset(s->above_partition_ctx, 0, s->cols);
    memset(s->above_skip_ctx, 0, s->cols);
    if (s->keyframe || s->intraonly) {
        memset(s->above_mode_ctx, DC_PRED, s->cols * 2);
        memset(s->above_mode_ctx, NEARESTMV, s->cols);
    memset(s->above_y_nnz_ctx, 0, s->sb_cols * 16);
    memset(s->above_uv_nnz_ctx[0], 0, s->sb_cols * 8);
    memset(s->above_uv_nnz_ctx[1], 0, s->sb_cols * 8);
    memset(s->above_segpred_ctx, 0, s->cols);
    // two-pass mode: pass 1 parses symbols, pass 2 reconstructs; only used
    // with frame threading when the context will be refreshed serially
    s->pass = s->uses_2pass =
        ctx->active_thread_type == FF_THREAD_FRAME && s->refreshctx && !s->parallelmode;
    if ((res = update_block_buffers(ctx)) < 0) {
        av_log(ctx, AV_LOG_ERROR,
               "Failed to allocate block buffers\n");

    // parallel mode: commit forward-updated probabilities now so other
    // frame threads can start before this frame finishes
    if (s->refreshctx && s->parallelmode) {
        for (i = 0; i < 4; i++) {
            for (j = 0; j < 2; j++)
                for (k = 0; k < 2; k++)
                    for (l = 0; l < 6; l++)
                        for (m = 0; m < 6; m++)
                            memcpy(s->prob_ctx[s->framectxid].coef[i][j][k][l][m],
                                   s->prob.coef[i][j][k][l][m], 3);
            // NOTE(review): the `break;` body of the if below is missing here
            if (s->txfmmode == i)
        s->prob_ctx[s->framectxid].p = s->prob.p;
        ff_thread_finish_setup(ctx);

        // per-pass reset of the coefficient/eob buffer cursors
        // (the enclosing `do {` line is missing from this excerpt)
        s->block = s->block_base;
        s->uvblock[0] = s->uvblock_base[0];
        s->uvblock[1] = s->uvblock_base[1];
        s->eob = s->eob_base;
        s->uveob[0] = s->uveob_base[0];
        s->uveob[1] = s->uveob_base[1];

        for (tile_row = 0; tile_row < s->tiling.tile_rows; tile_row++) {
            set_tile_offset(&s->tiling.tile_row_start, &s->tiling.tile_row_end,
                            tile_row, s->tiling.log2_tile_rows, s->sb_rows);

            // read each tile's size header and init its range decoder;
            // the last tile has no explicit size (takes the rest)
            for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
                if (tile_col == s->tiling.tile_cols - 1 &&
                    tile_row == s->tiling.tile_rows - 1) {
                    tile_size = AV_RB32(data);
                if (tile_size > size) {
                    ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                    return AVERROR_INVALIDDATA;
                ff_vp56_init_range_decoder(&s->c_b[tile_col], data, tile_size);
                if (vp56_rac_get_prob_branchy(&s->c_b[tile_col], 128)) { // marker bit
                    ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);
                    return AVERROR_INVALIDDATA;

            // decode one superblock row across all tile columns
            for (row = s->tiling.tile_row_start; row < s->tiling.tile_row_end;
                 row += 8, yoff += ls_y * 64, uvoff += ls_uv * 32) {
                struct VP9Filter *lflvl_ptr = s->lflvl;
                ptrdiff_t yoff2 = yoff, uvoff2 = uvoff;

                for (tile_col = 0; tile_col < s->tiling.tile_cols; tile_col++) {
                    set_tile_offset(&s->tiling.tile_col_start, &s->tiling.tile_col_end,
                                    tile_col, s->tiling.log2_tile_cols, s->sb_cols);

                    // left-edge contexts restart at each tile boundary
                    memset(s->left_partition_ctx, 0, 8);
                    memset(s->left_skip_ctx, 0, 8);
                    if (s->keyframe || s->intraonly) {
                        memset(s->left_mode_ctx, DC_PRED, 16);
                        memset(s->left_mode_ctx, NEARESTMV, 8);
                    memset(s->left_y_nnz_ctx, 0, 16);
                    memset(s->left_uv_nnz_ctx, 0, 16);
                    memset(s->left_segpred_ctx, 0, 8);

                    // resume this tile's range decoder state
                    memcpy(&s->c, &s->c_b[tile_col], sizeof(s->c));

                    for (col = s->tiling.tile_col_start;
                         col < s->tiling.tile_col_end;
                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
                        // FIXME integrate with lf code (i.e. zero after each
                        // use, similar to invtxfm coefficients, or similar)
                        memset(lflvl_ptr->mask, 0, sizeof(lflvl_ptr->mask));

                        // pass 2 reconstructs from stored symbols, otherwise
                        // parse-and-reconstruct in one go
                        decode_sb_mem(ctx, row, col, lflvl_ptr,
                                      yoff2, uvoff2, BL_64X64);
                        decode_sb(ctx, row, col, lflvl_ptr,
                                  yoff2, uvoff2, BL_64X64);

                    // save this tile's range decoder state for the next sb row
                    memcpy(&s->c_b[tile_col], &s->c, sizeof(s->c));

                // backup pre-loopfilter reconstruction data for intra
                // prediction of next row of sb64s
                if (row + 8 < s->rows) {
                    memcpy(s->intra_pred_data[0],
                           f->data[0] + yoff + 63 * ls_y,
                    memcpy(s->intra_pred_data[1],
                           f->data[1] + uvoff + 31 * ls_uv,
                    memcpy(s->intra_pred_data[2],
                           f->data[2] + uvoff + 31 * ls_uv,

                // loopfilter one row
                if (s->filter.level) {
                    lflvl_ptr = s->lflvl;
                    for (col = 0; col < s->cols;
                         col += 8, yoff2 += 64, uvoff2 += 32, lflvl_ptr++) {
                        loopfilter_sb(ctx, lflvl_ptr, row, col, yoff2, uvoff2);

                // FIXME maybe we can make this more finegrained by running the
                // loopfilter per-block instead of after each sbrow
                // In fact that would also make intra pred left preparation easier?
                ff_thread_report_progress(&s->frames[CUR_FRAME].tf, row >> 3, 0);

        // serial context refresh: backward-adapt probabilities after pass 1
        if (s->pass < 2 && s->refreshctx && !s->parallelmode) {
            ff_thread_finish_setup(ctx);
    } while (s->pass++ == 1);
    ff_thread_report_progress(&s->frames[CUR_FRAME].tf, INT_MAX, 0);

    // promote next_refs into the active reference slots
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(ctx, &s->refs[i]);
        ff_thread_ref_frame(&s->refs[i], &s->next_refs[i]);

    // output the decoded frame unless it is a non-displayed (invisible) frame
    if (!s->invisible) {
        if ((res = av_frame_ref(frame, s->frames[CUR_FRAME].tf.f)) < 0)
3959 static void vp9_decode_flush(AVCodecContext *ctx)
3961 VP9Context *s = ctx->priv_data;
3964 for (i = 0; i < 2; i++)
3965 vp9_unref_frame(ctx, &s->frames[i]);
3966 for (i = 0; i < 8; i++)
3967 ff_thread_release_buffer(ctx, &s->refs[i]);
3970 static int init_frames(AVCodecContext *ctx)
3972 VP9Context *s = ctx->priv_data;
3975 for (i = 0; i < 2; i++) {
3976 s->frames[i].tf.f = av_frame_alloc();
3977 if (!s->frames[i].tf.f) {
3978 vp9_decode_free(ctx);
3979 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3980 return AVERROR(ENOMEM);
3983 for (i = 0; i < 8; i++) {
3984 s->refs[i].f = av_frame_alloc();
3985 s->next_refs[i].f = av_frame_alloc();
3986 if (!s->refs[i].f || !s->next_refs[i].f) {
3987 vp9_decode_free(ctx);
3988 av_log(ctx, AV_LOG_ERROR, "Failed to allocate frame buffer %d\n", i);
3989 return AVERROR(ENOMEM);
3996 static av_cold int vp9_decode_init(AVCodecContext *ctx)
3998 VP9Context *s = ctx->priv_data;
4000 ctx->internal->allocate_progress = 1;
4001 ctx->pix_fmt = AV_PIX_FMT_YUV420P;
4002 ff_vp9dsp_init(&s->dsp);
4003 ff_videodsp_init(&s->vdsp, 8);
4004 s->filter.sharpness = -1;
4006 return init_frames(ctx);
// Frame-thread worker init: each worker context only needs its own frame
// shells; all other state is copied in update_thread_context().
static av_cold int vp9_decode_init_thread_copy(AVCodecContext *avctx)
    return init_frames(avctx);
/**
 * Frame-threading sync: copy decoding state from the source thread's
 * context into this one — frame references, reference slots (taking the
 * source's *next* refs, i.e. its post-frame state), probability contexts,
 * loop-filter deltas and segmentation parameters.
 *
 * NOTE(review): several lines are missing from this excerpt (declarations
 * of i/res, the size-change branch body — presumably free_buffers() —
 * closing braces, and the trailing return). Comments only added.
 */
static int vp9_decode_update_thread_context(AVCodecContext *dst, const AVCodecContext *src)
    VP9Context *s = dst->priv_data, *ssrc = src->priv_data;

    // detect size changes in other threads
    if (s->intra_pred_data[0] &&
        (!ssrc->intra_pred_data[0] || s->cols != ssrc->cols || s->rows != ssrc->rows)) {

    // take references to the source thread's work frames
    for (i = 0; i < 2; i++) {
        if (s->frames[i].tf.f->data[0])
            vp9_unref_frame(dst, &s->frames[i]);
        if (ssrc->frames[i].tf.f->data[0]) {
            if ((res = vp9_ref_frame(dst, &s->frames[i], &ssrc->frames[i])) < 0)
    // adopt the source's next_refs as our active reference set
    for (i = 0; i < 8; i++) {
        if (s->refs[i].f->data[0])
            ff_thread_release_buffer(dst, &s->refs[i]);
        if (ssrc->next_refs[i].f->data[0]) {
            if ((res = ff_thread_ref_frame(&s->refs[i], &ssrc->next_refs[i])) < 0)

    // scalar/derived state needed before this thread parses its own header
    s->invisible = ssrc->invisible;
    s->keyframe = ssrc->keyframe;
    s->uses_2pass = ssrc->uses_2pass;
    memcpy(&s->prob_ctx, &ssrc->prob_ctx, sizeof(s->prob_ctx));
    memcpy(&s->lf_delta, &ssrc->lf_delta, sizeof(s->lf_delta));
    if (ssrc->segmentation.enabled) {
        memcpy(&s->segmentation.feat, &ssrc->segmentation.feat,
               sizeof(s->segmentation.feat));
4055 AVCodec ff_vp9_decoder = {
4057 .long_name = NULL_IF_CONFIG_SMALL("Google VP9"),
4058 .type = AVMEDIA_TYPE_VIDEO,
4059 .id = AV_CODEC_ID_VP9,
4060 .priv_data_size = sizeof(VP9Context),
4061 .init = vp9_decode_init,
4062 .close = vp9_decode_free,
4063 .decode = vp9_decode_frame,
4064 .capabilities = CODEC_CAP_DR1 | CODEC_CAP_FRAME_THREADS,
4065 .flush = vp9_decode_flush,
4066 .init_thread_copy = ONLY_IF_THREADS_ENABLED(vp9_decode_init_thread_copy),
4067 .update_thread_context = ONLY_IF_THREADS_ENABLED(vp9_decode_update_thread_context),