git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18  *
  19  */
  20
  21 /**
  22  * @file h264.c
  23  * H.264 / AVC / MPEG4 part10 codec.
  24  * @author Michael Niedermayer <michaelni@gmx.at>
  25  */
  26
  27 #include "common.h"
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264data.h"
  32 #include "golomb.h"
  33
  34 #include "cabac.h"
  35
  36 //#undef NDEBUG
  37 #include <assert.h>
  38
  39 #define interlaced_dct interlaced_dct_is_a_bad_name
  40 #define mb_intra mb_intra_isnt_initalized_see_mb_type
     /* The two defines above rename interlaced_dct and mb_intra for the rest of
      * this file: every later use expands to the long name on the right.
      * NOTE(review): presumably meant to make accidental use of those fields
      * fail to compile -- confirm. */
  41
     /* NOTE(review): block indices for the luma / chroma DC blocks; their users
      * are outside this part of the file -- confirm which array they index. */
  42 #define LUMA_DC_BLOCK_INDEX   25
  43 #define CHROMA_DC_BLOCK_INDEX 26
  44
     /* One bit count per static VLC table declared further down
      * (coeff_token_vlc, total_zeros_vlc, run_vlc, ...).
      * NOTE(review): presumably the lookup width handed to the VLC init/read
      * calls; those calls are not visible in this part of the file. */
  45 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  46 #define COEFF_TOKEN_VLC_BITS           8
  47 #define TOTAL_ZEROS_VLC_BITS           9
  48 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  49 #define RUN_VLC_BITS                   3
  50 #define RUN7_VLC_BITS                  6
  51
  52 #define MAX_SPS_COUNT 32 ///< number of entries in H264Context.sps_buffer
  53 #define MAX_PPS_COUNT 256 ///< number of entries in H264Context.pps_buffer
  54
  55 #define MAX_MMCO_COUNT 66 ///< number of entries in H264Context.mmco
  56
  57 /* Compiling in interlaced support reduces the speed
  58  * of progressive decoding by about 2%. */
  59 #define ALLOW_INTERLACE
  60
     /* With ALLOW_INTERLACE the three macros below read the interlacing flags of
      * a H264Context pointer that must be in scope under the name h. Without it
      * they are the constant 0 and IS_INTERLACED() is redefined to 0 as well, so
      * code guarded by them can be dropped by the compiler. */
  61 #ifdef ALLOW_INTERLACE
  62 #define MB_MBAFF h->mb_mbaff
  63 #define MB_FIELD h->mb_field_decoding_flag
  64 #define FRAME_MBAFF h->mb_aff_frame
  65 #else
  66 #define MB_MBAFF 0
  67 #define MB_FIELD 0
  68 #define FRAME_MBAFF 0
  69 #undef  IS_INTERLACED
  70 #define IS_INTERLACED(mb_type) 0
  71 #endif
  72
  73 /**
  74  * Sequence parameter set
      *
      * The trailing comments name the bitstream syntax element behind each
      * member. Where such a comment reads "x_minusN + N", the member is
      * documented as holding the adjusted value rather than the raw element.
      * H264Context keeps MAX_SPS_COUNT of these in sps_buffer plus a copy of the
      * current one in sps.
  75  */
  76 typedef struct SPS{
  77
  78     int profile_idc;
  79     int level_idc;
  80     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  81     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  82     int poc_type;                      ///< pic_order_cnt_type
  83     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4 (NOTE(review): no "+ 4" here, unlike log2_max_frame_num -- confirm against the SPS parser which value is stored)
  84     int delta_pic_order_always_zero_flag;
  85     int offset_for_non_ref_pic;
  86     int offset_for_top_to_bottom_field;
  87     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  88     int ref_frame_count;               ///< num_ref_frames
  89     int gaps_in_frame_num_allowed_flag;
  90     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  91     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  92     int frame_mbs_only_flag;
  93     int mb_aff;                        ///<mb_adaptive_frame_field_flag
  94     int direct_8x8_inference_flag;
  95     int crop;                   ///< frame_cropping_flag
  96     int crop_left;              ///< frame_cropping_rect_left_offset
  97     int crop_right;             ///< frame_cropping_rect_right_offset
  98     int crop_top;               ///< frame_cropping_rect_top_offset
  99     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
 100     int vui_parameters_present_flag;
 101     AVRational sar;
 102     int timing_info_present_flag;
 103     uint32_t num_units_in_tick;
 104     uint32_t time_scale;
 105     int fixed_frame_rate_flag;
 106     short offset_for_ref_frame[256]; //FIXME allocate dynamically? (NOTE(review): presumably poc_cycle_length entries are in use -- confirm)
 107     int bitstream_restriction_flag;
 108     int num_reorder_frames;
 109     int scaling_matrix_present;
 110     uint8_t scaling_matrix4[6][16];    ///< 6 lists of 16 entries, same shape as PPS.scaling_matrix4
 111     uint8_t scaling_matrix8[2][64];    ///< 2 lists of 64 entries, same shape as PPS.scaling_matrix8
 112 }SPS;
 113
 114 /**
 115  * Picture parameter set
      *
      * The trailing comments name the bitstream syntax element behind each
      * member; "x_minusN + N" means the member is documented as holding the
      * adjusted value. H264Context keeps MAX_PPS_COUNT of these in pps_buffer
      * plus a copy of the current one in pps.
 116  */
 117 typedef struct PPS{
 118     int sps_id;                 ///< NOTE(review): presumably selects the entry of H264Context.sps_buffer this PPS refers to -- confirm
 119     int cabac;                  ///< entropy_coding_mode_flag
 120     int pic_order_present;      ///< pic_order_present_flag
 121     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 122     int mb_slice_group_map_type;
 123     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 124     int weighted_pred;          ///< weighted_pred_flag
 125     int weighted_bipred_idc;
 126     int init_qp;                ///< pic_init_qp_minus26 + 26
 127     int init_qs;                ///< pic_init_qs_minus26 + 26
 128     int chroma_qp_index_offset;
 129     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 130     int constrained_intra_pred; ///< constrained_intra_pred_flag
 131     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 132     int transform_8x8_mode;     ///< transform_8x8_mode_flag
 133     uint8_t scaling_matrix4[6][16]; ///< 6 lists of 16 entries, same shape as SPS.scaling_matrix4
 134     uint8_t scaling_matrix8[2][64]; ///< 2 lists of 64 entries, same shape as SPS.scaling_matrix8
 135 }PPS;
 136
 137 /**
 138  * Memory management control operation opcode.
      *
      * MMCO_END is explicitly 0; the remaining enumerators follow consecutively
      * (1..6) in declaration order.
      * NOTE(review): the numbering presumably matches the opcode values coded in
      * the bitstream -- confirm against the MMCO parser before reordering.
 139  */
 140 typedef enum MMCOOpcode{
 141     MMCO_END=0,
 142     MMCO_SHORT2UNUSED,     ///< = 1
 143     MMCO_LONG2UNUSED,      ///< = 2
 144     MMCO_SHORT2LONG,       ///< = 3
 145     MMCO_SET_MAX_LONG,     ///< = 4
 146     MMCO_RESET,            ///< = 5
 147     MMCO_LONG,             ///< = 6
 148 } MMCOOpcode;
 149
 150 /**
 151  * Memory management control operation.
      *
      * Element type of the H264Context.mmco[] buffer (MAX_MMCO_COUNT entries).
 152  */
 153 typedef struct MMCO{
 154     MMCOOpcode opcode;     ///< which operation this entry describes
 155     int short_frame_num;   ///< NOTE(review): presumably only meaningful for the MMCO_SHORT2* opcodes -- confirm against the MMCO parser
 156     int long_index;        ///< NOTE(review): presumably only meaningful for opcodes that involve a long-term index -- confirm
 157 } MMCO;
 158
 159 /**
 160  * H264Context
      *
      * Per-codec-instance state of the H.264 code in this file. The generic
      * MpegEncContext is embedded as the first member (s); the rest holds the
      * parameter sets, the per-macroblock neighbour caches filled by
      * fill_caches(), reference picture lists, POC / weighted-prediction state
      * and the CABAC context.
 161  */
 162 typedef struct H264Context{
 163     MpegEncContext s;      ///< embedded generic context, first member
 164     int nal_ref_idc;
 165     int nal_unit_type;
     /* NAL_* constants, defined next to nal_unit_type.
      * NOTE(review): presumably the values nal_unit_type can take -- confirm
      * against the NAL parser. */
 166 #define NAL_SLICE                1
 167 #define NAL_DPA                  2
 168 #define NAL_DPB                  3
 169 #define NAL_DPC                  4
 170 #define NAL_IDR_SLICE            5
 171 #define NAL_SEI                  6
 172 #define NAL_SPS                  7
 173 #define NAL_PPS                  8
 174 #define NAL_AUD                  9
 175 #define NAL_END_SEQUENCE        10
 176 #define NAL_END_STREAM          11
 177 #define NAL_FILLER_DATA         12
 178 #define NAL_SPS_EXT             13
 179 #define NAL_AUXILIARY_SLICE     19
 180     uint8_t *rbsp_buffer;
 181     unsigned int rbsp_buffer_size;
 182
 183     /**
 184       * Used to parse AVC variant of h264
 185       */
 186     int is_avc; ///< this flag is != 0 if codec is avc1
 187     int got_avcC; ///< flag used to parse avcC data only once
 188     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 189
 190     int chroma_qp; //QPc
 191
 192     int prev_mb_skipped;
 193     int next_mb_skipped;
 194
 195     //prediction stuff
 196     int chroma_pred_mode;
 197     int intra16x16_pred_mode;
 198
 199     int top_mb_xy;         ///< set by fill_caches() to the macroblock index used as top neighbour
 200     int left_mb_xy[2];     ///< set by fill_caches() to the macroblock indices used as left neighbours
 201
 202     int8_t intra4x4_pred_mode_cache[5*8];
 203     int8_t (*intra4x4_pred_mode)[8];
 204     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 205     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 206     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 207     void (*pred16x16[4+3])(uint8_t *src, int stride);
 208     unsigned int topleft_samples_available;
 209     unsigned int top_samples_available;
 210     unsigned int topright_samples_available;
 211     unsigned int left_samples_available;
 212     uint8_t (*top_borders[2])[16+2*8];
 213     uint8_t left_border[2*(17+2*9)];
 214
 215     /**
 216      * non zero coeff count cache.
 217      * is 64 if not available.
      * (fill_caches() stores 0 instead of 64 for an unavailable neighbour when
      * CABAC is in use and the current macroblock is not intra.)
 218      */
 219     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
 220     uint8_t (*non_zero_count)[16];
 221
 222     /**
 223      * Motion vector cache.
      * Indexed [list][position][component]; fill_caches() addresses positions
      * relative to scan8[] with a row stride of 8.
 224      */
 225     DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 226     DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
 227 #define LIST_NOT_USED -1 //FIXME rename?
 228 #define PART_NOT_AVAILABLE -2
 229
 230     /**
 231      * is 1 if the specific list MV&references are set to 0,0,-2.
 232      */
 233     int mv_cache_clean[2];
 234
 235     /**
 236      * number of neighbors (top and/or left) that used 8x8 dct
 237      */
 238     int neighbor_transform_size;
 239
 240     /**
 241      * block_offset[ 0..23] for frame macroblocks
 242      * block_offset[24..47] for field macroblocks
 243      */
 244     int block_offset[2*(16+8)];
 245
 246     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 247     uint32_t *mb2b8_xy;
 248     int b_stride; //FIXME use s->b4_stride
 249     int b8_stride;
 250
 251     int mb_linesize;   ///< may be equal to s->linesize or s->linesize*2, for mbaff
 252     int mb_uvlinesize;
 253
 254     int emu_edge_width;
 255     int emu_edge_height;
 256
 257     int halfpel_flag;
 258     int thirdpel_flag;
 259
 260     int unknown_svq3_flag;
 261     int next_slice_index;
 262
 263     SPS sps_buffer[MAX_SPS_COUNT];
 264     SPS sps; ///< current sps
 265
 266     PPS pps_buffer[MAX_PPS_COUNT];
 267     /**
 268      * current pps
 269      */
 270     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 271
 272     uint32_t dequant4_buffer[6][52][16];
 273     uint32_t dequant8_buffer[2][52][64];
 274     uint32_t (*dequant4_coeff[6])[16];
 275     uint32_t (*dequant8_coeff[2])[64];
 276     int dequant_coeff_pps;     ///< reinit tables when pps changes
 277
 278     int slice_num;
 279     uint8_t *slice_table_base;
 280     uint8_t *slice_table;      ///< slice_table_base + 2*mb_stride + 1
 281     int slice_type;
 282     int slice_type_fixed;
 283
 284     //interlacing specific flags
 285     int mb_aff_frame;          ///< read through the FRAME_MBAFF macro
 286     int mb_field_decoding_flag; ///< read through the MB_FIELD macro
 287     int mb_mbaff;              ///< mb_aff_frame && mb_field_decoding_flag
 288
 289     int sub_mb_type[4];
 290
 291     //POC stuff
 292     int poc_lsb;
 293     int poc_msb;
 294     int delta_poc_bottom;
 295     int delta_poc[2];
 296     int frame_num;
 297     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 298     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 299     int frame_num_offset;         ///< for POC type 2
 300     int prev_frame_num_offset;    ///< for POC type 2
 301     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 302
 303     /**
 304      * frame_num for frames or 2*frame_num for field pics.
 305      */
 306     int curr_pic_num;
 307
 308     /**
 309      * max_frame_num or 2*max_frame_num for field pics.
 310      */
 311     int max_pic_num;
 312
 313     //Weighted pred stuff
 314     int use_weight;
 315     int use_weight_chroma;
 316     int luma_log2_weight_denom;
 317     int chroma_log2_weight_denom;
 318     int luma_weight[2][48];
 319     int luma_offset[2][48];
 320     int chroma_weight[2][48][2];
 321     int chroma_offset[2][48][2];
 322     int implicit_weight[48][48];
 323
 324     //deblock
 325     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 326     int slice_alpha_c0_offset;
 327     int slice_beta_offset;
 328
 329     int redundant_pic_count;
 330
 331     int direct_spatial_mv_pred;
 332     int dist_scale_factor[16];
 333     int dist_scale_factor_field[32];
 334     int map_col_to_list0[2][16];
 335     int map_col_to_list0_field[2][32];
 336
 337     /**
 338      * num_ref_idx_l0/1_active_minus1 + 1
 339      */
 340     int ref_count[2];            ///< counts frames or fields, depending on current mb mode
 341     Picture *short_ref[32];
 342     Picture *long_ref[32];
 343     Picture default_ref_list[2][32];
 344     Picture ref_list[2][48];     ///< 0..15: frame refs, 16..47: mbaff field refs
 345     Picture *delayed_pic[16]; //FIXME size?
 346     Picture *delayed_output_pic;
 347
 348     /**
 349      * memory management control operations buffer.
 350      */
 351     MMCO mmco[MAX_MMCO_COUNT];
 352     int mmco_index;
 353
 354     int long_ref_count;  ///< number of actual long term references
 355     int short_ref_count; ///< number of actual short term references
 356
 357     //data partitioning
 358     GetBitContext intra_gb;
 359     GetBitContext inter_gb;
 360     GetBitContext *intra_gb_ptr;
 361     GetBitContext *inter_gb_ptr;
 362
 363     DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
 364
 365     /**
 366      * Cabac
 367      */
 368     CABACContext cabac;
 369     uint8_t      cabac_state[460];
 370     int          cabac_init_idc;
 371
 372     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 373     uint16_t     *cbp_table;
 374     int top_cbp;           ///< set by fill_caches() when pps.cabac is set
 375     int left_cbp;          ///< set by fill_caches() when pps.cabac is set
 376     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 377     uint8_t     *chroma_pred_mode_table;
 378     int         last_qscale_diff;
 379     int16_t     (*mvd_table[2])[2];
 380     DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
 381     uint8_t     *direct_table;
 382     uint8_t     direct_cache[5*8];
 383
 384     uint8_t zigzag_scan[16];
 385     uint8_t zigzag_scan8x8[64];
 386     uint8_t zigzag_scan8x8_cavlc[64];
 387     uint8_t field_scan[16];
 388     uint8_t field_scan8x8[64];
 389     uint8_t field_scan8x8_cavlc[64];
 390     const uint8_t *zigzag_scan_q0;
 391     const uint8_t *zigzag_scan8x8_q0;
 392     const uint8_t *zigzag_scan8x8_cavlc_q0;
 393     const uint8_t *field_scan_q0;
 394     const uint8_t *field_scan8x8_q0;
 395     const uint8_t *field_scan8x8_cavlc_q0;
 396
 397     int x264_build;
 398 }H264Context;
 399
 400 static VLC coeff_token_vlc[4];
 401 static VLC chroma_dc_coeff_token_vlc;
     /* File-scope VLC tables (this group and the two below). Their
      * initialisation is not in this part of the file.
      * NOTE(review): the *_VLC_BITS macros near the top of the file are
      * presumably the bit widths of these tables -- confirm. */
 402
 403 static VLC total_zeros_vlc[15];
 404 static VLC chroma_dc_total_zeros_vlc[3];
 405
 406 static VLC run_vlc[6];
 407 static VLC run7_vlc;
 408
     /* Forward declarations of file-local functions; the definitions are not in
      * this part of the file. */
 409 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 410 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 411 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 412
 413 static always_inline uint32_t pack16to32(int a, int b){
 414 #ifdef WORDS_BIGENDIAN
 415    return (b&0xFFFF) + (a<<16);
 416 #else
 417    return (a&0xFFFF) + (b<<16);
 418 #endif
 419 }
 420
 421 /**
 422  * fill a rectangle.
 423  * @param h height of the rectangle, should be a constant
 424  * @param w width of the rectangle, should be a constant
 425  * @param size the size of val (1 or 4), should be a constant
 426  */
 427 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
 428     uint8_t *p= (uint8_t*)vp;
 429     assert(size==1 || size==4);
 430     assert(w<=4);
 431
 432     w      *= size;
 433     stride *= size;
 434
 435     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 436     assert((stride&(w-1))==0);
 437     if(w==2){
 438         const uint16_t v= size==4 ? val : val*0x0101;
 439         *(uint16_t*)(p + 0*stride)= v;
 440         if(h==1) return;
 441         *(uint16_t*)(p + 1*stride)= v;
 442         if(h==2) return;
 443         *(uint16_t*)(p + 2*stride)=
 444         *(uint16_t*)(p + 3*stride)= v;
 445     }else if(w==4){
 446         const uint32_t v= size==4 ? val : val*0x01010101;
 447         *(uint32_t*)(p + 0*stride)= v;
 448         if(h==1) return;
 449         *(uint32_t*)(p + 1*stride)= v;
 450         if(h==2) return;
 451         *(uint32_t*)(p + 2*stride)=
 452         *(uint32_t*)(p + 3*stride)= v;
 453     }else if(w==8){
 454     //gcc can't optimize 64bit math on x86_32
 455 #if defined(ARCH_X86_64) || (defined(MP_WORDSIZE) && MP_WORDSIZE >= 64)
 456         const uint64_t v= val*0x0100000001ULL;
 457         *(uint64_t*)(p + 0*stride)= v;
 458         if(h==1) return;
 459         *(uint64_t*)(p + 1*stride)= v;
 460         if(h==2) return;
 461         *(uint64_t*)(p + 2*stride)=
 462         *(uint64_t*)(p + 3*stride)= v;
 463     }else if(w==16){
 464         const uint64_t v= val*0x0100000001ULL;
 465         *(uint64_t*)(p + 0+0*stride)=
 466         *(uint64_t*)(p + 8+0*stride)=
 467         *(uint64_t*)(p + 0+1*stride)=
 468         *(uint64_t*)(p + 8+1*stride)= v;
 469         if(h==2) return;
 470         *(uint64_t*)(p + 0+2*stride)=
 471         *(uint64_t*)(p + 8+2*stride)=
 472         *(uint64_t*)(p + 0+3*stride)=
 473         *(uint64_t*)(p + 8+3*stride)= v;
 474 #else
 475         *(uint32_t*)(p + 0+0*stride)=
 476         *(uint32_t*)(p + 4+0*stride)= val;
 477         if(h==1) return;
 478         *(uint32_t*)(p + 0+1*stride)=
 479         *(uint32_t*)(p + 4+1*stride)= val;
 480         if(h==2) return;
 481         *(uint32_t*)(p + 0+2*stride)=
 482         *(uint32_t*)(p + 4+2*stride)=
 483         *(uint32_t*)(p + 0+3*stride)=
 484         *(uint32_t*)(p + 4+3*stride)= val;
 485     }else if(w==16){
 486         *(uint32_t*)(p + 0+0*stride)=
 487         *(uint32_t*)(p + 4+0*stride)=
 488         *(uint32_t*)(p + 8+0*stride)=
 489         *(uint32_t*)(p +12+0*stride)=
 490         *(uint32_t*)(p + 0+1*stride)=
 491         *(uint32_t*)(p + 4+1*stride)=
 492         *(uint32_t*)(p + 8+1*stride)=
 493         *(uint32_t*)(p +12+1*stride)= val;
 494         if(h==2) return;
 495         *(uint32_t*)(p + 0+2*stride)=
 496         *(uint32_t*)(p + 4+2*stride)=
 497         *(uint32_t*)(p + 8+2*stride)=
 498         *(uint32_t*)(p +12+2*stride)=
 499         *(uint32_t*)(p + 0+3*stride)=
 500         *(uint32_t*)(p + 4+3*stride)=
 501         *(uint32_t*)(p + 8+3*stride)=
 502         *(uint32_t*)(p +12+3*stride)= val;
 503 #endif
 504     }else
 505         assert(0);
 506     assert(h==4);
 507 }
 508
 509 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 510     MpegEncContext * const s = &h->s;
 511     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 512     int topleft_xy, top_xy, topright_xy, left_xy[2];
 513     int topleft_type, top_type, topright_type, left_type[2];
 514     int left_block[8];
 515     int i;
 516
 517     //FIXME deblocking can skip fill_caches much of the time with multiple slices too.
 518     // the actual condition is whether we're on the edge of a slice,
 519     // and even then the intra and nnz parts are unnecessary.
 520     if(for_deblock && h->slice_num == 1 && !FRAME_MBAFF)
 521         return;
 522
 523     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 524
 525     top_xy     = mb_xy  - s->mb_stride;
 526     topleft_xy = top_xy - 1;
 527     topright_xy= top_xy + 1;
 528     left_xy[1] = left_xy[0] = mb_xy-1;
 529     left_block[0]= 0;
 530     left_block[1]= 1;
 531     left_block[2]= 2;
 532     left_block[3]= 3;
 533     left_block[4]= 7;
 534     left_block[5]= 10;
 535     left_block[6]= 8;
 536     left_block[7]= 11;
 537     if(FRAME_MBAFF){
 538         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 539         const int top_pair_xy      = pair_xy     - s->mb_stride;
 540         const int topleft_pair_xy  = top_pair_xy - 1;
 541         const int topright_pair_xy = top_pair_xy + 1;
 542         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 543         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 544         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 545         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 546         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 547         const int bottom = (s->mb_y & 1);
 548         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 549         if (bottom
 550                 ? !curr_mb_frame_flag // bottom macroblock
 551                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 552                 ) {
 553             top_xy -= s->mb_stride;
 554         }
 555         if (bottom
 556                 ? !curr_mb_frame_flag // bottom macroblock
 557                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 558                 ) {
 559             topleft_xy -= s->mb_stride;
 560         }
 561         if (bottom
 562                 ? !curr_mb_frame_flag // bottom macroblock
 563                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 564                 ) {
 565             topright_xy -= s->mb_stride;
 566         }
 567         if (left_mb_frame_flag != curr_mb_frame_flag) {
 568             left_xy[1] = left_xy[0] = pair_xy - 1;
 569             if (curr_mb_frame_flag) {
 570                 if (bottom) {
 571                     left_block[0]= 2;
 572                     left_block[1]= 2;
 573                     left_block[2]= 3;
 574                     left_block[3]= 3;
 575                     left_block[4]= 8;
 576                     left_block[5]= 11;
 577                     left_block[6]= 8;
 578                     left_block[7]= 11;
 579                 } else {
 580                     left_block[0]= 0;
 581                     left_block[1]= 0;
 582                     left_block[2]= 1;
 583                     left_block[3]= 1;
 584                     left_block[4]= 7;
 585                     left_block[5]= 10;
 586                     left_block[6]= 7;
 587                     left_block[7]= 10;
 588                 }
 589             } else {
 590                 left_xy[1] += s->mb_stride;
 591                 //left_block[0]= 0;
 592                 left_block[1]= 2;
 593                 left_block[2]= 0;
 594                 left_block[3]= 2;
 595                 //left_block[4]= 7;
 596                 left_block[5]= 10;
 597                 left_block[6]= 7;
 598                 left_block[7]= 10;
 599             }
 600         }
 601     }
 602
 603     h->top_mb_xy = top_xy;
 604     h->left_mb_xy[0] = left_xy[0];
 605     h->left_mb_xy[1] = left_xy[1];
 606     if(for_deblock){
 607         topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
 608         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 609         topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
 610         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 611         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 612
 613         if(FRAME_MBAFF && !IS_INTRA(mb_type)){
 614             int list;
 615             int v = *(uint16_t*)&h->non_zero_count[mb_xy][14];
 616             for(i=0; i<16; i++)
 617                 h->non_zero_count_cache[scan8[i]] = (v>>i)&1;
 618             for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 619                 if(USES_LIST(mb_type,list)){
 620                     uint32_t *src = (uint32_t*)s->current_picture.motion_val[list][h->mb2b_xy[mb_xy]];
 621                     uint32_t *dst = (uint32_t*)h->mv_cache[list][scan8[0]];
 622                     uint8_t *ref = &s->current_picture.ref_index[list][h->mb2b8_xy[mb_xy]];
 623                     for(i=0; i<4; i++, dst+=8, src+=h->b_stride){
 624                         dst[0] = src[0];
 625                         dst[1] = src[1];
 626                         dst[2] = src[2];
 627                         dst[3] = src[3];
 628                     }
 629                     *(uint32_t*)&h->ref_cache[list][scan8[ 0]] =
 630                     *(uint32_t*)&h->ref_cache[list][scan8[ 2]] = pack16to32(ref[0],ref[1])*0x0101;
 631                     ref += h->b8_stride;
 632                     *(uint32_t*)&h->ref_cache[list][scan8[ 8]] =
 633                     *(uint32_t*)&h->ref_cache[list][scan8[10]] = pack16to32(ref[0],ref[1])*0x0101;
 634                 }else{
 635                     fill_rectangle(&h-> mv_cache[list][scan8[ 0]], 4, 4, 8, 0, 4);
 636                     fill_rectangle(&h->ref_cache[list][scan8[ 0]], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
 637                 }
 638             }
 639         }
 640     }else{
 641         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 642         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 643         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 644         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 645         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 646     }
 647
 648     if(IS_INTRA(mb_type)){
 649         h->topleft_samples_available=
 650         h->top_samples_available=
 651         h->left_samples_available= 0xFFFF;
 652         h->topright_samples_available= 0xEEEA;
 653
 654         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 655             h->topleft_samples_available= 0xB3FF;
 656             h->top_samples_available= 0x33FF;
 657             h->topright_samples_available= 0x26EA;
 658         }
 659         for(i=0; i<2; i++){
 660             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 661                 h->topleft_samples_available&= 0xDF5F;
 662                 h->left_samples_available&= 0x5F5F;
 663             }
 664         }
 665
 666         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 667             h->topleft_samples_available&= 0x7FFF;
 668
 669         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 670             h->topright_samples_available&= 0xFBFF;
 671
 672         if(IS_INTRA4x4(mb_type)){
 673             if(IS_INTRA4x4(top_type)){
 674                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 675                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 676                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 677                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 678             }else{
 679                 int pred;
 680                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 681                     pred= -1;
 682                 else{
 683                     pred= 2;
 684                 }
 685                 h->intra4x4_pred_mode_cache[4+8*0]=
 686                 h->intra4x4_pred_mode_cache[5+8*0]=
 687                 h->intra4x4_pred_mode_cache[6+8*0]=
 688                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 689             }
 690             for(i=0; i<2; i++){
 691                 if(IS_INTRA4x4(left_type[i])){
 692                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 693                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 694                 }else{
 695                     int pred;
 696                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 697                         pred= -1;
 698                     else{
 699                         pred= 2;
 700                     }
 701                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 702                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 703                 }
 704             }
 705         }
 706     }
 707
 708
 709 /*
 710 0 . T T. T T T T
 711 1 L . .L . . . .
 712 2 L . .L . . . .
 713 3 . T TL . . . .
 714 4 L . .L . . . .
 715 5 L . .. . . . .
 716 */
 717 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 718     if(top_type){
 719         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 720         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 721         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 722         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 723
 724         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 725         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 726
 727         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 728         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 729
 730     }else{
 731         h->non_zero_count_cache[4+8*0]=
 732         h->non_zero_count_cache[5+8*0]=
 733         h->non_zero_count_cache[6+8*0]=
 734         h->non_zero_count_cache[7+8*0]=
 735
 736         h->non_zero_count_cache[1+8*0]=
 737         h->non_zero_count_cache[2+8*0]=
 738
 739         h->non_zero_count_cache[1+8*3]=
 740         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 741
 742     }
 743
 744     for (i=0; i<2; i++) {
 745         if(left_type[i]){
 746             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 747             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 748             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 749             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 750         }else{
 751             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 752             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 753             h->non_zero_count_cache[0+8*1 +   8*i]=
 754             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 755         }
 756     }
 757
 758     if( h->pps.cabac ) {
 759         // top_cbp
 760         if(top_type) {
 761             h->top_cbp = h->cbp_table[top_xy];
 762         } else if(IS_INTRA(mb_type)) {
 763             h->top_cbp = 0x1C0;
 764         } else {
 765             h->top_cbp = 0;
 766         }
 767         // left_cbp
 768         if (left_type[0]) {
 769             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 770         } else if(IS_INTRA(mb_type)) {
 771             h->left_cbp = 0x1C0;
 772         } else {
 773             h->left_cbp = 0;
 774         }
 775         if (left_type[0]) {
 776             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 777         }
 778         if (left_type[1]) {
 779             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 780         }
 781     }
 782
 783 #if 1
 784     //FIXME direct mb can skip much of this
 785     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 786         int list;
 787         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 788             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 789                 /*if(!h->mv_cache_clean[list]){
 790                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 791                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 792                     h->mv_cache_clean[list]= 1;
 793                 }*/
 794                 continue;
 795             }
 796             h->mv_cache_clean[list]= 0;
 797
 798             if(USES_LIST(top_type, list)){
 799                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 800                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 801                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 802                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 803                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 804                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 805                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 806                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 807                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 808                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 809             }else{
 810                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 811                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 812                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 813                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 814                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 815             }
 816
 817             //FIXME unify cleanup or sth
 818             if(USES_LIST(left_type[0], list)){
 819                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 820                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 821                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 822                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 823                 h->ref_cache[list][scan8[0] - 1 + 0*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 824                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[1]>>1)];
 825             }else{
 826                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 827                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 828                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 829                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 830             }
 831
 832             if(USES_LIST(left_type[1], list)){
 833                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 834                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 835                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 836                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 837                 h->ref_cache[list][scan8[0] - 1 + 2*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 838                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[3]>>1)];
 839             }else{
 840                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 841                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 842                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 843                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 844                 assert((!left_type[0]) == (!left_type[1]));
 845             }
 846
 847             if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
 848                 continue;
 849
 850             if(USES_LIST(topleft_type, list)){
 851                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 852                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 853                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 854                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 855             }else{
 856                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 857                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 858             }
 859
 860             if(USES_LIST(topright_type, list)){
 861                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 862                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 863                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 864                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 865             }else{
 866                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 867                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 868             }
 869
 870
 871             h->ref_cache[list][scan8[5 ]+1] =
 872             h->ref_cache[list][scan8[7 ]+1] =
 873             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 874             h->ref_cache[list][scan8[4 ]] =
 875             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 876             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 877             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 878             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 879             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 880             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 881
 882             if( h->pps.cabac ) {
 883                 /* XXX beurk, Load mvd */
 884                 if(USES_LIST(top_type, list)){
 885                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 886                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 887                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 888                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 889                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 890                 }else{
 891                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 892                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 893                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 894                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 895                 }
 896                 if(USES_LIST(left_type[0], list)){
 897                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 898                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 899                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 900                 }else{
 901                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 902                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 903                 }
 904                 if(USES_LIST(left_type[1], list)){
 905                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 906                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 907                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 908                 }else{
 909                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 910                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 911                 }
 912                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 913                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 914                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 915                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 916                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 917
 918                 if(h->slice_type == B_TYPE){
 919                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 920
 921                     if(IS_DIRECT(top_type)){
 922                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 923                     }else if(IS_8X8(top_type)){
 924                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 925                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 926                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 927                     }else{
 928                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 929                     }
 930
 931                     if(IS_DIRECT(left_type[0]))
 932                         h->direct_cache[scan8[0] - 1 + 0*8]= 1;
 933                     else if(IS_8X8(left_type[0]))
 934                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[h->mb2b8_xy[left_xy[0]] + 1 + h->b8_stride*(left_block[0]>>1)];
 935                     else
 936                         h->direct_cache[scan8[0] - 1 + 0*8]= 0;
 937
 938                     if(IS_DIRECT(left_type[1]))
 939                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 940                     else if(IS_8X8(left_type[1]))
 941                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[h->mb2b8_xy[left_xy[1]] + 1 + h->b8_stride*(left_block[2]>>1)];
 942                     else
 943                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 944                 }
 945             }
 946
 947             if(FRAME_MBAFF){
 948 #define MAP_MVS\
 949                     MAP_F2F(scan8[0] - 1 - 1*8, topleft_type)\
 950                     MAP_F2F(scan8[0] + 0 - 1*8, top_type)\
 951                     MAP_F2F(scan8[0] + 1 - 1*8, top_type)\
 952                     MAP_F2F(scan8[0] + 2 - 1*8, top_type)\
 953                     MAP_F2F(scan8[0] + 3 - 1*8, top_type)\
 954                     MAP_F2F(scan8[0] + 4 - 1*8, topright_type)\
 955                     MAP_F2F(scan8[0] - 1 + 0*8, left_type[0])\
 956                     MAP_F2F(scan8[0] - 1 + 1*8, left_type[0])\
 957                     MAP_F2F(scan8[0] - 1 + 2*8, left_type[1])\
 958                     MAP_F2F(scan8[0] - 1 + 3*8, left_type[1])
 959                 if(MB_FIELD){
 960 #define MAP_F2F(idx, mb_type)\
 961                     if(!IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 962                         h->ref_cache[list][idx] <<= 1;\
 963                         h->mv_cache[list][idx][1] /= 2;\
 964                         h->mvd_cache[list][idx][1] /= 2;\
 965                     }
 966                     MAP_MVS
 967 #undef MAP_F2F
 968                 }else{
 969 #define MAP_F2F(idx, mb_type)\
 970                     if(IS_INTERLACED(mb_type) && h->ref_cache[list][idx] >= 0){\
 971                         h->ref_cache[list][idx] >>= 1;\
 972                         h->mv_cache[list][idx][1] <<= 1;\
 973                         h->mvd_cache[list][idx][1] <<= 1;\
 974                     }
 975                     MAP_MVS
 976 #undef MAP_F2F
 977                 }
 978             }
 979         }
 980     }
 981 #endif
 982
 983     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 984 }
 985
 986 static inline void write_back_intra_pred_mode(H264Context *h){
 987     MpegEncContext * const s = &h->s;
 988     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 989
 990     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 991     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 992     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 993     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 994     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 995     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 996     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 997 }
 998
 999 /**
1000  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1001  */
1002 static inline int check_intra4x4_pred_mode(H264Context *h){
1003     MpegEncContext * const s = &h->s;
1004     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
1005     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
1006     int i;
1007
1008     if(!(h->top_samples_available&0x8000)){
1009         for(i=0; i<4; i++){
1010             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
1011             if(status<0){
1012                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1013                 return -1;
1014             } else if(status){
1015                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
1016             }
1017         }
1018     }
1019
1020     if(!(h->left_samples_available&0x8000)){
1021         for(i=0; i<4; i++){
1022             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
1023             if(status<0){
1024                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
1025                 return -1;
1026             } else if(status){
1027                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
1028             }
1029         }
1030     }
1031
1032     return 0;
1033 } //FIXME cleanup like next
1034
1035 /**
1036  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
1037  */
1038 static inline int check_intra_pred_mode(H264Context *h, int mode){
1039     MpegEncContext * const s = &h->s;
1040     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
1041     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
1042
1043     if(mode < 0 || mode > 6) {
1044         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
1045         return -1;
1046     }
1047
1048     if(!(h->top_samples_available&0x8000)){
1049         mode= top[ mode ];
1050         if(mode<0){
1051             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1052             return -1;
1053         }
1054     }
1055
1056     if(!(h->left_samples_available&0x8000)){
1057         mode= left[ mode ];
1058         if(mode<0){
1059             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
1060             return -1;
1061         }
1062     }
1063
1064     return mode;
1065 }
1066
1067 /**
1068  * gets the predicted intra4x4 prediction mode.
1069  */
1070 static inline int pred_intra_mode(H264Context *h, int n){
1071     const int index8= scan8[n];
1072     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
1073     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
1074     const int min= FFMIN(left, top);
1075
1076     tprintf("mode:%d %d min:%d\n", left ,top, min);
1077
1078     if(min<0) return DC_PRED;
1079     else      return min;
1080 }
1081
1082 static inline void write_back_non_zero_count(H264Context *h){
1083     MpegEncContext * const s = &h->s;
1084     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
1085
1086     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
1087     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
1088     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
1089     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
1090     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
1091     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
1092     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
1093
1094     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
1095     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
1096     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
1097
1098     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
1099     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
1100     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
1101
1102     if(FRAME_MBAFF){
1103         // store all luma nnzs, for deblocking
1104         int v = 0, i;
1105         for(i=0; i<16; i++)
1106             v += (!!h->non_zero_count_cache[scan8[i]]) << i;
1107         *(uint16_t*)&h->non_zero_count[mb_xy][14] = v;
1108     }
1109 }
1110
1111 /**
1112  * gets the predicted number of non zero coefficients.
1113  * @param n block index
1114  */
1115 static inline int pred_non_zero_count(H264Context *h, int n){
1116     const int index8= scan8[n];
1117     const int left= h->non_zero_count_cache[index8 - 1];
1118     const int top = h->non_zero_count_cache[index8 - 8];
1119     int i= left + top;
1120
1121     if(i<64) i= (i+1)>>1;
1122
1123     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1124
1125     return i&31;
1126 }
1127
/**
 * Fetches the diagonal (C) neighbour used for motion vector prediction.
 * Normally this is the top-right neighbour of the partition, or the top-left
 * one if the top-right reference is PART_NOT_AVAILABLE. With FRAME_MBAFF some
 * neighbours are instead read directly from the current picture, and their
 * vertical MV component and reference index are rescaled.
 * @param C set to point at the selected motion vector
 * @param i cache index (scan8 based) of the top left block of the partition
 * @param part_width the width of the partition in cache entries
 * @return the reference index belonging to *C
 */
static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
    const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];

    /* there is no consistent mapping of mvs to neighboring locations that will
     * make mbaff happy, so we can't move all this logic to fill_caches */
    if(FRAME_MBAFF){
        MpegEncContext *s = &h->s;
        const int *mb_types = s->current_picture_ptr->mb_type;
        const int16_t *mv;
        /* cache slot scan8[0]-2 is used as scratch space for the synthesized MV;
         * it is zeroed and preselected as the result here */
        *(uint32_t*)h->mv_cache[list][scan8[0]-2] = 0;
        *C = h->mv_cache[list][scan8[0]-2];

        /* frame macroblock on an odd row, partition in the first cache row,
         * and the macroblock above (or above-right for the last column) is interlaced */
        if(!MB_FIELD
           && (s->mb_y&1) && i < scan8[0]+8 && topright_ref != PART_NOT_AVAILABLE){
            int topright_xy = s->mb_x + (s->mb_y-1)*s->mb_stride + (i == scan8[0]+3);
            if(IS_INTERLACED(mb_types[topright_xy])){
/* SET_DIAG_MV always returns from fetch_diagonal_mv: it reads the MV at
 * position (X4, Y4) of the current picture (x4>>2, y4>>2 select the macroblock),
 * applies MV_OP to its vertical component and REF_OP to its reference index.
 * It returns LIST_NOT_USED if that macroblock neither uses the list nor is 8x8. */
#define SET_DIAG_MV(MV_OP, REF_OP, X4, Y4)\
                const int x4 = X4, y4 = Y4;\
                const int mb_type = mb_types[(x4>>2)+(y4>>2)*s->mb_stride];\
                if(!USES_LIST(mb_type,list) && !IS_8X8(mb_type))\
                    return LIST_NOT_USED;\
                mv = s->current_picture_ptr->motion_val[list][x4 + y4*h->b_stride];\
                h->mv_cache[list][scan8[0]-2][0] = mv[0];\
                h->mv_cache[list][scan8[0]-2][1] = mv[1] MV_OP;\
                return s->current_picture_ptr->ref_index[list][(x4>>1) + (y4>>1)*h->b8_stride] REF_OP;

                SET_DIAG_MV(*2, >>1, s->mb_x*4+(i&7)-4+part_width, s->mb_y*4-1);
            }
        }
        /* top-right unavailable for a partition in the first cache column while
         * the left neighbour exists: take the diagonal from the left macroblock
         * when its frame/field type differs from the current one */
        if(topright_ref == PART_NOT_AVAILABLE
           && ((s->mb_y&1) || i >= scan8[0]+8) && (i&7)==4
           && h->ref_cache[list][scan8[0]-1] != PART_NOT_AVAILABLE){
            if(!MB_FIELD
               && IS_INTERLACED(mb_types[h->left_mb_xy[0]])){
                SET_DIAG_MV(*2, >>1, s->mb_x*4-1, (s->mb_y|1)*4+(s->mb_y&1)*2+(i>>4)-1);
            }
            if(MB_FIELD
               && !IS_INTERLACED(mb_types[h->left_mb_xy[0]])
               && i >= scan8[0]+8){
                // leftshift will turn LIST_NOT_USED into PART_NOT_AVAILABLE, but that's ok.
                SET_DIAG_MV(>>1, <<1, s->mb_x*4-1, (s->mb_y&~1)*4 - 1 + ((i-scan8[0])>>3)*2);
            }
        }
#undef SET_DIAG_MV
    }

    /* common case: top-right if present, otherwise top-left */
    if(topright_ref != PART_NOT_AVAILABLE){
        *C= h->mv_cache[list][ i - 8 + part_width ];
        return topright_ref;
    }else{
        tprintf("topright MV not available\n");

        *C= h->mv_cache[list][ i - 8 - 1 ];
        return h->ref_cache[list][ i - 8 - 1 ];
    }
}
1184
1185 /**
1186  * gets the predicted MV.
1187  * @param n the block index
1188  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1189  * @param mx the x component of the predicted motion vector
1190  * @param my the y component of the predicted motion vector
1191  */
1192 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1193     const int index8= scan8[n];
1194     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1195     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1196     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1197     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1198     const int16_t * C;
1199     int diagonal_ref, match_count;
1200
1201     assert(part_width==1 || part_width==2 || part_width==4);
1202
1203 /* mv_cache
1204   B . . A T T T T
1205   U . . L . . , .
1206   U . . L . . . .
1207   U . . L . . , .
1208   . . . L . . . .
1209 */
1210
1211     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1212     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1213     tprintf("pred_motion match_count=%d\n", match_count);
1214     if(match_count > 1){ //most common
1215         *mx= mid_pred(A[0], B[0], C[0]);
1216         *my= mid_pred(A[1], B[1], C[1]);
1217     }else if(match_count==1){
1218         if(left_ref==ref){
1219             *mx= A[0];
1220             *my= A[1];
1221         }else if(top_ref==ref){
1222             *mx= B[0];
1223             *my= B[1];
1224         }else{
1225             *mx= C[0];
1226             *my= C[1];
1227         }
1228     }else{
1229         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1230             *mx= A[0];
1231             *my= A[1];
1232         }else{
1233             *mx= mid_pred(A[0], B[0], C[0]);
1234             *my= mid_pred(A[1], B[1], C[1]);
1235         }
1236     }
1237
1238     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1239 }
1240
1241 /**
1242  * gets the directionally predicted 16x8 MV.
1243  * @param n the block index
1244  * @param mx the x component of the predicted motion vector
1245  * @param my the y component of the predicted motion vector
1246  */
1247 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1248     if(n==0){
1249         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1250         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1251
1252         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1253
1254         if(top_ref == ref){
1255             *mx= B[0];
1256             *my= B[1];
1257             return;
1258         }
1259     }else{
1260         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1261         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1262
1263         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1264
1265         if(left_ref == ref){
1266             *mx= A[0];
1267             *my= A[1];
1268             return;
1269         }
1270     }
1271
1272     //RARE
1273     pred_motion(h, n, 4, list, ref, mx, my);
1274 }
1275
1276 /**
1277  * gets the directionally predicted 8x16 MV.
1278  * @param n the block index
1279  * @param mx the x component of the predicted motion vector
1280  * @param my the y component of the predicted motion vector
1281  */
1282 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1283     if(n==0){
1284         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1285         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1286
1287         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1288
1289         if(left_ref == ref){
1290             *mx= A[0];
1291             *my= A[1];
1292             return;
1293         }
1294     }else{
1295         const int16_t * C;
1296         int diagonal_ref;
1297
1298         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1299
1300         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1301
1302         if(diagonal_ref == ref){
1303             *mx= C[0];
1304             *my= C[1];
1305             return;
1306         }
1307     }
1308
1309     //RARE
1310     pred_motion(h, n, 2, list, ref, mx, my);
1311 }
1312
1313 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1314     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1315     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1316
1317     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1318
1319     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1320        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1321        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1322
1323         *mx = *my = 0;
1324         return;
1325     }
1326
1327     pred_motion(h, 0, 4, 0, 0, mx, my);
1328
1329     return;
1330 }
1331
1332 static inline void direct_dist_scale_factor(H264Context * const h){
1333     const int poc = h->s.current_picture_ptr->poc;
1334     const int poc1 = h->ref_list[1][0].poc;
1335     int i;
1336     for(i=0; i<h->ref_count[0]; i++){
1337         int poc0 = h->ref_list[0][i].poc;
1338         int td = clip(poc1 - poc0, -128, 127);
1339         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1340             h->dist_scale_factor[i] = 256;
1341         }else{
1342             int tb = clip(poc - poc0, -128, 127);
1343             int tx = (16384 + (ABS(td) >> 1)) / td;
1344             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1345         }
1346     }
1347     if(FRAME_MBAFF){
1348         for(i=0; i<h->ref_count[0]; i++){
1349             h->dist_scale_factor_field[2*i] =
1350             h->dist_scale_factor_field[2*i+1] = h->dist_scale_factor[i];
1351         }
1352     }
1353 }
1354 static inline void direct_ref_list_init(H264Context * const h){
1355     MpegEncContext * const s = &h->s;
1356     Picture * const ref1 = &h->ref_list[1][0];
1357     Picture * const cur = s->current_picture_ptr;
1358     int list, i, j;
1359     if(cur->pict_type == I_TYPE)
1360         cur->ref_count[0] = 0;
1361     if(cur->pict_type != B_TYPE)
1362         cur->ref_count[1] = 0;
1363     for(list=0; list<2; list++){
1364         cur->ref_count[list] = h->ref_count[list];
1365         for(j=0; j<h->ref_count[list]; j++)
1366             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1367     }
1368     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1369         return;
1370     for(list=0; list<2; list++){
1371         for(i=0; i<ref1->ref_count[list]; i++){
1372             const int poc = ref1->ref_poc[list][i];
1373             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1374             for(j=0; j<h->ref_count[list]; j++)
1375                 if(h->ref_list[list][j].poc == poc){
1376                     h->map_col_to_list0[list][i] = j;
1377                     break;
1378                 }
1379         }
1380     }
1381     if(FRAME_MBAFF){
1382         for(list=0; list<2; list++){
1383             for(i=0; i<ref1->ref_count[list]; i++){
1384                 j = h->map_col_to_list0[list][i];
1385                 h->map_col_to_list0_field[list][2*i] = 2*j;
1386                 h->map_col_to_list0_field[list][2*i+1] = 2*j+1;
1387             }
1388         }
1389     }
1390 }
1391
1392 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1393     MpegEncContext * const s = &h->s;
1394     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1395     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1396     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1397     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1398     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1399     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1400     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1401     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1402     const int is_b8x8 = IS_8X8(*mb_type);
1403     int sub_mb_type;
1404     int i8, i4;
1405
1406 #define MB_TYPE_16x16_OR_INTRA (MB_TYPE_16x16|MB_TYPE_INTRA4x4|MB_TYPE_INTRA16x16|MB_TYPE_INTRA_PCM)
1407     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1408         /* FIXME save sub mb types from previous frames (or derive from MVs)
1409          * so we know exactly what block size to use */
1410         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1411         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1412     }else if(!is_b8x8 && (mb_type_col & MB_TYPE_16x16_OR_INTRA)){
1413         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1414         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1415     }else{
1416         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1417         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1418     }
1419     if(!is_b8x8)
1420         *mb_type |= MB_TYPE_DIRECT2;
1421     if(MB_FIELD)
1422         *mb_type |= MB_TYPE_INTERLACED;
1423
1424     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1425
1426     if(h->direct_spatial_mv_pred){
1427         int ref[2];
1428         int mv[2][2];
1429         int list;
1430
1431         /* FIXME interlacing + spatial direct uses wrong colocated block positions */
1432
1433         /* ref = min(neighbors) */
1434         for(list=0; list<2; list++){
1435             int refa = h->ref_cache[list][scan8[0] - 1];
1436             int refb = h->ref_cache[list][scan8[0] - 8];
1437             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1438             if(refc == -2)
1439                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1440             ref[list] = refa;
1441             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1442                 ref[list] = refb;
1443             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1444                 ref[list] = refc;
1445             if(ref[list] < 0)
1446                 ref[list] = -1;
1447         }
1448
1449         if(ref[0] < 0 && ref[1] < 0){
1450             ref[0] = ref[1] = 0;
1451             mv[0][0] = mv[0][1] =
1452             mv[1][0] = mv[1][1] = 0;
1453         }else{
1454             for(list=0; list<2; list++){
1455                 if(ref[list] >= 0)
1456                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1457                 else
1458                     mv[list][0] = mv[list][1] = 0;
1459             }
1460         }
1461
1462         if(ref[1] < 0){
1463             *mb_type &= ~MB_TYPE_P0L1;
1464             sub_mb_type &= ~MB_TYPE_P0L1;
1465         }else if(ref[0] < 0){
1466             *mb_type &= ~MB_TYPE_P0L0;
1467             sub_mb_type &= ~MB_TYPE_P0L0;
1468         }
1469
1470         if(IS_16X16(*mb_type)){
1471             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1472             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1473             if(!IS_INTRA(mb_type_col)
1474                && (   (l1ref0[0] == 0 && ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1)
1475                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && ABS(l1mv1[0][0]) <= 1 && ABS(l1mv1[0][1]) <= 1
1476                        && (h->x264_build>33 || !h->x264_build)))){
1477                 if(ref[0] > 0)
1478                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1479                 else
1480                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1481                 if(ref[1] > 0)
1482                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1483                 else
1484                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1485             }else{
1486                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1487                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1488             }
1489         }else{
1490             for(i8=0; i8<4; i8++){
1491                 const int x8 = i8&1;
1492                 const int y8 = i8>>1;
1493
1494                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1495                     continue;
1496                 h->sub_mb_type[i8] = sub_mb_type;
1497
1498                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1499                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1500                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1501                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1502
1503                 /* col_zero_flag */
1504                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1505                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1506                                                   && (h->x264_build>33 || !h->x264_build)))){
1507                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1508                     if(IS_SUB_8X8(sub_mb_type)){
1509                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1510                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1511                             if(ref[0] == 0)
1512                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1513                             if(ref[1] == 0)
1514                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1515                         }
1516                     }else
1517                     for(i4=0; i4<4; i4++){
1518                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1519                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1520                             if(ref[0] == 0)
1521                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1522                             if(ref[1] == 0)
1523                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1524                         }
1525                     }
1526                 }
1527             }
1528         }
1529     }else{ /* direct temporal mv pred */
1530         const int *map_col_to_list0[2] = {h->map_col_to_list0[0], h->map_col_to_list0[1]};
1531         const int *dist_scale_factor = h->dist_scale_factor;
1532
1533         if(FRAME_MBAFF){
1534             if(IS_INTERLACED(*mb_type)){
1535                 map_col_to_list0[0] = h->map_col_to_list0_field[0];
1536                 map_col_to_list0[1] = h->map_col_to_list0_field[1];
1537                 dist_scale_factor = h->dist_scale_factor_field;
1538             }
1539             if(IS_INTERLACED(*mb_type) != IS_INTERLACED(mb_type_col)){
1540                 /* FIXME assumes direct_8x8_inference == 1 */
1541                 const int pair_xy = s->mb_x + (s->mb_y&~1)*s->mb_stride;
1542                 int mb_types_col[2];
1543                 int y_shift;
1544
1545                 *mb_type = MB_TYPE_8x8|MB_TYPE_L0L1
1546                          | (is_b8x8 ? 0 : MB_TYPE_DIRECT2)
1547                          | (*mb_type & MB_TYPE_INTERLACED);
1548                 sub_mb_type = MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_16x16;
1549
1550                 if(IS_INTERLACED(*mb_type)){
1551                     /* frame to field scaling */
1552                     mb_types_col[0] = h->ref_list[1][0].mb_type[pair_xy];
1553                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1554                     if(s->mb_y&1){
1555                         l1ref0 -= 2*h->b8_stride;
1556                         l1ref1 -= 2*h->b8_stride;
1557                         l1mv0 -= 4*h->b_stride;
1558                         l1mv1 -= 4*h->b_stride;
1559                     }
1560                     y_shift = 0;
1561
1562                     if(   (mb_types_col[0] & MB_TYPE_16x16_OR_INTRA)
1563                        && (mb_types_col[1] & MB_TYPE_16x16_OR_INTRA)
1564                        && !is_b8x8)
1565                         *mb_type |= MB_TYPE_16x8;
1566                     else
1567                         *mb_type |= MB_TYPE_8x8;
1568                 }else{
1569                     /* field to frame scaling */
1570                     /* col_mb_y = (mb_y&~1) + (topAbsDiffPOC < bottomAbsDiffPOC ? 0 : 1)
1571                      * but in MBAFF, top and bottom POC are equal */
1572                     int dy = (s->mb_y&1) ? 1 : 2;
1573                     mb_types_col[0] =
1574                     mb_types_col[1] = h->ref_list[1][0].mb_type[pair_xy+s->mb_stride];
1575                     l1ref0 += dy*h->b8_stride;
1576                     l1ref1 += dy*h->b8_stride;
1577                     l1mv0 += 2*dy*h->b_stride;
1578                     l1mv1 += 2*dy*h->b_stride;
1579                     y_shift = 2;
1580
1581                     if((mb_types_col[0] & (MB_TYPE_16x16_OR_INTRA|MB_TYPE_16x8))
1582                        && !is_b8x8)
1583                         *mb_type |= MB_TYPE_16x16;
1584                     else
1585                         *mb_type |= MB_TYPE_8x8;
1586                 }
1587
1588                 for(i8=0; i8<4; i8++){
1589                     const int x8 = i8&1;
1590                     const int y8 = i8>>1;
1591                     int ref0, scale;
1592                     const int16_t (*l1mv)[2]= l1mv0;
1593
1594                     if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1595                         continue;
1596                     h->sub_mb_type[i8] = sub_mb_type;
1597
1598                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1599                     if(IS_INTRA(mb_types_col[y8])){
1600                         fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1601                         fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1602                         fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1603                         continue;
1604                     }
1605
1606                     ref0 = l1ref0[x8 + (y8*2>>y_shift)*h->b8_stride];
1607                     if(ref0 >= 0)
1608                         ref0 = map_col_to_list0[0][ref0*2>>y_shift];
1609                     else{
1610                         ref0 = map_col_to_list0[1][l1ref1[x8 + (y8*2>>y_shift)*h->b8_stride]*2>>y_shift];
1611                         l1mv= l1mv1;
1612                     }
1613                     scale = dist_scale_factor[ref0];
1614                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1615
1616                     {
1617                         const int16_t *mv_col = l1mv[x8*3 + (y8*6>>y_shift)*h->b_stride];
1618                         int my_col = (mv_col[1]<<y_shift)/2;
1619                         int mx = (scale * mv_col[0] + 128) >> 8;
1620                         int my = (scale * my_col + 128) >> 8;
1621                         fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1622                         fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-my_col), 4);
1623                     }
1624                 }
1625                 return;
1626             }
1627         }
1628
1629         /* one-to-one mv scaling */
1630
1631         if(IS_16X16(*mb_type)){
1632             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1633             if(IS_INTRA(mb_type_col)){
1634                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1635                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1636                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1637             }else{
1638                 const int ref0 = l1ref0[0] >= 0 ? map_col_to_list0[0][l1ref0[0]]
1639                                                 : map_col_to_list0[1][l1ref1[0]];
1640                 const int scale = dist_scale_factor[ref0];
1641                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1642                 int mv_l0[2];
1643                 mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1644                 mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1645                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1646                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1647                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1648             }
1649         }else{
1650             for(i8=0; i8<4; i8++){
1651                 const int x8 = i8&1;
1652                 const int y8 = i8>>1;
1653                 int ref0, scale;
1654                 const int16_t (*l1mv)[2]= l1mv0;
1655
1656                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1657                     continue;
1658                 h->sub_mb_type[i8] = sub_mb_type;
1659                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1660                 if(IS_INTRA(mb_type_col)){
1661                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1662                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1663                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1664                     continue;
1665                 }
1666
1667                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1668                 if(ref0 >= 0)
1669                     ref0 = map_col_to_list0[0][ref0];
1670                 else{
1671                     ref0 = map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1672                     l1mv= l1mv1;
1673                 }
1674                 scale = dist_scale_factor[ref0];
1675
1676                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1677                 if(IS_SUB_8X8(sub_mb_type)){
1678                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1679                     int mx = (scale * mv_col[0] + 128) >> 8;
1680                     int my = (scale * mv_col[1] + 128) >> 8;
1681                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1682                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1683                 }else
1684                 for(i4=0; i4<4; i4++){
1685                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1686                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1687                     mv_l0[0] = (scale * mv_col[0] + 128) >> 8;
1688                     mv_l0[1] = (scale * mv_col[1] + 128) >> 8;
1689                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1690                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1691                 }
1692             }
1693         }
1694     }
1695 }
1696
1697 static inline void write_back_motion(H264Context *h, int mb_type){
1698     MpegEncContext * const s = &h->s;
1699     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1700     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1701     int list;
1702
1703     if(!USES_LIST(mb_type, 0))
1704         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1705
1706     for(list=0; list<2; list++){
1707         int y;
1708         if(!USES_LIST(mb_type, list))
1709             continue;
1710
1711         for(y=0; y<4; y++){
1712             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1713             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1714         }
1715         if( h->pps.cabac ) {
1716             for(y=0; y<4; y++){
1717                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1718                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1719             }
1720         }
1721
1722         {
1723             uint8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1724             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1725             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1726             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1727             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1728         }
1729     }
1730
1731     if(h->slice_type == B_TYPE && h->pps.cabac){
1732         if(IS_8X8(mb_type)){
1733             uint8_t *direct_table = &h->direct_table[b8_xy];
1734             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1735             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1736             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1737         }
1738     }
1739 }
1740
1741 /**
1742  * Decodes a network abstraction layer unit.
1743  * @param consumed is the number of bytes used as input
1744  * @param length is the length of the array
1745  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1746  * @returns decoded bytes, might be src+1 if no escapes
1747  */
1748 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1749     int i, si, di;
1750     uint8_t *dst;
1751
1752 //    src[0]&0x80;                //forbidden bit
1753     h->nal_ref_idc= src[0]>>5;
1754     h->nal_unit_type= src[0]&0x1F;
1755
1756     src++; length--;
1757 #if 0
1758     for(i=0; i<length; i++)
1759         printf("%2X ", src[i]);
1760 #endif
1761     for(i=0; i+1<length; i+=2){
1762         if(src[i]) continue;
1763         if(i>0 && src[i-1]==0) i--;
1764         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1765             if(src[i+2]!=3){
1766                 /* startcode, so we must be past the end */
1767                 length=i;
1768             }
1769             break;
1770         }
1771     }
1772
1773     if(i>=length-1){ //no escaped 0
1774         *dst_length= length;
1775         *consumed= length+1; //+1 for the header
1776         return src;
1777     }
1778
1779     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1780     dst= h->rbsp_buffer;
1781
1782 //printf("decoding esc\n");
1783     si=di=0;
1784     while(si<length){
1785         //remove escapes (very rare 1:2^22)
1786         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1787             if(src[si+2]==3){ //escape
1788                 dst[di++]= 0;
1789                 dst[di++]= 0;
1790                 si+=3;
1791                 continue;
1792             }else //next start code
1793                 break;
1794         }
1795
1796         dst[di++]= src[si++];
1797     }
1798
1799     *dst_length= di;
1800     *consumed= si + 1;//+1 for the header
1801 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1802     return dst;
1803 }
1804
1805 #if 0
1806 /**
1807  * @param src the data which should be escaped
1808  * @param dst the target buffer, dst+1 == src is allowed as a special case
1809  * @param length the length of the src data
1810  * @param dst_length the length of the dst array
1811  * @returns length of escaped data in bytes or -1 if an error occured
1812  */
1813 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1814     int i, escape_count, si, di;
1815     uint8_t *temp;
1816
1817     assert(length>=0);
1818     assert(dst_length>0);
1819
1820     dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1821
1822     if(length==0) return 1;
1823
1824     escape_count= 0;
1825     for(i=0; i<length; i+=2){
1826         if(src[i]) continue;
1827         if(i>0 && src[i-1]==0)
1828             i--;
1829         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1830             escape_count++;
1831             i+=2;
1832         }
1833     }
1834
1835     if(escape_count==0){
1836         if(dst+1 != src)
1837             memcpy(dst+1, src, length);
1838         return length + 1;
1839     }
1840
1841     if(length + escape_count + 1> dst_length)
1842         return -1;
1843
1844     //this should be damn rare (hopefully)
1845
1846     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1847     temp= h->rbsp_buffer;
1848 //printf("encoding esc\n");
1849
1850     si= 0;
1851     di= 0;
1852     while(si < length){
1853         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1854             temp[di++]= 0; si++;
1855             temp[di++]= 0; si++;
1856             temp[di++]= 3;
1857             temp[di++]= src[si++];
1858         }
1859         else
1860             temp[di++]= src[si++];
1861     }
1862     memcpy(dst+1, temp, length+escape_count);
1863
1864     assert(di == length+escape_count);
1865
1866     return di + 1;
1867 }
1868
1869 /**
1870  * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1871  */
1872 static void encode_rbsp_trailing(PutBitContext *pb){
1873     int length;
1874     put_bits(pb, 1, 1);
1875     length= (-put_bits_count(pb))&7;
1876     if(length) put_bits(pb, length, 0);
1877 }
1878 #endif
1879
/**
 * Identifies the exact end of the bitstream by locating the lowest set bit
 * of the byte src points to.
 * @param src the byte to examine
 * @return the length of the trailing (1 if bit 0 is set, ... 8 if only
 *         bit 7 is set), or 0 if damaged (the byte is zero)
 */
static int decode_rbsp_trailing(uint8_t *src){
    const int last_byte= *src;
    int bit;

    tprintf("rbsp trailing %X\n", last_byte);

    for(bit=0; bit<8; bit++){
        if((last_byte>>bit)&1)
            return bit + 1;
    }
    return 0;
}
1896
1897 /**
1898  * idct tranforms the 16 dc values and dequantize them.
1899  * @param qp quantization parameter
1900  */
1901 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1902 #define stride 16
1903     int i;
1904     int temp[16]; //FIXME check if this is a good idea
1905     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1906     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1907
1908 //memset(block, 64, 2*256);
1909 //return;
1910     for(i=0; i<4; i++){
1911         const int offset= y_offset[i];
1912         const int z0= block[offset+stride*0] + block[offset+stride*4];
1913         const int z1= block[offset+stride*0] - block[offset+stride*4];
1914         const int z2= block[offset+stride*1] - block[offset+stride*5];
1915         const int z3= block[offset+stride*1] + block[offset+stride*5];
1916
1917         temp[4*i+0]= z0+z3;
1918         temp[4*i+1]= z1+z2;
1919         temp[4*i+2]= z1-z2;
1920         temp[4*i+3]= z0-z3;
1921     }
1922
1923     for(i=0; i<4; i++){
1924         const int offset= x_offset[i];
1925         const int z0= temp[4*0+i] + temp[4*2+i];
1926         const int z1= temp[4*0+i] - temp[4*2+i];
1927         const int z2= temp[4*1+i] - temp[4*3+i];
1928         const int z3= temp[4*1+i] + temp[4*3+i];
1929
1930         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1931         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1932         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1933         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1934     }
1935 }
1936
1937 #if 0
1938 /**
1939  * dct tranforms the 16 dc values.
1940  * @param qp quantization parameter ??? FIXME
1941  */
1942 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1943 //    const int qmul= dequant_coeff[qp][0];
1944     int i;
1945     int temp[16]; //FIXME check if this is a good idea
1946     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1947     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1948
1949     for(i=0; i<4; i++){
1950         const int offset= y_offset[i];
1951         const int z0= block[offset+stride*0] + block[offset+stride*4];
1952         const int z1= block[offset+stride*0] - block[offset+stride*4];
1953         const int z2= block[offset+stride*1] - block[offset+stride*5];
1954         const int z3= block[offset+stride*1] + block[offset+stride*5];
1955
1956         temp[4*i+0]= z0+z3;
1957         temp[4*i+1]= z1+z2;
1958         temp[4*i+2]= z1-z2;
1959         temp[4*i+3]= z0-z3;
1960     }
1961
1962     for(i=0; i<4; i++){
1963         const int offset= x_offset[i];
1964         const int z0= temp[4*0+i] + temp[4*2+i];
1965         const int z1= temp[4*0+i] - temp[4*2+i];
1966         const int z2= temp[4*1+i] - temp[4*3+i];
1967         const int z3= temp[4*1+i] + temp[4*3+i];
1968
1969         block[stride*0 +offset]= (z0 + z3)>>1;
1970         block[stride*2 +offset]= (z1 + z2)>>1;
1971         block[stride*8 +offset]= (z1 - z2)>>1;
1972         block[stride*10+offset]= (z0 - z3)>>1;
1973     }
1974 }
1975 #endif
1976
1977 #undef xStride
1978 #undef stride
1979
1980 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1981     const int stride= 16*2;
1982     const int xStride= 16;
1983     int a,b,c,d,e;
1984
1985     a= block[stride*0 + xStride*0];
1986     b= block[stride*0 + xStride*1];
1987     c= block[stride*1 + xStride*0];
1988     d= block[stride*1 + xStride*1];
1989
1990     e= a-b;
1991     a= a+b;
1992     b= c-d;
1993     c= c+d;
1994
1995     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1996     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1997     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1998     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1999 }
2000
2001 #if 0
2002 static void chroma_dc_dct_c(DCTELEM *block){
2003     const int stride= 16*2;
2004     const int xStride= 16;
2005     int a,b,c,d,e;
2006
2007     a= block[stride*0 + xStride*0];
2008     b= block[stride*0 + xStride*1];
2009     c= block[stride*1 + xStride*0];
2010     d= block[stride*1 + xStride*1];
2011
2012     e= a-b;
2013     a= a+b;
2014     b= c-d;
2015     c= c+d;
2016
2017     block[stride*0 + xStride*0]= (a+c);
2018     block[stride*0 + xStride*1]= (e+b);
2019     block[stride*1 + xStride*0]= (a-c);
2020     block[stride*1 + xStride*1]= (e-b);
2021 }
2022 #endif
2023
2024 /**
2025  * gets the chroma qp.
2026  */
2027 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
2028
2029     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
2030 }
2031
2032
2033 #if 0
2034 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
2035     int i;
2036     //FIXME try int temp instead of block
2037
2038     for(i=0; i<4; i++){
2039         const int d0= src1[0 + i*stride] - src2[0 + i*stride];
2040         const int d1= src1[1 + i*stride] - src2[1 + i*stride];
2041         const int d2= src1[2 + i*stride] - src2[2 + i*stride];
2042         const int d3= src1[3 + i*stride] - src2[3 + i*stride];
2043         const int z0= d0 + d3;
2044         const int z3= d0 - d3;
2045         const int z1= d1 + d2;
2046         const int z2= d1 - d2;
2047
2048         block[0 + 4*i]=   z0 +   z1;
2049         block[1 + 4*i]= 2*z3 +   z2;
2050         block[2 + 4*i]=   z0 -   z1;
2051         block[3 + 4*i]=   z3 - 2*z2;
2052     }
2053
2054     for(i=0; i<4; i++){
2055         const int z0= block[0*4 + i] + block[3*4 + i];
2056         const int z3= block[0*4 + i] - block[3*4 + i];
2057         const int z1= block[1*4 + i] + block[2*4 + i];
2058         const int z2= block[1*4 + i] - block[2*4 + i];
2059
2060         block[0*4 + i]=   z0 +   z1;
2061         block[1*4 + i]= 2*z3 +   z2;
2062         block[2*4 + i]=   z0 -   z1;
2063         block[3*4 + i]=   z3 - 2*z2;
2064     }
2065 }
2066 #endif
2067
2068 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
2069 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
2070 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
2071     int i;
2072     const int * const quant_table= quant_coeff[qscale];
2073     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
2074     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
2075     const unsigned int threshold2= (threshold1<<1);
2076     int last_non_zero;
2077
2078     if(seperate_dc){
2079         if(qscale<=18){
2080             //avoid overflows
2081             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
2082             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
2083             const unsigned int dc_threshold2= (dc_threshold1<<1);
2084
2085             int level= block[0]*quant_coeff[qscale+18][0];
2086             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2087                 if(level>0){
2088                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
2089                     block[0]= level;
2090                 }else{
2091                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
2092                     block[0]= -level;
2093                 }
2094 //                last_non_zero = i;
2095             }else{
2096                 block[0]=0;
2097             }
2098         }else{
2099             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
2100             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
2101             const unsigned int dc_threshold2= (dc_threshold1<<1);
2102
2103             int level= block[0]*quant_table[0];
2104             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
2105                 if(level>0){
2106                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
2107                     block[0]= level;
2108                 }else{
2109                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
2110                     block[0]= -level;
2111                 }
2112 //                last_non_zero = i;
2113             }else{
2114                 block[0]=0;
2115             }
2116         }
2117         last_non_zero= 0;
2118         i=1;
2119     }else{
2120         last_non_zero= -1;
2121         i=0;
2122     }
2123
2124     for(; i<16; i++){
2125         const int j= scantable[i];
2126         int level= block[j]*quant_table[j];
2127
2128 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
2129 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
2130         if(((unsigned)(level+threshold1))>threshold2){
2131             if(level>0){
2132                 level= (bias + level)>>QUANT_SHIFT;
2133                 block[j]= level;
2134             }else{
2135                 level= (bias - level)>>QUANT_SHIFT;
2136                 block[j]= -level;
2137             }
2138             last_non_zero = i;
2139         }else{
2140             block[j]=0;
2141         }
2142     }
2143
2144     return last_non_zero;
2145 }
2146
/**
 * Vertical 4x4 intra prediction: copies the four pixels above the block
 * into each of its four rows.
 * Rows are accessed as 32 bit words, so src has to be suitably aligned.
 * @param src      top left pixel of the block
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= *(uint32_t*)(src - stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= top_row;
}
2154
/**
 * Horizontal 4x4 intra prediction: fills each row of the block with the
 * pixel directly left of that row.
 * Rows are written as 32 bit words, so src has to be suitably aligned.
 * @param src      top left pixel of the block
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        /* the constant must be unsigned: pixel*0x01010101 overflows a
           signed int (undefined behavior) for pixel values >= 128 */
        ((uint32_t*)(src + y*stride))[0]= src[-1 + y*stride]*0x01010101U;
    }
}
2161
/**
 * DC 4x4 intra prediction from both edges: fills the block with the rounded
 * average of the four pixels above and the four pixels left of it.
 * Rows are written as 32 bit words, so src has to be suitably aligned.
 * @param src      top left pixel of the block
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    /* unsigned constant: dc*0x01010101 overflows a signed int
       (undefined behavior) for dc >= 128 */
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
2171
/**
 * DC 4x4 intra prediction from the left edge only: fills the block with the
 * rounded average of the four pixels left of it.
 * Rows are written as 32 bit words, so src has to be suitably aligned.
 * @param src      top left pixel of the block
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    /* unsigned constant: dc*0x01010101 overflows a signed int
       (undefined behavior) for dc >= 128 */
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
2180
/**
 * DC 4x4 intra prediction from the top edge only: fills the block with the
 * rounded average of the four pixels above it.
 * Rows are written as 32 bit words, so src has to be suitably aligned.
 * @param src      top left pixel of the block
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    /* unsigned constant: dc*0x01010101 overflows a signed int
       (undefined behavior) for dc >= 128 */
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
2189
/**
 * DC 4x4 intra prediction without any edge: fills the block with the
 * constant value 128.
 * Rows are written as 32 bit words, so src has to be suitably aligned.
 * @param src      top left pixel of the block
 * @param topright not used
 * @param stride   distance in bytes between two rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= grey;
}
2196
2197
/* Helpers for the 4x4 predictors: each macro declares four const int locals
 * holding one edge of the block. They expect `src`, `stride` and (for the
 * top right edge) `topright` to be in scope. */

/* t4..t7: the four pixels topright points to */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];

/* l0..l3: the pixels directly left of rows 0..3 */
#define LOAD_LEFT_EDGE\
    const int l0= src[0*stride - 1];\
    const int l1= src[1*stride - 1];\
    const int l2= src[2*stride - 1];\
    const int l3= src[3*stride - 1];

/* t0..t3: the pixels directly above columns 0..3 */
#define LOAD_TOP_EDGE\
    const int t0= src[0 - stride];\
    const int t1= src[1 - stride];\
    const int t2= src[2 - stride];\
    const int t3= src[3 - stride];
2215
2216 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
2217     const int lt= src[-1-1*stride];
2218     LOAD_TOP_EDGE
2219     LOAD_LEFT_EDGE
2220
2221     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
2222     src[0+2*stride]=
2223     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
2224     src[0+1*stride]=
2225     src[1+2*stride]=
2226     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
2227     src[0+0*stride]=
2228     src[1+1*stride]=
2229     src[2+2*stride]=
2230     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2231     src[1+0*stride]=
2232     src[2+1*stride]=
2233     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
2234     src[2+0*stride]=
2235     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2236     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2237 }
2238
2239 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
2240     LOAD_TOP_EDGE
2241     LOAD_TOP_RIGHT_EDGE
2242 //    LOAD_LEFT_EDGE
2243
2244     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
2245     src[1+0*stride]=
2246     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
2247     src[2+0*stride]=
2248     src[1+1*stride]=
2249     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
2250     src[3+0*stride]=
2251     src[2+1*stride]=
2252     src[1+2*stride]=
2253     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
2254     src[3+1*stride]=
2255     src[2+2*stride]=
2256     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
2257     src[3+2*stride]=
2258     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
2259     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
2260 }
2261
2262 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
2263     const int lt= src[-1-1*stride];
2264     LOAD_TOP_EDGE
2265     LOAD_LEFT_EDGE
2266     const __attribute__((unused)) int unu= l3;
2267
2268     src[0+0*stride]=
2269     src[1+2*stride]=(lt + t0 + 1)>>1;
2270     src[1+0*stride]=
2271     src[2+2*stride]=(t0 + t1 + 1)>>1;
2272     src[2+0*stride]=
2273     src[3+2*stride]=(t1 + t2 + 1)>>1;
2274     src[3+0*stride]=(t2 + t3 + 1)>>1;
2275     src[0+1*stride]=
2276     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2277     src[1+1*stride]=
2278     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2279     src[2+1*stride]=
2280     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2281     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2282     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2283     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2284 }
2285
2286 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2287     LOAD_TOP_EDGE
2288     LOAD_TOP_RIGHT_EDGE
2289     const __attribute__((unused)) int unu= t7;
2290
2291     src[0+0*stride]=(t0 + t1 + 1)>>1;
2292     src[1+0*stride]=
2293     src[0+2*stride]=(t1 + t2 + 1)>>1;
2294     src[2+0*stride]=
2295     src[1+2*stride]=(t2 + t3 + 1)>>1;
2296     src[3+0*stride]=
2297     src[2+2*stride]=(t3 + t4+ 1)>>1;
2298     src[3+2*stride]=(t4 + t5+ 1)>>1;
2299     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2300     src[1+1*stride]=
2301     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2302     src[2+1*stride]=
2303     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2304     src[3+1*stride]=
2305     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2306     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
2307 }
2308
2309 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2310     LOAD_LEFT_EDGE
2311
2312     src[0+0*stride]=(l0 + l1 + 1)>>1;
2313     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2314     src[2+0*stride]=
2315     src[0+1*stride]=(l1 + l2 + 1)>>1;
2316     src[3+0*stride]=
2317     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2318     src[2+1*stride]=
2319     src[0+2*stride]=(l2 + l3 + 1)>>1;
2320     src[3+1*stride]=
2321     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
2322     src[3+2*stride]=
2323     src[1+3*stride]=
2324     src[0+3*stride]=
2325     src[2+2*stride]=
2326     src[2+3*stride]=
2327     src[3+3*stride]=l3;
2328 }
2329
2330 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2331     const int lt= src[-1-1*stride];
2332     LOAD_TOP_EDGE
2333     LOAD_LEFT_EDGE
2334     const __attribute__((unused)) int unu= t3;
2335
2336     src[0+0*stride]=
2337     src[2+1*stride]=(lt + l0 + 1)>>1;
2338     src[1+0*stride]=
2339     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2340     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2341     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2342     src[0+1*stride]=
2343     src[2+2*stride]=(l0 + l1 + 1)>>1;
2344     src[1+1*stride]=
2345     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2346     src[0+2*stride]=
2347     src[2+3*stride]=(l1 + l2+ 1)>>1;
2348     src[1+2*stride]=
2349     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2350     src[0+3*stride]=(l2 + l3 + 1)>>1;
2351     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2352 }
2353
/**
 * 16x16 vertical predictor: copy the 16 pixels of the row above the block
 * into each of its 16 rows.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int rows_left;

    for(rows_left=16; rows_left>0; rows_left--){
        uint32_t * const row= (uint32_t*)src;
        row[0]= w0;
        row[1]= w1;
        row[2]= w2;
        row[3]= w3;
        src+= stride;
    }
}
2368
/**
 * 16x16 horizontal predictor: fill every row with the pixel immediately to
 * its left.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* unsigned multiply: pixel*0x01010101 overflows a signed int for pixel >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        row[0]= row[1]= row[2]= row[3]= splat;
    }
}
2379
/**
 * 16x16 DC predictor: fill the block with the rounded mean of the 16 pixels
 * above it and the 16 pixels to its left.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];   /* left column */
        sum+= src[i-stride];      /* top row */
    }

    /* unsigned multiply: the mean may be >= 128, which would overflow a signed int */
    splat= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]= row[1]= row[2]= row[3]= splat;
    }
}
2400
/**
 * 16x16 DC predictor using only the left neighbours: fill the block with the
 * rounded mean of the 16 pixels in the column to its left.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    /* unsigned multiply: the mean may be >= 128, which would overflow a signed int */
    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]= row[1]= row[2]= row[3]= splat;
    }
}
2417
/**
 * 16x16 DC predictor using only the top neighbours: fill the block with the
 * rounded mean of the 16 pixels in the row above it.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* unsigned multiply: the mean may be >= 128, which would overflow a signed int */
    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        row[0]= row[1]= row[2]= row[3]= splat;
    }
}
2433
/**
 * Fill a 16x16 block with the constant value 128; no neighbour is read.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int rows_left;

    for(rows_left=16; rows_left>0; rows_left--){
        uint32_t * const row= (uint32_t*)src;
        row[0]= grey;
        row[1]= grey;
        row[2]= grey;
        row[3]= grey;
        src+= stride;
    }
}
2444
2445 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2446   int i, j, k;
2447   int a;
2448   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2449   const uint8_t * const src0 = src+7-stride;
2450   const uint8_t *src1 = src+8*stride-1;
2451   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2452   int H = src0[1] - src0[-1];
2453   int V = src1[0] - src2[ 0];
2454   for(k=2; k<=8; ++k) {
2455     src1 += stride; src2 -= stride;
2456     H += k*(src0[k] - src0[-k]);
2457     V += k*(src1[0] - src2[ 0]);
2458   }
2459   if(svq3){
2460     H = ( 5*(H/4) ) / 16;
2461     V = ( 5*(V/4) ) / 16;
2462
2463     /* required for 100% accuracy */
2464     i = H; H = V; V = i;
2465   }else{
2466     H = ( 5*H+32 ) >> 6;
2467     V = ( 5*V+32 ) >> 6;
2468   }
2469
2470   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2471   for(j=16; j>0; --j) {
2472     int b = a;
2473     a += V;
2474     for(i=-16; i<0; i+=4) {
2475       src[16+i] = cm[ (b    ) >> 5 ];
2476       src[17+i] = cm[ (b+  H) >> 5 ];
2477       src[18+i] = cm[ (b+2*H) >> 5 ];
2478       src[19+i] = cm[ (b+3*H) >> 5 ];
2479       b += 4*H;
2480     }
2481     src += stride;
2482   }
2483 }
2484
/**
 * 16x16 plane predictor entry point: runs pred16x16_plane_compat_c() with
 * its svq3 flag cleared.
 *
 * @param src    top-left pixel of the block
 * @param stride distance in bytes between two rows
 */
static void pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3_mode= 0;

    pred16x16_plane_compat_c(src, stride, svq3_mode);
}
2488
/**
 * 8x8 vertical predictor: copy the 8 pixels of the row above the block into
 * each of its 8 rows.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (uint32_t*)(src-stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    int rows_left;

    for(rows_left=8; rows_left>0; rows_left--){
        uint32_t * const row= (uint32_t*)src;
        row[0]= w0;
        row[1]= w1;
        src+= stride;
    }
}
2499
/**
 * 8x8 horizontal predictor: fill every row with the pixel immediately to its
 * left.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* unsigned multiply: pixel*0x01010101 overflows a signed int for pixel >= 128 */
        const uint32_t splat= src[-1+i*stride]*0x01010101U;

        row[0]= row[1]= splat;
    }
}
2508
/**
 * Fill an 8x8 block with the constant value 128; no neighbour is read.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int rows_left;

    for(rows_left=8; rows_left>0; rows_left--){
        uint32_t * const row= (uint32_t*)src;
        row[0]= grey;
        row[1]= grey;
        src+= stride;
    }
}
2517
/**
 * 8x8 DC predictor using only the left neighbours.  The upper four rows are
 * filled with the rounded mean of left pixels 0..3, the lower four rows with
 * the rounded mean of left pixels 4..7.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper=0, sum_lower=0;
    uint32_t splat_upper, splat_lower;

    for(i=0;i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    /* unsigned multiplies: a mean >= 128 would overflow a signed int */
    splat_upper= 0x01010101U*((sum_upper + 2)>>2);
    splat_lower= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= splat_upper;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= splat_lower;
    }
}
2539
/**
 * 8x8 DC predictor using only the top neighbours.  The left four columns are
 * filled with the rounded mean of top pixels 0..3, the right four columns
 * with the rounded mean of top pixels 4..7.
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left=0, sum_right=0;
    uint32_t splat_left, splat_right;

    for(i=0;i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    /* unsigned multiplies: a mean >= 128 would overflow a signed int */
    splat_left = 0x01010101U*((sum_left  + 2)>>2);
    splat_right= 0x01010101U*((sum_right + 2)>>2);

    /* all eight rows receive the same pair of words */
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= splat_left;
        ((uint32_t*)(src+i*stride))[1]= splat_right;
    }
}
2561
2562
/**
 * 8x8 DC predictor with one DC value per 4x4 quadrant:
 *  - top-left:     mean of top pixels 0..3 and left pixels 0..3
 *  - top-right:    mean of top pixels 4..7
 *  - bottom-left:  mean of left pixels 4..7
 *  - bottom-right: mean of top pixels 4..7 and left pixels 4..7
 *
 * @param src    top-left pixel of the block (rows are accessed as uint32_t)
 * @param stride distance in bytes between two rows
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum_tl=0, sum_top_r=0, sum_left_b=0;
    uint32_t dc_tl, dc_tr, dc_bl, dc_br;

    for(i=0;i<4; i++){
        sum_tl    += src[-1+i*stride] + src[i-stride];
        sum_top_r += src[4+i-stride];
        sum_left_b+= src[-1+(i+4)*stride];
    }
    /* unsigned multiplies: a mean >= 128 would overflow a signed int */
    dc_br= 0x01010101U*((sum_top_r + sum_left_b + 4)>>3);
    dc_tl= 0x01010101U*((sum_tl + 4)>>3);
    dc_tr= 0x01010101U*((sum_top_r + 2)>>2);
    dc_bl= 0x01010101U*((sum_left_b + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc_tl;
        ((uint32_t*)(src+i*stride))[1]= dc_tr;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc_bl;
        ((uint32_t*)(src+i*stride))[1]= dc_br;
    }
}
2587
2588 static void pred8x8_plane_c(uint8_t *src, int stride){
2589   int j, k;
2590   int a;
2591   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2592   const uint8_t * const src0 = src+3-stride;
2593   const uint8_t *src1 = src+4*stride-1;
2594   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2595   int H = src0[1] - src0[-1];
2596   int V = src1[0] - src2[ 0];
2597   for(k=2; k<=4; ++k) {
2598     src1 += stride; src2 -= stride;
2599     H += k*(src0[k] - src0[-k]);
2600     V += k*(src1[0] - src2[ 0]);
2601   }
2602   H = ( 17*H+16 ) >> 5;
2603   V = ( 17*V+16 ) >> 5;
2604
2605   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2606   for(j=8; j>0; --j) {
2607     int b = a;
2608     a += V;
2609     src[0] = cm[ (b    ) >> 5 ];
2610     src[1] = cm[ (b+  H) >> 5 ];
2611     src[2] = cm[ (b+2*H) >> 5 ];
2612     src[3] = cm[ (b+3*H) >> 5 ];
2613     src[4] = cm[ (b+4*H) >> 5 ];
2614     src[5] = cm[ (b+5*H) >> 5 ];
2615     src[6] = cm[ (b+6*H) >> 5 ];
2616     src[7] = cm[ (b+7*H) >> 5 ];
2617     src += stride;
2618   }
2619 }
2620
/*
 * Helper macros for the pred8x8l_*_c predictors.  They expect `src`,
 * `stride`, `has_topleft` and `has_topright` to be in scope and declare the
 * filtered edge samples as local variables.
 */

/* pixel at column x, row y relative to the top-left pixel of the block */
#define SRC(x,y) src[(x)+(y)*stride]

/* l<y>: left-column pixel y, 1-2-1 filtered with its vertical neighbours */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + SRC(-1,y+1) + 2*SRC(-1,y) + 2) >> 2;
/* l0..l7; l0 substitutes SRC(-1,0) for the corner when has_topleft is 0 */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + SRC(-1,1) + 2*SRC(-1,0) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (3*SRC(-1,7) + SRC(-1,6) + 2) >> 2

/* t<x>: top-row pixel x, 1-2-1 filtered with its horizontal neighbours */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + SRC(x+1,-1) + 2*SRC(x,-1) + 2) >> 2;
/* t0..t7; the ends fall back to SRC(0,-1) / SRC(7,-1) when the respective
 * has_topleft / has_topright flag is 0 */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + SRC(1,-1) + 2*SRC(0,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + SRC(6,-1) + 2*SRC(7,-1) + 2) >> 2

/* assignment form of PT(), for the already declared t8..t14 */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + SRC(x+1,-1) + 2*SRC(x,-1) + 2) >> 2;
/* t8..t15: filtered top-right pixels, or copies of SRC(7,-1) when
 * has_topright is 0 */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (3*SRC(15,-1) + SRC(14,-1) + 2) >> 2; \
    } else { \
        t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1); \
    }

/* lt: top-left corner pixel filtered with its right and lower neighbours */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + SRC(0,-1) + 2*SRC(-1,-1) + 2) >> 2

/* store the 32-bit word v twice in each of 8 rows; advances src by 8 rows */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = (v); \
        src += stride; \
    }
2658
/**
 * Fill an 8x8 luma block with the constant value 128 (0x80 in every byte).
 * has_topleft and has_topright are unused.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    const uint32_t grey = 0x80808080;
    PREDICT_8x8_DC(grey);
}
2663 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2664 {
2665     PREDICT_8x8_LOAD_LEFT;
2666     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2667     PREDICT_8x8_DC(dc);
2668 }
2669 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2670 {
2671     PREDICT_8x8_LOAD_TOP;
2672     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2673     PREDICT_8x8_DC(dc);
2674 }
2675 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2676 {
2677     PREDICT_8x8_LOAD_LEFT;
2678     PREDICT_8x8_LOAD_TOP;
2679     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2680                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2681     PREDICT_8x8_DC(dc);
2682 }
2683 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2684 {
2685     PREDICT_8x8_LOAD_LEFT;
2686 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2687                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2688     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2689 #undef ROW
2690 }
2691 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2692 {
2693     int y;
2694     PREDICT_8x8_LOAD_TOP;
2695     src[0] = t0;
2696     src[1] = t1;
2697     src[2] = t2;
2698     src[3] = t3;
2699     src[4] = t4;
2700     src[5] = t5;
2701     src[6] = t6;
2702     src[7] = t7;
2703     for( y = 1; y < 8; y++ )
2704         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2705 }
2706 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2707 {
2708     PREDICT_8x8_LOAD_TOP;
2709     PREDICT_8x8_LOAD_TOPRIGHT;
2710     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2711     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2712     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2713     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2714     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2715     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2716     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2717     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2718     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2719     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2720     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2721     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2722     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2723     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2724     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2725 }
2726 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2727 {
2728     PREDICT_8x8_LOAD_TOP;
2729     PREDICT_8x8_LOAD_LEFT;
2730     PREDICT_8x8_LOAD_TOPLEFT;
2731     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2732     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2733     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2734     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2735     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2736     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2737     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2738     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2739     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2740     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2741     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2742     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2743     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2744     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2745     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2746
2747 }
2748 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2749 {
2750     PREDICT_8x8_LOAD_TOP;
2751     PREDICT_8x8_LOAD_LEFT;
2752     PREDICT_8x8_LOAD_TOPLEFT;
2753     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2754     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2755     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2756     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2757     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2758     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2759     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2760     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2761     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2762     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2763     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2764     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2765     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2766     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2767     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2768     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2769     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2770     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2771     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2772     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2773     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2774     SRC(7,0)= (t6 + t7 + 1) >> 1;
2775 }
2776 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2777 {
2778     PREDICT_8x8_LOAD_TOP;
2779     PREDICT_8x8_LOAD_LEFT;
2780     PREDICT_8x8_LOAD_TOPLEFT;
2781     SRC(0,7)= (l6 + l7 + 1) >> 1;
2782     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2783     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2784     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2785     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2786     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2787     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2788     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2789     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2790     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2791     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2792     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2793     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2794     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2795     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2796     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2797     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2798     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2799     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2800     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2801     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2802     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2803 }
2804 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2805 {
2806     PREDICT_8x8_LOAD_TOP;
2807     PREDICT_8x8_LOAD_TOPRIGHT;
2808     SRC(0,0)= (t0 + t1 + 1) >> 1;
2809     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2810     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2811     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2812     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2813     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2814     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2815     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2816     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2817     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2818     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2819     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2820     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2821     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2822     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2823     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2824     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2825     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2826     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2827     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2828     SRC(7,6)= (t10 + t11 + 1) >> 1;
2829     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2830 }
2831 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2832 {
2833     PREDICT_8x8_LOAD_LEFT;
2834     SRC(0,0)= (l0 + l1 + 1) >> 1;
2835     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2836     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2837     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2838     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2839     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2840     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2841     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2842     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2843     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2844     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2845     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2846     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2847     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2848     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2849     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2850     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2851     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2852 }
2853 #undef PREDICT_8x8_LOAD_LEFT
2854 #undef PREDICT_8x8_LOAD_TOP
2855 #undef PREDICT_8x8_LOAD_TOPLEFT
2856 #undef PREDICT_8x8_LOAD_TOPRIGHT
2857 #undef PREDICT_8x8_DC
2858 #undef PTR
2859 #undef PT
2860 #undef PL
2861 #undef SRC
2862
2863 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2864                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2865                            int src_x_offset, int src_y_offset,
2866                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2867     MpegEncContext * const s = &h->s;
2868     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2869     int my=       h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2870     const int luma_xy= (mx&3) + ((my&3)<<2);
2871     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*h->mb_linesize;
2872     uint8_t * src_cb, * src_cr;
2873     int extra_width= h->emu_edge_width;
2874     int extra_height= h->emu_edge_height;
2875     int emu=0;
2876     const int full_mx= mx>>2;
2877     const int full_my= my>>2;
2878     const int pic_width  = 16*s->mb_width;
2879     const int pic_height = 16*s->mb_height >> MB_MBAFF;
2880
2881     if(!pic->data[0])
2882         return;
2883
2884     if(mx&7) extra_width -= 3;
2885     if(my&7) extra_height -= 3;
2886
2887     if(   full_mx < 0-extra_width
2888        || full_my < 0-extra_height
2889        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2890        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2891         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*h->mb_linesize, h->mb_linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2892             src_y= s->edge_emu_buffer + 2 + 2*h->mb_linesize;
2893         emu=1;
2894     }
2895
2896     qpix_op[luma_xy](dest_y, src_y, h->mb_linesize); //FIXME try variable height perhaps?
2897     if(!square){
2898         qpix_op[luma_xy](dest_y + delta, src_y + delta, h->mb_linesize);
2899     }
2900
2901     if(s->flags&CODEC_FLAG_GRAY) return;
2902
2903     if(MB_MBAFF){
2904         // chroma offset when predicting from a field of opposite parity
2905         my += 2 * ((s->mb_y & 1) - (h->ref_cache[list][scan8[n]] & 1));
2906         emu |= (my>>3) < 0 || (my>>3) + 8 >= (pic_height>>1);
2907     }
2908     src_cb= pic->data[1] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2909     src_cr= pic->data[2] + (mx>>3) + (my>>3)*h->mb_uvlinesize;
2910
2911     if(emu){
2912         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2913             src_cb= s->edge_emu_buffer;
2914     }
2915     chroma_op(dest_cb, src_cb, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2916
2917     if(emu){
2918         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, h->mb_uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2919             src_cr= s->edge_emu_buffer;
2920     }
2921     chroma_op(dest_cr, src_cr, h->mb_uvlinesize, chroma_height, mx&7, my&7);
2922 }
2923
2924 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2925                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2926                            int x_offset, int y_offset,
2927                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2928                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2929                            int list0, int list1){
2930     MpegEncContext * const s = &h->s;
2931     qpel_mc_func *qpix_op=  qpix_put;
2932     h264_chroma_mc_func chroma_op= chroma_put;
2933
2934     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2935     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2936     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2937     x_offset += 8*s->mb_x;
2938     y_offset += 8*(s->mb_y >> MB_MBAFF);
2939
2940     if(list0){
2941         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2942         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2943                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2944                            qpix_op, chroma_op);
2945
2946         qpix_op=  qpix_avg;
2947         chroma_op= chroma_avg;
2948     }
2949
2950     if(list1){
2951         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2952         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2953                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2954                            qpix_op, chroma_op);
2955     }
2956 }
2957
2958 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2959                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2960                            int x_offset, int y_offset,
2961                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2962                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2963                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2964                            int list0, int list1){
2965     MpegEncContext * const s = &h->s;
2966
2967     dest_y  += 2*x_offset + 2*y_offset*h->  mb_linesize;
2968     dest_cb +=   x_offset +   y_offset*h->mb_uvlinesize;
2969     dest_cr +=   x_offset +   y_offset*h->mb_uvlinesize;
2970     x_offset += 8*s->mb_x;
2971     y_offset += 8*(s->mb_y >> MB_MBAFF);
2972
2973     if(list0 && list1){
2974         /* don't optimize for luma-only case, since B-frames usually
2975          * use implicit weights => chroma too. */
2976         uint8_t *tmp_cb = s->obmc_scratchpad;
2977         uint8_t *tmp_cr = s->obmc_scratchpad + 8;
2978         uint8_t *tmp_y  = s->obmc_scratchpad + 8*h->mb_uvlinesize;
2979         int refn0 = h->ref_cache[0][ scan8[n] ];
2980         int refn1 = h->ref_cache[1][ scan8[n] ];
2981
2982         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2983                     dest_y, dest_cb, dest_cr,
2984                     x_offset, y_offset, qpix_put, chroma_put);
2985         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2986                     tmp_y, tmp_cb, tmp_cr,
2987                     x_offset, y_offset, qpix_put, chroma_put);
2988
2989         if(h->use_weight == 2){
2990             int weight0 = h->implicit_weight[refn0][refn1];
2991             int weight1 = 64 - weight0;
2992             luma_weight_avg(  dest_y,  tmp_y,  h->  mb_linesize, 5, weight0, weight1, 0);
2993             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, 5, weight0, weight1, 0);
2994             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, 5, weight0, weight1, 0);
2995         }else{
2996             luma_weight_avg(dest_y, tmp_y, h->mb_linesize, h->luma_log2_weight_denom,
2997                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2998                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2999             chroma_weight_avg(dest_cb, tmp_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3000                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
3001                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
3002             chroma_weight_avg(dest_cr, tmp_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3003                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
3004                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
3005         }
3006     }else{
3007         int list = list1 ? 1 : 0;
3008         int refn = h->ref_cache[list][ scan8[n] ];
3009         Picture *ref= &h->ref_list[list][refn];
3010         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
3011                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
3012                     qpix_put, chroma_put);
3013
3014         luma_weight_op(dest_y, h->mb_linesize, h->luma_log2_weight_denom,
3015                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
3016         if(h->use_weight_chroma){
3017             chroma_weight_op(dest_cb, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3018                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
3019             chroma_weight_op(dest_cr, h->mb_uvlinesize, h->chroma_log2_weight_denom,
3020                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
3021         }
3022     }
3023 }
3024
3025 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
3026                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
3027                            int x_offset, int y_offset,
3028                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
3029                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
3030                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
3031                            int list0, int list1){
3032     if((h->use_weight==2 && list0 && list1
3033         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
3034        || h->use_weight==1)
3035         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3036                          x_offset, y_offset, qpix_put, chroma_put,
3037                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
3038     else
3039         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
3040                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
3041 }
3042
3043 static inline void prefetch_motion(H264Context *h, int list){
3044     /* fetch pixels for estimated mv 4 macroblocks ahead
3045      * optimized for 64byte cache lines */
3046     MpegEncContext * const s = &h->s;
3047     const int refn = h->ref_cache[list][scan8[0]];
3048     if(refn >= 0){
3049         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
3050         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
3051         uint8_t **src= h->ref_list[list][refn].data;
3052         int off= mx + (my + (s->mb_x&3)*4)*h->mb_linesize + 64;
3053         s->dsp.prefetch(src[0]+off, s->linesize, 4);
3054         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
3055         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
3056     }
3057 }
3058
/**
 * Performs inter prediction for the current macroblock.
 *
 * Dispatches on the macroblock partitioning (16x16, 16x8, 8x16, or four 8x8
 * quadrants that may each be split into 8x4, 4x8 or 4x4 sub-partitions) and
 * calls mc_part() once per partition with the function-table entries that
 * belong to the partition size. prefetch_motion() is called for list 0
 * before and for list 1 after the prediction.
 *
 * The x/y offsets handed to mc_part() look like units of two luma pixels
 * (mc_part_weighted() doubles them for the luma pointer) -- NOTE(review):
 * confirm against mc_part_std().
 *
 * @param qpix_put/qpix_avg     luma interpolation tables, first index selects
 *                              the block size
 * @param chroma_put/chroma_avg chroma interpolation tables
 * @param weight_op/weight_avg  weighting tables; the entry at index 0..6 is
 *                              selected per partition shape below
 */
3059 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
3060                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
3061                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
3062                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
3063     MpegEncContext * const s = &h->s;
3064     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
3065     const int mb_type= s->current_picture.mb_type[mb_xy];
3066
3067     assert(IS_INTER(mb_type));
3068
3069     prefetch_motion(h, 0);
3070
    /* one partition covering the whole macroblock */
3071     if(IS_16X16(mb_type)){
3072         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
3073                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
3074                 &weight_op[0], &weight_avg[0],
3075                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    /* two partitions stacked vertically: block 0 at the top, block 8 below */
3076     }else if(IS_16X8(mb_type)){
3077         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
3078                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3079                 &weight_op[1], &weight_avg[1],
3080                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3081         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
3082                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
3083                 &weight_op[1], &weight_avg[1],
3084                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    /* two partitions side by side: block 0 on the left, block 4 on the right */
3085     }else if(IS_8X16(mb_type)){
3086         mc_part(h, 0, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 0, 0,
3087                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3088                 &weight_op[2], &weight_avg[2],
3089                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
3090         mc_part(h, 4, 0, 8, 8*h->mb_linesize, dest_y, dest_cb, dest_cr, 4, 0,
3091                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3092                 &weight_op[2], &weight_avg[2],
3093                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
3094     }else{
3095         int i;
3096
3097         assert(IS_8X8(mb_type));
3098
        /* four quadrants; quadrant i starts at block 4*i and carries its own
         * sub-type, whose prediction direction is shared by all of its
         * sub-partitions */
3099         for(i=0; i<4; i++){
3100             const int sub_mb_type= h->sub_mb_type[i];
3101             const int n= 4*i;
3102             int x_offset= (i&1)<<2;
3103             int y_offset= (i&2)<<1;
3104
3105             if(IS_SUB_8X8(sub_mb_type)){
3106                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3107                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
3108                     &weight_op[3], &weight_avg[3],
3109                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3110             }else if(IS_SUB_8X4(sub_mb_type)){
3111                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3112                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3113                     &weight_op[4], &weight_avg[4],
3114                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3115                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
3116                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
3117                     &weight_op[4], &weight_avg[4],
3118                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3119             }else if(IS_SUB_4X8(sub_mb_type)){
3120                 mc_part(h, n  , 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
3121                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3122                     &weight_op[5], &weight_avg[5],
3123                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3124                 mc_part(h, n+1, 0, 4, 4*h->mb_linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
3125                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3126                     &weight_op[5], &weight_avg[5],
3127                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3128             }else{
3129                 int j;
3130                 assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks inside the quadrant */
3131                 for(j=0; j<4; j++){
3132                     int sub_x_offset= x_offset + 2*(j&1);
3133                     int sub_y_offset= y_offset +   (j&2);
3134                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
3135                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
3136                         &weight_op[6], &weight_avg[6],
3137                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
3138                 }
3139             }
3140         }
3141     }
3142
3143     prefetch_motion(h, 1);
3144 }
3145
3146 static void decode_init_vlc(H264Context *h){
3147     static int done = 0;
3148
3149     if (!done) {
3150         int i;
3151         done = 1;
3152
3153         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
3154                  &chroma_dc_coeff_token_len [0], 1, 1,
3155                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
3156
3157         for(i=0; i<4; i++){
3158             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
3159                      &coeff_token_len [i][0], 1, 1,
3160                      &coeff_token_bits[i][0], 1, 1, 1);
3161         }
3162
3163         for(i=0; i<3; i++){
3164             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
3165                      &chroma_dc_total_zeros_len [i][0], 1, 1,
3166                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
3167         }
3168         for(i=0; i<15; i++){
3169             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
3170                      &total_zeros_len [i][0], 1, 1,
3171                      &total_zeros_bits[i][0], 1, 1, 1);
3172         }
3173
3174         for(i=0; i<6; i++){
3175             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
3176                      &run_len [i][0], 1, 1,
3177                      &run_bits[i][0], 1, 1, 1);
3178         }
3179         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
3180                  &run_len [6][0], 1, 1,
3181                  &run_bits[6][0], 1, 1, 1);
3182     }
3183 }
3184
3185 /**
3186  * Sets the intra prediction function pointers.
 *
 * Fills the four predictor tables of the context (pred4x4, pred8x8l, pred8x8
 * and pred16x16) with the *_c implementations, indexed by prediction mode.
 * pred16x16 is indexed with the same *_PRED8x8 constants as pred8x8.
3187  */
3188 static void init_pred_ptrs(H264Context *h){
3189 //    MpegEncContext * const s = &h->s;
3190
    /* 4x4 luma predictors */
3191     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
3192     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
3193     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
3194     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
3195     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
3196     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
3197     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
3198     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
3199     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
3200     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
3201     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
3202     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
3203
    /* 8x8 luma predictors, same mode indices as the 4x4 table */
3204     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
3205     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
3206     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
3207     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
3208     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
3209     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
3210     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
3211     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
3212     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
3213     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
3214     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
3215     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
3216
    /* 8x8 predictors; hl_decode_mb() applies this table to the cb and cr planes */
3217     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
3218     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
3219     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
3220     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
3221     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
3222     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
3223     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
3224
    /* 16x16 predictors, deliberately indexed with the *_PRED8x8 constants */
3225     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
3226     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
3227     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
3228     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
3229     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
3230     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
3231     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
3232 }
3233
3234 static void free_tables(H264Context *h){
3235     av_freep(&h->intra4x4_pred_mode);
3236     av_freep(&h->chroma_pred_mode_table);
3237     av_freep(&h->cbp_table);
3238     av_freep(&h->mvd_table[0]);
3239     av_freep(&h->mvd_table[1]);
3240     av_freep(&h->direct_table);
3241     av_freep(&h->non_zero_count);
3242     av_freep(&h->slice_table_base);
3243     av_freep(&h->top_borders[1]);
3244     av_freep(&h->top_borders[0]);
3245     h->slice_table= NULL;
3246
3247     av_freep(&h->mb2b_xy);
3248     av_freep(&h->mb2b8_xy);
3249
3250     av_freep(&h->s.obmc_scratchpad);
3251 }
3252
3253 static void init_dequant8_coeff_table(H264Context *h){
3254     int i,q,x;
3255     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
3256     h->dequant8_coeff[0] = h->dequant8_buffer[0];
3257     h->dequant8_coeff[1] = h->dequant8_buffer[1];
3258
3259     for(i=0; i<2; i++ ){
3260         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
3261             h->dequant8_coeff[1] = h->dequant8_buffer[0];
3262             break;
3263         }
3264
3265         for(q=0; q<52; q++){
3266             int shift = div6[q];
3267             int idx = rem6[q];
3268             for(x=0; x<64; x++)
3269                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
3270                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
3271                     h->pps.scaling_matrix8[i][x]) << shift;
3272         }
3273     }
3274 }
3275
3276 static void init_dequant4_coeff_table(H264Context *h){
3277     int i,j,q,x;
3278     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
3279     for(i=0; i<6; i++ ){
3280         h->dequant4_coeff[i] = h->dequant4_buffer[i];
3281         for(j=0; j<i; j++){
3282             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
3283                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
3284                 break;
3285             }
3286         }
3287         if(j<i)
3288             continue;
3289
3290         for(q=0; q<52; q++){
3291             int shift = div6[q] + 2;
3292             int idx = rem6[q];
3293             for(x=0; x<16; x++)
3294                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3295                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3296                     h->pps.scaling_matrix4[i][x]) << shift;
3297         }
3298     }
3299 }
3300
3301 static void init_dequant_tables(H264Context *h){
3302     int i,x;
3303     init_dequant4_coeff_table(h);
3304     if(h->pps.transform_8x8_mode)
3305         init_dequant8_coeff_table(h);
3306     if(h->sps.transform_bypass){
3307         for(i=0; i<6; i++)
3308             for(x=0; x<16; x++)
3309                 h->dequant4_coeff[i][0][x] = 1<<6;
3310         if(h->pps.transform_8x8_mode)
3311             for(i=0; i<2; i++)
3312                 for(x=0; x<64; x++)
3313                     h->dequant8_coeff[i][0][x] = 1<<6;
3314     }
3315 }
3316
3317
3318 /**
3319  * allocates tables.
3320  * needs width/height
 *
 * Sizes are derived from s->mb_stride, s->mb_height and s->mb_width, so
 * those must be set before the call. The CABAC-only tables are allocated
 * only when h->pps.cabac is set.
 *
 * NOTE(review): CHECKED_ALLOCZ is defined outside this section; it
 * presumably zero-allocates and jumps to the fail label on error -- confirm.
 *
 * @return 0 on success, -1 after free_tables() on the failure path
3321  */
3322 static int alloc_tables(H264Context *h){
3323     MpegEncContext * const s = &h->s;
    /* one macroblock row more than the picture has */
3324     const int big_mb_num= s->mb_stride * (s->mb_height+1);
3325     int x,y;
3326
3327     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3328
3329     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3330     CHECKED_ALLOCZ(h->slice_table_base  , (big_mb_num+s->mb_stride) * sizeof(uint8_t))
3331     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3332     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3333     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3334
3335     if( h->pps.cabac ) {
3336         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3337         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3338         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3339         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3340     }
3341
    /* every slice-table byte starts as 0xFF; slice_table itself begins two
     * rows plus one entry into the base buffer, so small negative offsets
     * from it still land inside the allocation */
3342     memset(h->slice_table_base, -1, (big_mb_num+s->mb_stride)  * sizeof(uint8_t));
3343     h->slice_table= h->slice_table_base + s->mb_stride*2 + 1;
3344
    /* maps from macroblock index to the index of its first 4x4 block
     * (mb2b_xy, 4 per macroblock in each direction) and first 8x8 block
     * (mb2b8_xy, 2 per macroblock in each direction) */
3345     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3346     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3347     for(y=0; y<s->mb_height; y++){
3348         for(x=0; x<s->mb_width; x++){
3349             const int mb_xy= x + y*s->mb_stride;
3350             const int b_xy = 4*x + 4*y*h->b_stride;
3351             const int b8_xy= 2*x + 2*y*h->b8_stride;
3352
3353             h->mb2b_xy [mb_xy]= b_xy;
3354             h->mb2b8_xy[mb_xy]= b8_xy;
3355         }
3356     }
3357
    /* allocated lazily in frame_start(), where the line sizes are known */
3358     s->obmc_scratchpad = NULL;
3359
    /* build the dequant tables only if no table pointer has been set yet */
3360     if(!h->dequant4_coeff[0])
3361         init_dequant_tables(h);
3362
3363     return 0;
3364 fail:
3365     free_tables(h);
3366     return -1;
3367 }
3368
3369 static void common_init(H264Context *h){
3370     MpegEncContext * const s = &h->s;
3371
3372     s->width = s->avctx->width;
3373     s->height = s->avctx->height;
3374     s->codec_id= s->avctx->codec->id;
3375
3376     init_pred_ptrs(h);
3377
3378     h->dequant_coeff_pps= -1;
3379     s->unrestricted_mv=1;
3380     s->decode=1; //FIXME
3381
3382     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3383     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
3384 }
3385
3386 static int decode_init(AVCodecContext *avctx){
3387     H264Context *h= avctx->priv_data;
3388     MpegEncContext * const s = &h->s;
3389
3390     MPV_decode_defaults(s);
3391
3392     s->avctx = avctx;
3393     common_init(h);
3394
3395     s->out_format = FMT_H264;
3396     s->workaround_bugs= avctx->workaround_bugs;
3397
3398     // set defaults
3399 //    s->decode_mb= ff_h263_decode_mb;
3400     s->low_delay= 1;
3401     avctx->pix_fmt= PIX_FMT_YUV420P;
3402
3403     decode_init_vlc(h);
3404
3405     if(avctx->extradata_size > 0 && avctx->extradata &&
3406        *(char *)avctx->extradata == 1){
3407         h->is_avc = 1;
3408         h->got_avcC = 0;
3409     } else {
3410         h->is_avc = 0;
3411     }
3412
3413     return 0;
3414 }
3415
3416 static int frame_start(H264Context *h){
3417     MpegEncContext * const s = &h->s;
3418     int i;
3419
3420     if(MPV_frame_start(s, s->avctx) < 0)
3421         return -1;
3422     ff_er_frame_start(s);
3423
3424     assert(s->linesize && s->uvlinesize);
3425
3426     for(i=0; i<16; i++){
3427         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3428         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3429     }
3430     for(i=0; i<4; i++){
3431         h->block_offset[16+i]=
3432         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3433         h->block_offset[24+16+i]=
3434         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3435     }
3436
3437     /* can't be in alloc_tables because linesize isn't known there.
3438      * FIXME: redo bipred weight to not require extra buffer? */
3439     if(!s->obmc_scratchpad)
3440         s->obmc_scratchpad = av_malloc(16*2*s->linesize + 8*2*s->uvlinesize);
3441
3442     /* some macroblocks will be accessed before they're available */
3443     if(FRAME_MBAFF)
3444         memset(h->slice_table, -1, (s->mb_height*s->mb_stride-1) * sizeof(uint8_t));
3445
3446 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3447     return 0;
3448 }
3449
/**
 * Saves the right column and the bottom row of the macroblock at
 * src_y/src_cb/src_cr into h->left_border and h->top_borders[0][mb_x].
 *
 * left_border layout written here: [0] previous top-border corner sample,
 * [1..16] right luma column, [17] cb corner, [18..25] right cb column,
 * [26] cr corner, [27..34] right cr column. The corner entries are read from
 * the old top border before it is overwritten.
 * top_borders[0][mb_x] layout: bytes 0..15 luma, 16..23 cb, 24..31 cr.
 * The chroma part is skipped when CODEC_FLAG_GRAY is set.
 */
3450 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3451     MpegEncContext * const s = &h->s;
3452     int i;
3453
    /* step one row up so that row index i below addresses macroblock row i-1 */
3454     src_y  -=   linesize;
3455     src_cb -= uvlinesize;
3456     src_cr -= uvlinesize;
3457
3458     // There are two lines saved, the line above the top macroblock of a pair,
3459     // and the line above the bottom macroblock
3460     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3461     for(i=1; i<17; i++){
3462         h->left_border[i]= src_y[15+i*  linesize];
3463     }
3464
    /* last luma row of the macroblock, copied as two 8-byte words */
3465     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3466     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3467
3468     if(!(s->flags&CODEC_FLAG_GRAY)){
3469         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3470         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3471         for(i=1; i<9; i++){
3472             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3473             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3474         }
3475         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3476         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3477     }
3478 }
3479
/**
 * Exchanges the samples to the left of and above the macroblock in the
 * picture with the copies kept in h->left_border and h->top_borders[0].
 *
 * The left column is touched only when mb_x > 0 and the top row only when
 * mb_y > 0; the chroma planes are skipped when CODEC_FLAG_GRAY is set.
 *
 * @param xchg when non-zero, the left column and the first 8 top luma bytes
 *             are swapped in both directions; when zero, only the picture
 *             side is written for them. All other top-row words are always
 *             swapped in both directions.
 */
3480 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3481     MpegEncContext * const s = &h->s;
3482     int temp8, i;
3483     uint64_t temp64;
3484     int deblock_left = (s->mb_x > 0);
3485     int deblock_top  = (s->mb_y > 0);
3486
    /* move to the sample diagonally above-left of the macroblock */
3487     src_y  -=   linesize + 1;
3488     src_cb -= uvlinesize + 1;
3489     src_cr -= uvlinesize + 1;
3490
/* XCHG: b always receives the old value of a; a receives the old value of b
 * only when the xchg argument is non-zero. t is scratch storage. */
3491 #define XCHG(a,b,t,xchg)\
3492 t= a;\
3493 if(xchg)\
3494     a= b;\
3495 b= t;
3496
3497     if(deblock_left){
        /* start at row 1 when there is no row above, which skips the corner sample */
3498         for(i = !deblock_top; i<17; i++){
3499             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3500         }
3501     }
3502
3503     if(deblock_top){
3504         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3505         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        /* first 8 bytes above the next macroblock, unless this is the last column */
3506         if(s->mb_x+1 < s->mb_width){
3507             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3508         }
3509     }
3510
3511     if(!(s->flags&CODEC_FLAG_GRAY)){
3512         if(deblock_left){
3513             for(i = !deblock_top; i<9; i++){
3514                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3515                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3516             }
3517         }
3518         if(deblock_top){
3519             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3520             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3521         }
3522     }
3523 }
3524
/**
 * Macroblock-pair variant of backup_mb_border(): saves the right column
 * (32 luma rows, 16 rows per chroma plane) and the last two rows of the pair
 * into h->left_border, h->top_borders[0][mb_x] and h->top_borders[1][mb_x].
 *
 * left_border layout written here: [0..1] previous corner samples from the
 * two top borders, [2..33] right luma column, [34..35] cb corners,
 * [36..51] right cb column, [52..53] cr corners, [54..69] right cr column.
 * top_borders[0] receives the second-to-last row of the pair and
 * top_borders[1] the last row. The chroma part is skipped when
 * CODEC_FLAG_GRAY is set.
 */
3525 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3526     MpegEncContext * const s = &h->s;
3527     int i;
3528
    /* step two rows up so that row index i below addresses pair row i-2 */
3529     src_y  -= 2 *   linesize;
3530     src_cb -= 2 * uvlinesize;
3531     src_cr -= 2 * uvlinesize;
3532
3533     // There are two lines saved, the line above the top macroblock of a pair,
3534     // and the line above the bottom macroblock
3535     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3536     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3537     for(i=2; i<34; i++){
3538         h->left_border[i]= src_y[15+i*  linesize];
3539     }
3540
3541     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3542     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3543     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3544     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3545
3546     if(!(s->flags&CODEC_FLAG_GRAY)){
3547         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3548         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3549         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3550         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3551         for(i=2; i<18; i++){
3552             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3553             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3554         }
3555         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3556         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3557         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3558         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3559     }
3560 }
3561
/**
 * Macroblock-pair variant of xchg_mb_border(): exchanges the samples left of
 * and in the two rows above the pair with the copies kept in h->left_border
 * and h->top_borders[0..1].
 *
 * The left column is touched only when mb_x > 0 and the top rows only when
 * mb_y > 1; the chroma planes are skipped when CODEC_FLAG_GRAY is set.
 *
 * @param xchg when non-zero, the left column and the first 8 luma bytes of
 *             each top row are swapped in both directions; when zero, only
 *             the picture side is written for them. All other top-row words
 *             are always swapped in both directions.
 */
3562 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3563     MpegEncContext * const s = &h->s;
3564     int temp8, i;
3565     uint64_t temp64;
3566     int deblock_left = (s->mb_x > 0);
3567     int deblock_top  = (s->mb_y > 1);
3568
3569     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3570
    /* move two rows up and one sample left of the pair */
3571     src_y  -= 2 *   linesize + 1;
3572     src_cb -= 2 * uvlinesize + 1;
3573     src_cr -= 2 * uvlinesize + 1;
3574
/* XCHG: b always receives the old value of a; a receives the old value of b
 * only when the xchg argument is non-zero. t is scratch storage. */
3575 #define XCHG(a,b,t,xchg)\
3576 t= a;\
3577 if(xchg)\
3578     a= b;\
3579 b= t;
3580
3581     if(deblock_left){
        /* start at row 2 when there are no rows above, which skips both corner samples */
3582         for(i = (!deblock_top)<<1; i<34; i++){
3583             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3584         }
3585     }
3586
3587     if(deblock_top){
3588         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3589         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3590         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3591         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
        /* first 8 bytes above the next pair, unless this is the last column */
3592         if(s->mb_x+1 < s->mb_width){
3593             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3594             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x+1]), *(uint64_t*)(src_y +17 +linesize), temp64, 1);
3595         }
3596     }
3597
3598     if(!(s->flags&CODEC_FLAG_GRAY)){
3599         if(deblock_left){
3600             for(i = (!deblock_top) << 1; i<18; i++){
3601                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3602                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3603             }
3604         }
3605         if(deblock_top){
3606             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3607             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3608             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3609             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3610         }
3611     }
3612 }
3613
3614 static void hl_decode_mb(H264Context *h){
3615     MpegEncContext * const s = &h->s;
3616     const int mb_x= s->mb_x;
3617     const int mb_y= s->mb_y;
3618     const int mb_xy= mb_x + mb_y*s->mb_stride;
3619     const int mb_type= s->current_picture.mb_type[mb_xy];
3620     uint8_t  *dest_y, *dest_cb, *dest_cr;
3621     int linesize, uvlinesize /*dct_offset*/;
3622     int i;
3623     int *block_offset = &h->block_offset[0];
3624     const unsigned int bottom = mb_y & 1;
3625     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3626     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3627     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3628
3629     if(!s->decode)
3630         return;
3631
3632     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3633     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3634     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3635
3636     if (MB_FIELD) {
3637         linesize   = h->mb_linesize   = s->linesize * 2;
3638         uvlinesize = h->mb_uvlinesize = s->uvlinesize * 2;
3639         block_offset = &h->block_offset[24];
3640         if(mb_y&1){ //FIXME move out of this func?
3641             dest_y -= s->linesize*15;
3642             dest_cb-= s->uvlinesize*7;
3643             dest_cr-= s->uvlinesize*7;
3644         }
3645         if(FRAME_MBAFF) {
3646             int list;
3647             for(list=0; list<2; list++){
3648                 if(!USES_LIST(mb_type, list))
3649                     continue;
3650                 if(IS_16X16(mb_type)){
3651                     int8_t *ref = &h->ref_cache[list][scan8[0]];
3652                     fill_rectangle(ref, 4, 4, 8, 16+*ref^(s->mb_y&1), 1);
3653                 }else{
3654                     for(i=0; i<16; i+=4){
3655                         //FIXME can refs be smaller than 8x8 when !direct_8x8_inference ?
3656                         int ref = h->ref_cache[list][scan8[i]];
3657                         if(ref >= 0)
3658                             fill_rectangle(&h->ref_cache[list][scan8[i]], 2, 2, 8, 16+ref^(s->mb_y&1), 1);
3659                     }
3660                 }
3661             }
3662         }
3663     } else {
3664         linesize   = h->mb_linesize   = s->linesize;
3665         uvlinesize = h->mb_uvlinesize = s->uvlinesize;
3666 //        dct_offset = s->linesize * 16;
3667     }
3668
3669     if(transform_bypass){
3670         idct_dc_add =
3671         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3672     }else if(IS_8x8DCT(mb_type)){
3673         idct_dc_add = s->dsp.h264_idct8_dc_add;
3674         idct_add = s->dsp.h264_idct8_add;
3675     }else{
3676         idct_dc_add = s->dsp.h264_idct_dc_add;
3677         idct_add = s->dsp.h264_idct_add;
3678     }
3679
3680     if(FRAME_MBAFF && h->deblocking_filter && IS_INTRA(mb_type)
3681        && (!bottom || !IS_INTRA(s->current_picture.mb_type[mb_xy-s->mb_stride]))){
3682         int mbt_y = mb_y&~1;
3683         uint8_t *top_y  = s->current_picture.data[0] + (mbt_y * 16* s->linesize  ) + mb_x * 16;
3684         uint8_t *top_cb = s->current_picture.data[1] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3685         uint8_t *top_cr = s->current_picture.data[2] + (mbt_y * 8 * s->uvlinesize) + mb_x * 8;
3686         xchg_pair_border(h, top_y, top_cb, top_cr, s->linesize, s->uvlinesize, 1);
3687     }
3688
3689     if (IS_INTRA_PCM(mb_type)) {
3690         unsigned int x, y;
3691
3692         // The pixels are stored in h->mb array in the same order as levels,
3693         // copy them in output in the correct order.
3694         for(i=0; i<16; i++) {
3695             for (y=0; y<4; y++) {
3696                 for (x=0; x<4; x++) {
3697                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3698                 }
3699             }
3700         }
3701         for(i=16; i<16+4; i++) {
3702             for (y=0; y<4; y++) {
3703                 for (x=0; x<4; x++) {
3704                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3705                 }
3706             }
3707         }
3708         for(i=20; i<20+4; i++) {
3709             for (y=0; y<4; y++) {
3710                 for (x=0; x<4; x++) {
3711                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3712                 }
3713             }
3714         }
3715     } else {
3716         if(IS_INTRA(mb_type)){
3717             if(h->deblocking_filter && !FRAME_MBAFF)
3718                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3719
3720             if(!(s->flags&CODEC_FLAG_GRAY)){
3721                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3722                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3723             }
3724
3725             if(IS_INTRA4x4(mb_type)){
3726                 if(!s->encoding){
3727                     if(IS_8x8DCT(mb_type)){
3728                         for(i=0; i<16; i+=4){
3729                             uint8_t * const ptr= dest_y + block_offset[i];
3730                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3731                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3732                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3733                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
3734                             if(nnz){
3735                                 if(nnz == 1 && h->mb[i*16])
3736                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3737                                 else
3738                                     idct_add(ptr, h->mb + i*16, linesize);
3739                             }
3740                         }
3741                     }else
3742                     for(i=0; i<16; i++){
3743                         uint8_t * const ptr= dest_y + block_offset[i];
3744                         uint8_t *topright;
3745                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3746                         int nnz, tr;
3747
3748                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3749                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3750                             assert(mb_y || linesize <= block_offset[i]);
3751                             if(!topright_avail){
3752                                 tr= ptr[3 - linesize]*0x01010101;
3753                                 topright= (uint8_t*) &tr;
3754                             }else
3755                                 topright= ptr + 4 - linesize;
3756                         }else
3757                             topright= NULL;
3758
3759                         h->pred4x4[ dir ](ptr, topright, linesize);
3760                         nnz = h->non_zero_count_cache[ scan8[i] ];
3761                         if(nnz){
3762                             if(s->codec_id == CODEC_ID_H264){
3763                                 if(nnz == 1 && h->mb[i*16])
3764                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3765                                 else
3766                                     idct_add(ptr, h->mb + i*16, linesize);
3767                             }else
3768                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3769                         }
3770                     }
3771                 }
3772             }else{
3773                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3774                 if(s->codec_id == CODEC_ID_H264){
3775                     if(!transform_bypass)
3776                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3777                 }else
3778                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3779             }
3780             if(h->deblocking_filter && !FRAME_MBAFF)
3781                 xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3782         }else if(s->codec_id == CODEC_ID_H264){
3783             hl_motion(h, dest_y, dest_cb, dest_cr,
3784                       s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
3785                       s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
3786                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3787         }
3788
3789
3790         if(!IS_INTRA4x4(mb_type)){
3791             if(s->codec_id == CODEC_ID_H264){
3792                 if(IS_INTRA16x16(mb_type)){
3793                     for(i=0; i<16; i++){
3794                         if(h->non_zero_count_cache[ scan8[i] ])
3795                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3796                         else if(h->mb[i*16])
3797                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3798                     }
3799                 }else{
3800                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3801                     for(i=0; i<16; i+=di){
3802                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3803                         if(nnz){
3804                             if(nnz==1 && h->mb[i*16])
3805                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3806                             else
3807                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3808                         }
3809                     }
3810                 }
3811             }else{
3812                 for(i=0; i<16; i++){
3813                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3814                         uint8_t * const ptr= dest_y + block_offset[i];
3815                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3816                     }
3817                 }
3818             }
3819         }
3820
3821         if(!(s->flags&CODEC_FLAG_GRAY)){
3822             uint8_t *dest[2] = {dest_cb, dest_cr};
3823             if(transform_bypass){
3824                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3825             }else{
3826                 idct_add = s->dsp.h264_idct_add;
3827                 idct_dc_add = s->dsp.h264_idct_dc_add;
3828                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3829                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3830             }
3831             if(s->codec_id == CODEC_ID_H264){
3832                 for(i=16; i<16+8; i++){
3833                     if(h->non_zero_count_cache[ scan8[i] ])
3834                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3835                     else if(h->mb[i*16])
3836                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3837                 }
3838             }else{
3839                 for(i=16; i<16+8; i++){
3840                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3841                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3842                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3843                     }
3844                 }
3845             }
3846         }
3847     }
3848     if(h->deblocking_filter) {
3849         if (FRAME_MBAFF) {
3850             //FIXME try deblocking one mb at a time?
3851             // the reduction in load/storing mvs and such might outweigh the extra backup/xchg_border
3852             const int mb_y = s->mb_y - 1;
3853             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3854             const int mb_xy= mb_x + mb_y*s->mb_stride;
3855             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3856             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3857             if (!bottom) return;
3858             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3859             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3860             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3861
3862             if(IS_INTRA(mb_type_top | mb_type_bottom))
3863                 xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3864
3865             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3866             // deblock a pair
3867             // top
3868             s->mb_y--;
3869             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3870             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3871             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy]);
3872             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3873             // bottom
3874             s->mb_y++;
3875             tprintf("call mbaff filter_mb\n");
3876             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3877             h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy+s->mb_stride]);
3878             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3879         } else {
3880             tprintf("call filter_mb\n");
3881             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3882             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3883             filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3884         }
3885     }
3886 }
3887
3888 /**
3889  * fills the default_ref_list.
3890  */
3891 static int fill_default_ref_list(H264Context *h){
3892     MpegEncContext * const s = &h->s;
3893     int i;
3894     int smallest_poc_greater_than_current = -1;
3895     Picture sorted_short_ref[32];
3896
3897     if(h->slice_type==B_TYPE){
3898         int out_i;
3899         int limit= INT_MIN;
3900
3901         /* sort frame according to poc in B slice */
3902         for(out_i=0; out_i<h->short_ref_count; out_i++){
3903             int best_i=INT_MIN;
3904             int best_poc=INT_MAX;
3905
3906             for(i=0; i<h->short_ref_count; i++){
3907                 const int poc= h->short_ref[i]->poc;
3908                 if(poc > limit && poc < best_poc){
3909                     best_poc= poc;
3910                     best_i= i;
3911                 }
3912             }
3913
3914             assert(best_i != INT_MIN);
3915
3916             limit= best_poc;
3917             sorted_short_ref[out_i]= *h->short_ref[best_i];
3918             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3919             if (-1 == smallest_poc_greater_than_current) {
3920                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3921                     smallest_poc_greater_than_current = out_i;
3922                 }
3923             }
3924         }
3925     }
3926
3927     if(s->picture_structure == PICT_FRAME){
3928         if(h->slice_type==B_TYPE){
3929             int list;
3930             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3931
3932             // find the largest poc
3933             for(list=0; list<2; list++){
3934                 int index = 0;
3935                 int j= -99;
3936                 int step= list ? -1 : 1;
3937
3938                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3939                     while(j<0 || j>= h->short_ref_count){
3940                         if(j != -99 && step == (list ? -1 : 1))
3941                             return -1;
3942                         step = -step;
3943                         j= smallest_poc_greater_than_current + (step>>1);
3944                     }
3945                     if(sorted_short_ref[j].reference != 3) continue;
3946                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3947                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3948                 }
3949
3950                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3951                     if(h->long_ref[i] == NULL) continue;
3952                     if(h->long_ref[i]->reference != 3) continue;
3953
3954                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3955                     h->default_ref_list[ list ][index++].pic_id= i;;
3956                 }
3957
3958                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3959                     // swap the two first elements of L1 when
3960                     // L0 and L1 are identical
3961                     Picture temp= h->default_ref_list[1][0];
3962                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3963                     h->default_ref_list[1][1] = temp;
3964                 }
3965
3966                 if(index < h->ref_count[ list ])
3967                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3968             }
3969         }else{
3970             int index=0;
3971             for(i=0; i<h->short_ref_count; i++){
3972                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3973                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3974                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3975             }
3976             for(i = 0; i < 16; i++){
3977                 if(h->long_ref[i] == NULL) continue;
3978                 if(h->long_ref[i]->reference != 3) continue;
3979                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3980                 h->default_ref_list[0][index++].pic_id= i;;
3981             }
3982             if(index < h->ref_count[0])
3983                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3984         }
3985     }else{ //FIELD
3986         if(h->slice_type==B_TYPE){
3987         }else{
3988             //FIXME second field balh
3989         }
3990     }
3991 #ifdef TRACE
3992     for (i=0; i<h->ref_count[0]; i++) {
3993         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3994     }
3995     if(h->slice_type==B_TYPE){
3996         for (i=0; i<h->ref_count[1]; i++) {
3997             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3998         }
3999     }
4000 #endif
4001     return 0;
4002 }
4003
4004 static void print_short_term(H264Context *h);
4005 static void print_long_term(H264Context *h);
4006
4007 static int decode_ref_pic_list_reordering(H264Context *h){
4008     MpegEncContext * const s = &h->s;
4009     int list, index;
4010
4011     print_short_term(h);
4012     print_long_term(h);
4013     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
4014
4015     for(list=0; list<2; list++){
4016         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
4017
4018         if(get_bits1(&s->gb)){
4019             int pred= h->curr_pic_num;
4020
4021             for(index=0; ; index++){
4022                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
4023                 int pic_id;
4024                 int i;
4025                 Picture *ref = NULL;
4026
4027                 if(reordering_of_pic_nums_idc==3)
4028                     break;
4029
4030                 if(index >= h->ref_count[list]){
4031                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
4032                     return -1;
4033                 }
4034
4035                 if(reordering_of_pic_nums_idc<3){
4036                     if(reordering_of_pic_nums_idc<2){
4037                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
4038
4039                         if(abs_diff_pic_num >= h->max_pic_num){
4040                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
4041                             return -1;
4042                         }
4043
4044                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
4045                         else                                pred+= abs_diff_pic_num;
4046                         pred &= h->max_pic_num - 1;
4047
4048                         for(i= h->short_ref_count-1; i>=0; i--){
4049                             ref = h->short_ref[i];
4050                             assert(ref->reference == 3);
4051                             assert(!ref->long_ref);
4052                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
4053                                 break;
4054                         }
4055                         if(i>=0)
4056                             ref->pic_id= ref->frame_num;
4057                     }else{
4058                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
4059                         ref = h->long_ref[pic_id];
4060                         ref->pic_id= pic_id;
4061                         assert(ref->reference == 3);
4062                         assert(ref->long_ref);
4063                         i=0;
4064                     }
4065
4066                     if (i < 0) {
4067                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
4068                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
4069                     } else {
4070                         for(i=index; i+1<h->ref_count[list]; i++){
4071                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
4072                                 break;
4073                         }
4074                         for(; i > index; i--){
4075                             h->ref_list[list][i]= h->ref_list[list][i-1];
4076                         }
4077                         h->ref_list[list][index]= *ref;
4078                     }
4079                 }else{
4080                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
4081                     return -1;
4082                 }
4083             }
4084         }
4085
4086         if(h->slice_type!=B_TYPE) break;
4087     }
4088     for(list=0; list<2; list++){
4089         for(index= 0; index < h->ref_count[list]; index++){
4090             if(!h->ref_list[list][index].data[0])
4091                 h->ref_list[list][index]= s->current_picture;
4092         }
4093         if(h->slice_type!=B_TYPE) break;
4094     }
4095
4096     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
4097         direct_dist_scale_factor(h);
4098     direct_ref_list_init(h);
4099     return 0;
4100 }
4101
4102 static void fill_mbaff_ref_list(H264Context *h){
4103     int list, i, j;
4104     for(list=0; list<2; list++){
4105         for(i=0; i<h->ref_count[list]; i++){
4106             Picture *frame = &h->ref_list[list][i];
4107             Picture *field = &h->ref_list[list][16+2*i];
4108             field[0] = *frame;
4109             for(j=0; j<3; j++)
4110                 field[0].linesize[j] <<= 1;
4111             field[1] = field[0];
4112             for(j=0; j<3; j++)
4113                 field[1].data[j] += frame->linesize[j];
4114
4115             h->luma_weight[list][16+2*i] = h->luma_weight[list][16+2*i+1] = h->luma_weight[list][i];
4116             h->luma_offset[list][16+2*i] = h->luma_offset[list][16+2*i+1] = h->luma_offset[list][i];
4117             for(j=0; j<2; j++){
4118                 h->chroma_weight[list][16+2*i][j] = h->chroma_weight[list][16+2*i+1][j] = h->chroma_weight[list][i][j];
4119                 h->chroma_offset[list][16+2*i][j] = h->chroma_offset[list][16+2*i+1][j] = h->chroma_offset[list][i][j];
4120             }
4121         }
4122     }
4123     for(j=0; j<h->ref_count[1]; j++){
4124         for(i=0; i<h->ref_count[0]; i++)
4125             h->implicit_weight[j][16+2*i] = h->implicit_weight[j][16+2*i+1] = h->implicit_weight[j][i];
4126         memcpy(h->implicit_weight[16+2*j],   h->implicit_weight[j], sizeof(*h->implicit_weight));
4127         memcpy(h->implicit_weight[16+2*j+1], h->implicit_weight[j], sizeof(*h->implicit_weight));
4128     }
4129 }
4130
4131 static int pred_weight_table(H264Context *h){
4132     MpegEncContext * const s = &h->s;
4133     int list, i;
4134     int luma_def, chroma_def;
4135
4136     h->use_weight= 0;
4137     h->use_weight_chroma= 0;
4138     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
4139     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
4140     luma_def = 1<<h->luma_log2_weight_denom;
4141     chroma_def = 1<<h->chroma_log2_weight_denom;
4142
4143     for(list=0; list<2; list++){
4144         for(i=0; i<h->ref_count[list]; i++){
4145             int luma_weight_flag, chroma_weight_flag;
4146
4147             luma_weight_flag= get_bits1(&s->gb);
4148             if(luma_weight_flag){
4149                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
4150                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
4151                 if(   h->luma_weight[list][i] != luma_def
4152                    || h->luma_offset[list][i] != 0)
4153                     h->use_weight= 1;
4154             }else{
4155                 h->luma_weight[list][i]= luma_def;
4156                 h->luma_offset[list][i]= 0;
4157             }
4158
4159             chroma_weight_flag= get_bits1(&s->gb);
4160             if(chroma_weight_flag){
4161                 int j;
4162                 for(j=0; j<2; j++){
4163                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
4164                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
4165                     if(   h->chroma_weight[list][i][j] != chroma_def
4166                        || h->chroma_offset[list][i][j] != 0)
4167                         h->use_weight_chroma= 1;
4168                 }
4169             }else{
4170                 int j;
4171                 for(j=0; j<2; j++){
4172                     h->chroma_weight[list][i][j]= chroma_def;
4173                     h->chroma_offset[list][i][j]= 0;
4174                 }
4175             }
4176         }
4177         if(h->slice_type != B_TYPE) break;
4178     }
4179     h->use_weight= h->use_weight || h->use_weight_chroma;
4180     return 0;
4181 }
4182
4183 static void implicit_weight_table(H264Context *h){
4184     MpegEncContext * const s = &h->s;
4185     int ref0, ref1;
4186     int cur_poc = s->current_picture_ptr->poc;
4187
4188     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
4189        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
4190         h->use_weight= 0;
4191         h->use_weight_chroma= 0;
4192         return;
4193     }
4194
4195     h->use_weight= 2;
4196     h->use_weight_chroma= 2;
4197     h->luma_log2_weight_denom= 5;
4198     h->chroma_log2_weight_denom= 5;
4199
4200     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
4201         int poc0 = h->ref_list[0][ref0].poc;
4202         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
4203             int poc1 = h->ref_list[1][ref1].poc;
4204             int td = clip(poc1 - poc0, -128, 127);
4205             if(td){
4206                 int tb = clip(cur_poc - poc0, -128, 127);
4207                 int tx = (16384 + (ABS(td) >> 1)) / td;
4208                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
4209                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
4210                     h->implicit_weight[ref0][ref1] = 32;
4211                 else
4212                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
4213             }else
4214                 h->implicit_weight[ref0][ref1] = 32;
4215         }
4216     }
4217 }
4218
4219 static inline void unreference_pic(H264Context *h, Picture *pic){
4220     int i;
4221     pic->reference=0;
4222     if(pic == h->delayed_output_pic)
4223         pic->reference=1;
4224     else{
4225         for(i = 0; h->delayed_pic[i]; i++)
4226             if(pic == h->delayed_pic[i]){
4227                 pic->reference=1;
4228                 break;
4229             }
4230     }
4231 }
4232
4233 /**
4234  * instantaneous decoder refresh.
4235  */
4236 static void idr(H264Context *h){
4237     int i;
4238
4239     for(i=0; i<16; i++){
4240         if (h->long_ref[i] != NULL) {
4241             unreference_pic(h, h->long_ref[i]);
4242             h->long_ref[i]= NULL;
4243         }
4244     }
4245     h->long_ref_count=0;
4246
4247     for(i=0; i<h->short_ref_count; i++){
4248         unreference_pic(h, h->short_ref[i]);
4249         h->short_ref[i]= NULL;
4250     }
4251     h->short_ref_count=0;
4252 }
4253
4254 /* forget old pics after a seek */
4255 static void flush_dpb(AVCodecContext *avctx){
4256     H264Context *h= avctx->priv_data;
4257     int i;
4258     for(i=0; i<16; i++) {
4259         if(h->delayed_pic[i])
4260             h->delayed_pic[i]->reference= 0;
4261         h->delayed_pic[i]= NULL;
4262     }
4263     if(h->delayed_output_pic)
4264         h->delayed_output_pic->reference= 0;
4265     h->delayed_output_pic= NULL;
4266     idr(h);
4267     if(h->s.current_picture_ptr)
4268         h->s.current_picture_ptr->reference= 0;
4269 }
4270
4271 /**
4272  *
4273  * @return the removed picture or NULL if an error occurs
4274  */
4275 static Picture * remove_short(H264Context *h, int frame_num){
4276     MpegEncContext * const s = &h->s;
4277     int i;
4278
4279     if(s->avctx->debug&FF_DEBUG_MMCO)
4280         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
4281
4282     for(i=0; i<h->short_ref_count; i++){
4283         Picture *pic= h->short_ref[i];
4284         if(s->avctx->debug&FF_DEBUG_MMCO)
4285             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
4286         if(pic->frame_num == frame_num){
4287             h->short_ref[i]= NULL;
4288             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
4289             h->short_ref_count--;
4290             return pic;
4291         }
4292     }
4293     return NULL;
4294 }
4295
4296 /**
4297  *
4298  * @return the removed picture or NULL if an error occurs
4299  */
4300 static Picture * remove_long(H264Context *h, int i){
4301     Picture *pic;
4302
4303     pic= h->long_ref[i];
4304     h->long_ref[i]= NULL;
4305     if(pic) h->long_ref_count--;
4306
4307     return pic;
4308 }
4309
4310 /**
4311  * print short term list
4312  */
4313 static void print_short_term(H264Context *h) {
4314     uint32_t i;
4315     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4316         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
4317         for(i=0; i<h->short_ref_count; i++){
4318             Picture *pic= h->short_ref[i];
4319             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4320         }
4321     }
4322 }
4323
4324 /**
4325  * print long term list
4326  */
4327 static void print_long_term(H264Context *h) {
4328     uint32_t i;
4329     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4330         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4331         for(i = 0; i < 16; i++){
4332             Picture *pic= h->long_ref[i];
4333             if (pic) {
4334                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4335             }
4336         }
4337     }
4338 }
4339
4340 /**
4341  * Executes the reference picture marking (memory management control operations).
4342  */
4343 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4344     MpegEncContext * const s = &h->s;
4345     int i, j;
4346     int current_is_long=0;
4347     Picture *pic;
4348
4349     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4350         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4351
4352     for(i=0; i<mmco_count; i++){
4353         if(s->avctx->debug&FF_DEBUG_MMCO)
4354             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4355
4356         switch(mmco[i].opcode){
4357         case MMCO_SHORT2UNUSED:
4358             pic= remove_short(h, mmco[i].short_frame_num);
4359             if(pic)
4360                 unreference_pic(h, pic);
4361             else if(s->avctx->debug&FF_DEBUG_MMCO)
4362                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4363             break;
4364         case MMCO_SHORT2LONG:
4365             pic= remove_long(h, mmco[i].long_index);
4366             if(pic) unreference_pic(h, pic);
4367
4368             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4369             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4370             h->long_ref_count++;
4371             break;
4372         case MMCO_LONG2UNUSED:
4373             pic= remove_long(h, mmco[i].long_index);
4374             if(pic)
4375                 unreference_pic(h, pic);
4376             else if(s->avctx->debug&FF_DEBUG_MMCO)
4377                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4378             break;
4379         case MMCO_LONG:
4380             pic= remove_long(h, mmco[i].long_index);
4381             if(pic) unreference_pic(h, pic);
4382
4383             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4384             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4385             h->long_ref_count++;
4386
4387             current_is_long=1;
4388             break;
4389         case MMCO_SET_MAX_LONG:
4390             assert(mmco[i].long_index <= 16);
4391             // just remove the long term which index is greater than new max
4392             for(j = mmco[i].long_index; j<16; j++){
4393                 pic = remove_long(h, j);
4394                 if (pic) unreference_pic(h, pic);
4395             }
4396             break;
4397         case MMCO_RESET:
4398             while(h->short_ref_count){
4399                 pic= remove_short(h, h->short_ref[0]->frame_num);
4400                 unreference_pic(h, pic);
4401             }
4402             for(j = 0; j < 16; j++) {
4403                 pic= remove_long(h, j);
4404                 if(pic) unreference_pic(h, pic);
4405             }
4406             break;
4407         default: assert(0);
4408         }
4409     }
4410
4411     if(!current_is_long){
4412         pic= remove_short(h, s->current_picture_ptr->frame_num);
4413         if(pic){
4414             unreference_pic(h, pic);
4415             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4416         }
4417
4418         if(h->short_ref_count)
4419             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4420
4421         h->short_ref[0]= s->current_picture_ptr;
4422         h->short_ref[0]->long_ref=0;
4423         h->short_ref_count++;
4424     }
4425
4426     print_short_term(h);
4427     print_long_term(h);
4428     return 0;
4429 }
4430
4431 static int decode_ref_pic_marking(H264Context *h){
4432     MpegEncContext * const s = &h->s;
4433     int i;
4434
4435     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4436         s->broken_link= get_bits1(&s->gb) -1;
4437         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4438         if(h->mmco[0].long_index == -1)
4439             h->mmco_index= 0;
4440         else{
4441             h->mmco[0].opcode= MMCO_LONG;
4442             h->mmco_index= 1;
4443         }
4444     }else{
4445         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4446             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4447                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4448
4449                 h->mmco[i].opcode= opcode;
4450                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4451                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4452 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4453                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4454                         return -1;
4455                     }*/
4456                 }
4457                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4458                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
4459                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
4460                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4461                         return -1;
4462                     }
4463                 }
4464
4465                 if(opcode > MMCO_LONG){
4466                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4467                     return -1;
4468                 }
4469                 if(opcode == MMCO_END)
4470                     break;
4471             }
4472             h->mmco_index= i;
4473         }else{
4474             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4475
4476             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4477                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4478                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4479                 h->mmco_index= 1;
4480             }else
4481                 h->mmco_index= 0;
4482         }
4483     }
4484
4485     return 0;
4486 }
4487
4488 static int init_poc(H264Context *h){
4489     MpegEncContext * const s = &h->s;
4490     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4491     int field_poc[2];
4492
4493     if(h->nal_unit_type == NAL_IDR_SLICE){
4494         h->frame_num_offset= 0;
4495     }else{
4496         if(h->frame_num < h->prev_frame_num)
4497             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4498         else
4499             h->frame_num_offset= h->prev_frame_num_offset;
4500     }
4501
4502     if(h->sps.poc_type==0){
4503         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4504
4505         if(h->nal_unit_type == NAL_IDR_SLICE){
4506              h->prev_poc_msb=
4507              h->prev_poc_lsb= 0;
4508         }
4509
4510         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4511             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4512         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4513             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4514         else
4515             h->poc_msb = h->prev_poc_msb;
4516 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4517         field_poc[0] =
4518         field_poc[1] = h->poc_msb + h->poc_lsb;
4519         if(s->picture_structure == PICT_FRAME)
4520             field_poc[1] += h->delta_poc_bottom;
4521     }else if(h->sps.poc_type==1){
4522         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4523         int i;
4524
4525         if(h->sps.poc_cycle_length != 0)
4526             abs_frame_num = h->frame_num_offset + h->frame_num;
4527         else
4528             abs_frame_num = 0;
4529
4530         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4531             abs_frame_num--;
4532
4533         expected_delta_per_poc_cycle = 0;
4534         for(i=0; i < h->sps.poc_cycle_length; i++)
4535             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4536
4537         if(abs_frame_num > 0){
4538             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4539             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4540
4541             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4542             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4543                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4544         } else
4545             expectedpoc = 0;
4546
4547         if(h->nal_ref_idc == 0)
4548             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4549
4550         field_poc[0] = expectedpoc + h->delta_poc[0];
4551         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4552
4553         if(s->picture_structure == PICT_FRAME)
4554             field_poc[1] += h->delta_poc[1];
4555     }else{
4556         int poc;
4557         if(h->nal_unit_type == NAL_IDR_SLICE){
4558             poc= 0;
4559         }else{
4560             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4561             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4562         }
4563         field_poc[0]= poc;
4564         field_poc[1]= poc;
4565     }
4566
4567     if(s->picture_structure != PICT_BOTTOM_FIELD)
4568         s->current_picture_ptr->field_poc[0]= field_poc[0];
4569     if(s->picture_structure != PICT_TOP_FIELD)
4570         s->current_picture_ptr->field_poc[1]= field_poc[1];
4571     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4572         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4573
4574     return 0;
4575 }
4576
4577 /**
4578  * decodes a slice header.
4579  * this will allso call MPV_common_init() and frame_start() as needed
4580  */
4581 static int decode_slice_header(H264Context *h){
4582     MpegEncContext * const s = &h->s;
4583     int first_mb_in_slice, pps_id;
4584     int num_ref_idx_active_override_flag;
4585     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4586     int slice_type;
4587     int default_ref_list_done = 0;
4588
4589     s->current_picture.reference= h->nal_ref_idc != 0;
4590     s->dropable= h->nal_ref_idc == 0;
4591
4592     first_mb_in_slice= get_ue_golomb(&s->gb);
4593
4594     slice_type= get_ue_golomb(&s->gb);
4595     if(slice_type > 9){
4596         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4597         return -1;
4598     }
4599     if(slice_type > 4){
4600         slice_type -= 5;
4601         h->slice_type_fixed=1;
4602     }else
4603         h->slice_type_fixed=0;
4604
4605     slice_type= slice_type_map[ slice_type ];
4606     if (slice_type == I_TYPE
4607         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4608         default_ref_list_done = 1;
4609     }
4610     h->slice_type= slice_type;
4611
4612     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4613
4614     pps_id= get_ue_golomb(&s->gb);
4615     if(pps_id>255){
4616         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4617         return -1;
4618     }
4619     h->pps= h->pps_buffer[pps_id];
4620     if(h->pps.slice_group_count == 0){
4621         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4622         return -1;
4623     }
4624
4625     h->sps= h->sps_buffer[ h->pps.sps_id ];
4626     if(h->sps.log2_max_frame_num == 0){
4627         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4628         return -1;
4629     }
4630
4631     if(h->dequant_coeff_pps != pps_id){
4632         h->dequant_coeff_pps = pps_id;
4633         init_dequant_tables(h);
4634     }
4635
4636     s->mb_width= h->sps.mb_width;
4637     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4638
4639     h->b_stride=  s->mb_width*4;
4640     h->b8_stride= s->mb_width*2;
4641
4642     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4643     if(h->sps.frame_mbs_only_flag)
4644         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4645     else
4646         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4647
4648     if (s->context_initialized
4649         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4650         free_tables(h);
4651         MPV_common_end(s);
4652     }
4653     if (!s->context_initialized) {
4654         if (MPV_common_init(s) < 0)
4655             return -1;
4656
4657         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4658             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4659             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4660         }else{
4661             int i;
4662             for(i=0; i<16; i++){
4663 #define T(x) (x>>2) | ((x<<2) & 0xF)
4664                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4665                 h-> field_scan[i] = T( field_scan[i]);
4666 #undef T
4667             }
4668         }
4669         if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4670             memcpy(h->zigzag_scan8x8,       zigzag_scan8x8,       64*sizeof(uint8_t));
4671             memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4672             memcpy(h->field_scan8x8,        field_scan8x8,        64*sizeof(uint8_t));
4673             memcpy(h->field_scan8x8_cavlc,  field_scan8x8_cavlc,  64*sizeof(uint8_t));
4674         }else{
4675             int i;
4676             for(i=0; i<64; i++){
4677 #define T(x) (x>>3) | ((x&7)<<3)
4678                 h->zigzag_scan8x8[i]       = T(zigzag_scan8x8[i]);
4679                 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4680                 h->field_scan8x8[i]        = T(field_scan8x8[i]);
4681                 h->field_scan8x8_cavlc[i]  = T(field_scan8x8_cavlc[i]);
4682 #undef T
4683             }
4684         }
4685         if(h->sps.transform_bypass){ //FIXME same ugly
4686             h->zigzag_scan_q0          = zigzag_scan;
4687             h->zigzag_scan8x8_q0       = zigzag_scan8x8;
4688             h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4689             h->field_scan_q0           = field_scan;
4690             h->field_scan8x8_q0        = field_scan8x8;
4691             h->field_scan8x8_cavlc_q0  = field_scan8x8_cavlc;
4692         }else{
4693             h->zigzag_scan_q0          = h->zigzag_scan;
4694             h->zigzag_scan8x8_q0       = h->zigzag_scan8x8;
4695             h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4696             h->field_scan_q0           = h->field_scan;
4697             h->field_scan8x8_q0        = h->field_scan8x8;
4698             h->field_scan8x8_cavlc_q0  = h->field_scan8x8_cavlc;
4699         }
4700
4701         alloc_tables(h);
4702
4703         s->avctx->width = s->width;
4704         s->avctx->height = s->height;
4705         s->avctx->sample_aspect_ratio= h->sps.sar;
4706         if(!s->avctx->sample_aspect_ratio.den)
4707             s->avctx->sample_aspect_ratio.den = 1;
4708
4709         if(h->sps.timing_info_present_flag){
4710             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4711             if(h->x264_build > 0 && h->x264_build < 44)
4712                 s->avctx->time_base.den *= 2;
4713             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4714                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4715         }
4716     }
4717
4718     if(h->slice_num == 0){
4719         if(frame_start(h) < 0)
4720             return -1;
4721     }
4722
4723     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4724     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4725
4726     h->mb_mbaff = 0;
4727     h->mb_aff_frame = 0;
4728     if(h->sps.frame_mbs_only_flag){
4729         s->picture_structure= PICT_FRAME;
4730     }else{
4731         if(get_bits1(&s->gb)) { //field_pic_flag
4732             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4733             av_log(h->s.avctx, AV_LOG_ERROR, "PAFF interlacing is not implemented\n");
4734         } else {
4735             s->picture_structure= PICT_FRAME;
4736             h->mb_aff_frame = h->sps.mb_aff;
4737         }
4738     }
4739
4740     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4741     s->resync_mb_y = s->mb_y = (first_mb_in_slice / s->mb_width) << h->mb_aff_frame;
4742     if(s->mb_y >= s->mb_height){
4743         return -1;
4744     }
4745
4746     if(s->picture_structure==PICT_FRAME){
4747         h->curr_pic_num=   h->frame_num;
4748         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4749     }else{
4750         h->curr_pic_num= 2*h->frame_num;
4751         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4752     }
4753
4754     if(h->nal_unit_type == NAL_IDR_SLICE){
4755         get_ue_golomb(&s->gb); /* idr_pic_id */
4756     }
4757
4758     if(h->sps.poc_type==0){
4759         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4760
4761         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4762             h->delta_poc_bottom= get_se_golomb(&s->gb);
4763         }
4764     }
4765
4766     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4767         h->delta_poc[0]= get_se_golomb(&s->gb);
4768
4769         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4770             h->delta_poc[1]= get_se_golomb(&s->gb);
4771     }
4772
4773     init_poc(h);
4774
4775     if(h->pps.redundant_pic_cnt_present){
4776         h->redundant_pic_count= get_ue_golomb(&s->gb);
4777     }
4778
4779     //set defaults, might be overriden a few line later
4780     h->ref_count[0]= h->pps.ref_count[0];
4781     h->ref_count[1]= h->pps.ref_count[1];
4782
4783     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4784         if(h->slice_type == B_TYPE){
4785             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4786             if(h->sps.mb_aff && h->direct_spatial_mv_pred)
4787                 av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + spatial direct mode is not implemented\n");
4788         }
4789         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4790
4791         if(num_ref_idx_active_override_flag){
4792             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4793             if(h->slice_type==B_TYPE)
4794                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4795
4796             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4797                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4798                 return -1;
4799             }
4800         }
4801     }
4802
4803     if(!default_ref_list_done){
4804         fill_default_ref_list(h);
4805     }
4806
4807     if(decode_ref_pic_list_reordering(h) < 0)
4808         return -1;
4809
4810     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4811        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4812         pred_weight_table(h);
4813     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4814         implicit_weight_table(h);
4815     else
4816         h->use_weight = 0;
4817
4818     if(s->current_picture.reference)
4819         decode_ref_pic_marking(h);
4820
4821     if(FRAME_MBAFF)
4822         fill_mbaff_ref_list(h);
4823
4824     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4825         h->cabac_init_idc = get_ue_golomb(&s->gb);
4826
4827     h->last_qscale_diff = 0;
4828     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4829     if(s->qscale<0 || s->qscale>51){
4830         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4831         return -1;
4832     }
4833     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4834     //FIXME qscale / qp ... stuff
4835     if(h->slice_type == SP_TYPE){
4836         get_bits1(&s->gb); /* sp_for_switch_flag */
4837     }
4838     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4839         get_se_golomb(&s->gb); /* slice_qs_delta */
4840     }
4841
4842     h->deblocking_filter = 1;
4843     h->slice_alpha_c0_offset = 0;
4844     h->slice_beta_offset = 0;
4845     if( h->pps.deblocking_filter_parameters_present ) {
4846         h->deblocking_filter= get_ue_golomb(&s->gb);
4847         if(h->deblocking_filter < 2)
4848             h->deblocking_filter^= 1; // 1<->0
4849
4850         if( h->deblocking_filter ) {
4851             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4852             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4853         }
4854     }
4855     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4856        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4857        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4858        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4859         h->deblocking_filter= 0;
4860
4861 #if 0 //FMO
4862     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4863         slice_group_change_cycle= get_bits(&s->gb, ?);
4864 #endif
4865
4866     h->slice_num++;
4867
4868     h->emu_edge_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16;
4869     h->emu_edge_height= FRAME_MBAFF ? 0 : h->emu_edge_width;
4870
4871     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4872         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4873                h->slice_num,
4874                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4875                first_mb_in_slice,
4876                av_get_pict_type_char(h->slice_type),
4877                pps_id, h->frame_num,
4878                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4879                h->ref_count[0], h->ref_count[1],
4880                s->qscale,
4881                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4882                h->use_weight,
4883                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4884                );
4885     }
4886
4887     return 0;
4888 }
4889
4890 /**
4891  *
4892  */
4893 static inline int get_level_prefix(GetBitContext *gb){
4894     unsigned int buf;
4895     int log;
4896
4897     OPEN_READER(re, gb);
4898     UPDATE_CACHE(re, gb);
4899     buf=GET_CACHE(re, gb);
4900
4901     log= 32 - av_log2(buf);
4902 #ifdef TRACE
4903     print_bin(buf>>(32-log), log);
4904     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4905 #endif
4906
4907     LAST_SKIP_BITS(re, gb, log);
4908     CLOSE_READER(re, gb);
4909
4910     return log-1;
4911 }
4912
4913 static inline int get_dct8x8_allowed(H264Context *h){
4914     int i;
4915     for(i=0; i<4; i++){
4916         if(!IS_SUB_8X8(h->sub_mb_type[i])
4917            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4918             return 0;
4919     }
4920     return 1;
4921 }
4922
4923 /**
4924  * decodes a residual block.
4925  * @param n block index
4926  * @param scantable scantable
4927  * @param max_coeff number of coefficients in the block
4928  * @return <0 if an error occured
4929  */
4930 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4931     MpegEncContext * const s = &h->s;
4932     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4933     int level[16];
4934     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4935
4936     //FIXME put trailing_onex into the context
4937
4938     if(n == CHROMA_DC_BLOCK_INDEX){
4939         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4940         total_coeff= coeff_token>>2;
4941     }else{
4942         if(n == LUMA_DC_BLOCK_INDEX){
4943             total_coeff= pred_non_zero_count(h, 0);
4944             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4945             total_coeff= coeff_token>>2;
4946         }else{
4947             total_coeff= pred_non_zero_count(h, n);
4948             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4949             total_coeff= coeff_token>>2;
4950             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4951         }
4952     }
4953
4954     //FIXME set last_non_zero?
4955
4956     if(total_coeff==0)
4957         return 0;
4958
4959     trailing_ones= coeff_token&3;
4960     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4961     assert(total_coeff<=16);
4962
4963     for(i=0; i<trailing_ones; i++){
4964         level[i]= 1 - 2*get_bits1(gb);
4965     }
4966
4967     if(i<total_coeff) {
4968         int level_code, mask;
4969         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4970         int prefix= get_level_prefix(gb);
4971
4972         //first coefficient has suffix_length equal to 0 or 1
4973         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4974             if(suffix_length)
4975                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4976             else
4977                 level_code= (prefix<<suffix_length); //part
4978         }else if(prefix==14){
4979             if(suffix_length)
4980                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4981             else
4982                 level_code= prefix + get_bits(gb, 4); //part
4983         }else if(prefix==15){
4984             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4985             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4986         }else{
4987             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4988             return -1;
4989         }
4990
4991         if(trailing_ones < 3) level_code += 2;
4992
4993         suffix_length = 1;
4994         if(level_code > 5)
4995             suffix_length++;
4996         mask= -(level_code&1);
4997         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4998         i++;
4999
5000         //remaining coefficients have suffix_length > 0
5001         for(;i<total_coeff;i++) {
5002             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
5003             prefix = get_level_prefix(gb);
5004             if(prefix<15){
5005                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
5006             }else if(prefix==15){
5007                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
5008             }else{
5009                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
5010                 return -1;
5011             }
5012             mask= -(level_code&1);
5013             level[i]= (((2+level_code)>>1) ^ mask) - mask;
5014             if(level_code > suffix_limit[suffix_length])
5015                 suffix_length++;
5016         }
5017     }
5018
5019     if(total_coeff == max_coeff)
5020         zeros_left=0;
5021     else{
5022         if(n == CHROMA_DC_BLOCK_INDEX)
5023             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
5024         else
5025             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
5026     }
5027
5028     coeff_num = zeros_left + total_coeff - 1;
5029     j = scantable[coeff_num];
5030     if(n > 24){
5031         block[j] = level[0];
5032         for(i=1;i<total_coeff;i++) {
5033             if(zeros_left <= 0)
5034                 run_before = 0;
5035             else if(zeros_left < 7){
5036                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5037             }else{
5038                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5039             }
5040             zeros_left -= run_before;
5041             coeff_num -= 1 + run_before;
5042             j= scantable[ coeff_num ];
5043
5044             block[j]= level[i];
5045         }
5046     }else{
5047         block[j] = (level[0] * qmul[j] + 32)>>6;
5048         for(i=1;i<total_coeff;i++) {
5049             if(zeros_left <= 0)
5050                 run_before = 0;
5051             else if(zeros_left < 7){
5052                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
5053             }else{
5054                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
5055             }
5056             zeros_left -= run_before;
5057             coeff_num -= 1 + run_before;
5058             j= scantable[ coeff_num ];
5059
5060             block[j]= (level[i] * qmul[j] + 32)>>6;
5061         }
5062     }
5063
5064     if(zeros_left<0){
5065         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
5066         return -1;
5067     }
5068
5069     return 0;
5070 }
5071
5072 static void predict_field_decoding_flag(H264Context *h){
5073     MpegEncContext * const s = &h->s;
5074     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5075     int mb_type = (h->slice_table[mb_xy-1] == h->slice_num)
5076                 ? s->current_picture.mb_type[mb_xy-1]
5077                 : (h->slice_table[mb_xy-s->mb_stride] == h->slice_num)
5078                 ? s->current_picture.mb_type[mb_xy-s->mb_stride]
5079                 : 0;
5080     h->mb_mbaff = h->mb_field_decoding_flag = IS_INTERLACED(mb_type) ? 1 : 0;
5081 }
5082
5083 /**
5084  * decodes a P_SKIP or B_SKIP macroblock
5085  */
5086 static void decode_mb_skip(H264Context *h){
5087     MpegEncContext * const s = &h->s;
5088     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5089     int mb_type=0;
5090
5091     memset(h->non_zero_count[mb_xy], 0, 16);
5092     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
5093
5094     if(MB_FIELD)
5095         mb_type|= MB_TYPE_INTERLACED;
5096
5097     if( h->slice_type == B_TYPE )
5098     {
5099         // just for fill_caches. pred_direct_motion will set the real mb_type
5100         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
5101
5102         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5103         pred_direct_motion(h, &mb_type);
5104         if(h->pps.cabac){
5105             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5106             fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5107         }
5108     }
5109     else
5110     {
5111         int mx, my;
5112         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
5113
5114         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
5115         pred_pskip_motion(h, &mx, &my);
5116         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
5117         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
5118         if(h->pps.cabac)
5119             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5120     }
5121
5122     write_back_motion(h, mb_type);
5123     s->current_picture.mb_type[mb_xy]= mb_type|MB_TYPE_SKIP;
5124     s->current_picture.qscale_table[mb_xy]= s->qscale;
5125     h->slice_table[ mb_xy ]= h->slice_num;
5126     h->prev_mb_skipped= 1;
5127 }
5128
5129 /**
5130  * decodes a macroblock
5131  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5132  */
5133 static int decode_mb_cavlc(H264Context *h){
5134     MpegEncContext * const s = &h->s;
5135     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5136     int mb_type, partition_count, cbp;
5137     int dct8x8_allowed= h->pps.transform_8x8_mode;
5138
5139     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
5140
5141     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5142     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
5143                 down the code */
5144     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
5145         if(s->mb_skip_run==-1)
5146             s->mb_skip_run= get_ue_golomb(&s->gb);
5147
5148         if (s->mb_skip_run--) {
5149             if(FRAME_MBAFF && (s->mb_y&1) == 0){
5150                 if(s->mb_skip_run==0)
5151                     h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5152                 else
5153                     predict_field_decoding_flag(h);
5154             }
5155             decode_mb_skip(h);
5156             return 0;
5157         }
5158     }
5159     if(FRAME_MBAFF){
5160         if( (s->mb_y&1) == 0 )
5161             h->mb_mbaff = h->mb_field_decoding_flag = get_bits1(&s->gb);
5162     }else
5163         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5164
5165     h->prev_mb_skipped= 0;
5166
5167     mb_type= get_ue_golomb(&s->gb);
5168     if(h->slice_type == B_TYPE){
5169         if(mb_type < 23){
5170             partition_count= b_mb_type_info[mb_type].partition_count;
5171             mb_type=         b_mb_type_info[mb_type].type;
5172         }else{
5173             mb_type -= 23;
5174             goto decode_intra_mb;
5175         }
5176     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
5177         if(mb_type < 5){
5178             partition_count= p_mb_type_info[mb_type].partition_count;
5179             mb_type=         p_mb_type_info[mb_type].type;
5180         }else{
5181             mb_type -= 5;
5182             goto decode_intra_mb;
5183         }
5184     }else{
5185        assert(h->slice_type == I_TYPE);
5186 decode_intra_mb:
5187         if(mb_type > 25){
5188             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
5189             return -1;
5190         }
5191         partition_count=0;
5192         cbp= i_mb_type_info[mb_type].cbp;
5193         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5194         mb_type= i_mb_type_info[mb_type].type;
5195     }
5196
5197     if(MB_FIELD)
5198         mb_type |= MB_TYPE_INTERLACED;
5199
5200     h->slice_table[ mb_xy ]= h->slice_num;
5201
5202     if(IS_INTRA_PCM(mb_type)){
5203         unsigned int x, y;
5204
5205         // we assume these blocks are very rare so we dont optimize it
5206         align_get_bits(&s->gb);
5207
5208         // The pixels are stored in the same order as levels in h->mb array.
5209         for(y=0; y<16; y++){
5210             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5211             for(x=0; x<16; x++){
5212                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5213                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
5214             }
5215         }
5216         for(y=0; y<8; y++){
5217             const int index= 256 + 4*(y&3) + 32*(y>>2);
5218             for(x=0; x<8; x++){
5219                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5220                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5221             }
5222         }
5223         for(y=0; y<8; y++){
5224             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5225             for(x=0; x<8; x++){
5226                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
5227                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
5228             }
5229         }
5230
5231         // In deblocking, the quantizer is 0
5232         s->current_picture.qscale_table[mb_xy]= 0;
5233         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5234         // All coeffs are present
5235         memset(h->non_zero_count[mb_xy], 16, 16);
5236
5237         s->current_picture.mb_type[mb_xy]= mb_type;
5238         return 0;
5239     }
5240
5241     if(MB_MBAFF){
5242         h->ref_count[0] <<= 1;
5243         h->ref_count[1] <<= 1;
5244     }
5245
5246     fill_caches(h, mb_type, 0);
5247
5248     //mb_pred
5249     if(IS_INTRA(mb_type)){
5250 //            init_top_left_availability(h);
5251             if(IS_INTRA4x4(mb_type)){
5252                 int i;
5253                 int di = 1;
5254                 if(dct8x8_allowed && get_bits1(&s->gb)){
5255                     mb_type |= MB_TYPE_8x8DCT;
5256                     di = 4;
5257                 }
5258
5259 //                fill_intra4x4_pred_table(h);
5260                 for(i=0; i<16; i+=di){
5261                     int mode= pred_intra_mode(h, i);
5262
5263                     if(!get_bits1(&s->gb)){
5264                         const int rem_mode= get_bits(&s->gb, 3);
5265                         mode = rem_mode + (rem_mode >= mode);
5266                     }
5267
5268                     if(di==4)
5269                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5270                     else
5271                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
5272                 }
5273                 write_back_intra_pred_mode(h);
5274                 if( check_intra4x4_pred_mode(h) < 0)
5275                     return -1;
5276             }else{
5277                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
5278                 if(h->intra16x16_pred_mode < 0)
5279                     return -1;
5280             }
5281             h->chroma_pred_mode= get_ue_golomb(&s->gb);
5282
5283             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
5284             if(h->chroma_pred_mode < 0)
5285                 return -1;
5286     }else if(partition_count==4){
5287         int i, j, sub_partition_count[4], list, ref[2][4];
5288
5289         if(h->slice_type == B_TYPE){
5290             for(i=0; i<4; i++){
5291                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5292                 if(h->sub_mb_type[i] >=13){
5293                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5294                     return -1;
5295                 }
5296                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5297                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5298             }
5299             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5300                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5301                 pred_direct_motion(h, &mb_type);
5302                 h->ref_cache[0][scan8[4]] =
5303                 h->ref_cache[1][scan8[4]] =
5304                 h->ref_cache[0][scan8[12]] =
5305                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
5306             }
5307         }else{
5308             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
5309             for(i=0; i<4; i++){
5310                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
5311                 if(h->sub_mb_type[i] >=4){
5312                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
5313                     return -1;
5314                 }
5315                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5316                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5317             }
5318         }
5319
5320         for(list=0; list<2; list++){
5321             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5322             if(ref_count == 0) continue;
5323             for(i=0; i<4; i++){
5324                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
5325                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5326                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
5327                 }else{
5328                  //FIXME
5329                     ref[list][i] = -1;
5330                 }
5331             }
5332         }
5333
5334         if(dct8x8_allowed)
5335             dct8x8_allowed = get_dct8x8_allowed(h);
5336
5337         for(list=0; list<2; list++){
5338             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
5339             if(ref_count == 0) continue;
5340
5341             for(i=0; i<4; i++){
5342                 if(IS_DIRECT(h->sub_mb_type[i])) {
5343                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
5344                     continue;
5345                 }
5346                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
5347                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5348
5349                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
5350                     const int sub_mb_type= h->sub_mb_type[i];
5351                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5352                     for(j=0; j<sub_partition_count[i]; j++){
5353                         int mx, my;
5354                         const int index= 4*i + block_width*j;
5355                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5356                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5357                         mx += get_se_golomb(&s->gb);
5358                         my += get_se_golomb(&s->gb);
5359                         tprintf("final mv:%d %d\n", mx, my);
5360
5361                         if(IS_SUB_8X8(sub_mb_type)){
5362                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5363                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5364                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5365                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5366                         }else if(IS_SUB_8X4(sub_mb_type)){
5367                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5368                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5369                         }else if(IS_SUB_4X8(sub_mb_type)){
5370                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5371                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5372                         }else{
5373                             assert(IS_SUB_4X4(sub_mb_type));
5374                             mv_cache[ 0 ][0]= mx;
5375                             mv_cache[ 0 ][1]= my;
5376                         }
5377                     }
5378                 }else{
5379                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5380                     p[0] = p[1]=
5381                     p[8] = p[9]= 0;
5382                 }
5383             }
5384         }
5385     }else if(IS_DIRECT(mb_type)){
5386         pred_direct_motion(h, &mb_type);
5387         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5388     }else{
5389         int list, mx, my, i;
5390          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5391         if(IS_16X16(mb_type)){
5392             for(list=0; list<2; list++){
5393                 if(h->ref_count[list]>0){
5394                     if(IS_DIR(mb_type, 0, list)){
5395                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5396                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5397                     }else
5398                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5399                 }
5400             }
5401             for(list=0; list<2; list++){
5402                 if(IS_DIR(mb_type, 0, list)){
5403                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5404                     mx += get_se_golomb(&s->gb);
5405                     my += get_se_golomb(&s->gb);
5406                     tprintf("final mv:%d %d\n", mx, my);
5407
5408                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5409                 }else
5410                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5411             }
5412         }
5413         else if(IS_16X8(mb_type)){
5414             for(list=0; list<2; list++){
5415                 if(h->ref_count[list]>0){
5416                     for(i=0; i<2; i++){
5417                         if(IS_DIR(mb_type, i, list)){
5418                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5419                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5420                         }else
5421                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5422                     }
5423                 }
5424             }
5425             for(list=0; list<2; list++){
5426                 for(i=0; i<2; i++){
5427                     if(IS_DIR(mb_type, i, list)){
5428                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5429                         mx += get_se_golomb(&s->gb);
5430                         my += get_se_golomb(&s->gb);
5431                         tprintf("final mv:%d %d\n", mx, my);
5432
5433                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5434                     }else
5435                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5436                 }
5437             }
5438         }else{
5439             assert(IS_8X16(mb_type));
5440             for(list=0; list<2; list++){
5441                 if(h->ref_count[list]>0){
5442                     for(i=0; i<2; i++){
5443                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5444                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5445                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5446                         }else
5447                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5448                     }
5449                 }
5450             }
5451             for(list=0; list<2; list++){
5452                 for(i=0; i<2; i++){
5453                     if(IS_DIR(mb_type, i, list)){
5454                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5455                         mx += get_se_golomb(&s->gb);
5456                         my += get_se_golomb(&s->gb);
5457                         tprintf("final mv:%d %d\n", mx, my);
5458
5459                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5460                     }else
5461                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5462                 }
5463             }
5464         }
5465     }
5466
5467     if(IS_INTER(mb_type))
5468         write_back_motion(h, mb_type);
5469
5470     if(!IS_INTRA16x16(mb_type)){
5471         cbp= get_ue_golomb(&s->gb);
5472         if(cbp > 47){
5473             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
5474             return -1;
5475         }
5476
5477         if(IS_INTRA4x4(mb_type))
5478             cbp= golomb_to_intra4x4_cbp[cbp];
5479         else
5480             cbp= golomb_to_inter_cbp[cbp];
5481     }
5482
5483     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5484         if(get_bits1(&s->gb))
5485             mb_type |= MB_TYPE_8x8DCT;
5486     }
5487     s->current_picture.mb_type[mb_xy]= mb_type;
5488
5489     if(cbp || IS_INTRA16x16(mb_type)){
5490         int i8x8, i4x4, chroma_idx;
5491         int chroma_qp, dquant;
5492         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5493         const uint8_t *scan, *scan8x8, *dc_scan;
5494
5495 //        fill_non_zero_count_cache(h);
5496
5497         if(IS_INTERLACED(mb_type)){
5498             scan8x8= s->qscale ? h->field_scan8x8_cavlc : h->field_scan8x8_cavlc_q0;
5499             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5500             dc_scan= luma_dc_field_scan;
5501         }else{
5502             scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5503             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5504             dc_scan= luma_dc_zigzag_scan;
5505         }
5506
5507         dquant= get_se_golomb(&s->gb);
5508
5509         if( dquant > 25 || dquant < -26 ){
5510             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5511             return -1;
5512         }
5513
5514         s->qscale += dquant;
5515         if(((unsigned)s->qscale) > 51){
5516             if(s->qscale<0) s->qscale+= 52;
5517             else            s->qscale-= 52;
5518         }
5519
5520         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5521         if(IS_INTRA16x16(mb_type)){
5522             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5523                 return -1; //FIXME continue if partitioned and other return -1 too
5524             }
5525
5526             assert((cbp&15) == 0 || (cbp&15) == 15);
5527
5528             if(cbp&15){
5529                 for(i8x8=0; i8x8<4; i8x8++){
5530                     for(i4x4=0; i4x4<4; i4x4++){
5531                         const int index= i4x4 + 4*i8x8;
5532                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5533                             return -1;
5534                         }
5535                     }
5536                 }
5537             }else{
5538                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5539             }
5540         }else{
5541             for(i8x8=0; i8x8<4; i8x8++){
5542                 if(cbp & (1<<i8x8)){
5543                     if(IS_8x8DCT(mb_type)){
5544                         DCTELEM *buf = &h->mb[64*i8x8];
5545                         uint8_t *nnz;
5546                         for(i4x4=0; i4x4<4; i4x4++){
5547                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5548                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5549                                 return -1;
5550                         }
5551                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5552                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5553                     }else{
5554                         for(i4x4=0; i4x4<4; i4x4++){
5555                             const int index= i4x4 + 4*i8x8;
5556
5557                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5558                                 return -1;
5559                             }
5560                         }
5561                     }
5562                 }else{
5563                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5564                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5565                 }
5566             }
5567         }
5568
5569         if(cbp&0x30){
5570             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5571                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5572                     return -1;
5573                 }
5574         }
5575
5576         if(cbp&0x20){
5577             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5578                 for(i4x4=0; i4x4<4; i4x4++){
5579                     const int index= 16 + 4*chroma_idx + i4x4;
5580                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5581                         return -1;
5582                     }
5583                 }
5584             }
5585         }else{
5586             uint8_t * const nnz= &h->non_zero_count_cache[0];
5587             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5588             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5589         }
5590     }else{
5591         uint8_t * const nnz= &h->non_zero_count_cache[0];
5592         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5593         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5594         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5595     }
5596     s->current_picture.qscale_table[mb_xy]= s->qscale;
5597     write_back_non_zero_count(h);
5598
5599     if(MB_MBAFF){
5600         h->ref_count[0] >>= 1;
5601         h->ref_count[1] >>= 1;
5602     }
5603
5604     return 0;
5605 }
5606
5607 static int decode_cabac_field_decoding_flag(H264Context *h) {
5608     MpegEncContext * const s = &h->s;
5609     const int mb_x = s->mb_x;
5610     const int mb_y = s->mb_y & ~1;
5611     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5612     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5613
5614     unsigned int ctx = 0;
5615
5616     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5617         ctx += 1;
5618     }
5619     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5620         ctx += 1;
5621     }
5622
5623     return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] );
5624 }
5625
5626 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5627     uint8_t *state= &h->cabac_state[ctx_base];
5628     int mb_type;
5629
5630     if(intra_slice){
5631         MpegEncContext * const s = &h->s;
5632         const int mba_xy = h->left_mb_xy[0];
5633         const int mbb_xy = h->top_mb_xy;
5634         int ctx=0;
5635         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5636             ctx++;
5637         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5638             ctx++;
5639         if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
5640             return 0;   /* I4x4 */
5641         state += 2;
5642     }else{
5643         if( get_cabac( &h->cabac, &state[0] ) == 0 )
5644             return 0;   /* I4x4 */
5645     }
5646
5647     if( get_cabac_terminate( &h->cabac ) )
5648         return 25;  /* PCM */
5649
5650     mb_type = 1; /* I16x16 */
5651     mb_type += 12 * get_cabac( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5652     if( get_cabac( &h->cabac, &state[2] ) ) /* cbp_chroma */
5653         mb_type += 4 + 4 * get_cabac( &h->cabac, &state[2+intra_slice] );
5654     mb_type += 2 * get_cabac( &h->cabac, &state[3+intra_slice] );
5655     mb_type += 1 * get_cabac( &h->cabac, &state[3+2*intra_slice] );
5656     return mb_type;
5657 }
5658
5659 static int decode_cabac_mb_type( H264Context *h ) {
5660     MpegEncContext * const s = &h->s;
5661
5662     if( h->slice_type == I_TYPE ) {
5663         return decode_cabac_intra_mb_type(h, 3, 1);
5664     } else if( h->slice_type == P_TYPE ) {
5665         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5666             /* P-type */
5667             if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5668                 /* P_L0_D16x16, P_8x8 */
5669                 return 3 * get_cabac( &h->cabac, &h->cabac_state[16] );
5670             } else {
5671                 /* P_L0_D8x16, P_L0_D16x8 */
5672                 return 2 - get_cabac( &h->cabac, &h->cabac_state[17] );
5673             }
5674         } else {
5675             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5676         }
5677     } else if( h->slice_type == B_TYPE ) {
5678         const int mba_xy = h->left_mb_xy[0];
5679         const int mbb_xy = h->top_mb_xy;
5680         int ctx = 0;
5681         int bits;
5682
5683         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5684             ctx++;
5685         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5686             ctx++;
5687
5688         if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
5689             return 0; /* B_Direct_16x16 */
5690
5691         if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
5692             return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5693         }
5694
5695         bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
5696         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
5697         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
5698         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
5699         if( bits < 8 )
5700             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5701         else if( bits == 13 ) {
5702             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5703         } else if( bits == 14 )
5704             return 11; /* B_L1_L0_8x16 */
5705         else if( bits == 15 )
5706             return 22; /* B_8x8 */
5707
5708         bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
5709         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5710     } else {
5711         /* TODO SI/SP frames? */
5712         return -1;
5713     }
5714 }
5715
5716 static int decode_cabac_mb_skip( H264Context *h, int mb_x, int mb_y ) {
5717     MpegEncContext * const s = &h->s;
5718     int mba_xy, mbb_xy;
5719     int ctx = 0;
5720
5721     if(FRAME_MBAFF){ //FIXME merge with the stuff in fill_caches?
5722         int mb_xy = mb_x + (mb_y&~1)*s->mb_stride;
5723         mba_xy = mb_xy - 1;
5724         if( (mb_y&1)
5725             && h->slice_table[mba_xy] == h->slice_num
5726             && MB_FIELD == !!IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) )
5727             mba_xy += s->mb_stride;
5728         if( MB_FIELD ){
5729             mbb_xy = mb_xy - s->mb_stride;
5730             if( !(mb_y&1)
5731                 && h->slice_table[mbb_xy] == h->slice_num
5732                 && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) )
5733                 mbb_xy -= s->mb_stride;
5734         }else
5735             mbb_xy = mb_x + (mb_y-1)*s->mb_stride;
5736     }else{
5737         int mb_xy = mb_x + mb_y*s->mb_stride;
5738         mba_xy = mb_xy - 1;
5739         mbb_xy = mb_xy - s->mb_stride;
5740     }
5741
5742     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5743         ctx++;
5744     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5745         ctx++;
5746
5747     if( h->slice_type == B_TYPE )
5748         ctx += 13;
5749     return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
5750 }
5751
5752 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5753     int mode = 0;
5754
5755     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5756         return pred_mode;
5757
5758     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5759     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5760     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5761
5762     if( mode >= pred_mode )
5763         return mode + 1;
5764     else
5765         return mode;
5766 }
5767
5768 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5769     const int mba_xy = h->left_mb_xy[0];
5770     const int mbb_xy = h->top_mb_xy;
5771
5772     int ctx = 0;
5773
5774     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5775     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5776         ctx++;
5777
5778     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5779         ctx++;
5780
5781     if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5782         return 0;
5783
5784     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5785         return 1;
5786     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5787         return 2;
5788     else
5789         return 3;
5790 }
5791
/* Column (x) of each of the 16 blocks, indexed by block number.
 * Blocks are numbered four at a time within each 2x2 group. */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/* Row (y) of each of the 16 blocks, indexed by block number. */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/* Inverse mapping: block number for a given [x][y] position. */
static const uint8_t block_idx_xy[4][4] = {
    { 0, 2,  8, 10 },
    { 1, 3,  9, 11 },
    { 4, 6, 12, 14 },
    { 5, 7, 13, 15 }
};
5804
5805 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5806     int cbp = 0;
5807     int cbp_b = -1;
5808     int i8x8;
5809
5810     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5811         cbp_b = h->top_cbp;
5812         tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5813     }
5814
5815     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5816         int cbp_a = -1;
5817         int x, y;
5818         int ctx = 0;
5819
5820         x = block_idx_x[4*i8x8];
5821         y = block_idx_y[4*i8x8];
5822
5823         if( x > 0 )
5824             cbp_a = cbp;
5825         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5826             cbp_a = h->left_cbp;
5827             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5828         }
5829
5830         if( y > 0 )
5831             cbp_b = cbp;
5832
5833         /* No need to test for skip as we put 0 for skip block */
5834         /* No need to test for IPCM as we put 1 for IPCM block */
5835         if( cbp_a >= 0 ) {
5836             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5837             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5838                 ctx++;
5839         }
5840
5841         if( cbp_b >= 0 ) {
5842             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5843             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5844                 ctx += 2;
5845         }
5846
5847         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5848             cbp |= 1 << i8x8;
5849         }
5850     }
5851     return cbp;
5852 }
5853 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5854     int ctx;
5855     int cbp_a, cbp_b;
5856
5857     cbp_a = (h->left_cbp>>4)&0x03;
5858     cbp_b = (h-> top_cbp>>4)&0x03;
5859
5860     ctx = 0;
5861     if( cbp_a > 0 ) ctx++;
5862     if( cbp_b > 0 ) ctx += 2;
5863     if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5864         return 0;
5865
5866     ctx = 4;
5867     if( cbp_a == 2 ) ctx++;
5868     if( cbp_b == 2 ) ctx += 2;
5869     return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
5870 }
5871 static int decode_cabac_mb_dqp( H264Context *h) {
5872     MpegEncContext * const s = &h->s;
5873     int mbn_xy;
5874     int   ctx = 0;
5875     int   val = 0;
5876
5877     if( s->mb_x > 0 )
5878         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5879     else
5880         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5881
5882     if( h->last_qscale_diff != 0 )
5883         ctx++;
5884
5885     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5886         if( ctx < 2 )
5887             ctx = 2;
5888         else
5889             ctx = 3;
5890         val++;
5891         if(val > 102) //prevent infinite loop
5892             return INT_MIN;
5893     }
5894
5895     if( val&0x01 )
5896         return (val + 1)/2;
5897     else
5898         return -(val + 1)/2;
5899 }
5900 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5901     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5902         return 0;   /* 8x8 */
5903     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5904         return 1;   /* 8x4 */
5905     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5906         return 2;   /* 4x8 */
5907     return 3;       /* 4x4 */
5908 }
5909 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5910     int type;
5911     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5912         return 0;   /* B_Direct_8x8 */
5913     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5914         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5915     type = 3;
5916     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5917         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5918             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5919         type += 4;
5920     }
5921     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5922     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5923     return type;
5924 }
5925
5926 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5927     return get_cabac( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5928 }
5929
5930 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5931     int refa = h->ref_cache[list][scan8[n] - 1];
5932     int refb = h->ref_cache[list][scan8[n] - 8];
5933     int ref  = 0;
5934     int ctx  = 0;
5935
5936     if( h->slice_type == B_TYPE) {
5937         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5938             ctx++;
5939         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5940             ctx += 2;
5941     } else {
5942         if( refa > 0 )
5943             ctx++;
5944         if( refb > 0 )
5945             ctx += 2;
5946     }
5947
5948     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5949         ref++;
5950         if( ctx < 4 )
5951             ctx = 4;
5952         else
5953             ctx = 5;
5954     }
5955     return ref;
5956 }
5957
5958 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5959     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5960                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5961     int ctxbase = (l == 0) ? 40 : 47;
5962     int ctx, mvd;
5963
5964     if( amvd < 3 )
5965         ctx = 0;
5966     else if( amvd > 32 )
5967         ctx = 2;
5968     else
5969         ctx = 1;
5970
5971     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5972         return 0;
5973
5974     mvd= 1;
5975     ctx= 3;
5976     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5977         mvd++;
5978         if( ctx < 6 )
5979             ctx++;
5980     }
5981
5982     if( mvd >= 9 ) {
5983         int k = 3;
5984         while( get_cabac_bypass( &h->cabac ) ) {
5985             mvd += 1 << k;
5986             k++;
5987         }
5988         while( k-- ) {
5989             if( get_cabac_bypass( &h->cabac ) )
5990                 mvd += 1 << k;
5991         }
5992     }
5993     if( get_cabac_bypass( &h->cabac ) )  return -mvd;
5994     else                                 return  mvd;
5995 }
5996
5997 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5998     int nza, nzb;
5999     int ctx = 0;
6000
6001     if( cat == 0 ) {
6002         nza = h->left_cbp&0x100;
6003         nzb = h-> top_cbp&0x100;
6004     } else if( cat == 1 || cat == 2 ) {
6005         nza = h->non_zero_count_cache[scan8[idx] - 1];
6006         nzb = h->non_zero_count_cache[scan8[idx] - 8];
6007     } else if( cat == 3 ) {
6008         nza = (h->left_cbp>>(6+idx))&0x01;
6009         nzb = (h-> top_cbp>>(6+idx))&0x01;
6010     } else {
6011         assert(cat == 4);
6012         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
6013         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
6014     }
6015
6016     if( nza > 0 )
6017         ctx++;
6018
6019     if( nzb > 0 )
6020         ctx += 2;
6021
6022     return ctx + 4 * cat;
6023 }
6024
6025 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
6026     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
6027     static const int significant_coeff_flag_offset[2][6] = {
6028       { 105+0, 105+15, 105+29, 105+44, 105+47, 402 },
6029       { 277+0, 277+15, 277+29, 277+44, 277+47, 436 }
6030     };
6031     static const int last_coeff_flag_offset[2][6] = {
6032       { 166+0, 166+15, 166+29, 166+44, 166+47, 417 },
6033       { 338+0, 338+15, 338+29, 338+44, 338+47, 451 }
6034     };
6035     static const int coeff_abs_level_m1_offset[6] = {
6036         227+0, 227+10, 227+20, 227+30, 227+39, 426
6037     };
6038     static const int significant_coeff_flag_offset_8x8[2][63] = {
6039       { 0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
6040         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
6041         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
6042        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12 },
6043       { 0, 1, 1, 2, 2, 3, 3, 4, 5, 6, 7, 7, 7, 8, 4, 5,
6044         6, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,11,12,11,
6045         9, 9,10,10, 8,11,12,11, 9, 9,10,10, 8,13,13, 9,
6046         9,10,10, 8,13,13, 9, 9,10,10,14,14,14,14,14 }
6047     };
6048     static const int last_coeff_flag_offset_8x8[63] = {
6049         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
6050         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
6051         3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
6052         5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
6053     };
6054
6055     int index[64];
6056
6057     int i, last;
6058     int coeff_count = 0;
6059
6060     int abslevel1 = 1;
6061     int abslevelgt1 = 0;
6062
6063     uint8_t *significant_coeff_ctx_base;
6064     uint8_t *last_coeff_ctx_base;
6065     uint8_t *abs_level_m1_ctx_base;
6066
6067     /* cat: 0-> DC 16x16  n = 0
6068      *      1-> AC 16x16  n = luma4x4idx
6069      *      2-> Luma4x4   n = luma4x4idx
6070      *      3-> DC Chroma n = iCbCr
6071      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
6072      *      5-> Luma8x8   n = 4 * luma8x8idx
6073      */
6074
6075     /* read coded block flag */
6076     if( cat != 5 ) {
6077         if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
6078             if( cat == 1 || cat == 2 )
6079                 h->non_zero_count_cache[scan8[n]] = 0;
6080             else if( cat == 4 )
6081                 h->non_zero_count_cache[scan8[16+n]] = 0;
6082
6083             return 0;
6084         }
6085     }
6086
6087     significant_coeff_ctx_base = h->cabac_state
6088         + significant_coeff_flag_offset[MB_FIELD][cat];
6089     last_coeff_ctx_base = h->cabac_state
6090         + last_coeff_flag_offset[MB_FIELD][cat];
6091     abs_level_m1_ctx_base = h->cabac_state
6092         + coeff_abs_level_m1_offset[cat];
6093
6094     if( cat == 5 ) {
6095 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
6096         for(last= 0; last < coefs; last++) { \
6097             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
6098             if( get_cabac( &h->cabac, sig_ctx )) { \
6099                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
6100                 index[coeff_count++] = last; \
6101                 if( get_cabac( &h->cabac, last_ctx ) ) { \
6102                     last= max_coeff; \
6103                     break; \
6104                 } \
6105             } \
6106         }
6107         const int *sig_off = significant_coeff_flag_offset_8x8[MB_FIELD];
6108         DECODE_SIGNIFICANCE( 63, sig_off[last], last_coeff_flag_offset_8x8[last] );
6109     } else {
6110         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
6111     }
6112     if( last == max_coeff -1 ) {
6113         index[coeff_count++] = last;
6114     }
6115     assert(coeff_count > 0);
6116
6117     if( cat == 0 )
6118         h->cbp_table[mb_xy] |= 0x100;
6119     else if( cat == 1 || cat == 2 )
6120         h->non_zero_count_cache[scan8[n]] = coeff_count;
6121     else if( cat == 3 )
6122         h->cbp_table[mb_xy] |= 0x40 << n;
6123     else if( cat == 4 )
6124         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
6125     else {
6126         assert( cat == 5 );
6127         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
6128     }
6129
6130     for( i = coeff_count - 1; i >= 0; i-- ) {
6131         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
6132         int j= scantable[index[i]];
6133
6134         if( get_cabac( &h->cabac, ctx ) == 0 ) {
6135             if( !qmul ) {
6136                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
6137                 else                                block[j] =  1;
6138             }else{
6139                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-qmul[j] + 32) >> 6;
6140                 else                                block[j] = ( qmul[j] + 32) >> 6;
6141             }
6142
6143             abslevel1++;
6144         } else {
6145             int coeff_abs = 2;
6146             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
6147             while( coeff_abs < 15 && get_cabac( &h->cabac, ctx ) ) {
6148                 coeff_abs++;
6149             }
6150
6151             if( coeff_abs >= 15 ) {
6152                 int j = 0;
6153                 while( get_cabac_bypass( &h->cabac ) ) {
6154                     coeff_abs += 1 << j;
6155                     j++;
6156                 }
6157
6158                 while( j-- ) {
6159                     if( get_cabac_bypass( &h->cabac ) )
6160                         coeff_abs += 1 << j ;
6161                 }
6162             }
6163
6164             if( !qmul ) {
6165                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
6166                 else                                block[j] =  coeff_abs;
6167             }else{
6168                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
6169                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
6170             }
6171
6172             abslevelgt1++;
6173         }
6174     }
6175     return 0;
6176 }
6177
6178 static void inline compute_mb_neighbors(H264Context *h)
6179 {
6180     MpegEncContext * const s = &h->s;
6181     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
6182     h->top_mb_xy     = mb_xy - s->mb_stride;
6183     h->left_mb_xy[0] = mb_xy - 1;
6184     if(FRAME_MBAFF){
6185         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
6186         const int top_pair_xy      = pair_xy     - s->mb_stride;
6187         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
6188         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
6189         const int curr_mb_frame_flag = !MB_FIELD;
6190         const int bottom = (s->mb_y & 1);
6191         if (bottom
6192                 ? !curr_mb_frame_flag // bottom macroblock
6193                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
6194                 ) {
6195             h->top_mb_xy -= s->mb_stride;
6196         }
6197         if (left_mb_frame_flag != curr_mb_frame_flag) {
6198             h->left_mb_xy[0] = pair_xy - 1;
6199         }
6200     }
6201     return;
6202 }
6203
6204 /**
6205  * decodes a macroblock
6206  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
6207  */
6208 static int decode_mb_cabac(H264Context *h) {
6209     MpegEncContext * const s = &h->s;
6210     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
6211     int mb_type, partition_count, cbp = 0;
6212     int dct8x8_allowed= h->pps.transform_8x8_mode;
6213
6214     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
6215
6216     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
6217     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
6218         int skip;
6219         /* a skipped mb needs the aff flag from the following mb */
6220         if( FRAME_MBAFF && s->mb_x==0 && (s->mb_y&1)==0 )
6221             predict_field_decoding_flag(h);
6222         if( FRAME_MBAFF && (s->mb_y&1)==1 && h->prev_mb_skipped )
6223             skip = h->next_mb_skipped;
6224         else
6225             skip = decode_cabac_mb_skip( h, s->mb_x, s->mb_y );
6226         /* read skip flags */
6227         if( skip ) {
6228             if( FRAME_MBAFF && (s->mb_y&1)==0 ){
6229                 s->current_picture.mb_type[mb_xy] = MB_TYPE_SKIP;
6230                 h->next_mb_skipped = decode_cabac_mb_skip( h, s->mb_x, s->mb_y+1 );
6231                 if(h->next_mb_skipped)
6232                     predict_field_decoding_flag(h);
6233                 else
6234                     h->mb_mbaff = h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6235             }
6236
6237             decode_mb_skip(h);
6238
6239             h->cbp_table[mb_xy] = 0;
6240             h->chroma_pred_mode_table[mb_xy] = 0;
6241             h->last_qscale_diff = 0;
6242
6243             return 0;
6244
6245         }
6246     }
6247     if(FRAME_MBAFF){
6248         if( (s->mb_y&1) == 0 )
6249             h->mb_mbaff =
6250             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
6251     }else
6252         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
6253
6254     h->prev_mb_skipped = 0;
6255
6256     compute_mb_neighbors(h);
6257     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
6258         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
6259         return -1;
6260     }
6261
6262     if( h->slice_type == B_TYPE ) {
6263         if( mb_type < 23 ){
6264             partition_count= b_mb_type_info[mb_type].partition_count;
6265             mb_type=         b_mb_type_info[mb_type].type;
6266         }else{
6267             mb_type -= 23;
6268             goto decode_intra_mb;
6269         }
6270     } else if( h->slice_type == P_TYPE ) {
6271         if( mb_type < 5) {
6272             partition_count= p_mb_type_info[mb_type].partition_count;
6273             mb_type=         p_mb_type_info[mb_type].type;
6274         } else {
6275             mb_type -= 5;
6276             goto decode_intra_mb;
6277         }
6278     } else {
6279        assert(h->slice_type == I_TYPE);
6280 decode_intra_mb:
6281         partition_count = 0;
6282         cbp= i_mb_type_info[mb_type].cbp;
6283         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
6284         mb_type= i_mb_type_info[mb_type].type;
6285     }
6286     if(MB_FIELD)
6287         mb_type |= MB_TYPE_INTERLACED;
6288
6289     h->slice_table[ mb_xy ]= h->slice_num;
6290
6291     if(IS_INTRA_PCM(mb_type)) {
6292         const uint8_t *ptr;
6293         unsigned int x, y;
6294
6295         // We assume these blocks are very rare so we dont optimize it.
6296         // FIXME The two following lines get the bitstream position in the cabac
6297         // decode, I think it should be done by a function in cabac.h (or cabac.c).
6298         ptr= h->cabac.bytestream;
6299         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
6300
6301         // The pixels are stored in the same order as levels in h->mb array.
6302         for(y=0; y<16; y++){
6303             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
6304             for(x=0; x<16; x++){
6305                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
6306                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
6307             }
6308         }
6309         for(y=0; y<8; y++){
6310             const int index= 256 + 4*(y&3) + 32*(y>>2);
6311             for(x=0; x<8; x++){
6312                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
6313                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6314             }
6315         }
6316         for(y=0; y<8; y++){
6317             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
6318             for(x=0; x<8; x++){
6319                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
6320                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
6321             }
6322         }
6323
6324         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
6325
6326         // All blocks are present
6327         h->cbp_table[mb_xy] = 0x1ef;
6328         h->chroma_pred_mode_table[mb_xy] = 0;
6329         // In deblocking, the quantizer is 0
6330         s->current_picture.qscale_table[mb_xy]= 0;
6331         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
6332         // All coeffs are present
6333         memset(h->non_zero_count[mb_xy], 16, 16);
6334         s->current_picture.mb_type[mb_xy]= mb_type;
6335         return 0;
6336     }
6337
6338     if(MB_MBAFF){
6339         h->ref_count[0] <<= 1;
6340         h->ref_count[1] <<= 1;
6341     }
6342
6343     fill_caches(h, mb_type, 0);
6344
6345     if( IS_INTRA( mb_type ) ) {
6346         int i;
6347         if( IS_INTRA4x4( mb_type ) ) {
6348             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
6349                 mb_type |= MB_TYPE_8x8DCT;
6350                 for( i = 0; i < 16; i+=4 ) {
6351                     int pred = pred_intra_mode( h, i );
6352                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6353                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
6354                 }
6355             } else {
6356                 for( i = 0; i < 16; i++ ) {
6357                     int pred = pred_intra_mode( h, i );
6358                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
6359
6360                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
6361                 }
6362             }
6363             write_back_intra_pred_mode(h);
6364             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
6365         } else {
6366             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
6367             if( h->intra16x16_pred_mode < 0 ) return -1;
6368         }
6369         h->chroma_pred_mode_table[mb_xy] =
6370             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
6371
6372         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
6373         if( h->chroma_pred_mode < 0 ) return -1;
6374     } else if( partition_count == 4 ) {
6375         int i, j, sub_partition_count[4], list, ref[2][4];
6376
6377         if( h->slice_type == B_TYPE ) {
6378             for( i = 0; i < 4; i++ ) {
6379                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
6380                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6381                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6382             }
6383             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
6384                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
6385                 pred_direct_motion(h, &mb_type);
6386                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
6387                     for( i = 0; i < 4; i++ )
6388                         if( IS_DIRECT(h->sub_mb_type[i]) )
6389                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
6390                 }
6391             }
6392         } else {
6393             for( i = 0; i < 4; i++ ) {
6394                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
6395                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
6396                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
6397             }
6398         }
6399
6400         for( list = 0; list < 2; list++ ) {
6401             if( h->ref_count[list] > 0 ) {
6402                 for( i = 0; i < 4; i++ ) {
6403                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
6404                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
6405                         if( h->ref_count[list] > 1 )
6406                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
6407                         else
6408                             ref[list][i] = 0;
6409                     } else {
6410                         ref[list][i] = -1;
6411                     }
6412                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6413                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6414                 }
6415             }
6416         }
6417
6418         if(dct8x8_allowed)
6419             dct8x8_allowed = get_dct8x8_allowed(h);
6420
6421         for(list=0; list<2; list++){
6422             for(i=0; i<4; i++){
6423                 if(IS_DIRECT(h->sub_mb_type[i])){
6424                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6425                     continue;
6426                 }
6427                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6428
6429                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6430                     const int sub_mb_type= h->sub_mb_type[i];
6431                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6432                     for(j=0; j<sub_partition_count[i]; j++){
6433                         int mpx, mpy;
6434                         int mx, my;
6435                         const int index= 4*i + block_width*j;
6436                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6437                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6438                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6439
6440                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6441                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6442                         tprintf("final mv:%d %d\n", mx, my);
6443
6444                         if(IS_SUB_8X8(sub_mb_type)){
6445                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
6446                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6447                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
6448                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6449
6450                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
6451                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6452                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
6453                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6454                         }else if(IS_SUB_8X4(sub_mb_type)){
6455                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
6456                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
6457
6458                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
6459                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
6460                         }else if(IS_SUB_4X8(sub_mb_type)){
6461                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
6462                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
6463
6464                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
6465                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
6466                         }else{
6467                             assert(IS_SUB_4X4(sub_mb_type));
6468                             mv_cache[ 0 ][0]= mx;
6469                             mv_cache[ 0 ][1]= my;
6470
6471                             mvd_cache[ 0 ][0]= mx - mpx;
6472                             mvd_cache[ 0 ][1]= my - mpy;
6473                         }
6474                     }
6475                 }else{
6476                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6477                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6478                     p[0] = p[1] = p[8] = p[9] = 0;
6479                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6480                 }
6481             }
6482         }
6483     } else if( IS_DIRECT(mb_type) ) {
6484         pred_direct_motion(h, &mb_type);
6485         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6486         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6487         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6488     } else {
6489         int list, mx, my, i, mpx, mpy;
6490         if(IS_16X16(mb_type)){
6491             for(list=0; list<2; list++){
6492                 if(IS_DIR(mb_type, 0, list)){
6493                     if(h->ref_count[list] > 0 ){
6494                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6495                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6496                     }
6497                 }else
6498                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6499             }
6500             for(list=0; list<2; list++){
6501                 if(IS_DIR(mb_type, 0, list)){
6502                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6503
6504                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6505                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6506                     tprintf("final mv:%d %d\n", mx, my);
6507
6508                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6509                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6510                 }else
6511                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6512             }
6513         }
6514         else if(IS_16X8(mb_type)){
6515             for(list=0; list<2; list++){
6516                 if(h->ref_count[list]>0){
6517                     for(i=0; i<2; i++){
6518                         if(IS_DIR(mb_type, i, list)){
6519                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6520                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6521                         }else
6522                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6523                     }
6524                 }
6525             }
6526             for(list=0; list<2; list++){
6527                 for(i=0; i<2; i++){
6528                     if(IS_DIR(mb_type, i, list)){
6529                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6530                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6531                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6532                         tprintf("final mv:%d %d\n", mx, my);
6533
6534                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6535                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6536                     }else{
6537                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6538                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6539                     }
6540                 }
6541             }
6542         }else{
6543             assert(IS_8X16(mb_type));
6544             for(list=0; list<2; list++){
6545                 if(h->ref_count[list]>0){
6546                     for(i=0; i<2; i++){
6547                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6548                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6549                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6550                         }else
6551                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6552                     }
6553                 }
6554             }
6555             for(list=0; list<2; list++){
6556                 for(i=0; i<2; i++){
6557                     if(IS_DIR(mb_type, i, list)){
6558                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6559                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6560                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6561
6562                         tprintf("final mv:%d %d\n", mx, my);
6563                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6564                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6565                     }else{
6566                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6567                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6568                     }
6569                 }
6570             }
6571         }
6572     }
6573
6574    if( IS_INTER( mb_type ) ) {
6575         h->chroma_pred_mode_table[mb_xy] = 0;
6576         write_back_motion( h, mb_type );
6577    }
6578
6579     if( !IS_INTRA16x16( mb_type ) ) {
6580         cbp  = decode_cabac_mb_cbp_luma( h );
6581         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6582     }
6583
6584     h->cbp_table[mb_xy] = cbp;
6585
6586     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6587         if( decode_cabac_mb_transform_size( h ) )
6588             mb_type |= MB_TYPE_8x8DCT;
6589     }
6590     s->current_picture.mb_type[mb_xy]= mb_type;
6591
6592     if( cbp || IS_INTRA16x16( mb_type ) ) {
6593         const uint8_t *scan, *scan8x8, *dc_scan;
6594         int dqp;
6595
6596         if(IS_INTERLACED(mb_type)){
6597             scan8x8= s->qscale ? h->field_scan8x8 : h->field_scan8x8_q0;
6598             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6599             dc_scan= luma_dc_field_scan;
6600         }else{
6601             scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6602             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6603             dc_scan= luma_dc_zigzag_scan;
6604         }
6605
6606         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6607         if( dqp == INT_MIN ){
6608             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6609             return -1;
6610         }
6611         s->qscale += dqp;
6612         if(((unsigned)s->qscale) > 51){
6613             if(s->qscale<0) s->qscale+= 52;
6614             else            s->qscale-= 52;
6615         }
6616         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6617
6618         if( IS_INTRA16x16( mb_type ) ) {
6619             int i;
6620             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6621             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6622                 return -1;
6623             if( cbp&15 ) {
6624                 for( i = 0; i < 16; i++ ) {
6625                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6626                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6627                         return -1;
6628                 }
6629             } else {
6630                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6631             }
6632         } else {
6633             int i8x8, i4x4;
6634             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6635                 if( cbp & (1<<i8x8) ) {
6636                     if( IS_8x8DCT(mb_type) ) {
6637                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6638                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6639                             return -1;
6640                     } else
6641                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6642                         const int index = 4*i8x8 + i4x4;
6643                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6644                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6645                             return -1;
6646                     }
6647                 } else {
6648                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6649                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6650                 }
6651             }
6652         }
6653
6654         if( cbp&0x30 ){
6655             int c;
6656             for( c = 0; c < 2; c++ ) {
6657                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6658                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6659                     return -1;
6660             }
6661         }
6662
6663         if( cbp&0x20 ) {
6664             int c, i;
6665             for( c = 0; c < 2; c++ ) {
6666                 for( i = 0; i < 4; i++ ) {
6667                     const int index = 16 + 4 * c + i;
6668                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6669                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6670                         return -1;
6671                 }
6672             }
6673         } else {
6674             uint8_t * const nnz= &h->non_zero_count_cache[0];
6675             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6676             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6677         }
6678     } else {
6679         uint8_t * const nnz= &h->non_zero_count_cache[0];
6680         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6681         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6682         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6683         h->last_qscale_diff = 0;
6684     }
6685
6686     s->current_picture.qscale_table[mb_xy]= s->qscale;
6687     write_back_non_zero_count(h);
6688
6689     if(MB_MBAFF){
6690         h->ref_count[0] >>= 1;
6691         h->ref_count[1] >>= 1;
6692     }
6693
6694     return 0;
6695 }
6696
6697
6698 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6699     int i, d;
6700     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6701     const int alpha = alpha_table[index_a];
6702     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6703
6704     if( bS[0] < 4 ) {
6705         int8_t tc[4];
6706         for(i=0; i<4; i++)
6707             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6708         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6709     } else {
6710         /* 16px edge length, because bS=4 is triggered by being at
6711          * the edge of an intra MB, so all 4 bS are the same */
6712             for( d = 0; d < 16; d++ ) {
6713                 const int p0 = pix[-1];
6714                 const int p1 = pix[-2];
6715                 const int p2 = pix[-3];
6716
6717                 const int q0 = pix[0];
6718                 const int q1 = pix[1];
6719                 const int q2 = pix[2];
6720
6721                 if( ABS( p0 - q0 ) < alpha &&
6722                     ABS( p1 - p0 ) < beta &&
6723                     ABS( q1 - q0 ) < beta ) {
6724
6725                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6726                         if( ABS( p2 - p0 ) < beta)
6727                         {
6728                             const int p3 = pix[-4];
6729                             /* p0', p1', p2' */
6730                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6731                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6732                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6733                         } else {
6734                             /* p0' */
6735                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6736                         }
6737                         if( ABS( q2 - q0 ) < beta)
6738                         {
6739                             const int q3 = pix[3];
6740                             /* q0', q1', q2' */
6741                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6742                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6743                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6744                         } else {
6745                             /* q0' */
6746                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6747                         }
6748                     }else{
6749                         /* p0', q0' */
6750                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6751                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6752                     }
6753                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6754                 }
6755                 pix += stride;
6756             }
6757     }
6758 }
6759 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6760     int i;
6761     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6762     const int alpha = alpha_table[index_a];
6763     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6764
6765     if( bS[0] < 4 ) {
6766         int8_t tc[4];
6767         for(i=0; i<4; i++)
6768             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6769         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6770     } else {
6771         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6772     }
6773 }
6774
6775 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
6776     int i;
6777     for( i = 0; i < 16; i++, pix += stride) {
6778         int index_a;
6779         int alpha;
6780         int beta;
6781
6782         int qp_index;
6783         int bS_index = (i >> 1);
6784         if (!MB_FIELD) {
6785             bS_index &= ~1;
6786             bS_index |= (i & 1);
6787         }
6788
6789         if( bS[bS_index] == 0 ) {
6790             continue;
6791         }
6792
6793         qp_index = MB_FIELD ? (i >> 3) : (i & 1);
6794         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6795         alpha = alpha_table[index_a];
6796         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6797
6798         if( bS[bS_index] < 4 ) {
6799             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
6800             const int p0 = pix[-1];
6801             const int p1 = pix[-2];
6802             const int p2 = pix[-3];
6803             const int q0 = pix[0];
6804             const int q1 = pix[1];
6805             const int q2 = pix[2];
6806
6807             if( ABS( p0 - q0 ) < alpha &&
6808                 ABS( p1 - p0 ) < beta &&
6809                 ABS( q1 - q0 ) < beta ) {
6810                 int tc = tc0;
6811                 int i_delta;
6812
6813                 if( ABS( p2 - p0 ) < beta ) {
6814                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6815                     tc++;
6816                 }
6817                 if( ABS( q2 - q0 ) < beta ) {
6818                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6819                     tc++;
6820                 }
6821
6822                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6823                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6824                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6825                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6826             }
6827         }else{
6828             const int p0 = pix[-1];
6829             const int p1 = pix[-2];
6830             const int p2 = pix[-3];
6831
6832             const int q0 = pix[0];
6833             const int q1 = pix[1];
6834             const int q2 = pix[2];
6835
6836             if( ABS( p0 - q0 ) < alpha &&
6837                 ABS( p1 - p0 ) < beta &&
6838                 ABS( q1 - q0 ) < beta ) {
6839
6840                 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6841                     if( ABS( p2 - p0 ) < beta)
6842                     {
6843                         const int p3 = pix[-4];
6844                         /* p0', p1', p2' */
6845                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6846                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6847                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6848                     } else {
6849                         /* p0' */
6850                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6851                     }
6852                     if( ABS( q2 - q0 ) < beta)
6853                     {
6854                         const int q3 = pix[3];
6855                         /* q0', q1', q2' */
6856                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6857                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6858                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6859                     } else {
6860                         /* q0' */
6861                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6862                     }
6863                 }else{
6864                     /* p0', q0' */
6865                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6866                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6867                 }
6868                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6869             }
6870         }
6871     }
6872 }
6873 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
6874     int i;
6875     for( i = 0; i < 8; i++, pix += stride) {
6876         int index_a;
6877         int alpha;
6878         int beta;
6879
6880         int qp_index;
6881         int bS_index = i;
6882
6883         if( bS[bS_index] == 0 ) {
6884             continue;
6885         }
6886
6887         qp_index = MB_FIELD ? (i >> 2) : (i & 1);
6888         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6889         alpha = alpha_table[index_a];
6890         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6891
6892         if( bS[bS_index] < 4 ) {
6893             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
6894             const int p0 = pix[-1];
6895             const int p1 = pix[-2];
6896             const int q0 = pix[0];
6897             const int q1 = pix[1];
6898
6899             if( ABS( p0 - q0 ) < alpha &&
6900                 ABS( p1 - p0 ) < beta &&
6901                 ABS( q1 - q0 ) < beta ) {
6902                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6903
6904                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6905                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6906                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6907             }
6908         }else{
6909             const int p0 = pix[-1];
6910             const int p1 = pix[-2];
6911             const int q0 = pix[0];
6912             const int q1 = pix[1];
6913
6914             if( ABS( p0 - q0 ) < alpha &&
6915                 ABS( p1 - p0 ) < beta &&
6916                 ABS( q1 - q0 ) < beta ) {
6917
6918                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6919                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6920                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6921             }
6922         }
6923     }
6924 }
6925
6926 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6927     int i, d;
6928     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6929     const int alpha = alpha_table[index_a];
6930     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6931     const int pix_next  = stride;
6932
6933     if( bS[0] < 4 ) {
6934         int8_t tc[4];
6935         for(i=0; i<4; i++)
6936             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6937         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6938     } else {
6939         /* 16px edge length, see filter_mb_edgev */
6940             for( d = 0; d < 16; d++ ) {
6941                 const int p0 = pix[-1*pix_next];
6942                 const int p1 = pix[-2*pix_next];
6943                 const int p2 = pix[-3*pix_next];
6944                 const int q0 = pix[0];
6945                 const int q1 = pix[1*pix_next];
6946                 const int q2 = pix[2*pix_next];
6947
6948                 if( ABS( p0 - q0 ) < alpha &&
6949                     ABS( p1 - p0 ) < beta &&
6950                     ABS( q1 - q0 ) < beta ) {
6951
6952                     const int p3 = pix[-4*pix_next];
6953                     const int q3 = pix[ 3*pix_next];
6954
6955                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6956                         if( ABS( p2 - p0 ) < beta) {
6957                             /* p0', p1', p2' */
6958                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6959                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6960                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6961                         } else {
6962                             /* p0' */
6963                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6964                         }
6965                         if( ABS( q2 - q0 ) < beta) {
6966                             /* q0', q1', q2' */
6967                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6968                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6969                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6970                         } else {
6971                             /* q0' */
6972                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6973                         }
6974                     }else{
6975                         /* p0', q0' */
6976                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6977                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6978                     }
6979                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6980                 }
6981                 pix++;
6982             }
6983     }
6984 }
6985
6986 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6987     int i;
6988     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6989     const int alpha = alpha_table[index_a];
6990     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6991
6992     if( bS[0] < 4 ) {
6993         int8_t tc[4];
6994         for(i=0; i<4; i++)
6995             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6996         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6997     } else {
6998         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6999     }
7000 }
7001
7002 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
7003     MpegEncContext * const s = &h->s;
7004     const int mb_xy= mb_x + mb_y*s->mb_stride;
7005     const int mb_type = s->current_picture.mb_type[mb_xy];
7006     const int mvy_limit = IS_INTERLACED(mb_type) ? 2 : 4;
7007     int first_vertical_edge_done = 0;
7008     int dir;
7009     /* FIXME: A given frame may occupy more than one position in
7010      * the reference list. So ref2frm should be populated with
7011      * frame numbers, not indices. */
7012     static const int ref2frm[34] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,
7013                                     16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31};
7014
7015     //for sufficiently low qp, filtering wouldn't do anything
7016     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
7017     if(!FRAME_MBAFF){
7018         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
7019         int qp = s->current_picture.qscale_table[mb_xy];
7020         if(qp <= qp_thresh
7021            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
7022            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
7023             return;
7024         }
7025     }
7026
7027     if (FRAME_MBAFF
7028             // left mb is in picture
7029             && h->slice_table[mb_xy-1] != 255
7030             // and current and left pair do not have the same interlaced type
7031             && (IS_INTERLACED(mb_type) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
7032             // and left mb is in the same slice if deblocking_filter == 2
7033             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
7034         /* First vertical edge is different in MBAFF frames
7035          * There are 8 different bS to compute and 2 different Qp
7036          */
7037         const int pair_xy = mb_x + (mb_y&~1)*s->mb_stride;
7038         const int left_mb_xy[2] = { pair_xy-1, pair_xy-1+s->mb_stride };
7039         int bS[8];
7040         int qp[2];
7041         int chroma_qp[2];
7042         int mb_qp, mbn0_qp, mbn1_qp;
7043         int i;
7044         first_vertical_edge_done = 1;
7045
7046         if( IS_INTRA(mb_type) )
7047             bS[0] = bS[1] = bS[2] = bS[3] = bS[4] = bS[5] = bS[6] = bS[7] = 4;
7048         else {
7049             for( i = 0; i < 8; i++ ) {
7050                 int mbn_xy = MB_FIELD ? left_mb_xy[i>>2] : left_mb_xy[i&1];
7051
7052                 if( IS_INTRA( s->current_picture.mb_type[mbn_xy] ) )
7053                     bS[i] = 4;
7054                 else if( h->non_zero_count_cache[12+8*(i>>1)] != 0 ||
7055                          /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
7056                          h->non_zero_count[mbn_xy][MB_FIELD ? i&3 : (i>>2)+(mb_y&1)*2] )
7057                     bS[i] = 2;
7058                 else
7059                     bS[i] = 1;
7060             }
7061         }
7062
7063         mb_qp = s->current_picture.qscale_table[mb_xy];
7064         mbn0_qp = s->current_picture.qscale_table[left_mb_xy[0]];
7065         mbn1_qp = s->current_picture.qscale_table[left_mb_xy[1]];
7066         qp[0] = ( mb_qp + mbn0_qp + 1 ) >> 1;
7067         chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7068                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn0_qp ) + 1 ) >> 1;
7069         qp[1] = ( mb_qp + mbn1_qp + 1 ) >> 1;
7070         chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, mb_qp ) +
7071                          get_chroma_qp( h->pps.chroma_qp_index_offset, mbn1_qp ) + 1 ) >> 1;
7072
7073         /* Filter edge */
7074         tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
7075         { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7076         filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
7077         filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
7078         filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
7079     }
7080     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
7081     for( dir = 0; dir < 2; dir++ )
7082     {
7083         int edge;
7084         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
7085         const int mbm_type = s->current_picture.mb_type[mbm_xy];
7086         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
7087
7088         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
7089                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
7090         // how often to recheck mv-based bS when iterating between edges
7091         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
7092                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
7093         // how often to recheck mv-based bS when iterating along each edge
7094         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
7095
7096         if (first_vertical_edge_done) {
7097             start = 1;
7098             first_vertical_edge_done = 0;
7099         }
7100
7101         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
7102             start = 1;
7103
7104         if (FRAME_MBAFF && (dir == 1) && ((mb_y&1) == 0) && start == 0
7105             && !IS_INTERLACED(mb_type)
7106             && IS_INTERLACED(mbm_type)
7107             ) {
7108             // This is a special case in the norm where the filtering must
7109             // be done twice (one each of the field) even if we are in a
7110             // frame macroblock.
7111             //
7112             static const int nnz_idx[4] = {4,5,6,3};
7113             unsigned int tmp_linesize   = 2 *   linesize;
7114             unsigned int tmp_uvlinesize = 2 * uvlinesize;
7115             int mbn_xy = mb_xy - 2 * s->mb_stride;
7116             int qp, chroma_qp;
7117             int i, j;
7118             int bS[4];
7119
7120             for(j=0; j<2; j++, mbn_xy += s->mb_stride){
7121                 if( IS_INTRA(mb_type) ||
7122                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
7123                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
7124                 } else {
7125                     const uint8_t *mbn_nnz = h->non_zero_count[mbn_xy];
7126                     for( i = 0; i < 4; i++ ) {
7127                         if( h->non_zero_count_cache[scan8[0]+i] != 0 ||
7128                             mbn_nnz[nnz_idx[i]] != 0 )
7129                             bS[i] = 2;
7130                         else
7131                             bS[i] = 1;
7132                     }
7133                 }
7134                 // Do not use s->qscale as luma quantizer because it has not the same
7135                 // value in IPCM macroblocks.
7136                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7137                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
7138                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7139                 filter_mb_edgeh( h, &img_y[j*linesize], tmp_linesize, bS, qp );
7140                 chroma_qp = ( h->chroma_qp +
7141                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7142                 filter_mb_edgech( h, &img_cb[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7143                 filter_mb_edgech( h, &img_cr[j*uvlinesize], tmp_uvlinesize, bS, chroma_qp );
7144             }
7145
7146             start = 1;
7147         }
7148
7149         /* Calculate bS */
7150         for( edge = start; edge < edges; edge++ ) {
7151             /* mbn_xy: neighbor macroblock */
7152             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
7153             const int mbn_type = s->current_picture.mb_type[mbn_xy];
7154             int bS[4];
7155             int qp;
7156
7157             if( (edge&1) && IS_8x8DCT(mb_type) )
7158                 continue;
7159
7160             if( IS_INTRA(mb_type) ||
7161                 IS_INTRA(mbn_type) ) {
7162                 int value;
7163                 if (edge == 0) {
7164                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
7165                         || ((FRAME_MBAFF || (s->picture_structure != PICT_FRAME)) && (dir == 0))
7166                     ) {
7167                         value = 4;
7168                     } else {
7169                         value = 3;
7170                     }
7171                 } else {
7172                     value = 3;
7173                 }
7174                 bS[0] = bS[1] = bS[2] = bS[3] = value;
7175             } else {
7176                 int i, l;
7177                 int mv_done;
7178
7179                 if( edge & mask_edge ) {
7180                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
7181                     mv_done = 1;
7182                 }
7183                 else if( FRAME_MBAFF && IS_INTERLACED(mb_type ^ mbn_type)) {
7184                     bS[0] = bS[1] = bS[2] = bS[3] = 1;
7185                     mv_done = 1;
7186                 }
7187                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
7188                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
7189                     int bn_idx= b_idx - (dir ? 8:1);
7190                     int v = 0;
7191                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
7192                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7193                              ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7194                              ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit;
7195                     }
7196                     bS[0] = bS[1] = bS[2] = bS[3] = v;
7197                     mv_done = 1;
7198                 }
7199                 else
7200                     mv_done = 0;
7201
7202                 for( i = 0; i < 4; i++ ) {
7203                     int x = dir == 0 ? edge : i;
7204                     int y = dir == 0 ? i    : edge;
7205                     int b_idx= 8 + 4 + x + 8*y;
7206                     int bn_idx= b_idx - (dir ? 8:1);
7207
7208                     if( h->non_zero_count_cache[b_idx] != 0 ||
7209                         h->non_zero_count_cache[bn_idx] != 0 ) {
7210                         bS[i] = 2;
7211                     }
7212                     else if(!mv_done)
7213                     {
7214                         bS[i] = 0;
7215                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
7216                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
7217                                 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
7218                                 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= mvy_limit ) {
7219                                 bS[i] = 1;
7220                                 break;
7221                             }
7222                         }
7223                     }
7224                 }
7225
7226                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
7227                     continue;
7228             }
7229
7230             /* Filter edge */
7231             // Do not use s->qscale as luma quantizer because it has not the same
7232             // value in IPCM macroblocks.
7233             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
7234             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
7235             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
7236             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
7237             if( dir == 0 ) {
7238                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
7239                 if( (edge&1) == 0 ) {
7240                     int chroma_qp = ( h->chroma_qp +
7241                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7242                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
7243                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
7244                 }
7245             } else {
7246                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
7247                 if( (edge&1) == 0 ) {
7248                     int chroma_qp = ( h->chroma_qp +
7249                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
7250                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7251                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
7252                 }
7253             }
7254         }
7255     }
7256 }
7257
7258 static int decode_slice(H264Context *h){
7259     MpegEncContext * const s = &h->s;
7260     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
7261
7262     s->mb_skip_run= -1;
7263
7264     if( h->pps.cabac ) {
7265         int i;
7266
7267         /* realign */
7268         align_get_bits( &s->gb );
7269
7270         /* init cabac */
7271         ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
7272         ff_init_cabac_decoder( &h->cabac,
7273                                s->gb.buffer + get_bits_count(&s->gb)/8,
7274                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
7275         /* calculate pre-state */
7276         for( i= 0; i < 460; i++ ) {
7277             int pre;
7278             if( h->slice_type == I_TYPE )
7279                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
7280             else
7281                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
7282
7283             if( pre <= 63 )
7284                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
7285             else
7286                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
7287         }
7288
7289         for(;;){
7290             int ret = decode_mb_cabac(h);
7291             int eos;
7292
7293             if(ret>=0) hl_decode_mb(h);
7294
7295             if( ret >= 0 && FRAME_MBAFF ) { //FIXME optimal? or let mb_decode decode 16x32 ?
7296                 s->mb_y++;
7297
7298                 if(ret>=0) ret = decode_mb_cabac(h);
7299
7300                 if(ret>=0) hl_decode_mb(h);
7301                 s->mb_y--;
7302             }
7303             eos = get_cabac_terminate( &h->cabac );
7304
7305             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
7306                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d, bytestream (%d)\n", s->mb_x, s->mb_y, h->cabac.bytestream_end - h->cabac.bytestream);
7307                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7308                 return -1;
7309             }
7310
7311             if( ++s->mb_x >= s->mb_width ) {
7312                 s->mb_x = 0;
7313                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7314                 ++s->mb_y;
7315                 if(FRAME_MBAFF) {
7316                     ++s->mb_y;
7317                 }
7318             }
7319
7320             if( eos || s->mb_y >= s->mb_height ) {
7321                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7322                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7323                 return 0;
7324             }
7325         }
7326
7327     } else {
7328         for(;;){
7329             int ret = decode_mb_cavlc(h);
7330
7331             if(ret>=0) hl_decode_mb(h);
7332
7333             if(ret>=0 && FRAME_MBAFF){ //FIXME optimal? or let mb_decode decode 16x32 ?
7334                 s->mb_y++;
7335                 ret = decode_mb_cavlc(h);
7336
7337                 if(ret>=0) hl_decode_mb(h);
7338                 s->mb_y--;
7339             }
7340
7341             if(ret<0){
7342                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7343                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7344
7345                 return -1;
7346             }
7347
7348             if(++s->mb_x >= s->mb_width){
7349                 s->mb_x=0;
7350                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
7351                 ++s->mb_y;
7352                 if(FRAME_MBAFF) {
7353                     ++s->mb_y;
7354                 }
7355                 if(s->mb_y >= s->mb_height){
7356                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7357
7358                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
7359                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7360
7361                         return 0;
7362                     }else{
7363                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7364
7365                         return -1;
7366                     }
7367                 }
7368             }
7369
7370             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
7371                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
7372                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
7373                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7374
7375                     return 0;
7376                 }else{
7377                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7378
7379                     return -1;
7380                 }
7381             }
7382         }
7383     }
7384
7385 #if 0
7386     for(;s->mb_y < s->mb_height; s->mb_y++){
7387         for(;s->mb_x < s->mb_width; s->mb_x++){
7388             int ret= decode_mb(h);
7389
7390             hl_decode_mb(h);
7391
7392             if(ret<0){
7393                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
7394                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7395
7396                 return -1;
7397             }
7398
7399             if(++s->mb_x >= s->mb_width){
7400                 s->mb_x=0;
7401                 if(++s->mb_y >= s->mb_height){
7402                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7403                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7404
7405                         return 0;
7406                     }else{
7407                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7408
7409                         return -1;
7410                     }
7411                 }
7412             }
7413
7414             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7415                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7416                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7417
7418                     return 0;
7419                 }else{
7420                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7421
7422                     return -1;
7423                 }
7424             }
7425         }
7426         s->mb_x=0;
7427         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7428     }
7429 #endif
7430     return -1; //not reached
7431 }
7432
7433 static int decode_unregistered_user_data(H264Context *h, int size){
7434     MpegEncContext * const s = &h->s;
7435     uint8_t user_data[16+256];
7436     int e, build, i;
7437
7438     if(size<16)
7439         return -1;
7440
7441     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7442         user_data[i]= get_bits(&s->gb, 8);
7443     }
7444
7445     user_data[i]= 0;
7446     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7447     if(e==1 && build>=0)
7448         h->x264_build= build;
7449
7450     if(s->avctx->debug & FF_DEBUG_BUGS)
7451         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7452
7453     for(; i<size; i++)
7454         skip_bits(&s->gb, 8);
7455
7456     return 0;
7457 }
7458
7459 static int decode_sei(H264Context *h){
7460     MpegEncContext * const s = &h->s;
7461
7462     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7463         int size, type;
7464
7465         type=0;
7466         do{
7467             type+= show_bits(&s->gb, 8);
7468         }while(get_bits(&s->gb, 8) == 255);
7469
7470         size=0;
7471         do{
7472             size+= show_bits(&s->gb, 8);
7473         }while(get_bits(&s->gb, 8) == 255);
7474
7475         switch(type){
7476         case 5:
7477             if(decode_unregistered_user_data(h, size) < 0)
7478                 return -1;
7479             break;
7480         default:
7481             skip_bits(&s->gb, 8*size);
7482         }
7483
7484         //FIXME check bits here
7485         align_get_bits(&s->gb);
7486     }
7487
7488     return 0;
7489 }
7490
7491 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7492     MpegEncContext * const s = &h->s;
7493     int cpb_count, i;
7494     cpb_count = get_ue_golomb(&s->gb) + 1;
7495     get_bits(&s->gb, 4); /* bit_rate_scale */
7496     get_bits(&s->gb, 4); /* cpb_size_scale */
7497     for(i=0; i<cpb_count; i++){
7498         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7499         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7500         get_bits1(&s->gb);     /* cbr_flag */
7501     }
7502     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7503     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7504     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7505     get_bits(&s->gb, 5); /* time_offset_length */
7506 }
7507
7508 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7509     MpegEncContext * const s = &h->s;
7510     int aspect_ratio_info_present_flag, aspect_ratio_idc;
7511     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7512
7513     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7514
7515     if( aspect_ratio_info_present_flag ) {
7516         aspect_ratio_idc= get_bits(&s->gb, 8);
7517         if( aspect_ratio_idc == EXTENDED_SAR ) {
7518             sps->sar.num= get_bits(&s->gb, 16);
7519             sps->sar.den= get_bits(&s->gb, 16);
7520         }else if(aspect_ratio_idc < 14){
7521             sps->sar=  pixel_aspect[aspect_ratio_idc];
7522         }else{
7523             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7524             return -1;
7525         }
7526     }else{
7527         sps->sar.num=
7528         sps->sar.den= 0;
7529     }
7530 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7531
7532     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7533         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7534     }
7535
7536     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7537         get_bits(&s->gb, 3);    /* video_format */
7538         get_bits1(&s->gb);      /* video_full_range_flag */
7539         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7540             get_bits(&s->gb, 8); /* colour_primaries */
7541             get_bits(&s->gb, 8); /* transfer_characteristics */
7542             get_bits(&s->gb, 8); /* matrix_coefficients */
7543         }
7544     }
7545
7546     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7547         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7548         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7549     }
7550
7551     sps->timing_info_present_flag = get_bits1(&s->gb);
7552     if(sps->timing_info_present_flag){
7553         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7554         sps->time_scale = get_bits_long(&s->gb, 32);
7555         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7556     }
7557
7558     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7559     if(nal_hrd_parameters_present_flag)
7560         decode_hrd_parameters(h, sps);
7561     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7562     if(vcl_hrd_parameters_present_flag)
7563         decode_hrd_parameters(h, sps);
7564     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7565         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7566     get_bits1(&s->gb);         /* pic_struct_present_flag */
7567
7568     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7569     if(sps->bitstream_restriction_flag){
7570         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7571         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7572         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7573         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7574         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7575         sps->num_reorder_frames = get_ue_golomb(&s->gb);
7576         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
7577     }
7578
7579     return 0;
7580 }
7581
7582 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7583                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7584     MpegEncContext * const s = &h->s;
7585     int i, last = 8, next = 8;
7586     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7587     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7588         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7589     else
7590     for(i=0;i<size;i++){
7591         if(next)
7592             next = (last + get_se_golomb(&s->gb)) & 0xff;
7593         if(!i && !next){ /* matrix not written, we use the preset one */
7594             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7595             break;
7596         }
7597         last = factors[scan[i]] = next ? next : last;
7598     }
7599 }
7600
7601 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7602                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7603     MpegEncContext * const s = &h->s;
7604     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7605     const uint8_t *fallback[4] = {
7606         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7607         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7608         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7609         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7610     };
7611     if(get_bits1(&s->gb)){
7612         sps->scaling_matrix_present |= is_sps;
7613         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7614         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7615         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7616         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7617         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7618         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7619         if(is_sps || pps->transform_8x8_mode){
7620             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7621             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7622         }
7623     } else if(fallback_sps) {
7624         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7625         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7626     }
7627 }
7628
7629 static inline int decode_seq_parameter_set(H264Context *h){
7630     MpegEncContext * const s = &h->s;
7631     int profile_idc, level_idc;
7632     int sps_id, i;
7633     SPS *sps;
7634
7635     profile_idc= get_bits(&s->gb, 8);
7636     get_bits1(&s->gb);   //constraint_set0_flag
7637     get_bits1(&s->gb);   //constraint_set1_flag
7638     get_bits1(&s->gb);   //constraint_set2_flag
7639     get_bits1(&s->gb);   //constraint_set3_flag
7640     get_bits(&s->gb, 4); // reserved
7641     level_idc= get_bits(&s->gb, 8);
7642     sps_id= get_ue_golomb(&s->gb);
7643
7644     sps= &h->sps_buffer[ sps_id ];
7645     sps->profile_idc= profile_idc;
7646     sps->level_idc= level_idc;
7647
7648     if(sps->profile_idc >= 100){ //high profile
7649         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7650             get_bits1(&s->gb);  //residual_color_transform_flag
7651         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7652         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7653         sps->transform_bypass = get_bits1(&s->gb);
7654         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7655     }else
7656         sps->scaling_matrix_present = 0;
7657
7658     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7659     sps->poc_type= get_ue_golomb(&s->gb);
7660
7661     if(sps->poc_type == 0){ //FIXME #define
7662         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7663     } else if(sps->poc_type == 1){//FIXME #define
7664         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7665         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7666         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7667         sps->poc_cycle_length= get_ue_golomb(&s->gb);
7668
7669         for(i=0; i<sps->poc_cycle_length; i++)
7670             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7671     }
7672     if(sps->poc_type > 2){
7673         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7674         return -1;
7675     }
7676
7677     sps->ref_frame_count= get_ue_golomb(&s->gb);
7678     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
7679         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7680     }
7681     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7682     sps->mb_width= get_ue_golomb(&s->gb) + 1;
7683     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7684     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7685        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
7686         return -1;
7687
7688     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7689     if(!sps->frame_mbs_only_flag)
7690         sps->mb_aff= get_bits1(&s->gb);
7691     else
7692         sps->mb_aff= 0;
7693
7694     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7695
7696 #ifndef ALLOW_INTERLACE
7697     if(sps->mb_aff)
7698         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF support not included; enable it compilation time\n");
7699 #endif
7700     if(!sps->direct_8x8_inference_flag && sps->mb_aff)
7701         av_log(h->s.avctx, AV_LOG_ERROR, "MBAFF + !direct_8x8_inference is not implemented\n");
7702
7703     sps->crop= get_bits1(&s->gb);
7704     if(sps->crop){
7705         sps->crop_left  = get_ue_golomb(&s->gb);
7706         sps->crop_right = get_ue_golomb(&s->gb);
7707         sps->crop_top   = get_ue_golomb(&s->gb);
7708         sps->crop_bottom= get_ue_golomb(&s->gb);
7709         if(sps->crop_left || sps->crop_top){
7710             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7711         }
7712     }else{
7713         sps->crop_left  =
7714         sps->crop_right =
7715         sps->crop_top   =
7716         sps->crop_bottom= 0;
7717     }
7718
7719     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7720     if( sps->vui_parameters_present_flag )
7721         decode_vui_parameters(h, sps);
7722
7723     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7724         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7725                sps_id, sps->profile_idc, sps->level_idc,
7726                sps->poc_type,
7727                sps->ref_frame_count,
7728                sps->mb_width, sps->mb_height,
7729                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7730                sps->direct_8x8_inference_flag ? "8B8" : "",
7731                sps->crop_left, sps->crop_right,
7732                sps->crop_top, sps->crop_bottom,
7733                sps->vui_parameters_present_flag ? "VUI" : ""
7734                );
7735     }
7736     return 0;
7737 }
7738
7739 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7740     MpegEncContext * const s = &h->s;
7741     int pps_id= get_ue_golomb(&s->gb);
7742     PPS *pps= &h->pps_buffer[pps_id];
7743
7744     pps->sps_id= get_ue_golomb(&s->gb);
7745     pps->cabac= get_bits1(&s->gb);
7746     pps->pic_order_present= get_bits1(&s->gb);
7747     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7748     if(pps->slice_group_count > 1 ){
7749         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7750         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7751         switch(pps->mb_slice_group_map_type){
7752         case 0:
7753 #if 0
7754 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7755 |    run_length[ i ]                                |1  |ue(v)   |
7756 #endif
7757             break;
7758         case 2:
7759 #if 0
7760 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7761 |{                                                  |   |        |
7762 |    top_left_mb[ i ]                               |1  |ue(v)   |
7763 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7764 |   }                                               |   |        |
7765 #endif
7766             break;
7767         case 3:
7768         case 4:
7769         case 5:
7770 #if 0
7771 |   slice_group_change_direction_flag               |1  |u(1)    |
7772 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7773 #endif
7774             break;
7775         case 6:
7776 #if 0
7777 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7778 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7779 |)                                                  |   |        |
7780 |    slice_group_id[ i ]                            |1  |u(v)    |
7781 #endif
7782             break;
7783         }
7784     }
7785     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7786     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7787     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
7788         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7789         return -1;
7790     }
7791
7792     pps->weighted_pred= get_bits1(&s->gb);
7793     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7794     pps->init_qp= get_se_golomb(&s->gb) + 26;
7795     pps->init_qs= get_se_golomb(&s->gb) + 26;
7796     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7797     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7798     pps->constrained_intra_pred= get_bits1(&s->gb);
7799     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7800
7801     pps->transform_8x8_mode= 0;
7802     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7803     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7804     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7805
7806     if(get_bits_count(&s->gb) < bit_length){
7807         pps->transform_8x8_mode= get_bits1(&s->gb);
7808         decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7809         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7810     }
7811
7812     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7813         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7814                pps_id, pps->sps_id,
7815                pps->cabac ? "CABAC" : "CAVLC",
7816                pps->slice_group_count,
7817                pps->ref_count[0], pps->ref_count[1],
7818                pps->weighted_pred ? "weighted" : "",
7819                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7820                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7821                pps->constrained_intra_pred ? "CONSTR" : "",
7822                pps->redundant_pic_cnt_present ? "REDU" : "",
7823                pps->transform_8x8_mode ? "8x8DCT" : ""
7824                );
7825     }
7826
7827     return 0;
7828 }
7829
7830 /**
7831  * finds the end of the current frame in the bitstream.
7832  * @return the position of the first byte of the next frame, or -1
7833  */
7834 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7835     int i;
7836     uint32_t state;
7837     ParseContext *pc = &(h->s.parse_context);
7838 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7839 //    mb_addr= pc->mb_addr - 1;
7840     state= pc->state;
7841     for(i=0; i<=buf_size; i++){
7842         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7843             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7844             if(pc->frame_start_found){
7845                 // If there isn't one more byte in the buffer
7846                 // the test on first_mb_in_slice cannot be done yet
7847                 // do it at next call.
7848                 if (i >= buf_size) break;
7849                 if (buf[i] & 0x80) {
7850                     // first_mb_in_slice is 0, probably the first nal of a new
7851                     // slice
7852                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7853                     pc->state=-1;
7854                     pc->frame_start_found= 0;
7855                     return i-4;
7856                 }
7857             }
7858             pc->frame_start_found = 1;
7859         }
7860         if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
7861            if(pc->frame_start_found){
7862                 pc->state=-1;
7863                 pc->frame_start_found= 0;
7864                 return i-4;
7865            }
7866         }
7867         if (i<buf_size)
7868             state= (state<<8) | buf[i];
7869     }
7870
7871     pc->state= state;
7872     return END_NOT_FOUND;
7873 }
7874
7875 static int h264_parse(AVCodecParserContext *s,
7876                       AVCodecContext *avctx,
7877                       uint8_t **poutbuf, int *poutbuf_size,
7878                       const uint8_t *buf, int buf_size)
7879 {
7880     H264Context *h = s->priv_data;
7881     ParseContext *pc = &h->s.parse_context;
7882     int next;
7883
7884     next= find_frame_end(h, buf, buf_size);
7885
7886     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
7887         *poutbuf = NULL;
7888         *poutbuf_size = 0;
7889         return buf_size;
7890     }
7891
7892     *poutbuf = (uint8_t *)buf;
7893     *poutbuf_size = buf_size;
7894     return next;
7895 }
7896
7897 static int h264_split(AVCodecContext *avctx,
7898                       const uint8_t *buf, int buf_size)
7899 {
7900     int i;
7901     uint32_t state = -1;
7902     int has_sps= 0;
7903
7904     for(i=0; i<=buf_size; i++){
7905         if((state&0xFFFFFF1F) == 0x107)
7906             has_sps=1;
7907 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7908         }*/
7909         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
7910             if(has_sps){
7911                 while(i>4 && buf[i-5]==0) i--;
7912                 return i-4;
7913             }
7914         }
7915         if (i<buf_size)
7916             state= (state<<8) | buf[i];
7917     }
7918     return 0;
7919 }
7920
7921
7922 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7923     MpegEncContext * const s = &h->s;
7924     AVCodecContext * const avctx= s->avctx;
7925     int buf_index=0;
7926 #if 0
7927     int i;
7928     for(i=0; i<50; i++){
7929         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7930     }
7931 #endif
7932     h->slice_num = 0;
7933     s->current_picture_ptr= NULL;
7934     for(;;){
7935         int consumed;
7936         int dst_length;
7937         int bit_length;
7938         uint8_t *ptr;
7939         int i, nalsize = 0;
7940
7941       if(h->is_avc) {
7942         if(buf_index >= buf_size) break;
7943         nalsize = 0;
7944         for(i = 0; i < h->nal_length_size; i++)
7945             nalsize = (nalsize << 8) | buf[buf_index++];
7946         if(nalsize <= 1){
7947             if(nalsize == 1){
7948                 buf_index++;
7949                 continue;
7950             }else{
7951                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7952                 break;
7953             }
7954         }
7955       } else {
7956         // start code prefix search
7957         for(; buf_index + 3 < buf_size; buf_index++){
7958             // this should allways succeed in the first iteration
7959             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7960                 break;
7961         }
7962
7963         if(buf_index+3 >= buf_size) break;
7964
7965         buf_index+=3;
7966       }
7967
7968         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7969         while(ptr[dst_length - 1] == 0 && dst_length > 1)
7970             dst_length--;
7971         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
7972
7973         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7974             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7975         }
7976
7977         if (h->is_avc && (nalsize != consumed))
7978             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7979
7980         buf_index += consumed;
7981
7982         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
7983            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7984             continue;
7985
7986         switch(h->nal_unit_type){
7987         case NAL_IDR_SLICE:
7988             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7989         case NAL_SLICE:
7990             init_get_bits(&s->gb, ptr, bit_length);
7991             h->intra_gb_ptr=
7992             h->inter_gb_ptr= &s->gb;
7993             s->data_partitioning = 0;
7994
7995             if(decode_slice_header(h) < 0){
7996                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7997                 break;
7998             }
7999             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
8000             if(h->redundant_pic_count==0 && s->hurry_up < 5
8001                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8002                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8003                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8004                && avctx->skip_frame < AVDISCARD_ALL)
8005                 decode_slice(h);
8006             break;
8007         case NAL_DPA:
8008             init_get_bits(&s->gb, ptr, bit_length);
8009             h->intra_gb_ptr=
8010             h->inter_gb_ptr= NULL;
8011             s->data_partitioning = 1;
8012
8013             if(decode_slice_header(h) < 0){
8014                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
8015             }
8016             break;
8017         case NAL_DPB:
8018             init_get_bits(&h->intra_gb, ptr, bit_length);
8019             h->intra_gb_ptr= &h->intra_gb;
8020             break;
8021         case NAL_DPC:
8022             init_get_bits(&h->inter_gb, ptr, bit_length);
8023             h->inter_gb_ptr= &h->inter_gb;
8024
8025             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
8026                && s->hurry_up < 5
8027                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
8028                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
8029                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
8030                && avctx->skip_frame < AVDISCARD_ALL)
8031                 decode_slice(h);
8032             break;
8033         case NAL_SEI:
8034             init_get_bits(&s->gb, ptr, bit_length);
8035             decode_sei(h);
8036             break;
8037         case NAL_SPS:
8038             init_get_bits(&s->gb, ptr, bit_length);
8039             decode_seq_parameter_set(h);
8040
8041             if(s->flags& CODEC_FLAG_LOW_DELAY)
8042                 s->low_delay=1;
8043
8044             if(avctx->has_b_frames < 2)
8045                 avctx->has_b_frames= !s->low_delay;
8046             break;
8047         case NAL_PPS:
8048             init_get_bits(&s->gb, ptr, bit_length);
8049
8050             decode_picture_parameter_set(h, bit_length);
8051
8052             break;
8053         case NAL_AUD:
8054         case NAL_END_SEQUENCE:
8055         case NAL_END_STREAM:
8056         case NAL_FILLER_DATA:
8057         case NAL_SPS_EXT:
8058         case NAL_AUXILIARY_SLICE:
8059             break;
8060         default:
8061             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
8062         }
8063     }
8064
8065     if(!s->current_picture_ptr) return buf_index; //no frame
8066
8067     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
8068     s->current_picture_ptr->pict_type= s->pict_type;
8069
8070     h->prev_frame_num_offset= h->frame_num_offset;
8071     h->prev_frame_num= h->frame_num;
8072     if(s->current_picture_ptr->reference){
8073         h->prev_poc_msb= h->poc_msb;
8074         h->prev_poc_lsb= h->poc_lsb;
8075     }
8076     if(s->current_picture_ptr->reference)
8077         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
8078
8079     ff_er_frame_end(s);
8080
8081     MPV_frame_end(s);
8082
8083     return buf_index;
8084 }
8085
8086 /**
8087  * returns the number of bytes consumed for building the current frame
8088  */
8089 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
8090     if(s->flags&CODEC_FLAG_TRUNCATED){
8091         pos -= s->parse_context.last_index;
8092         if(pos<0) pos=0; // FIXME remove (unneeded?)
8093
8094         return pos;
8095     }else{
8096         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
8097         if(pos+10>buf_size) pos=buf_size; // oops ;)
8098
8099         return pos;
8100     }
8101 }
8102
8103 static int decode_frame(AVCodecContext *avctx,
8104                              void *data, int *data_size,
8105                              uint8_t *buf, int buf_size)
8106 {
8107     H264Context *h = avctx->priv_data;
8108     MpegEncContext *s = &h->s;
8109     AVFrame *pict = data;
8110     int buf_index;
8111
8112     s->flags= avctx->flags;
8113     s->flags2= avctx->flags2;
8114
8115    /* no supplementary picture */
8116     if (buf_size == 0) {
8117         return 0;
8118     }
8119
8120     if(s->flags&CODEC_FLAG_TRUNCATED){
8121         int next= find_frame_end(h, buf, buf_size);
8122
8123         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
8124             return buf_size;
8125 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
8126     }
8127
8128     if(h->is_avc && !h->got_avcC) {
8129         int i, cnt, nalsize;
8130         unsigned char *p = avctx->extradata;
8131         if(avctx->extradata_size < 7) {
8132             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
8133             return -1;
8134         }
8135         if(*p != 1) {
8136             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
8137             return -1;
8138         }
8139         /* sps and pps in the avcC always have length coded with 2 bytes,
8140            so put a fake nal_length_size = 2 while parsing them */
8141         h->nal_length_size = 2;
8142         // Decode sps from avcC
8143         cnt = *(p+5) & 0x1f; // Number of sps
8144         p += 6;
8145         for (i = 0; i < cnt; i++) {
8146             nalsize = BE_16(p) + 2;
8147             if(decode_nal_units(h, p, nalsize) < 0) {
8148                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
8149                 return -1;
8150             }
8151             p += nalsize;
8152         }
8153         // Decode pps from avcC
8154         cnt = *(p++); // Number of pps
8155         for (i = 0; i < cnt; i++) {
8156             nalsize = BE_16(p) + 2;
8157             if(decode_nal_units(h, p, nalsize)  != nalsize) {
8158                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
8159                 return -1;
8160             }
8161             p += nalsize;
8162         }
8163         // Now store right nal length size, that will be use to parse all other nals
8164         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
8165         // Do not reparse avcC
8166         h->got_avcC = 1;
8167     }
8168
8169     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
8170         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
8171             return -1;
8172     }
8173
8174     buf_index=decode_nal_units(h, buf, buf_size);
8175     if(buf_index < 0)
8176         return -1;
8177
8178     //FIXME do something with unavailable reference frames
8179
8180 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
8181     if(!s->current_picture_ptr){
8182         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
8183         return -1;
8184     }
8185
8186     {
8187         Picture *out = s->current_picture_ptr;
8188 #if 0 //decode order
8189         *data_size = sizeof(AVFrame);
8190 #else
8191         /* Sort B-frames into display order */
8192         Picture *cur = s->current_picture_ptr;
8193         Picture *prev = h->delayed_output_pic;
8194         int i, pics, cross_idr, out_of_order, out_idx;
8195
8196         if(h->sps.bitstream_restriction_flag
8197            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
8198             s->avctx->has_b_frames = h->sps.num_reorder_frames;
8199             s->low_delay = 0;
8200         }
8201
8202         pics = 0;
8203         while(h->delayed_pic[pics]) pics++;
8204         h->delayed_pic[pics++] = cur;
8205         if(cur->reference == 0)
8206             cur->reference = 1;
8207
8208         cross_idr = 0;
8209         for(i=0; h->delayed_pic[i]; i++)
8210             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
8211                 cross_idr = 1;
8212
8213         out = h->delayed_pic[0];
8214         out_idx = 0;
8215         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
8216             if(h->delayed_pic[i]->poc < out->poc){
8217                 out = h->delayed_pic[i];
8218                 out_idx = i;
8219             }
8220
8221         out_of_order = !cross_idr && prev && out->poc < prev->poc;
8222         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
8223             { }
8224         else if(prev && pics <= s->avctx->has_b_frames)
8225             out = prev;
8226         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
8227            || (s->low_delay &&
8228             ((!cross_idr && prev && out->poc > prev->poc + 2)
8229              || cur->pict_type == B_TYPE)))
8230         {
8231             s->low_delay = 0;
8232             s->avctx->has_b_frames++;
8233             out = prev;
8234         }
8235         else if(out_of_order)
8236             out = prev;
8237
8238         if(out_of_order || pics > s->avctx->has_b_frames){
8239             for(i=out_idx; h->delayed_pic[i]; i++)
8240                 h->delayed_pic[i] = h->delayed_pic[i+1];
8241         }
8242
8243         if(prev == out)
8244             *data_size = 0;
8245         else
8246             *data_size = sizeof(AVFrame);
8247         if(prev && prev != out && prev->reference == 1)
8248             prev->reference = 0;
8249         h->delayed_output_pic = out;
8250 #endif
8251
8252         if(out)
8253             *pict= *(AVFrame*)out;
8254         else
8255             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
8256     }
8257
8258     assert(pict->data[0] || !*data_size);
8259     ff_print_debug_info(s, pict);
8260 //printf("out %d\n", (int)pict->data[0]);
8261 #if 0 //?
8262
8263     /* Return the Picture timestamp as the frame number */
8264     /* we substract 1 because it is added on utils.c    */
8265     avctx->frame_number = s->picture_number - 1;
8266 #endif
8267     return get_consumed_bytes(s, buf_index, buf_size);
8268 }
#if 0
/**
 * Fills h->mb_avail[] for the macroblock at (s->mb_x, s->mb_y).
 *
 * Entries 0..2 describe the macroblocks at index top_xy-1, top_xy and
 * top_xy+1, where top_xy is one mb_stride before the current macroblock;
 * entry 3 describes the one at mb_xy-1. Such an entry is set when that
 * position exists and its h->slice_table value equals h->slice_num.
 * Entries 4 and 5 are constants (1 and 0).
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first row: nothing above */
        h->mb_avail[2]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[0]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }

    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
8288
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self test.
 *
 * Runs, in order:
 *  - a write/read round trip of unsigned exp golomb codes for 0..COUNT-1
 *  - a write/read round trip of signed exp golomb codes for
 *    -COUNT/2..COUNT/2-1
 *  - a 4x4 forward transform followed by h264_idct_add on random blocks,
 *    printing the mean squared and the maximum absolute error
 *  - an encode_nal/decode_nal round trip on random payloads containing a
 *    growing number of zero bytes
 *
 * Golomb mismatches are only printed; a failure in the NAL round trip
 * makes the test return -1.
 *
 * @return 0 on success, -1 on a NAL layer failure
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        /* remember the upcoming bits for the mismatch message */
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the index */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        /* use the DSPContext initialised above; no MpegEncContext exists
           in this test */
        dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        /* after adding the residual back, ref should be close to src */
        for(j=0; j<16; j++){
            int diff= ABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a payload without any zero byte ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then clear exactly `zeros` distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL");

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
8461
8462
8463 static int decode_end(AVCodecContext *avctx)
8464 {
8465     H264Context *h = avctx->priv_data;
8466     MpegEncContext *s = &h->s;
8467
8468     av_freep(&h->rbsp_buffer);
8469     free_tables(h); //FIXME cleanup init stuff perhaps
8470     MPV_common_end(s);
8471
8472 //    memset(h, 0, sizeof(H264Context));
8473
8474     return 0;
8475 }
8476
8477
/**
 * Descriptor of the H.264 decoder.
 * Lists, in positional order, the name "h264", CODEC_TYPE_VIDEO,
 * CODEC_ID_H264, sizeof(H264Context), decode_init, an unused NULL slot,
 * decode_end, decode_frame and the capability flags; flush_dpb is
 * attached through the designated .flush member.
 */
8478 AVCodec h264_decoder = {
8479     "h264",
8480     CODEC_TYPE_VIDEO,
8481     CODEC_ID_H264,
8482     sizeof(H264Context),
8483     decode_init,
8484     NULL, /* NOTE(review): presumably the encode callback, unused by a decoder -- confirm against AVCodec */
8485     decode_end,
8486     decode_frame,
8487     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8488     .flush= flush_dpb,
8489 };
8490
/**
 * Descriptor of the H.264 parser.
 * Lists, in positional order, the codec id list { CODEC_ID_H264 },
 * sizeof(H264Context), an unused NULL slot, h264_parse, ff_parse_close
 * and h264_split.
 */
8491 AVCodecParser h264_parser = {
8492     { CODEC_ID_H264 },
8493     sizeof(H264Context),
8494     NULL, /* NOTE(review): presumably the parser init callback, not needed here -- confirm against AVCodecParser */
8495     h264_parse,
8496     ff_parse_close,
8497     h264_split,
8498 };
8499
8500 #include "svq3.c"