git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18  *
  19  */
  20
  21 /**
  22  * @file h264.c
  23  * H.264 / AVC / MPEG4 part10 codec.
  24  * @author Michael Niedermayer <michaelni@gmx.at>
  25  */
  26
  27 #include "common.h"
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264data.h"
  32 #include "golomb.h"
  33
  34 #include "cabac.h"
  35
  36 #undef NDEBUG
  37 #include <assert.h>
  38
/* Redirect two identifiers to deliberately descriptive replacement names, so
 * that any use of them in this file refers to the replacement instead.
 * NOTE(review): presumably intended to make accidental use of the
 * like-named MpegEncContext fields fail to compile -- confirm. */
  39 #define interlaced_dct interlaced_dct_is_a_bad_name
  40 #define mb_intra mb_intra_isnt_initalized_see_mb_type
  41
/* NOTE(review): block indices for the luma / chroma DC coefficient blocks;
 * their users are not in this part of the file. */
  42 #define LUMA_DC_BLOCK_INDEX   25
  43 #define CHROMA_DC_BLOCK_INDEX 26
  44
/* NOTE(review): presumably the lookup widths (in bits) used when building and
 * reading the static VLC tables declared further down -- the init code is not
 * in this part of the file. */
  45 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  46 #define COEFF_TOKEN_VLC_BITS           8
  47 #define TOTAL_ZEROS_VLC_BITS           9
  48 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  49 #define RUN_VLC_BITS                   3
  50 #define RUN7_VLC_BITS                  6
  51
/* Capacities of H264Context.sps_buffer[] and H264Context.pps_buffer[]. */
  52 #define MAX_SPS_COUNT 32
  53 #define MAX_PPS_COUNT 256
  54
/* Capacity of H264Context.mmco[]. */
  55 #define MAX_MMCO_COUNT 66
  56
  57 /**
  58  * Sequence parameter set
  59  */
  60 typedef struct SPS{
    /* One sequence parameter set. H264Context stores an array of these
     * (sps_buffer[MAX_SPS_COUNT]) plus a separate "current" copy (sps).
     * Fields without a comment are named after an H.264 syntax element;
     * NOTE(review): presumably each holds that element's decoded value --
     * the parser is not in this part of the file. */
  61
  62     int profile_idc;
  63     int level_idc;
  64     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  65     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  66     int poc_type;                      ///< pic_order_cnt_type
  67     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4 (NOTE(review): no "+ 4" stated here, unlike log2_max_frame_num -- confirm against the parser)
  68     int delta_pic_order_always_zero_flag;
  69     int offset_for_non_ref_pic;
  70     int offset_for_top_to_bottom_field;
  71     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  72     int ref_frame_count;               ///< num_ref_frames
  73     int gaps_in_frame_num_allowed_flag;
  74     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  75     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  76     int frame_mbs_only_flag;
  77     int mb_aff;                        ///<mb_adaptive_frame_field_flag
  78     int direct_8x8_inference_flag;
  79     int crop;                   ///< frame_cropping_flag
  80     int crop_left;              ///< frame_cropping_rect_left_offset
  81     int crop_right;             ///< frame_cropping_rect_right_offset
  82     int crop_top;               ///< frame_cropping_rect_top_offset
  83     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
  84     int vui_parameters_present_flag;
  85     AVRational sar;
  86     int timing_info_present_flag;
  87     uint32_t num_units_in_tick;
  88     uint32_t time_scale;
  89     int fixed_frame_rate_flag;
  90     short offset_for_ref_frame[256]; //FIXME allocate dynamically?
  91     int bitstream_restriction_flag;
  92     int num_reorder_frames;
  93     int scaling_matrix_present;
    /* Same leading dimensions as H264Context.dequant4_buffer[6][..][16] and
     * dequant8_buffer[2][..][64]: six 4x4 lists and two 8x8 lists. */
  94     uint8_t scaling_matrix4[6][16];
  95     uint8_t scaling_matrix8[2][64];
  96 }SPS;
  97
  98 /**
  99  * Picture parameter set
 100  */
 101 typedef struct PPS{
    /* One picture parameter set. H264Context stores an array of these
     * (pps_buffer[MAX_PPS_COUNT]) plus a separate "current" copy (pps). */
 102     int sps_id;                 ///< NOTE(review): presumably the index of the SPS this PPS refers to -- confirm
 103     int cabac;                  ///< entropy_coding_mode_flag
 104     int pic_order_present;      ///< pic_order_present_flag
 105     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 106     int mb_slice_group_map_type;
 107     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 108     int weighted_pred;          ///< weighted_pred_flag
 109     int weighted_bipred_idc;
 110     int init_qp;                ///< pic_init_qp_minus26 + 26
 111     int init_qs;                ///< pic_init_qs_minus26 + 26
 112     int chroma_qp_index_offset;
 113     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 114     int constrained_intra_pred; ///< constrained_intra_pred_flag
 115     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 116     int transform_8x8_mode;     ///< transform_8x8_mode_flag
    /* Same shapes as the scaling matrices in SPS (six 4x4, two 8x8 lists). */
 117     uint8_t scaling_matrix4[6][16];
 118     uint8_t scaling_matrix8[2][64];
 119 }PPS;
 120
 121 /**
 122  * Memory management control operation opcode.
 123  */
 124 typedef enum MMCOOpcode{
    /* Only the first enumerator has an explicit value, so the codes run
     * 0..6 in declaration order.
     * NOTE(review): presumably chosen to equal the bitstream's
     * memory_management_control_operation values -- confirm with the parser. */
 125     MMCO_END=0,
 126     MMCO_SHORT2UNUSED,
 127     MMCO_LONG2UNUSED,
 128     MMCO_SHORT2LONG,
 129     MMCO_SET_MAX_LONG,
 130     MMCO_RESET,
 131     MMCO_LONG,
 132 } MMCOOpcode;
 133
 134 /**
 135  * Memory management control operation.
 136  */
 137 typedef struct MMCO{
    /* One queued operation; H264Context holds up to MAX_MMCO_COUNT of these
     * in mmco[]. NOTE(review): which of the two operands below is meaningful
     * presumably depends on the opcode -- the executor is not visible here. */
 138     MMCOOpcode opcode;
 139     int short_frame_num;
 140     int long_index;
 141 } MMCO;
 142
 143 /**
 144  * H264Context
 145  */
 146 typedef struct H264Context{
    /* Per-decoder state for this file. The generic MpegEncContext is embedded
     * as the first member; the rest is H.264 specific: parameter-set storage,
     * per-macroblock neighbour caches (filled by fill_caches()), POC and
     * weighted-prediction values, reference picture lists, MMCO queue and
     * CABAC state. */
 147     MpegEncContext s;
 148     int nal_ref_idc;
 149     int nal_unit_type;
/* NAL_* codes, declared next to nal_unit_type. They are preprocessor macros,
 * so they are not scoped to the struct: they stay defined for the rest of
 * the translation unit. */
 150 #define NAL_SLICE                1
 151 #define NAL_DPA                  2
 152 #define NAL_DPB                  3
 153 #define NAL_DPC                  4
 154 #define NAL_IDR_SLICE            5
 155 #define NAL_SEI                  6
 156 #define NAL_SPS                  7
 157 #define NAL_PPS                  8
 158 #define NAL_AUD                  9
 159 #define NAL_END_SEQUENCE        10
 160 #define NAL_END_STREAM          11
 161 #define NAL_FILLER_DATA         12
 162 #define NAL_SPS_EXT             13
 163 #define NAL_AUXILIARY_SLICE     19
 164     uint8_t *rbsp_buffer;
 165     unsigned int rbsp_buffer_size;
 166
 167     /**
 168       * Used to parse AVC variant of h264
 169       */
 170     int is_avc; ///< this flag is != 0 if codec is avc1
 171     int got_avcC; ///< flag used to parse avcC data only once
 172     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 173
 174     int chroma_qp; //QPc
 175
 176     int prev_mb_skipped; //FIXME remove (IMHO not used)
 177
 178     //prediction stuff
 179     int chroma_pred_mode;
 180     int intra16x16_pred_mode;
 181
    /* Neighbour macroblock indices; written by fill_caches(). */
 182     int top_mb_xy;
 183     int left_mb_xy[2];
 184
    /* fill_caches() addresses this cache as [x + 8*y]. */
 185     int8_t intra4x4_pred_mode_cache[5*8];
 186     int8_t (*intra4x4_pred_mode)[8];
 187     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 188     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 189     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 190     void (*pred16x16[4+3])(uint8_t *src, int stride);
    /* Availability bit masks; fill_caches() sets them for intra macroblocks
     * and clears bits for neighbours that cannot be used. */
 191     unsigned int topleft_samples_available;
 192     unsigned int top_samples_available;
 193     unsigned int topright_samples_available;
 194     unsigned int left_samples_available;
 195     uint8_t (*top_borders[2])[16+2*8];
 196     uint8_t left_border[2*(17+2*9)];
 197
 198     /**
 199      * non zero coeff count cache.
 200      * is 64 if not available.
     * (fill_caches() stores 0 instead of 64 for a missing neighbour when
     * CABAC is enabled and the current macroblock is not intra.)
 201      */
 202     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
 203     uint8_t (*non_zero_count)[16];
 204
 205     /**
 206      * Motion vector cache.
 207      */
 208     DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 209     DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]);
/* ref_cache markers written by fill_caches() for a neighbour that is not
 * inter coded: LIST_NOT_USED when the neighbour's mb_type is non-zero,
 * PART_NOT_AVAILABLE when it is zero (neighbour not usable). */
 210 #define LIST_NOT_USED -1 //FIXME rename?
 211 #define PART_NOT_AVAILABLE -2
 212
 213     /**
 214      * is 1 if the specific list MV&references are set to 0,0,-2.
     * fill_caches() resets the entry to 0 for every list it fills.
 215      */
 216     int mv_cache_clean[2];
 217
 218     /**
 219      * number of neighbors (top and/or left) that used 8x8 dct
 220      */
 221     int neighbor_transform_size;
 222
 223     /**
 224      * block_offset[ 0..23] for frame macroblocks
 225      * block_offset[24..47] for field macroblocks
 226      */
 227     int block_offset[2*(16+8)];
 228
    /* Per-macroblock lookup tables used by fill_caches() to find a
     * macroblock's position in motion_val[] (mb2b_xy, row pitch b_stride)
     * and in ref_index[] (mb2b8_xy, row pitch b8_stride). */
 229     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 230     uint32_t *mb2b8_xy;
 231     int b_stride; //FIXME use s->b4_stride
 232     int b8_stride;
 233
 234     int halfpel_flag;
 235     int thirdpel_flag;
 236
 237     int unknown_svq3_flag;
 238     int next_slice_index;
 239
 240     SPS sps_buffer[MAX_SPS_COUNT];
 241     SPS sps; ///< current sps
 242
 243     PPS pps_buffer[MAX_PPS_COUNT];
 244     /**
 245      * current pps
 246      */
 247     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 248
 249     uint32_t dequant4_buffer[6][52][16];
 250     uint32_t dequant8_buffer[2][52][64];
 251     uint32_t (*dequant4_coeff[6])[16];
 252     uint32_t (*dequant8_coeff[2])[64];
 253     int dequant_coeff_pps;     ///< reinit tables when pps changes
 254
    /* fill_caches() uses a neighbour for decoding only when its slice_table
     * entry equals slice_num; for deblocking it only requires the entry to
     * be < 255. */
 255     int slice_num;
 256     uint8_t *slice_table_base;
 257     uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1
 258     int slice_type;
 259     int slice_type_fixed;
 260
 261     //interlacing specific flags
 262     int mb_aff_frame;
 263     int mb_field_decoding_flag;
 264
 265     int sub_mb_type[4];
 266
 267     //POC stuff
 268     int poc_lsb;
 269     int poc_msb;
 270     int delta_poc_bottom;
 271     int delta_poc[2];
 272     int frame_num;
 273     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 274     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 275     int frame_num_offset;         ///< for POC type 2
 276     int prev_frame_num_offset;    ///< for POC type 2
 277     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 278
 279     /**
 280      * frame_num for frames or 2*frame_num for field pics.
 281      */
 282     int curr_pic_num;
 283
 284     /**
 285      * max_frame_num or 2*max_frame_num for field pics.
 286      */
 287     int max_pic_num;
 288
 289     //Weighted pred stuff
 290     int use_weight;
 291     int use_weight_chroma;
 292     int luma_log2_weight_denom;
 293     int chroma_log2_weight_denom;
 294     int luma_weight[2][16];
 295     int luma_offset[2][16];
 296     int chroma_weight[2][16][2];
 297     int chroma_offset[2][16][2];
 298     int implicit_weight[16][16];
 299
 300     //deblock
 301     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 302     int slice_alpha_c0_offset;
 303     int slice_beta_offset;
 304
 305     int redundant_pic_count;
 306
 307     int direct_spatial_mv_pred;
 308     int dist_scale_factor[16];
 309     int map_col_to_list0[2][16];
 310
 311     /**
 312      * num_ref_idx_l0/1_active_minus1 + 1
 313      */
 314     int ref_count[2];// FIXME split for AFF
 315     Picture *short_ref[32];
 316     Picture *long_ref[32];
 317     Picture default_ref_list[2][32];
 318     Picture ref_list[2][32]; //FIXME size?
 319     Picture field_ref_list[2][32]; //FIXME size?
 320     Picture *delayed_pic[16]; //FIXME size?
 321     Picture *delayed_output_pic;
 322
 323     /**
 324      * memory management control operations buffer.
 325      */
 326     MMCO mmco[MAX_MMCO_COUNT];
 327     int mmco_index;
 328
 329     int long_ref_count;  ///< number of actual long term references
 330     int short_ref_count; ///< number of actual short term references
 331
 332     //data partitioning
 333     GetBitContext intra_gb;
 334     GetBitContext inter_gb;
 335     GetBitContext *intra_gb_ptr;
 336     GetBitContext *inter_gb_ptr;
 337
 338     DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
 339
 340     /**
 341      * Cabac
 342      */
 343     CABACContext cabac;
 344     uint8_t      cabac_state[460];
 345     int          cabac_init_idc;
 346
 347     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 348     uint16_t     *cbp_table;
    /* Neighbour cbp values; filled by fill_caches() only when pps.cabac is set. */
 349     int top_cbp;
 350     int left_cbp;
 351     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 352     uint8_t     *chroma_pred_mode_table;
 353     int         last_qscale_diff;
 354     int16_t     (*mvd_table[2])[2];
 355     DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
 356     uint8_t     *direct_table;
 357     uint8_t     direct_cache[5*8];
 358
 359     uint8_t zigzag_scan[16];
 360     uint8_t field_scan[16];
 361     uint8_t zigzag_scan8x8[64];
 362     uint8_t zigzag_scan8x8_cavlc[64];
 363     const uint8_t *zigzag_scan_q0;
 364     const uint8_t *field_scan_q0;
 365     const uint8_t *zigzag_scan8x8_q0;
 366     const uint8_t *zigzag_scan8x8_cavlc_q0;
 367
 368     int x264_build;
 369 }H264Context;
 370
/* File-scope VLC tables. Their initialisation is not in this part of the
 * file. NOTE(review): presumably built once using the *_VLC_BITS widths
 * defined near the top of the file -- confirm. */
 371 static VLC coeff_token_vlc[4];
 372 static VLC chroma_dc_coeff_token_vlc;
 373
 374 static VLC total_zeros_vlc[15];
 375 static VLC chroma_dc_total_zeros_vlc[3];
 376
 377 static VLC run_vlc[6];
 378 static VLC run7_vlc;
 379
/* Forward declarations of static functions that are used before the point
 * where they are defined. */
 380 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 381 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 382 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 383
 384 static always_inline uint32_t pack16to32(int a, int b){
 385 #ifdef WORDS_BIGENDIAN
 386    return (b&0xFFFF) + (a<<16);
 387 #else
 388    return (a&0xFFFF) + (b<<16);
 389 #endif
 390 }
 391
 392 /**
 393  * fill a rectangle.
 394  * @param h height of the rectangle, should be a constant
 395  * @param w width of the rectangle, should be a constant
 396  * @param size the size of val (1 or 4), should be a constant
 397  */
 398 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
 399     uint8_t *p= (uint8_t*)vp;
 400     assert(size==1 || size==4);
 401
 402     w      *= size;
 403     stride *= size;
 404
 405     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 406     assert((stride&(w-1))==0);
 407 //FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it
 408     if(w==2 && h==2){
 409         *(uint16_t*)(p + 0)=
 410         *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101;
 411     }else if(w==2 && h==4){
 412         *(uint16_t*)(p + 0*stride)=
 413         *(uint16_t*)(p + 1*stride)=
 414         *(uint16_t*)(p + 2*stride)=
 415         *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101;
 416     }else if(w==4 && h==1){
 417         *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101;
 418     }else if(w==4 && h==2){
 419         *(uint32_t*)(p + 0*stride)=
 420         *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101;
 421     }else if(w==4 && h==4){
 422         *(uint32_t*)(p + 0*stride)=
 423         *(uint32_t*)(p + 1*stride)=
 424         *(uint32_t*)(p + 2*stride)=
 425         *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101;
 426     }else if(w==8 && h==1){
 427         *(uint32_t*)(p + 0)=
 428         *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101;
 429     }else if(w==8 && h==2){
 430         *(uint32_t*)(p + 0 + 0*stride)=
 431         *(uint32_t*)(p + 4 + 0*stride)=
 432         *(uint32_t*)(p + 0 + 1*stride)=
 433         *(uint32_t*)(p + 4 + 1*stride)=  size==4 ? val : val*0x01010101;
 434     }else if(w==8 && h==4){
 435         *(uint64_t*)(p + 0*stride)=
 436         *(uint64_t*)(p + 1*stride)=
 437         *(uint64_t*)(p + 2*stride)=
 438         *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 439     }else if(w==16 && h==2){
 440         *(uint64_t*)(p + 0+0*stride)=
 441         *(uint64_t*)(p + 8+0*stride)=
 442         *(uint64_t*)(p + 0+1*stride)=
 443         *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 444     }else if(w==16 && h==4){
 445         *(uint64_t*)(p + 0+0*stride)=
 446         *(uint64_t*)(p + 8+0*stride)=
 447         *(uint64_t*)(p + 0+1*stride)=
 448         *(uint64_t*)(p + 8+1*stride)=
 449         *(uint64_t*)(p + 0+2*stride)=
 450         *(uint64_t*)(p + 8+2*stride)=
 451         *(uint64_t*)(p + 0+3*stride)=
 452         *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 453     }else
 454         assert(0);
 455 }
 456
 457 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 458     MpegEncContext * const s = &h->s;
 459     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 460     int topleft_xy, top_xy, topright_xy, left_xy[2];
 461     int topleft_type, top_type, topright_type, left_type[2];
 462     int left_block[8];
 463     int i;
 464
 465     //FIXME deblocking can skip fill_caches much of the time with multiple slices too.
 466     // the actual condition is whether we're on the edge of a slice,
 467     // and even then the intra and nnz parts are unnecessary.
 468     if(for_deblock && h->slice_num == 1)
 469         return;
 470
 471     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 472
 473     top_xy     = mb_xy  - s->mb_stride;
 474     topleft_xy = top_xy - 1;
 475     topright_xy= top_xy + 1;
 476     left_xy[1] = left_xy[0] = mb_xy-1;
 477     left_block[0]= 0;
 478     left_block[1]= 1;
 479     left_block[2]= 2;
 480     left_block[3]= 3;
 481     left_block[4]= 7;
 482     left_block[5]= 10;
 483     left_block[6]= 8;
 484     left_block[7]= 11;
 485     if(h->mb_aff_frame){
 486         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 487         const int top_pair_xy      = pair_xy     - s->mb_stride;
 488         const int topleft_pair_xy  = top_pair_xy - 1;
 489         const int topright_pair_xy = top_pair_xy + 1;
 490         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 491         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 492         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 493         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 494         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 495         const int bottom = (s->mb_y & 1);
 496         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 497         if (bottom
 498                 ? !curr_mb_frame_flag // bottom macroblock
 499                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 500                 ) {
 501             top_xy -= s->mb_stride;
 502         }
 503         if (bottom
 504                 ? !curr_mb_frame_flag // bottom macroblock
 505                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 506                 ) {
 507             topleft_xy -= s->mb_stride;
 508         }
 509         if (bottom
 510                 ? !curr_mb_frame_flag // bottom macroblock
 511                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 512                 ) {
 513             topright_xy -= s->mb_stride;
 514         }
 515         if (left_mb_frame_flag != curr_mb_frame_flag) {
 516             left_xy[1] = left_xy[0] = pair_xy - 1;
 517             if (curr_mb_frame_flag) {
 518                 if (bottom) {
 519                     left_block[0]= 2;
 520                     left_block[1]= 2;
 521                     left_block[2]= 3;
 522                     left_block[3]= 3;
 523                     left_block[4]= 8;
 524                     left_block[5]= 11;
 525                     left_block[6]= 8;
 526                     left_block[7]= 11;
 527                 } else {
 528                     left_block[0]= 0;
 529                     left_block[1]= 0;
 530                     left_block[2]= 1;
 531                     left_block[3]= 1;
 532                     left_block[4]= 7;
 533                     left_block[5]= 10;
 534                     left_block[6]= 7;
 535                     left_block[7]= 10;
 536                 }
 537             } else {
 538                 left_xy[1] += s->mb_stride;
 539                 //left_block[0]= 0;
 540                 left_block[1]= 2;
 541                 left_block[2]= 0;
 542                 left_block[3]= 2;
 543                 //left_block[4]= 7;
 544                 left_block[5]= 10;
 545                 left_block[6]= 7;
 546                 left_block[7]= 10;
 547             }
 548         }
 549     }
 550
 551     h->top_mb_xy = top_xy;
 552     h->left_mb_xy[0] = left_xy[0];
 553     h->left_mb_xy[1] = left_xy[1];
 554     if(for_deblock){
 555         topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
 556         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 557         topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
 558         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 559         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 560     }else{
 561         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 562         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 563         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 564         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 565         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 566     }
 567
 568     if(IS_INTRA(mb_type)){
 569         h->topleft_samples_available=
 570         h->top_samples_available=
 571         h->left_samples_available= 0xFFFF;
 572         h->topright_samples_available= 0xEEEA;
 573
 574         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 575             h->topleft_samples_available= 0xB3FF;
 576             h->top_samples_available= 0x33FF;
 577             h->topright_samples_available= 0x26EA;
 578         }
 579         for(i=0; i<2; i++){
 580             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 581                 h->topleft_samples_available&= 0xDF5F;
 582                 h->left_samples_available&= 0x5F5F;
 583             }
 584         }
 585
 586         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 587             h->topleft_samples_available&= 0x7FFF;
 588
 589         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 590             h->topright_samples_available&= 0xFBFF;
 591
 592         if(IS_INTRA4x4(mb_type)){
 593             if(IS_INTRA4x4(top_type)){
 594                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 595                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 596                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 597                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 598             }else{
 599                 int pred;
 600                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 601                     pred= -1;
 602                 else{
 603                     pred= 2;
 604                 }
 605                 h->intra4x4_pred_mode_cache[4+8*0]=
 606                 h->intra4x4_pred_mode_cache[5+8*0]=
 607                 h->intra4x4_pred_mode_cache[6+8*0]=
 608                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 609             }
 610             for(i=0; i<2; i++){
 611                 if(IS_INTRA4x4(left_type[i])){
 612                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 613                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 614                 }else{
 615                     int pred;
 616                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 617                         pred= -1;
 618                     else{
 619                         pred= 2;
 620                     }
 621                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 622                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 623                 }
 624             }
 625         }
 626     }
 627
 628
 629 /*
 630 0 . T T. T T T T
 631 1 L . .L . . . .
 632 2 L . .L . . . .
 633 3 . T TL . . . .
 634 4 L . .L . . . .
 635 5 L . .. . . . .
 636 */
 637 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 638     if(top_type){
 639         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 640         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 641         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 642         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 643
 644         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 645         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 646
 647         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 648         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 649
 650     }else{
 651         h->non_zero_count_cache[4+8*0]=
 652         h->non_zero_count_cache[5+8*0]=
 653         h->non_zero_count_cache[6+8*0]=
 654         h->non_zero_count_cache[7+8*0]=
 655
 656         h->non_zero_count_cache[1+8*0]=
 657         h->non_zero_count_cache[2+8*0]=
 658
 659         h->non_zero_count_cache[1+8*3]=
 660         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 661
 662     }
 663
 664     for (i=0; i<2; i++) {
 665         if(left_type[i]){
 666             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 667             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 668             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 669             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 670         }else{
 671             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 672             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 673             h->non_zero_count_cache[0+8*1 +   8*i]=
 674             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 675         }
 676     }
 677
 678     if( h->pps.cabac ) {
 679         // top_cbp
 680         if(top_type) {
 681             h->top_cbp = h->cbp_table[top_xy];
 682         } else if(IS_INTRA(mb_type)) {
 683             h->top_cbp = 0x1C0;
 684         } else {
 685             h->top_cbp = 0;
 686         }
 687         // left_cbp
 688         if (left_type[0]) {
 689             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 690         } else if(IS_INTRA(mb_type)) {
 691             h->left_cbp = 0x1C0;
 692         } else {
 693             h->left_cbp = 0;
 694         }
 695         if (left_type[0]) {
 696             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 697         }
 698         if (left_type[1]) {
 699             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 700         }
 701     }
 702
 703 #if 1
 704     //FIXME direct mb can skip much of this
 705     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 706         int list;
 707         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 708             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 709                 /*if(!h->mv_cache_clean[list]){
 710                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 711                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 712                     h->mv_cache_clean[list]= 1;
 713                 }*/
 714                 continue;
 715             }
 716             h->mv_cache_clean[list]= 0;
 717
 718             if(IS_INTER(top_type)){
 719                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 720                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 721                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 722                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 723                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 724                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 725                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 726                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 727                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 728                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 729             }else{
 730                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 731                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 732                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 733                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 734                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 735             }
 736
 737             //FIXME unify cleanup or sth
 738             if(IS_INTER(left_type[0])){
 739                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 740                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 741                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 742                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 743                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 744                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 745             }else{
 746                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 747                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 748                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 749                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 750             }
 751
 752             if(IS_INTER(left_type[1])){
 753                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 754                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 755                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 756                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 757                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 758                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 759             }else{
 760                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 761                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 762                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 763                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 764                 assert((!left_type[0]) == (!left_type[1]));
 765             }
 766
 767             if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
 768                 continue;
 769
 770             if(IS_INTER(topleft_type)){
 771                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 772                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 773                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 774                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 775             }else{
 776                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 777                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 778             }
 779
 780             if(IS_INTER(topright_type)){
 781                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 782                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 783                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 784                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 785             }else{
 786                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 787                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 788             }
 789
 790
 791             h->ref_cache[list][scan8[5 ]+1] =
 792             h->ref_cache[list][scan8[7 ]+1] =
 793             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 794             h->ref_cache[list][scan8[4 ]] =
 795             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 796             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 797             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 798             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 799             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 800             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 801
 802             if( h->pps.cabac ) {
 803                 /* XXX beurk, Load mvd */
 804                 if(IS_INTER(topleft_type)){
 805                     const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 806                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
 807                 }else{
 808                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
 809                 }
 810
 811                 if(IS_INTER(top_type)){
 812                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 813                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 814                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 815                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 816                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 817                 }else{
 818                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 819                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 820                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 821                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 822                 }
 823                 if(IS_INTER(left_type[0])){
 824                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 825                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 826                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 827                 }else{
 828                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 829                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 830                 }
 831                 if(IS_INTER(left_type[1])){
 832                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 833                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 834                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 835                 }else{
 836                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 837                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 838                 }
 839                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 840                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 841                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 842                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 843                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 844
 845                 if(h->slice_type == B_TYPE){
 846                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 847
 848                     if(IS_DIRECT(top_type)){
 849                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 850                     }else if(IS_8X8(top_type)){
 851                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 852                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 853                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 854                     }else{
 855                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 856                     }
 857
 858                     //FIXME interlacing
 859                     if(IS_DIRECT(left_type[0])){
 860                         h->direct_cache[scan8[0] - 1 + 0*8]=
 861                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 862                     }else if(IS_8X8(left_type[0])){
 863                         int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
 864                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
 865                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
 866                     }else{
 867                         h->direct_cache[scan8[0] - 1 + 0*8]=
 868                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 869                     }
 870                 }
 871             }
 872         }
 873     }
 874 #endif
 875
 876     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 877 }
 878
 879 static inline void write_back_intra_pred_mode(H264Context *h){
 880     MpegEncContext * const s = &h->s;
 881     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 882
 883     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 884     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 885     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 886     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 887     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 888     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 889     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 890 }
 891
 892 /**
 893  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 894  */
 895 static inline int check_intra4x4_pred_mode(H264Context *h){
 896     MpegEncContext * const s = &h->s;
 897     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 898     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 899     int i;
 900
 901     if(!(h->top_samples_available&0x8000)){
 902         for(i=0; i<4; i++){
 903             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 904             if(status<0){
 905                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 906                 return -1;
 907             } else if(status){
 908                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 909             }
 910         }
 911     }
 912
 913     if(!(h->left_samples_available&0x8000)){
 914         for(i=0; i<4; i++){
 915             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 916             if(status<0){
 917                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 918                 return -1;
 919             } else if(status){
 920                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 921             }
 922         }
 923     }
 924
 925     return 0;
 926 } //FIXME cleanup like next
 927
 928 /**
 929  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 930  */
 931 static inline int check_intra_pred_mode(H264Context *h, int mode){
 932     MpegEncContext * const s = &h->s;
 933     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 934     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 935
 936     if(mode < 0 || mode > 6) {
 937         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 938         return -1;
 939     }
 940
 941     if(!(h->top_samples_available&0x8000)){
 942         mode= top[ mode ];
 943         if(mode<0){
 944             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 945             return -1;
 946         }
 947     }
 948
 949     if(!(h->left_samples_available&0x8000)){
 950         mode= left[ mode ];
 951         if(mode<0){
 952             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 953             return -1;
 954         }
 955     }
 956
 957     return mode;
 958 }
 959
 960 /**
 961  * gets the predicted intra4x4 prediction mode.
 962  */
 963 static inline int pred_intra_mode(H264Context *h, int n){
 964     const int index8= scan8[n];
 965     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 966     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 967     const int min= FFMIN(left, top);
 968
 969     tprintf("mode:%d %d min:%d\n", left ,top, min);
 970
 971     if(min<0) return DC_PRED;
 972     else      return min;
 973 }
 974
 975 static inline void write_back_non_zero_count(H264Context *h){
 976     MpegEncContext * const s = &h->s;
 977     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 978
 979     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 980     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 981     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 982     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 983     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 984     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 985     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 986
 987     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 988     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 989     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 990
 991     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 992     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 993     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 994 }
 995
 996 /**
 997  * gets the predicted number of non zero coefficients.
 998  * @param n block index
 999  */
1000 static inline int pred_non_zero_count(H264Context *h, int n){
1001     const int index8= scan8[n];
1002     const int left= h->non_zero_count_cache[index8 - 1];
1003     const int top = h->non_zero_count_cache[index8 - 8];
1004     int i= left + top;
1005
1006     if(i<64) i= (i+1)>>1;
1007
1008     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1009
1010     return i&31;
1011 }
1012
1013 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
1014     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1015
1016     if(topright_ref != PART_NOT_AVAILABLE){
1017         *C= h->mv_cache[list][ i - 8 + part_width ];
1018         return topright_ref;
1019     }else{
1020         tprintf("topright MV not available\n");
1021
1022         *C= h->mv_cache[list][ i - 8 - 1 ];
1023         return h->ref_cache[list][ i - 8 - 1 ];
1024     }
1025 }
1026
1027 /**
1028  * gets the predicted MV.
1029  * @param n the block index
1030  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1031  * @param mx the x component of the predicted motion vector
1032  * @param my the y component of the predicted motion vector
1033  */
1034 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1035     const int index8= scan8[n];
1036     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1037     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1038     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1039     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1040     const int16_t * C;
1041     int diagonal_ref, match_count;
1042
1043     assert(part_width==1 || part_width==2 || part_width==4);
1044
1045 /* mv_cache
1046   B . . A T T T T
1047   U . . L . . , .
1048   U . . L . . . .
1049   U . . L . . , .
1050   . . . L . . . .
1051 */
1052
1053     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1054     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1055     tprintf("pred_motion match_count=%d\n", match_count);
1056     if(match_count > 1){ //most common
1057         *mx= mid_pred(A[0], B[0], C[0]);
1058         *my= mid_pred(A[1], B[1], C[1]);
1059     }else if(match_count==1){
1060         if(left_ref==ref){
1061             *mx= A[0];
1062             *my= A[1];
1063         }else if(top_ref==ref){
1064             *mx= B[0];
1065             *my= B[1];
1066         }else{
1067             *mx= C[0];
1068             *my= C[1];
1069         }
1070     }else{
1071         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1072             *mx= A[0];
1073             *my= A[1];
1074         }else{
1075             *mx= mid_pred(A[0], B[0], C[0]);
1076             *my= mid_pred(A[1], B[1], C[1]);
1077         }
1078     }
1079
1080     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1081 }
1082
1083 /**
1084  * gets the directionally predicted 16x8 MV.
1085  * @param n the block index
1086  * @param mx the x component of the predicted motion vector
1087  * @param my the y component of the predicted motion vector
1088  */
1089 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1090     if(n==0){
1091         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1092         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1093
1094         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1095
1096         if(top_ref == ref){
1097             *mx= B[0];
1098             *my= B[1];
1099             return;
1100         }
1101     }else{
1102         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1103         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1104
1105         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1106
1107         if(left_ref == ref){
1108             *mx= A[0];
1109             *my= A[1];
1110             return;
1111         }
1112     }
1113
1114     //RARE
1115     pred_motion(h, n, 4, list, ref, mx, my);
1116 }
1117
1118 /**
1119  * gets the directionally predicted 8x16 MV.
1120  * @param n the block index
1121  * @param mx the x component of the predicted motion vector
1122  * @param my the y component of the predicted motion vector
1123  */
1124 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1125     if(n==0){
1126         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1127         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1128
1129         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1130
1131         if(left_ref == ref){
1132             *mx= A[0];
1133             *my= A[1];
1134             return;
1135         }
1136     }else{
1137         const int16_t * C;
1138         int diagonal_ref;
1139
1140         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1141
1142         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1143
1144         if(diagonal_ref == ref){
1145             *mx= C[0];
1146             *my= C[1];
1147             return;
1148         }
1149     }
1150
1151     //RARE
1152     pred_motion(h, n, 2, list, ref, mx, my);
1153 }
1154
1155 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1156     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1157     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1158
1159     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1160
1161     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1162        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1163        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1164
1165         *mx = *my = 0;
1166         return;
1167     }
1168
1169     pred_motion(h, 0, 4, 0, 0, mx, my);
1170
1171     return;
1172 }
1173
1174 static inline void direct_dist_scale_factor(H264Context * const h){
1175     const int poc = h->s.current_picture_ptr->poc;
1176     const int poc1 = h->ref_list[1][0].poc;
1177     int i;
1178     for(i=0; i<h->ref_count[0]; i++){
1179         int poc0 = h->ref_list[0][i].poc;
1180         int td = clip(poc1 - poc0, -128, 127);
1181         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1182             h->dist_scale_factor[i] = 256;
1183         }else{
1184             int tb = clip(poc - poc0, -128, 127);
1185             int tx = (16384 + (ABS(td) >> 1)) / td;
1186             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1187         }
1188     }
1189 }
1190 static inline void direct_ref_list_init(H264Context * const h){
1191     MpegEncContext * const s = &h->s;
1192     Picture * const ref1 = &h->ref_list[1][0];
1193     Picture * const cur = s->current_picture_ptr;
1194     int list, i, j;
1195     if(cur->pict_type == I_TYPE)
1196         cur->ref_count[0] = 0;
1197     if(cur->pict_type != B_TYPE)
1198         cur->ref_count[1] = 0;
1199     for(list=0; list<2; list++){
1200         cur->ref_count[list] = h->ref_count[list];
1201         for(j=0; j<h->ref_count[list]; j++)
1202             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1203     }
1204     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1205         return;
1206     for(list=0; list<2; list++){
1207         for(i=0; i<ref1->ref_count[list]; i++){
1208             const int poc = ref1->ref_poc[list][i];
1209             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1210             for(j=0; j<h->ref_count[list]; j++)
1211                 if(h->ref_list[list][j].poc == poc){
1212                     h->map_col_to_list0[list][i] = j;
1213                     break;
1214                 }
1215         }
1216     }
1217 }
1218
1219 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1220     MpegEncContext * const s = &h->s;
1221     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1222     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1223     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1224     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1225     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1226     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1227     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1228     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1229     const int is_b8x8 = IS_8X8(*mb_type);
1230     int sub_mb_type;
1231     int i8, i4;
1232
1233     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1234         /* FIXME save sub mb types from previous frames (or derive from MVs)
1235          * so we know exactly what block size to use */
1236         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1237         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1238     }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
1239         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1240         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1241     }else{
1242         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1243         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1244     }
1245     if(!is_b8x8)
1246         *mb_type |= MB_TYPE_DIRECT2;
1247
1248     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1249
1250     if(h->direct_spatial_mv_pred){
1251         int ref[2];
1252         int mv[2][2];
1253         int list;
1254
1255         /* ref = min(neighbors) */
1256         for(list=0; list<2; list++){
1257             int refa = h->ref_cache[list][scan8[0] - 1];
1258             int refb = h->ref_cache[list][scan8[0] - 8];
1259             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1260             if(refc == -2)
1261                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1262             ref[list] = refa;
1263             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1264                 ref[list] = refb;
1265             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1266                 ref[list] = refc;
1267             if(ref[list] < 0)
1268                 ref[list] = -1;
1269         }
1270
1271         if(ref[0] < 0 && ref[1] < 0){
1272             ref[0] = ref[1] = 0;
1273             mv[0][0] = mv[0][1] =
1274             mv[1][0] = mv[1][1] = 0;
1275         }else{
1276             for(list=0; list<2; list++){
1277                 if(ref[list] >= 0)
1278                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1279                 else
1280                     mv[list][0] = mv[list][1] = 0;
1281             }
1282         }
1283
1284         if(ref[1] < 0){
1285             *mb_type &= ~MB_TYPE_P0L1;
1286             sub_mb_type &= ~MB_TYPE_P0L1;
1287         }else if(ref[0] < 0){
1288             *mb_type &= ~MB_TYPE_P0L0;
1289             sub_mb_type &= ~MB_TYPE_P0L0;
1290         }
1291
1292         if(IS_16X16(*mb_type)){
1293             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1294             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1295             if(!IS_INTRA(mb_type_col)
1296                && (   (l1ref0[0] == 0 && ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1)
1297                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && ABS(l1mv1[0][0]) <= 1 && ABS(l1mv1[0][1]) <= 1
1298                        && (h->x264_build>33 || !h->x264_build)))){
1299                 if(ref[0] > 0)
1300                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1301                 else
1302                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1303                 if(ref[1] > 0)
1304                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1305                 else
1306                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1307             }else{
1308                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1309                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1310             }
1311         }else{
1312             for(i8=0; i8<4; i8++){
1313                 const int x8 = i8&1;
1314                 const int y8 = i8>>1;
1315
1316                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1317                     continue;
1318                 h->sub_mb_type[i8] = sub_mb_type;
1319
1320                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1321                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1322                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1323                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1324
1325                 /* col_zero_flag */
1326                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1327                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1328                                                   && (h->x264_build>33 || !h->x264_build)))){
1329                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1330                     if(IS_SUB_8X8(sub_mb_type)){
1331                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1332                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1333                             if(ref[0] == 0)
1334                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1335                             if(ref[1] == 0)
1336                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1337                         }
1338                     }else
1339                     for(i4=0; i4<4; i4++){
1340                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1341                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1342                             if(ref[0] == 0)
1343                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1344                             if(ref[1] == 0)
1345                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1346                         }
1347                     }
1348                 }
1349             }
1350         }
1351     }else{ /* direct temporal mv pred */
1352         if(IS_16X16(*mb_type)){
1353             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1354             if(IS_INTRA(mb_type_col)){
1355                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1356                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1357                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1358             }else{
1359                 const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]]
1360                                                 : h->map_col_to_list0[1][l1ref1[0]];
1361                 const int dist_scale_factor = h->dist_scale_factor[ref0];
1362                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1363                 int mv_l0[2];
1364                 mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1365                 mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1366                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1367                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1368                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1369             }
1370         }else{
1371             for(i8=0; i8<4; i8++){
1372                 const int x8 = i8&1;
1373                 const int y8 = i8>>1;
1374                 int ref0, dist_scale_factor;
1375                 const int16_t (*l1mv)[2]= l1mv0;
1376
1377                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1378                     continue;
1379                 h->sub_mb_type[i8] = sub_mb_type;
1380                 if(IS_INTRA(mb_type_col)){
1381                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1382                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1383                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1384                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1385                     continue;
1386                 }
1387
1388                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1389                 if(ref0 >= 0)
1390                     ref0 = h->map_col_to_list0[0][ref0];
1391                 else{
1392                     ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1393                     l1mv= l1mv1;
1394                 }
1395                 dist_scale_factor = h->dist_scale_factor[ref0];
1396
1397                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1398                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1399                 if(IS_SUB_8X8(sub_mb_type)){
1400                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1401                     int mx = (dist_scale_factor * mv_col[0] + 128) >> 8;
1402                     int my = (dist_scale_factor * mv_col[1] + 128) >> 8;
1403                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1404                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1405                 }else
1406                 for(i4=0; i4<4; i4++){
1407                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1408                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1409                     mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1410                     mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1411                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1412                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1413                 }
1414             }
1415         }
1416     }
1417 }
1418
1419 static inline void write_back_motion(H264Context *h, int mb_type){
1420     MpegEncContext * const s = &h->s;
1421     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1422     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1423     int list;
1424
1425     for(list=0; list<2; list++){
1426         int y;
1427         if(!USES_LIST(mb_type, list)){
1428             if(1){ //FIXME skip or never read if mb_type doesn't use it
1429                 for(y=0; y<4; y++){
1430                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
1431                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
1432                 }
1433                 if( h->pps.cabac ) {
1434                     /* FIXME needed ? */
1435                     for(y=0; y<4; y++){
1436                         *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
1437                         *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
1438                     }
1439                 }
1440                 for(y=0; y<2; y++){
1441                     s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]=
1442                     s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= LIST_NOT_USED;
1443                 }
1444             }
1445             continue;
1446         }
1447
1448         for(y=0; y<4; y++){
1449             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1450             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1451         }
1452         if( h->pps.cabac ) {
1453             for(y=0; y<4; y++){
1454                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1455                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1456             }
1457         }
1458         for(y=0; y<2; y++){
1459             s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
1460             s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
1461         }
1462     }
1463
1464     if(h->slice_type == B_TYPE && h->pps.cabac){
1465         if(IS_8X8(mb_type)){
1466             h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1467             h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1468             h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1469         }
1470     }
1471 }
1472
1473 /**
1474  * Decodes a network abstraction layer unit.
1475  * @param consumed is the number of bytes used as input
1476  * @param length is the length of the array
1477  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1478  * @returns decoded bytes, might be src+1 if no escapes
1479  */
1480 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1481     int i, si, di;
1482     uint8_t *dst;
1483
1484 //    src[0]&0x80;                //forbidden bit
1485     h->nal_ref_idc= src[0]>>5;
1486     h->nal_unit_type= src[0]&0x1F;
1487
1488     src++; length--;
1489 #if 0
1490     for(i=0; i<length; i++)
1491         printf("%2X ", src[i]);
1492 #endif
1493     for(i=0; i+1<length; i+=2){
1494         if(src[i]) continue;
1495         if(i>0 && src[i-1]==0) i--;
1496         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1497             if(src[i+2]!=3){
1498                 /* startcode, so we must be past the end */
1499                 length=i;
1500             }
1501             break;
1502         }
1503     }
1504
1505     if(i>=length-1){ //no escaped 0
1506         *dst_length= length;
1507         *consumed= length+1; //+1 for the header
1508         return src;
1509     }
1510
1511     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1512     dst= h->rbsp_buffer;
1513
1514 //printf("decoding esc\n");
1515     si=di=0;
1516     while(si<length){
1517         //remove escapes (very rare 1:2^22)
1518         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1519             if(src[si+2]==3){ //escape
1520                 dst[di++]= 0;
1521                 dst[di++]= 0;
1522                 si+=3;
1523                 continue;
1524             }else //next start code
1525                 break;
1526         }
1527
1528         dst[di++]= src[si++];
1529     }
1530
1531     *dst_length= di;
1532     *consumed= si + 1;//+1 for the header
1533 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1534     return dst;
1535 }
1536
1537 #if 0
/**
 * Writes a NAL unit: one header byte built from h->nal_ref_idc and
 * h->nal_unit_type, followed by the payload with a 03 byte inserted after
 * every 00 00 that is followed by a byte <= 3.
 * @param src the data which should be escaped
 * @param dst the target buffer, dst+1 == src is allowed as a special case
 * @param length the length of the src data
 * @param dst_length the length of the dst array
 * @returns length of escaped data in bytes (header included) or -1 if dst is
 *          too small for the escaped data
 */
static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
    int i, escape_count, si, di;
    uint8_t *temp;

    assert(length>=0);
    assert(dst_length>0);

    dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;

    if(length==0) return 1;

    /* first pass: count the sequences which need an escape byte */
    escape_count= 0;
    for(i=0; i<length; i+=2){
        if(src[i]) continue;
        if(i>0 && src[i-1]==0)
            i--;
        if(i+2<length && src[i+1]==0 && src[i+2]<=3){
            escape_count++;
            i+=2;
        }
    }

    if(escape_count==0){
        if(dst+1 != src)
            memcpy(dst+1, src, length);
        return length + 1;
    }

    if(length + escape_count + 1> dst_length)
        return -1;

    //this should be damn rare (hopefully)

    /* NOTE(review): the result of av_fast_realloc is used without a NULL check */
    h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
    temp= h->rbsp_buffer;
//printf("encoding esc\n");

    /* second pass: build the escaped data in the temporary buffer */
    si= 0;
    di= 0;
    while(si < length){
        if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
            temp[di++]= 0; si++;
            temp[di++]= 0; si++;
            temp[di++]= 3;
            temp[di++]= src[si++];
        }
        else
            temp[di++]= src[si++];
    }
    memcpy(dst+1, temp, length+escape_count);

    assert(di == length+escape_count);

    return di + 1;
}
1600
/**
 * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
 * A single 1 bit is written, followed by as many 0 bits as are needed to
 * make put_bits_count(pb) a multiple of 8.
 */
static void encode_rbsp_trailing(PutBitContext *pb){
    int length;
    put_bits(pb, 1, 1);
    length= (-put_bits_count(pb))&7; /* bits missing up to the next multiple of 8 */
    if(length) put_bits(pb, length, 0);
}
1610 #endif
1611
/**
 * identifies the exact end of the bitstream
 * The byte at src is searched for its lowest set bit.
 * @return the length of the trailing (1 if bit 0 is set, ... 8 if only bit 7
 *         is set), or 0 if damaged (no bit set)
 */
static int decode_rbsp_trailing(uint8_t *src){
    const int last_byte= *src;
    int bit;

    tprintf("rbsp trailing %X\n", last_byte);

    for(bit=0; bit<8; bit++){
        if((last_byte>>bit)&1)
            return bit+1;
    }
    return 0;
}
1628
1629 /**
1630  * idct tranforms the 16 dc values and dequantize them.
1631  * @param qp quantization parameter
1632  */
1633 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1634 #define stride 16
1635     int i;
1636     int temp[16]; //FIXME check if this is a good idea
1637     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1638     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1639
1640 //memset(block, 64, 2*256);
1641 //return;
1642     for(i=0; i<4; i++){
1643         const int offset= y_offset[i];
1644         const int z0= block[offset+stride*0] + block[offset+stride*4];
1645         const int z1= block[offset+stride*0] - block[offset+stride*4];
1646         const int z2= block[offset+stride*1] - block[offset+stride*5];
1647         const int z3= block[offset+stride*1] + block[offset+stride*5];
1648
1649         temp[4*i+0]= z0+z3;
1650         temp[4*i+1]= z1+z2;
1651         temp[4*i+2]= z1-z2;
1652         temp[4*i+3]= z0-z3;
1653     }
1654
1655     for(i=0; i<4; i++){
1656         const int offset= x_offset[i];
1657         const int z0= temp[4*0+i] + temp[4*2+i];
1658         const int z1= temp[4*0+i] - temp[4*2+i];
1659         const int z2= temp[4*1+i] - temp[4*3+i];
1660         const int z3= temp[4*1+i] + temp[4*3+i];
1661
1662         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1663         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1664         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1665         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1666     }
1667 }
1668
1669 #if 0
/**
 * dct transforms the 16 dc values.
 * Same butterfly structure as h264_luma_dc_dequant_idct_c, but the results
 * are halved instead of being dequantized.
 * @param qp quantization parameter ??? FIXME
 */
static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
//    const int qmul= dequant_coeff[qp][0];
    int i;
    int temp[16]; //FIXME check if this is a good idea
    static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
    static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};

    /* first pass: block -> temp */
    for(i=0; i<4; i++){
        const int offset= y_offset[i];
        const int z0= block[offset+stride*0] + block[offset+stride*4];
        const int z1= block[offset+stride*0] - block[offset+stride*4];
        const int z2= block[offset+stride*1] - block[offset+stride*5];
        const int z3= block[offset+stride*1] + block[offset+stride*5];

        temp[4*i+0]= z0+z3;
        temp[4*i+1]= z1+z2;
        temp[4*i+2]= z1-z2;
        temp[4*i+3]= z0-z3;
    }

    /* second pass: temp -> block, every result shifted right by one */
    for(i=0; i<4; i++){
        const int offset= x_offset[i];
        const int z0= temp[4*0+i] + temp[4*2+i];
        const int z1= temp[4*0+i] - temp[4*2+i];
        const int z2= temp[4*1+i] - temp[4*3+i];
        const int z3= temp[4*1+i] + temp[4*3+i];

        block[stride*0 +offset]= (z0 + z3)>>1;
        block[stride*2 +offset]= (z1 + z2)>>1;
        block[stride*8 +offset]= (z1 - z2)>>1;
        block[stride*10+offset]= (z0 - z3)>>1;
    }
}
1707 #endif
1708
1709 #undef xStride
1710 #undef stride
1711
1712 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1713     const int stride= 16*2;
1714     const int xStride= 16;
1715     int a,b,c,d,e;
1716
1717     a= block[stride*0 + xStride*0];
1718     b= block[stride*0 + xStride*1];
1719     c= block[stride*1 + xStride*0];
1720     d= block[stride*1 + xStride*1];
1721
1722     e= a-b;
1723     a= a+b;
1724     b= c-d;
1725     c= c+d;
1726
1727     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1728     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1729     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1730     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1731 }
1732
1733 #if 0
/**
 * 2x2 transform of the 4 chroma dc values located at block[0], block[16],
 * block[32] and block[48]; no scaling is applied.
 */
static void chroma_dc_dct_c(DCTELEM *block){
    const int stride= 16*2;
    const int xStride= 16;
    int a,b,c,d,e;

    a= block[stride*0 + xStride*0];
    b= block[stride*0 + xStride*1];
    c= block[stride*1 + xStride*0];
    d= block[stride*1 + xStride*1];

    /* sums and differences of each row, reusing a, b and c */
    e= a-b;
    a= a+b;
    b= c-d;
    c= c+d;

    block[stride*0 + xStride*0]= (a+c);
    block[stride*0 + xStride*1]= (e+b);
    block[stride*1 + xStride*0]= (a-c);
    block[stride*1 + xStride*1]= (e-b);
}
1754 #endif
1755
1756 /**
1757  * gets the chroma qp.
1758  */
1759 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1760
1761     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1762 }
1763
1764
1765 #if 0
/**
 * Transforms the difference of two 4x4 pixel blocks.
 * @param block receives the 16 coefficients, 4 per row
 * @param src1 first 4x4 pixel block
 * @param src2 second 4x4 pixel block, subtracted from src1
 * @param stride distance between two rows of src1 and src2
 */
static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
    int i;
    //FIXME try int temp instead of block

    /* first pass: one row of pixel differences per iteration */
    for(i=0; i<4; i++){
        const int d0= src1[0 + i*stride] - src2[0 + i*stride];
        const int d1= src1[1 + i*stride] - src2[1 + i*stride];
        const int d2= src1[2 + i*stride] - src2[2 + i*stride];
        const int d3= src1[3 + i*stride] - src2[3 + i*stride];
        const int z0= d0 + d3;
        const int z3= d0 - d3;
        const int z1= d1 + d2;
        const int z2= d1 - d2;

        block[0 + 4*i]=   z0 +   z1;
        block[1 + 4*i]= 2*z3 +   z2;
        block[2 + 4*i]=   z0 -   z1;
        block[3 + 4*i]=   z3 - 2*z2;
    }

    /* second pass: the same butterflies applied to each column of block */
    for(i=0; i<4; i++){
        const int z0= block[0*4 + i] + block[3*4 + i];
        const int z3= block[0*4 + i] - block[3*4 + i];
        const int z1= block[1*4 + i] + block[2*4 + i];
        const int z2= block[1*4 + i] - block[2*4 + i];

        block[0*4 + i]=   z0 +   z1;
        block[1*4 + i]= 2*z3 +   z2;
        block[2*4 + i]=   z0 -   z1;
        block[3*4 + i]=   z3 - 2*z2;
    }
}
1798 #endif
1799
//FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
//FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
/**
 * Quantizes the 16 coefficients of a block in place.
 * Every coefficient is multiplied by its quant_coeff[qscale] entry; products
 * whose magnitude does not exceed the threshold become 0, the others are
 * rounded with a bias of 1/3 (intra) or 1/6 (otherwise) of 1<<QUANT_SHIFT
 * and shifted right by QUANT_SHIFT.
 * @param scantable maps the scan position to the index within block
 * @param intra selects the larger rounding bias if non zero
 * @param seperate_dc if non zero block[0] is quantized on its own with a
 *        different shift and the scan starts at position 1
 * @return the scan position of the last non zero coefficient; if none was
 *         found this is -1, or 0 with seperate_dc
 */
static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
    int i;
    const int * const quant_table= quant_coeff[qscale];
    const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
    const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
    const unsigned int threshold2= (threshold1<<1);
    int last_non_zero;

    if(seperate_dc){
        if(qscale<=18){
            //avoid overflows
            /* uses the table of qscale+18 together with a shift which is 2 smaller */
            const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_coeff[qscale+18][0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT-2);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT-2);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }else{
            const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
            const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
            const unsigned int dc_threshold2= (dc_threshold1<<1);

            int level= block[0]*quant_table[0];
            if(((unsigned)(level+dc_threshold1))>dc_threshold2){
                if(level>0){
                    level= (dc_bias + level)>>(QUANT_SHIFT+1);
                    block[0]= level;
                }else{
                    level= (dc_bias - level)>>(QUANT_SHIFT+1);
                    block[0]= -level;
                }
//                last_non_zero = i;
            }else{
                block[0]=0;
            }
        }
        last_non_zero= 0;
        i=1;
    }else{
        last_non_zero= -1;
        i=0;
    }

    for(; i<16; i++){
        const int j= scantable[i];
        int level= block[j]*quant_table[j];

//        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
//           || bias-level >= (1<<(QMAT_SHIFT - 3))){
        /* single unsigned compare, true if level > threshold1 or level < -threshold1 */
        if(((unsigned)(level+threshold1))>threshold2){
            if(level>0){
                level= (bias + level)>>QUANT_SHIFT;
                block[j]= level;
            }else{
                level= (bias - level)>>QUANT_SHIFT;
                block[j]= -level;
            }
            last_non_zero = i;
        }else{
            block[j]=0;
        }
    }

    return last_non_zero;
}
1878
/**
 * 4x4 vertical intra prediction: the 4 samples above the block are copied
 * into each of its 4 rows.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= *(uint32_t*)(src-stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= top_row;
}
1886
/**
 * 4x4 horizontal intra prediction: each row is filled with the sample
 * directly left of it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        /* unsigned constant: sample*0x01010101 overflows a signed int for samples >= 128 */
        ((uint32_t*)(src+y*stride))[0]= src[-1+y*stride]*0x01010101U;
    }
}
1893
/**
 * 4x4 dc intra prediction: the block is filled with the rounded average of
 * the 4 samples above and the 4 samples left of it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    /* replicated with unsigned arithmetic, dc*0x01010101 overflows a signed int for dc >= 128 */
    const uint32_t splat= (uint32_t)dc * 0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= splat;
}
1903
/**
 * 4x4 dc intra prediction from the left edge only: the block is filled with
 * the rounded average of the 4 samples left of it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    /* replicated with unsigned arithmetic, dc*0x01010101 overflows a signed int for dc >= 128 */
    const uint32_t splat= (uint32_t)dc * 0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= splat;
}
1912
/**
 * 4x4 dc intra prediction from the top edge only: the block is filled with
 * the rounded average of the 4 samples above it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    /* replicated with unsigned arithmetic, dc*0x01010101 overflows a signed int for dc >= 128 */
    const uint32_t splat= (uint32_t)dc * 0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= splat;
}
1921
/**
 * 4x4 dc intra prediction without any neighbours: the block is filled
 * with the value 128.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src+y*stride)= grey;
}
1928
1929
/* Declares t4..t7 and loads them from topright[0..3].
   Expects a pointer named topright in the scope where it is used. */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];\

/* Declares l0..l3 and loads them from the column left of src (src[-1 + y*stride]).
   Expects src and stride in the scope where it is used. */
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];\

/* Declares t0..t3 and loads them from the row above src (src[x - stride]).
   Expects src and stride in the scope where it is used. */
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];\
1947
1948 static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
1949     const int lt= src[-1-1*stride];
1950     LOAD_TOP_EDGE
1951     LOAD_LEFT_EDGE
1952
1953     src[0+3*stride]=(l3 + 2*l2 + l1 + 2)>>2;
1954     src[0+2*stride]=
1955     src[1+3*stride]=(l2 + 2*l1 + l0 + 2)>>2;
1956     src[0+1*stride]=
1957     src[1+2*stride]=
1958     src[2+3*stride]=(l1 + 2*l0 + lt + 2)>>2;
1959     src[0+0*stride]=
1960     src[1+1*stride]=
1961     src[2+2*stride]=
1962     src[3+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
1963     src[1+0*stride]=
1964     src[2+1*stride]=
1965     src[3+2*stride]=(lt + 2*t0 + t1 + 2)>>2;
1966     src[2+0*stride]=
1967     src[3+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
1968     src[3+0*stride]=(t1 + 2*t2 + t3 + 2)>>2;
1969 }
1970
1971 static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
1972     LOAD_TOP_EDGE
1973     LOAD_TOP_RIGHT_EDGE
1974 //    LOAD_LEFT_EDGE
1975
1976     src[0+0*stride]=(t0 + t2 + 2*t1 + 2)>>2;
1977     src[1+0*stride]=
1978     src[0+1*stride]=(t1 + t3 + 2*t2 + 2)>>2;
1979     src[2+0*stride]=
1980     src[1+1*stride]=
1981     src[0+2*stride]=(t2 + t4 + 2*t3 + 2)>>2;
1982     src[3+0*stride]=
1983     src[2+1*stride]=
1984     src[1+2*stride]=
1985     src[0+3*stride]=(t3 + t5 + 2*t4 + 2)>>2;
1986     src[3+1*stride]=
1987     src[2+2*stride]=
1988     src[1+3*stride]=(t4 + t6 + 2*t5 + 2)>>2;
1989     src[3+2*stride]=
1990     src[2+3*stride]=(t5 + t7 + 2*t6 + 2)>>2;
1991     src[3+3*stride]=(t6 + 3*t7 + 2)>>2;
1992 }
1993
1994 static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
1995     const int lt= src[-1-1*stride];
1996     LOAD_TOP_EDGE
1997     LOAD_LEFT_EDGE
1998     const __attribute__((unused)) int unu= l3;
1999
2000     src[0+0*stride]=
2001     src[1+2*stride]=(lt + t0 + 1)>>1;
2002     src[1+0*stride]=
2003     src[2+2*stride]=(t0 + t1 + 1)>>1;
2004     src[2+0*stride]=
2005     src[3+2*stride]=(t1 + t2 + 1)>>1;
2006     src[3+0*stride]=(t2 + t3 + 1)>>1;
2007     src[0+1*stride]=
2008     src[1+3*stride]=(l0 + 2*lt + t0 + 2)>>2;
2009     src[1+1*stride]=
2010     src[2+3*stride]=(lt + 2*t0 + t1 + 2)>>2;
2011     src[2+1*stride]=
2012     src[3+3*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2013     src[3+1*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2014     src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2015     src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2016 }
2017
2018 static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
2019     LOAD_TOP_EDGE
2020     LOAD_TOP_RIGHT_EDGE
2021     const __attribute__((unused)) int unu= t7;
2022
2023     src[0+0*stride]=(t0 + t1 + 1)>>1;
2024     src[1+0*stride]=
2025     src[0+2*stride]=(t1 + t2 + 1)>>1;
2026     src[2+0*stride]=
2027     src[1+2*stride]=(t2 + t3 + 1)>>1;
2028     src[3+0*stride]=
2029     src[2+2*stride]=(t3 + t4+ 1)>>1;
2030     src[3+2*stride]=(t4 + t5+ 1)>>1;
2031     src[0+1*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2032     src[1+1*stride]=
2033     src[0+3*stride]=(t1 + 2*t2 + t3 + 2)>>2;
2034     src[2+1*stride]=
2035     src[1+3*stride]=(t2 + 2*t3 + t4 + 2)>>2;
2036     src[3+1*stride]=
2037     src[2+3*stride]=(t3 + 2*t4 + t5 + 2)>>2;
2038     src[3+3*stride]=(t4 + 2*t5 + t6 + 2)>>2;
2039 }
2040
2041 static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
2042     LOAD_LEFT_EDGE
2043
2044     src[0+0*stride]=(l0 + l1 + 1)>>1;
2045     src[1+0*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2046     src[2+0*stride]=
2047     src[0+1*stride]=(l1 + l2 + 1)>>1;
2048     src[3+0*stride]=
2049     src[1+1*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2050     src[2+1*stride]=
2051     src[0+2*stride]=(l2 + l3 + 1)>>1;
2052     src[3+1*stride]=
2053     src[1+2*stride]=(l2 + 2*l3 + l3 + 2)>>2;
2054     src[3+2*stride]=
2055     src[1+3*stride]=
2056     src[0+3*stride]=
2057     src[2+2*stride]=
2058     src[2+3*stride]=
2059     src[3+3*stride]=l3;
2060 }
2061
2062 static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
2063     const int lt= src[-1-1*stride];
2064     LOAD_TOP_EDGE
2065     LOAD_LEFT_EDGE
2066     const __attribute__((unused)) int unu= t3;
2067
2068     src[0+0*stride]=
2069     src[2+1*stride]=(lt + l0 + 1)>>1;
2070     src[1+0*stride]=
2071     src[3+1*stride]=(l0 + 2*lt + t0 + 2)>>2;
2072     src[2+0*stride]=(lt + 2*t0 + t1 + 2)>>2;
2073     src[3+0*stride]=(t0 + 2*t1 + t2 + 2)>>2;
2074     src[0+1*stride]=
2075     src[2+2*stride]=(l0 + l1 + 1)>>1;
2076     src[1+1*stride]=
2077     src[3+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
2078     src[0+2*stride]=
2079     src[2+3*stride]=(l1 + l2+ 1)>>1;
2080     src[1+2*stride]=
2081     src[3+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
2082     src[0+3*stride]=(l2 + l3 + 1)>>1;
2083     src[1+3*stride]=(l1 + 2*l2 + l3 + 2)>>2;
2084 }
2085
/**
 * 16x16 vertical intra prediction: the 16 samples above the block are
 * copied into each of its 16 rows.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const top= (uint32_t*)(src-stride);
    const uint32_t w0= top[0];
    const uint32_t w1= top[1];
    const uint32_t w2= top[2];
    const uint32_t w3= top[3];
    int y;

    for(y=0; y<16; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        row[0]= w0;
        row[1]= w1;
        row[2]= w2;
        row[3]= w3;
    }
}
2100
/**
 * 16x16 horizontal intra prediction: each row is filled with the sample
 * directly left of it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        /* unsigned constant: sample*0x01010101 overflows a signed int for samples >= 128 */
        const uint32_t left= src[-1+i*stride]*0x01010101U;

        row[0]=
        row[1]=
        row[2]=
        row[3]= left;
    }
}
2111
/**
 * 16x16 dc intra prediction: the block is filled with the rounded average
 * of the 16 samples left of and the 16 samples above it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    /* replicated with unsigned arithmetic, the signed product overflows for averages >= 128 */
    dc= 0x01010101U*(uint32_t)((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2132
/**
 * 16x16 dc intra prediction from the left edge only: the block is filled
 * with the rounded average of the 16 samples left of it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    /* replicated with unsigned arithmetic, the signed product overflows for averages >= 128 */
    dc= 0x01010101U*(uint32_t)((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2149
/**
 * 16x16 dc intra prediction from the top edge only: the block is filled
 * with the rounded average of the 16 samples above it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }
    /* replicated with unsigned arithmetic, the signed product overflows for averages >= 128 */
    dc= 0x01010101U*(uint32_t)((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2165
/**
 * 16x16 dc intra prediction without any neighbours: the block is filled
 * with the value 128.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y, w;

    for(y=0; y<16; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        for(w=0; w<4; w++)
            row[w]= grey;
    }
}
2176
2177 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2178   int i, j, k;
2179   int a;
2180   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2181   const uint8_t * const src0 = src+7-stride;
2182   const uint8_t *src1 = src+8*stride-1;
2183   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2184   int H = src0[1] - src0[-1];
2185   int V = src1[0] - src2[ 0];
2186   for(k=2; k<=8; ++k) {
2187     src1 += stride; src2 -= stride;
2188     H += k*(src0[k] - src0[-k]);
2189     V += k*(src1[0] - src2[ 0]);
2190   }
2191   if(svq3){
2192     H = ( 5*(H/4) ) / 16;
2193     V = ( 5*(V/4) ) / 16;
2194
2195     /* required for 100% accuracy */
2196     i = H; H = V; V = i;
2197   }else{
2198     H = ( 5*H+32 ) >> 6;
2199     V = ( 5*V+32 ) >> 6;
2200   }
2201
2202   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2203   for(j=16; j>0; --j) {
2204     int b = a;
2205     a += V;
2206     for(i=-16; i<0; i+=4) {
2207       src[16+i] = cm[ (b    ) >> 5 ];
2208       src[17+i] = cm[ (b+  H) >> 5 ];
2209       src[18+i] = cm[ (b+2*H) >> 5 ];
2210       src[19+i] = cm[ (b+3*H) >> 5 ];
2211       b += 4*H;
2212     }
2213     src += stride;
2214   }
2215 }
2216
/**
 * 16x16 plane intra prediction; calls pred16x16_plane_compat_c with svq3
 * set to 0.
 */
static void pred16x16_plane_c(uint8_t *src, int stride){
    pred16x16_plane_compat_c(src, stride, 0);
}
2220
/**
 * 8x8 vertical intra prediction: the 8 samples above the block are copied
 * into each of its 8 rows.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const top= (uint32_t*)(src-stride);
    const uint32_t left_half = top[0];
    const uint32_t right_half= top[1];
    int y;

    for(y=0; y<8; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        row[0]= left_half;
        row[1]= right_half;
    }
}
2231
/**
 * 8x8 horizontal intra prediction: each row is filled with the sample
 * directly left of it.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        /* unsigned constant: sample*0x01010101 overflows a signed int for samples >= 128 */
        const uint32_t left= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= left;
    }
}
2240
/**
 * 8x8 dc intra prediction without any neighbours: the block is filled
 * with the value 128.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y;

    for(y=0; y<8; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        row[0]= grey;
        row[1]= grey;
    }
}
2249
/**
 * 8x8 dc intra prediction from the left edge only: the upper 4 rows are
 * filled with the rounded average of the upper 4 left samples, the lower 4
 * rows with the rounded average of the lower 4 left samples.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum2;
    uint32_t dc0, dc2;

    sum0=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride];
        sum2+= src[-1+(i+4)*stride];
    }
    /* replicated with unsigned arithmetic, the signed product overflows for averages >= 128 */
    dc0= 0x01010101U*(uint32_t)((sum0 + 2)>>2);
    dc2= 0x01010101U*(uint32_t)((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
2271
/**
 * 8x8 dc intra prediction from the top edge only: the left 4 columns are
 * filled with the rounded average of the left 4 top samples, the right 4
 * columns with the rounded average of the right 4 top samples.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1;
    uint32_t dc0, dc1;

    sum0=sum1=0;
    for(i=0;i<4; i++){
        sum0+= src[i-stride];
        sum1+= src[4+i-stride];
    }
    /* replicated with unsigned arithmetic, the signed product overflows for averages >= 128 */
    dc0= 0x01010101U*(uint32_t)((sum0 + 2)>>2);
    dc1= 0x01010101U*(uint32_t)((sum1 + 2)>>2);

    /* all 8 rows are identical */
    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2293
2294
/**
 * 8x8 dc intra prediction, done separately for each 4x4 quadrant:
 * top-left uses the 4 left and 4 top samples next to it, top-right the 4
 * top samples above it, bottom-left the 4 left samples next to it and
 * bottom-right the samples used by the top-right and bottom-left quadrants.
 * @param src top-left sample of the block, rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1, sum2;
    uint32_t dc0, dc1, dc2, dc3;

    sum0=sum1=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride] + src[i-stride];
        sum1+= src[4+i-stride];
        sum2+= src[-1+(i+4)*stride];
    }
    /* replicated with unsigned arithmetic, the signed product overflows for averages >= 128 */
    dc3= 0x01010101U*(uint32_t)((sum1 + sum2 + 4)>>3);
    dc0= 0x01010101U*(uint32_t)((sum0 + 4)>>3);
    dc1= 0x01010101U*(uint32_t)((sum1 + 2)>>2);
    dc2= 0x01010101U*(uint32_t)((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2319
2320 static void pred8x8_plane_c(uint8_t *src, int stride){
2321   int j, k;
2322   int a;
2323   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2324   const uint8_t * const src0 = src+3-stride;
2325   const uint8_t *src1 = src+4*stride-1;
2326   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2327   int H = src0[1] - src0[-1];
2328   int V = src1[0] - src2[ 0];
2329   for(k=2; k<=4; ++k) {
2330     src1 += stride; src2 -= stride;
2331     H += k*(src0[k] - src0[-k]);
2332     V += k*(src1[0] - src2[ 0]);
2333   }
2334   H = ( 17*H+16 ) >> 5;
2335   V = ( 17*V+16 ) >> 5;
2336
2337   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2338   for(j=8; j>0; --j) {
2339     int b = a;
2340     a += V;
2341     src[0] = cm[ (b    ) >> 5 ];
2342     src[1] = cm[ (b+  H) >> 5 ];
2343     src[2] = cm[ (b+2*H) >> 5 ];
2344     src[3] = cm[ (b+3*H) >> 5 ];
2345     src[4] = cm[ (b+4*H) >> 5 ];
2346     src[5] = cm[ (b+5*H) >> 5 ];
2347     src[6] = cm[ (b+6*H) >> 5 ];
2348     src[7] = cm[ (b+7*H) >> 5 ];
2349     src += stride;
2350   }
2351 }
2352
/*
 * Helper macros for the pred8x8l_* functions below. They expect the names
 * src, stride, has_topleft and has_topright to be in scope at the point of use.
 */

/* pixel at column x, row y relative to the top-left pixel of the block */
#define SRC(x,y) src[(x)+(y)*stride]
/* declares l<y>: left neighbour of row y, smoothed with a (1,2,1)/4 filter */
#define PL(y) \
    const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* declares l0..l7; l0 falls back to SRC(-1,0) when has_topleft is 0,
 * l7 has no neighbour below and weights SRC(-1,7) three times instead */
#define PREDICT_8x8_LOAD_LEFT \
    const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
                     + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
    PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
    const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2

/* declares t<x>: top neighbour of column x, smoothed with a (1,2,1)/4 filter */
#define PT(x) \
    const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* declares t0..t7; t0 falls back to SRC(0,-1) when has_topleft is 0,
 * t7 falls back to SRC(7,-1) when has_topright is 0 */
#define PREDICT_8x8_LOAD_TOP \
    const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
                     + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
    PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
    const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
                     + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2

/* like PT() but assigns to an already declared t<x> */
#define PTR(x) \
    t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* declares t8..t15: filtered top-right neighbours when has_topright is set,
 * otherwise all of them are the unfiltered pixel SRC(7,-1) */
#define PREDICT_8x8_LOAD_TOPRIGHT \
    int t8, t9, t10, t11, t12, t13, t14, t15; \
    if(has_topright) { \
        PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
        t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
    } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);

/* declares lt: top-left corner pixel filtered with its two neighbours */
#define PREDICT_8x8_LOAD_TOPLEFT \
    const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2

/* declares y and fills 8 rows of 8 bytes with the 32 bit pattern v;
 * src is advanced by 8 rows as a side effect */
#define PREDICT_8x8_DC(v) \
    int y; \
    for( y = 0; y < 8; y++ ) { \
        ((uint32_t*)src)[0] = \
        ((uint32_t*)src)[1] = v; \
        src += stride; \
    }
2390
/**
 * Fills the 8x8 block with the constant 0x80 in every byte.
 * No neighbouring pixels are read; has_topleft and has_topright are ignored.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int row;
    for( row = 0; row < 8; row++ ) {
        uint32_t *line = (uint32_t*)src;
        line[1] = 0x80808080;
        line[0] = 0x80808080;
        src += stride;
    }
}
2395 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2396 {
2397     PREDICT_8x8_LOAD_LEFT;
2398     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2399     PREDICT_8x8_DC(dc);
2400 }
2401 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2402 {
2403     PREDICT_8x8_LOAD_TOP;
2404     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2405     PREDICT_8x8_DC(dc);
2406 }
2407 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2408 {
2409     PREDICT_8x8_LOAD_LEFT;
2410     PREDICT_8x8_LOAD_TOP;
2411     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2412                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2413     PREDICT_8x8_DC(dc);
2414 }
2415 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2416 {
2417     PREDICT_8x8_LOAD_LEFT;
2418 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2419                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2420     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2421 #undef ROW
2422 }
/**
 * 8x8 vertical prediction: every column x is filled with its filtered top
 * neighbour t<x>. The first row is written byte by byte and then copied to
 * the remaining seven rows with 64 bit loads/stores.
 */
static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int y;
    PREDICT_8x8_LOAD_TOP;
    src[0] = t0;
    src[1] = t1;
    src[2] = t2;
    src[3] = t3;
    src[4] = t4;
    src[5] = t5;
    src[6] = t6;
    src[7] = t7;
    /* replicate row 0 into rows 1..7 */
    for( y = 1; y < 8; y++ )
        *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
}
/**
 * 8x8 diagonal down-left prediction.
 * Uses the filtered top (t0..t7) and top-right (t8..t15) neighbours. All
 * pixels with the same x+y share one value, a (1,2,1)/4 filter of three
 * consecutive t values; the last pixel SRC(7,7) weights t15 three times.
 */
static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
    SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
    SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
    SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
    SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
}
/**
 * 8x8 diagonal down-right prediction.
 * Uses the filtered top (t0..t7), left (l0..l7) and top-left (lt)
 * neighbours. All pixels with the same x-y share one value, a (1,2,1)/4
 * filter of three consecutive neighbours along the left/top-left/top edge.
 */
static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    /* diagonals below the main diagonal: left neighbours only */
    SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
    SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    /* diagonals around the corner: mix of left, top-left and top */
    SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
    /* diagonals above the main diagonal: top neighbours only */
    SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;

}
/**
 * 8x8 vertical-right prediction.
 * Uses the filtered top (t0..t7), left (l0..l6) and top-left (lt)
 * neighbours. Pixels one column right and two rows down of each other share
 * a value, which is either a rounded 2-tap average or a (1,2,1)/4 filter of
 * adjacent neighbours.
 */
static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
    SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
    SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
    SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
    SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
    SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
    SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
    SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
    SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
    SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
    SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
    SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
    SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
    SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
    SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(7,0)= (t6 + t7 + 1) >> 1;
}
/**
 * 8x8 horizontal-down prediction.
 * Uses the filtered left (l0..l7), top (t0..t6) and top-left (lt)
 * neighbours. Pixels two columns right and one row down of each other share
 * a value, which is either a rounded 2-tap average or a (1,2,1)/4 filter of
 * adjacent neighbours.
 */
static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_LEFT;
    PREDICT_8x8_LOAD_TOPLEFT;
    SRC(0,7)= (l6 + l7 + 1) >> 1;
    SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
    SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
    SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
    SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
    SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
    SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
    SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
    SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
    SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
    SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
    SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
    SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
    SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
    SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
    SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
}
/**
 * 8x8 vertical-left prediction.
 * Uses the filtered top (t0..t7) and top-right (t8..t12) neighbours. Pixels
 * one column right and two rows up of each other share a value, which is
 * either a rounded 2-tap average or a (1,2,1)/4 filter of adjacent
 * t values.
 */
static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_TOP;
    PREDICT_8x8_LOAD_TOPRIGHT;
    SRC(0,0)= (t0 + t1 + 1) >> 1;
    SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
    SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
    SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
    SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
    SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
    SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
    SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
    SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
    SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
    SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
    SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
    SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
    SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
    SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
    SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
    SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
    SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
    SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
    SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
    SRC(7,6)= (t10 + t11 + 1) >> 1;
    SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
}
/**
 * 8x8 horizontal-up prediction.
 * Uses only the filtered left neighbours l0..l7. Pixels two columns right
 * and one row up of each other share a value, which is either a rounded
 * 2-tap average or a (1,2,1)/4 filter of adjacent l values. Positions that
 * would need neighbours below l7 are filled with l7 itself.
 */
static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    PREDICT_8x8_LOAD_LEFT;
    SRC(0,0)= (l0 + l1 + 1) >> 1;
    SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
    SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
    SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
    SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
    SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
    SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
    SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
    SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
    SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
    SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
    SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
    SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
    SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
    /* everything past the last usable neighbour is a flat copy of l7 */
    SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
    SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
    SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
    SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
}
/* the 8x8 prediction helper macros are not needed past this point */
#undef PREDICT_8x8_LOAD_LEFT
#undef PREDICT_8x8_LOAD_TOP
#undef PREDICT_8x8_LOAD_TOPLEFT
#undef PREDICT_8x8_LOAD_TOPRIGHT
#undef PREDICT_8x8_DC
#undef PTR
#undef PT
#undef PL
#undef SRC
2594
2595 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2596                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2597                            int src_x_offset, int src_y_offset,
2598                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2599     MpegEncContext * const s = &h->s;
2600     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2601     const int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2602     const int luma_xy= (mx&3) + ((my&3)<<2);
2603     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*s->linesize;
2604     uint8_t * src_cb= pic->data[1] + (mx>>3) + (my>>3)*s->uvlinesize;
2605     uint8_t * src_cr= pic->data[2] + (mx>>3) + (my>>3)*s->uvlinesize;
2606     int extra_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; //FIXME increase edge?, IMHO not worth it
2607     int extra_height= extra_width;
2608     int emu=0;
2609     const int full_mx= mx>>2;
2610     const int full_my= my>>2;
2611     const int pic_width  = 16*s->mb_width;
2612     const int pic_height = 16*s->mb_height;
2613
2614     if(!pic->data[0])
2615         return;
2616
2617     if(mx&7) extra_width -= 3;
2618     if(my&7) extra_height -= 3;
2619
2620     if(   full_mx < 0-extra_width
2621        || full_my < 0-extra_height
2622        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2623        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2624         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*s->linesize, s->linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2625             src_y= s->edge_emu_buffer + 2 + 2*s->linesize;
2626         emu=1;
2627     }
2628
2629     qpix_op[luma_xy](dest_y, src_y, s->linesize); //FIXME try variable height perhaps?
2630     if(!square){
2631         qpix_op[luma_xy](dest_y + delta, src_y + delta, s->linesize);
2632     }
2633
2634     if(s->flags&CODEC_FLAG_GRAY) return;
2635
2636     if(emu){
2637         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2638             src_cb= s->edge_emu_buffer;
2639     }
2640     chroma_op(dest_cb, src_cb, s->uvlinesize, chroma_height, mx&7, my&7);
2641
2642     if(emu){
2643         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2644             src_cr= s->edge_emu_buffer;
2645     }
2646     chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
2647 }
2648
2649 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2650                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2651                            int x_offset, int y_offset,
2652                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2653                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2654                            int list0, int list1){
2655     MpegEncContext * const s = &h->s;
2656     qpel_mc_func *qpix_op=  qpix_put;
2657     h264_chroma_mc_func chroma_op= chroma_put;
2658
2659     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2660     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2661     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2662     x_offset += 8*s->mb_x;
2663     y_offset += 8*s->mb_y;
2664
2665     if(list0){
2666         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2667         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2668                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2669                            qpix_op, chroma_op);
2670
2671         qpix_op=  qpix_avg;
2672         chroma_op= chroma_avg;
2673     }
2674
2675     if(list1){
2676         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2677         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2678                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2679                            qpix_op, chroma_op);
2680     }
2681 }
2682
2683 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2684                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2685                            int x_offset, int y_offset,
2686                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2687                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2688                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2689                            int list0, int list1){
2690     MpegEncContext * const s = &h->s;
2691
2692     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2693     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2694     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2695     x_offset += 8*s->mb_x;
2696     y_offset += 8*s->mb_y;
2697
2698     if(list0 && list1){
2699         /* don't optimize for luma-only case, since B-frames usually
2700          * use implicit weights => chroma too. */
2701         uint8_t *tmp_cb = s->obmc_scratchpad;
2702         uint8_t *tmp_cr = tmp_cb + 8*s->uvlinesize;
2703         uint8_t *tmp_y  = tmp_cr + 8*s->uvlinesize;
2704         int refn0 = h->ref_cache[0][ scan8[n] ];
2705         int refn1 = h->ref_cache[1][ scan8[n] ];
2706
2707         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2708                     dest_y, dest_cb, dest_cr,
2709                     x_offset, y_offset, qpix_put, chroma_put);
2710         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2711                     tmp_y, tmp_cb, tmp_cr,
2712                     x_offset, y_offset, qpix_put, chroma_put);
2713
2714         if(h->use_weight == 2){
2715             int weight0 = h->implicit_weight[refn0][refn1];
2716             int weight1 = 64 - weight0;
2717             luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0);
2718             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0);
2719             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0);
2720         }else{
2721             luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
2722                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2723                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2724             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2725                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2726                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2727             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2728                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2729                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2730         }
2731     }else{
2732         int list = list1 ? 1 : 0;
2733         int refn = h->ref_cache[list][ scan8[n] ];
2734         Picture *ref= &h->ref_list[list][refn];
2735         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2736                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2737                     qpix_put, chroma_put);
2738
2739         luma_weight_op(dest_y, s->linesize, h->luma_log2_weight_denom,
2740                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2741         if(h->use_weight_chroma){
2742             chroma_weight_op(dest_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2743                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2744             chroma_weight_op(dest_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2745                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2746         }
2747     }
2748 }
2749
2750 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2751                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2752                            int x_offset, int y_offset,
2753                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2754                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2755                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2756                            int list0, int list1){
2757     if((h->use_weight==2 && list0 && list1
2758         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2759        || h->use_weight==1)
2760         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2761                          x_offset, y_offset, qpix_put, chroma_put,
2762                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2763     else
2764         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2765                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2766 }
2767
/**
 * Motion compensates the current inter macroblock.
 * The macroblock type (and, for 8x8 partitioning, each sub macroblock type)
 * decides how the macroblock is split; every partition is handed to
 * mc_part() together with the function table entries matching its size.
 *
 * The mc_part() arguments after h are, in order: the scan8 block index of
 * the partition, the "square" flag, the chroma height, the byte offset of
 * the second luma call for non-square partitions, the three destinations
 * and the x/y position inside the macroblock (mc_part_std/mc_part_weighted
 * scale that position by 2 for luma and use it unscaled for chroma).
 *
 * @param dest_y     luma destination of the macroblock
 * @param dest_cb    Cb destination of the macroblock
 * @param dest_cr    Cr destination of the macroblock
 * @param qpix_put   luma "put" function tables, first index selects the size
 * @param chroma_put chroma "put" functions, index selects the size
 * @param qpix_avg   luma "avg" function tables
 * @param chroma_avg chroma "avg" functions
 * @param weight_op  weight functions, indexed by partition shape
 * @param weight_avg bi-weight functions, indexed by partition shape
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    if(IS_16X16(mb_type)){
        /* one partition covering the whole macroblock */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two partitions stacked vertically; the second luma call of each
         * one is 8 bytes to the right */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two partitions side by side; the second luma call of each one is
         * 8 rows further down */
        mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 quadrants, each split according to its own sub type */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four 4x4 blocks inside the quadrant */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }
}
2850
2851 static void decode_init_vlc(H264Context *h){
2852     static int done = 0;
2853
2854     if (!done) {
2855         int i;
2856         done = 1;
2857
2858         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2859                  &chroma_dc_coeff_token_len [0], 1, 1,
2860                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2861
2862         for(i=0; i<4; i++){
2863             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2864                      &coeff_token_len [i][0], 1, 1,
2865                      &coeff_token_bits[i][0], 1, 1, 1);
2866         }
2867
2868         for(i=0; i<3; i++){
2869             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2870                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2871                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2872         }
2873         for(i=0; i<15; i++){
2874             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2875                      &total_zeros_len [i][0], 1, 1,
2876                      &total_zeros_bits[i][0], 1, 1, 1);
2877         }
2878
2879         for(i=0; i<6; i++){
2880             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2881                      &run_len [i][0], 1, 1,
2882                      &run_bits[i][0], 1, 1, 1);
2883         }
2884         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2885                  &run_len [6][0], 1, 1,
2886                  &run_bits[6][0], 1, 1, 1);
2887     }
2888 }
2889
2890 /**
2891  * Sets the intra prediction function pointers.
2892  */
2893 static void init_pred_ptrs(H264Context *h){
2894 //    MpegEncContext * const s = &h->s;
2895
2896     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2897     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2898     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2899     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2900     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2901     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2902     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2903     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2904     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2905     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2906     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2907     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2908
2909     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
2910     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
2911     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
2912     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
2913     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
2914     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
2915     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
2916     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
2917     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
2918     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
2919     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
2920     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
2921
2922     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
2923     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
2924     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
2925     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
2926     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
2927     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
2928     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
2929
2930     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
2931     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
2932     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
2933     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
2934     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
2935     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
2936     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
2937 }
2938
2939 static void free_tables(H264Context *h){
2940     av_freep(&h->intra4x4_pred_mode);
2941     av_freep(&h->chroma_pred_mode_table);
2942     av_freep(&h->cbp_table);
2943     av_freep(&h->mvd_table[0]);
2944     av_freep(&h->mvd_table[1]);
2945     av_freep(&h->direct_table);
2946     av_freep(&h->non_zero_count);
2947     av_freep(&h->slice_table_base);
2948     av_freep(&h->top_borders[1]);
2949     av_freep(&h->top_borders[0]);
2950     h->slice_table= NULL;
2951
2952     av_freep(&h->mb2b_xy);
2953     av_freep(&h->mb2b8_xy);
2954
2955     av_freep(&h->s.obmc_scratchpad);
2956 }
2957
2958 static void init_dequant8_coeff_table(H264Context *h){
2959     int i,q,x;
2960     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2961     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2962     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2963
2964     for(i=0; i<2; i++ ){
2965         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2966             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2967             break;
2968         }
2969
2970         for(q=0; q<52; q++){
2971             int shift = div6[q];
2972             int idx = rem6[q];
2973             for(x=0; x<64; x++)
2974                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2975                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2976                     h->pps.scaling_matrix8[i][x]) << shift;
2977         }
2978     }
2979 }
2980
2981 static void init_dequant4_coeff_table(H264Context *h){
2982     int i,j,q,x;
2983     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2984     for(i=0; i<6; i++ ){
2985         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2986         for(j=0; j<i; j++){
2987             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2988                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2989                 break;
2990             }
2991         }
2992         if(j<i)
2993             continue;
2994
2995         for(q=0; q<52; q++){
2996             int shift = div6[q] + 2;
2997             int idx = rem6[q];
2998             for(x=0; x<16; x++)
2999                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3000                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3001                     h->pps.scaling_matrix4[i][x]) << shift;
3002         }
3003     }
3004 }
3005
3006 static void init_dequant_tables(H264Context *h){
3007     int i,x;
3008     init_dequant4_coeff_table(h);
3009     if(h->pps.transform_8x8_mode)
3010         init_dequant8_coeff_table(h);
3011     if(h->sps.transform_bypass){
3012         for(i=0; i<6; i++)
3013             for(x=0; x<16; x++)
3014                 h->dequant4_coeff[i][0][x] = 1<<6;
3015         if(h->pps.transform_8x8_mode)
3016             for(i=0; i<2; i++)
3017                 for(x=0; x<64; x++)
3018                     h->dequant8_coeff[i][0][x] = 1<<6;
3019     }
3020 }
3021
3022
3023 /**
3024  * allocates tables.
3025  * needs width/height
3026  */
3027 static int alloc_tables(H264Context *h){
3028     MpegEncContext * const s = &h->s;
3029     const int big_mb_num= s->mb_stride * (s->mb_height+1);
3030     int x,y;
3031
3032     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3033
3034     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3035     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
3036     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3037     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3038     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3039
3040     if( h->pps.cabac ) {
3041         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3042         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3043         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3044         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3045     }
3046
3047     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
3048     h->slice_table= h->slice_table_base + s->mb_stride + 1;
3049
3050     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3051     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3052     for(y=0; y<s->mb_height; y++){
3053         for(x=0; x<s->mb_width; x++){
3054             const int mb_xy= x + y*s->mb_stride;
3055             const int b_xy = 4*x + 4*y*h->b_stride;
3056             const int b8_xy= 2*x + 2*y*h->b8_stride;
3057
3058             h->mb2b_xy [mb_xy]= b_xy;
3059             h->mb2b8_xy[mb_xy]= b8_xy;
3060         }
3061     }
3062
3063     s->obmc_scratchpad = NULL;
3064
3065     if(!h->dequant4_coeff[0])
3066         init_dequant_tables(h);
3067
3068     return 0;
3069 fail:
3070     free_tables(h);
3071     return -1;
3072 }
3073
3074 static void common_init(H264Context *h){
3075     MpegEncContext * const s = &h->s;
3076
3077     s->width = s->avctx->width;
3078     s->height = s->avctx->height;
3079     s->codec_id= s->avctx->codec->id;
3080
3081     init_pred_ptrs(h);
3082
3083     h->dequant_coeff_pps= -1;
3084     s->unrestricted_mv=1;
3085     s->decode=1; //FIXME
3086
3087     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3088     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
3089 }
3090
3091 static int decode_init(AVCodecContext *avctx){
3092     H264Context *h= avctx->priv_data;
3093     MpegEncContext * const s = &h->s;
3094
3095     MPV_decode_defaults(s);
3096
3097     s->avctx = avctx;
3098     common_init(h);
3099
3100     s->out_format = FMT_H264;
3101     s->workaround_bugs= avctx->workaround_bugs;
3102
3103     // set defaults
3104 //    s->decode_mb= ff_h263_decode_mb;
3105     s->low_delay= 1;
3106     avctx->pix_fmt= PIX_FMT_YUV420P;
3107
3108     decode_init_vlc(h);
3109
3110     if(avctx->extradata_size > 0 && avctx->extradata &&
3111        *(char *)avctx->extradata == 1){
3112         h->is_avc = 1;
3113         h->got_avcC = 0;
3114     } else {
3115         h->is_avc = 0;
3116     }
3117
3118     return 0;
3119 }
3120
3121 static int frame_start(H264Context *h){
3122     MpegEncContext * const s = &h->s;
3123     int i;
3124
3125     if(MPV_frame_start(s, s->avctx) < 0)
3126         return -1;
3127     ff_er_frame_start(s);
3128
3129     assert(s->linesize && s->uvlinesize);
3130
3131     for(i=0; i<16; i++){
3132         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3133         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3134     }
3135     for(i=0; i<4; i++){
3136         h->block_offset[16+i]=
3137         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3138         h->block_offset[24+16+i]=
3139         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3140     }
3141
3142     /* can't be in alloc_tables because linesize isn't known there.
3143      * FIXME: redo bipred weight to not require extra buffer? */
3144     if(!s->obmc_scratchpad)
3145         s->obmc_scratchpad = av_malloc(16*s->linesize + 2*8*s->uvlinesize);
3146
3147 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3148     return 0;
3149 }
3150
3151 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3152     MpegEncContext * const s = &h->s;
3153     int i;
3154
3155     src_y  -=   linesize;
3156     src_cb -= uvlinesize;
3157     src_cr -= uvlinesize;
3158
3159     // There are two lines saved, the line above the the top macroblock of a pair,
3160     // and the line above the bottom macroblock
3161     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3162     for(i=1; i<17; i++){
3163         h->left_border[i]= src_y[15+i*  linesize];
3164     }
3165
3166     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3167     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3168
3169     if(!(s->flags&CODEC_FLAG_GRAY)){
3170         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3171         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3172         for(i=1; i<9; i++){
3173             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3174             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3175         }
3176         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3177         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3178     }
3179 }
3180
3181 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3182     MpegEncContext * const s = &h->s;
3183     int temp8, i;
3184     uint64_t temp64;
3185     int deblock_left = (s->mb_x > 0);
3186     int deblock_top  = (s->mb_y > 0);
3187
3188     src_y  -=   linesize + 1;
3189     src_cb -= uvlinesize + 1;
3190     src_cr -= uvlinesize + 1;
3191
3192 #define XCHG(a,b,t,xchg)\
3193 t= a;\
3194 if(xchg)\
3195     a= b;\
3196 b= t;
3197
3198     if(deblock_left){
3199         for(i = !deblock_top; i<17; i++){
3200             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3201         }
3202     }
3203
3204     if(deblock_top){
3205         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3206         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3207         if(s->mb_x+1 < s->mb_width){
3208             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3209         }
3210     }
3211
3212     if(!(s->flags&CODEC_FLAG_GRAY)){
3213         if(deblock_left){
3214             for(i = !deblock_top; i<9; i++){
3215                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3216                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3217             }
3218         }
3219         if(deblock_top){
3220             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3221             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3222         }
3223     }
3224 }
3225
3226 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3227     MpegEncContext * const s = &h->s;
3228     int i;
3229
3230     src_y  -= 2 *   linesize;
3231     src_cb -= 2 * uvlinesize;
3232     src_cr -= 2 * uvlinesize;
3233
3234     // There are two lines saved, the line above the the top macroblock of a pair,
3235     // and the line above the bottom macroblock
3236     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3237     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3238     for(i=2; i<34; i++){
3239         h->left_border[i]= src_y[15+i*  linesize];
3240     }
3241
3242     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3243     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3244     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3245     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3246
3247     if(!(s->flags&CODEC_FLAG_GRAY)){
3248         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3249         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3250         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3251         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3252         for(i=2; i<18; i++){
3253             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3254             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3255         }
3256         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3257         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3258         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3259         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3260     }
3261 }
3262
3263 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3264     MpegEncContext * const s = &h->s;
3265     int temp8, i;
3266     uint64_t temp64;
3267     int deblock_left = (s->mb_x > 0);
3268     int deblock_top  = (s->mb_y > 0);
3269
3270     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3271
3272     src_y  -= 2 *   linesize + 1;
3273     src_cb -= 2 * uvlinesize + 1;
3274     src_cr -= 2 * uvlinesize + 1;
3275
3276 #define XCHG(a,b,t,xchg)\
3277 t= a;\
3278 if(xchg)\
3279     a= b;\
3280 b= t;
3281
3282     if(deblock_left){
3283         for(i = (!deblock_top)<<1; i<34; i++){
3284             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3285         }
3286     }
3287
3288     if(deblock_top){
3289         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3290         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3291         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3292         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3293     }
3294
3295     if(!(s->flags&CODEC_FLAG_GRAY)){
3296         if(deblock_left){
3297             for(i = (!deblock_top) << 1; i<18; i++){
3298                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3299                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3300             }
3301         }
3302         if(deblock_top){
3303             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3304             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3305             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3306             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3307         }
3308     }
3309 }
3310
3311 static void hl_decode_mb(H264Context *h){
3312     MpegEncContext * const s = &h->s;
3313     const int mb_x= s->mb_x;
3314     const int mb_y= s->mb_y;
3315     const int mb_xy= mb_x + mb_y*s->mb_stride;
3316     const int mb_type= s->current_picture.mb_type[mb_xy];
3317     uint8_t  *dest_y, *dest_cb, *dest_cr;
3318     int linesize, uvlinesize /*dct_offset*/;
3319     int i;
3320     int *block_offset = &h->block_offset[0];
3321     const unsigned int bottom = mb_y & 1;
3322     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3323     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3324     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3325
3326     if(!s->decode)
3327         return;
3328
3329     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3330     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3331     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3332
3333     if (h->mb_field_decoding_flag) {
3334         linesize = s->linesize * 2;
3335         uvlinesize = s->uvlinesize * 2;
3336         block_offset = &h->block_offset[24];
3337         if(mb_y&1){ //FIXME move out of this func?
3338             dest_y -= s->linesize*15;
3339             dest_cb-= s->uvlinesize*7;
3340             dest_cr-= s->uvlinesize*7;
3341         }
3342     } else {
3343         linesize = s->linesize;
3344         uvlinesize = s->uvlinesize;
3345 //        dct_offset = s->linesize * 16;
3346     }
3347
3348     if(transform_bypass){
3349         idct_dc_add =
3350         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3351     }else if(IS_8x8DCT(mb_type)){
3352         idct_dc_add = s->dsp.h264_idct8_dc_add;
3353         idct_add = s->dsp.h264_idct8_add;
3354     }else{
3355         idct_dc_add = s->dsp.h264_idct_dc_add;
3356         idct_add = s->dsp.h264_idct_add;
3357     }
3358
3359     if (IS_INTRA_PCM(mb_type)) {
3360         unsigned int x, y;
3361
3362         // The pixels are stored in h->mb array in the same order as levels,
3363         // copy them in output in the correct order.
3364         for(i=0; i<16; i++) {
3365             for (y=0; y<4; y++) {
3366                 for (x=0; x<4; x++) {
3367                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3368                 }
3369             }
3370         }
3371         for(i=16; i<16+4; i++) {
3372             for (y=0; y<4; y++) {
3373                 for (x=0; x<4; x++) {
3374                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3375                 }
3376             }
3377         }
3378         for(i=20; i<20+4; i++) {
3379             for (y=0; y<4; y++) {
3380                 for (x=0; x<4; x++) {
3381                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3382                 }
3383             }
3384         }
3385     } else {
3386         if(IS_INTRA(mb_type)){
3387             if(h->deblocking_filter) {
3388                 if (h->mb_aff_frame) {
3389                     if (!bottom)
3390                         xchg_pair_border(h, dest_y, dest_cb, dest_cr, s->linesize, s->uvlinesize, 1);
3391                 } else {
3392                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3393                 }
3394             }
3395
3396             if(!(s->flags&CODEC_FLAG_GRAY)){
3397                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3398                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3399             }
3400
3401             if(IS_INTRA4x4(mb_type)){
3402                 if(!s->encoding){
3403                     if(IS_8x8DCT(mb_type)){
3404                         for(i=0; i<16; i+=4){
3405                             uint8_t * const ptr= dest_y + block_offset[i];
3406                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3407                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3408                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3409                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
3410                             if(nnz){
3411                                 if(nnz == 1 && h->mb[i*16])
3412                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3413                                 else
3414                                     idct_add(ptr, h->mb + i*16, linesize);
3415                             }
3416                         }
3417                     }else
3418                     for(i=0; i<16; i++){
3419                         uint8_t * const ptr= dest_y + block_offset[i];
3420                         uint8_t *topright;
3421                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3422                         int nnz, tr;
3423
3424                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3425                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3426                             assert(mb_y || linesize <= block_offset[i]);
3427                             if(!topright_avail){
3428                                 tr= ptr[3 - linesize]*0x01010101;
3429                                 topright= (uint8_t*) &tr;
3430                             }else
3431                                 topright= ptr + 4 - linesize;
3432                         }else
3433                             topright= NULL;
3434
3435                         h->pred4x4[ dir ](ptr, topright, linesize);
3436                         nnz = h->non_zero_count_cache[ scan8[i] ];
3437                         if(nnz){
3438                             if(s->codec_id == CODEC_ID_H264){
3439                                 if(nnz == 1 && h->mb[i*16])
3440                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3441                                 else
3442                                     idct_add(ptr, h->mb + i*16, linesize);
3443                             }else
3444                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3445                         }
3446                     }
3447                 }
3448             }else{
3449                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3450                 if(s->codec_id == CODEC_ID_H264){
3451                     if(!transform_bypass)
3452                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3453                 }else
3454                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3455             }
3456             if(h->deblocking_filter) {
3457                 if (h->mb_aff_frame) {
3458                     if (bottom) {
3459                         uint8_t *pair_dest_y  = s->current_picture.data[0] + ((mb_y-1) * 16* s->linesize  ) + mb_x * 16;
3460                         uint8_t *pair_dest_cb = s->current_picture.data[1] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3461                         uint8_t *pair_dest_cr = s->current_picture.data[2] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3462                         s->mb_y--;
3463                         xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3464                         s->mb_y++;
3465                     }
3466                 } else {
3467                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3468                 }
3469             }
3470         }else if(s->codec_id == CODEC_ID_H264){
3471             hl_motion(h, dest_y, dest_cb, dest_cr,
3472                       s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
3473                       s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
3474                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3475         }
3476
3477
3478         if(!IS_INTRA4x4(mb_type)){
3479             if(s->codec_id == CODEC_ID_H264){
3480                 if(IS_INTRA16x16(mb_type)){
3481                     for(i=0; i<16; i++){
3482                         if(h->non_zero_count_cache[ scan8[i] ])
3483                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3484                         else if(h->mb[i*16])
3485                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3486                     }
3487                 }else{
3488                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3489                     for(i=0; i<16; i+=di){
3490                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3491                         if(nnz){
3492                             if(nnz==1 && h->mb[i*16])
3493                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3494                             else
3495                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3496                         }
3497                     }
3498                 }
3499             }else{
3500                 for(i=0; i<16; i++){
3501                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3502                         uint8_t * const ptr= dest_y + block_offset[i];
3503                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3504                     }
3505                 }
3506             }
3507         }
3508
3509         if(!(s->flags&CODEC_FLAG_GRAY)){
3510             uint8_t *dest[2] = {dest_cb, dest_cr};
3511             if(transform_bypass){
3512                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3513             }else{
3514                 idct_add = s->dsp.h264_idct_add;
3515                 idct_dc_add = s->dsp.h264_idct_dc_add;
3516                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3517                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3518             }
3519             if(s->codec_id == CODEC_ID_H264){
3520                 for(i=16; i<16+8; i++){
3521                     if(h->non_zero_count_cache[ scan8[i] ])
3522                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3523                     else if(h->mb[i*16])
3524                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3525                 }
3526             }else{
3527                 for(i=16; i<16+8; i++){
3528                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3529                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3530                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3531                     }
3532                 }
3533             }
3534         }
3535     }
3536     if(h->deblocking_filter) {
3537         if (h->mb_aff_frame) {
3538             const int mb_y = s->mb_y - 1;
3539             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3540             const int mb_xy= mb_x + mb_y*s->mb_stride;
3541             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3542             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3543             uint8_t tmp = s->current_picture.data[1][384];
3544             if (!bottom) return;
3545             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3546             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3547             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3548
3549             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3550             // TODO deblock a pair
3551             // top
3552             s->mb_y--;
3553             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3554             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3555             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3556             if (tmp != s->current_picture.data[1][384]) {
3557                 tprintf("modified pixel 8,1 (1)\n");
3558             }
3559             // bottom
3560             s->mb_y++;
3561             tprintf("call mbaff filter_mb\n");
3562             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3563             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3564             if (tmp != s->current_picture.data[1][384]) {
3565                 tprintf("modified pixel 8,1 (2)\n");
3566             }
3567         } else {
3568             tprintf("call filter_mb\n");
3569             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3570             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3571             filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3572         }
3573     }
3574 }
3575
3576 /**
3577  * fills the default_ref_list.
3578  */
3579 static int fill_default_ref_list(H264Context *h){
3580     MpegEncContext * const s = &h->s;
3581     int i;
3582     int smallest_poc_greater_than_current = -1;
3583     Picture sorted_short_ref[32];
3584
3585     if(h->slice_type==B_TYPE){
3586         int out_i;
3587         int limit= INT_MIN;
3588
3589         /* sort frame according to poc in B slice */
3590         for(out_i=0; out_i<h->short_ref_count; out_i++){
3591             int best_i=INT_MIN;
3592             int best_poc=INT_MAX;
3593
3594             for(i=0; i<h->short_ref_count; i++){
3595                 const int poc= h->short_ref[i]->poc;
3596                 if(poc > limit && poc < best_poc){
3597                     best_poc= poc;
3598                     best_i= i;
3599                 }
3600             }
3601
3602             assert(best_i != INT_MIN);
3603
3604             limit= best_poc;
3605             sorted_short_ref[out_i]= *h->short_ref[best_i];
3606             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3607             if (-1 == smallest_poc_greater_than_current) {
3608                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3609                     smallest_poc_greater_than_current = out_i;
3610                 }
3611             }
3612         }
3613     }
3614
3615     if(s->picture_structure == PICT_FRAME){
3616         if(h->slice_type==B_TYPE){
3617             int list;
3618             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3619
3620             // find the largest poc
3621             for(list=0; list<2; list++){
3622                 int index = 0;
3623                 int j= -99;
3624                 int step= list ? -1 : 1;
3625
3626                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3627                     while(j<0 || j>= h->short_ref_count){
3628                         if(j != -99 && step == (list ? -1 : 1))
3629                             return -1;
3630                         step = -step;
3631                         j= smallest_poc_greater_than_current + (step>>1);
3632                     }
3633                     if(sorted_short_ref[j].reference != 3) continue;
3634                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3635                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3636                 }
3637
3638                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3639                     if(h->long_ref[i] == NULL) continue;
3640                     if(h->long_ref[i]->reference != 3) continue;
3641
3642                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3643                     h->default_ref_list[ list ][index++].pic_id= i;;
3644                 }
3645
3646                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3647                     // swap the two first elements of L1 when
3648                     // L0 and L1 are identical
3649                     Picture temp= h->default_ref_list[1][0];
3650                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3651                     h->default_ref_list[1][1] = temp;
3652                 }
3653
3654                 if(index < h->ref_count[ list ])
3655                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3656             }
3657         }else{
3658             int index=0;
3659             for(i=0; i<h->short_ref_count; i++){
3660                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3661                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3662                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3663             }
3664             for(i = 0; i < 16; i++){
3665                 if(h->long_ref[i] == NULL) continue;
3666                 if(h->long_ref[i]->reference != 3) continue;
3667                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3668                 h->default_ref_list[0][index++].pic_id= i;;
3669             }
3670             if(index < h->ref_count[0])
3671                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3672         }
3673     }else{ //FIELD
3674         if(h->slice_type==B_TYPE){
3675         }else{
3676             //FIXME second field balh
3677         }
3678     }
3679 #ifdef TRACE
3680     for (i=0; i<h->ref_count[0]; i++) {
3681         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3682     }
3683     if(h->slice_type==B_TYPE){
3684         for (i=0; i<h->ref_count[1]; i++) {
3685             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3686         }
3687     }
3688 #endif
3689     return 0;
3690 }
3691
3692 static void print_short_term(H264Context *h);
3693 static void print_long_term(H264Context *h);
3694
3695 static int decode_ref_pic_list_reordering(H264Context *h){
3696     MpegEncContext * const s = &h->s;
3697     int list, index;
3698
3699     print_short_term(h);
3700     print_long_term(h);
3701     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3702
3703     for(list=0; list<2; list++){
3704         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3705
3706         if(get_bits1(&s->gb)){
3707             int pred= h->curr_pic_num;
3708
3709             for(index=0; ; index++){
3710                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3711                 int pic_id;
3712                 int i;
3713                 Picture *ref = NULL;
3714
3715                 if(reordering_of_pic_nums_idc==3)
3716                     break;
3717
3718                 if(index >= h->ref_count[list]){
3719                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3720                     return -1;
3721                 }
3722
3723                 if(reordering_of_pic_nums_idc<3){
3724                     if(reordering_of_pic_nums_idc<2){
3725                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3726
3727                         if(abs_diff_pic_num >= h->max_pic_num){
3728                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3729                             return -1;
3730                         }
3731
3732                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3733                         else                                pred+= abs_diff_pic_num;
3734                         pred &= h->max_pic_num - 1;
3735
3736                         for(i= h->short_ref_count-1; i>=0; i--){
3737                             ref = h->short_ref[i];
3738                             assert(ref->reference == 3);
3739                             assert(!ref->long_ref);
3740                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3741                                 break;
3742                         }
3743                         if(i>=0)
3744                             ref->pic_id= ref->frame_num;
3745                     }else{
3746                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3747                         ref = h->long_ref[pic_id];
3748                         ref->pic_id= pic_id;
3749                         assert(ref->reference == 3);
3750                         assert(ref->long_ref);
3751                         i=0;
3752                     }
3753
3754                     if (i < 0) {
3755                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3756                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3757                     } else {
3758                         for(i=index; i+1<h->ref_count[list]; i++){
3759                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3760                                 break;
3761                         }
3762                         for(; i > index; i--){
3763                             h->ref_list[list][i]= h->ref_list[list][i-1];
3764                         }
3765                         h->ref_list[list][index]= *ref;
3766                     }
3767                 }else{
3768                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3769                     return -1;
3770                 }
3771             }
3772         }
3773
3774         if(h->slice_type!=B_TYPE) break;
3775     }
3776     for(list=0; list<2; list++){
3777         for(index= 0; index < h->ref_count[list]; index++){
3778             if(!h->ref_list[list][index].data[0])
3779                 h->ref_list[list][index]= s->current_picture;
3780         }
3781         if(h->slice_type!=B_TYPE) break;
3782     }
3783
3784     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3785         direct_dist_scale_factor(h);
3786     direct_ref_list_init(h);
3787     return 0;
3788 }
3789
3790 static int pred_weight_table(H264Context *h){
3791     MpegEncContext * const s = &h->s;
3792     int list, i;
3793     int luma_def, chroma_def;
3794
3795     h->use_weight= 0;
3796     h->use_weight_chroma= 0;
3797     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3798     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3799     luma_def = 1<<h->luma_log2_weight_denom;
3800     chroma_def = 1<<h->chroma_log2_weight_denom;
3801
3802     for(list=0; list<2; list++){
3803         for(i=0; i<h->ref_count[list]; i++){
3804             int luma_weight_flag, chroma_weight_flag;
3805
3806             luma_weight_flag= get_bits1(&s->gb);
3807             if(luma_weight_flag){
3808                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3809                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3810                 if(   h->luma_weight[list][i] != luma_def
3811                    || h->luma_offset[list][i] != 0)
3812                     h->use_weight= 1;
3813             }else{
3814                 h->luma_weight[list][i]= luma_def;
3815                 h->luma_offset[list][i]= 0;
3816             }
3817
3818             chroma_weight_flag= get_bits1(&s->gb);
3819             if(chroma_weight_flag){
3820                 int j;
3821                 for(j=0; j<2; j++){
3822                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3823                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3824                     if(   h->chroma_weight[list][i][j] != chroma_def
3825                        || h->chroma_offset[list][i][j] != 0)
3826                         h->use_weight_chroma= 1;
3827                 }
3828             }else{
3829                 int j;
3830                 for(j=0; j<2; j++){
3831                     h->chroma_weight[list][i][j]= chroma_def;
3832                     h->chroma_offset[list][i][j]= 0;
3833                 }
3834             }
3835         }
3836         if(h->slice_type != B_TYPE) break;
3837     }
3838     h->use_weight= h->use_weight || h->use_weight_chroma;
3839     return 0;
3840 }
3841
3842 static void implicit_weight_table(H264Context *h){
3843     MpegEncContext * const s = &h->s;
3844     int ref0, ref1;
3845     int cur_poc = s->current_picture_ptr->poc;
3846
3847     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3848        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3849         h->use_weight= 0;
3850         h->use_weight_chroma= 0;
3851         return;
3852     }
3853
3854     h->use_weight= 2;
3855     h->use_weight_chroma= 2;
3856     h->luma_log2_weight_denom= 5;
3857     h->chroma_log2_weight_denom= 5;
3858
3859     /* FIXME: MBAFF */
3860     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3861         int poc0 = h->ref_list[0][ref0].poc;
3862         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3863             int poc1 = h->ref_list[1][ref1].poc;
3864             int td = clip(poc1 - poc0, -128, 127);
3865             if(td){
3866                 int tb = clip(cur_poc - poc0, -128, 127);
3867                 int tx = (16384 + (ABS(td) >> 1)) / td;
3868                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3869                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3870                     h->implicit_weight[ref0][ref1] = 32;
3871                 else
3872                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3873             }else
3874                 h->implicit_weight[ref0][ref1] = 32;
3875         }
3876     }
3877 }
3878
3879 static inline void unreference_pic(H264Context *h, Picture *pic){
3880     int i;
3881     pic->reference=0;
3882     if(pic == h->delayed_output_pic)
3883         pic->reference=1;
3884     else{
3885         for(i = 0; h->delayed_pic[i]; i++)
3886             if(pic == h->delayed_pic[i]){
3887                 pic->reference=1;
3888                 break;
3889             }
3890     }
3891 }
3892
3893 /**
3894  * instantaneous decoder refresh.
3895  */
3896 static void idr(H264Context *h){
3897     int i;
3898
3899     for(i=0; i<16; i++){
3900         if (h->long_ref[i] != NULL) {
3901             unreference_pic(h, h->long_ref[i]);
3902             h->long_ref[i]= NULL;
3903         }
3904     }
3905     h->long_ref_count=0;
3906
3907     for(i=0; i<h->short_ref_count; i++){
3908         unreference_pic(h, h->short_ref[i]);
3909         h->short_ref[i]= NULL;
3910     }
3911     h->short_ref_count=0;
3912 }
3913
3914 /* forget old pics after a seek */
3915 static void flush_dpb(AVCodecContext *avctx){
3916     H264Context *h= avctx->priv_data;
3917     int i;
3918     for(i=0; i<16; i++) {
3919         if(h->delayed_pic[i])
3920             h->delayed_pic[i]->reference= 0;
3921         h->delayed_pic[i]= NULL;
3922     }
3923     if(h->delayed_output_pic)
3924         h->delayed_output_pic->reference= 0;
3925     h->delayed_output_pic= NULL;
3926     idr(h);
3927     if(h->s.current_picture_ptr)
3928         h->s.current_picture_ptr->reference= 0;
3929 }
3930
3931 /**
3932  *
3933  * @return the removed picture or NULL if an error occurs
3934  */
3935 static Picture * remove_short(H264Context *h, int frame_num){
3936     MpegEncContext * const s = &h->s;
3937     int i;
3938
3939     if(s->avctx->debug&FF_DEBUG_MMCO)
3940         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3941
3942     for(i=0; i<h->short_ref_count; i++){
3943         Picture *pic= h->short_ref[i];
3944         if(s->avctx->debug&FF_DEBUG_MMCO)
3945             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3946         if(pic->frame_num == frame_num){
3947             h->short_ref[i]= NULL;
3948             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3949             h->short_ref_count--;
3950             return pic;
3951         }
3952     }
3953     return NULL;
3954 }
3955
3956 /**
3957  *
3958  * @return the removed picture or NULL if an error occurs
3959  */
3960 static Picture * remove_long(H264Context *h, int i){
3961     Picture *pic;
3962
3963     pic= h->long_ref[i];
3964     h->long_ref[i]= NULL;
3965     if(pic) h->long_ref_count--;
3966
3967     return pic;
3968 }
3969
3970 /**
3971  * print short term list
3972  */
3973 static void print_short_term(H264Context *h) {
3974     uint32_t i;
3975     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3976         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3977         for(i=0; i<h->short_ref_count; i++){
3978             Picture *pic= h->short_ref[i];
3979             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3980         }
3981     }
3982 }
3983
3984 /**
3985  * print long term list
3986  */
3987 static void print_long_term(H264Context *h) {
3988     uint32_t i;
3989     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3990         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3991         for(i = 0; i < 16; i++){
3992             Picture *pic= h->long_ref[i];
3993             if (pic) {
3994                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3995             }
3996         }
3997     }
3998 }
3999
4000 /**
4001  * Executes the reference picture marking (memory management control operations).
4002  */
4003 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4004     MpegEncContext * const s = &h->s;
4005     int i, j;
4006     int current_is_long=0;
4007     Picture *pic;
4008
4009     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4010         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4011
4012     for(i=0; i<mmco_count; i++){
4013         if(s->avctx->debug&FF_DEBUG_MMCO)
4014             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4015
4016         switch(mmco[i].opcode){
4017         case MMCO_SHORT2UNUSED:
4018             pic= remove_short(h, mmco[i].short_frame_num);
4019             if(pic)
4020                 unreference_pic(h, pic);
4021             else if(s->avctx->debug&FF_DEBUG_MMCO)
4022                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4023             break;
4024         case MMCO_SHORT2LONG:
4025             pic= remove_long(h, mmco[i].long_index);
4026             if(pic) unreference_pic(h, pic);
4027
4028             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4029             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4030             h->long_ref_count++;
4031             break;
4032         case MMCO_LONG2UNUSED:
4033             pic= remove_long(h, mmco[i].long_index);
4034             if(pic)
4035                 unreference_pic(h, pic);
4036             else if(s->avctx->debug&FF_DEBUG_MMCO)
4037                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4038             break;
4039         case MMCO_LONG:
4040             pic= remove_long(h, mmco[i].long_index);
4041             if(pic) unreference_pic(h, pic);
4042
4043             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4044             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4045             h->long_ref_count++;
4046
4047             current_is_long=1;
4048             break;
4049         case MMCO_SET_MAX_LONG:
4050             assert(mmco[i].long_index <= 16);
4051             // just remove the long term which index is greater than new max
4052             for(j = mmco[i].long_index; j<16; j++){
4053                 pic = remove_long(h, j);
4054                 if (pic) unreference_pic(h, pic);
4055             }
4056             break;
4057         case MMCO_RESET:
4058             while(h->short_ref_count){
4059                 pic= remove_short(h, h->short_ref[0]->frame_num);
4060                 unreference_pic(h, pic);
4061             }
4062             for(j = 0; j < 16; j++) {
4063                 pic= remove_long(h, j);
4064                 if(pic) unreference_pic(h, pic);
4065             }
4066             break;
4067         default: assert(0);
4068         }
4069     }
4070
4071     if(!current_is_long){
4072         pic= remove_short(h, s->current_picture_ptr->frame_num);
4073         if(pic){
4074             unreference_pic(h, pic);
4075             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4076         }
4077
4078         if(h->short_ref_count)
4079             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4080
4081         h->short_ref[0]= s->current_picture_ptr;
4082         h->short_ref[0]->long_ref=0;
4083         h->short_ref_count++;
4084     }
4085
4086     print_short_term(h);
4087     print_long_term(h);
4088     return 0;
4089 }
4090
4091 static int decode_ref_pic_marking(H264Context *h){
4092     MpegEncContext * const s = &h->s;
4093     int i;
4094
4095     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4096         s->broken_link= get_bits1(&s->gb) -1;
4097         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4098         if(h->mmco[0].long_index == -1)
4099             h->mmco_index= 0;
4100         else{
4101             h->mmco[0].opcode= MMCO_LONG;
4102             h->mmco_index= 1;
4103         }
4104     }else{
4105         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4106             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4107                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4108
4109                 h->mmco[i].opcode= opcode;
4110                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4111                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4112 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4113                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4114                         return -1;
4115                     }*/
4116                 }
4117                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4118                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
4119                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
4120                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4121                         return -1;
4122                     }
4123                 }
4124
4125                 if(opcode > MMCO_LONG){
4126                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4127                     return -1;
4128                 }
4129                 if(opcode == MMCO_END)
4130                     break;
4131             }
4132             h->mmco_index= i;
4133         }else{
4134             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4135
4136             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4137                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4138                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4139                 h->mmco_index= 1;
4140             }else
4141                 h->mmco_index= 0;
4142         }
4143     }
4144
4145     return 0;
4146 }
4147
4148 static int init_poc(H264Context *h){
4149     MpegEncContext * const s = &h->s;
4150     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4151     int field_poc[2];
4152
4153     if(h->nal_unit_type == NAL_IDR_SLICE){
4154         h->frame_num_offset= 0;
4155     }else{
4156         if(h->frame_num < h->prev_frame_num)
4157             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4158         else
4159             h->frame_num_offset= h->prev_frame_num_offset;
4160     }
4161
4162     if(h->sps.poc_type==0){
4163         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4164
4165         if(h->nal_unit_type == NAL_IDR_SLICE){
4166              h->prev_poc_msb=
4167              h->prev_poc_lsb= 0;
4168         }
4169
4170         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4171             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4172         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4173             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4174         else
4175             h->poc_msb = h->prev_poc_msb;
4176 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4177         field_poc[0] =
4178         field_poc[1] = h->poc_msb + h->poc_lsb;
4179         if(s->picture_structure == PICT_FRAME)
4180             field_poc[1] += h->delta_poc_bottom;
4181     }else if(h->sps.poc_type==1){
4182         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4183         int i;
4184
4185         if(h->sps.poc_cycle_length != 0)
4186             abs_frame_num = h->frame_num_offset + h->frame_num;
4187         else
4188             abs_frame_num = 0;
4189
4190         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4191             abs_frame_num--;
4192
4193         expected_delta_per_poc_cycle = 0;
4194         for(i=0; i < h->sps.poc_cycle_length; i++)
4195             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4196
4197         if(abs_frame_num > 0){
4198             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4199             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4200
4201             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4202             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4203                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4204         } else
4205             expectedpoc = 0;
4206
4207         if(h->nal_ref_idc == 0)
4208             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4209
4210         field_poc[0] = expectedpoc + h->delta_poc[0];
4211         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4212
4213         if(s->picture_structure == PICT_FRAME)
4214             field_poc[1] += h->delta_poc[1];
4215     }else{
4216         int poc;
4217         if(h->nal_unit_type == NAL_IDR_SLICE){
4218             poc= 0;
4219         }else{
4220             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4221             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4222         }
4223         field_poc[0]= poc;
4224         field_poc[1]= poc;
4225     }
4226
4227     if(s->picture_structure != PICT_BOTTOM_FIELD)
4228         s->current_picture_ptr->field_poc[0]= field_poc[0];
4229     if(s->picture_structure != PICT_TOP_FIELD)
4230         s->current_picture_ptr->field_poc[1]= field_poc[1];
4231     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4232         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4233
4234     return 0;
4235 }
4236
4237 /**
4238  * decodes a slice header.
4239  * this will allso call MPV_common_init() and frame_start() as needed
4240  */
4241 static int decode_slice_header(H264Context *h){
4242     MpegEncContext * const s = &h->s;
4243     int first_mb_in_slice, pps_id;
4244     int num_ref_idx_active_override_flag;
4245     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4246     int slice_type;
4247     int default_ref_list_done = 0;
4248
4249     s->current_picture.reference= h->nal_ref_idc != 0;
4250     s->dropable= h->nal_ref_idc == 0;
4251
4252     first_mb_in_slice= get_ue_golomb(&s->gb);
4253
4254     slice_type= get_ue_golomb(&s->gb);
4255     if(slice_type > 9){
4256         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4257         return -1;
4258     }
4259     if(slice_type > 4){
4260         slice_type -= 5;
4261         h->slice_type_fixed=1;
4262     }else
4263         h->slice_type_fixed=0;
4264
4265     slice_type= slice_type_map[ slice_type ];
4266     if (slice_type == I_TYPE
4267         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4268         default_ref_list_done = 1;
4269     }
4270     h->slice_type= slice_type;
4271
4272     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4273
4274     pps_id= get_ue_golomb(&s->gb);
4275     if(pps_id>255){
4276         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4277         return -1;
4278     }
4279     h->pps= h->pps_buffer[pps_id];
4280     if(h->pps.slice_group_count == 0){
4281         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4282         return -1;
4283     }
4284
4285     h->sps= h->sps_buffer[ h->pps.sps_id ];
4286     if(h->sps.log2_max_frame_num == 0){
4287         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4288         return -1;
4289     }
4290
4291     if(h->dequant_coeff_pps != pps_id){
4292         h->dequant_coeff_pps = pps_id;
4293         init_dequant_tables(h);
4294     }
4295
4296     s->mb_width= h->sps.mb_width;
4297     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4298
4299     h->b_stride=  s->mb_width*4 + 1;
4300     h->b8_stride= s->mb_width*2 + 1;
4301
4302     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4303     if(h->sps.frame_mbs_only_flag)
4304         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4305     else
4306         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4307
4308     if (s->context_initialized
4309         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4310         free_tables(h);
4311         MPV_common_end(s);
4312     }
4313     if (!s->context_initialized) {
4314         if (MPV_common_init(s) < 0)
4315             return -1;
4316
4317         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4318             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4319             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4320         }else{
4321             int i;
4322             for(i=0; i<16; i++){
4323 #define T(x) (x>>2) | ((x<<2) & 0xF)
4324                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4325                 h-> field_scan[i] = T( field_scan[i]);
4326 #undef T
4327             }
4328         }
4329         if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4330             memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
4331             memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4332         }else{
4333             int i;
4334             for(i=0; i<64; i++){
4335 #define T(x) (x>>3) | ((x&7)<<3)
4336                 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
4337                 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4338 #undef T
4339             }
4340         }
4341         if(h->sps.transform_bypass){ //FIXME same ugly
4342             h->zigzag_scan_q0 = zigzag_scan;
4343             h->field_scan_q0 = field_scan;
4344             h->zigzag_scan8x8_q0 = zigzag_scan8x8;
4345             h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4346         }else{
4347             h->zigzag_scan_q0 = h->zigzag_scan;
4348             h->field_scan_q0 = h->field_scan;
4349             h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
4350             h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4351         }
4352
4353         alloc_tables(h);
4354
4355         s->avctx->width = s->width;
4356         s->avctx->height = s->height;
4357         s->avctx->sample_aspect_ratio= h->sps.sar;
4358         if(!s->avctx->sample_aspect_ratio.den)
4359             s->avctx->sample_aspect_ratio.den = 1;
4360
4361         if(h->sps.timing_info_present_flag){
4362             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4363             if(h->x264_build > 0 && h->x264_build < 44)
4364                 s->avctx->time_base.den *= 2;
4365             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4366                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4367         }
4368     }
4369
4370     if(h->slice_num == 0){
4371         if(frame_start(h) < 0)
4372             return -1;
4373     }
4374
4375     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4376     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4377
4378     h->mb_aff_frame = 0;
4379     if(h->sps.frame_mbs_only_flag){
4380         s->picture_structure= PICT_FRAME;
4381     }else{
4382         if(get_bits1(&s->gb)) { //field_pic_flag
4383             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4384         } else {
4385             s->picture_structure= PICT_FRAME;
4386             first_mb_in_slice <<= h->sps.mb_aff;
4387             h->mb_aff_frame = h->sps.mb_aff;
4388         }
4389     }
4390
4391     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4392     s->resync_mb_y = s->mb_y = first_mb_in_slice / s->mb_width;
4393     if(s->mb_y >= s->mb_height){
4394         return -1;
4395     }
4396
4397     if(s->picture_structure==PICT_FRAME){
4398         h->curr_pic_num=   h->frame_num;
4399         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4400     }else{
4401         h->curr_pic_num= 2*h->frame_num;
4402         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4403     }
4404
4405     if(h->nal_unit_type == NAL_IDR_SLICE){
4406         get_ue_golomb(&s->gb); /* idr_pic_id */
4407     }
4408
4409     if(h->sps.poc_type==0){
4410         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4411
4412         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4413             h->delta_poc_bottom= get_se_golomb(&s->gb);
4414         }
4415     }
4416
4417     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4418         h->delta_poc[0]= get_se_golomb(&s->gb);
4419
4420         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4421             h->delta_poc[1]= get_se_golomb(&s->gb);
4422     }
4423
4424     init_poc(h);
4425
4426     if(h->pps.redundant_pic_cnt_present){
4427         h->redundant_pic_count= get_ue_golomb(&s->gb);
4428     }
4429
4430     //set defaults, might be overriden a few line later
4431     h->ref_count[0]= h->pps.ref_count[0];
4432     h->ref_count[1]= h->pps.ref_count[1];
4433
4434     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4435         if(h->slice_type == B_TYPE){
4436             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4437         }
4438         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4439
4440         if(num_ref_idx_active_override_flag){
4441             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4442             if(h->slice_type==B_TYPE)
4443                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4444
4445             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4446                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4447                 return -1;
4448             }
4449         }
4450     }
4451
4452     if(!default_ref_list_done){
4453         fill_default_ref_list(h);
4454     }
4455
4456     if(decode_ref_pic_list_reordering(h) < 0)
4457         return -1;
4458
4459     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4460        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4461         pred_weight_table(h);
4462     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4463         implicit_weight_table(h);
4464     else
4465         h->use_weight = 0;
4466
4467     if(s->current_picture.reference)
4468         decode_ref_pic_marking(h);
4469
4470     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4471         h->cabac_init_idc = get_ue_golomb(&s->gb);
4472
4473     h->last_qscale_diff = 0;
4474     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4475     if(s->qscale<0 || s->qscale>51){
4476         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4477         return -1;
4478     }
4479     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4480     //FIXME qscale / qp ... stuff
4481     if(h->slice_type == SP_TYPE){
4482         get_bits1(&s->gb); /* sp_for_switch_flag */
4483     }
4484     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4485         get_se_golomb(&s->gb); /* slice_qs_delta */
4486     }
4487
4488     h->deblocking_filter = 1;
4489     h->slice_alpha_c0_offset = 0;
4490     h->slice_beta_offset = 0;
4491     if( h->pps.deblocking_filter_parameters_present ) {
4492         h->deblocking_filter= get_ue_golomb(&s->gb);
4493         if(h->deblocking_filter < 2)
4494             h->deblocking_filter^= 1; // 1<->0
4495
4496         if( h->deblocking_filter ) {
4497             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4498             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4499         }
4500     }
4501     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4502        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4503        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4504        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4505         h->deblocking_filter= 0;
4506
4507 #if 0 //FMO
4508     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4509         slice_group_change_cycle= get_bits(&s->gb, ?);
4510 #endif
4511
4512     h->slice_num++;
4513
4514     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4515         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4516                h->slice_num,
4517                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4518                first_mb_in_slice,
4519                av_get_pict_type_char(h->slice_type),
4520                pps_id, h->frame_num,
4521                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4522                h->ref_count[0], h->ref_count[1],
4523                s->qscale,
4524                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4525                h->use_weight,
4526                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4527                );
4528     }
4529
4530     return 0;
4531 }
4532
4533 /**
4534  *
4535  */
4536 static inline int get_level_prefix(GetBitContext *gb){
4537     unsigned int buf;
4538     int log;
4539
4540     OPEN_READER(re, gb);
4541     UPDATE_CACHE(re, gb);
4542     buf=GET_CACHE(re, gb);
4543
4544     log= 32 - av_log2(buf);
4545 #ifdef TRACE
4546     print_bin(buf>>(32-log), log);
4547     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4548 #endif
4549
4550     LAST_SKIP_BITS(re, gb, log);
4551     CLOSE_READER(re, gb);
4552
4553     return log-1;
4554 }
4555
4556 static inline int get_dct8x8_allowed(H264Context *h){
4557     int i;
4558     for(i=0; i<4; i++){
4559         if(!IS_SUB_8X8(h->sub_mb_type[i])
4560            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4561             return 0;
4562     }
4563     return 1;
4564 }
4565
4566 /**
4567  * decodes a residual block.
4568  * @param n block index
4569  * @param scantable scantable
4570  * @param max_coeff number of coefficients in the block
4571  * @return <0 if an error occured
4572  */
4573 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4574     MpegEncContext * const s = &h->s;
4575     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4576     int level[16];
4577     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4578
4579     //FIXME put trailing_onex into the context
4580
4581     if(n == CHROMA_DC_BLOCK_INDEX){
4582         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4583         total_coeff= coeff_token>>2;
4584     }else{
4585         if(n == LUMA_DC_BLOCK_INDEX){
4586             total_coeff= pred_non_zero_count(h, 0);
4587             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4588             total_coeff= coeff_token>>2;
4589         }else{
4590             total_coeff= pred_non_zero_count(h, n);
4591             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4592             total_coeff= coeff_token>>2;
4593             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4594         }
4595     }
4596
4597     //FIXME set last_non_zero?
4598
4599     if(total_coeff==0)
4600         return 0;
4601
4602     trailing_ones= coeff_token&3;
4603     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4604     assert(total_coeff<=16);
4605
4606     for(i=0; i<trailing_ones; i++){
4607         level[i]= 1 - 2*get_bits1(gb);
4608     }
4609
4610     if(i<total_coeff) {
4611         int level_code, mask;
4612         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4613         int prefix= get_level_prefix(gb);
4614
4615         //first coefficient has suffix_length equal to 0 or 1
4616         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4617             if(suffix_length)
4618                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4619             else
4620                 level_code= (prefix<<suffix_length); //part
4621         }else if(prefix==14){
4622             if(suffix_length)
4623                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4624             else
4625                 level_code= prefix + get_bits(gb, 4); //part
4626         }else if(prefix==15){
4627             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4628             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4629         }else{
4630             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4631             return -1;
4632         }
4633
4634         if(trailing_ones < 3) level_code += 2;
4635
4636         suffix_length = 1;
4637         if(level_code > 5)
4638             suffix_length++;
4639         mask= -(level_code&1);
4640         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4641         i++;
4642
4643         //remaining coefficients have suffix_length > 0
4644         for(;i<total_coeff;i++) {
4645             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4646             prefix = get_level_prefix(gb);
4647             if(prefix<15){
4648                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4649             }else if(prefix==15){
4650                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4651             }else{
4652                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4653                 return -1;
4654             }
4655             mask= -(level_code&1);
4656             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4657             if(level_code > suffix_limit[suffix_length])
4658                 suffix_length++;
4659         }
4660     }
4661
4662     if(total_coeff == max_coeff)
4663         zeros_left=0;
4664     else{
4665         if(n == CHROMA_DC_BLOCK_INDEX)
4666             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4667         else
4668             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4669     }
4670
4671     coeff_num = zeros_left + total_coeff - 1;
4672     j = scantable[coeff_num];
4673     if(n > 24){
4674         block[j] = level[0];
4675         for(i=1;i<total_coeff;i++) {
4676             if(zeros_left <= 0)
4677                 run_before = 0;
4678             else if(zeros_left < 7){
4679                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4680             }else{
4681                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4682             }
4683             zeros_left -= run_before;
4684             coeff_num -= 1 + run_before;
4685             j= scantable[ coeff_num ];
4686
4687             block[j]= level[i];
4688         }
4689     }else{
4690         block[j] = (level[0] * qmul[j] + 32)>>6;
4691         for(i=1;i<total_coeff;i++) {
4692             if(zeros_left <= 0)
4693                 run_before = 0;
4694             else if(zeros_left < 7){
4695                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4696             }else{
4697                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4698             }
4699             zeros_left -= run_before;
4700             coeff_num -= 1 + run_before;
4701             j= scantable[ coeff_num ];
4702
4703             block[j]= (level[i] * qmul[j] + 32)>>6;
4704         }
4705     }
4706
4707     if(zeros_left<0){
4708         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4709         return -1;
4710     }
4711
4712     return 0;
4713 }
4714
4715 /**
4716  * decodes a P_SKIP or B_SKIP macroblock
4717  */
4718 static void decode_mb_skip(H264Context *h){
4719     MpegEncContext * const s = &h->s;
4720     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4721     int mb_type=0;
4722
4723     memset(h->non_zero_count[mb_xy], 0, 16);
4724     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4725
4726     if(h->mb_aff_frame && s->mb_skip_run==0 && (s->mb_y&1)==0){
4727         h->mb_field_decoding_flag= get_bits1(&s->gb);
4728     }
4729     if(h->mb_field_decoding_flag)
4730         mb_type|= MB_TYPE_INTERLACED;
4731
4732     if( h->slice_type == B_TYPE )
4733     {
4734         // just for fill_caches. pred_direct_motion will set the real mb_type
4735         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4736
4737         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4738         pred_direct_motion(h, &mb_type);
4739         if(h->pps.cabac){
4740             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4741             fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
4742         }
4743     }
4744     else
4745     {
4746         int mx, my;
4747         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4748
4749         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4750         pred_pskip_motion(h, &mx, &my);
4751         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4752         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4753         if(h->pps.cabac)
4754             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4755     }
4756
4757     write_back_motion(h, mb_type);
4758     s->current_picture.mb_type[mb_xy]= mb_type|MB_TYPE_SKIP;
4759     s->current_picture.qscale_table[mb_xy]= s->qscale;
4760     h->slice_table[ mb_xy ]= h->slice_num;
4761     h->prev_mb_skipped= 1;
4762 }
4763
4764 /**
4765  * decodes a macroblock
4766  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4767  */
4768 static int decode_mb_cavlc(H264Context *h){
4769     MpegEncContext * const s = &h->s;
4770     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4771     int mb_type, partition_count, cbp;
4772     int dct8x8_allowed= h->pps.transform_8x8_mode;
4773
4774     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4775
4776     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4777     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4778                 down the code */
4779     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4780         if(s->mb_skip_run==-1)
4781             s->mb_skip_run= get_ue_golomb(&s->gb);
4782
4783         if (s->mb_skip_run--) {
4784             decode_mb_skip(h);
4785             return 0;
4786         }
4787     }
4788     if(h->mb_aff_frame){
4789         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
4790             h->mb_field_decoding_flag = get_bits1(&s->gb);
4791     }else
4792         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4793
4794     h->prev_mb_skipped= 0;
4795
4796     mb_type= get_ue_golomb(&s->gb);
4797     if(h->slice_type == B_TYPE){
4798         if(mb_type < 23){
4799             partition_count= b_mb_type_info[mb_type].partition_count;
4800             mb_type=         b_mb_type_info[mb_type].type;
4801         }else{
4802             mb_type -= 23;
4803             goto decode_intra_mb;
4804         }
4805     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4806         if(mb_type < 5){
4807             partition_count= p_mb_type_info[mb_type].partition_count;
4808             mb_type=         p_mb_type_info[mb_type].type;
4809         }else{
4810             mb_type -= 5;
4811             goto decode_intra_mb;
4812         }
4813     }else{
4814        assert(h->slice_type == I_TYPE);
4815 decode_intra_mb:
4816         if(mb_type > 25){
4817             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4818             return -1;
4819         }
4820         partition_count=0;
4821         cbp= i_mb_type_info[mb_type].cbp;
4822         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4823         mb_type= i_mb_type_info[mb_type].type;
4824     }
4825
4826     if(h->mb_field_decoding_flag)
4827         mb_type |= MB_TYPE_INTERLACED;
4828
4829     h->slice_table[ mb_xy ]= h->slice_num;
4830
4831     if(IS_INTRA_PCM(mb_type)){
4832         unsigned int x, y;
4833
4834         // we assume these blocks are very rare so we dont optimize it
4835         align_get_bits(&s->gb);
4836
4837         // The pixels are stored in the same order as levels in h->mb array.
4838         for(y=0; y<16; y++){
4839             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4840             for(x=0; x<16; x++){
4841                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4842                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4843             }
4844         }
4845         for(y=0; y<8; y++){
4846             const int index= 256 + 4*(y&3) + 32*(y>>2);
4847             for(x=0; x<8; x++){
4848                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4849                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4850             }
4851         }
4852         for(y=0; y<8; y++){
4853             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4854             for(x=0; x<8; x++){
4855                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4856                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4857             }
4858         }
4859
4860         // In deblocking, the quantizer is 0
4861         s->current_picture.qscale_table[mb_xy]= 0;
4862         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
4863         // All coeffs are present
4864         memset(h->non_zero_count[mb_xy], 16, 16);
4865
4866         s->current_picture.mb_type[mb_xy]= mb_type;
4867         return 0;
4868     }
4869
4870     fill_caches(h, mb_type, 0);
4871
4872     //mb_pred
4873     if(IS_INTRA(mb_type)){
4874 //            init_top_left_availability(h);
4875             if(IS_INTRA4x4(mb_type)){
4876                 int i;
4877                 int di = 1;
4878                 if(dct8x8_allowed && get_bits1(&s->gb)){
4879                     mb_type |= MB_TYPE_8x8DCT;
4880                     di = 4;
4881                 }
4882
4883 //                fill_intra4x4_pred_table(h);
4884                 for(i=0; i<16; i+=di){
4885                     const int mode_coded= !get_bits1(&s->gb);
4886                     const int predicted_mode=  pred_intra_mode(h, i);
4887                     int mode;
4888
4889                     if(mode_coded){
4890                         const int rem_mode= get_bits(&s->gb, 3);
4891                         if(rem_mode<predicted_mode)
4892                             mode= rem_mode;
4893                         else
4894                             mode= rem_mode + 1;
4895                     }else{
4896                         mode= predicted_mode;
4897                     }
4898
4899                     if(di==4)
4900                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4901                     else
4902                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4903                 }
4904                 write_back_intra_pred_mode(h);
4905                 if( check_intra4x4_pred_mode(h) < 0)
4906                     return -1;
4907             }else{
4908                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4909                 if(h->intra16x16_pred_mode < 0)
4910                     return -1;
4911             }
4912             h->chroma_pred_mode= get_ue_golomb(&s->gb);
4913
4914             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
4915             if(h->chroma_pred_mode < 0)
4916                 return -1;
4917     }else if(partition_count==4){
4918         int i, j, sub_partition_count[4], list, ref[2][4];
4919
4920         if(h->slice_type == B_TYPE){
4921             for(i=0; i<4; i++){
4922                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4923                 if(h->sub_mb_type[i] >=13){
4924                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4925                     return -1;
4926                 }
4927                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4928                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4929             }
4930             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4931                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4932                 pred_direct_motion(h, &mb_type);
4933                 h->ref_cache[0][scan8[4]] =
4934                 h->ref_cache[1][scan8[4]] =
4935                 h->ref_cache[0][scan8[12]] =
4936                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4937             }
4938         }else{
4939             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4940             for(i=0; i<4; i++){
4941                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4942                 if(h->sub_mb_type[i] >=4){
4943                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4944                     return -1;
4945                 }
4946                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4947                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4948             }
4949         }
4950
4951         for(list=0; list<2; list++){
4952             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4953             if(ref_count == 0) continue;
4954             if (h->mb_aff_frame && h->mb_field_decoding_flag) {
4955                 ref_count <<= 1;
4956             }
4957             for(i=0; i<4; i++){
4958                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4959                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4960                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4961                 }else{
4962                  //FIXME
4963                     ref[list][i] = -1;
4964                 }
4965             }
4966         }
4967
4968         if(dct8x8_allowed)
4969             dct8x8_allowed = get_dct8x8_allowed(h);
4970
4971         for(list=0; list<2; list++){
4972             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4973             if(ref_count == 0) continue;
4974
4975             for(i=0; i<4; i++){
4976                 if(IS_DIRECT(h->sub_mb_type[i])) {
4977                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4978                     continue;
4979                 }
4980                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4981                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4982
4983                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4984                     const int sub_mb_type= h->sub_mb_type[i];
4985                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4986                     for(j=0; j<sub_partition_count[i]; j++){
4987                         int mx, my;
4988                         const int index= 4*i + block_width*j;
4989                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4990                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4991                         mx += get_se_golomb(&s->gb);
4992                         my += get_se_golomb(&s->gb);
4993                         tprintf("final mv:%d %d\n", mx, my);
4994
4995                         if(IS_SUB_8X8(sub_mb_type)){
4996                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
4997                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4998                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
4999                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5000                         }else if(IS_SUB_8X4(sub_mb_type)){
5001                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5002                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5003                         }else if(IS_SUB_4X8(sub_mb_type)){
5004                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5005                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5006                         }else{
5007                             assert(IS_SUB_4X4(sub_mb_type));
5008                             mv_cache[ 0 ][0]= mx;
5009                             mv_cache[ 0 ][1]= my;
5010                         }
5011                     }
5012                 }else{
5013                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5014                     p[0] = p[1]=
5015                     p[8] = p[9]= 0;
5016                 }
5017             }
5018         }
5019     }else if(IS_DIRECT(mb_type)){
5020         pred_direct_motion(h, &mb_type);
5021         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5022     }else{
5023         int list, mx, my, i;
5024          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5025         if(IS_16X16(mb_type)){
5026             for(list=0; list<2; list++){
5027                 if(h->ref_count[list]>0){
5028                     if(IS_DIR(mb_type, 0, list)){
5029                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5030                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5031                     }else
5032                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5033                 }
5034             }
5035             for(list=0; list<2; list++){
5036                 if(IS_DIR(mb_type, 0, list)){
5037                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5038                     mx += get_se_golomb(&s->gb);
5039                     my += get_se_golomb(&s->gb);
5040                     tprintf("final mv:%d %d\n", mx, my);
5041
5042                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5043                 }else
5044                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5045             }
5046         }
5047         else if(IS_16X8(mb_type)){
5048             for(list=0; list<2; list++){
5049                 if(h->ref_count[list]>0){
5050                     for(i=0; i<2; i++){
5051                         if(IS_DIR(mb_type, i, list)){
5052                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5053                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5054                         }else
5055                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5056                     }
5057                 }
5058             }
5059             for(list=0; list<2; list++){
5060                 for(i=0; i<2; i++){
5061                     if(IS_DIR(mb_type, i, list)){
5062                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5063                         mx += get_se_golomb(&s->gb);
5064                         my += get_se_golomb(&s->gb);
5065                         tprintf("final mv:%d %d\n", mx, my);
5066
5067                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5068                     }else
5069                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5070                 }
5071             }
5072         }else{
5073             assert(IS_8X16(mb_type));
5074             for(list=0; list<2; list++){
5075                 if(h->ref_count[list]>0){
5076                     for(i=0; i<2; i++){
5077                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5078                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5079                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5080                         }else
5081                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5082                     }
5083                 }
5084             }
5085             for(list=0; list<2; list++){
5086                 for(i=0; i<2; i++){
5087                     if(IS_DIR(mb_type, i, list)){
5088                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5089                         mx += get_se_golomb(&s->gb);
5090                         my += get_se_golomb(&s->gb);
5091                         tprintf("final mv:%d %d\n", mx, my);
5092
5093                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5094                     }else
5095                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5096                 }
5097             }
5098         }
5099     }
5100
5101     if(IS_INTER(mb_type))
5102         write_back_motion(h, mb_type);
5103
5104     if(!IS_INTRA16x16(mb_type)){
5105         cbp= get_ue_golomb(&s->gb);
5106         if(cbp > 47){
5107             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
5108             return -1;
5109         }
5110
5111         if(IS_INTRA4x4(mb_type))
5112             cbp= golomb_to_intra4x4_cbp[cbp];
5113         else
5114             cbp= golomb_to_inter_cbp[cbp];
5115     }
5116
5117     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5118         if(get_bits1(&s->gb))
5119             mb_type |= MB_TYPE_8x8DCT;
5120     }
5121     s->current_picture.mb_type[mb_xy]= mb_type;
5122
5123     if(cbp || IS_INTRA16x16(mb_type)){
5124         int i8x8, i4x4, chroma_idx;
5125         int chroma_qp, dquant;
5126         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5127         const uint8_t *scan, *scan8x8, *dc_scan;
5128
5129 //        fill_non_zero_count_cache(h);
5130
5131         if(IS_INTERLACED(mb_type)){
5132             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5133             dc_scan= luma_dc_field_scan;
5134         }else{
5135             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5136             dc_scan= luma_dc_zigzag_scan;
5137         }
5138         scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5139
5140         dquant= get_se_golomb(&s->gb);
5141
5142         if( dquant > 25 || dquant < -26 ){
5143             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5144             return -1;
5145         }
5146
5147         s->qscale += dquant;
5148         if(((unsigned)s->qscale) > 51){
5149             if(s->qscale<0) s->qscale+= 52;
5150             else            s->qscale-= 52;
5151         }
5152
5153         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5154         if(IS_INTRA16x16(mb_type)){
5155             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5156                 return -1; //FIXME continue if partitioned and other return -1 too
5157             }
5158
5159             assert((cbp&15) == 0 || (cbp&15) == 15);
5160
5161             if(cbp&15){
5162                 for(i8x8=0; i8x8<4; i8x8++){
5163                     for(i4x4=0; i4x4<4; i4x4++){
5164                         const int index= i4x4 + 4*i8x8;
5165                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5166                             return -1;
5167                         }
5168                     }
5169                 }
5170             }else{
5171                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5172             }
5173         }else{
5174             for(i8x8=0; i8x8<4; i8x8++){
5175                 if(cbp & (1<<i8x8)){
5176                     if(IS_8x8DCT(mb_type)){
5177                         DCTELEM *buf = &h->mb[64*i8x8];
5178                         uint8_t *nnz;
5179                         for(i4x4=0; i4x4<4; i4x4++){
5180                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5181                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5182                                 return -1;
5183                         }
5184                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5185                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5186                     }else{
5187                         for(i4x4=0; i4x4<4; i4x4++){
5188                             const int index= i4x4 + 4*i8x8;
5189
5190                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5191                                 return -1;
5192                             }
5193                         }
5194                     }
5195                 }else{
5196                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5197                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5198                 }
5199             }
5200         }
5201
5202         if(cbp&0x30){
5203             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5204                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5205                     return -1;
5206                 }
5207         }
5208
5209         if(cbp&0x20){
5210             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5211                 for(i4x4=0; i4x4<4; i4x4++){
5212                     const int index= 16 + 4*chroma_idx + i4x4;
5213                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5214                         return -1;
5215                     }
5216                 }
5217             }
5218         }else{
5219             uint8_t * const nnz= &h->non_zero_count_cache[0];
5220             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5221             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5222         }
5223     }else{
5224         uint8_t * const nnz= &h->non_zero_count_cache[0];
5225         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5226         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5227         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5228     }
5229     s->current_picture.qscale_table[mb_xy]= s->qscale;
5230     write_back_non_zero_count(h);
5231
5232     return 0;
5233 }
5234
5235 static int decode_cabac_field_decoding_flag(H264Context *h) {
5236     MpegEncContext * const s = &h->s;
5237     const int mb_x = s->mb_x;
5238     const int mb_y = s->mb_y & ~1;
5239     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5240     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5241
5242     unsigned int ctx = 0;
5243
5244     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5245         ctx += 1;
5246     }
5247     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5248         ctx += 1;
5249     }
5250
5251     return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] );
5252 }
5253
5254 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5255     uint8_t *state= &h->cabac_state[ctx_base];
5256     int mb_type;
5257
5258     if(intra_slice){
5259         MpegEncContext * const s = &h->s;
5260         const int mba_xy = h->left_mb_xy[0];
5261         const int mbb_xy = h->top_mb_xy;
5262         int ctx=0;
5263         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5264             ctx++;
5265         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5266             ctx++;
5267         if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
5268             return 0;   /* I4x4 */
5269         state += 2;
5270     }else{
5271         if( get_cabac( &h->cabac, &state[0] ) == 0 )
5272             return 0;   /* I4x4 */
5273     }
5274
5275     if( get_cabac_terminate( &h->cabac ) )
5276         return 25;  /* PCM */
5277
5278     mb_type = 1; /* I16x16 */
5279     mb_type += 12 * get_cabac( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5280     if( get_cabac( &h->cabac, &state[2] ) ) /* cbp_chroma */
5281         mb_type += 4 + 4 * get_cabac( &h->cabac, &state[2+intra_slice] );
5282     mb_type += 2 * get_cabac( &h->cabac, &state[3+intra_slice] );
5283     mb_type += 1 * get_cabac( &h->cabac, &state[3+2*intra_slice] );
5284     return mb_type;
5285 }
5286
5287 static int decode_cabac_mb_type( H264Context *h ) {
5288     MpegEncContext * const s = &h->s;
5289
5290     if( h->slice_type == I_TYPE ) {
5291         return decode_cabac_intra_mb_type(h, 3, 1);
5292     } else if( h->slice_type == P_TYPE ) {
5293         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5294             /* P-type */
5295             if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5296                 /* P_L0_D16x16, P_8x8 */
5297                 return 3 * get_cabac( &h->cabac, &h->cabac_state[16] );
5298             } else {
5299                 /* P_L0_D8x16, P_L0_D16x8 */
5300                 return 2 - get_cabac( &h->cabac, &h->cabac_state[17] );
5301             }
5302         } else {
5303             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5304         }
5305     } else if( h->slice_type == B_TYPE ) {
5306         const int mba_xy = h->left_mb_xy[0];
5307         const int mbb_xy = h->top_mb_xy;
5308         int ctx = 0;
5309         int bits;
5310
5311         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5312             ctx++;
5313         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5314             ctx++;
5315
5316         if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
5317             return 0; /* B_Direct_16x16 */
5318
5319         if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
5320             return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5321         }
5322
5323         bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
5324         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
5325         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
5326         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
5327         if( bits < 8 )
5328             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5329         else if( bits == 13 ) {
5330             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5331         } else if( bits == 14 )
5332             return 11; /* B_L1_L0_8x16 */
5333         else if( bits == 15 )
5334             return 22; /* B_8x8 */
5335
5336         bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
5337         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5338     } else {
5339         /* TODO SI/SP frames? */
5340         return -1;
5341     }
5342 }
5343
5344 static int decode_cabac_mb_skip( H264Context *h) {
5345     MpegEncContext * const s = &h->s;
5346     const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5347     const int mba_xy = mb_xy - 1;
5348     const int mbb_xy = mb_xy - s->mb_stride;
5349     int ctx = 0;
5350
5351     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5352         ctx++;
5353     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5354         ctx++;
5355
5356     if( h->slice_type == B_TYPE )
5357         ctx += 13;
5358     return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
5359 }
5360
5361 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5362     int mode = 0;
5363
5364     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5365         return pred_mode;
5366
5367     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5368     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5369     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5370
5371     if( mode >= pred_mode )
5372         return mode + 1;
5373     else
5374         return mode;
5375 }
5376
5377 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5378     const int mba_xy = h->left_mb_xy[0];
5379     const int mbb_xy = h->top_mb_xy;
5380
5381     int ctx = 0;
5382
5383     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5384     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5385         ctx++;
5386
5387     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5388         ctx++;
5389
5390     if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5391         return 0;
5392
5393     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5394         return 1;
5395     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5396         return 2;
5397     else
5398         return 3;
5399 }
5400
/* Column (0..3) of each of the 16 block indices. */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/* Row (0..3) of each of the 16 block indices. */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/* Inverse of the two tables above: block index for a given [column][row]. */
static const uint8_t block_idx_xy[4][4] = {
    /* column 0 */ { 0, 2,  8, 10 },
    /* column 1 */ { 1, 3,  9, 11 },
    /* column 2 */ { 4, 6, 12, 14 },
    /* column 3 */ { 5, 7, 13, 15 }
};
5413
5414 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5415     int cbp = 0;
5416     int cbp_b = -1;
5417     int i8x8;
5418
5419     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5420         cbp_b = h->top_cbp;
5421         tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5422     }
5423
5424     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5425         int cbp_a = -1;
5426         int x, y;
5427         int ctx = 0;
5428
5429         x = block_idx_x[4*i8x8];
5430         y = block_idx_y[4*i8x8];
5431
5432         if( x > 0 )
5433             cbp_a = cbp;
5434         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5435             cbp_a = h->left_cbp;
5436             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5437         }
5438
5439         if( y > 0 )
5440             cbp_b = cbp;
5441
5442         /* No need to test for skip as we put 0 for skip block */
5443         /* No need to test for IPCM as we put 1 for IPCM block */
5444         if( cbp_a >= 0 ) {
5445             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5446             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5447                 ctx++;
5448         }
5449
5450         if( cbp_b >= 0 ) {
5451             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5452             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5453                 ctx += 2;
5454         }
5455
5456         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5457             cbp |= 1 << i8x8;
5458         }
5459     }
5460     return cbp;
5461 }
5462 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5463     int ctx;
5464     int cbp_a, cbp_b;
5465
5466     cbp_a = (h->left_cbp>>4)&0x03;
5467     cbp_b = (h-> top_cbp>>4)&0x03;
5468
5469     ctx = 0;
5470     if( cbp_a > 0 ) ctx++;
5471     if( cbp_b > 0 ) ctx += 2;
5472     if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5473         return 0;
5474
5475     ctx = 4;
5476     if( cbp_a == 2 ) ctx++;
5477     if( cbp_b == 2 ) ctx += 2;
5478     return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
5479 }
5480 static int decode_cabac_mb_dqp( H264Context *h) {
5481     MpegEncContext * const s = &h->s;
5482     int mbn_xy;
5483     int   ctx = 0;
5484     int   val = 0;
5485
5486     if( s->mb_x > 0 )
5487         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5488     else
5489         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5490
5491     if( h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
5492         ctx++;
5493
5494     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5495         if( ctx < 2 )
5496             ctx = 2;
5497         else
5498             ctx = 3;
5499         val++;
5500         if(val > 102) //prevent infinite loop
5501             return INT_MIN;
5502     }
5503
5504     if( val&0x01 )
5505         return (val + 1)/2;
5506     else
5507         return -(val + 1)/2;
5508 }
5509 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5510     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5511         return 0;   /* 8x8 */
5512     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5513         return 1;   /* 8x4 */
5514     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5515         return 2;   /* 4x8 */
5516     return 3;       /* 4x4 */
5517 }
5518 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5519     int type;
5520     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5521         return 0;   /* B_Direct_8x8 */
5522     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5523         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5524     type = 3;
5525     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5526         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5527             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5528         type += 4;
5529     }
5530     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5531     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5532     return type;
5533 }
5534
5535 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5536     return get_cabac( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5537 }
5538
5539 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5540     int refa = h->ref_cache[list][scan8[n] - 1];
5541     int refb = h->ref_cache[list][scan8[n] - 8];
5542     int ref  = 0;
5543     int ctx  = 0;
5544
5545     if( h->slice_type == B_TYPE) {
5546         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5547             ctx++;
5548         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5549             ctx += 2;
5550     } else {
5551         if( refa > 0 )
5552             ctx++;
5553         if( refb > 0 )
5554             ctx += 2;
5555     }
5556
5557     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5558         ref++;
5559         if( ctx < 4 )
5560             ctx = 4;
5561         else
5562             ctx = 5;
5563     }
5564     return ref;
5565 }
5566
5567 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5568     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5569                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5570     int ctxbase = (l == 0) ? 40 : 47;
5571     int ctx, mvd;
5572
5573     if( amvd < 3 )
5574         ctx = 0;
5575     else if( amvd > 32 )
5576         ctx = 2;
5577     else
5578         ctx = 1;
5579
5580     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5581         return 0;
5582
5583     mvd= 1;
5584     ctx= 3;
5585     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5586         mvd++;
5587         if( ctx < 6 )
5588             ctx++;
5589     }
5590
5591     if( mvd >= 9 ) {
5592         int k = 3;
5593         while( get_cabac_bypass( &h->cabac ) ) {
5594             mvd += 1 << k;
5595             k++;
5596         }
5597         while( k-- ) {
5598             if( get_cabac_bypass( &h->cabac ) )
5599                 mvd += 1 << k;
5600         }
5601     }
5602     if( get_cabac_bypass( &h->cabac ) )  return -mvd;
5603     else                                 return  mvd;
5604 }
5605
5606 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5607     int nza, nzb;
5608     int ctx = 0;
5609
5610     if( cat == 0 ) {
5611         nza = h->left_cbp&0x100;
5612         nzb = h-> top_cbp&0x100;
5613     } else if( cat == 1 || cat == 2 ) {
5614         nza = h->non_zero_count_cache[scan8[idx] - 1];
5615         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5616     } else if( cat == 3 ) {
5617         nza = (h->left_cbp>>(6+idx))&0x01;
5618         nzb = (h-> top_cbp>>(6+idx))&0x01;
5619     } else {
5620         assert(cat == 4);
5621         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5622         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5623     }
5624
5625     if( nza > 0 )
5626         ctx++;
5627
5628     if( nzb > 0 )
5629         ctx += 2;
5630
5631     return ctx + 4 * cat;
5632 }
5633
5634 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5635     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5636     static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
5637     static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
5638     static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 };
5639     static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 };
5640     static const int coeff_abs_level_m1_offset[6] = { 227+0, 227+10, 227+20, 227+30, 227+39, 426 };
5641     static const int significant_coeff_flag_offset_8x8[63] = {
5642         0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5643         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5644         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5645        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
5646     };
5647     static const int last_coeff_flag_offset_8x8[63] = {
5648         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5649         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5650         3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5651         5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5652     };
5653
5654     int index[64];
5655
5656     int i, last;
5657     int coeff_count = 0;
5658
5659     int abslevel1 = 1;
5660     int abslevelgt1 = 0;
5661
5662     uint8_t *significant_coeff_ctx_base;
5663     uint8_t *last_coeff_ctx_base;
5664     uint8_t *abs_level_m1_ctx_base;
5665
5666     /* cat: 0-> DC 16x16  n = 0
5667      *      1-> AC 16x16  n = luma4x4idx
5668      *      2-> Luma4x4   n = luma4x4idx
5669      *      3-> DC Chroma n = iCbCr
5670      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5671      *      5-> Luma8x8   n = 4 * luma8x8idx
5672      */
5673
5674     /* read coded block flag */
5675     if( cat != 5 ) {
5676         if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5677             if( cat == 1 || cat == 2 )
5678                 h->non_zero_count_cache[scan8[n]] = 0;
5679             else if( cat == 4 )
5680                 h->non_zero_count_cache[scan8[16+n]] = 0;
5681
5682             return 0;
5683         }
5684     }
5685
5686     significant_coeff_ctx_base = h->cabac_state
5687         + significant_coeff_flag_offset[cat]
5688         + significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
5689     last_coeff_ctx_base = h->cabac_state
5690         + last_significant_coeff_flag_offset[cat]
5691         + last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
5692     abs_level_m1_ctx_base = h->cabac_state
5693         + coeff_abs_level_m1_offset[cat];
5694
5695     if( cat == 5 ) {
5696 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5697         for(last= 0; last < coefs; last++) { \
5698             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5699             if( get_cabac( &h->cabac, sig_ctx )) { \
5700                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5701                 index[coeff_count++] = last; \
5702                 if( get_cabac( &h->cabac, last_ctx ) ) { \
5703                     last= max_coeff; \
5704                     break; \
5705                 } \
5706             } \
5707         }
5708         DECODE_SIGNIFICANCE( 63, significant_coeff_flag_offset_8x8[last],
5709                                  last_coeff_flag_offset_8x8[last] );
5710     } else {
5711         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5712     }
5713     if( last == max_coeff -1 ) {
5714         index[coeff_count++] = last;
5715     }
5716     assert(coeff_count > 0);
5717
5718     if( cat == 0 )
5719         h->cbp_table[mb_xy] |= 0x100;
5720     else if( cat == 1 || cat == 2 )
5721         h->non_zero_count_cache[scan8[n]] = coeff_count;
5722     else if( cat == 3 )
5723         h->cbp_table[mb_xy] |= 0x40 << n;
5724     else if( cat == 4 )
5725         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5726     else {
5727         assert( cat == 5 );
5728         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5729     }
5730
5731     for( i = coeff_count - 1; i >= 0; i-- ) {
5732         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5733         int j= scantable[index[i]];
5734
5735         if( get_cabac( &h->cabac, ctx ) == 0 ) {
5736             if( !qmul ) {
5737                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
5738                 else                                block[j] =  1;
5739             }else{
5740                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-qmul[j] + 32) >> 6;
5741                 else                                block[j] = ( qmul[j] + 32) >> 6;
5742             }
5743
5744             abslevel1++;
5745         } else {
5746             int coeff_abs = 2;
5747             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5748             while( coeff_abs < 15 && get_cabac( &h->cabac, ctx ) ) {
5749                 coeff_abs++;
5750             }
5751
5752             if( coeff_abs >= 15 ) {
5753                 int j = 0;
5754                 while( get_cabac_bypass( &h->cabac ) ) {
5755                     coeff_abs += 1 << j;
5756                     j++;
5757                 }
5758
5759                 while( j-- ) {
5760                     if( get_cabac_bypass( &h->cabac ) )
5761                         coeff_abs += 1 << j ;
5762                 }
5763             }
5764
5765             if( !qmul ) {
5766                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
5767                 else                                block[j] =  coeff_abs;
5768             }else{
5769                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5770                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5771             }
5772
5773             abslevelgt1++;
5774         }
5775     }
5776     return 0;
5777 }
5778
5779 static void inline compute_mb_neighbors(H264Context *h)
5780 {
5781     MpegEncContext * const s = &h->s;
5782     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5783     h->top_mb_xy     = mb_xy - s->mb_stride;
5784     h->left_mb_xy[0] = mb_xy - 1;
5785     if(h->mb_aff_frame){
5786         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5787         const int top_pair_xy      = pair_xy     - s->mb_stride;
5788         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5789         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5790         const int curr_mb_frame_flag = !h->mb_field_decoding_flag;
5791         const int bottom = (s->mb_y & 1);
5792         if (bottom
5793                 ? !curr_mb_frame_flag // bottom macroblock
5794                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5795                 ) {
5796             h->top_mb_xy -= s->mb_stride;
5797         }
5798         if (left_mb_frame_flag != curr_mb_frame_flag) {
5799             h->left_mb_xy[0] = pair_xy - 1;
5800         }
5801     }
5802     return;
5803 }
5804
5805 /**
5806  * decodes a macroblock
5807  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5808  */
5809 static int decode_mb_cabac(H264Context *h) {
5810     MpegEncContext * const s = &h->s;
5811     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5812     int mb_type, partition_count, cbp = 0;
5813     int dct8x8_allowed= h->pps.transform_8x8_mode;
5814
5815     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5816
5817     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5818     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5819         /* read skip flags */
5820         if( decode_cabac_mb_skip( h ) ) {
5821             decode_mb_skip(h);
5822
5823             h->cbp_table[mb_xy] = 0;
5824             h->chroma_pred_mode_table[mb_xy] = 0;
5825             h->last_qscale_diff = 0;
5826
5827             return 0;
5828
5829         }
5830     }
5831     if(h->mb_aff_frame){
5832         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
5833             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5834     }else
5835         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5836
5837     h->prev_mb_skipped = 0;
5838
5839     compute_mb_neighbors(h);
5840     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5841         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5842         return -1;
5843     }
5844
5845     if( h->slice_type == B_TYPE ) {
5846         if( mb_type < 23 ){
5847             partition_count= b_mb_type_info[mb_type].partition_count;
5848             mb_type=         b_mb_type_info[mb_type].type;
5849         }else{
5850             mb_type -= 23;
5851             goto decode_intra_mb;
5852         }
5853     } else if( h->slice_type == P_TYPE ) {
5854         if( mb_type < 5) {
5855             partition_count= p_mb_type_info[mb_type].partition_count;
5856             mb_type=         p_mb_type_info[mb_type].type;
5857         } else {
5858             mb_type -= 5;
5859             goto decode_intra_mb;
5860         }
5861     } else {
5862        assert(h->slice_type == I_TYPE);
5863 decode_intra_mb:
5864         partition_count = 0;
5865         cbp= i_mb_type_info[mb_type].cbp;
5866         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5867         mb_type= i_mb_type_info[mb_type].type;
5868     }
5869     if(h->mb_field_decoding_flag)
5870         mb_type |= MB_TYPE_INTERLACED;
5871
5872     h->slice_table[ mb_xy ]= h->slice_num;
5873
5874     if(IS_INTRA_PCM(mb_type)) {
5875         const uint8_t *ptr;
5876         unsigned int x, y;
5877
5878         // We assume these blocks are very rare so we dont optimize it.
5879         // FIXME The two following lines get the bitstream position in the cabac
5880         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5881         ptr= h->cabac.bytestream;
5882         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
5883
5884         // The pixels are stored in the same order as levels in h->mb array.
5885         for(y=0; y<16; y++){
5886             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5887             for(x=0; x<16; x++){
5888                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
5889                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5890             }
5891         }
5892         for(y=0; y<8; y++){
5893             const int index= 256 + 4*(y&3) + 32*(y>>2);
5894             for(x=0; x<8; x++){
5895                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5896                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5897             }
5898         }
5899         for(y=0; y<8; y++){
5900             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5901             for(x=0; x<8; x++){
5902                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5903                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5904             }
5905         }
5906
5907         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5908
5909         // All blocks are present
5910         h->cbp_table[mb_xy] = 0x1ef;
5911         h->chroma_pred_mode_table[mb_xy] = 0;
5912         // In deblocking, the quantizer is 0
5913         s->current_picture.qscale_table[mb_xy]= 0;
5914         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5915         // All coeffs are present
5916         memset(h->non_zero_count[mb_xy], 16, 16);
5917         s->current_picture.mb_type[mb_xy]= mb_type;
5918         return 0;
5919     }
5920
5921     fill_caches(h, mb_type, 0);
5922
5923     if( IS_INTRA( mb_type ) ) {
5924         int i;
5925         if( IS_INTRA4x4( mb_type ) ) {
5926             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5927                 mb_type |= MB_TYPE_8x8DCT;
5928                 for( i = 0; i < 16; i+=4 ) {
5929                     int pred = pred_intra_mode( h, i );
5930                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5931                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5932                 }
5933             } else {
5934                 for( i = 0; i < 16; i++ ) {
5935                     int pred = pred_intra_mode( h, i );
5936                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5937
5938                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5939                 }
5940             }
5941             write_back_intra_pred_mode(h);
5942             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5943         } else {
5944             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5945             if( h->intra16x16_pred_mode < 0 ) return -1;
5946         }
5947         h->chroma_pred_mode_table[mb_xy] =
5948             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
5949
5950         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
5951         if( h->chroma_pred_mode < 0 ) return -1;
5952     } else if( partition_count == 4 ) {
5953         int i, j, sub_partition_count[4], list, ref[2][4];
5954
5955         if( h->slice_type == B_TYPE ) {
5956             for( i = 0; i < 4; i++ ) {
5957                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5958                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5959                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5960             }
5961             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5962                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5963                 pred_direct_motion(h, &mb_type);
5964                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5965                     for( i = 0; i < 4; i++ )
5966                         if( IS_DIRECT(h->sub_mb_type[i]) )
5967                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5968                 }
5969             }
5970         } else {
5971             for( i = 0; i < 4; i++ ) {
5972                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5973                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5974                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5975             }
5976         }
5977
5978         for( list = 0; list < 2; list++ ) {
5979             if( h->ref_count[list] > 0 ) {
5980                 for( i = 0; i < 4; i++ ) {
5981                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5982                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5983                         if( h->ref_count[list] > 1 )
5984                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5985                         else
5986                             ref[list][i] = 0;
5987                     } else {
5988                         ref[list][i] = -1;
5989                     }
5990                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5991                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5992                 }
5993             }
5994         }
5995
5996         if(dct8x8_allowed)
5997             dct8x8_allowed = get_dct8x8_allowed(h);
5998
5999         for(list=0; list<2; list++){
6000             for(i=0; i<4; i++){
6001                 if(IS_DIRECT(h->sub_mb_type[i])){
6002                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6003                     continue;
6004                 }
6005                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6006
6007                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6008                     const int sub_mb_type= h->sub_mb_type[i];
6009                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6010                     for(j=0; j<sub_partition_count[i]; j++){
6011                         int mpx, mpy;
6012                         int mx, my;
6013                         const int index= 4*i + block_width*j;
6014                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6015                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6016                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6017
6018                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6019                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6020                         tprintf("final mv:%d %d\n", mx, my);
6021
6022                         if(IS_SUB_8X8(sub_mb_type)){
6023                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
6024                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6025                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
6026                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6027
6028                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
6029                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6030                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
6031                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6032                         }else if(IS_SUB_8X4(sub_mb_type)){
6033                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
6034                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
6035
6036                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
6037                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
6038                         }else if(IS_SUB_4X8(sub_mb_type)){
6039                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
6040                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
6041
6042                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
6043                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
6044                         }else{
6045                             assert(IS_SUB_4X4(sub_mb_type));
6046                             mv_cache[ 0 ][0]= mx;
6047                             mv_cache[ 0 ][1]= my;
6048
6049                             mvd_cache[ 0 ][0]= mx - mpx;
6050                             mvd_cache[ 0 ][1]= my - mpy;
6051                         }
6052                     }
6053                 }else{
6054                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6055                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6056                     p[0] = p[1] = p[8] = p[9] = 0;
6057                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6058                 }
6059             }
6060         }
6061     } else if( IS_DIRECT(mb_type) ) {
6062         pred_direct_motion(h, &mb_type);
6063         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6064         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6065         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6066     } else {
6067         int list, mx, my, i, mpx, mpy;
6068         if(IS_16X16(mb_type)){
6069             for(list=0; list<2; list++){
6070                 if(IS_DIR(mb_type, 0, list)){
6071                     if(h->ref_count[list] > 0 ){
6072                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6073                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6074                     }
6075                 }else
6076                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6077             }
6078             for(list=0; list<2; list++){
6079                 if(IS_DIR(mb_type, 0, list)){
6080                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6081
6082                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6083                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6084                     tprintf("final mv:%d %d\n", mx, my);
6085
6086                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6087                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6088                 }else
6089                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6090             }
6091         }
6092         else if(IS_16X8(mb_type)){
6093             for(list=0; list<2; list++){
6094                 if(h->ref_count[list]>0){
6095                     for(i=0; i<2; i++){
6096                         if(IS_DIR(mb_type, i, list)){
6097                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6098                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6099                         }else
6100                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6101                     }
6102                 }
6103             }
6104             for(list=0; list<2; list++){
6105                 for(i=0; i<2; i++){
6106                     if(IS_DIR(mb_type, i, list)){
6107                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6108                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6109                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6110                         tprintf("final mv:%d %d\n", mx, my);
6111
6112                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6113                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6114                     }else{
6115                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6116                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6117                     }
6118                 }
6119             }
6120         }else{
6121             assert(IS_8X16(mb_type));
6122             for(list=0; list<2; list++){
6123                 if(h->ref_count[list]>0){
6124                     for(i=0; i<2; i++){
6125                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6126                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6127                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6128                         }else
6129                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6130                     }
6131                 }
6132             }
6133             for(list=0; list<2; list++){
6134                 for(i=0; i<2; i++){
6135                     if(IS_DIR(mb_type, i, list)){
6136                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6137                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6138                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6139
6140                         tprintf("final mv:%d %d\n", mx, my);
6141                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6142                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6143                     }else{
6144                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6145                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6146                     }
6147                 }
6148             }
6149         }
6150     }
6151
6152    if( IS_INTER( mb_type ) ) {
6153         h->chroma_pred_mode_table[mb_xy] = 0;
6154         write_back_motion( h, mb_type );
6155    }
6156
6157     if( !IS_INTRA16x16( mb_type ) ) {
6158         cbp  = decode_cabac_mb_cbp_luma( h );
6159         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6160     }
6161
6162     h->cbp_table[mb_xy] = cbp;
6163
6164     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6165         if( decode_cabac_mb_transform_size( h ) )
6166             mb_type |= MB_TYPE_8x8DCT;
6167     }
6168     s->current_picture.mb_type[mb_xy]= mb_type;
6169
6170     if( cbp || IS_INTRA16x16( mb_type ) ) {
6171         const uint8_t *scan, *scan8x8, *dc_scan;
6172         int dqp;
6173
6174         if(IS_INTERLACED(mb_type)){
6175             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6176             dc_scan= luma_dc_field_scan;
6177         }else{
6178             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6179             dc_scan= luma_dc_zigzag_scan;
6180         }
6181         scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6182
6183         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6184         if( dqp == INT_MIN ){
6185             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6186             return -1;
6187         }
6188         s->qscale += dqp;
6189         if(((unsigned)s->qscale) > 51){
6190             if(s->qscale<0) s->qscale+= 52;
6191             else            s->qscale-= 52;
6192         }
6193         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6194
6195         if( IS_INTRA16x16( mb_type ) ) {
6196             int i;
6197             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6198             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6199                 return -1;
6200             if( cbp&15 ) {
6201                 for( i = 0; i < 16; i++ ) {
6202                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6203                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6204                         return -1;
6205                 }
6206             } else {
6207                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6208             }
6209         } else {
6210             int i8x8, i4x4;
6211             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6212                 if( cbp & (1<<i8x8) ) {
6213                     if( IS_8x8DCT(mb_type) ) {
6214                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6215                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6216                             return -1;
6217                     } else
6218                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6219                         const int index = 4*i8x8 + i4x4;
6220                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6221                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6222                             return -1;
6223                     }
6224                 } else {
6225                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6226                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6227                 }
6228             }
6229         }
6230
6231         if( cbp&0x30 ){
6232             int c;
6233             for( c = 0; c < 2; c++ ) {
6234                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6235                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6236                     return -1;
6237             }
6238         }
6239
6240         if( cbp&0x20 ) {
6241             int c, i;
6242             for( c = 0; c < 2; c++ ) {
6243                 for( i = 0; i < 4; i++ ) {
6244                     const int index = 16 + 4 * c + i;
6245                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6246                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6247                         return -1;
6248                 }
6249             }
6250         } else {
6251             uint8_t * const nnz= &h->non_zero_count_cache[0];
6252             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6253             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6254         }
6255     } else {
6256         uint8_t * const nnz= &h->non_zero_count_cache[0];
6257         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6258         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6259         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6260     }
6261
6262     s->current_picture.qscale_table[mb_xy]= s->qscale;
6263     write_back_non_zero_count(h);
6264
6265     return 0;
6266 }
6267
6268
6269 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6270     int i, d;
6271     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6272     const int alpha = alpha_table[index_a];
6273     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6274
6275     if( bS[0] < 4 ) {
6276         int8_t tc[4];
6277         for(i=0; i<4; i++)
6278             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6279         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6280     } else {
6281         /* 16px edge length, because bS=4 is triggered by being at
6282          * the edge of an intra MB, so all 4 bS are the same */
6283             for( d = 0; d < 16; d++ ) {
6284                 const int p0 = pix[-1];
6285                 const int p1 = pix[-2];
6286                 const int p2 = pix[-3];
6287
6288                 const int q0 = pix[0];
6289                 const int q1 = pix[1];
6290                 const int q2 = pix[2];
6291
6292                 if( ABS( p0 - q0 ) < alpha &&
6293                     ABS( p1 - p0 ) < beta &&
6294                     ABS( q1 - q0 ) < beta ) {
6295
6296                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6297                         if( ABS( p2 - p0 ) < beta)
6298                         {
6299                             const int p3 = pix[-4];
6300                             /* p0', p1', p2' */
6301                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6302                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6303                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6304                         } else {
6305                             /* p0' */
6306                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6307                         }
6308                         if( ABS( q2 - q0 ) < beta)
6309                         {
6310                             const int q3 = pix[3];
6311                             /* q0', q1', q2' */
6312                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6313                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6314                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6315                         } else {
6316                             /* q0' */
6317                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6318                         }
6319                     }else{
6320                         /* p0', q0' */
6321                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6322                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6323                     }
6324                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6325                 }
6326                 pix += stride;
6327             }
6328     }
6329 }
6330 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6331     int i;
6332     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6333     const int alpha = alpha_table[index_a];
6334     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6335
6336     if( bS[0] < 4 ) {
6337         int8_t tc[4];
6338         for(i=0; i<4; i++)
6339             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6340         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6341     } else {
6342         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6343     }
6344 }
6345
6346 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
6347     int i;
6348     for( i = 0; i < 16; i++, pix += stride) {
6349         int index_a;
6350         int alpha;
6351         int beta;
6352
6353         int qp_index;
6354         int bS_index = (i >> 1);
6355         if (h->mb_field_decoding_flag) {
6356             bS_index &= ~1;
6357             bS_index |= (i & 1);
6358         }
6359
6360         if( bS[bS_index] == 0 ) {
6361             continue;
6362         }
6363
6364         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
6365         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6366         alpha = alpha_table[index_a];
6367         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6368
6369
6370         if( bS[bS_index] < 4 ) {
6371             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
6372             /* 4px edge length */
6373             const int p0 = pix[-1];
6374             const int p1 = pix[-2];
6375             const int p2 = pix[-3];
6376             const int q0 = pix[0];
6377             const int q1 = pix[1];
6378             const int q2 = pix[2];
6379
6380             if( ABS( p0 - q0 ) < alpha &&
6381                 ABS( p1 - p0 ) < beta &&
6382                 ABS( q1 - q0 ) < beta ) {
6383                 int tc = tc0;
6384                 int i_delta;
6385
6386                 if( ABS( p2 - p0 ) < beta ) {
6387                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6388                     tc++;
6389                 }
6390                 if( ABS( q2 - q0 ) < beta ) {
6391                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6392                     tc++;
6393                 }
6394
6395                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6396                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6397                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6398                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6399             }
6400         }else{
6401             /* 4px edge length */
6402             const int p0 = pix[-1];
6403             const int p1 = pix[-2];
6404             const int p2 = pix[-3];
6405
6406             const int q0 = pix[0];
6407             const int q1 = pix[1];
6408             const int q2 = pix[2];
6409
6410             if( ABS( p0 - q0 ) < alpha &&
6411                 ABS( p1 - p0 ) < beta &&
6412                 ABS( q1 - q0 ) < beta ) {
6413
6414                 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6415                     if( ABS( p2 - p0 ) < beta)
6416                     {
6417                         const int p3 = pix[-4];
6418                         /* p0', p1', p2' */
6419                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6420                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6421                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6422                     } else {
6423                         /* p0' */
6424                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6425                     }
6426                     if( ABS( q2 - q0 ) < beta)
6427                     {
6428                         const int q3 = pix[3];
6429                         /* q0', q1', q2' */
6430                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6431                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6432                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6433                     } else {
6434                         /* q0' */
6435                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6436                     }
6437                 }else{
6438                     /* p0', q0' */
6439                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6440                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6441                 }
6442                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6443             }
6444         }
6445     }
6446 }
6447 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp[2] ) {
6448     int i;
6449     for( i = 0; i < 8; i++, pix += stride) {
6450         int index_a;
6451         int alpha;
6452         int beta;
6453
6454         int qp_index;
6455         int bS_index = i;
6456
6457         if( bS[bS_index] == 0 ) {
6458             continue;
6459         }
6460
6461         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
6462         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6463         alpha = alpha_table[index_a];
6464         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6465         if( bS[bS_index] < 4 ) {
6466             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
6467             /* 2px edge length (because we use same bS than the one for luma) */
6468             const int p0 = pix[-1];
6469             const int p1 = pix[-2];
6470             const int q0 = pix[0];
6471             const int q1 = pix[1];
6472
6473             if( ABS( p0 - q0 ) < alpha &&
6474                 ABS( p1 - p0 ) < beta &&
6475                 ABS( q1 - q0 ) < beta ) {
6476                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6477
6478                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6479                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6480                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6481             }
6482         }else{
6483             const int p0 = pix[-1];
6484             const int p1 = pix[-2];
6485             const int q0 = pix[0];
6486             const int q1 = pix[1];
6487
6488             if( ABS( p0 - q0 ) < alpha &&
6489                 ABS( p1 - p0 ) < beta &&
6490                 ABS( q1 - q0 ) < beta ) {
6491
6492                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6493                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6494                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6495             }
6496         }
6497     }
6498 }
6499
6500 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6501     int i, d;
6502     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6503     const int alpha = alpha_table[index_a];
6504     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6505     const int pix_next  = stride;
6506
6507     if( bS[0] < 4 ) {
6508         int8_t tc[4];
6509         for(i=0; i<4; i++)
6510             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6511         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6512     } else {
6513         /* 16px edge length, see filter_mb_edgev */
6514             for( d = 0; d < 16; d++ ) {
6515                 const int p0 = pix[-1*pix_next];
6516                 const int p1 = pix[-2*pix_next];
6517                 const int p2 = pix[-3*pix_next];
6518                 const int q0 = pix[0];
6519                 const int q1 = pix[1*pix_next];
6520                 const int q2 = pix[2*pix_next];
6521
6522                 if( ABS( p0 - q0 ) < alpha &&
6523                     ABS( p1 - p0 ) < beta &&
6524                     ABS( q1 - q0 ) < beta ) {
6525
6526                     const int p3 = pix[-4*pix_next];
6527                     const int q3 = pix[ 3*pix_next];
6528
6529                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6530                         if( ABS( p2 - p0 ) < beta) {
6531                             /* p0', p1', p2' */
6532                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6533                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6534                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6535                         } else {
6536                             /* p0' */
6537                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6538                         }
6539                         if( ABS( q2 - q0 ) < beta) {
6540                             /* q0', q1', q2' */
6541                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6542                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6543                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6544                         } else {
6545                             /* q0' */
6546                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6547                         }
6548                     }else{
6549                         /* p0', q0' */
6550                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6551                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6552                     }
6553                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6554                 }
6555                 pix++;
6556             }
6557     }
6558 }
6559
6560 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6561     int i;
6562     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6563     const int alpha = alpha_table[index_a];
6564     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6565
6566     if( bS[0] < 4 ) {
6567         int8_t tc[4];
6568         for(i=0; i<4; i++)
6569             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6570         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6571     } else {
6572         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6573     }
6574 }
6575
6576 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6577     MpegEncContext * const s = &h->s;
6578     const int mb_xy= mb_x + mb_y*s->mb_stride;
6579     int first_vertical_edge_done = 0;
6580     int dir;
6581     /* FIXME: A given frame may occupy more than one position in
6582      * the reference list. So ref2frm should be populated with
6583      * frame numbers, not indices. */
6584     static const int ref2frm[18] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
6585
6586     //for sufficiently low qp, filtering wouldn't do anything
6587     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6588     if(!h->mb_aff_frame){
6589         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
6590         int qp = s->current_picture.qscale_table[mb_xy];
6591         if(qp <= qp_thresh
6592            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6593            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6594             return;
6595         }
6596     }
6597
6598     if (h->mb_aff_frame
6599             // left mb is in picture
6600             && h->slice_table[mb_xy-1] != 255
6601             // and current and left pair do not have the same interlaced type
6602             && (IS_INTERLACED(s->current_picture.mb_type[mb_xy]) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6603             // and left mb is in the same slice if deblocking_filter == 2
6604             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6605         /* First vertical edge is different in MBAFF frames
6606          * There are 8 different bS to compute and 2 different Qp
6607          */
6608         int bS[8];
6609         int qp[2];
6610         int chroma_qp[2];
6611
6612         int i;
6613         first_vertical_edge_done = 1;
6614         for( i = 0; i < 8; i++ ) {
6615             int y = i>>1;
6616             int b_idx= 8 + 4 + 8*y;
6617             int bn_idx= b_idx - 1;
6618
6619             int mbn_xy = h->mb_field_decoding_flag ? h->left_mb_xy[i>>2] : h->left_mb_xy[i&1];
6620
6621             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6622                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6623                 bS[i] = 4;
6624             } else if( h->non_zero_count_cache[b_idx] != 0 ||
6625                 /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6626                 h->non_zero_count_cache[bn_idx] != 0 ) {
6627                 bS[i] = 2;
6628             } else {
6629                 int l;
6630                 bS[i] = 0;
6631                 for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6632                     if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6633                         ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6634                         ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6635                         bS[i] = 1;
6636                         break;
6637                     }
6638                 }
6639             }
6640         }
6641         if(bS[0]+bS[1]+bS[2]+bS[3] != 0) {
6642             // Do not use s->qscale as luma quantizer because it has not the same
6643             // value in IPCM macroblocks.
6644             qp[0] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[0]] + 1 ) >> 1;
6645             chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6646                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[0]] ) + 1 ) >> 1;
6647             qp[1] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[1]] + 1 ) >> 1;
6648             chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6649                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[1]] ) + 1 ) >> 1;
6650
6651             /* Filter edge */
6652             tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
6653             { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6654             filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6655             filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
6656             filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
6657         }
6658     }
6659     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6660     for( dir = 0; dir < 2; dir++ )
6661     {
6662         int edge;
6663         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6664         const int mb_type = s->current_picture.mb_type[mb_xy];
6665         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6666         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6667
6668         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6669                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6670         // how often to recheck mv-based bS when iterating between edges
6671         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6672                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6673         // how often to recheck mv-based bS when iterating along each edge
6674         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6675
6676         if (first_vertical_edge_done) {
6677             start = 1;
6678             first_vertical_edge_done = 0;
6679         }
6680
6681         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6682             start = 1;
6683
6684         /* Calculate bS */
6685         for( edge = start; edge < edges; edge++ ) {
6686             /* mbn_xy: neighbor macroblock */
6687             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6688             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6689             int bS[4];
6690             int qp;
6691
6692             if( (edge&1) && IS_8x8DCT(mb_type) )
6693                 continue;
6694
6695             if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
6696                 && !IS_INTERLACED(mb_type)
6697                 && IS_INTERLACED(mbn_type)
6698                 ) {
6699                 // This is a special case in the norm where the filtering must
6700                 // be done twice (one each of the field) even if we are in a
6701                 // frame macroblock.
6702                 //
6703                 unsigned int tmp_linesize   = 2 *   linesize;
6704                 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6705                 int mbn_xy = mb_xy - 2 * s->mb_stride;
6706                 int qp, chroma_qp;
6707
6708                 // first filtering
6709                 if( IS_INTRA(mb_type) ||
6710                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6711                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6712                 } else {
6713                     // TODO
6714                     av_log(h->s.avctx, AV_LOG_ERROR, "both non intra (TODO)\n");
6715                 }
6716                 /* Filter edge */
6717                 // Do not use s->qscale as luma quantizer because it has not the same
6718                 // value in IPCM macroblocks.
6719                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6720                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6721                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6722                 filter_mb_edgeh( h, &img_y[0], tmp_linesize, bS, qp );
6723                 chroma_qp = ( h->chroma_qp +
6724                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6725                 filter_mb_edgech( h, &img_cb[0], tmp_uvlinesize, bS, chroma_qp );
6726                 filter_mb_edgech( h, &img_cr[0], tmp_uvlinesize, bS, chroma_qp );
6727
6728                 // second filtering
6729                 mbn_xy += s->mb_stride;
6730                 if( IS_INTRA(mb_type) ||
6731                     IS_INTRA(mbn_type) ) {
6732                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6733                 } else {
6734                     // TODO
6735                     av_log(h->s.avctx, AV_LOG_ERROR, "both non intra (TODO)\n");
6736                 }
6737                 /* Filter edge */
6738                 // Do not use s->qscale as luma quantizer because it has not the same
6739                 // value in IPCM macroblocks.
6740                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6741                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6742                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6743                 filter_mb_edgeh( h, &img_y[linesize], tmp_linesize, bS, qp );
6744                 chroma_qp = ( h->chroma_qp +
6745                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6746                 filter_mb_edgech( h, &img_cb[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6747                 filter_mb_edgech( h, &img_cr[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6748                 continue;
6749             }
6750             if( IS_INTRA(mb_type) ||
6751                 IS_INTRA(mbn_type) ) {
6752                 int value;
6753                 if (edge == 0) {
6754                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6755                         || ((h->mb_aff_frame || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6756                     ) {
6757                         value = 4;
6758                     } else {
6759                         value = 3;
6760                     }
6761                 } else {
6762                     value = 3;
6763                 }
6764                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6765             } else {
6766                 int i, l;
6767                 int mv_done;
6768
6769                 if( edge & mask_edge ) {
6770                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6771                     mv_done = 1;
6772                 }
6773                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6774                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6775                     int bn_idx= b_idx - (dir ? 8:1);
6776                     int v = 0;
6777                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6778                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6779                              ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6780                              ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4;
6781                     }
6782                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6783                     mv_done = 1;
6784                 }
6785                 else
6786                     mv_done = 0;
6787
6788                 for( i = 0; i < 4; i++ ) {
6789                     int x = dir == 0 ? edge : i;
6790                     int y = dir == 0 ? i    : edge;
6791                     int b_idx= 8 + 4 + x + 8*y;
6792                     int bn_idx= b_idx - (dir ? 8:1);
6793
6794                     if( h->non_zero_count_cache[b_idx] != 0 ||
6795                         h->non_zero_count_cache[bn_idx] != 0 ) {
6796                         bS[i] = 2;
6797                     }
6798                     else if(!mv_done)
6799                     {
6800                         bS[i] = 0;
6801                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6802                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6803                                 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6804                                 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6805                                 bS[i] = 1;
6806                                 break;
6807                             }
6808                         }
6809                     }
6810                 }
6811
6812                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6813                     continue;
6814             }
6815
6816             /* Filter edge */
6817             // Do not use s->qscale as luma quantizer because it has not the same
6818             // value in IPCM macroblocks.
6819             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6820             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6821             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6822             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6823             if( dir == 0 ) {
6824                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6825                 if( (edge&1) == 0 ) {
6826                     int chroma_qp = ( h->chroma_qp +
6827                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6828                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
6829                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
6830                 }
6831             } else {
6832                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6833                 if( (edge&1) == 0 ) {
6834                     int chroma_qp = ( h->chroma_qp +
6835                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6836                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6837                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6838                 }
6839             }
6840         }
6841     }
6842 }
6843
6844 static int decode_slice(H264Context *h){
6845     MpegEncContext * const s = &h->s;
6846     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6847
6848     s->mb_skip_run= -1;
6849
6850     if( h->pps.cabac ) {
6851         int i;
6852
6853         /* realign */
6854         align_get_bits( &s->gb );
6855
6856         /* init cabac */
6857         ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
6858         ff_init_cabac_decoder( &h->cabac,
6859                                s->gb.buffer + get_bits_count(&s->gb)/8,
6860                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6861         /* calculate pre-state */
6862         for( i= 0; i < 460; i++ ) {
6863             int pre;
6864             if( h->slice_type == I_TYPE )
6865                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6866             else
6867                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6868
6869             if( pre <= 63 )
6870                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6871             else
6872                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6873         }
6874
6875         for(;;){
6876             int ret = decode_mb_cabac(h);
6877             int eos;
6878
6879             if(ret>=0) hl_decode_mb(h);
6880
6881             /* XXX: useless as decode_mb_cabac it doesn't support that ... */
6882             if( ret >= 0 && h->mb_aff_frame ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6883                 s->mb_y++;
6884
6885                 if(ret>=0) ret = decode_mb_cabac(h);
6886
6887                 if(ret>=0) hl_decode_mb(h);
6888                 s->mb_y--;
6889             }
6890             eos = get_cabac_terminate( &h->cabac );
6891
6892             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
6893                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6894                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6895                 return -1;
6896             }
6897
6898             if( ++s->mb_x >= s->mb_width ) {
6899                 s->mb_x = 0;
6900                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6901                 ++s->mb_y;
6902                 if(h->mb_aff_frame) {
6903                     ++s->mb_y;
6904                 }
6905             }
6906
6907             if( eos || s->mb_y >= s->mb_height ) {
6908                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6909                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6910                 return 0;
6911             }
6912         }
6913
6914     } else {
6915         for(;;){
6916             int ret = decode_mb_cavlc(h);
6917
6918             if(ret>=0) hl_decode_mb(h);
6919
6920             if(ret>=0 && h->mb_aff_frame){ //FIXME optimal? or let mb_decode decode 16x32 ?
6921                 s->mb_y++;
6922                 ret = decode_mb_cavlc(h);
6923
6924                 if(ret>=0) hl_decode_mb(h);
6925                 s->mb_y--;
6926             }
6927
6928             if(ret<0){
6929                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6930                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6931
6932                 return -1;
6933             }
6934
6935             if(++s->mb_x >= s->mb_width){
6936                 s->mb_x=0;
6937                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6938                 ++s->mb_y;
6939                 if(h->mb_aff_frame) {
6940                     ++s->mb_y;
6941                 }
6942                 if(s->mb_y >= s->mb_height){
6943                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6944
6945                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6946                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6947
6948                         return 0;
6949                     }else{
6950                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6951
6952                         return -1;
6953                     }
6954                 }
6955             }
6956
6957             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6958                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6959                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6960                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6961
6962                     return 0;
6963                 }else{
6964                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6965
6966                     return -1;
6967                 }
6968             }
6969         }
6970     }
6971
6972 #if 0
6973     for(;s->mb_y < s->mb_height; s->mb_y++){
6974         for(;s->mb_x < s->mb_width; s->mb_x++){
6975             int ret= decode_mb(h);
6976
6977             hl_decode_mb(h);
6978
6979             if(ret<0){
6980                 fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6981                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6982
6983                 return -1;
6984             }
6985
6986             if(++s->mb_x >= s->mb_width){
6987                 s->mb_x=0;
6988                 if(++s->mb_y >= s->mb_height){
6989                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6990                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6991
6992                         return 0;
6993                     }else{
6994                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6995
6996                         return -1;
6997                     }
6998                 }
6999             }
7000
7001             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7002                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7003                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7004
7005                     return 0;
7006                 }else{
7007                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7008
7009                     return -1;
7010                 }
7011             }
7012         }
7013         s->mb_x=0;
7014         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7015     }
7016 #endif
7017     return -1; //not reached
7018 }
7019
7020 static int decode_unregistered_user_data(H264Context *h, int size){
7021     MpegEncContext * const s = &h->s;
7022     uint8_t user_data[16+256];
7023     int e, build, i;
7024
7025     if(size<16)
7026         return -1;
7027
7028     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7029         user_data[i]= get_bits(&s->gb, 8);
7030     }
7031
7032     user_data[i]= 0;
7033     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7034     if(e==1 && build>=0)
7035         h->x264_build= build;
7036
7037     if(s->avctx->debug & FF_DEBUG_BUGS)
7038         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7039
7040     for(; i<size; i++)
7041         skip_bits(&s->gb, 8);
7042
7043     return 0;
7044 }
7045
7046 static int decode_sei(H264Context *h){
7047     MpegEncContext * const s = &h->s;
7048
7049     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7050         int size, type;
7051
7052         type=0;
7053         do{
7054             type+= show_bits(&s->gb, 8);
7055         }while(get_bits(&s->gb, 8) == 255);
7056
7057         size=0;
7058         do{
7059             size+= show_bits(&s->gb, 8);
7060         }while(get_bits(&s->gb, 8) == 255);
7061
7062         switch(type){
7063         case 5:
7064             if(decode_unregistered_user_data(h, size) < 0);
7065                 return -1;
7066             break;
7067         default:
7068             skip_bits(&s->gb, 8*size);
7069         }
7070
7071         //FIXME check bits here
7072         align_get_bits(&s->gb);
7073     }
7074
7075     return 0;
7076 }
7077
7078 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7079     MpegEncContext * const s = &h->s;
7080     int cpb_count, i;
7081     cpb_count = get_ue_golomb(&s->gb) + 1;
7082     get_bits(&s->gb, 4); /* bit_rate_scale */
7083     get_bits(&s->gb, 4); /* cpb_size_scale */
7084     for(i=0; i<cpb_count; i++){
7085         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7086         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7087         get_bits1(&s->gb);     /* cbr_flag */
7088     }
7089     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7090     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7091     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7092     get_bits(&s->gb, 5); /* time_offset_length */
7093 }
7094
7095 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7096     MpegEncContext * const s = &h->s;
7097     int aspect_ratio_info_present_flag, aspect_ratio_idc;
7098     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7099
7100     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7101
7102     if( aspect_ratio_info_present_flag ) {
7103         aspect_ratio_idc= get_bits(&s->gb, 8);
7104         if( aspect_ratio_idc == EXTENDED_SAR ) {
7105             sps->sar.num= get_bits(&s->gb, 16);
7106             sps->sar.den= get_bits(&s->gb, 16);
7107         }else if(aspect_ratio_idc < 16){
7108             sps->sar=  pixel_aspect[aspect_ratio_idc];
7109         }else{
7110             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7111             return -1;
7112         }
7113     }else{
7114         sps->sar.num=
7115         sps->sar.den= 0;
7116     }
7117 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7118
7119     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7120         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7121     }
7122
7123     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7124         get_bits(&s->gb, 3);    /* video_format */
7125         get_bits1(&s->gb);      /* video_full_range_flag */
7126         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7127             get_bits(&s->gb, 8); /* colour_primaries */
7128             get_bits(&s->gb, 8); /* transfer_characteristics */
7129             get_bits(&s->gb, 8); /* matrix_coefficients */
7130         }
7131     }
7132
7133     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7134         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7135         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7136     }
7137
7138     sps->timing_info_present_flag = get_bits1(&s->gb);
7139     if(sps->timing_info_present_flag){
7140         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7141         sps->time_scale = get_bits_long(&s->gb, 32);
7142         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7143     }
7144
7145     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7146     if(nal_hrd_parameters_present_flag)
7147         decode_hrd_parameters(h, sps);
7148     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7149     if(vcl_hrd_parameters_present_flag)
7150         decode_hrd_parameters(h, sps);
7151     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7152         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7153     get_bits1(&s->gb);         /* pic_struct_present_flag */
7154
7155     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7156     if(sps->bitstream_restriction_flag){
7157         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7158         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7159         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7160         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7161         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7162         sps->num_reorder_frames = get_ue_golomb(&s->gb);
7163         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
7164     }
7165
7166     return 0;
7167 }
7168
7169 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7170                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7171     MpegEncContext * const s = &h->s;
7172     int i, last = 8, next = 8;
7173     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7174     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7175         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7176     else
7177     for(i=0;i<size;i++){
7178         if(next)
7179             next = (last + get_se_golomb(&s->gb)) & 0xff;
7180         if(!i && !next){ /* matrix not written, we use the preset one */
7181             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7182             break;
7183         }
7184         last = factors[scan[i]] = next ? next : last;
7185     }
7186 }
7187
7188 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7189                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7190     MpegEncContext * const s = &h->s;
7191     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7192     const uint8_t *fallback[4] = {
7193         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7194         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7195         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7196         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7197     };
7198     if(get_bits1(&s->gb)){
7199         sps->scaling_matrix_present |= is_sps;
7200         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7201         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7202         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7203         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7204         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7205         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7206         if(is_sps || pps->transform_8x8_mode){
7207             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7208             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7209         }
7210     } else if(fallback_sps) {
7211         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7212         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7213     }
7214 }
7215
7216 static inline int decode_seq_parameter_set(H264Context *h){
7217     MpegEncContext * const s = &h->s;
7218     int profile_idc, level_idc;
7219     int sps_id, i;
7220     SPS *sps;
7221
7222     profile_idc= get_bits(&s->gb, 8);
7223     get_bits1(&s->gb);   //constraint_set0_flag
7224     get_bits1(&s->gb);   //constraint_set1_flag
7225     get_bits1(&s->gb);   //constraint_set2_flag
7226     get_bits1(&s->gb);   //constraint_set3_flag
7227     get_bits(&s->gb, 4); // reserved
7228     level_idc= get_bits(&s->gb, 8);
7229     sps_id= get_ue_golomb(&s->gb);
7230
7231     sps= &h->sps_buffer[ sps_id ];
7232     sps->profile_idc= profile_idc;
7233     sps->level_idc= level_idc;
7234
7235     if(sps->profile_idc >= 100){ //high profile
7236         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7237             get_bits1(&s->gb);  //residual_color_transform_flag
7238         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7239         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7240         sps->transform_bypass = get_bits1(&s->gb);
7241         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7242     }else
7243         sps->scaling_matrix_present = 0;
7244
7245     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7246     sps->poc_type= get_ue_golomb(&s->gb);
7247
7248     if(sps->poc_type == 0){ //FIXME #define
7249         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7250     } else if(sps->poc_type == 1){//FIXME #define
7251         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7252         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7253         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7254         sps->poc_cycle_length= get_ue_golomb(&s->gb);
7255
7256         for(i=0; i<sps->poc_cycle_length; i++)
7257             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7258     }
7259     if(sps->poc_type > 2){
7260         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7261         return -1;
7262     }
7263
7264     sps->ref_frame_count= get_ue_golomb(&s->gb);
7265     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
7266         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7267     }
7268     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7269     sps->mb_width= get_ue_golomb(&s->gb) + 1;
7270     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7271     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7272        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
7273         return -1;
7274
7275     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7276     if(!sps->frame_mbs_only_flag)
7277         sps->mb_aff= get_bits1(&s->gb);
7278     else
7279         sps->mb_aff= 0;
7280
7281     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7282
7283     sps->crop= get_bits1(&s->gb);
7284     if(sps->crop){
7285         sps->crop_left  = get_ue_golomb(&s->gb);
7286         sps->crop_right = get_ue_golomb(&s->gb);
7287         sps->crop_top   = get_ue_golomb(&s->gb);
7288         sps->crop_bottom= get_ue_golomb(&s->gb);
7289         if(sps->crop_left || sps->crop_top){
7290             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7291         }
7292     }else{
7293         sps->crop_left  =
7294         sps->crop_right =
7295         sps->crop_top   =
7296         sps->crop_bottom= 0;
7297     }
7298
7299     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7300     if( sps->vui_parameters_present_flag )
7301         decode_vui_parameters(h, sps);
7302
7303     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7304         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7305                sps_id, sps->profile_idc, sps->level_idc,
7306                sps->poc_type,
7307                sps->ref_frame_count,
7308                sps->mb_width, sps->mb_height,
7309                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7310                sps->direct_8x8_inference_flag ? "8B8" : "",
7311                sps->crop_left, sps->crop_right,
7312                sps->crop_top, sps->crop_bottom,
7313                sps->vui_parameters_present_flag ? "VUI" : ""
7314                );
7315     }
7316     return 0;
7317 }
7318
7319 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7320     MpegEncContext * const s = &h->s;
7321     int pps_id= get_ue_golomb(&s->gb);
7322     PPS *pps= &h->pps_buffer[pps_id];
7323
7324     pps->sps_id= get_ue_golomb(&s->gb);
7325     pps->cabac= get_bits1(&s->gb);
7326     pps->pic_order_present= get_bits1(&s->gb);
7327     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7328     if(pps->slice_group_count > 1 ){
7329         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7330         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7331         switch(pps->mb_slice_group_map_type){
7332         case 0:
7333 #if 0
7334 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7335 |    run_length[ i ]                                |1  |ue(v)   |
7336 #endif
7337             break;
7338         case 2:
7339 #if 0
7340 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7341 |{                                                  |   |        |
7342 |    top_left_mb[ i ]                               |1  |ue(v)   |
7343 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7344 |   }                                               |   |        |
7345 #endif
7346             break;
7347         case 3:
7348         case 4:
7349         case 5:
7350 #if 0
7351 |   slice_group_change_direction_flag               |1  |u(1)    |
7352 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7353 #endif
7354             break;
7355         case 6:
7356 #if 0
7357 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7358 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7359 |)                                                  |   |        |
7360 |    slice_group_id[ i ]                            |1  |u(v)    |
7361 #endif
7362             break;
7363         }
7364     }
7365     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7366     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7367     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
7368         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7369         return -1;
7370     }
7371
7372     pps->weighted_pred= get_bits1(&s->gb);
7373     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7374     pps->init_qp= get_se_golomb(&s->gb) + 26;
7375     pps->init_qs= get_se_golomb(&s->gb) + 26;
7376     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7377     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7378     pps->constrained_intra_pred= get_bits1(&s->gb);
7379     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7380
7381     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7382     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7383
7384     if(get_bits_count(&s->gb) < bit_length){
7385         pps->transform_8x8_mode= get_bits1(&s->gb);
7386         decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7387         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7388     }
7389
7390     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7391         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7392                pps_id, pps->sps_id,
7393                pps->cabac ? "CABAC" : "CAVLC",
7394                pps->slice_group_count,
7395                pps->ref_count[0], pps->ref_count[1],
7396                pps->weighted_pred ? "weighted" : "",
7397                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7398                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7399                pps->constrained_intra_pred ? "CONSTR" : "",
7400                pps->redundant_pic_cnt_present ? "REDU" : "",
7401                pps->transform_8x8_mode ? "8x8DCT" : ""
7402                );
7403     }
7404
7405     return 0;
7406 }
7407
7408 /**
7409  * finds the end of the current frame in the bitstream.
7410  * @return the position of the first byte of the next frame, or -1
7411  */
7412 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7413     int i;
7414     uint32_t state;
7415     ParseContext *pc = &(h->s.parse_context);
7416 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7417 //    mb_addr= pc->mb_addr - 1;
7418     state= pc->state;
7419     for(i=0; i<=buf_size; i++){
7420         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7421             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7422             if(pc->frame_start_found){
7423                 // If there isn't one more byte in the buffer
7424                 // the test on first_mb_in_slice cannot be done yet
7425                 // do it at next call.
7426                 if (i >= buf_size) break;
7427                 if (buf[i] & 0x80) {
7428                     // first_mb_in_slice is 0, probably the first nal of a new
7429                     // slice
7430                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7431                     pc->state=-1;
7432                     pc->frame_start_found= 0;
7433                     return i-4;
7434                 }
7435             }
7436             pc->frame_start_found = 1;
7437         }
7438         if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
7439            if(pc->frame_start_found){
7440                 pc->state=-1;
7441                 pc->frame_start_found= 0;
7442                 return i-4;
7443            }
7444         }
7445         if (i<buf_size)
7446             state= (state<<8) | buf[i];
7447     }
7448
7449     pc->state= state;
7450     return END_NOT_FOUND;
7451 }
7452
7453 static int h264_parse(AVCodecParserContext *s,
7454                       AVCodecContext *avctx,
7455                       uint8_t **poutbuf, int *poutbuf_size,
7456                       const uint8_t *buf, int buf_size)
7457 {
7458     H264Context *h = s->priv_data;
7459     ParseContext *pc = &h->s.parse_context;
7460     int next;
7461
7462     next= find_frame_end(h, buf, buf_size);
7463
7464     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
7465         *poutbuf = NULL;
7466         *poutbuf_size = 0;
7467         return buf_size;
7468     }
7469
7470     *poutbuf = (uint8_t *)buf;
7471     *poutbuf_size = buf_size;
7472     return next;
7473 }
7474
7475 static int h264_split(AVCodecContext *avctx,
7476                       const uint8_t *buf, int buf_size)
7477 {
7478     int i;
7479     uint32_t state = -1;
7480     int has_sps= 0;
7481
7482     for(i=0; i<=buf_size; i++){
7483         if((state&0xFFFFFF1F) == 0x107)
7484             has_sps=1;
7485 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7486         }*/
7487         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
7488             if(has_sps){
7489                 while(i>4 && buf[i-5]==0) i--;
7490                 return i-4;
7491             }
7492         }
7493         if (i<buf_size)
7494             state= (state<<8) | buf[i];
7495     }
7496     return 0;
7497 }
7498
7499
7500 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7501     MpegEncContext * const s = &h->s;
7502     AVCodecContext * const avctx= s->avctx;
7503     int buf_index=0;
7504 #if 0
7505     int i;
7506     for(i=0; i<50; i++){
7507         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7508     }
7509 #endif
7510     h->slice_num = 0;
7511     s->current_picture_ptr= NULL;
7512     for(;;){
7513         int consumed;
7514         int dst_length;
7515         int bit_length;
7516         uint8_t *ptr;
7517         int i, nalsize = 0;
7518
7519       if(h->is_avc) {
7520         if(buf_index >= buf_size) break;
7521         nalsize = 0;
7522         for(i = 0; i < h->nal_length_size; i++)
7523             nalsize = (nalsize << 8) | buf[buf_index++];
7524         if(nalsize <= 1){
7525             if(nalsize == 1){
7526                 buf_index++;
7527                 continue;
7528             }else{
7529                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7530                 break;
7531             }
7532         }
7533       } else {
7534         // start code prefix search
7535         for(; buf_index + 3 < buf_size; buf_index++){
7536             // this should allways succeed in the first iteration
7537             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7538                 break;
7539         }
7540
7541         if(buf_index+3 >= buf_size) break;
7542
7543         buf_index+=3;
7544       }
7545
7546         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7547         if(ptr[dst_length - 1] == 0) dst_length--;
7548         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
7549
7550         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7551             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7552         }
7553
7554         if (h->is_avc && (nalsize != consumed))
7555             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7556
7557         buf_index += consumed;
7558
7559         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
7560            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7561             continue;
7562
7563         switch(h->nal_unit_type){
7564         case NAL_IDR_SLICE:
7565             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7566         case NAL_SLICE:
7567             init_get_bits(&s->gb, ptr, bit_length);
7568             h->intra_gb_ptr=
7569             h->inter_gb_ptr= &s->gb;
7570             s->data_partitioning = 0;
7571
7572             if(decode_slice_header(h) < 0){
7573                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7574                 break;
7575             }
7576             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
7577             if(h->redundant_pic_count==0 && s->hurry_up < 5
7578                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7579                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7580                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7581                && avctx->skip_frame < AVDISCARD_ALL)
7582                 decode_slice(h);
7583             break;
7584         case NAL_DPA:
7585             init_get_bits(&s->gb, ptr, bit_length);
7586             h->intra_gb_ptr=
7587             h->inter_gb_ptr= NULL;
7588             s->data_partitioning = 1;
7589
7590             if(decode_slice_header(h) < 0){
7591                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7592             }
7593             break;
7594         case NAL_DPB:
7595             init_get_bits(&h->intra_gb, ptr, bit_length);
7596             h->intra_gb_ptr= &h->intra_gb;
7597             break;
7598         case NAL_DPC:
7599             init_get_bits(&h->inter_gb, ptr, bit_length);
7600             h->inter_gb_ptr= &h->inter_gb;
7601
7602             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
7603                && s->hurry_up < 5
7604                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7605                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7606                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7607                && avctx->skip_frame < AVDISCARD_ALL)
7608                 decode_slice(h);
7609             break;
7610         case NAL_SEI:
7611             init_get_bits(&s->gb, ptr, bit_length);
7612             decode_sei(h);
7613             break;
7614         case NAL_SPS:
7615             init_get_bits(&s->gb, ptr, bit_length);
7616             decode_seq_parameter_set(h);
7617
7618             if(s->flags& CODEC_FLAG_LOW_DELAY)
7619                 s->low_delay=1;
7620
7621             if(avctx->has_b_frames < 2)
7622                 avctx->has_b_frames= !s->low_delay;
7623             break;
7624         case NAL_PPS:
7625             init_get_bits(&s->gb, ptr, bit_length);
7626
7627             decode_picture_parameter_set(h, bit_length);
7628
7629             break;
7630         case NAL_AUD:
7631         case NAL_END_SEQUENCE:
7632         case NAL_END_STREAM:
7633         case NAL_FILLER_DATA:
7634         case NAL_SPS_EXT:
7635         case NAL_AUXILIARY_SLICE:
7636             break;
7637         default:
7638             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
7639         }
7640     }
7641
7642     if(!s->current_picture_ptr) return buf_index; //no frame
7643
7644     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7645     s->current_picture_ptr->pict_type= s->pict_type;
7646
7647     h->prev_frame_num_offset= h->frame_num_offset;
7648     h->prev_frame_num= h->frame_num;
7649     if(s->current_picture_ptr->reference){
7650         h->prev_poc_msb= h->poc_msb;
7651         h->prev_poc_lsb= h->poc_lsb;
7652     }
7653     if(s->current_picture_ptr->reference)
7654         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7655
7656     ff_er_frame_end(s);
7657
7658     MPV_frame_end(s);
7659
7660     return buf_index;
7661 }
7662
7663 /**
7664  * returns the number of bytes consumed for building the current frame
7665  */
7666 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7667     if(s->flags&CODEC_FLAG_TRUNCATED){
7668         pos -= s->parse_context.last_index;
7669         if(pos<0) pos=0; // FIXME remove (unneeded?)
7670
7671         return pos;
7672     }else{
7673         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
7674         if(pos+10>buf_size) pos=buf_size; // oops ;)
7675
7676         return pos;
7677     }
7678 }
7679
7680 static int decode_frame(AVCodecContext *avctx,
7681                              void *data, int *data_size,
7682                              uint8_t *buf, int buf_size)
7683 {
7684     H264Context *h = avctx->priv_data;
7685     MpegEncContext *s = &h->s;
7686     AVFrame *pict = data;
7687     int buf_index;
7688
7689     s->flags= avctx->flags;
7690     s->flags2= avctx->flags2;
7691
7692    /* no supplementary picture */
7693     if (buf_size == 0) {
7694         return 0;
7695     }
7696
7697     if(s->flags&CODEC_FLAG_TRUNCATED){
7698         int next= find_frame_end(h, buf, buf_size);
7699
7700         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7701             return buf_size;
7702 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7703     }
7704
7705     if(h->is_avc && !h->got_avcC) {
7706         int i, cnt, nalsize;
7707         unsigned char *p = avctx->extradata;
7708         if(avctx->extradata_size < 7) {
7709             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7710             return -1;
7711         }
7712         if(*p != 1) {
7713             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7714             return -1;
7715         }
7716         /* sps and pps in the avcC always have length coded with 2 bytes,
7717            so put a fake nal_length_size = 2 while parsing them */
7718         h->nal_length_size = 2;
7719         // Decode sps from avcC
7720         cnt = *(p+5) & 0x1f; // Number of sps
7721         p += 6;
7722         for (i = 0; i < cnt; i++) {
7723             nalsize = BE_16(p) + 2;
7724             if(decode_nal_units(h, p, nalsize) < 0) {
7725                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7726                 return -1;
7727             }
7728             p += nalsize;
7729         }
7730         // Decode pps from avcC
7731         cnt = *(p++); // Number of pps
7732         for (i = 0; i < cnt; i++) {
7733             nalsize = BE_16(p) + 2;
7734             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7735                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7736                 return -1;
7737             }
7738             p += nalsize;
7739         }
7740         // Now store right nal length size, that will be use to parse all other nals
7741         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7742         // Do not reparse avcC
7743         h->got_avcC = 1;
7744     }
7745
7746     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
7747         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7748             return -1;
7749     }
7750
7751     buf_index=decode_nal_units(h, buf, buf_size);
7752     if(buf_index < 0)
7753         return -1;
7754
7755     //FIXME do something with unavailable reference frames
7756
7757 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
7758     if(!s->current_picture_ptr){
7759         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
7760         return -1;
7761     }
7762
7763     {
7764         Picture *out = s->current_picture_ptr;
7765 #if 0 //decode order
7766         *data_size = sizeof(AVFrame);
7767 #else
7768         /* Sort B-frames into display order */
7769         Picture *cur = s->current_picture_ptr;
7770         Picture *prev = h->delayed_output_pic;
7771         int out_idx = 0;
7772         int pics = 0;
7773         int out_of_order;
7774         int cross_idr = 0;
7775         int dropped_frame = 0;
7776         int i;
7777
7778         if(h->sps.bitstream_restriction_flag
7779            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7780             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7781             s->low_delay = 0;
7782         }
7783
7784         while(h->delayed_pic[pics]) pics++;
7785         h->delayed_pic[pics++] = cur;
7786         if(cur->reference == 0)
7787             cur->reference = 1;
7788
7789         for(i=0; h->delayed_pic[i]; i++)
7790             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7791                 cross_idr = 1;
7792
7793         out = h->delayed_pic[0];
7794         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7795             if(h->delayed_pic[i]->poc < out->poc){
7796                 out = h->delayed_pic[i];
7797                 out_idx = i;
7798             }
7799
7800         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7801         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7802             { }
7803         else if(prev && pics <= s->avctx->has_b_frames)
7804             out = prev;
7805         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7806            || (s->low_delay &&
7807             ((!cross_idr && prev && out->poc > prev->poc + 2)
7808              || cur->pict_type == B_TYPE)))
7809         {
7810             s->low_delay = 0;
7811             s->avctx->has_b_frames++;
7812             out = prev;
7813         }
7814         else if(out_of_order)
7815             out = prev;
7816
7817         if(out_of_order || pics > s->avctx->has_b_frames){
7818             dropped_frame = (out != h->delayed_pic[out_idx]);
7819             for(i=out_idx; h->delayed_pic[i]; i++)
7820                 h->delayed_pic[i] = h->delayed_pic[i+1];
7821         }
7822
7823         if(prev == out && !dropped_frame)
7824             *data_size = 0;
7825         else
7826             *data_size = sizeof(AVFrame);
7827         if(prev && prev != out && prev->reference == 1)
7828             prev->reference = 0;
7829         h->delayed_output_pic = out;
7830 #endif
7831
7832         if(out)
7833             *pict= *(AVFrame*)out;
7834         else
7835             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7836     }
7837
7838     assert(pict->data[0] || !*data_size);
7839     ff_print_debug_info(s, pict);
7840 //printf("out %d\n", (int)pict->data[0]);
7841 #if 0 //?
7842
7843     /* Return the Picture timestamp as the frame number */
7844     /* we substract 1 because it is added on utils.c    */
7845     avctx->frame_number = s->picture_number - 1;
7846 #endif
7847     return get_consumed_bytes(s, buf_index, buf_size);
7848 }
#if 0
/**
 * Disabled code: fills h->mb_avail[] for the current macroblock.
 * A neighbour counts as available when it lies inside the picture and its
 * slice_table entry equals the current slice number. Entries 0..2 are the
 * top-left, top and top-right neighbours, entry 3 the left one; entries 4
 * and 5 are constants.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;
    const int has_top_row= s->mb_y != 0;

    /* the short-circuit on has_top_row keeps the table lookups away from the row above the picture */
    h->mb_avail[0]= has_top_row && s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
    h->mb_avail[1]= has_top_row                            && h->slice_table[top_xy    ] == h->slice_num;
    h->mb_avail[2]= has_top_row && s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7868
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self test. Round-trips unsigned and signed exp-golomb codes,
 * measures the error of the 4x4 DCT/IDCT pair and round-trips random data
 * through encode_nal()/decode_nal().
 *
 * @return 0 on success, -1 if the NAL round trip fails
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the loop index */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        /* the DSP context of this test is the local 'dsp'; no 's' is in scope here */
        dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= ABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from data without zero bytes ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then zero out 'zeros' distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
8041
8042
8043 static int decode_end(AVCodecContext *avctx)
8044 {
8045     H264Context *h = avctx->priv_data;
8046     MpegEncContext *s = &h->s;
8047
8048     av_freep(&h->rbsp_buffer);
8049     free_tables(h); //FIXME cleanup init stuff perhaps
8050     MPV_common_end(s);
8051
8052 //    memset(h, 0, sizeof(H264Context));
8053
8054     return 0;
8055 }
8056
8057
8058 AVCodec h264_decoder = {
8059     "h264",
8060     CODEC_TYPE_VIDEO,
8061     CODEC_ID_H264,
8062     sizeof(H264Context),
8063     decode_init,
8064     NULL,
8065     decode_end,
8066     decode_frame,
8067     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8068     .flush= flush_dpb,
8069 };
8070
8071 AVCodecParser h264_parser = {
8072     { CODEC_ID_H264 },
8073     sizeof(H264Context),
8074     NULL,
8075     h264_parse,
8076     ff_parse_close,
8077     h264_split,
8078 };
8079
8080 #include "svq3.c"