/* Listing source: git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c */

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
  18  *
  19  */
  20
  21 /**
  22  * @file h264.c
  23  * H.264 / AVC / MPEG4 part10 codec.
  24  * @author Michael Niedermayer <michaelni@gmx.at>
  25  */
  26
  27 #include "common.h"
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264data.h"
  32 #include "golomb.h"
  33
  34 #include "cabac.h"
  35
  36 //#undef NDEBUG
  37 #include <assert.h>
  38
  39 #define interlaced_dct interlaced_dct_is_a_bad_name // any later use of the identifier interlaced_dct is renamed by the preprocessor
  40 #define mb_intra mb_intra_isnt_initalized_see_mb_type // same for mb_intra; the replacement name points the reader at mb_type
  41
  42 #define LUMA_DC_BLOCK_INDEX   25
  43 #define CHROMA_DC_BLOCK_INDEX 26
  44
  45 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8 // NOTE(review): the *_VLC_BITS values presumably size the lookup of the matching static VLC tables -- confirm at the init_vlc call sites
  46 #define COEFF_TOKEN_VLC_BITS           8
  47 #define TOTAL_ZEROS_VLC_BITS           9
  48 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  49 #define RUN_VLC_BITS                   3
  50 #define RUN7_VLC_BITS                  6
  51
  52 #define MAX_SPS_COUNT 32 // number of entries in H264Context.sps_buffer
  53 #define MAX_PPS_COUNT 256 // number of entries in H264Context.pps_buffer
  54
  55 #define MAX_MMCO_COUNT 66 // number of entries in H264Context.mmco
  56
  57 /**
  58  * Sequence parameter set (SPS). Where a member has a comment, the comment names the bitstream syntax element it holds.
  59  */
  60 typedef struct SPS{
  61
  62     int profile_idc;
  63     int level_idc;
  64     int transform_bypass;              ///< qpprime_y_zero_transform_bypass_flag
  65     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  66     int poc_type;                      ///< pic_order_cnt_type
  67     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4 (NOTE(review): possibly stored with "+ 4" like log2_max_frame_num -- confirm against the SPS parser)
  68     int delta_pic_order_always_zero_flag;
  69     int offset_for_non_ref_pic;
  70     int offset_for_top_to_bottom_field;
  71     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  72     int ref_frame_count;               ///< num_ref_frames
  73     int gaps_in_frame_num_allowed_flag;
  74     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  75     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  76     int frame_mbs_only_flag;
  77     int mb_aff;                        ///< mb_adaptive_frame_field_flag
  78     int direct_8x8_inference_flag;
  79     int crop;                   ///< frame_cropping_flag
  80     int crop_left;              ///< frame_cropping_rect_left_offset
  81     int crop_right;             ///< frame_cropping_rect_right_offset
  82     int crop_top;               ///< frame_cropping_rect_top_offset
  83     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
  84     int vui_parameters_present_flag;
  85     AVRational sar;             ///< NOTE(review): presumably the sample aspect ratio read from the VUI -- confirm
  86     int timing_info_present_flag;
  87     uint32_t num_units_in_tick;
  88     uint32_t time_scale;
  89     int fixed_frame_rate_flag;
  90     short offset_for_ref_frame[256]; //FIXME allocate dynamically? (fixed 256 entries for now)
  91     int bitstream_restriction_flag;
  92     int num_reorder_frames;
  93     int scaling_matrix_present;
  94     uint8_t scaling_matrix4[6][16]; ///< 6 lists of 16 entries, same shape as PPS.scaling_matrix4
  95     uint8_t scaling_matrix8[2][64]; ///< 2 lists of 64 entries, same shape as PPS.scaling_matrix8
  96 }SPS;
  97
  98 /**
  99  * Picture parameter set (PPS). Where a member has a comment, the comment names the bitstream syntax element it holds.
 100  */
 101 typedef struct PPS{
 102     int sps_id;                 ///< NOTE(review): presumably the index of the SPS this PPS refers to -- confirm against the PPS parser
 103     int cabac;                  ///< entropy_coding_mode_flag
 104     int pic_order_present;      ///< pic_order_present_flag
 105     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 106     int mb_slice_group_map_type;
 107     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 108     int weighted_pred;          ///< weighted_pred_flag
 109     int weighted_bipred_idc;
 110     int init_qp;                ///< pic_init_qp_minus26 + 26
 111     int init_qs;                ///< pic_init_qs_minus26 + 26
 112     int chroma_qp_index_offset;
 113     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 114     int constrained_intra_pred; ///< constrained_intra_pred_flag
 115     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 116     int transform_8x8_mode;     ///< transform_8x8_mode_flag
 117     uint8_t scaling_matrix4[6][16]; ///< 6 lists of 16 entries, same shape as SPS.scaling_matrix4
 118     uint8_t scaling_matrix8[2][64]; ///< 2 lists of 64 entries, same shape as SPS.scaling_matrix8
 119 }PPS;
 120
 121 /**
 122  * Memory management control operation opcode, stored in MMCO.opcode.
 123  */
 124 typedef enum MMCOOpcode{
 125     MMCO_END=0,           ///< explicitly 0; the enumerators below count up from it
 126     MMCO_SHORT2UNUSED,    ///< 1
 127     MMCO_LONG2UNUSED,     ///< 2
 128     MMCO_SHORT2LONG,      ///< 3
 129     MMCO_SET_MAX_LONG,    ///< 4
 130     MMCO_RESET,           ///< 5
 131     MMCO_LONG,            ///< 6
 132 } MMCOOpcode;
 133
 134 /**
 135  * Memory management control operation: one opcode plus its two possible operands (H264Context.mmco holds an array of these).
 136  */
 137 typedef struct MMCO{
 138     MMCOOpcode opcode;      ///< which operation this entry describes
 139     int short_frame_num;    ///< NOTE(review): presumably only meaningful for the opcodes that name a short term picture -- confirm
 140     int long_index;         ///< NOTE(review): presumably only meaningful for the opcodes that name a long term index -- confirm
 141 } MMCO;
 142
 143 /**
 144  * H264Context: per-codec state. Holds the generic MpegEncContext as first member, followed by the H.264 parameter sets, neighbour caches and per-picture tables.
 145  */
 146 typedef struct H264Context{
 147     MpegEncContext s; ///< generic state; fill_caches() reads mb_x, mb_y, mb_stride and current_picture through it
 148     int nal_ref_idc;
 149     int nal_unit_type; ///< NOTE(review): presumably one of the NAL_* values defined right below -- confirm
 150 #define NAL_SLICE                1
 151 #define NAL_DPA                  2
 152 #define NAL_DPB                  3
 153 #define NAL_DPC                  4
 154 #define NAL_IDR_SLICE            5
 155 #define NAL_SEI                  6
 156 #define NAL_SPS                  7
 157 #define NAL_PPS                  8
 158 #define NAL_AUD                  9
 159 #define NAL_END_SEQUENCE        10
 160 #define NAL_END_STREAM          11
 161 #define NAL_FILLER_DATA         12
 162 #define NAL_SPS_EXT             13
 163 #define NAL_AUXILIARY_SLICE     19
 164     uint8_t *rbsp_buffer;
 165     unsigned int rbsp_buffer_size;
 166
 167     /**
 168       * Used to parse AVC variant of h264
 169       */
 170     int is_avc; ///< this flag is != 0 if codec is avc1
 171     int got_avcC; ///< flag used to parse avcC data only once
 172     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 173
 174     int chroma_qp; //QPc
 175
 176     int prev_mb_skipped; //FIXME remove (IMHO not used)
 177
 178     //prediction stuff
 179     int chroma_pred_mode;
 180     int intra16x16_pred_mode;
 181
 182     int top_mb_xy;       ///< written by fill_caches(): index of the macroblock used as top neighbour
 183     int left_mb_xy[2];   ///< written by fill_caches(): indices of the macroblock(s) used as left neighbours
 184
 185     int8_t intra4x4_pred_mode_cache[5*8]; ///< fill_caches() fills the top row and left column from the neighbouring macroblocks
 186     int8_t (*intra4x4_pred_mode)[8];
 187     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 188     void (*pred8x8l [9+3])(uint8_t *src, int topleft, int topright, int stride);
 189     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 190     void (*pred16x16[4+3])(uint8_t *src, int stride);
 191     unsigned int topleft_samples_available;  ///< bitmask computed by fill_caches() for intra macroblocks
 192     unsigned int top_samples_available;      ///< bitmask computed by fill_caches() for intra macroblocks
 193     unsigned int topright_samples_available; ///< bitmask computed by fill_caches() for intra macroblocks
 194     unsigned int left_samples_available;     ///< bitmask computed by fill_caches() for intra macroblocks
 195     uint8_t (*top_borders[2])[16+2*8];
 196     uint8_t left_border[2*(17+2*9)];
 197
 198     /**
 199      * non zero coeff count cache.
 200      * Unavailable neighbours are stored as 64, or as 0 when CABAC is in use and the current macroblock is not intra (see fill_caches()).
 201      */
 202     DECLARE_ALIGNED_8(uint8_t, non_zero_count_cache[6*8]);
 203     uint8_t (*non_zero_count)[16];
 204
 205     /**
 206      * Motion vector cache.
 207      */
 208     DECLARE_ALIGNED_8(int16_t, mv_cache[2][5*8][2]);
 209     DECLARE_ALIGNED_8(int8_t, ref_cache[2][5*8]); ///< entries may also hold LIST_NOT_USED or PART_NOT_AVAILABLE (defined below)
 210 #define LIST_NOT_USED -1 //FIXME rename?
 211 #define PART_NOT_AVAILABLE -2
 212
 213     /**
 214      * is 1 if the specific list MV&references are set to 0,0,-2.
 215      */
 216     int mv_cache_clean[2];
 217
 218     /**
 219      * number of neighbors (top and/or left) that used 8x8 dct
 220      */
 221     int neighbor_transform_size;
 222
 223     /**
 224      * block_offset[ 0..23] for frame macroblocks
 225      * block_offset[24..47] for field macroblocks
 226      */
 227     int block_offset[2*(16+8)];
 228
 229     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 230     uint32_t *mb2b8_xy;
 231     int b_stride; //FIXME use s->b4_stride
 232     int b8_stride;
 233
 234     int halfpel_flag;
 235     int thirdpel_flag;
 236
 237     int unknown_svq3_flag;
 238     int next_slice_index;
 239
 240     SPS sps_buffer[MAX_SPS_COUNT];
 241     SPS sps; ///< current sps
 242
 243     PPS pps_buffer[MAX_PPS_COUNT];
 244     /**
 245      * current pps
 246      */
 247     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 248
 249     uint32_t dequant4_buffer[6][52][16];
 250     uint32_t dequant8_buffer[2][52][64];
 251     uint32_t (*dequant4_coeff[6])[16];
 252     uint32_t (*dequant8_coeff[2])[64];
 253     int dequant_coeff_pps;     ///< reinit tables when pps changes
 254
 255     int slice_num;
 256     uint8_t *slice_table_base;
 257     uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1; fill_caches() compares its entries against slice_num
 258     int slice_type;
 259     int slice_type_fixed;
 260
 261     //interlacing specific flags
 262     int mb_aff_frame;
 263     int mb_field_decoding_flag;
 264
 265     int sub_mb_type[4];
 266
 267     //POC stuff
 268     int poc_lsb;
 269     int poc_msb;
 270     int delta_poc_bottom;
 271     int delta_poc[2];
 272     int frame_num;
 273     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 274     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 275     int frame_num_offset;         ///< for POC type 2
 276     int prev_frame_num_offset;    ///< for POC type 2
 277     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 278
 279     /**
 280      * frame_num for frames or 2*frame_num for field pics.
 281      */
 282     int curr_pic_num;
 283
 284     /**
 285      * max_frame_num or 2*max_frame_num for field pics.
 286      */
 287     int max_pic_num;
 288
 289     //Weighted pred stuff
 290     int use_weight;
 291     int use_weight_chroma;
 292     int luma_log2_weight_denom;
 293     int chroma_log2_weight_denom;
 294     int luma_weight[2][16];
 295     int luma_offset[2][16];
 296     int chroma_weight[2][16][2];
 297     int chroma_offset[2][16][2];
 298     int implicit_weight[16][16];
 299
 300     //deblock
 301     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 302     int slice_alpha_c0_offset;
 303     int slice_beta_offset;
 304
 305     int redundant_pic_count;
 306
 307     int direct_spatial_mv_pred;
 308     int dist_scale_factor[16];
 309     int map_col_to_list0[2][16];
 310
 311     /**
 312      * num_ref_idx_l0/1_active_minus1 + 1
 313      */
 314     int ref_count[2];// FIXME split for AFF
 315     Picture *short_ref[32];
 316     Picture *long_ref[32];
 317     Picture default_ref_list[2][32];
 318     Picture ref_list[2][32]; //FIXME size?
 319     Picture field_ref_list[2][32]; //FIXME size?
 320     Picture *delayed_pic[16]; //FIXME size?
 321     Picture *delayed_output_pic;
 322
 323     /**
 324      * memory management control operations buffer.
 325      */
 326     MMCO mmco[MAX_MMCO_COUNT];
 327     int mmco_index;
 328
 329     int long_ref_count;  ///< number of actual long term references
 330     int short_ref_count; ///< number of actual short term references
 331
 332     //data partitioning
 333     GetBitContext intra_gb;
 334     GetBitContext inter_gb;
 335     GetBitContext *intra_gb_ptr;
 336     GetBitContext *inter_gb_ptr;
 337
 338     DECLARE_ALIGNED_8(DCTELEM, mb[16*24]);
 339
 340     /**
 341      * Cabac
 342      */
 343     CABACContext cabac;
 344     uint8_t      cabac_state[460];
 345     int          cabac_init_idc;
 346
 347     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 348     uint16_t     *cbp_table;
 349     int top_cbp;  ///< written by fill_caches() when pps.cabac is set
 350     int left_cbp; ///< written by fill_caches() when pps.cabac is set
 351     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 352     uint8_t     *chroma_pred_mode_table;
 353     int         last_qscale_diff;
 354     int16_t     (*mvd_table[2])[2];
 355     DECLARE_ALIGNED_8(int16_t, mvd_cache[2][5*8][2]);
 356     uint8_t     *direct_table;
 357     uint8_t     direct_cache[5*8];
 358
 359     uint8_t zigzag_scan[16];
 360     uint8_t field_scan[16];
 361     uint8_t zigzag_scan8x8[64];
 362     uint8_t zigzag_scan8x8_cavlc[64];
 363     const uint8_t *zigzag_scan_q0;
 364     const uint8_t *field_scan_q0;
 365     const uint8_t *zigzag_scan8x8_q0;
 366     const uint8_t *zigzag_scan8x8_cavlc_q0;
 367
 368     int x264_build;
 369 }H264Context;
 370
 371 static VLC coeff_token_vlc[4]; // file-scope VLC tables; NOTE(review): presumably built once at init using the *_VLC_BITS constants -- confirm
 372 static VLC chroma_dc_coeff_token_vlc;
 373
 374 static VLC total_zeros_vlc[15];
 375 static VLC chroma_dc_total_zeros_vlc[3];
 376
 377 static VLC run_vlc[6];
 378 static VLC run7_vlc;
 379
 380 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp); // forward declarations: these three functions are used before their definitions appear
 381 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 382 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 383
 /**
  * Pack two 16 bit values into one 32 bit word.
  * The halves are arranged so that, once the word is stored to memory,
  * a occupies the first two bytes and b the following two, whatever the
  * host byte order (selected with WORDS_BIGENDIAN).
  * Only the low 16 bits of each argument are kept.
  * @param a value for the first 16 bit slot, may be negative
  * @param b value for the second 16 bit slot, may be negative
  * @return the packed 32 bit word
  */
 static always_inline uint32_t pack16to32(int a, int b){
     /* Mask and shift as uint32_t: left-shifting a negative int is undefined
      * behaviour in C, and negative arguments are legal here. */
 #ifdef WORDS_BIGENDIAN
     return ((uint32_t)b&0xFFFF) + ((uint32_t)a<<16);
 #else
     return ((uint32_t)a&0xFFFF) + ((uint32_t)b<<16);
 #endif
 }
 391
 392 /**
 393  * fill a rectangle.
 394  * @param h height of the rectangle, should be a constant
 395  * @param w width of the rectangle, should be a constant
 396  * @param size the size of val (1 or 4), should be a constant
 397  */
 398 static always_inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){
 399     uint8_t *p= (uint8_t*)vp;
 400     assert(size==1 || size==4);
 401
 402     w      *= size;
 403     stride *= size;
 404
 405     assert((((long)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 406     assert((stride&(w-1))==0);
 407 //FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it
 408     if(w==2 && h==2){
 409         *(uint16_t*)(p + 0)=
 410         *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101;
 411     }else if(w==2 && h==4){
 412         *(uint16_t*)(p + 0*stride)=
 413         *(uint16_t*)(p + 1*stride)=
 414         *(uint16_t*)(p + 2*stride)=
 415         *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101;
 416     }else if(w==4 && h==1){
 417         *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101;
 418     }else if(w==4 && h==2){
 419         *(uint32_t*)(p + 0*stride)=
 420         *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101;
 421     }else if(w==4 && h==4){
 422         *(uint32_t*)(p + 0*stride)=
 423         *(uint32_t*)(p + 1*stride)=
 424         *(uint32_t*)(p + 2*stride)=
 425         *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101;
 426     }else if(w==8 && h==1){
 427         *(uint32_t*)(p + 0)=
 428         *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101;
 429     }else if(w==8 && h==2){
 430         *(uint32_t*)(p + 0 + 0*stride)=
 431         *(uint32_t*)(p + 4 + 0*stride)=
 432         *(uint32_t*)(p + 0 + 1*stride)=
 433         *(uint32_t*)(p + 4 + 1*stride)=  size==4 ? val : val*0x01010101;
 434     }else if(w==8 && h==4){
 435         *(uint64_t*)(p + 0*stride)=
 436         *(uint64_t*)(p + 1*stride)=
 437         *(uint64_t*)(p + 2*stride)=
 438         *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 439     }else if(w==16 && h==2){
 440         *(uint64_t*)(p + 0+0*stride)=
 441         *(uint64_t*)(p + 8+0*stride)=
 442         *(uint64_t*)(p + 0+1*stride)=
 443         *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 444     }else if(w==16 && h==4){
 445         *(uint64_t*)(p + 0+0*stride)=
 446         *(uint64_t*)(p + 8+0*stride)=
 447         *(uint64_t*)(p + 0+1*stride)=
 448         *(uint64_t*)(p + 8+1*stride)=
 449         *(uint64_t*)(p + 0+2*stride)=
 450         *(uint64_t*)(p + 8+2*stride)=
 451         *(uint64_t*)(p + 0+3*stride)=
 452         *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 453     }else
 454         assert(0);
 455 }
 456
 457 static void fill_caches(H264Context *h, int mb_type, int for_deblock){
 458     MpegEncContext * const s = &h->s;
 459     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 460     int topleft_xy, top_xy, topright_xy, left_xy[2];
 461     int topleft_type, top_type, topright_type, left_type[2];
 462     int left_block[8];
 463     int i;
 464
 465     //FIXME deblocking can skip fill_caches much of the time with multiple slices too.
 466     // the actual condition is whether we're on the edge of a slice,
 467     // and even then the intra and nnz parts are unnecessary.
 468     if(for_deblock && h->slice_num == 1)
 469         return;
 470
 471     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 472
 473     top_xy     = mb_xy  - s->mb_stride;
 474     topleft_xy = top_xy - 1;
 475     topright_xy= top_xy + 1;
 476     left_xy[1] = left_xy[0] = mb_xy-1;
 477     left_block[0]= 0;
 478     left_block[1]= 1;
 479     left_block[2]= 2;
 480     left_block[3]= 3;
 481     left_block[4]= 7;
 482     left_block[5]= 10;
 483     left_block[6]= 8;
 484     left_block[7]= 11;
 485     if(h->mb_aff_frame){
 486         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 487         const int top_pair_xy      = pair_xy     - s->mb_stride;
 488         const int topleft_pair_xy  = top_pair_xy - 1;
 489         const int topright_pair_xy = top_pair_xy + 1;
 490         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 491         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 492         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 493         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 494         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 495         const int bottom = (s->mb_y & 1);
 496         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 497         if (bottom
 498                 ? !curr_mb_frame_flag // bottom macroblock
 499                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 500                 ) {
 501             top_xy -= s->mb_stride;
 502         }
 503         if (bottom
 504                 ? !curr_mb_frame_flag // bottom macroblock
 505                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 506                 ) {
 507             topleft_xy -= s->mb_stride;
 508         }
 509         if (bottom
 510                 ? !curr_mb_frame_flag // bottom macroblock
 511                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 512                 ) {
 513             topright_xy -= s->mb_stride;
 514         }
 515         if (left_mb_frame_flag != curr_mb_frame_flag) {
 516             left_xy[1] = left_xy[0] = pair_xy - 1;
 517             if (curr_mb_frame_flag) {
 518                 if (bottom) {
 519                     left_block[0]= 2;
 520                     left_block[1]= 2;
 521                     left_block[2]= 3;
 522                     left_block[3]= 3;
 523                     left_block[4]= 8;
 524                     left_block[5]= 11;
 525                     left_block[6]= 8;
 526                     left_block[7]= 11;
 527                 } else {
 528                     left_block[0]= 0;
 529                     left_block[1]= 0;
 530                     left_block[2]= 1;
 531                     left_block[3]= 1;
 532                     left_block[4]= 7;
 533                     left_block[5]= 10;
 534                     left_block[6]= 7;
 535                     left_block[7]= 10;
 536                 }
 537             } else {
 538                 left_xy[1] += s->mb_stride;
 539                 //left_block[0]= 0;
 540                 left_block[1]= 2;
 541                 left_block[2]= 0;
 542                 left_block[3]= 2;
 543                 //left_block[4]= 7;
 544                 left_block[5]= 10;
 545                 left_block[6]= 7;
 546                 left_block[7]= 10;
 547             }
 548         }
 549     }
 550
 551     h->top_mb_xy = top_xy;
 552     h->left_mb_xy[0] = left_xy[0];
 553     h->left_mb_xy[1] = left_xy[1];
 554     if(for_deblock){
 555         topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
 556         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 557         topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
 558         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 559         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 560     }else{
 561         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 562         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 563         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 564         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 565         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 566     }
 567
 568     if(IS_INTRA(mb_type)){
 569         h->topleft_samples_available=
 570         h->top_samples_available=
 571         h->left_samples_available= 0xFFFF;
 572         h->topright_samples_available= 0xEEEA;
 573
 574         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 575             h->topleft_samples_available= 0xB3FF;
 576             h->top_samples_available= 0x33FF;
 577             h->topright_samples_available= 0x26EA;
 578         }
 579         for(i=0; i<2; i++){
 580             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 581                 h->topleft_samples_available&= 0xDF5F;
 582                 h->left_samples_available&= 0x5F5F;
 583             }
 584         }
 585
 586         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 587             h->topleft_samples_available&= 0x7FFF;
 588
 589         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 590             h->topright_samples_available&= 0xFBFF;
 591
 592         if(IS_INTRA4x4(mb_type)){
 593             if(IS_INTRA4x4(top_type)){
 594                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 595                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 596                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 597                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 598             }else{
 599                 int pred;
 600                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 601                     pred= -1;
 602                 else{
 603                     pred= 2;
 604                 }
 605                 h->intra4x4_pred_mode_cache[4+8*0]=
 606                 h->intra4x4_pred_mode_cache[5+8*0]=
 607                 h->intra4x4_pred_mode_cache[6+8*0]=
 608                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 609             }
 610             for(i=0; i<2; i++){
 611                 if(IS_INTRA4x4(left_type[i])){
 612                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 613                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 614                 }else{
 615                     int pred;
 616                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 617                         pred= -1;
 618                     else{
 619                         pred= 2;
 620                     }
 621                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 622                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 623                 }
 624             }
 625         }
 626     }
 627
 628
 629 /*
 630 0 . T T. T T T T
 631 1 L . .L . . . .
 632 2 L . .L . . . .
 633 3 . T TL . . . .
 634 4 L . .L . . . .
 635 5 L . .. . . . .
 636 */
 637 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 638     if(top_type){
 639         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 640         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 641         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 642         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 643
 644         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 645         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 646
 647         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 648         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 649
 650     }else{
 651         h->non_zero_count_cache[4+8*0]=
 652         h->non_zero_count_cache[5+8*0]=
 653         h->non_zero_count_cache[6+8*0]=
 654         h->non_zero_count_cache[7+8*0]=
 655
 656         h->non_zero_count_cache[1+8*0]=
 657         h->non_zero_count_cache[2+8*0]=
 658
 659         h->non_zero_count_cache[1+8*3]=
 660         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 661
 662     }
 663
 664     for (i=0; i<2; i++) {
 665         if(left_type[i]){
 666             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 667             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 668             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 669             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 670         }else{
 671             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 672             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 673             h->non_zero_count_cache[0+8*1 +   8*i]=
 674             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 675         }
 676     }
 677
 678     if( h->pps.cabac ) {
 679         // top_cbp
 680         if(top_type) {
 681             h->top_cbp = h->cbp_table[top_xy];
 682         } else if(IS_INTRA(mb_type)) {
 683             h->top_cbp = 0x1C0;
 684         } else {
 685             h->top_cbp = 0;
 686         }
 687         // left_cbp
 688         if (left_type[0]) {
 689             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 690         } else if(IS_INTRA(mb_type)) {
 691             h->left_cbp = 0x1C0;
 692         } else {
 693             h->left_cbp = 0;
 694         }
 695         if (left_type[0]) {
 696             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 697         }
 698         if (left_type[1]) {
 699             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 700         }
 701     }
 702
 703 #if 1
 704     //FIXME direct mb can skip much of this
 705     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 706         int list;
 707         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 708             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 709                 /*if(!h->mv_cache_clean[list]){
 710                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 711                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 712                     h->mv_cache_clean[list]= 1;
 713                 }*/
 714                 continue;
 715             }
 716             h->mv_cache_clean[list]= 0;
 717
 718             if(USES_LIST(top_type, list)){
 719                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 720                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 721                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 722                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 723                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 724                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 725                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 726                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 727                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 728                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 729             }else{
 730                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 731                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 732                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 733                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 734                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 735             }
 736
 737             //FIXME unify cleanup or sth
 738             if(USES_LIST(left_type[0], list)){
 739                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 740                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 741                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 742                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 743                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 744                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 745             }else{
 746                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 747                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 748                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 749                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 750             }
 751
 752             if(USES_LIST(left_type[1], list)){
 753                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 754                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 755                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 756                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 757                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 758                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 759             }else{
 760                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 761                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 762                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 763                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 764                 assert((!left_type[0]) == (!left_type[1]));
 765             }
 766
 767             if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
 768                 continue;
 769
 770             if(USES_LIST(topleft_type, list)){
 771                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 772                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 773                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 774                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 775             }else{
 776                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 777                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 778             }
 779
 780             if(USES_LIST(topright_type, list)){
 781                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 782                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 783                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 784                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 785             }else{
 786                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 787                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 788             }
 789
 790
 791             h->ref_cache[list][scan8[5 ]+1] =
 792             h->ref_cache[list][scan8[7 ]+1] =
 793             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 794             h->ref_cache[list][scan8[4 ]] =
 795             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 796             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 797             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 798             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 799             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 800             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 801
 802             if( h->pps.cabac ) {
 803                 /* XXX beurk, Load mvd */
 804                 if(USES_LIST(topleft_type, list)){
 805                     const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 806                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
 807                 }else{
 808                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
 809                 }
 810
 811                 if(USES_LIST(top_type, list)){
 812                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 813                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 814                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 815                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 816                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 817                 }else{
 818                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 819                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 820                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 821                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 822                 }
 823                 if(USES_LIST(left_type[0], list)){
 824                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 825                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 826                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 827                 }else{
 828                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 829                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 830                 }
 831                 if(USES_LIST(left_type[1], list)){
 832                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 833                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 834                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 835                 }else{
 836                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 837                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 838                 }
 839                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 840                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 841                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 842                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 843                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 844
 845                 if(h->slice_type == B_TYPE){
 846                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 847
 848                     if(IS_DIRECT(top_type)){
 849                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 850                     }else if(IS_8X8(top_type)){
 851                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 852                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 853                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 854                     }else{
 855                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 856                     }
 857
 858                     //FIXME interlacing
 859                     if(IS_DIRECT(left_type[0])){
 860                         h->direct_cache[scan8[0] - 1 + 0*8]=
 861                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 862                     }else if(IS_8X8(left_type[0])){
 863                         int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
 864                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
 865                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
 866                     }else{
 867                         h->direct_cache[scan8[0] - 1 + 0*8]=
 868                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 869                     }
 870                 }
 871             }
 872         }
 873     }
 874 #endif
 875
 876     h->neighbor_transform_size= !!IS_8x8DCT(top_type) + !!IS_8x8DCT(left_type[0]);
 877 }
 878
 879 static inline void write_back_intra_pred_mode(H264Context *h){
 880     MpegEncContext * const s = &h->s;
 881     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 882
 883     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 884     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 885     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 886     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 887     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 888     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 889     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 890 }
 891
 892 /**
 893  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 894  */
 895 static inline int check_intra4x4_pred_mode(H264Context *h){
 896     MpegEncContext * const s = &h->s;
 897     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 898     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 899     int i;
 900
 901     if(!(h->top_samples_available&0x8000)){
 902         for(i=0; i<4; i++){
 903             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 904             if(status<0){
 905                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 906                 return -1;
 907             } else if(status){
 908                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 909             }
 910         }
 911     }
 912
 913     if(!(h->left_samples_available&0x8000)){
 914         for(i=0; i<4; i++){
 915             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 916             if(status<0){
 917                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 918                 return -1;
 919             } else if(status){
 920                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 921             }
 922         }
 923     }
 924
 925     return 0;
 926 } //FIXME cleanup like next
 927
 928 /**
 929  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 930  */
 931 static inline int check_intra_pred_mode(H264Context *h, int mode){
 932     MpegEncContext * const s = &h->s;
 933     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 934     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 935
 936     if(mode < 0 || mode > 6) {
 937         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 938         return -1;
 939     }
 940
 941     if(!(h->top_samples_available&0x8000)){
 942         mode= top[ mode ];
 943         if(mode<0){
 944             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 945             return -1;
 946         }
 947     }
 948
 949     if(!(h->left_samples_available&0x8000)){
 950         mode= left[ mode ];
 951         if(mode<0){
 952             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 953             return -1;
 954         }
 955     }
 956
 957     return mode;
 958 }
 959
 960 /**
 961  * gets the predicted intra4x4 prediction mode.
 962  */
 963 static inline int pred_intra_mode(H264Context *h, int n){
 964     const int index8= scan8[n];
 965     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 966     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 967     const int min= FFMIN(left, top);
 968
 969     tprintf("mode:%d %d min:%d\n", left ,top, min);
 970
 971     if(min<0) return DC_PRED;
 972     else      return min;
 973 }
 974
 975 static inline void write_back_non_zero_count(H264Context *h){
 976     MpegEncContext * const s = &h->s;
 977     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 978
 979     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 980     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 981     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 982     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 983     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 984     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 985     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 986
 987     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 988     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 989     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 990
 991     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 992     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 993     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 994 }
 995
 996 /**
 997  * gets the predicted number of non zero coefficients.
 998  * @param n block index
 999  */
1000 static inline int pred_non_zero_count(H264Context *h, int n){
1001     const int index8= scan8[n];
1002     const int left= h->non_zero_count_cache[index8 - 1];
1003     const int top = h->non_zero_count_cache[index8 - 8];
1004     int i= left + top;
1005
1006     if(i<64) i= (i+1)>>1;
1007
1008     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
1009
1010     return i&31;
1011 }
1012
1013 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
1014     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
1015
1016     if(topright_ref != PART_NOT_AVAILABLE){
1017         *C= h->mv_cache[list][ i - 8 + part_width ];
1018         return topright_ref;
1019     }else{
1020         tprintf("topright MV not available\n");
1021
1022         *C= h->mv_cache[list][ i - 8 - 1 ];
1023         return h->ref_cache[list][ i - 8 - 1 ];
1024     }
1025 }
1026
1027 /**
1028  * gets the predicted MV.
1029  * @param n the block index
1030  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
1031  * @param mx the x component of the predicted motion vector
1032  * @param my the y component of the predicted motion vector
1033  */
1034 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
1035     const int index8= scan8[n];
1036     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1037     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1038     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1039     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1040     const int16_t * C;
1041     int diagonal_ref, match_count;
1042
1043     assert(part_width==1 || part_width==2 || part_width==4);
1044
1045 /* mv_cache
1046   B . . A T T T T
1047   U . . L . . , .
1048   U . . L . . . .
1049   U . . L . . , .
1050   . . . L . . . .
1051 */
1052
1053     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1054     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1055     tprintf("pred_motion match_count=%d\n", match_count);
1056     if(match_count > 1){ //most common
1057         *mx= mid_pred(A[0], B[0], C[0]);
1058         *my= mid_pred(A[1], B[1], C[1]);
1059     }else if(match_count==1){
1060         if(left_ref==ref){
1061             *mx= A[0];
1062             *my= A[1];
1063         }else if(top_ref==ref){
1064             *mx= B[0];
1065             *my= B[1];
1066         }else{
1067             *mx= C[0];
1068             *my= C[1];
1069         }
1070     }else{
1071         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1072             *mx= A[0];
1073             *my= A[1];
1074         }else{
1075             *mx= mid_pred(A[0], B[0], C[0]);
1076             *my= mid_pred(A[1], B[1], C[1]);
1077         }
1078     }
1079
1080     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1081 }
1082
1083 /**
1084  * gets the directionally predicted 16x8 MV.
1085  * @param n the block index
1086  * @param mx the x component of the predicted motion vector
1087  * @param my the y component of the predicted motion vector
1088  */
1089 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1090     if(n==0){
1091         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1092         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1093
1094         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1095
1096         if(top_ref == ref){
1097             *mx= B[0];
1098             *my= B[1];
1099             return;
1100         }
1101     }else{
1102         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1103         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1104
1105         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1106
1107         if(left_ref == ref){
1108             *mx= A[0];
1109             *my= A[1];
1110             return;
1111         }
1112     }
1113
1114     //RARE
1115     pred_motion(h, n, 4, list, ref, mx, my);
1116 }
1117
1118 /**
1119  * gets the directionally predicted 8x16 MV.
1120  * @param n the block index
1121  * @param mx the x component of the predicted motion vector
1122  * @param my the y component of the predicted motion vector
1123  */
1124 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1125     if(n==0){
1126         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1127         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1128
1129         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1130
1131         if(left_ref == ref){
1132             *mx= A[0];
1133             *my= A[1];
1134             return;
1135         }
1136     }else{
1137         const int16_t * C;
1138         int diagonal_ref;
1139
1140         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1141
1142         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1143
1144         if(diagonal_ref == ref){
1145             *mx= C[0];
1146             *my= C[1];
1147             return;
1148         }
1149     }
1150
1151     //RARE
1152     pred_motion(h, n, 2, list, ref, mx, my);
1153 }
1154
1155 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1156     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1157     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1158
1159     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1160
1161     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1162        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1163        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1164
1165         *mx = *my = 0;
1166         return;
1167     }
1168
1169     pred_motion(h, 0, 4, 0, 0, mx, my);
1170
1171     return;
1172 }
1173
1174 static inline void direct_dist_scale_factor(H264Context * const h){
1175     const int poc = h->s.current_picture_ptr->poc;
1176     const int poc1 = h->ref_list[1][0].poc;
1177     int i;
1178     for(i=0; i<h->ref_count[0]; i++){
1179         int poc0 = h->ref_list[0][i].poc;
1180         int td = clip(poc1 - poc0, -128, 127);
1181         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1182             h->dist_scale_factor[i] = 256;
1183         }else{
1184             int tb = clip(poc - poc0, -128, 127);
1185             int tx = (16384 + (ABS(td) >> 1)) / td;
1186             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1187         }
1188     }
1189 }
1190 static inline void direct_ref_list_init(H264Context * const h){
1191     MpegEncContext * const s = &h->s;
1192     Picture * const ref1 = &h->ref_list[1][0];
1193     Picture * const cur = s->current_picture_ptr;
1194     int list, i, j;
1195     if(cur->pict_type == I_TYPE)
1196         cur->ref_count[0] = 0;
1197     if(cur->pict_type != B_TYPE)
1198         cur->ref_count[1] = 0;
1199     for(list=0; list<2; list++){
1200         cur->ref_count[list] = h->ref_count[list];
1201         for(j=0; j<h->ref_count[list]; j++)
1202             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1203     }
1204     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1205         return;
1206     for(list=0; list<2; list++){
1207         for(i=0; i<ref1->ref_count[list]; i++){
1208             const int poc = ref1->ref_poc[list][i];
1209             h->map_col_to_list0[list][i] = 0; /* bogus; fills in for missing frames */
1210             for(j=0; j<h->ref_count[list]; j++)
1211                 if(h->ref_list[list][j].poc == poc){
1212                     h->map_col_to_list0[list][i] = j;
1213                     break;
1214                 }
1215         }
1216     }
1217 }
1218
1219 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1220     MpegEncContext * const s = &h->s;
1221     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1222     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1223     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1224     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1225     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1226     const int16_t (*l1mv1)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[1][b4_xy];
1227     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1228     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1229     const int is_b8x8 = IS_8X8(*mb_type);
1230     int sub_mb_type;
1231     int i8, i4;
1232
1233     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1234         /* FIXME save sub mb types from previous frames (or derive from MVs)
1235          * so we know exactly what block size to use */
1236         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1237         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1238     }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
1239         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1240         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1241     }else{
1242         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1243         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1244     }
1245     if(!is_b8x8)
1246         *mb_type |= MB_TYPE_DIRECT2;
1247
1248     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1249
1250     if(h->direct_spatial_mv_pred){
1251         int ref[2];
1252         int mv[2][2];
1253         int list;
1254
1255         /* ref = min(neighbors) */
1256         for(list=0; list<2; list++){
1257             int refa = h->ref_cache[list][scan8[0] - 1];
1258             int refb = h->ref_cache[list][scan8[0] - 8];
1259             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1260             if(refc == -2)
1261                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1262             ref[list] = refa;
1263             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1264                 ref[list] = refb;
1265             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1266                 ref[list] = refc;
1267             if(ref[list] < 0)
1268                 ref[list] = -1;
1269         }
1270
1271         if(ref[0] < 0 && ref[1] < 0){
1272             ref[0] = ref[1] = 0;
1273             mv[0][0] = mv[0][1] =
1274             mv[1][0] = mv[1][1] = 0;
1275         }else{
1276             for(list=0; list<2; list++){
1277                 if(ref[list] >= 0)
1278                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1279                 else
1280                     mv[list][0] = mv[list][1] = 0;
1281             }
1282         }
1283
1284         if(ref[1] < 0){
1285             *mb_type &= ~MB_TYPE_P0L1;
1286             sub_mb_type &= ~MB_TYPE_P0L1;
1287         }else if(ref[0] < 0){
1288             *mb_type &= ~MB_TYPE_P0L0;
1289             sub_mb_type &= ~MB_TYPE_P0L0;
1290         }
1291
1292         if(IS_16X16(*mb_type)){
1293             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, (uint8_t)ref[0], 1);
1294             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, (uint8_t)ref[1], 1);
1295             if(!IS_INTRA(mb_type_col)
1296                && (   (l1ref0[0] == 0 && ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1)
1297                    || (l1ref0[0]  < 0 && l1ref1[0] == 0 && ABS(l1mv1[0][0]) <= 1 && ABS(l1mv1[0][1]) <= 1
1298                        && (h->x264_build>33 || !h->x264_build)))){
1299                 if(ref[0] > 0)
1300                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1301                 else
1302                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1303                 if(ref[1] > 0)
1304                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1305                 else
1306                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1307             }else{
1308                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1309                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1310             }
1311         }else{
1312             for(i8=0; i8<4; i8++){
1313                 const int x8 = i8&1;
1314                 const int y8 = i8>>1;
1315
1316                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1317                     continue;
1318                 h->sub_mb_type[i8] = sub_mb_type;
1319
1320                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1321                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1322                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[0], 1);
1323                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, (uint8_t)ref[1], 1);
1324
1325                 /* col_zero_flag */
1326                 if(!IS_INTRA(mb_type_col) && (   l1ref0[x8 + y8*h->b8_stride] == 0
1327                                               || (l1ref0[x8 + y8*h->b8_stride] < 0 && l1ref1[x8 + y8*h->b8_stride] == 0
1328                                                   && (h->x264_build>33 || !h->x264_build)))){
1329                     const int16_t (*l1mv)[2]= l1ref0[x8 + y8*h->b8_stride] == 0 ? l1mv0 : l1mv1;
1330                     if(IS_SUB_8X8(sub_mb_type)){
1331                         const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1332                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1333                             if(ref[0] == 0)
1334                                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1335                             if(ref[1] == 0)
1336                                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1337                         }
1338                     }else
1339                     for(i4=0; i4<4; i4++){
1340                         const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1341                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1342                             if(ref[0] == 0)
1343                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1344                             if(ref[1] == 0)
1345                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1346                         }
1347                     }
1348                 }
1349             }
1350         }
1351     }else{ /* direct temporal mv pred */
1352         if(IS_16X16(*mb_type)){
1353             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1354             if(IS_INTRA(mb_type_col)){
1355                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1356                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1357                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1358             }else{
1359                 const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]]
1360                                                 : h->map_col_to_list0[1][l1ref1[0]];
1361                 const int dist_scale_factor = h->dist_scale_factor[ref0];
1362                 const int16_t *mv_col = l1ref0[0] >= 0 ? l1mv0[0] : l1mv1[0];
1363                 int mv_l0[2];
1364                 mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1365                 mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1366                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1367                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1368                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1369             }
1370         }else{
1371             for(i8=0; i8<4; i8++){
1372                 const int x8 = i8&1;
1373                 const int y8 = i8>>1;
1374                 int ref0, dist_scale_factor;
1375                 const int16_t (*l1mv)[2]= l1mv0;
1376
1377                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1378                     continue;
1379                 h->sub_mb_type[i8] = sub_mb_type;
1380                 if(IS_INTRA(mb_type_col)){
1381                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1382                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1383                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1384                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1385                     continue;
1386                 }
1387
1388                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1389                 if(ref0 >= 0)
1390                     ref0 = h->map_col_to_list0[0][ref0];
1391                 else{
1392                     ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1393                     l1mv= l1mv1;
1394                 }
1395                 dist_scale_factor = h->dist_scale_factor[ref0];
1396
1397                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1398                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1399                 if(IS_SUB_8X8(sub_mb_type)){
1400                     const int16_t *mv_col = l1mv[x8*3 + y8*3*h->b_stride];
1401                     int mx = (dist_scale_factor * mv_col[0] + 128) >> 8;
1402                     int my = (dist_scale_factor * mv_col[1] + 128) >> 8;
1403                     fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mx,my), 4);
1404                     fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mx-mv_col[0],my-mv_col[1]), 4);
1405                 }else
1406                 for(i4=0; i4<4; i4++){
1407                     const int16_t *mv_col = l1mv[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1408                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1409                     mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1410                     mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1411                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1412                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1413                 }
1414             }
1415         }
1416     }
1417 }
1418
1419 static inline void write_back_motion(H264Context *h, int mb_type){
1420     MpegEncContext * const s = &h->s;
1421     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1422     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1423     int list;
1424
1425     if(!USES_LIST(mb_type, 0))
1426         fill_rectangle(&s->current_picture.ref_index[0][b8_xy], 2, 2, h->b8_stride, (uint8_t)LIST_NOT_USED, 1);
1427
1428     for(list=0; list<2; list++){
1429         int y;
1430         if(!USES_LIST(mb_type, list))
1431             continue;
1432
1433         for(y=0; y<4; y++){
1434             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1435             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1436         }
1437         if( h->pps.cabac ) {
1438             for(y=0; y<4; y++){
1439                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1440                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1441             }
1442         }
1443
1444         {
1445             uint8_t *ref_index = &s->current_picture.ref_index[list][b8_xy];
1446             ref_index[0+0*h->b8_stride]= h->ref_cache[list][scan8[0]];
1447             ref_index[1+0*h->b8_stride]= h->ref_cache[list][scan8[4]];
1448             ref_index[0+1*h->b8_stride]= h->ref_cache[list][scan8[8]];
1449             ref_index[1+1*h->b8_stride]= h->ref_cache[list][scan8[12]];
1450         }
1451     }
1452
1453     if(h->slice_type == B_TYPE && h->pps.cabac){
1454         if(IS_8X8(mb_type)){
1455             uint8_t *direct_table = &h->direct_table[b8_xy];
1456             direct_table[1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1457             direct_table[0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1458             direct_table[1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1459         }
1460     }
1461 }
1462
1463 /**
1464  * Decodes a network abstraction layer unit.
1465  * @param consumed is the number of bytes used as input
1466  * @param length is the length of the array
1467  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1468  * @returns decoded bytes, might be src+1 if no escapes
1469  */
1470 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1471     int i, si, di;
1472     uint8_t *dst;
1473
1474 //    src[0]&0x80;                //forbidden bit
1475     h->nal_ref_idc= src[0]>>5;
1476     h->nal_unit_type= src[0]&0x1F;
1477
1478     src++; length--;
1479 #if 0
1480     for(i=0; i<length; i++)
1481         printf("%2X ", src[i]);
1482 #endif
1483     for(i=0; i+1<length; i+=2){
1484         if(src[i]) continue;
1485         if(i>0 && src[i-1]==0) i--;
1486         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1487             if(src[i+2]!=3){
1488                 /* startcode, so we must be past the end */
1489                 length=i;
1490             }
1491             break;
1492         }
1493     }
1494
1495     if(i>=length-1){ //no escaped 0
1496         *dst_length= length;
1497         *consumed= length+1; //+1 for the header
1498         return src;
1499     }
1500
1501     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1502     dst= h->rbsp_buffer;
1503
1504 //printf("decoding esc\n");
1505     si=di=0;
1506     while(si<length){
1507         //remove escapes (very rare 1:2^22)
1508         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1509             if(src[si+2]==3){ //escape
1510                 dst[di++]= 0;
1511                 dst[di++]= 0;
1512                 si+=3;
1513                 continue;
1514             }else //next start code
1515                 break;
1516         }
1517
1518         dst[di++]= src[si++];
1519     }
1520
1521     *dst_length= di;
1522     *consumed= si + 1;//+1 for the header
1523 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1524     return dst;
1525 }
1526
1527 #if 0
1528 /**
1529  * @param src the data which should be escaped
1530  * @param dst the target buffer, dst+1 == src is allowed as a special case
1531  * @param length the length of the src data
1532  * @param dst_length the length of the dst array
1533  * @returns length of escaped data in bytes or -1 if an error occured
1534  */
1535 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1536     int i, escape_count, si, di;
1537     uint8_t *temp;
1538
1539     assert(length>=0);
1540     assert(dst_length>0);
1541
1542     dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1543
1544     if(length==0) return 1;
1545
1546     escape_count= 0;
1547     for(i=0; i<length; i+=2){
1548         if(src[i]) continue;
1549         if(i>0 && src[i-1]==0)
1550             i--;
1551         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1552             escape_count++;
1553             i+=2;
1554         }
1555     }
1556
1557     if(escape_count==0){
1558         if(dst+1 != src)
1559             memcpy(dst+1, src, length);
1560         return length + 1;
1561     }
1562
1563     if(length + escape_count + 1> dst_length)
1564         return -1;
1565
1566     //this should be damn rare (hopefully)
1567
1568     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1569     temp= h->rbsp_buffer;
1570 //printf("encoding esc\n");
1571
1572     si= 0;
1573     di= 0;
1574     while(si < length){
1575         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1576             temp[di++]= 0; si++;
1577             temp[di++]= 0; si++;
1578             temp[di++]= 3;
1579             temp[di++]= src[si++];
1580         }
1581         else
1582             temp[di++]= src[si++];
1583     }
1584     memcpy(dst+1, temp, length+escape_count);
1585
1586     assert(di == length+escape_count);
1587
1588     return di + 1;
1589 }
1590
1591 /**
1592  * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1593  */
1594 static void encode_rbsp_trailing(PutBitContext *pb){
1595     int length;
1596     put_bits(pb, 1, 1);
1597     length= (-put_bits_count(pb))&7;
1598     if(length) put_bits(pb, length, 0);
1599 }
1600 #endif
1601
/**
 * Identifies the exact end of the bitstream by locating the lowest set bit
 * of the byte at src.
 * @param src pointer to the byte holding the trailing bits
 * @return the length of the trailing (1..8, position of the lowest set bit
 *         counted from 1), or 0 if damaged (the byte is zero)
 */
static int decode_rbsp_trailing(uint8_t *src){
    int bits= *src;
    int pos= 1;

    tprintf("rbsp trailing %X\n", bits);

    while(pos < 9){
        if(bits & 1)
            return pos;
        bits >>= 1;
        pos++;
    }
    return 0;
}
1618
1619 /**
1620  * idct tranforms the 16 dc values and dequantize them.
1621  * @param qp quantization parameter
1622  */
1623 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1624 #define stride 16
1625     int i;
1626     int temp[16]; //FIXME check if this is a good idea
1627     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1628     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1629
1630 //memset(block, 64, 2*256);
1631 //return;
1632     for(i=0; i<4; i++){
1633         const int offset= y_offset[i];
1634         const int z0= block[offset+stride*0] + block[offset+stride*4];
1635         const int z1= block[offset+stride*0] - block[offset+stride*4];
1636         const int z2= block[offset+stride*1] - block[offset+stride*5];
1637         const int z3= block[offset+stride*1] + block[offset+stride*5];
1638
1639         temp[4*i+0]= z0+z3;
1640         temp[4*i+1]= z1+z2;
1641         temp[4*i+2]= z1-z2;
1642         temp[4*i+3]= z0-z3;
1643     }
1644
1645     for(i=0; i<4; i++){
1646         const int offset= x_offset[i];
1647         const int z0= temp[4*0+i] + temp[4*2+i];
1648         const int z1= temp[4*0+i] - temp[4*2+i];
1649         const int z2= temp[4*1+i] - temp[4*3+i];
1650         const int z3= temp[4*1+i] + temp[4*3+i];
1651
1652         block[stride*0 +offset]= ((((z0 + z3)*qmul + 128 ) >> 8)); //FIXME think about merging this into decode_resdual
1653         block[stride*2 +offset]= ((((z1 + z2)*qmul + 128 ) >> 8));
1654         block[stride*8 +offset]= ((((z1 - z2)*qmul + 128 ) >> 8));
1655         block[stride*10+offset]= ((((z0 - z3)*qmul + 128 ) >> 8));
1656     }
1657 }
1658
1659 #if 0
1660 /**
1661  * dct tranforms the 16 dc values.
1662  * @param qp quantization parameter ??? FIXME
1663  */
1664 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1665 //    const int qmul= dequant_coeff[qp][0];
1666     int i;
1667     int temp[16]; //FIXME check if this is a good idea
1668     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1669     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1670
1671     for(i=0; i<4; i++){
1672         const int offset= y_offset[i];
1673         const int z0= block[offset+stride*0] + block[offset+stride*4];
1674         const int z1= block[offset+stride*0] - block[offset+stride*4];
1675         const int z2= block[offset+stride*1] - block[offset+stride*5];
1676         const int z3= block[offset+stride*1] + block[offset+stride*5];
1677
1678         temp[4*i+0]= z0+z3;
1679         temp[4*i+1]= z1+z2;
1680         temp[4*i+2]= z1-z2;
1681         temp[4*i+3]= z0-z3;
1682     }
1683
1684     for(i=0; i<4; i++){
1685         const int offset= x_offset[i];
1686         const int z0= temp[4*0+i] + temp[4*2+i];
1687         const int z1= temp[4*0+i] - temp[4*2+i];
1688         const int z2= temp[4*1+i] - temp[4*3+i];
1689         const int z3= temp[4*1+i] + temp[4*3+i];
1690
1691         block[stride*0 +offset]= (z0 + z3)>>1;
1692         block[stride*2 +offset]= (z1 + z2)>>1;
1693         block[stride*8 +offset]= (z1 - z2)>>1;
1694         block[stride*10+offset]= (z0 - z3)>>1;
1695     }
1696 }
1697 #endif
1698
1699 #undef xStride
1700 #undef stride
1701
1702 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp, int qmul){
1703     const int stride= 16*2;
1704     const int xStride= 16;
1705     int a,b,c,d,e;
1706
1707     a= block[stride*0 + xStride*0];
1708     b= block[stride*0 + xStride*1];
1709     c= block[stride*1 + xStride*0];
1710     d= block[stride*1 + xStride*1];
1711
1712     e= a-b;
1713     a= a+b;
1714     b= c-d;
1715     c= c+d;
1716
1717     block[stride*0 + xStride*0]= ((a+c)*qmul) >> 7;
1718     block[stride*0 + xStride*1]= ((e+b)*qmul) >> 7;
1719     block[stride*1 + xStride*0]= ((a-c)*qmul) >> 7;
1720     block[stride*1 + xStride*1]= ((e-b)*qmul) >> 7;
1721 }
1722
1723 #if 0
1724 static void chroma_dc_dct_c(DCTELEM *block){
1725     const int stride= 16*2;
1726     const int xStride= 16;
1727     int a,b,c,d,e;
1728
1729     a= block[stride*0 + xStride*0];
1730     b= block[stride*0 + xStride*1];
1731     c= block[stride*1 + xStride*0];
1732     d= block[stride*1 + xStride*1];
1733
1734     e= a-b;
1735     a= a+b;
1736     b= c-d;
1737     c= c+d;
1738
1739     block[stride*0 + xStride*0]= (a+c);
1740     block[stride*0 + xStride*1]= (e+b);
1741     block[stride*1 + xStride*0]= (a-c);
1742     block[stride*1 + xStride*1]= (e-b);
1743 }
1744 #endif
1745
1746 /**
1747  * gets the chroma qp.
1748  */
1749 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1750
1751     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1752 }
1753
1754
1755 #if 0
1756 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
1757     int i;
1758     //FIXME try int temp instead of block
1759
1760     for(i=0; i<4; i++){
1761         const int d0= src1[0 + i*stride] - src2[0 + i*stride];
1762         const int d1= src1[1 + i*stride] - src2[1 + i*stride];
1763         const int d2= src1[2 + i*stride] - src2[2 + i*stride];
1764         const int d3= src1[3 + i*stride] - src2[3 + i*stride];
1765         const int z0= d0 + d3;
1766         const int z3= d0 - d3;
1767         const int z1= d1 + d2;
1768         const int z2= d1 - d2;
1769
1770         block[0 + 4*i]=   z0 +   z1;
1771         block[1 + 4*i]= 2*z3 +   z2;
1772         block[2 + 4*i]=   z0 -   z1;
1773         block[3 + 4*i]=   z3 - 2*z2;
1774     }
1775
1776     for(i=0; i<4; i++){
1777         const int z0= block[0*4 + i] + block[3*4 + i];
1778         const int z3= block[0*4 + i] - block[3*4 + i];
1779         const int z1= block[1*4 + i] + block[2*4 + i];
1780         const int z2= block[1*4 + i] - block[2*4 + i];
1781
1782         block[0*4 + i]=   z0 +   z1;
1783         block[1*4 + i]= 2*z3 +   z2;
1784         block[2*4 + i]=   z0 -   z1;
1785         block[3*4 + i]=   z3 - 2*z2;
1786     }
1787 }
1788 #endif
1789
1790 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
1791 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1792 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1793     int i;
1794     const int * const quant_table= quant_coeff[qscale];
1795     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1796     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1797     const unsigned int threshold2= (threshold1<<1);
1798     int last_non_zero;
1799
1800     if(seperate_dc){
1801         if(qscale<=18){
1802             //avoid overflows
1803             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1804             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1805             const unsigned int dc_threshold2= (dc_threshold1<<1);
1806
1807             int level= block[0]*quant_coeff[qscale+18][0];
1808             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1809                 if(level>0){
1810                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1811                     block[0]= level;
1812                 }else{
1813                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1814                     block[0]= -level;
1815                 }
1816 //                last_non_zero = i;
1817             }else{
1818                 block[0]=0;
1819             }
1820         }else{
1821             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1822             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1823             const unsigned int dc_threshold2= (dc_threshold1<<1);
1824
1825             int level= block[0]*quant_table[0];
1826             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1827                 if(level>0){
1828                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1829                     block[0]= level;
1830                 }else{
1831                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1832                     block[0]= -level;
1833                 }
1834 //                last_non_zero = i;
1835             }else{
1836                 block[0]=0;
1837             }
1838         }
1839         last_non_zero= 0;
1840         i=1;
1841     }else{
1842         last_non_zero= -1;
1843         i=0;
1844     }
1845
1846     for(; i<16; i++){
1847         const int j= scantable[i];
1848         int level= block[j]*quant_table[j];
1849
1850 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1851 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1852         if(((unsigned)(level+threshold1))>threshold2){
1853             if(level>0){
1854                 level= (bias + level)>>QUANT_SHIFT;
1855                 block[j]= level;
1856             }else{
1857                 level= (bias - level)>>QUANT_SHIFT;
1858                 block[j]= -level;
1859             }
1860             last_non_zero = i;
1861         }else{
1862             block[j]=0;
1863         }
1864     }
1865
1866     return last_non_zero;
1867 }
1868
/**
 * 4x4 vertical prediction: copies the 4 pixels above the block into each of
 * its 4 rows, one 32 bit word per row.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= ((uint32_t*)(src-stride))[0];
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= top_row;
}
1876
/**
 * 4x4 horizontal prediction: fills every row with the pixel to its left.
 * The pixel is replicated into a 32 bit word with an unsigned multiply;
 * a signed multiply by 0x01010101 would overflow int for pixels >= 128.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= src[-1+y*stride]*0x01010101U;
}
1883
/**
 * 4x4 dc prediction: fills the block with the rounded mean of the 4 pixels
 * above and the 4 pixels left of it.
 * The mean is replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    const uint32_t splat= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= splat;
}
1893
/**
 * 4x4 dc prediction from the left edge only: fills the block with the
 * rounded mean of the 4 pixels left of it.
 * The mean is replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    const uint32_t splat= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= splat;
}
1902
/**
 * 4x4 dc prediction from the top edge only: fills the block with the
 * rounded mean of the 4 pixels above it.
 * The mean is replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    const uint32_t splat= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= splat;
}
1911
/**
 * 4x4 dc prediction without any neighbours: fills the block with 128.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t grey= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= grey;
}
1918
1919
/* Declares t4..t7: the 4 pixels at topright[0..3].
 * Expects a pointer named topright in scope. */
#define LOAD_TOP_RIGHT_EDGE \
    const int t4= topright[0], \
              t5= topright[1], \
              t6= topright[2], \
              t7= topright[3];

/* Declares l0..l3: the pixels directly left of rows 0..3.
 * Expects src and stride in scope. */
#define LOAD_LEFT_EDGE \
    const int l0= src[-1+0*stride], \
              l1= src[-1+1*stride], \
              l2= src[-1+2*stride], \
              l3= src[-1+3*stride];

/* Declares t0..t3: the pixels directly above columns 0..3.
 * Expects src and stride in scope. */
#define LOAD_TOP_EDGE \
    const int t0= src[ 0-1*stride], \
              t1= src[ 1-1*stride], \
              t2= src[ 2-1*stride], \
              t3= src[ 3-1*stride];
1937
/**
 * 4x4 diagonal down-right prediction.
 * The neighbours are gathered into one line running from the bottom of the
 * left edge over the top-left corner to the end of the top edge; pixel (x,y)
 * is the (1,2,1)/4 filtered value of that line centred at position 4+x-y.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    int edge[9];
    int x, y;

    for(y=0; y<4; y++)
        edge[3-y]= src[-1+y*stride];   /* l3, l2, l1, l0 */
    edge[4]= src[-1-1*stride];         /* top-left corner */
    for(x=0; x<4; x++)
        edge[5+x]= src[x-1*stride];    /* t0 .. t3 */

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int c= 4 + x - y;
            src[x+y*stride]= (edge[c-1] + 2*edge[c] + edge[c+1] + 2)>>2;
        }
    }
}
1960
/**
 * 4x4 diagonal down-left prediction.
 * Pixel (x,y) is the (1,2,1)/4 filtered value of the top / top-right line
 * centred at position x+y+1; the last pixel of that line is repeated once so
 * that the bottom-right pixel becomes (t6 + 3*t7 + 2) >> 2.
 * @param src top-left pixel of the block
 * @param topright the 4 pixels following the top edge
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[9];
    int x, y;

    for(x=0; x<4; x++){
        top[x  ]= src[x-1*stride];
        top[x+4]= topright[x];
    }
    top[8]= top[7];

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int k= x + y;
            src[x+y*stride]= (top[k] + top[k+2] + 2*top[k+1] + 2)>>2;
        }
    }
}
1983
/**
 * 4x4 vertical-right prediction.
 * Rows 0 and 2 use 2 tap averages along the top edge, rows 1 and 3 use 3 tap
 * filtered values; rows 2 and 3 are rows 0 and 1 shifted right by one pixel
 * with a value filtered from the left edge inserted in column 0.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride], t1= src[ 1-1*stride], t2= src[ 2-1*stride], t3= src[ 3-1*stride];
    const int l0= src[-1+0*stride], l1= src[-1+1*stride], l2= src[-1+2*stride];
    /* 2 tap averages */
    const int a0= (lt + t0 + 1)>>1;
    const int a1= (t0 + t1 + 1)>>1;
    const int a2= (t1 + t2 + 1)>>1;
    const int a3= (t2 + t3 + 1)>>1;
    /* 3 tap filtered values along the top */
    const int f0= (l0 + 2*lt + t0 + 2)>>2;
    const int f1= (lt + 2*t0 + t1 + 2)>>2;
    const int f2= (t0 + 2*t1 + t2 + 2)>>2;
    const int f3= (t1 + 2*t2 + t3 + 2)>>2;
    /* 3 tap filtered values along the left */
    const int v2= (lt + 2*l0 + l1 + 2)>>2;
    const int v3= (l0 + 2*l1 + l2 + 2)>>2;
    uint8_t * const row0= src;
    uint8_t * const row1= src + 1*stride;
    uint8_t * const row2= src + 2*stride;
    uint8_t * const row3= src + 3*stride;

    row0[0]= a0; row0[1]= a1; row0[2]= a2; row0[3]= a3;
    row1[0]= f0; row1[1]= f1; row1[2]= f2; row1[3]= f3;
    row2[0]= v2; row2[1]= a0; row2[2]= a1; row2[3]= a2;
    row3[0]= v3; row3[1]= f0; row3[2]= f1; row3[3]= f2;
}
2007
/**
 * 4x4 vertical-left prediction.
 * With k = x + (y>>1), even rows hold the 2 tap average of top pixels k and
 * k+1, odd rows the (1,2,1)/4 filtered value of top pixels k, k+1 and k+2.
 * Only the first 3 pixels of topright contribute to the result.
 * @param src top-left pixel of the block
 * @param topright the 4 pixels following the top edge
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[7];
    int x, y;

    for(x=0; x<4; x++)
        top[x]= src[x-1*stride];
    for(x=0; x<3; x++)
        top[4+x]= topright[x];

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int k= x + (y>>1);

            if(y&1)
                src[x+y*stride]= (top[k] + 2*top[k+1] + top[k+2] + 2)>>2;
            else
                src[x+y*stride]= (top[k] + top[k+1] + 1)>>1;
        }
    }
}
2030
/**
 * 4x4 horizontal-up prediction.
 * The values along the left edge form a sequence of alternating 2 tap
 * averages and 3 tap filtered values that ends in the plain bottom-left
 * neighbour; pixel (x,y) takes sequence entry x+2*y, clamped to the last one.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int seq[7];
    int x, y;

    seq[0]= (l0 + l1 + 1)>>1;
    seq[1]= (l0 + 2*l1 + l2 + 2)>>2;
    seq[2]= (l1 + l2 + 1)>>1;
    seq[3]= (l1 + 2*l2 + l3 + 2)>>2;
    seq[4]= (l2 + l3 + 1)>>1;
    seq[5]= (l2 + 2*l3 + l3 + 2)>>2;
    seq[6]= l3;

    for(y=0; y<4; y++){
        for(x=0; x<4; x++){
            const int z= x + 2*y;
            src[x+y*stride]= seq[z<6 ? z : 6];
        }
    }
}
2051
/**
 * 4x4 horizontal-down prediction.
 * Columns 0 and 1 hold 2 tap averages and 3 tap filtered values taken along
 * the left edge; columns 2 and 3 repeat columns 0 and 1 of the row above,
 * row 0 takes them from the top edge instead.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two pixel rows
 */
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride], t1= src[ 1-1*stride], t2= src[ 2-1*stride];
    const int l0= src[-1+0*stride], l1= src[-1+1*stride];
    const int l2= src[-1+2*stride], l3= src[-1+3*stride];
    /* 2 tap averages, one per row */
    const int a0= (lt + l0 + 1)>>1;
    const int a1= (l0 + l1 + 1)>>1;
    const int a2= (l1 + l2 + 1)>>1;
    const int a3= (l2 + l3 + 1)>>1;
    /* 3 tap filtered values, one per row */
    const int f0= (l0 + 2*lt + t0 + 2)>>2;
    const int f1= (lt + 2*l0 + l1 + 2)>>2;
    const int f2= (l0 + 2*l1 + l2 + 2)>>2;
    const int f3= (l1 + 2*l2 + l3 + 2)>>2;
    /* 3 tap filtered values along the top, only used in row 0 */
    const int h2= (lt + 2*t0 + t1 + 2)>>2;
    const int h3= (t0 + 2*t1 + t2 + 2)>>2;
    uint8_t * const row0= src;
    uint8_t * const row1= src + 1*stride;
    uint8_t * const row2= src + 2*stride;
    uint8_t * const row3= src + 3*stride;

    row0[0]= a0; row0[1]= f0; row0[2]= h2; row0[3]= h3;
    row1[0]= a1; row1[1]= f1; row1[2]= a0; row1[3]= f0;
    row2[0]= a2; row2[1]= f2; row2[2]= a1; row2[3]= f1;
    row3[0]= a3; row3[1]= f3; row3[2]= a2; row3[3]= f2;
}
2075
/**
 * 16x16 vertical prediction: copies the 16 pixels above the block into each
 * of its 16 rows, four 32 bit words per row.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const top= (uint32_t*)(src-stride);
    const uint32_t w0= top[0];
    const uint32_t w1= top[1];
    const uint32_t w2= top[2];
    const uint32_t w3= top[3];
    int y;

    for(y=0; y<16; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        row[0]= w0;
        row[1]= w1;
        row[2]= w2;
        row[3]= w3;
    }
}
2090
/**
 * 16x16 horizontal prediction: fills every row with the pixel to its left.
 * The pixel is replicated into a 32 bit word with an unsigned multiply;
 * a signed multiply by 0x01010101 would overflow int for pixels >= 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        const uint32_t splat= src[-1+i*stride]*0x01010101U;
        uint32_t * const row= (uint32_t*)(src+i*stride);

        row[0]= splat;
        row[1]= splat;
        row[2]= splat;
        row[3]= splat;
    }
}
2101
/**
 * 16x16 dc prediction: fills the block with the rounded mean of the 16
 * pixels above and the 16 pixels left of it.
 * The mean is replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0; i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0; i<16; i++){
        sum+= src[i-stride];
    }

    splat= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);

        row[0]= splat;
        row[1]= splat;
        row[2]= splat;
        row[3]= splat;
    }
}
2122
/**
 * 16x16 dc prediction from the left edge only: fills the block with the
 * rounded mean of the 16 pixels left of it.
 * The mean is replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0; i<16; i++){
        sum+= src[-1+i*stride];
    }

    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);

        row[0]= splat;
        row[1]= splat;
        row[2]= splat;
        row[3]= splat;
    }
}
2139
/**
 * 16x16 dc prediction from the top edge only: fills the block with the
 * rounded mean of the 16 pixels above it.
 * The mean is replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t splat;

    for(i=0; i<16; i++){
        sum+= src[i-stride];
    }
    splat= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);

        row[0]= splat;
        row[1]= splat;
        row[2]= splat;
        row[3]= splat;
    }
}
2155
/**
 * 16x16 dc prediction without any neighbours: fills the block with 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y;

    for(y=0; y<16; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        row[0]= grey;
        row[1]= grey;
        row[2]= grey;
        row[3]= grey;
    }
}
2166
2167 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2168   int i, j, k;
2169   int a;
2170   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2171   const uint8_t * const src0 = src+7-stride;
2172   const uint8_t *src1 = src+8*stride-1;
2173   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2174   int H = src0[1] - src0[-1];
2175   int V = src1[0] - src2[ 0];
2176   for(k=2; k<=8; ++k) {
2177     src1 += stride; src2 -= stride;
2178     H += k*(src0[k] - src0[-k]);
2179     V += k*(src1[0] - src2[ 0]);
2180   }
2181   if(svq3){
2182     H = ( 5*(H/4) ) / 16;
2183     V = ( 5*(V/4) ) / 16;
2184
2185     /* required for 100% accuracy */
2186     i = H; H = V; V = i;
2187   }else{
2188     H = ( 5*H+32 ) >> 6;
2189     V = ( 5*V+32 ) >> 6;
2190   }
2191
2192   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2193   for(j=16; j>0; --j) {
2194     int b = a;
2195     a += V;
2196     for(i=-16; i<0; i+=4) {
2197       src[16+i] = cm[ (b    ) >> 5 ];
2198       src[17+i] = cm[ (b+  H) >> 5 ];
2199       src[18+i] = cm[ (b+2*H) >> 5 ];
2200       src[19+i] = cm[ (b+3*H) >> 5 ];
2201       b += 4*H;
2202     }
2203     src += stride;
2204   }
2205 }
2206
/**
 * 16x16 plane prediction, H.264 flavour: calls pred16x16_plane_compat_c()
 * with the svq3 flag cleared.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
2210
/**
 * 8x8 vertical prediction: copies the 8 pixels above the block into each of
 * its 8 rows, two 32 bit words per row.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const top= (uint32_t*)(src-stride);
    const uint32_t w0= top[0];
    const uint32_t w1= top[1];
    int y;

    for(y=0; y<8; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        row[0]= w0;
        row[1]= w1;
    }
}
2221
/**
 * 8x8 horizontal prediction: fills every row with the pixel to its left.
 * The pixel is replicated into a 32 bit word with an unsigned multiply;
 * a signed multiply by 0x01010101 would overflow int for pixels >= 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        const uint32_t splat= src[-1+i*stride]*0x01010101U;
        uint32_t * const row= (uint32_t*)(src+i*stride);

        row[0]= splat;
        row[1]= splat;
    }
}
2230
/**
 * 8x8 dc prediction without any neighbours: fills the block with 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t grey= 0x01010101U*128U;
    int y;

    for(y=0; y<8; y++){
        uint32_t * const row= (uint32_t*)(src+y*stride);

        row[0]= grey;
        row[1]= grey;
    }
}
2239
/**
 * 8x8 dc prediction from the left edge only.
 * Rows 0..3 are filled with the rounded mean of the 4 upper left neighbours,
 * rows 4..7 with the rounded mean of the 4 lower left neighbours.
 * The means are replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper=0, sum_lower=0;
    uint32_t splat_upper, splat_lower;

    for(i=0; i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    splat_upper= 0x01010101U*((sum_upper + 2)>>2);
    splat_lower= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);
        const uint32_t splat= i<4 ? splat_upper : splat_lower;

        row[0]= splat;
        row[1]= splat;
    }
}
2261
/**
 * 8x8 dc prediction from the top edge only.
 * Columns 0..3 are filled with the rounded mean of the 4 pixels above them,
 * columns 4..7 with the rounded mean of the 4 pixels above those.
 * The means are replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128. All 8 rows receive the
 * same two words, so they are written by a single loop.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left=0, sum_right=0;
    uint32_t splat_left, splat_right;

    for(i=0; i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    splat_left = 0x01010101U*((sum_left  + 2)>>2);
    splat_right= 0x01010101U*((sum_right + 2)>>2);

    for(i=0; i<8; i++){
        uint32_t * const row= (uint32_t*)(src+i*stride);

        row[0]= splat_left;
        row[1]= splat_right;
    }
}
2283
2284
/**
 * 8x8 dc prediction, one dc value per 4x4 quadrant:
 * top-left uses the 4 upper left and the 4 left top neighbours,
 * top-right uses the 4 right top neighbours,
 * bottom-left uses the 4 lower left neighbours,
 * bottom-right uses the 4 right top and the 4 lower left neighbours.
 * The means are replicated with an unsigned multiply; a signed multiply by
 * 0x01010101 would overflow int for means >= 128.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two pixel rows
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum_tl=0, sum_top_right=0, sum_left_low=0;
    uint32_t splat_tl, splat_tr, splat_bl, splat_br;

    for(i=0; i<4; i++){
        sum_tl       += src[-1+i*stride] + src[i-stride];
        sum_top_right+= src[4+i-stride];
        sum_left_low += src[-1+(i+4)*stride];
    }
    splat_br= 0x01010101U*((sum_top_right + sum_left_low + 4)>>3);
    splat_tl= 0x01010101U*((sum_tl + 4)>>3);
    splat_tr= 0x01010101U*((sum_top_right + 2)>>2);
    splat_bl= 0x01010101U*((sum_left_low + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= splat_tl;
        ((uint32_t*)(src+i*stride))[1]= splat_tr;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= splat_bl;
        ((uint32_t*)(src+i*stride))[1]= splat_br;
    }
}
2309
2310 static void pred8x8_plane_c(uint8_t *src, int stride){
2311   int j, k;
2312   int a;
2313   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2314   const uint8_t * const src0 = src+3-stride;
2315   const uint8_t *src1 = src+4*stride-1;
2316   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2317   int H = src0[1] - src0[-1];
2318   int V = src1[0] - src2[ 0];
2319   for(k=2; k<=4; ++k) {
2320     src1 += stride; src2 -= stride;
2321     H += k*(src0[k] - src0[-k]);
2322     V += k*(src1[0] - src2[ 0]);
2323   }
2324   H = ( 17*H+16 ) >> 5;
2325   V = ( 17*V+16 ) >> 5;
2326
2327   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2328   for(j=8; j>0; --j) {
2329     int b = a;
2330     a += V;
2331     src[0] = cm[ (b    ) >> 5 ];
2332     src[1] = cm[ (b+  H) >> 5 ];
2333     src[2] = cm[ (b+2*H) >> 5 ];
2334     src[3] = cm[ (b+3*H) >> 5 ];
2335     src[4] = cm[ (b+4*H) >> 5 ];
2336     src[5] = cm[ (b+5*H) >> 5 ];
2337     src[6] = cm[ (b+6*H) >> 5 ];
2338     src[7] = cm[ (b+7*H) >> 5 ];
2339     src += stride;
2340   }
2341 }
2342
/* Helper macros shared by the pred8x8l_* predictors.  They rely on the
 * enclosing function providing src, stride, has_topleft and has_topright. */

/* Pixel at column x, row y relative to the top-left pixel of the block. */
2343 #define SRC(x,y) src[(x)+(y)*stride]
/* Declares l<y>: the left neighbour of row y smoothed with a (1,2,1)/4 filter. */
2344 #define PL(y) \
2345     const int l##y = (SRC(-1,y-1) + 2*SRC(-1,y) + SRC(-1,y+1) + 2) >> 2;
/* Declares l0..l7, the filtered left column.  l0 uses the top-left pixel when
 * has_topleft is set and repeats SRC(-1,0) otherwise; l7 has no neighbour
 * below it and weights SRC(-1,7) by 3 instead. */
2346 #define PREDICT_8x8_LOAD_LEFT \
2347     const int l0 = ((has_topleft ? SRC(-1,-1) : SRC(-1,0)) \
2348                      + 2*SRC(-1,0) + SRC(-1,1) + 2) >> 2; \
2349     PL(1) PL(2) PL(3) PL(4) PL(5) PL(6) \
2350     const int l7 attribute_unused = (SRC(-1,6) + 3*SRC(-1,7) + 2) >> 2
2351
/* Declares t<x>: the top neighbour of column x smoothed with a (1,2,1)/4 filter. */
2352 #define PT(x) \
2353     const int t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares t0..t7, the filtered row above the block.  t0 falls back to
 * SRC(0,-1) when has_topleft is clear; t7 falls back to SRC(7,-1) when
 * has_topright is clear. */
2354 #define PREDICT_8x8_LOAD_TOP \
2355     const int t0 = ((has_topleft ? SRC(-1,-1) : SRC(0,-1)) \
2356                      + 2*SRC(0,-1) + SRC(1,-1) + 2) >> 2; \
2357     PT(1) PT(2) PT(3) PT(4) PT(5) PT(6) \
2358     const int t7 attribute_unused = ((has_topright ? SRC(8,-1) : SRC(7,-1)) \
2359                      + 2*SRC(7,-1) + SRC(6,-1) + 2) >> 2
2360
/* Assigns (does not declare) t<x> with the same (1,2,1)/4 filter as PT. */
2361 #define PTR(x) \
2362     t##x = (SRC(x-1,-1) + 2*SRC(x,-1) + SRC(x+1,-1) + 2) >> 2;
/* Declares t8..t15 for the eight pixels above and to the right of the block.
 * With has_topright they are filtered like the top row; otherwise all of them
 * take the unfiltered value SRC(7,-1).  Expands to a declaration followed by
 * an if/else statement, so it has to be the last LOAD macro in a function. */
2363 #define PREDICT_8x8_LOAD_TOPRIGHT \
2364     int t8, t9, t10, t11, t12, t13, t14, t15; \
2365     if(has_topright) { \
2366         PTR(8) PTR(9) PTR(10) PTR(11) PTR(12) PTR(13) PTR(14) \
2367         t15 = (SRC(14,-1) + 3*SRC(15,-1) + 2) >> 2; \
2368     } else t8=t9=t10=t11=t12=t13=t14=t15= SRC(7,-1);
2369
/* Declares lt: the top-left corner pixel filtered with its two neighbours. */
2370 #define PREDICT_8x8_LOAD_TOPLEFT \
2371     const int lt = (SRC(-1,0) + 2*SRC(-1,-1) + SRC(0,-1) + 2) >> 2
2372
/* Declares y and fills all 8 rows with the 32-bit word v (two stores per
 * row).  src is advanced by stride per row, so it points past the block
 * afterwards. */
2373 #define PREDICT_8x8_DC(v) \
2374     int y; \
2375     for( y = 0; y < 8; y++ ) { \
2376         ((uint32_t*)src)[0] = \
2377         ((uint32_t*)src)[1] = v; \
2378         src += stride; \
2379     }
2380
/**
 * Fills the 8x8 block with the constant value 128 (0x80).
 * No neighbouring pixels are read; has_topleft and has_topright are ignored.
 */
static void pred8x8l_128_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
{
    int rows_left = 8;

    while(rows_left-- > 0){
        uint32_t *row = (uint32_t*)src;
        row[1] = 0x80808080;
        row[0] = 0x80808080;
        src += stride;
    }
}
2385 static void pred8x8l_left_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2386 {
2387     PREDICT_8x8_LOAD_LEFT;
2388     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7+4) >> 3) * 0x01010101;
2389     PREDICT_8x8_DC(dc);
2390 }
2391 static void pred8x8l_top_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2392 {
2393     PREDICT_8x8_LOAD_TOP;
2394     const uint32_t dc = ((t0+t1+t2+t3+t4+t5+t6+t7+4) >> 3) * 0x01010101;
2395     PREDICT_8x8_DC(dc);
2396 }
2397 static void pred8x8l_dc_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2398 {
2399     PREDICT_8x8_LOAD_LEFT;
2400     PREDICT_8x8_LOAD_TOP;
2401     const uint32_t dc = ((l0+l1+l2+l3+l4+l5+l6+l7
2402                          +t0+t1+t2+t3+t4+t5+t6+t7+8) >> 4) * 0x01010101;
2403     PREDICT_8x8_DC(dc);
2404 }
2405 static void pred8x8l_horizontal_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2406 {
2407     PREDICT_8x8_LOAD_LEFT;
2408 #define ROW(y) ((uint32_t*)(src+y*stride))[0] =\
2409                ((uint32_t*)(src+y*stride))[1] = 0x01010101 * l##y
2410     ROW(0); ROW(1); ROW(2); ROW(3); ROW(4); ROW(5); ROW(6); ROW(7);
2411 #undef ROW
2412 }
2413 static void pred8x8l_vertical_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2414 {
2415     int y;
2416     PREDICT_8x8_LOAD_TOP;
2417     src[0] = t0;
2418     src[1] = t1;
2419     src[2] = t2;
2420     src[3] = t3;
2421     src[4] = t4;
2422     src[5] = t5;
2423     src[6] = t6;
2424     src[7] = t7;
2425     for( y = 1; y < 8; y++ )
2426         *(uint64_t*)(src+y*stride) = *(uint64_t*)src;
2427 }
2428 static void pred8x8l_down_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2429 {
2430     PREDICT_8x8_LOAD_TOP;
2431     PREDICT_8x8_LOAD_TOPRIGHT;
2432     SRC(0,0)= (t0 + 2*t1 + t2 + 2) >> 2;
2433     SRC(0,1)=SRC(1,0)= (t1 + 2*t2 + t3 + 2) >> 2;
2434     SRC(0,2)=SRC(1,1)=SRC(2,0)= (t2 + 2*t3 + t4 + 2) >> 2;
2435     SRC(0,3)=SRC(1,2)=SRC(2,1)=SRC(3,0)= (t3 + 2*t4 + t5 + 2) >> 2;
2436     SRC(0,4)=SRC(1,3)=SRC(2,2)=SRC(3,1)=SRC(4,0)= (t4 + 2*t5 + t6 + 2) >> 2;
2437     SRC(0,5)=SRC(1,4)=SRC(2,3)=SRC(3,2)=SRC(4,1)=SRC(5,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2438     SRC(0,6)=SRC(1,5)=SRC(2,4)=SRC(3,3)=SRC(4,2)=SRC(5,1)=SRC(6,0)= (t6 + 2*t7 + t8 + 2) >> 2;
2439     SRC(0,7)=SRC(1,6)=SRC(2,5)=SRC(3,4)=SRC(4,3)=SRC(5,2)=SRC(6,1)=SRC(7,0)= (t7 + 2*t8 + t9 + 2) >> 2;
2440     SRC(1,7)=SRC(2,6)=SRC(3,5)=SRC(4,4)=SRC(5,3)=SRC(6,2)=SRC(7,1)= (t8 + 2*t9 + t10 + 2) >> 2;
2441     SRC(2,7)=SRC(3,6)=SRC(4,5)=SRC(5,4)=SRC(6,3)=SRC(7,2)= (t9 + 2*t10 + t11 + 2) >> 2;
2442     SRC(3,7)=SRC(4,6)=SRC(5,5)=SRC(6,4)=SRC(7,3)= (t10 + 2*t11 + t12 + 2) >> 2;
2443     SRC(4,7)=SRC(5,6)=SRC(6,5)=SRC(7,4)= (t11 + 2*t12 + t13 + 2) >> 2;
2444     SRC(5,7)=SRC(6,6)=SRC(7,5)= (t12 + 2*t13 + t14 + 2) >> 2;
2445     SRC(6,7)=SRC(7,6)= (t13 + 2*t14 + t15 + 2) >> 2;
2446     SRC(7,7)= (t14 + 3*t15 + 2) >> 2;
2447 }
2448 static void pred8x8l_down_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2449 {
2450     PREDICT_8x8_LOAD_TOP;
2451     PREDICT_8x8_LOAD_LEFT;
2452     PREDICT_8x8_LOAD_TOPLEFT;
2453     SRC(0,7)= (l7 + 2*l6 + l5 + 2) >> 2;
2454     SRC(0,6)=SRC(1,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2455     SRC(0,5)=SRC(1,6)=SRC(2,7)= (l5 + 2*l4 + l3 + 2) >> 2;
2456     SRC(0,4)=SRC(1,5)=SRC(2,6)=SRC(3,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2457     SRC(0,3)=SRC(1,4)=SRC(2,5)=SRC(3,6)=SRC(4,7)= (l3 + 2*l2 + l1 + 2) >> 2;
2458     SRC(0,2)=SRC(1,3)=SRC(2,4)=SRC(3,5)=SRC(4,6)=SRC(5,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2459     SRC(0,1)=SRC(1,2)=SRC(2,3)=SRC(3,4)=SRC(4,5)=SRC(5,6)=SRC(6,7)= (l1 + 2*l0 + lt + 2) >> 2;
2460     SRC(0,0)=SRC(1,1)=SRC(2,2)=SRC(3,3)=SRC(4,4)=SRC(5,5)=SRC(6,6)=SRC(7,7)= (l0 + 2*lt + t0 + 2) >> 2;
2461     SRC(1,0)=SRC(2,1)=SRC(3,2)=SRC(4,3)=SRC(5,4)=SRC(6,5)=SRC(7,6)= (lt + 2*t0 + t1 + 2) >> 2;
2462     SRC(2,0)=SRC(3,1)=SRC(4,2)=SRC(5,3)=SRC(6,4)=SRC(7,5)= (t0 + 2*t1 + t2 + 2) >> 2;
2463     SRC(3,0)=SRC(4,1)=SRC(5,2)=SRC(6,3)=SRC(7,4)= (t1 + 2*t2 + t3 + 2) >> 2;
2464     SRC(4,0)=SRC(5,1)=SRC(6,2)=SRC(7,3)= (t2 + 2*t3 + t4 + 2) >> 2;
2465     SRC(5,0)=SRC(6,1)=SRC(7,2)= (t3 + 2*t4 + t5 + 2) >> 2;
2466     SRC(6,0)=SRC(7,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2467     SRC(7,0)= (t5 + 2*t6 + t7 + 2) >> 2;
2468
2469 }
2470 static void pred8x8l_vertical_right_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2471 {
2472     PREDICT_8x8_LOAD_TOP;
2473     PREDICT_8x8_LOAD_LEFT;
2474     PREDICT_8x8_LOAD_TOPLEFT;
2475     SRC(0,6)= (l5 + 2*l4 + l3 + 2) >> 2;
2476     SRC(0,7)= (l6 + 2*l5 + l4 + 2) >> 2;
2477     SRC(0,4)=SRC(1,6)= (l3 + 2*l2 + l1 + 2) >> 2;
2478     SRC(0,5)=SRC(1,7)= (l4 + 2*l3 + l2 + 2) >> 2;
2479     SRC(0,2)=SRC(1,4)=SRC(2,6)= (l1 + 2*l0 + lt + 2) >> 2;
2480     SRC(0,3)=SRC(1,5)=SRC(2,7)= (l2 + 2*l1 + l0 + 2) >> 2;
2481     SRC(0,1)=SRC(1,3)=SRC(2,5)=SRC(3,7)= (l0 + 2*lt + t0 + 2) >> 2;
2482     SRC(0,0)=SRC(1,2)=SRC(2,4)=SRC(3,6)= (lt + t0 + 1) >> 1;
2483     SRC(1,1)=SRC(2,3)=SRC(3,5)=SRC(4,7)= (lt + 2*t0 + t1 + 2) >> 2;
2484     SRC(1,0)=SRC(2,2)=SRC(3,4)=SRC(4,6)= (t0 + t1 + 1) >> 1;
2485     SRC(2,1)=SRC(3,3)=SRC(4,5)=SRC(5,7)= (t0 + 2*t1 + t2 + 2) >> 2;
2486     SRC(2,0)=SRC(3,2)=SRC(4,4)=SRC(5,6)= (t1 + t2 + 1) >> 1;
2487     SRC(3,1)=SRC(4,3)=SRC(5,5)=SRC(6,7)= (t1 + 2*t2 + t3 + 2) >> 2;
2488     SRC(3,0)=SRC(4,2)=SRC(5,4)=SRC(6,6)= (t2 + t3 + 1) >> 1;
2489     SRC(4,1)=SRC(5,3)=SRC(6,5)=SRC(7,7)= (t2 + 2*t3 + t4 + 2) >> 2;
2490     SRC(4,0)=SRC(5,2)=SRC(6,4)=SRC(7,6)= (t3 + t4 + 1) >> 1;
2491     SRC(5,1)=SRC(6,3)=SRC(7,5)= (t3 + 2*t4 + t5 + 2) >> 2;
2492     SRC(5,0)=SRC(6,2)=SRC(7,4)= (t4 + t5 + 1) >> 1;
2493     SRC(6,1)=SRC(7,3)= (t4 + 2*t5 + t6 + 2) >> 2;
2494     SRC(6,0)=SRC(7,2)= (t5 + t6 + 1) >> 1;
2495     SRC(7,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2496     SRC(7,0)= (t6 + t7 + 1) >> 1;
2497 }
2498 static void pred8x8l_horizontal_down_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2499 {
2500     PREDICT_8x8_LOAD_TOP;
2501     PREDICT_8x8_LOAD_LEFT;
2502     PREDICT_8x8_LOAD_TOPLEFT;
2503     SRC(0,7)= (l6 + l7 + 1) >> 1;
2504     SRC(1,7)= (l5 + 2*l6 + l7 + 2) >> 2;
2505     SRC(0,6)=SRC(2,7)= (l5 + l6 + 1) >> 1;
2506     SRC(1,6)=SRC(3,7)= (l4 + 2*l5 + l6 + 2) >> 2;
2507     SRC(0,5)=SRC(2,6)=SRC(4,7)= (l4 + l5 + 1) >> 1;
2508     SRC(1,5)=SRC(3,6)=SRC(5,7)= (l3 + 2*l4 + l5 + 2) >> 2;
2509     SRC(0,4)=SRC(2,5)=SRC(4,6)=SRC(6,7)= (l3 + l4 + 1) >> 1;
2510     SRC(1,4)=SRC(3,5)=SRC(5,6)=SRC(7,7)= (l2 + 2*l3 + l4 + 2) >> 2;
2511     SRC(0,3)=SRC(2,4)=SRC(4,5)=SRC(6,6)= (l2 + l3 + 1) >> 1;
2512     SRC(1,3)=SRC(3,4)=SRC(5,5)=SRC(7,6)= (l1 + 2*l2 + l3 + 2) >> 2;
2513     SRC(0,2)=SRC(2,3)=SRC(4,4)=SRC(6,5)= (l1 + l2 + 1) >> 1;
2514     SRC(1,2)=SRC(3,3)=SRC(5,4)=SRC(7,5)= (l0 + 2*l1 + l2 + 2) >> 2;
2515     SRC(0,1)=SRC(2,2)=SRC(4,3)=SRC(6,4)= (l0 + l1 + 1) >> 1;
2516     SRC(1,1)=SRC(3,2)=SRC(5,3)=SRC(7,4)= (lt + 2*l0 + l1 + 2) >> 2;
2517     SRC(0,0)=SRC(2,1)=SRC(4,2)=SRC(6,3)= (lt + l0 + 1) >> 1;
2518     SRC(1,0)=SRC(3,1)=SRC(5,2)=SRC(7,3)= (l0 + 2*lt + t0 + 2) >> 2;
2519     SRC(2,0)=SRC(4,1)=SRC(6,2)= (t1 + 2*t0 + lt + 2) >> 2;
2520     SRC(3,0)=SRC(5,1)=SRC(7,2)= (t2 + 2*t1 + t0 + 2) >> 2;
2521     SRC(4,0)=SRC(6,1)= (t3 + 2*t2 + t1 + 2) >> 2;
2522     SRC(5,0)=SRC(7,1)= (t4 + 2*t3 + t2 + 2) >> 2;
2523     SRC(6,0)= (t5 + 2*t4 + t3 + 2) >> 2;
2524     SRC(7,0)= (t6 + 2*t5 + t4 + 2) >> 2;
2525 }
2526 static void pred8x8l_vertical_left_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2527 {
2528     PREDICT_8x8_LOAD_TOP;
2529     PREDICT_8x8_LOAD_TOPRIGHT;
2530     SRC(0,0)= (t0 + t1 + 1) >> 1;
2531     SRC(0,1)= (t0 + 2*t1 + t2 + 2) >> 2;
2532     SRC(0,2)=SRC(1,0)= (t1 + t2 + 1) >> 1;
2533     SRC(0,3)=SRC(1,1)= (t1 + 2*t2 + t3 + 2) >> 2;
2534     SRC(0,4)=SRC(1,2)=SRC(2,0)= (t2 + t3 + 1) >> 1;
2535     SRC(0,5)=SRC(1,3)=SRC(2,1)= (t2 + 2*t3 + t4 + 2) >> 2;
2536     SRC(0,6)=SRC(1,4)=SRC(2,2)=SRC(3,0)= (t3 + t4 + 1) >> 1;
2537     SRC(0,7)=SRC(1,5)=SRC(2,3)=SRC(3,1)= (t3 + 2*t4 + t5 + 2) >> 2;
2538     SRC(1,6)=SRC(2,4)=SRC(3,2)=SRC(4,0)= (t4 + t5 + 1) >> 1;
2539     SRC(1,7)=SRC(2,5)=SRC(3,3)=SRC(4,1)= (t4 + 2*t5 + t6 + 2) >> 2;
2540     SRC(2,6)=SRC(3,4)=SRC(4,2)=SRC(5,0)= (t5 + t6 + 1) >> 1;
2541     SRC(2,7)=SRC(3,5)=SRC(4,3)=SRC(5,1)= (t5 + 2*t6 + t7 + 2) >> 2;
2542     SRC(3,6)=SRC(4,4)=SRC(5,2)=SRC(6,0)= (t6 + t7 + 1) >> 1;
2543     SRC(3,7)=SRC(4,5)=SRC(5,3)=SRC(6,1)= (t6 + 2*t7 + t8 + 2) >> 2;
2544     SRC(4,6)=SRC(5,4)=SRC(6,2)=SRC(7,0)= (t7 + t8 + 1) >> 1;
2545     SRC(4,7)=SRC(5,5)=SRC(6,3)=SRC(7,1)= (t7 + 2*t8 + t9 + 2) >> 2;
2546     SRC(5,6)=SRC(6,4)=SRC(7,2)= (t8 + t9 + 1) >> 1;
2547     SRC(5,7)=SRC(6,5)=SRC(7,3)= (t8 + 2*t9 + t10 + 2) >> 2;
2548     SRC(6,6)=SRC(7,4)= (t9 + t10 + 1) >> 1;
2549     SRC(6,7)=SRC(7,5)= (t9 + 2*t10 + t11 + 2) >> 2;
2550     SRC(7,6)= (t10 + t11 + 1) >> 1;
2551     SRC(7,7)= (t10 + 2*t11 + t12 + 2) >> 2;
2552 }
2553 static void pred8x8l_horizontal_up_c(uint8_t *src, int has_topleft, int has_topright, int stride)
2554 {
2555     PREDICT_8x8_LOAD_LEFT;
2556     SRC(0,0)= (l0 + l1 + 1) >> 1;
2557     SRC(1,0)= (l0 + 2*l1 + l2 + 2) >> 2;
2558     SRC(0,1)=SRC(2,0)= (l1 + l2 + 1) >> 1;
2559     SRC(1,1)=SRC(3,0)= (l1 + 2*l2 + l3 + 2) >> 2;
2560     SRC(0,2)=SRC(2,1)=SRC(4,0)= (l2 + l3 + 1) >> 1;
2561     SRC(1,2)=SRC(3,1)=SRC(5,0)= (l2 + 2*l3 + l4 + 2) >> 2;
2562     SRC(0,3)=SRC(2,2)=SRC(4,1)=SRC(6,0)= (l3 + l4 + 1) >> 1;
2563     SRC(1,3)=SRC(3,2)=SRC(5,1)=SRC(7,0)= (l3 + 2*l4 + l5 + 2) >> 2;
2564     SRC(0,4)=SRC(2,3)=SRC(4,2)=SRC(6,1)= (l4 + l5 + 1) >> 1;
2565     SRC(1,4)=SRC(3,3)=SRC(5,2)=SRC(7,1)= (l4 + 2*l5 + l6 + 2) >> 2;
2566     SRC(0,5)=SRC(2,4)=SRC(4,3)=SRC(6,2)= (l5 + l6 + 1) >> 1;
2567     SRC(1,5)=SRC(3,4)=SRC(5,3)=SRC(7,2)= (l5 + 2*l6 + l7 + 2) >> 2;
2568     SRC(0,6)=SRC(2,5)=SRC(4,4)=SRC(6,3)= (l6 + l7 + 1) >> 1;
2569     SRC(1,6)=SRC(3,5)=SRC(5,4)=SRC(7,3)= (l6 + 3*l7 + 2) >> 2;
2570     SRC(0,7)=SRC(1,7)=SRC(2,6)=SRC(2,7)=SRC(3,6)=
2571     SRC(3,7)=SRC(4,5)=SRC(4,6)=SRC(4,7)=SRC(5,5)=
2572     SRC(5,6)=SRC(5,7)=SRC(6,4)=SRC(6,5)=SRC(6,6)=
2573     SRC(6,7)=SRC(7,4)=SRC(7,5)=SRC(7,6)=SRC(7,7)= l7;
2574 }
/* The prediction helper macros are only meant for the pred8x8l_* functions
 * above; drop them so their short names (SRC, PL, PT, ...) do not leak into
 * the rest of the file. */
2575 #undef PREDICT_8x8_LOAD_LEFT
2576 #undef PREDICT_8x8_LOAD_TOP
2577 #undef PREDICT_8x8_LOAD_TOPLEFT
2578 #undef PREDICT_8x8_LOAD_TOPRIGHT
2579 #undef PREDICT_8x8_DC
2580 #undef PTR
2581 #undef PT
2582 #undef PL
2583 #undef SRC
2584
2585 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2586                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2587                            int src_x_offset, int src_y_offset,
2588                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2589     MpegEncContext * const s = &h->s;
2590     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2591     const int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2592     const int luma_xy= (mx&3) + ((my&3)<<2);
2593     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*s->linesize;
2594     uint8_t * src_cb= pic->data[1] + (mx>>3) + (my>>3)*s->uvlinesize;
2595     uint8_t * src_cr= pic->data[2] + (mx>>3) + (my>>3)*s->uvlinesize;
2596     int extra_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; //FIXME increase edge?, IMHO not worth it
2597     int extra_height= extra_width;
2598     int emu=0;
2599     const int full_mx= mx>>2;
2600     const int full_my= my>>2;
2601     const int pic_width  = 16*s->mb_width;
2602     const int pic_height = 16*s->mb_height;
2603
2604     if(!pic->data[0])
2605         return;
2606
2607     if(mx&7) extra_width -= 3;
2608     if(my&7) extra_height -= 3;
2609
2610     if(   full_mx < 0-extra_width
2611        || full_my < 0-extra_height
2612        || full_mx + 16/*FIXME*/ > pic_width + extra_width
2613        || full_my + 16/*FIXME*/ > pic_height + extra_height){
2614         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*s->linesize, s->linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, pic_width, pic_height);
2615             src_y= s->edge_emu_buffer + 2 + 2*s->linesize;
2616         emu=1;
2617     }
2618
2619     qpix_op[luma_xy](dest_y, src_y, s->linesize); //FIXME try variable height perhaps?
2620     if(!square){
2621         qpix_op[luma_xy](dest_y + delta, src_y + delta, s->linesize);
2622     }
2623
2624     if(s->flags&CODEC_FLAG_GRAY) return;
2625
2626     if(emu){
2627         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2628             src_cb= s->edge_emu_buffer;
2629     }
2630     chroma_op(dest_cb, src_cb, s->uvlinesize, chroma_height, mx&7, my&7);
2631
2632     if(emu){
2633         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), pic_width>>1, pic_height>>1);
2634             src_cr= s->edge_emu_buffer;
2635     }
2636     chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
2637 }
2638
2639 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2640                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2641                            int x_offset, int y_offset,
2642                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2643                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2644                            int list0, int list1){
2645     MpegEncContext * const s = &h->s;
2646     qpel_mc_func *qpix_op=  qpix_put;
2647     h264_chroma_mc_func chroma_op= chroma_put;
2648
2649     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2650     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2651     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2652     x_offset += 8*s->mb_x;
2653     y_offset += 8*s->mb_y;
2654
2655     if(list0){
2656         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2657         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2658                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2659                            qpix_op, chroma_op);
2660
2661         qpix_op=  qpix_avg;
2662         chroma_op= chroma_avg;
2663     }
2664
2665     if(list1){
2666         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2667         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2668                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2669                            qpix_op, chroma_op);
2670     }
2671 }
2672
2673 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2674                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2675                            int x_offset, int y_offset,
2676                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2677                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2678                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2679                            int list0, int list1){
2680     MpegEncContext * const s = &h->s;
2681
2682     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2683     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2684     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2685     x_offset += 8*s->mb_x;
2686     y_offset += 8*s->mb_y;
2687
2688     if(list0 && list1){
2689         /* don't optimize for luma-only case, since B-frames usually
2690          * use implicit weights => chroma too. */
2691         uint8_t *tmp_cb = s->obmc_scratchpad;
2692         uint8_t *tmp_cr = tmp_cb + 8*s->uvlinesize;
2693         uint8_t *tmp_y  = tmp_cr + 8*s->uvlinesize;
2694         int refn0 = h->ref_cache[0][ scan8[n] ];
2695         int refn1 = h->ref_cache[1][ scan8[n] ];
2696
2697         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2698                     dest_y, dest_cb, dest_cr,
2699                     x_offset, y_offset, qpix_put, chroma_put);
2700         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2701                     tmp_y, tmp_cb, tmp_cr,
2702                     x_offset, y_offset, qpix_put, chroma_put);
2703
2704         if(h->use_weight == 2){
2705             int weight0 = h->implicit_weight[refn0][refn1];
2706             int weight1 = 64 - weight0;
2707             luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0);
2708             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0);
2709             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0);
2710         }else{
2711             luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
2712                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2713                             h->luma_offset[0][refn0] + h->luma_offset[1][refn1]);
2714             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2715                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2716                             h->chroma_offset[0][refn0][0] + h->chroma_offset[1][refn1][0]);
2717             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2718                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2719                             h->chroma_offset[0][refn0][1] + h->chroma_offset[1][refn1][1]);
2720         }
2721     }else{
2722         int list = list1 ? 1 : 0;
2723         int refn = h->ref_cache[list][ scan8[n] ];
2724         Picture *ref= &h->ref_list[list][refn];
2725         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2726                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2727                     qpix_put, chroma_put);
2728
2729         luma_weight_op(dest_y, s->linesize, h->luma_log2_weight_denom,
2730                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2731         if(h->use_weight_chroma){
2732             chroma_weight_op(dest_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2733                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2734             chroma_weight_op(dest_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2735                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2736         }
2737     }
2738 }
2739
2740 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2741                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2742                            int x_offset, int y_offset,
2743                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2744                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2745                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2746                            int list0, int list1){
2747     if((h->use_weight==2 && list0 && list1
2748         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2749        || h->use_weight==1)
2750         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2751                          x_offset, y_offset, qpix_put, chroma_put,
2752                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2753     else
2754         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2755                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2756 }
2757
2758 static inline void prefetch_motion(H264Context *h, int list){
2759     /* fetch pixels for estimated mv 4 macroblocks ahead
2760      * optimized for 64byte cache lines */
2761     MpegEncContext * const s = &h->s;
2762     const int refn = h->ref_cache[list][scan8[0]];
2763     if(refn >= 0){
2764         const int mx= (h->mv_cache[list][scan8[0]][0]>>2) + 16*s->mb_x + 8;
2765         const int my= (h->mv_cache[list][scan8[0]][1]>>2) + 16*s->mb_y;
2766         uint8_t **src= h->ref_list[list][refn].data;
2767         int off= mx + (my + (s->mb_x&3)*4)*s->linesize + 64;
2768         s->dsp.prefetch(src[0]+off, s->linesize, 4);
2769         off= (mx>>1) + ((my>>1) + (s->mb_x&7))*s->uvlinesize + 64;
2770         s->dsp.prefetch(src[1]+off, src[2]-src[1], 2);
2771     }
2772 }
2773
2774 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2775                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2776                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2777                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2778     MpegEncContext * const s = &h->s;
2779     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2780     const int mb_type= s->current_picture.mb_type[mb_xy];
2781
2782     assert(IS_INTER(mb_type));
2783
2784     prefetch_motion(h, 0);
2785
2786     if(IS_16X16(mb_type)){
2787         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2788                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2789                 &weight_op[0], &weight_avg[0],
2790                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2791     }else if(IS_16X8(mb_type)){
2792         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2793                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2794                 &weight_op[1], &weight_avg[1],
2795                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2796         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2797                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2798                 &weight_op[1], &weight_avg[1],
2799                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2800     }else if(IS_8X16(mb_type)){
2801         mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
2802                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2803                 &weight_op[2], &weight_avg[2],
2804                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2805         mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
2806                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2807                 &weight_op[2], &weight_avg[2],
2808                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2809     }else{
2810         int i;
2811
2812         assert(IS_8X8(mb_type));
2813
2814         for(i=0; i<4; i++){
2815             const int sub_mb_type= h->sub_mb_type[i];
2816             const int n= 4*i;
2817             int x_offset= (i&1)<<2;
2818             int y_offset= (i&2)<<1;
2819
2820             if(IS_SUB_8X8(sub_mb_type)){
2821                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2822                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2823                     &weight_op[3], &weight_avg[3],
2824                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2825             }else if(IS_SUB_8X4(sub_mb_type)){
2826                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2827                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2828                     &weight_op[4], &weight_avg[4],
2829                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2830                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2831                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2832                     &weight_op[4], &weight_avg[4],
2833                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2834             }else if(IS_SUB_4X8(sub_mb_type)){
2835                 mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2836                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2837                     &weight_op[5], &weight_avg[5],
2838                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2839                 mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2840                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2841                     &weight_op[5], &weight_avg[5],
2842                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2843             }else{
2844                 int j;
2845                 assert(IS_SUB_4X4(sub_mb_type));
2846                 for(j=0; j<4; j++){
2847                     int sub_x_offset= x_offset + 2*(j&1);
2848                     int sub_y_offset= y_offset +   (j&2);
2849                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2850                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2851                         &weight_op[6], &weight_avg[6],
2852                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2853                 }
2854             }
2855         }
2856     }
2857
2858     prefetch_motion(h, 1);
2859 }
2860
2861 static void decode_init_vlc(H264Context *h){
2862     static int done = 0;
2863
2864     if (!done) {
2865         int i;
2866         done = 1;
2867
2868         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2869                  &chroma_dc_coeff_token_len [0], 1, 1,
2870                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2871
2872         for(i=0; i<4; i++){
2873             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2874                      &coeff_token_len [i][0], 1, 1,
2875                      &coeff_token_bits[i][0], 1, 1, 1);
2876         }
2877
2878         for(i=0; i<3; i++){
2879             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2880                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2881                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2882         }
2883         for(i=0; i<15; i++){
2884             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2885                      &total_zeros_len [i][0], 1, 1,
2886                      &total_zeros_bits[i][0], 1, 1, 1);
2887         }
2888
2889         for(i=0; i<6; i++){
2890             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2891                      &run_len [i][0], 1, 1,
2892                      &run_bits[i][0], 1, 1, 1);
2893         }
2894         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2895                  &run_len [6][0], 1, 1,
2896                  &run_bits[6][0], 1, 1, 1);
2897     }
2898 }
2899
2900 /**
2901  * Sets the intra prediction function pointers.
2902  */
2903 static void init_pred_ptrs(H264Context *h){
2904 //    MpegEncContext * const s = &h->s;
2905
2906     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2907     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2908     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2909     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2910     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2911     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2912     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2913     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2914     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2915     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2916     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2917     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2918
2919     h->pred8x8l[VERT_PRED           ]= pred8x8l_vertical_c;
2920     h->pred8x8l[HOR_PRED            ]= pred8x8l_horizontal_c;
2921     h->pred8x8l[DC_PRED             ]= pred8x8l_dc_c;
2922     h->pred8x8l[DIAG_DOWN_LEFT_PRED ]= pred8x8l_down_left_c;
2923     h->pred8x8l[DIAG_DOWN_RIGHT_PRED]= pred8x8l_down_right_c;
2924     h->pred8x8l[VERT_RIGHT_PRED     ]= pred8x8l_vertical_right_c;
2925     h->pred8x8l[HOR_DOWN_PRED       ]= pred8x8l_horizontal_down_c;
2926     h->pred8x8l[VERT_LEFT_PRED      ]= pred8x8l_vertical_left_c;
2927     h->pred8x8l[HOR_UP_PRED         ]= pred8x8l_horizontal_up_c;
2928     h->pred8x8l[LEFT_DC_PRED        ]= pred8x8l_left_dc_c;
2929     h->pred8x8l[TOP_DC_PRED         ]= pred8x8l_top_dc_c;
2930     h->pred8x8l[DC_128_PRED         ]= pred8x8l_128_dc_c;
2931
2932     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
2933     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
2934     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
2935     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
2936     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
2937     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
2938     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
2939
2940     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
2941     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
2942     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
2943     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
2944     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
2945     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
2946     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
2947 }
2948
2949 static void free_tables(H264Context *h){
2950     av_freep(&h->intra4x4_pred_mode);
2951     av_freep(&h->chroma_pred_mode_table);
2952     av_freep(&h->cbp_table);
2953     av_freep(&h->mvd_table[0]);
2954     av_freep(&h->mvd_table[1]);
2955     av_freep(&h->direct_table);
2956     av_freep(&h->non_zero_count);
2957     av_freep(&h->slice_table_base);
2958     av_freep(&h->top_borders[1]);
2959     av_freep(&h->top_borders[0]);
2960     h->slice_table= NULL;
2961
2962     av_freep(&h->mb2b_xy);
2963     av_freep(&h->mb2b8_xy);
2964
2965     av_freep(&h->s.obmc_scratchpad);
2966 }
2967
2968 static void init_dequant8_coeff_table(H264Context *h){
2969     int i,q,x;
2970     const int transpose = (h->s.dsp.h264_idct8_add != ff_h264_idct8_add_c); //FIXME ugly
2971     h->dequant8_coeff[0] = h->dequant8_buffer[0];
2972     h->dequant8_coeff[1] = h->dequant8_buffer[1];
2973
2974     for(i=0; i<2; i++ ){
2975         if(i && !memcmp(h->pps.scaling_matrix8[0], h->pps.scaling_matrix8[1], 64*sizeof(uint8_t))){
2976             h->dequant8_coeff[1] = h->dequant8_buffer[0];
2977             break;
2978         }
2979
2980         for(q=0; q<52; q++){
2981             int shift = div6[q];
2982             int idx = rem6[q];
2983             for(x=0; x<64; x++)
2984                 h->dequant8_coeff[i][q][transpose ? (x>>3)|((x&7)<<3) : x] =
2985                     ((uint32_t)dequant8_coeff_init[idx][ dequant8_coeff_init_scan[((x>>1)&12) | (x&3)] ] *
2986                     h->pps.scaling_matrix8[i][x]) << shift;
2987         }
2988     }
2989 }
2990
2991 static void init_dequant4_coeff_table(H264Context *h){
2992     int i,j,q,x;
2993     const int transpose = (h->s.dsp.h264_idct_add != ff_h264_idct_add_c); //FIXME ugly
2994     for(i=0; i<6; i++ ){
2995         h->dequant4_coeff[i] = h->dequant4_buffer[i];
2996         for(j=0; j<i; j++){
2997             if(!memcmp(h->pps.scaling_matrix4[j], h->pps.scaling_matrix4[i], 16*sizeof(uint8_t))){
2998                 h->dequant4_coeff[i] = h->dequant4_buffer[j];
2999                 break;
3000             }
3001         }
3002         if(j<i)
3003             continue;
3004
3005         for(q=0; q<52; q++){
3006             int shift = div6[q] + 2;
3007             int idx = rem6[q];
3008             for(x=0; x<16; x++)
3009                 h->dequant4_coeff[i][q][transpose ? (x>>2)|((x<<2)&0xF) : x] =
3010                     ((uint32_t)dequant4_coeff_init[idx][(x&1) + ((x>>2)&1)] *
3011                     h->pps.scaling_matrix4[i][x]) << shift;
3012         }
3013     }
3014 }
3015
3016 static void init_dequant_tables(H264Context *h){
3017     int i,x;
3018     init_dequant4_coeff_table(h);
3019     if(h->pps.transform_8x8_mode)
3020         init_dequant8_coeff_table(h);
3021     if(h->sps.transform_bypass){
3022         for(i=0; i<6; i++)
3023             for(x=0; x<16; x++)
3024                 h->dequant4_coeff[i][0][x] = 1<<6;
3025         if(h->pps.transform_8x8_mode)
3026             for(i=0; i<2; i++)
3027                 for(x=0; x<64; x++)
3028                     h->dequant8_coeff[i][0][x] = 1<<6;
3029     }
3030 }
3031
3032
3033 /**
3034  * allocates tables.
3035  * needs width/height
3036  */
3037 static int alloc_tables(H264Context *h){
3038     MpegEncContext * const s = &h->s;
3039     const int big_mb_num= s->mb_stride * (s->mb_height+1);
3040     int x,y;
3041
3042     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
3043
3044     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
3045     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
3046     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3047     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
3048     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
3049
3050     if( h->pps.cabac ) {
3051         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
3052         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
3053         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
3054         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
3055     }
3056
3057     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
3058     h->slice_table= h->slice_table_base + s->mb_stride + 1;
3059
3060     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
3061     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
3062     for(y=0; y<s->mb_height; y++){
3063         for(x=0; x<s->mb_width; x++){
3064             const int mb_xy= x + y*s->mb_stride;
3065             const int b_xy = 4*x + 4*y*h->b_stride;
3066             const int b8_xy= 2*x + 2*y*h->b8_stride;
3067
3068             h->mb2b_xy [mb_xy]= b_xy;
3069             h->mb2b8_xy[mb_xy]= b8_xy;
3070         }
3071     }
3072
3073     s->obmc_scratchpad = NULL;
3074
3075     if(!h->dequant4_coeff[0])
3076         init_dequant_tables(h);
3077
3078     return 0;
3079 fail:
3080     free_tables(h);
3081     return -1;
3082 }
3083
3084 static void common_init(H264Context *h){
3085     MpegEncContext * const s = &h->s;
3086
3087     s->width = s->avctx->width;
3088     s->height = s->avctx->height;
3089     s->codec_id= s->avctx->codec->id;
3090
3091     init_pred_ptrs(h);
3092
3093     h->dequant_coeff_pps= -1;
3094     s->unrestricted_mv=1;
3095     s->decode=1; //FIXME
3096
3097     memset(h->pps.scaling_matrix4, 16, 6*16*sizeof(uint8_t));
3098     memset(h->pps.scaling_matrix8, 16, 2*64*sizeof(uint8_t));
3099 }
3100
3101 static int decode_init(AVCodecContext *avctx){
3102     H264Context *h= avctx->priv_data;
3103     MpegEncContext * const s = &h->s;
3104
3105     MPV_decode_defaults(s);
3106
3107     s->avctx = avctx;
3108     common_init(h);
3109
3110     s->out_format = FMT_H264;
3111     s->workaround_bugs= avctx->workaround_bugs;
3112
3113     // set defaults
3114 //    s->decode_mb= ff_h263_decode_mb;
3115     s->low_delay= 1;
3116     avctx->pix_fmt= PIX_FMT_YUV420P;
3117
3118     decode_init_vlc(h);
3119
3120     if(avctx->extradata_size > 0 && avctx->extradata &&
3121        *(char *)avctx->extradata == 1){
3122         h->is_avc = 1;
3123         h->got_avcC = 0;
3124     } else {
3125         h->is_avc = 0;
3126     }
3127
3128     return 0;
3129 }
3130
3131 static int frame_start(H264Context *h){
3132     MpegEncContext * const s = &h->s;
3133     int i;
3134
3135     if(MPV_frame_start(s, s->avctx) < 0)
3136         return -1;
3137     ff_er_frame_start(s);
3138
3139     assert(s->linesize && s->uvlinesize);
3140
3141     for(i=0; i<16; i++){
3142         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
3143         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
3144     }
3145     for(i=0; i<4; i++){
3146         h->block_offset[16+i]=
3147         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3148         h->block_offset[24+16+i]=
3149         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
3150     }
3151
3152     /* can't be in alloc_tables because linesize isn't known there.
3153      * FIXME: redo bipred weight to not require extra buffer? */
3154     if(!s->obmc_scratchpad)
3155         s->obmc_scratchpad = av_malloc(16*s->linesize + 2*8*s->uvlinesize);
3156
3157 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
3158     return 0;
3159 }
3160
3161 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3162     MpegEncContext * const s = &h->s;
3163     int i;
3164
3165     src_y  -=   linesize;
3166     src_cb -= uvlinesize;
3167     src_cr -= uvlinesize;
3168
3169     // There are two lines saved, the line above the the top macroblock of a pair,
3170     // and the line above the bottom macroblock
3171     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3172     for(i=1; i<17; i++){
3173         h->left_border[i]= src_y[15+i*  linesize];
3174     }
3175
3176     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
3177     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
3178
3179     if(!(s->flags&CODEC_FLAG_GRAY)){
3180         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
3181         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
3182         for(i=1; i<9; i++){
3183             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
3184             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
3185         }
3186         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
3187         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
3188     }
3189 }
3190
3191 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3192     MpegEncContext * const s = &h->s;
3193     int temp8, i;
3194     uint64_t temp64;
3195     int deblock_left = (s->mb_x > 0);
3196     int deblock_top  = (s->mb_y > 0);
3197
3198     src_y  -=   linesize + 1;
3199     src_cb -= uvlinesize + 1;
3200     src_cr -= uvlinesize + 1;
3201
3202 #define XCHG(a,b,t,xchg)\
3203 t= a;\
3204 if(xchg)\
3205     a= b;\
3206 b= t;
3207
3208     if(deblock_left){
3209         for(i = !deblock_top; i<17; i++){
3210             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3211         }
3212     }
3213
3214     if(deblock_top){
3215         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3216         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3217         if(s->mb_x+1 < s->mb_width){
3218             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x+1]), *(uint64_t*)(src_y +17), temp64, 1);
3219         }
3220     }
3221
3222     if(!(s->flags&CODEC_FLAG_GRAY)){
3223         if(deblock_left){
3224             for(i = !deblock_top; i<9; i++){
3225                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
3226                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
3227             }
3228         }
3229         if(deblock_top){
3230             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3231             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3232         }
3233     }
3234 }
3235
3236 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
3237     MpegEncContext * const s = &h->s;
3238     int i;
3239
3240     src_y  -= 2 *   linesize;
3241     src_cb -= 2 * uvlinesize;
3242     src_cr -= 2 * uvlinesize;
3243
3244     // There are two lines saved, the line above the the top macroblock of a pair,
3245     // and the line above the bottom macroblock
3246     h->left_border[0]= h->top_borders[0][s->mb_x][15];
3247     h->left_border[1]= h->top_borders[1][s->mb_x][15];
3248     for(i=2; i<34; i++){
3249         h->left_border[i]= src_y[15+i*  linesize];
3250     }
3251
3252     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
3253     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
3254     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
3255     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
3256
3257     if(!(s->flags&CODEC_FLAG_GRAY)){
3258         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
3259         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
3260         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
3261         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
3262         for(i=2; i<18; i++){
3263             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
3264             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
3265         }
3266         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
3267         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
3268         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
3269         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
3270     }
3271 }
3272
3273 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
3274     MpegEncContext * const s = &h->s;
3275     int temp8, i;
3276     uint64_t temp64;
3277     int deblock_left = (s->mb_x > 0);
3278     int deblock_top  = (s->mb_y > 0);
3279
3280     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
3281
3282     src_y  -= 2 *   linesize + 1;
3283     src_cb -= 2 * uvlinesize + 1;
3284     src_cr -= 2 * uvlinesize + 1;
3285
3286 #define XCHG(a,b,t,xchg)\
3287 t= a;\
3288 if(xchg)\
3289     a= b;\
3290 b= t;
3291
3292     if(deblock_left){
3293         for(i = (!deblock_top)<<1; i<34; i++){
3294             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
3295         }
3296     }
3297
3298     if(deblock_top){
3299         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
3300         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
3301         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
3302         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
3303     }
3304
3305     if(!(s->flags&CODEC_FLAG_GRAY)){
3306         if(deblock_left){
3307             for(i = (!deblock_top) << 1; i<18; i++){
3308                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
3309                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
3310             }
3311         }
3312         if(deblock_top){
3313             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
3314             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
3315             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
3316             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
3317         }
3318     }
3319 }
3320
/**
 * Reconstructs the current macroblock (s->mb_x, s->mb_y) into
 * s->current_picture.
 * Intra PCM samples are copied straight from h->mb. Otherwise the
 * macroblock is predicted (intra prediction or hl_motion() for inter
 * macroblocks), the residual stored in h->mb is added through the
 * IDCT function pointers selected below (or the SVQ3 variants when the
 * codec is not H.264), and finally, if h->deblocking_filter is set, the
 * macroblock (or the whole macroblock pair for MBAFF) is passed to
 * filter_mb().
 * Does nothing when s->decode is 0.
 */
3321 static void hl_decode_mb(H264Context *h){
3322     MpegEncContext * const s = &h->s;
3323     const int mb_x= s->mb_x;
3324     const int mb_y= s->mb_y;
3325     const int mb_xy= mb_x + mb_y*s->mb_stride;
3326     const int mb_type= s->current_picture.mb_type[mb_xy];
3327     uint8_t  *dest_y, *dest_cb, *dest_cr;
3328     int linesize, uvlinesize /*dct_offset*/;
3329     int i;
3330     int *block_offset = &h->block_offset[0];
         /* nonzero for the lower macroblock of a vertical pair (odd mb_y) */
3331     const unsigned int bottom = mb_y & 1;
         /* bypass is only active at qscale 0 and when the SPS allows it */
3332     const int transform_bypass = (s->qscale == 0 && h->sps.transform_bypass);
3333     void (*idct_add)(uint8_t *dst, DCTELEM *block, int stride);
3334     void (*idct_dc_add)(uint8_t *dst, DCTELEM *block, int stride);
3335
3336     if(!s->decode)
3337         return;
3338
         /* top-left pixel of the macroblock in each plane (chroma planes are half size) */
3339     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3340     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3341     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3342
         /* field macroblock: every second picture line belongs to it, so the
          * strides are doubled and the block offsets computed for the doubled
          * stride (block_offset[24..]) are used */
3343     if (h->mb_field_decoding_flag) {
3344         linesize = s->linesize * 2;
3345         uvlinesize = s->uvlinesize * 2;
3346         block_offset = &h->block_offset[24];
             /* the bottom macroblock starts on the second line of the pair */
3347         if(mb_y&1){ //FIXME move out of this func?
3348             dest_y -= s->linesize*15;
3349             dest_cb-= s->uvlinesize*7;
3350             dest_cr-= s->uvlinesize*7;
3351         }
3352     } else {
3353         linesize = s->linesize;
3354         uvlinesize = s->uvlinesize;
3355 //        dct_offset = s->linesize * 16;
3356     }
3357
         /* select how the luma residual is added: plain pixel addition for
          * transform bypass, otherwise the 8x8 or 4x4 IDCT */
3358     if(transform_bypass){
3359         idct_dc_add =
3360         idct_add = IS_8x8DCT(mb_type) ? s->dsp.add_pixels8 : s->dsp.add_pixels4;
3361     }else if(IS_8x8DCT(mb_type)){
3362         idct_dc_add = s->dsp.h264_idct8_dc_add;
3363         idct_add = s->dsp.h264_idct8_add;
3364     }else{
3365         idct_dc_add = s->dsp.h264_idct_dc_add;
3366         idct_add = s->dsp.h264_idct_add;
3367     }
3368
3369     if (IS_INTRA_PCM(mb_type)) {
3370         unsigned int x, y;
3371
3372         // The pixels are stored in h->mb array in the same order as levels,
3373         // copy them in output in the correct order.
             /* blocks 0..15 are luma, 16..19 cb and 20..23 cr */
3374         for(i=0; i<16; i++) {
3375             for (y=0; y<4; y++) {
3376                 for (x=0; x<4; x++) {
3377                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
3378                 }
3379             }
3380         }
3381         for(i=16; i<16+4; i++) {
3382             for (y=0; y<4; y++) {
3383                 for (x=0; x<4; x++) {
3384                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3385                 }
3386             }
3387         }
3388         for(i=20; i<20+4; i++) {
3389             for (y=0; y<4; y++) {
3390                 for (x=0; x<4; x++) {
3391                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
3392                 }
3393             }
3394         }
3395     } else {
3396         if(IS_INTRA(mb_type)){
                 /* with deblocking enabled the neighbouring pixels in the picture
                  * may already be filtered; put the border pixels saved by
                  * backup_mb_border()/backup_pair_border() (saved right before
                  * filter_mb() runs) into the picture for the prediction.
                  * For MBAFF this is done once per pair, on the top macroblock. */
3397             if(h->deblocking_filter) {
3398                 if (h->mb_aff_frame) {
3399                     if (!bottom)
3400                         xchg_pair_border(h, dest_y, dest_cb, dest_cr, s->linesize, s->uvlinesize, 1);
3401                 } else {
3402                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
3403                 }
3404             }
3405
                 /* chroma intra prediction (skipped in gray-only mode) */
3406             if(!(s->flags&CODEC_FLAG_GRAY)){
3407                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
3408                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
3409             }
3410
3411             if(IS_INTRA4x4(mb_type)){
3412                 if(!s->encoding){
                         /* intra NxN: each block is predicted and its residual
                          * added before the next block is predicted */
3413                     if(IS_8x8DCT(mb_type)){
                             /* four 8x8 blocks, addressed by their first 4x4 block */
3414                         for(i=0; i<16; i+=4){
3415                             uint8_t * const ptr= dest_y + block_offset[i];
3416                             const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3417                             const int nnz = h->non_zero_count_cache[ scan8[i] ];
3418                             h->pred8x8l[ dir ](ptr, (h->topleft_samples_available<<i)&0x8000,
3419                                                    (h->topright_samples_available<<(i+1))&0x8000, linesize);
3420                             if(nnz){
                                     /* a single coefficient that is the DC one: cheaper DC-only add */
3421                                 if(nnz == 1 && h->mb[i*16])
3422                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3423                                 else
3424                                     idct_add(ptr, h->mb + i*16, linesize);
3425                             }
3426                         }
3427                     }else
3428                     for(i=0; i<16; i++){
3429                         uint8_t * const ptr= dest_y + block_offset[i];
3430                         uint8_t *topright;
3431                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3432                         int nnz, tr;
3433
                             /* only these two modes are given top-right samples */
3434                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3435                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3436                             assert(mb_y || linesize <= block_offset[i]);
3437                             if(!topright_avail){
                                     /* not available: replicate the pixel above the
                                      * block's last column four times */
3438                                 tr= ptr[3 - linesize]*0x01010101;
3439                                 topright= (uint8_t*) &tr;
3440                             }else
3441                                 topright= ptr + 4 - linesize;
3442                         }else
3443                             topright= NULL;
3444
3445                         h->pred4x4[ dir ](ptr, topright, linesize);
3446                         nnz = h->non_zero_count_cache[ scan8[i] ];
3447                         if(nnz){
3448                             if(s->codec_id == CODEC_ID_H264){
3449                                 if(nnz == 1 && h->mb[i*16])
3450                                     idct_dc_add(ptr, h->mb + i*16, linesize);
3451                                 else
3452                                     idct_add(ptr, h->mb + i*16, linesize);
3453                             }else
3454                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3455                         }
3456                     }
3457                 }
3458             }else{
                     /* intra 16x16: predict the whole macroblock, then dequantize and
                      * transform the luma DC coefficients; the per-block residual is
                      * added in the !IS_INTRA4x4 section further down */
3459                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3460                 if(s->codec_id == CODEC_ID_H264){
3461                     if(!transform_bypass)
3462                         h264_luma_dc_dequant_idct_c(h->mb, s->qscale, h->dequant4_coeff[IS_INTRA(mb_type) ? 0:3][s->qscale][0]);
3463                 }else
3464                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3465             }
                 /* undo the border exchange done before the prediction (xchg=0);
                  * for MBAFF once per pair, after the bottom macroblock, with
                  * s->mb_y temporarily set to the top macroblock of the pair */
3466             if(h->deblocking_filter) {
3467                 if (h->mb_aff_frame) {
3468                     if (bottom) {
3469                         uint8_t *pair_dest_y  = s->current_picture.data[0] + ((mb_y-1) * 16* s->linesize  ) + mb_x * 16;
3470                         uint8_t *pair_dest_cb = s->current_picture.data[1] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3471                         uint8_t *pair_dest_cr = s->current_picture.data[2] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3472                         s->mb_y--;
3473                         xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3474                         s->mb_y++;
3475                     }
3476                 } else {
3477                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3478                 }
3479             }
3480         }else if(s->codec_id == CODEC_ID_H264){
                 /* inter macroblock: motion compensated prediction */
3481             hl_motion(h, dest_y, dest_cb, dest_cr,
3482                       s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
3483                       s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
3484                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3485         }
3486
3487
             /* luma residual for everything except intra 4x4/8x8, which already
              * added its residual block by block above */
3488         if(!IS_INTRA4x4(mb_type)){
3489             if(s->codec_id == CODEC_ID_H264){
3490                 if(IS_INTRA16x16(mb_type)){
3491                     for(i=0; i<16; i++){
                             /* a block without counted coefficients can still carry a
                              * DC value (h->mb[i*16]), hence the else branch */
3492                         if(h->non_zero_count_cache[ scan8[i] ])
3493                             idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3494                         else if(h->mb[i*16])
3495                             idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3496                     }
3497                 }else{
                         /* step over four 4x4 blocks at a time for the 8x8 transform */
3498                     const int di = IS_8x8DCT(mb_type) ? 4 : 1;
3499                     for(i=0; i<16; i+=di){
3500                         int nnz = h->non_zero_count_cache[ scan8[i] ];
3501                         if(nnz){
3502                             if(nnz==1 && h->mb[i*16])
3503                                 idct_dc_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3504                             else
3505                                 idct_add(dest_y + block_offset[i], h->mb + i*16, linesize);
3506                         }
3507                     }
3508                 }
3509             }else{
3510                 for(i=0; i<16; i++){
3511                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3512                         uint8_t * const ptr= dest_y + block_offset[i];
3513                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3514                     }
3515                 }
3516             }
3517         }
3518
             /* chroma residual: blocks 16..19 go to cb, 20..23 to cr
              * (selected through bit 2 of the block index) */
3519         if(!(s->flags&CODEC_FLAG_GRAY)){
3520             uint8_t *dest[2] = {dest_cb, dest_cr};
3521             if(transform_bypass){
3522                 idct_add = idct_dc_add = s->dsp.add_pixels4;
3523             }else{
                     /* chroma always uses the 4x4 transform; the DC coefficients of
                      * each chroma plane are dequantized and transformed first */
3524                 idct_add = s->dsp.h264_idct_add;
3525                 idct_dc_add = s->dsp.h264_idct_dc_add;
3526                 chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 1:4][h->chroma_qp][0]);
3527                 chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp, h->dequant4_coeff[IS_INTRA(mb_type) ? 2:5][h->chroma_qp][0]);
3528             }
3529             if(s->codec_id == CODEC_ID_H264){
3530                 for(i=16; i<16+8; i++){
3531                     if(h->non_zero_count_cache[ scan8[i] ])
3532                         idct_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3533                     else if(h->mb[i*16])
3534                         idct_dc_add(dest[(i&4)>>2] + block_offset[i], h->mb + i*16, uvlinesize);
3535                 }
3536             }else{
3537                 for(i=16; i<16+8; i++){
3538                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3539                         uint8_t * const ptr= dest[(i&4)>>2] + block_offset[i];
3540                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3541                     }
3542                 }
3543             }
3544         }
3545     }
         /* in-loop deblocking: the unfiltered borders are saved first so that
          * later intra prediction can use them (see the xchg_* calls above) */
3546     if(h->deblocking_filter) {
3547         if (h->mb_aff_frame) {
                 /* NOTE: the locals below shadow the outer mb_y/mb_xy and describe
                  * the macroblock one row above the current one, i.e. the top
                  * macroblock of the pair once bottom is set */
3548             const int mb_y = s->mb_y - 1;
3549             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3550             const int mb_xy= mb_x + mb_y*s->mb_stride;
3551             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3552             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
                 /* tmp is only used by the trace checks below to detect whether
                  * filtering modified one fixed chroma sample */
3553             uint8_t tmp = s->current_picture.data[1][384];
                 /* a pair is filtered as a whole, after its bottom macroblock */
3554             if (!bottom) return;
3555             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3556             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3557             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3558
3559             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3560             // TODO deblock a pair
3561             // top
                 /* s->mb_y is switched to the top macroblock while its caches are
                  * filled and it is filtered, then restored for the bottom one */
3562             s->mb_y--;
3563             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3564             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3565             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3566             if (tmp != s->current_picture.data[1][384]) {
3567                 tprintf("modified pixel 8,1 (1)\n");
3568             }
3569             // bottom
3570             s->mb_y++;
3571             tprintf("call mbaff filter_mb\n");
3572             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3573             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3574             if (tmp != s->current_picture.data[1][384]) {
3575                 tprintf("modified pixel 8,1 (2)\n");
3576             }
3577         } else {
3578             tprintf("call filter_mb\n");
3579             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3580             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3581             filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3582         }
3583     }
3584 }
3585
3586 /**
3587  * fills the default_ref_list.
3588  */
3589 static int fill_default_ref_list(H264Context *h){
3590     MpegEncContext * const s = &h->s;
3591     int i;
3592     int smallest_poc_greater_than_current = -1;
3593     Picture sorted_short_ref[32];
3594
3595     if(h->slice_type==B_TYPE){
3596         int out_i;
3597         int limit= INT_MIN;
3598
3599         /* sort frame according to poc in B slice */
3600         for(out_i=0; out_i<h->short_ref_count; out_i++){
3601             int best_i=INT_MIN;
3602             int best_poc=INT_MAX;
3603
3604             for(i=0; i<h->short_ref_count; i++){
3605                 const int poc= h->short_ref[i]->poc;
3606                 if(poc > limit && poc < best_poc){
3607                     best_poc= poc;
3608                     best_i= i;
3609                 }
3610             }
3611
3612             assert(best_i != INT_MIN);
3613
3614             limit= best_poc;
3615             sorted_short_ref[out_i]= *h->short_ref[best_i];
3616             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3617             if (-1 == smallest_poc_greater_than_current) {
3618                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3619                     smallest_poc_greater_than_current = out_i;
3620                 }
3621             }
3622         }
3623     }
3624
3625     if(s->picture_structure == PICT_FRAME){
3626         if(h->slice_type==B_TYPE){
3627             int list;
3628             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3629
3630             // find the largest poc
3631             for(list=0; list<2; list++){
3632                 int index = 0;
3633                 int j= -99;
3634                 int step= list ? -1 : 1;
3635
3636                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3637                     while(j<0 || j>= h->short_ref_count){
3638                         if(j != -99 && step == (list ? -1 : 1))
3639                             return -1;
3640                         step = -step;
3641                         j= smallest_poc_greater_than_current + (step>>1);
3642                     }
3643                     if(sorted_short_ref[j].reference != 3) continue;
3644                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3645                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3646                 }
3647
3648                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3649                     if(h->long_ref[i] == NULL) continue;
3650                     if(h->long_ref[i]->reference != 3) continue;
3651
3652                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3653                     h->default_ref_list[ list ][index++].pic_id= i;;
3654                 }
3655
3656                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3657                     // swap the two first elements of L1 when
3658                     // L0 and L1 are identical
3659                     Picture temp= h->default_ref_list[1][0];
3660                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3661                     h->default_ref_list[1][1] = temp;
3662                 }
3663
3664                 if(index < h->ref_count[ list ])
3665                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3666             }
3667         }else{
3668             int index=0;
3669             for(i=0; i<h->short_ref_count; i++){
3670                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3671                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3672                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3673             }
3674             for(i = 0; i < 16; i++){
3675                 if(h->long_ref[i] == NULL) continue;
3676                 if(h->long_ref[i]->reference != 3) continue;
3677                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3678                 h->default_ref_list[0][index++].pic_id= i;;
3679             }
3680             if(index < h->ref_count[0])
3681                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3682         }
3683     }else{ //FIELD
3684         if(h->slice_type==B_TYPE){
3685         }else{
3686             //FIXME second field balh
3687         }
3688     }
3689 #ifdef TRACE
3690     for (i=0; i<h->ref_count[0]; i++) {
3691         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3692     }
3693     if(h->slice_type==B_TYPE){
3694         for (i=0; i<h->ref_count[1]; i++) {
3695             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3696         }
3697     }
3698 #endif
3699     return 0;
3700 }
3701
3702 static void print_short_term(H264Context *h);
3703 static void print_long_term(H264Context *h);
3704
3705 static int decode_ref_pic_list_reordering(H264Context *h){
3706     MpegEncContext * const s = &h->s;
3707     int list, index;
3708
3709     print_short_term(h);
3710     print_long_term(h);
3711     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3712
3713     for(list=0; list<2; list++){
3714         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3715
3716         if(get_bits1(&s->gb)){
3717             int pred= h->curr_pic_num;
3718
3719             for(index=0; ; index++){
3720                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3721                 int pic_id;
3722                 int i;
3723                 Picture *ref = NULL;
3724
3725                 if(reordering_of_pic_nums_idc==3)
3726                     break;
3727
3728                 if(index >= h->ref_count[list]){
3729                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3730                     return -1;
3731                 }
3732
3733                 if(reordering_of_pic_nums_idc<3){
3734                     if(reordering_of_pic_nums_idc<2){
3735                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3736
3737                         if(abs_diff_pic_num >= h->max_pic_num){
3738                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3739                             return -1;
3740                         }
3741
3742                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3743                         else                                pred+= abs_diff_pic_num;
3744                         pred &= h->max_pic_num - 1;
3745
3746                         for(i= h->short_ref_count-1; i>=0; i--){
3747                             ref = h->short_ref[i];
3748                             assert(ref->reference == 3);
3749                             assert(!ref->long_ref);
3750                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3751                                 break;
3752                         }
3753                         if(i>=0)
3754                             ref->pic_id= ref->frame_num;
3755                     }else{
3756                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3757                         ref = h->long_ref[pic_id];
3758                         ref->pic_id= pic_id;
3759                         assert(ref->reference == 3);
3760                         assert(ref->long_ref);
3761                         i=0;
3762                     }
3763
3764                     if (i < 0) {
3765                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3766                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3767                     } else {
3768                         for(i=index; i+1<h->ref_count[list]; i++){
3769                             if(ref->long_ref == h->ref_list[list][i].long_ref && ref->pic_id == h->ref_list[list][i].pic_id)
3770                                 break;
3771                         }
3772                         for(; i > index; i--){
3773                             h->ref_list[list][i]= h->ref_list[list][i-1];
3774                         }
3775                         h->ref_list[list][index]= *ref;
3776                     }
3777                 }else{
3778                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3779                     return -1;
3780                 }
3781             }
3782         }
3783
3784         if(h->slice_type!=B_TYPE) break;
3785     }
3786     for(list=0; list<2; list++){
3787         for(index= 0; index < h->ref_count[list]; index++){
3788             if(!h->ref_list[list][index].data[0])
3789                 h->ref_list[list][index]= s->current_picture;
3790         }
3791         if(h->slice_type!=B_TYPE) break;
3792     }
3793
3794     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3795         direct_dist_scale_factor(h);
3796     direct_ref_list_init(h);
3797     return 0;
3798 }
3799
3800 static int pred_weight_table(H264Context *h){
3801     MpegEncContext * const s = &h->s;
3802     int list, i;
3803     int luma_def, chroma_def;
3804
3805     h->use_weight= 0;
3806     h->use_weight_chroma= 0;
3807     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3808     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3809     luma_def = 1<<h->luma_log2_weight_denom;
3810     chroma_def = 1<<h->chroma_log2_weight_denom;
3811
3812     for(list=0; list<2; list++){
3813         for(i=0; i<h->ref_count[list]; i++){
3814             int luma_weight_flag, chroma_weight_flag;
3815
3816             luma_weight_flag= get_bits1(&s->gb);
3817             if(luma_weight_flag){
3818                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3819                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3820                 if(   h->luma_weight[list][i] != luma_def
3821                    || h->luma_offset[list][i] != 0)
3822                     h->use_weight= 1;
3823             }else{
3824                 h->luma_weight[list][i]= luma_def;
3825                 h->luma_offset[list][i]= 0;
3826             }
3827
3828             chroma_weight_flag= get_bits1(&s->gb);
3829             if(chroma_weight_flag){
3830                 int j;
3831                 for(j=0; j<2; j++){
3832                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3833                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3834                     if(   h->chroma_weight[list][i][j] != chroma_def
3835                        || h->chroma_offset[list][i][j] != 0)
3836                         h->use_weight_chroma= 1;
3837                 }
3838             }else{
3839                 int j;
3840                 for(j=0; j<2; j++){
3841                     h->chroma_weight[list][i][j]= chroma_def;
3842                     h->chroma_offset[list][i][j]= 0;
3843                 }
3844             }
3845         }
3846         if(h->slice_type != B_TYPE) break;
3847     }
3848     h->use_weight= h->use_weight || h->use_weight_chroma;
3849     return 0;
3850 }
3851
3852 static void implicit_weight_table(H264Context *h){
3853     MpegEncContext * const s = &h->s;
3854     int ref0, ref1;
3855     int cur_poc = s->current_picture_ptr->poc;
3856
3857     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3858        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3859         h->use_weight= 0;
3860         h->use_weight_chroma= 0;
3861         return;
3862     }
3863
3864     h->use_weight= 2;
3865     h->use_weight_chroma= 2;
3866     h->luma_log2_weight_denom= 5;
3867     h->chroma_log2_weight_denom= 5;
3868
3869     /* FIXME: MBAFF */
3870     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3871         int poc0 = h->ref_list[0][ref0].poc;
3872         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3873             int poc1 = h->ref_list[1][ref1].poc;
3874             int td = clip(poc1 - poc0, -128, 127);
3875             if(td){
3876                 int tb = clip(cur_poc - poc0, -128, 127);
3877                 int tx = (16384 + (ABS(td) >> 1)) / td;
3878                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3879                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3880                     h->implicit_weight[ref0][ref1] = 32;
3881                 else
3882                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3883             }else
3884                 h->implicit_weight[ref0][ref1] = 32;
3885         }
3886     }
3887 }
3888
3889 static inline void unreference_pic(H264Context *h, Picture *pic){
3890     int i;
3891     pic->reference=0;
3892     if(pic == h->delayed_output_pic)
3893         pic->reference=1;
3894     else{
3895         for(i = 0; h->delayed_pic[i]; i++)
3896             if(pic == h->delayed_pic[i]){
3897                 pic->reference=1;
3898                 break;
3899             }
3900     }
3901 }
3902
3903 /**
3904  * instantaneous decoder refresh.
3905  */
3906 static void idr(H264Context *h){
3907     int i;
3908
3909     for(i=0; i<16; i++){
3910         if (h->long_ref[i] != NULL) {
3911             unreference_pic(h, h->long_ref[i]);
3912             h->long_ref[i]= NULL;
3913         }
3914     }
3915     h->long_ref_count=0;
3916
3917     for(i=0; i<h->short_ref_count; i++){
3918         unreference_pic(h, h->short_ref[i]);
3919         h->short_ref[i]= NULL;
3920     }
3921     h->short_ref_count=0;
3922 }
3923
3924 /* forget old pics after a seek */
3925 static void flush_dpb(AVCodecContext *avctx){
3926     H264Context *h= avctx->priv_data;
3927     int i;
3928     for(i=0; i<16; i++) {
3929         if(h->delayed_pic[i])
3930             h->delayed_pic[i]->reference= 0;
3931         h->delayed_pic[i]= NULL;
3932     }
3933     if(h->delayed_output_pic)
3934         h->delayed_output_pic->reference= 0;
3935     h->delayed_output_pic= NULL;
3936     idr(h);
3937     if(h->s.current_picture_ptr)
3938         h->s.current_picture_ptr->reference= 0;
3939 }
3940
3941 /**
3942  *
3943  * @return the removed picture or NULL if an error occurs
3944  */
3945 static Picture * remove_short(H264Context *h, int frame_num){
3946     MpegEncContext * const s = &h->s;
3947     int i;
3948
3949     if(s->avctx->debug&FF_DEBUG_MMCO)
3950         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3951
3952     for(i=0; i<h->short_ref_count; i++){
3953         Picture *pic= h->short_ref[i];
3954         if(s->avctx->debug&FF_DEBUG_MMCO)
3955             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3956         if(pic->frame_num == frame_num){
3957             h->short_ref[i]= NULL;
3958             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3959             h->short_ref_count--;
3960             return pic;
3961         }
3962     }
3963     return NULL;
3964 }
3965
3966 /**
3967  *
3968  * @return the removed picture or NULL if an error occurs
3969  */
3970 static Picture * remove_long(H264Context *h, int i){
3971     Picture *pic;
3972
3973     pic= h->long_ref[i];
3974     h->long_ref[i]= NULL;
3975     if(pic) h->long_ref_count--;
3976
3977     return pic;
3978 }
3979
3980 /**
3981  * print short term list
3982  */
3983 static void print_short_term(H264Context *h) {
3984     uint32_t i;
3985     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3986         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3987         for(i=0; i<h->short_ref_count; i++){
3988             Picture *pic= h->short_ref[i];
3989             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3990         }
3991     }
3992 }
3993
3994 /**
3995  * print long term list
3996  */
3997 static void print_long_term(H264Context *h) {
3998     uint32_t i;
3999     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
4000         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
4001         for(i = 0; i < 16; i++){
4002             Picture *pic= h->long_ref[i];
4003             if (pic) {
4004                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
4005             }
4006         }
4007     }
4008 }
4009
4010 /**
4011  * Executes the reference picture marking (memory management control operations).
4012  */
4013 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
4014     MpegEncContext * const s = &h->s;
4015     int i, j;
4016     int current_is_long=0;
4017     Picture *pic;
4018
4019     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
4020         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
4021
4022     for(i=0; i<mmco_count; i++){
4023         if(s->avctx->debug&FF_DEBUG_MMCO)
4024             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
4025
4026         switch(mmco[i].opcode){
4027         case MMCO_SHORT2UNUSED:
4028             pic= remove_short(h, mmco[i].short_frame_num);
4029             if(pic)
4030                 unreference_pic(h, pic);
4031             else if(s->avctx->debug&FF_DEBUG_MMCO)
4032                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_short() failure\n");
4033             break;
4034         case MMCO_SHORT2LONG:
4035             pic= remove_long(h, mmco[i].long_index);
4036             if(pic) unreference_pic(h, pic);
4037
4038             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
4039             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4040             h->long_ref_count++;
4041             break;
4042         case MMCO_LONG2UNUSED:
4043             pic= remove_long(h, mmco[i].long_index);
4044             if(pic)
4045                 unreference_pic(h, pic);
4046             else if(s->avctx->debug&FF_DEBUG_MMCO)
4047                 av_log(h->s.avctx, AV_LOG_DEBUG, "mmco: remove_long() failure\n");
4048             break;
4049         case MMCO_LONG:
4050             pic= remove_long(h, mmco[i].long_index);
4051             if(pic) unreference_pic(h, pic);
4052
4053             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
4054             h->long_ref[ mmco[i].long_index ]->long_ref=1;
4055             h->long_ref_count++;
4056
4057             current_is_long=1;
4058             break;
4059         case MMCO_SET_MAX_LONG:
4060             assert(mmco[i].long_index <= 16);
4061             // just remove the long term which index is greater than new max
4062             for(j = mmco[i].long_index; j<16; j++){
4063                 pic = remove_long(h, j);
4064                 if (pic) unreference_pic(h, pic);
4065             }
4066             break;
4067         case MMCO_RESET:
4068             while(h->short_ref_count){
4069                 pic= remove_short(h, h->short_ref[0]->frame_num);
4070                 unreference_pic(h, pic);
4071             }
4072             for(j = 0; j < 16; j++) {
4073                 pic= remove_long(h, j);
4074                 if(pic) unreference_pic(h, pic);
4075             }
4076             break;
4077         default: assert(0);
4078         }
4079     }
4080
4081     if(!current_is_long){
4082         pic= remove_short(h, s->current_picture_ptr->frame_num);
4083         if(pic){
4084             unreference_pic(h, pic);
4085             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
4086         }
4087
4088         if(h->short_ref_count)
4089             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
4090
4091         h->short_ref[0]= s->current_picture_ptr;
4092         h->short_ref[0]->long_ref=0;
4093         h->short_ref_count++;
4094     }
4095
4096     print_short_term(h);
4097     print_long_term(h);
4098     return 0;
4099 }
4100
4101 static int decode_ref_pic_marking(H264Context *h){
4102     MpegEncContext * const s = &h->s;
4103     int i;
4104
4105     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
4106         s->broken_link= get_bits1(&s->gb) -1;
4107         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
4108         if(h->mmco[0].long_index == -1)
4109             h->mmco_index= 0;
4110         else{
4111             h->mmco[0].opcode= MMCO_LONG;
4112             h->mmco_index= 1;
4113         }
4114     }else{
4115         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
4116             for(i= 0; i<MAX_MMCO_COUNT; i++) {
4117                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
4118
4119                 h->mmco[i].opcode= opcode;
4120                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
4121                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
4122 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
4123                         av_log(s->avctx, AV_LOG_ERROR, "illegal short ref in memory management control operation %d\n", mmco);
4124                         return -1;
4125                     }*/
4126                 }
4127                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
4128                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
4129                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
4130                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
4131                         return -1;
4132                     }
4133                 }
4134
4135                 if(opcode > MMCO_LONG){
4136                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
4137                     return -1;
4138                 }
4139                 if(opcode == MMCO_END)
4140                     break;
4141             }
4142             h->mmco_index= i;
4143         }else{
4144             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
4145
4146             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
4147                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
4148                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
4149                 h->mmco_index= 1;
4150             }else
4151                 h->mmco_index= 0;
4152         }
4153     }
4154
4155     return 0;
4156 }
4157
4158 static int init_poc(H264Context *h){
4159     MpegEncContext * const s = &h->s;
4160     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
4161     int field_poc[2];
4162
4163     if(h->nal_unit_type == NAL_IDR_SLICE){
4164         h->frame_num_offset= 0;
4165     }else{
4166         if(h->frame_num < h->prev_frame_num)
4167             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
4168         else
4169             h->frame_num_offset= h->prev_frame_num_offset;
4170     }
4171
4172     if(h->sps.poc_type==0){
4173         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
4174
4175         if(h->nal_unit_type == NAL_IDR_SLICE){
4176              h->prev_poc_msb=
4177              h->prev_poc_lsb= 0;
4178         }
4179
4180         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
4181             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
4182         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
4183             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
4184         else
4185             h->poc_msb = h->prev_poc_msb;
4186 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
4187         field_poc[0] =
4188         field_poc[1] = h->poc_msb + h->poc_lsb;
4189         if(s->picture_structure == PICT_FRAME)
4190             field_poc[1] += h->delta_poc_bottom;
4191     }else if(h->sps.poc_type==1){
4192         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
4193         int i;
4194
4195         if(h->sps.poc_cycle_length != 0)
4196             abs_frame_num = h->frame_num_offset + h->frame_num;
4197         else
4198             abs_frame_num = 0;
4199
4200         if(h->nal_ref_idc==0 && abs_frame_num > 0)
4201             abs_frame_num--;
4202
4203         expected_delta_per_poc_cycle = 0;
4204         for(i=0; i < h->sps.poc_cycle_length; i++)
4205             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
4206
4207         if(abs_frame_num > 0){
4208             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
4209             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
4210
4211             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
4212             for(i = 0; i <= frame_num_in_poc_cycle; i++)
4213                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
4214         } else
4215             expectedpoc = 0;
4216
4217         if(h->nal_ref_idc == 0)
4218             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
4219
4220         field_poc[0] = expectedpoc + h->delta_poc[0];
4221         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
4222
4223         if(s->picture_structure == PICT_FRAME)
4224             field_poc[1] += h->delta_poc[1];
4225     }else{
4226         int poc;
4227         if(h->nal_unit_type == NAL_IDR_SLICE){
4228             poc= 0;
4229         }else{
4230             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
4231             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
4232         }
4233         field_poc[0]= poc;
4234         field_poc[1]= poc;
4235     }
4236
4237     if(s->picture_structure != PICT_BOTTOM_FIELD)
4238         s->current_picture_ptr->field_poc[0]= field_poc[0];
4239     if(s->picture_structure != PICT_TOP_FIELD)
4240         s->current_picture_ptr->field_poc[1]= field_poc[1];
4241     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
4242         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
4243
4244     return 0;
4245 }
4246
4247 /**
4248  * decodes a slice header.
4249  * this will also call MPV_common_init() and frame_start() as needed
4250  */
4251 static int decode_slice_header(H264Context *h){
4252     MpegEncContext * const s = &h->s;
4253     int first_mb_in_slice, pps_id;
4254     int num_ref_idx_active_override_flag;
4255     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
4256     int slice_type;
4257     int default_ref_list_done = 0;
4258
4259     s->current_picture.reference= h->nal_ref_idc != 0;
4260     s->dropable= h->nal_ref_idc == 0;
4261
4262     first_mb_in_slice= get_ue_golomb(&s->gb);
4263
4264     slice_type= get_ue_golomb(&s->gb);
4265     if(slice_type > 9){
4266         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
4267         return -1;
4268     }
4269     if(slice_type > 4){
4270         slice_type -= 5;
4271         h->slice_type_fixed=1;
4272     }else
4273         h->slice_type_fixed=0;
4274
4275     slice_type= slice_type_map[ slice_type ];
4276     if (slice_type == I_TYPE
4277         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
4278         default_ref_list_done = 1;
4279     }
4280     h->slice_type= slice_type;
4281
4282     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
4283
4284     pps_id= get_ue_golomb(&s->gb);
4285     if(pps_id>255){
4286         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
4287         return -1;
4288     }
4289     h->pps= h->pps_buffer[pps_id];
4290     if(h->pps.slice_group_count == 0){
4291         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
4292         return -1;
4293     }
4294
4295     h->sps= h->sps_buffer[ h->pps.sps_id ];
4296     if(h->sps.log2_max_frame_num == 0){
4297         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
4298         return -1;
4299     }
4300
4301     if(h->dequant_coeff_pps != pps_id){
4302         h->dequant_coeff_pps = pps_id;
4303         init_dequant_tables(h);
4304     }
4305
4306     s->mb_width= h->sps.mb_width;
4307     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
4308
4309     h->b_stride=  s->mb_width*4;
4310     h->b8_stride= s->mb_width*2;
4311
4312     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
4313     if(h->sps.frame_mbs_only_flag)
4314         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
4315     else
4316         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
4317
4318     if (s->context_initialized
4319         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
4320         free_tables(h);
4321         MPV_common_end(s);
4322     }
4323     if (!s->context_initialized) {
4324         if (MPV_common_init(s) < 0)
4325             return -1;
4326
4327         if(s->dsp.h264_idct_add == ff_h264_idct_add_c){ //FIXME little ugly
4328             memcpy(h->zigzag_scan, zigzag_scan, 16*sizeof(uint8_t));
4329             memcpy(h-> field_scan,  field_scan, 16*sizeof(uint8_t));
4330         }else{
4331             int i;
4332             for(i=0; i<16; i++){
4333 #define T(x) (x>>2) | ((x<<2) & 0xF)
4334                 h->zigzag_scan[i] = T(zigzag_scan[i]);
4335                 h-> field_scan[i] = T( field_scan[i]);
4336 #undef T
4337             }
4338         }
4339         if(s->dsp.h264_idct8_add == ff_h264_idct8_add_c){
4340             memcpy(h->zigzag_scan8x8, zigzag_scan8x8, 64*sizeof(uint8_t));
4341             memcpy(h->zigzag_scan8x8_cavlc, zigzag_scan8x8_cavlc, 64*sizeof(uint8_t));
4342         }else{
4343             int i;
4344             for(i=0; i<64; i++){
4345 #define T(x) (x>>3) | ((x&7)<<3)
4346                 h->zigzag_scan8x8[i] = T(zigzag_scan8x8[i]);
4347                 h->zigzag_scan8x8_cavlc[i] = T(zigzag_scan8x8_cavlc[i]);
4348 #undef T
4349             }
4350         }
4351         if(h->sps.transform_bypass){ //FIXME same ugly
4352             h->zigzag_scan_q0 = zigzag_scan;
4353             h->field_scan_q0 = field_scan;
4354             h->zigzag_scan8x8_q0 = zigzag_scan8x8;
4355             h->zigzag_scan8x8_cavlc_q0 = zigzag_scan8x8_cavlc;
4356         }else{
4357             h->zigzag_scan_q0 = h->zigzag_scan;
4358             h->field_scan_q0 = h->field_scan;
4359             h->zigzag_scan8x8_q0 = h->zigzag_scan8x8;
4360             h->zigzag_scan8x8_cavlc_q0 = h->zigzag_scan8x8_cavlc;
4361         }
4362
4363         alloc_tables(h);
4364
4365         s->avctx->width = s->width;
4366         s->avctx->height = s->height;
4367         s->avctx->sample_aspect_ratio= h->sps.sar;
4368         if(!s->avctx->sample_aspect_ratio.den)
4369             s->avctx->sample_aspect_ratio.den = 1;
4370
4371         if(h->sps.timing_info_present_flag){
4372             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick * 2, h->sps.time_scale};
4373             if(h->x264_build > 0 && h->x264_build < 44)
4374                 s->avctx->time_base.den *= 2;
4375             av_reduce(&s->avctx->time_base.num, &s->avctx->time_base.den,
4376                       s->avctx->time_base.num, s->avctx->time_base.den, 1<<30);
4377         }
4378     }
4379
4380     if(h->slice_num == 0){
4381         if(frame_start(h) < 0)
4382             return -1;
4383     }
4384
4385     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
4386     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
4387
4388     h->mb_aff_frame = 0;
4389     if(h->sps.frame_mbs_only_flag){
4390         s->picture_structure= PICT_FRAME;
4391     }else{
4392         if(get_bits1(&s->gb)) { //field_pic_flag
4393             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
4394         } else {
4395             s->picture_structure= PICT_FRAME;
4396             first_mb_in_slice <<= h->sps.mb_aff;
4397             h->mb_aff_frame = h->sps.mb_aff;
4398         }
4399     }
4400
4401     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
4402     s->resync_mb_y = s->mb_y = first_mb_in_slice / s->mb_width;
4403     if(s->mb_y >= s->mb_height){
4404         return -1;
4405     }
4406
4407     if(s->picture_structure==PICT_FRAME){
4408         h->curr_pic_num=   h->frame_num;
4409         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
4410     }else{
4411         h->curr_pic_num= 2*h->frame_num;
4412         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
4413     }
4414
4415     if(h->nal_unit_type == NAL_IDR_SLICE){
4416         get_ue_golomb(&s->gb); /* idr_pic_id */
4417     }
4418
4419     if(h->sps.poc_type==0){
4420         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
4421
4422         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
4423             h->delta_poc_bottom= get_se_golomb(&s->gb);
4424         }
4425     }
4426
4427     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
4428         h->delta_poc[0]= get_se_golomb(&s->gb);
4429
4430         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
4431             h->delta_poc[1]= get_se_golomb(&s->gb);
4432     }
4433
4434     init_poc(h);
4435
4436     if(h->pps.redundant_pic_cnt_present){
4437         h->redundant_pic_count= get_ue_golomb(&s->gb);
4438     }
4439
4440     //set defaults, might be overriden a few line later
4441     h->ref_count[0]= h->pps.ref_count[0];
4442     h->ref_count[1]= h->pps.ref_count[1];
4443
4444     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
4445         if(h->slice_type == B_TYPE){
4446             h->direct_spatial_mv_pred= get_bits1(&s->gb);
4447         }
4448         num_ref_idx_active_override_flag= get_bits1(&s->gb);
4449
4450         if(num_ref_idx_active_override_flag){
4451             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
4452             if(h->slice_type==B_TYPE)
4453                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
4454
4455             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
4456                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
4457                 return -1;
4458             }
4459         }
4460     }
4461
4462     if(!default_ref_list_done){
4463         fill_default_ref_list(h);
4464     }
4465
4466     if(decode_ref_pic_list_reordering(h) < 0)
4467         return -1;
4468
4469     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
4470        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
4471         pred_weight_table(h);
4472     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
4473         implicit_weight_table(h);
4474     else
4475         h->use_weight = 0;
4476
4477     if(s->current_picture.reference)
4478         decode_ref_pic_marking(h);
4479
4480     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
4481         h->cabac_init_idc = get_ue_golomb(&s->gb);
4482
4483     h->last_qscale_diff = 0;
4484     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
4485     if(s->qscale<0 || s->qscale>51){
4486         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
4487         return -1;
4488     }
4489     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4490     //FIXME qscale / qp ... stuff
4491     if(h->slice_type == SP_TYPE){
4492         get_bits1(&s->gb); /* sp_for_switch_flag */
4493     }
4494     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
4495         get_se_golomb(&s->gb); /* slice_qs_delta */
4496     }
4497
4498     h->deblocking_filter = 1;
4499     h->slice_alpha_c0_offset = 0;
4500     h->slice_beta_offset = 0;
4501     if( h->pps.deblocking_filter_parameters_present ) {
4502         h->deblocking_filter= get_ue_golomb(&s->gb);
4503         if(h->deblocking_filter < 2)
4504             h->deblocking_filter^= 1; // 1<->0
4505
4506         if( h->deblocking_filter ) {
4507             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
4508             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
4509         }
4510     }
4511     if(   s->avctx->skip_loop_filter >= AVDISCARD_ALL
4512        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONKEY && h->slice_type != I_TYPE)
4513        ||(s->avctx->skip_loop_filter >= AVDISCARD_BIDIR  && h->slice_type == B_TYPE)
4514        ||(s->avctx->skip_loop_filter >= AVDISCARD_NONREF && h->nal_ref_idc == 0))
4515         h->deblocking_filter= 0;
4516
4517 #if 0 //FMO
4518     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
4519         slice_group_change_cycle= get_bits(&s->gb, ?);
4520 #endif
4521
4522     h->slice_num++;
4523
4524     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4525         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4526                h->slice_num,
4527                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4528                first_mb_in_slice,
4529                av_get_pict_type_char(h->slice_type),
4530                pps_id, h->frame_num,
4531                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4532                h->ref_count[0], h->ref_count[1],
4533                s->qscale,
4534                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4535                h->use_weight,
4536                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4537                );
4538     }
4539
4540     return 0;
4541 }
4542
4543 /**
4544  *
4545  */
4546 static inline int get_level_prefix(GetBitContext *gb){
4547     unsigned int buf;
4548     int log;
4549
4550     OPEN_READER(re, gb);
4551     UPDATE_CACHE(re, gb);
4552     buf=GET_CACHE(re, gb);
4553
4554     log= 32 - av_log2(buf);
4555 #ifdef TRACE
4556     print_bin(buf>>(32-log), log);
4557     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4558 #endif
4559
4560     LAST_SKIP_BITS(re, gb, log);
4561     CLOSE_READER(re, gb);
4562
4563     return log-1;
4564 }
4565
4566 static inline int get_dct8x8_allowed(H264Context *h){
4567     int i;
4568     for(i=0; i<4; i++){
4569         if(!IS_SUB_8X8(h->sub_mb_type[i])
4570            || (!h->sps.direct_8x8_inference_flag && IS_DIRECT(h->sub_mb_type[i])))
4571             return 0;
4572     }
4573     return 1;
4574 }
4575
4576 /**
4577  * decodes a residual block.
4578  * @param n block index
4579  * @param scantable scantable
4580  * @param max_coeff number of coefficients in the block
4581  * @return <0 if an error occured
4582  */
4583 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff){
4584     MpegEncContext * const s = &h->s;
4585     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4586     int level[16];
4587     int zeros_left, coeff_num, coeff_token, total_coeff, i, j, trailing_ones, run_before;
4588
4589     //FIXME put trailing_onex into the context
4590
4591     if(n == CHROMA_DC_BLOCK_INDEX){
4592         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4593         total_coeff= coeff_token>>2;
4594     }else{
4595         if(n == LUMA_DC_BLOCK_INDEX){
4596             total_coeff= pred_non_zero_count(h, 0);
4597             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4598             total_coeff= coeff_token>>2;
4599         }else{
4600             total_coeff= pred_non_zero_count(h, n);
4601             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4602             total_coeff= coeff_token>>2;
4603             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4604         }
4605     }
4606
4607     //FIXME set last_non_zero?
4608
4609     if(total_coeff==0)
4610         return 0;
4611
4612     trailing_ones= coeff_token&3;
4613     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4614     assert(total_coeff<=16);
4615
4616     for(i=0; i<trailing_ones; i++){
4617         level[i]= 1 - 2*get_bits1(gb);
4618     }
4619
4620     if(i<total_coeff) {
4621         int level_code, mask;
4622         int suffix_length = total_coeff > 10 && trailing_ones < 3;
4623         int prefix= get_level_prefix(gb);
4624
4625         //first coefficient has suffix_length equal to 0 or 1
4626         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4627             if(suffix_length)
4628                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4629             else
4630                 level_code= (prefix<<suffix_length); //part
4631         }else if(prefix==14){
4632             if(suffix_length)
4633                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4634             else
4635                 level_code= prefix + get_bits(gb, 4); //part
4636         }else if(prefix==15){
4637             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4638             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4639         }else{
4640             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4641             return -1;
4642         }
4643
4644         if(trailing_ones < 3) level_code += 2;
4645
4646         suffix_length = 1;
4647         if(level_code > 5)
4648             suffix_length++;
4649         mask= -(level_code&1);
4650         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4651         i++;
4652
4653         //remaining coefficients have suffix_length > 0
4654         for(;i<total_coeff;i++) {
4655             static const int suffix_limit[7] = {0,5,11,23,47,95,INT_MAX };
4656             prefix = get_level_prefix(gb);
4657             if(prefix<15){
4658                 level_code = (prefix<<suffix_length) + get_bits(gb, suffix_length);
4659             }else if(prefix==15){
4660                 level_code =  (prefix<<suffix_length) + get_bits(gb, 12);
4661             }else{
4662                 av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4663                 return -1;
4664             }
4665             mask= -(level_code&1);
4666             level[i]= (((2+level_code)>>1) ^ mask) - mask;
4667             if(level_code > suffix_limit[suffix_length])
4668                 suffix_length++;
4669         }
4670     }
4671
4672     if(total_coeff == max_coeff)
4673         zeros_left=0;
4674     else{
4675         if(n == CHROMA_DC_BLOCK_INDEX)
4676             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4677         else
4678             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4679     }
4680
4681     coeff_num = zeros_left + total_coeff - 1;
4682     j = scantable[coeff_num];
4683     if(n > 24){
4684         block[j] = level[0];
4685         for(i=1;i<total_coeff;i++) {
4686             if(zeros_left <= 0)
4687                 run_before = 0;
4688             else if(zeros_left < 7){
4689                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4690             }else{
4691                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4692             }
4693             zeros_left -= run_before;
4694             coeff_num -= 1 + run_before;
4695             j= scantable[ coeff_num ];
4696
4697             block[j]= level[i];
4698         }
4699     }else{
4700         block[j] = (level[0] * qmul[j] + 32)>>6;
4701         for(i=1;i<total_coeff;i++) {
4702             if(zeros_left <= 0)
4703                 run_before = 0;
4704             else if(zeros_left < 7){
4705                 run_before= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4706             }else{
4707                 run_before= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4708             }
4709             zeros_left -= run_before;
4710             coeff_num -= 1 + run_before;
4711             j= scantable[ coeff_num ];
4712
4713             block[j]= (level[i] * qmul[j] + 32)>>6;
4714         }
4715     }
4716
4717     if(zeros_left<0){
4718         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4719         return -1;
4720     }
4721
4722     return 0;
4723 }
4724
4725 /**
4726  * decodes a P_SKIP or B_SKIP macroblock
4727  */
4728 static void decode_mb_skip(H264Context *h){
4729     MpegEncContext * const s = &h->s;
4730     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4731     int mb_type=0;
4732
4733     memset(h->non_zero_count[mb_xy], 0, 16);
4734     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4735
4736     if(h->mb_aff_frame && s->mb_skip_run==0 && (s->mb_y&1)==0){
4737         h->mb_field_decoding_flag= get_bits1(&s->gb);
4738     }
4739     if(h->mb_field_decoding_flag)
4740         mb_type|= MB_TYPE_INTERLACED;
4741
4742     if( h->slice_type == B_TYPE )
4743     {
4744         // just for fill_caches. pred_direct_motion will set the real mb_type
4745         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4746
4747         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4748         pred_direct_motion(h, &mb_type);
4749         if(h->pps.cabac){
4750             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4751             fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
4752         }
4753     }
4754     else
4755     {
4756         int mx, my;
4757         mb_type|= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4758
4759         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4760         pred_pskip_motion(h, &mx, &my);
4761         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4762         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4763         if(h->pps.cabac)
4764             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4765     }
4766
4767     write_back_motion(h, mb_type);
4768     s->current_picture.mb_type[mb_xy]= mb_type|MB_TYPE_SKIP;
4769     s->current_picture.qscale_table[mb_xy]= s->qscale;
4770     h->slice_table[ mb_xy ]= h->slice_num;
4771     h->prev_mb_skipped= 1;
4772 }
4773
4774 /**
4775  * decodes a macroblock
4776  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4777  */
4778 static int decode_mb_cavlc(H264Context *h){
4779     MpegEncContext * const s = &h->s;
4780     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4781     int mb_type, partition_count, cbp;
4782     int dct8x8_allowed= h->pps.transform_8x8_mode;
4783
4784     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4785
4786     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4787     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4788                 down the code */
4789     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4790         if(s->mb_skip_run==-1)
4791             s->mb_skip_run= get_ue_golomb(&s->gb);
4792
4793         if (s->mb_skip_run--) {
4794             decode_mb_skip(h);
4795             return 0;
4796         }
4797     }
4798     if(h->mb_aff_frame){
4799         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
4800             h->mb_field_decoding_flag = get_bits1(&s->gb);
4801     }else
4802         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4803
4804     h->prev_mb_skipped= 0;
4805
4806     mb_type= get_ue_golomb(&s->gb);
4807     if(h->slice_type == B_TYPE){
4808         if(mb_type < 23){
4809             partition_count= b_mb_type_info[mb_type].partition_count;
4810             mb_type=         b_mb_type_info[mb_type].type;
4811         }else{
4812             mb_type -= 23;
4813             goto decode_intra_mb;
4814         }
4815     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4816         if(mb_type < 5){
4817             partition_count= p_mb_type_info[mb_type].partition_count;
4818             mb_type=         p_mb_type_info[mb_type].type;
4819         }else{
4820             mb_type -= 5;
4821             goto decode_intra_mb;
4822         }
4823     }else{
4824        assert(h->slice_type == I_TYPE);
4825 decode_intra_mb:
4826         if(mb_type > 25){
4827             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4828             return -1;
4829         }
4830         partition_count=0;
4831         cbp= i_mb_type_info[mb_type].cbp;
4832         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4833         mb_type= i_mb_type_info[mb_type].type;
4834     }
4835
4836     if(h->mb_field_decoding_flag)
4837         mb_type |= MB_TYPE_INTERLACED;
4838
4839     h->slice_table[ mb_xy ]= h->slice_num;
4840
4841     if(IS_INTRA_PCM(mb_type)){
4842         unsigned int x, y;
4843
4844         // we assume these blocks are very rare so we dont optimize it
4845         align_get_bits(&s->gb);
4846
4847         // The pixels are stored in the same order as levels in h->mb array.
4848         for(y=0; y<16; y++){
4849             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4850             for(x=0; x<16; x++){
4851                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4852                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4853             }
4854         }
4855         for(y=0; y<8; y++){
4856             const int index= 256 + 4*(y&3) + 32*(y>>2);
4857             for(x=0; x<8; x++){
4858                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4859                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4860             }
4861         }
4862         for(y=0; y<8; y++){
4863             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4864             for(x=0; x<8; x++){
4865                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4866                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4867             }
4868         }
4869
4870         // In deblocking, the quantizer is 0
4871         s->current_picture.qscale_table[mb_xy]= 0;
4872         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
4873         // All coeffs are present
4874         memset(h->non_zero_count[mb_xy], 16, 16);
4875
4876         s->current_picture.mb_type[mb_xy]= mb_type;
4877         return 0;
4878     }
4879
4880     fill_caches(h, mb_type, 0);
4881
4882     //mb_pred
4883     if(IS_INTRA(mb_type)){
4884 //            init_top_left_availability(h);
4885             if(IS_INTRA4x4(mb_type)){
4886                 int i;
4887                 int di = 1;
4888                 if(dct8x8_allowed && get_bits1(&s->gb)){
4889                     mb_type |= MB_TYPE_8x8DCT;
4890                     di = 4;
4891                 }
4892
4893 //                fill_intra4x4_pred_table(h);
4894                 for(i=0; i<16; i+=di){
4895                     const int mode_coded= !get_bits1(&s->gb);
4896                     const int predicted_mode=  pred_intra_mode(h, i);
4897                     int mode;
4898
4899                     if(mode_coded){
4900                         const int rem_mode= get_bits(&s->gb, 3);
4901                         if(rem_mode<predicted_mode)
4902                             mode= rem_mode;
4903                         else
4904                             mode= rem_mode + 1;
4905                     }else{
4906                         mode= predicted_mode;
4907                     }
4908
4909                     if(di==4)
4910                         fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
4911                     else
4912                         h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4913                 }
4914                 write_back_intra_pred_mode(h);
4915                 if( check_intra4x4_pred_mode(h) < 0)
4916                     return -1;
4917             }else{
4918                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4919                 if(h->intra16x16_pred_mode < 0)
4920                     return -1;
4921             }
4922             h->chroma_pred_mode= get_ue_golomb(&s->gb);
4923
4924             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
4925             if(h->chroma_pred_mode < 0)
4926                 return -1;
4927     }else if(partition_count==4){
4928         int i, j, sub_partition_count[4], list, ref[2][4];
4929
4930         if(h->slice_type == B_TYPE){
4931             for(i=0; i<4; i++){
4932                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4933                 if(h->sub_mb_type[i] >=13){
4934                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4935                     return -1;
4936                 }
4937                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4938                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4939             }
4940             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4941                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
4942                 pred_direct_motion(h, &mb_type);
4943                 h->ref_cache[0][scan8[4]] =
4944                 h->ref_cache[1][scan8[4]] =
4945                 h->ref_cache[0][scan8[12]] =
4946                 h->ref_cache[1][scan8[12]] = PART_NOT_AVAILABLE;
4947             }
4948         }else{
4949             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4950             for(i=0; i<4; i++){
4951                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4952                 if(h->sub_mb_type[i] >=4){
4953                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4954                     return -1;
4955                 }
4956                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4957                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4958             }
4959         }
4960
4961         for(list=0; list<2; list++){
4962             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4963             if(ref_count == 0) continue;
4964             if (h->mb_aff_frame && h->mb_field_decoding_flag) {
4965                 ref_count <<= 1;
4966             }
4967             for(i=0; i<4; i++){
4968                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4969                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4970                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4971                 }else{
4972                  //FIXME
4973                     ref[list][i] = -1;
4974                 }
4975             }
4976         }
4977
4978         if(dct8x8_allowed)
4979             dct8x8_allowed = get_dct8x8_allowed(h);
4980
4981         for(list=0; list<2; list++){
4982             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4983             if(ref_count == 0) continue;
4984
4985             for(i=0; i<4; i++){
4986                 if(IS_DIRECT(h->sub_mb_type[i])) {
4987                     h->ref_cache[list][ scan8[4*i] ] = h->ref_cache[list][ scan8[4*i]+1 ];
4988                     continue;
4989                 }
4990                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4991                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4992
4993                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4994                     const int sub_mb_type= h->sub_mb_type[i];
4995                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4996                     for(j=0; j<sub_partition_count[i]; j++){
4997                         int mx, my;
4998                         const int index= 4*i + block_width*j;
4999                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5000                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
5001                         mx += get_se_golomb(&s->gb);
5002                         my += get_se_golomb(&s->gb);
5003                         tprintf("final mv:%d %d\n", mx, my);
5004
5005                         if(IS_SUB_8X8(sub_mb_type)){
5006                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5007                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5008                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5009                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5010                         }else if(IS_SUB_8X4(sub_mb_type)){
5011                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5012                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5013                         }else if(IS_SUB_4X8(sub_mb_type)){
5014                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5015                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5016                         }else{
5017                             assert(IS_SUB_4X4(sub_mb_type));
5018                             mv_cache[ 0 ][0]= mx;
5019                             mv_cache[ 0 ][1]= my;
5020                         }
5021                     }
5022                 }else{
5023                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5024                     p[0] = p[1]=
5025                     p[8] = p[9]= 0;
5026                 }
5027             }
5028         }
5029     }else if(IS_DIRECT(mb_type)){
5030         pred_direct_motion(h, &mb_type);
5031         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
5032     }else{
5033         int list, mx, my, i;
5034          //FIXME we should set ref_idx_l? to 0 if we use that later ...
5035         if(IS_16X16(mb_type)){
5036             for(list=0; list<2; list++){
5037                 if(h->ref_count[list]>0){
5038                     if(IS_DIR(mb_type, 0, list)){
5039                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5040                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
5041                     }else
5042                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
5043                 }
5044             }
5045             for(list=0; list<2; list++){
5046                 if(IS_DIR(mb_type, 0, list)){
5047                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
5048                     mx += get_se_golomb(&s->gb);
5049                     my += get_se_golomb(&s->gb);
5050                     tprintf("final mv:%d %d\n", mx, my);
5051
5052                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5053                 }else
5054                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5055             }
5056         }
5057         else if(IS_16X8(mb_type)){
5058             for(list=0; list<2; list++){
5059                 if(h->ref_count[list]>0){
5060                     for(i=0; i<2; i++){
5061                         if(IS_DIR(mb_type, i, list)){
5062                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5063                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
5064                         }else
5065                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5066                     }
5067                 }
5068             }
5069             for(list=0; list<2; list++){
5070                 for(i=0; i<2; i++){
5071                     if(IS_DIR(mb_type, i, list)){
5072                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
5073                         mx += get_se_golomb(&s->gb);
5074                         my += get_se_golomb(&s->gb);
5075                         tprintf("final mv:%d %d\n", mx, my);
5076
5077                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5078                     }else
5079                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5080                 }
5081             }
5082         }else{
5083             assert(IS_8X16(mb_type));
5084             for(list=0; list<2; list++){
5085                 if(h->ref_count[list]>0){
5086                     for(i=0; i<2; i++){
5087                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5088                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
5089                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
5090                         }else
5091                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5092                     }
5093                 }
5094             }
5095             for(list=0; list<2; list++){
5096                 for(i=0; i<2; i++){
5097                     if(IS_DIR(mb_type, i, list)){
5098                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
5099                         mx += get_se_golomb(&s->gb);
5100                         my += get_se_golomb(&s->gb);
5101                         tprintf("final mv:%d %d\n", mx, my);
5102
5103                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5104                     }else
5105                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5106                 }
5107             }
5108         }
5109     }
5110
5111     if(IS_INTER(mb_type))
5112         write_back_motion(h, mb_type);
5113
5114     if(!IS_INTRA16x16(mb_type)){
5115         cbp= get_ue_golomb(&s->gb);
5116         if(cbp > 47){
5117             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
5118             return -1;
5119         }
5120
5121         if(IS_INTRA4x4(mb_type))
5122             cbp= golomb_to_intra4x4_cbp[cbp];
5123         else
5124             cbp= golomb_to_inter_cbp[cbp];
5125     }
5126
5127     if(dct8x8_allowed && (cbp&15) && !IS_INTRA(mb_type)){
5128         if(get_bits1(&s->gb))
5129             mb_type |= MB_TYPE_8x8DCT;
5130     }
5131     s->current_picture.mb_type[mb_xy]= mb_type;
5132
5133     if(cbp || IS_INTRA16x16(mb_type)){
5134         int i8x8, i4x4, chroma_idx;
5135         int chroma_qp, dquant;
5136         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
5137         const uint8_t *scan, *scan8x8, *dc_scan;
5138
5139 //        fill_non_zero_count_cache(h);
5140
5141         if(IS_INTERLACED(mb_type)){
5142             scan= s->qscale ? h->field_scan : h->field_scan_q0;
5143             dc_scan= luma_dc_field_scan;
5144         }else{
5145             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
5146             dc_scan= luma_dc_zigzag_scan;
5147         }
5148         scan8x8= s->qscale ? h->zigzag_scan8x8_cavlc : h->zigzag_scan8x8_cavlc_q0;
5149
5150         dquant= get_se_golomb(&s->gb);
5151
5152         if( dquant > 25 || dquant < -26 ){
5153             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
5154             return -1;
5155         }
5156
5157         s->qscale += dquant;
5158         if(((unsigned)s->qscale) > 51){
5159             if(s->qscale<0) s->qscale+= 52;
5160             else            s->qscale-= 52;
5161         }
5162
5163         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5164         if(IS_INTRA16x16(mb_type)){
5165             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, h->dequant4_coeff[0][s->qscale], 16) < 0){
5166                 return -1; //FIXME continue if partitioned and other return -1 too
5167             }
5168
5169             assert((cbp&15) == 0 || (cbp&15) == 15);
5170
5171             if(cbp&15){
5172                 for(i8x8=0; i8x8<4; i8x8++){
5173                     for(i4x4=0; i4x4<4; i4x4++){
5174                         const int index= i4x4 + 4*i8x8;
5175                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 ){
5176                             return -1;
5177                         }
5178                     }
5179                 }
5180             }else{
5181                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5182             }
5183         }else{
5184             for(i8x8=0; i8x8<4; i8x8++){
5185                 if(cbp & (1<<i8x8)){
5186                     if(IS_8x8DCT(mb_type)){
5187                         DCTELEM *buf = &h->mb[64*i8x8];
5188                         uint8_t *nnz;
5189                         for(i4x4=0; i4x4<4; i4x4++){
5190                             if( decode_residual(h, gb, buf, i4x4+4*i8x8, scan8x8+16*i4x4,
5191                                                 h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 16) <0 )
5192                                 return -1;
5193                         }
5194                         nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5195                         nnz[0] += nnz[1] + nnz[8] + nnz[9];
5196                     }else{
5197                         for(i4x4=0; i4x4<4; i4x4++){
5198                             const int index= i4x4 + 4*i8x8;
5199
5200                             if( decode_residual(h, gb, h->mb + 16*index, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) <0 ){
5201                                 return -1;
5202                             }
5203                         }
5204                     }
5205                 }else{
5206                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5207                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5208                 }
5209             }
5210         }
5211
5212         if(cbp&0x30){
5213             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
5214                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, NULL, 4) < 0){
5215                     return -1;
5216                 }
5217         }
5218
5219         if(cbp&0x20){
5220             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
5221                 for(i4x4=0; i4x4<4; i4x4++){
5222                     const int index= 16 + 4*chroma_idx + i4x4;
5223                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, h->dequant4_coeff[chroma_idx+1+(IS_INTRA( mb_type ) ? 0:3)][chroma_qp], 15) < 0){
5224                         return -1;
5225                     }
5226                 }
5227             }
5228         }else{
5229             uint8_t * const nnz= &h->non_zero_count_cache[0];
5230             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5231             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5232         }
5233     }else{
5234         uint8_t * const nnz= &h->non_zero_count_cache[0];
5235         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5236         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5237         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5238     }
5239     s->current_picture.qscale_table[mb_xy]= s->qscale;
5240     write_back_non_zero_count(h);
5241
5242     return 0;
5243 }
5244
5245 static int decode_cabac_field_decoding_flag(H264Context *h) {
5246     MpegEncContext * const s = &h->s;
5247     const int mb_x = s->mb_x;
5248     const int mb_y = s->mb_y & ~1;
5249     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
5250     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
5251
5252     unsigned int ctx = 0;
5253
5254     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
5255         ctx += 1;
5256     }
5257     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
5258         ctx += 1;
5259     }
5260
5261     return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] );
5262 }
5263
5264 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
5265     uint8_t *state= &h->cabac_state[ctx_base];
5266     int mb_type;
5267
5268     if(intra_slice){
5269         MpegEncContext * const s = &h->s;
5270         const int mba_xy = h->left_mb_xy[0];
5271         const int mbb_xy = h->top_mb_xy;
5272         int ctx=0;
5273         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
5274             ctx++;
5275         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
5276             ctx++;
5277         if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
5278             return 0;   /* I4x4 */
5279         state += 2;
5280     }else{
5281         if( get_cabac( &h->cabac, &state[0] ) == 0 )
5282             return 0;   /* I4x4 */
5283     }
5284
5285     if( get_cabac_terminate( &h->cabac ) )
5286         return 25;  /* PCM */
5287
5288     mb_type = 1; /* I16x16 */
5289     mb_type += 12 * get_cabac( &h->cabac, &state[1] ); /* cbp_luma != 0 */
5290     if( get_cabac( &h->cabac, &state[2] ) ) /* cbp_chroma */
5291         mb_type += 4 + 4 * get_cabac( &h->cabac, &state[2+intra_slice] );
5292     mb_type += 2 * get_cabac( &h->cabac, &state[3+intra_slice] );
5293     mb_type += 1 * get_cabac( &h->cabac, &state[3+2*intra_slice] );
5294     return mb_type;
5295 }
5296
5297 static int decode_cabac_mb_type( H264Context *h ) {
5298     MpegEncContext * const s = &h->s;
5299
5300     if( h->slice_type == I_TYPE ) {
5301         return decode_cabac_intra_mb_type(h, 3, 1);
5302     } else if( h->slice_type == P_TYPE ) {
5303         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
5304             /* P-type */
5305             if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
5306                 /* P_L0_D16x16, P_8x8 */
5307                 return 3 * get_cabac( &h->cabac, &h->cabac_state[16] );
5308             } else {
5309                 /* P_L0_D8x16, P_L0_D16x8 */
5310                 return 2 - get_cabac( &h->cabac, &h->cabac_state[17] );
5311             }
5312         } else {
5313             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
5314         }
5315     } else if( h->slice_type == B_TYPE ) {
5316         const int mba_xy = h->left_mb_xy[0];
5317         const int mbb_xy = h->top_mb_xy;
5318         int ctx = 0;
5319         int bits;
5320
5321         if( h->slice_table[mba_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
5322             ctx++;
5323         if( h->slice_table[mbb_xy] == h->slice_num && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
5324             ctx++;
5325
5326         if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
5327             return 0; /* B_Direct_16x16 */
5328
5329         if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
5330             return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
5331         }
5332
5333         bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
5334         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
5335         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
5336         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
5337         if( bits < 8 )
5338             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
5339         else if( bits == 13 ) {
5340             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
5341         } else if( bits == 14 )
5342             return 11; /* B_L1_L0_8x16 */
5343         else if( bits == 15 )
5344             return 22; /* B_8x8 */
5345
5346         bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
5347         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
5348     } else {
5349         /* TODO SI/SP frames? */
5350         return -1;
5351     }
5352 }
5353
5354 static int decode_cabac_mb_skip( H264Context *h) {
5355     MpegEncContext * const s = &h->s;
5356     const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
5357     const int mba_xy = mb_xy - 1;
5358     const int mbb_xy = mb_xy - s->mb_stride;
5359     int ctx = 0;
5360
5361     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
5362         ctx++;
5363     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
5364         ctx++;
5365
5366     if( h->slice_type == B_TYPE )
5367         ctx += 13;
5368     return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
5369 }
5370
5371 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
5372     int mode = 0;
5373
5374     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
5375         return pred_mode;
5376
5377     mode += 1 * get_cabac( &h->cabac, &h->cabac_state[69] );
5378     mode += 2 * get_cabac( &h->cabac, &h->cabac_state[69] );
5379     mode += 4 * get_cabac( &h->cabac, &h->cabac_state[69] );
5380
5381     if( mode >= pred_mode )
5382         return mode + 1;
5383     else
5384         return mode;
5385 }
5386
5387 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
5388     const int mba_xy = h->left_mb_xy[0];
5389     const int mbb_xy = h->top_mb_xy;
5390
5391     int ctx = 0;
5392
5393     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
5394     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
5395         ctx++;
5396
5397     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
5398         ctx++;
5399
5400     if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
5401         return 0;
5402
5403     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5404         return 1;
5405     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
5406         return 2;
5407     else
5408         return 3;
5409 }
5410
/** Column (in 4x4 block units) of each of the 16 luma 4x4 blocks. */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/** Row (in 4x4 block units) of each of the 16 luma 4x4 blocks. */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/** Inverse of the two tables above: block index at column x, row y is block_idx_xy[x][y]. */
static const uint8_t block_idx_xy[4][4] = {
    { 0, 2,  8, 10 },
    { 1, 3,  9, 11 },
    { 4, 6, 12, 14 },
    { 5, 7, 13, 15 }
};
5423
5424 static int decode_cabac_mb_cbp_luma( H264Context *h) {
5425     int cbp = 0;
5426     int cbp_b = -1;
5427     int i8x8;
5428
5429     if( h->slice_table[h->top_mb_xy] == h->slice_num ) {
5430         cbp_b = h->top_cbp;
5431         tprintf("cbp_b = top_cbp = %x\n", cbp_b);
5432     }
5433
5434     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5435         int cbp_a = -1;
5436         int x, y;
5437         int ctx = 0;
5438
5439         x = block_idx_x[4*i8x8];
5440         y = block_idx_y[4*i8x8];
5441
5442         if( x > 0 )
5443             cbp_a = cbp;
5444         else if( h->slice_table[h->left_mb_xy[0]] == h->slice_num ) {
5445             cbp_a = h->left_cbp;
5446             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
5447         }
5448
5449         if( y > 0 )
5450             cbp_b = cbp;
5451
5452         /* No need to test for skip as we put 0 for skip block */
5453         /* No need to test for IPCM as we put 1 for IPCM block */
5454         if( cbp_a >= 0 ) {
5455             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
5456             if( ((cbp_a >> i8x8a)&0x01) == 0 )
5457                 ctx++;
5458         }
5459
5460         if( cbp_b >= 0 ) {
5461             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
5462             if( ((cbp_b >> i8x8b)&0x01) == 0 )
5463                 ctx += 2;
5464         }
5465
5466         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
5467             cbp |= 1 << i8x8;
5468         }
5469     }
5470     return cbp;
5471 }
5472 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
5473     int ctx;
5474     int cbp_a, cbp_b;
5475
5476     cbp_a = (h->left_cbp>>4)&0x03;
5477     cbp_b = (h-> top_cbp>>4)&0x03;
5478
5479     ctx = 0;
5480     if( cbp_a > 0 ) ctx++;
5481     if( cbp_b > 0 ) ctx += 2;
5482     if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
5483         return 0;
5484
5485     ctx = 4;
5486     if( cbp_a == 2 ) ctx++;
5487     if( cbp_b == 2 ) ctx += 2;
5488     return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
5489 }
5490 static int decode_cabac_mb_dqp( H264Context *h) {
5491     MpegEncContext * const s = &h->s;
5492     int mbn_xy;
5493     int   ctx = 0;
5494     int   val = 0;
5495
5496     if( s->mb_x > 0 )
5497         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
5498     else
5499         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
5500
5501     if( h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
5502         ctx++;
5503
5504     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
5505         if( ctx < 2 )
5506             ctx = 2;
5507         else
5508             ctx = 3;
5509         val++;
5510         if(val > 102) //prevent infinite loop
5511             return INT_MIN;
5512     }
5513
5514     if( val&0x01 )
5515         return (val + 1)/2;
5516     else
5517         return -(val + 1)/2;
5518 }
5519 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
5520     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
5521         return 0;   /* 8x8 */
5522     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
5523         return 1;   /* 8x4 */
5524     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
5525         return 2;   /* 4x8 */
5526     return 3;       /* 4x4 */
5527 }
5528 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
5529     int type;
5530     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
5531         return 0;   /* B_Direct_8x8 */
5532     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
5533         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
5534     type = 3;
5535     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
5536         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
5537             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
5538         type += 4;
5539     }
5540     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
5541     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
5542     return type;
5543 }
5544
5545 static inline int decode_cabac_mb_transform_size( H264Context *h ) {
5546     return get_cabac( &h->cabac, &h->cabac_state[399 + h->neighbor_transform_size] );
5547 }
5548
5549 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
5550     int refa = h->ref_cache[list][scan8[n] - 1];
5551     int refb = h->ref_cache[list][scan8[n] - 8];
5552     int ref  = 0;
5553     int ctx  = 0;
5554
5555     if( h->slice_type == B_TYPE) {
5556         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
5557             ctx++;
5558         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
5559             ctx += 2;
5560     } else {
5561         if( refa > 0 )
5562             ctx++;
5563         if( refb > 0 )
5564             ctx += 2;
5565     }
5566
5567     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5568         ref++;
5569         if( ctx < 4 )
5570             ctx = 4;
5571         else
5572             ctx = 5;
5573     }
5574     return ref;
5575 }
5576
5577 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5578     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5579                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5580     int ctxbase = (l == 0) ? 40 : 47;
5581     int ctx, mvd;
5582
5583     if( amvd < 3 )
5584         ctx = 0;
5585     else if( amvd > 32 )
5586         ctx = 2;
5587     else
5588         ctx = 1;
5589
5590     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5591         return 0;
5592
5593     mvd= 1;
5594     ctx= 3;
5595     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5596         mvd++;
5597         if( ctx < 6 )
5598             ctx++;
5599     }
5600
5601     if( mvd >= 9 ) {
5602         int k = 3;
5603         while( get_cabac_bypass( &h->cabac ) ) {
5604             mvd += 1 << k;
5605             k++;
5606         }
5607         while( k-- ) {
5608             if( get_cabac_bypass( &h->cabac ) )
5609                 mvd += 1 << k;
5610         }
5611     }
5612     if( get_cabac_bypass( &h->cabac ) )  return -mvd;
5613     else                                 return  mvd;
5614 }
5615
5616 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5617     int nza, nzb;
5618     int ctx = 0;
5619
5620     if( cat == 0 ) {
5621         nza = h->left_cbp&0x100;
5622         nzb = h-> top_cbp&0x100;
5623     } else if( cat == 1 || cat == 2 ) {
5624         nza = h->non_zero_count_cache[scan8[idx] - 1];
5625         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5626     } else if( cat == 3 ) {
5627         nza = (h->left_cbp>>(6+idx))&0x01;
5628         nzb = (h-> top_cbp>>(6+idx))&0x01;
5629     } else {
5630         assert(cat == 4);
5631         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5632         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5633     }
5634
5635     if( nza > 0 )
5636         ctx++;
5637
5638     if( nzb > 0 )
5639         ctx += 2;
5640
5641     return ctx + 4 * cat;
5642 }
5643
5644 static int decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, const uint32_t *qmul, int max_coeff) {
5645     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5646     static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
5647     static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
5648     static const int significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 297 };
5649     static const int last_significant_coeff_flag_offset[6] = { 0, 15, 29, 44, 47, 251 };
5650     static const int coeff_abs_level_m1_offset[6] = { 227+0, 227+10, 227+20, 227+30, 227+39, 426 };
5651     static const int significant_coeff_flag_offset_8x8[63] = {
5652         0, 1, 2, 3, 4, 5, 5, 4, 4, 3, 3, 4, 4, 4, 5, 5,
5653         4, 4, 4, 4, 3, 3, 6, 7, 7, 7, 8, 9,10, 9, 8, 7,
5654         7, 6,11,12,13,11, 6, 7, 8, 9,14,10, 9, 8, 6,11,
5655        12,13,11, 6, 9,14,10, 9,11,12,13,11,14,10,12
5656     };
5657     static const int last_coeff_flag_offset_8x8[63] = {
5658         0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
5659         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
5660         3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4,
5661         5, 5, 5, 5, 6, 6, 6, 6, 7, 7, 7, 7, 8, 8, 8
5662     };
5663
5664     int index[64];
5665
5666     int i, last;
5667     int coeff_count = 0;
5668
5669     int abslevel1 = 1;
5670     int abslevelgt1 = 0;
5671
5672     uint8_t *significant_coeff_ctx_base;
5673     uint8_t *last_coeff_ctx_base;
5674     uint8_t *abs_level_m1_ctx_base;
5675
5676     /* cat: 0-> DC 16x16  n = 0
5677      *      1-> AC 16x16  n = luma4x4idx
5678      *      2-> Luma4x4   n = luma4x4idx
5679      *      3-> DC Chroma n = iCbCr
5680      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5681      *      5-> Luma8x8   n = 4 * luma8x8idx
5682      */
5683
5684     /* read coded block flag */
5685     if( cat != 5 ) {
5686         if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5687             if( cat == 1 || cat == 2 )
5688                 h->non_zero_count_cache[scan8[n]] = 0;
5689             else if( cat == 4 )
5690                 h->non_zero_count_cache[scan8[16+n]] = 0;
5691
5692             return 0;
5693         }
5694     }
5695
5696     significant_coeff_ctx_base = h->cabac_state
5697         + significant_coeff_flag_offset[cat]
5698         + significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
5699     last_coeff_ctx_base = h->cabac_state
5700         + last_significant_coeff_flag_offset[cat]
5701         + last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag];
5702     abs_level_m1_ctx_base = h->cabac_state
5703         + coeff_abs_level_m1_offset[cat];
5704
5705     if( cat == 5 ) {
5706 #define DECODE_SIGNIFICANCE( coefs, sig_off, last_off ) \
5707         for(last= 0; last < coefs; last++) { \
5708             uint8_t *sig_ctx = significant_coeff_ctx_base + sig_off; \
5709             if( get_cabac( &h->cabac, sig_ctx )) { \
5710                 uint8_t *last_ctx = last_coeff_ctx_base + last_off; \
5711                 index[coeff_count++] = last; \
5712                 if( get_cabac( &h->cabac, last_ctx ) ) { \
5713                     last= max_coeff; \
5714                     break; \
5715                 } \
5716             } \
5717         }
5718         DECODE_SIGNIFICANCE( 63, significant_coeff_flag_offset_8x8[last],
5719                                  last_coeff_flag_offset_8x8[last] );
5720     } else {
5721         DECODE_SIGNIFICANCE( max_coeff - 1, last, last );
5722     }
5723     if( last == max_coeff -1 ) {
5724         index[coeff_count++] = last;
5725     }
5726     assert(coeff_count > 0);
5727
5728     if( cat == 0 )
5729         h->cbp_table[mb_xy] |= 0x100;
5730     else if( cat == 1 || cat == 2 )
5731         h->non_zero_count_cache[scan8[n]] = coeff_count;
5732     else if( cat == 3 )
5733         h->cbp_table[mb_xy] |= 0x40 << n;
5734     else if( cat == 4 )
5735         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5736     else {
5737         assert( cat == 5 );
5738         fill_rectangle(&h->non_zero_count_cache[scan8[n]], 2, 2, 8, coeff_count, 1);
5739     }
5740
5741     for( i = coeff_count - 1; i >= 0; i-- ) {
5742         uint8_t *ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + abs_level_m1_ctx_base;
5743         int j= scantable[index[i]];
5744
5745         if( get_cabac( &h->cabac, ctx ) == 0 ) {
5746             if( !qmul ) {
5747                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
5748                 else                                block[j] =  1;
5749             }else{
5750                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-qmul[j] + 32) >> 6;
5751                 else                                block[j] = ( qmul[j] + 32) >> 6;
5752             }
5753
5754             abslevel1++;
5755         } else {
5756             int coeff_abs = 2;
5757             ctx = 5 + FFMIN( 4, abslevelgt1 ) + abs_level_m1_ctx_base;
5758             while( coeff_abs < 15 && get_cabac( &h->cabac, ctx ) ) {
5759                 coeff_abs++;
5760             }
5761
5762             if( coeff_abs >= 15 ) {
5763                 int j = 0;
5764                 while( get_cabac_bypass( &h->cabac ) ) {
5765                     coeff_abs += 1 << j;
5766                     j++;
5767                 }
5768
5769                 while( j-- ) {
5770                     if( get_cabac_bypass( &h->cabac ) )
5771                         coeff_abs += 1 << j ;
5772                 }
5773             }
5774
5775             if( !qmul ) {
5776                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
5777                 else                                block[j] =  coeff_abs;
5778             }else{
5779                 if( get_cabac_bypass( &h->cabac ) ) block[j] = (-coeff_abs * qmul[j] + 32) >> 6;
5780                 else                                block[j] = ( coeff_abs * qmul[j] + 32) >> 6;
5781             }
5782
5783             abslevelgt1++;
5784         }
5785     }
5786     return 0;
5787 }
5788
5789 static void inline compute_mb_neighbors(H264Context *h)
5790 {
5791     MpegEncContext * const s = &h->s;
5792     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5793     h->top_mb_xy     = mb_xy - s->mb_stride;
5794     h->left_mb_xy[0] = mb_xy - 1;
5795     if(h->mb_aff_frame){
5796         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5797         const int top_pair_xy      = pair_xy     - s->mb_stride;
5798         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5799         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5800         const int curr_mb_frame_flag = !h->mb_field_decoding_flag;
5801         const int bottom = (s->mb_y & 1);
5802         if (bottom
5803                 ? !curr_mb_frame_flag // bottom macroblock
5804                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5805                 ) {
5806             h->top_mb_xy -= s->mb_stride;
5807         }
5808         if (left_mb_frame_flag != curr_mb_frame_flag) {
5809             h->left_mb_xy[0] = pair_xy - 1;
5810         }
5811     }
5812     return;
5813 }
5814
5815 /**
5816  * decodes a macroblock
5817  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5818  */
5819 static int decode_mb_cabac(H264Context *h) {
5820     MpegEncContext * const s = &h->s;
5821     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5822     int mb_type, partition_count, cbp = 0;
5823     int dct8x8_allowed= h->pps.transform_8x8_mode;
5824
5825     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5826
5827     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5828     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5829         /* read skip flags */
5830         if( decode_cabac_mb_skip( h ) ) {
5831             decode_mb_skip(h);
5832
5833             h->cbp_table[mb_xy] = 0;
5834             h->chroma_pred_mode_table[mb_xy] = 0;
5835             h->last_qscale_diff = 0;
5836
5837             return 0;
5838
5839         }
5840     }
5841     if(h->mb_aff_frame){
5842         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
5843             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5844     }else
5845         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5846
5847     h->prev_mb_skipped = 0;
5848
5849     compute_mb_neighbors(h);
5850     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5851         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5852         return -1;
5853     }
5854
5855     if( h->slice_type == B_TYPE ) {
5856         if( mb_type < 23 ){
5857             partition_count= b_mb_type_info[mb_type].partition_count;
5858             mb_type=         b_mb_type_info[mb_type].type;
5859         }else{
5860             mb_type -= 23;
5861             goto decode_intra_mb;
5862         }
5863     } else if( h->slice_type == P_TYPE ) {
5864         if( mb_type < 5) {
5865             partition_count= p_mb_type_info[mb_type].partition_count;
5866             mb_type=         p_mb_type_info[mb_type].type;
5867         } else {
5868             mb_type -= 5;
5869             goto decode_intra_mb;
5870         }
5871     } else {
5872        assert(h->slice_type == I_TYPE);
5873 decode_intra_mb:
5874         partition_count = 0;
5875         cbp= i_mb_type_info[mb_type].cbp;
5876         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5877         mb_type= i_mb_type_info[mb_type].type;
5878     }
5879     if(h->mb_field_decoding_flag)
5880         mb_type |= MB_TYPE_INTERLACED;
5881
5882     h->slice_table[ mb_xy ]= h->slice_num;
5883
5884     if(IS_INTRA_PCM(mb_type)) {
5885         const uint8_t *ptr;
5886         unsigned int x, y;
5887
5888         // We assume these blocks are very rare so we dont optimize it.
5889         // FIXME The two following lines get the bitstream position in the cabac
5890         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5891         ptr= h->cabac.bytestream;
5892         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
5893
5894         // The pixels are stored in the same order as levels in h->mb array.
5895         for(y=0; y<16; y++){
5896             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5897             for(x=0; x<16; x++){
5898                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
5899                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5900             }
5901         }
5902         for(y=0; y<8; y++){
5903             const int index= 256 + 4*(y&3) + 32*(y>>2);
5904             for(x=0; x<8; x++){
5905                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5906                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5907             }
5908         }
5909         for(y=0; y<8; y++){
5910             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5911             for(x=0; x<8; x++){
5912                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5913                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5914             }
5915         }
5916
5917         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5918
5919         // All blocks are present
5920         h->cbp_table[mb_xy] = 0x1ef;
5921         h->chroma_pred_mode_table[mb_xy] = 0;
5922         // In deblocking, the quantizer is 0
5923         s->current_picture.qscale_table[mb_xy]= 0;
5924         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5925         // All coeffs are present
5926         memset(h->non_zero_count[mb_xy], 16, 16);
5927         s->current_picture.mb_type[mb_xy]= mb_type;
5928         return 0;
5929     }
5930
5931     fill_caches(h, mb_type, 0);
5932
5933     if( IS_INTRA( mb_type ) ) {
5934         int i;
5935         if( IS_INTRA4x4( mb_type ) ) {
5936             if( dct8x8_allowed && decode_cabac_mb_transform_size( h ) ) {
5937                 mb_type |= MB_TYPE_8x8DCT;
5938                 for( i = 0; i < 16; i+=4 ) {
5939                     int pred = pred_intra_mode( h, i );
5940                     int mode = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5941                     fill_rectangle( &h->intra4x4_pred_mode_cache[ scan8[i] ], 2, 2, 8, mode, 1 );
5942                 }
5943             } else {
5944                 for( i = 0; i < 16; i++ ) {
5945                     int pred = pred_intra_mode( h, i );
5946                     h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5947
5948                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5949                 }
5950             }
5951             write_back_intra_pred_mode(h);
5952             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5953         } else {
5954             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5955             if( h->intra16x16_pred_mode < 0 ) return -1;
5956         }
5957         h->chroma_pred_mode_table[mb_xy] =
5958             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
5959
5960         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
5961         if( h->chroma_pred_mode < 0 ) return -1;
5962     } else if( partition_count == 4 ) {
5963         int i, j, sub_partition_count[4], list, ref[2][4];
5964
5965         if( h->slice_type == B_TYPE ) {
5966             for( i = 0; i < 4; i++ ) {
5967                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5968                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5969                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5970             }
5971             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5972                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5973                 pred_direct_motion(h, &mb_type);
5974                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5975                     for( i = 0; i < 4; i++ )
5976                         if( IS_DIRECT(h->sub_mb_type[i]) )
5977                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5978                 }
5979             }
5980         } else {
5981             for( i = 0; i < 4; i++ ) {
5982                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5983                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5984                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5985             }
5986         }
5987
5988         for( list = 0; list < 2; list++ ) {
5989             if( h->ref_count[list] > 0 ) {
5990                 for( i = 0; i < 4; i++ ) {
5991                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5992                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5993                         if( h->ref_count[list] > 1 )
5994                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5995                         else
5996                             ref[list][i] = 0;
5997                     } else {
5998                         ref[list][i] = -1;
5999                     }
6000                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
6001                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
6002                 }
6003             }
6004         }
6005
6006         if(dct8x8_allowed)
6007             dct8x8_allowed = get_dct8x8_allowed(h);
6008
6009         for(list=0; list<2; list++){
6010             for(i=0; i<4; i++){
6011                 if(IS_DIRECT(h->sub_mb_type[i])){
6012                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
6013                     continue;
6014                 }
6015                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
6016
6017                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
6018                     const int sub_mb_type= h->sub_mb_type[i];
6019                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
6020                     for(j=0; j<sub_partition_count[i]; j++){
6021                         int mpx, mpy;
6022                         int mx, my;
6023                         const int index= 4*i + block_width*j;
6024                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
6025                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
6026                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
6027
6028                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
6029                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
6030                         tprintf("final mv:%d %d\n", mx, my);
6031
6032                         if(IS_SUB_8X8(sub_mb_type)){
6033                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
6034                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
6035                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
6036                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
6037
6038                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
6039                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
6040                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
6041                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
6042                         }else if(IS_SUB_8X4(sub_mb_type)){
6043                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
6044                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
6045
6046                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
6047                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
6048                         }else if(IS_SUB_4X8(sub_mb_type)){
6049                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
6050                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
6051
6052                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
6053                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
6054                         }else{
6055                             assert(IS_SUB_4X4(sub_mb_type));
6056                             mv_cache[ 0 ][0]= mx;
6057                             mv_cache[ 0 ][1]= my;
6058
6059                             mvd_cache[ 0 ][0]= mx - mpx;
6060                             mvd_cache[ 0 ][1]= my - mpy;
6061                         }
6062                     }
6063                 }else{
6064                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
6065                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
6066                     p[0] = p[1] = p[8] = p[9] = 0;
6067                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
6068                 }
6069             }
6070         }
6071     } else if( IS_DIRECT(mb_type) ) {
6072         pred_direct_motion(h, &mb_type);
6073         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
6074         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
6075         dct8x8_allowed &= h->sps.direct_8x8_inference_flag;
6076     } else {
6077         int list, mx, my, i, mpx, mpy;
6078         if(IS_16X16(mb_type)){
6079             for(list=0; list<2; list++){
6080                 if(IS_DIR(mb_type, 0, list)){
6081                     if(h->ref_count[list] > 0 ){
6082                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
6083                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
6084                     }
6085                 }else
6086                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
6087             }
6088             for(list=0; list<2; list++){
6089                 if(IS_DIR(mb_type, 0, list)){
6090                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
6091
6092                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
6093                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
6094                     tprintf("final mv:%d %d\n", mx, my);
6095
6096                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6097                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
6098                 }else
6099                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
6100             }
6101         }
6102         else if(IS_16X8(mb_type)){
6103             for(list=0; list<2; list++){
6104                 if(h->ref_count[list]>0){
6105                     for(i=0; i<2; i++){
6106                         if(IS_DIR(mb_type, i, list)){
6107                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
6108                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
6109                         }else
6110                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
6111                     }
6112                 }
6113             }
6114             for(list=0; list<2; list++){
6115                 for(i=0; i<2; i++){
6116                     if(IS_DIR(mb_type, i, list)){
6117                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
6118                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
6119                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
6120                         tprintf("final mv:%d %d\n", mx, my);
6121
6122                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
6123                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
6124                     }else{
6125                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6126                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
6127                     }
6128                 }
6129             }
6130         }else{
6131             assert(IS_8X16(mb_type));
6132             for(list=0; list<2; list++){
6133                 if(h->ref_count[list]>0){
6134                     for(i=0; i<2; i++){
6135                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
6136                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
6137                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
6138                         }else
6139                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
6140                     }
6141                 }
6142             }
6143             for(list=0; list<2; list++){
6144                 for(i=0; i<2; i++){
6145                     if(IS_DIR(mb_type, i, list)){
6146                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
6147                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
6148                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
6149
6150                         tprintf("final mv:%d %d\n", mx, my);
6151                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
6152                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
6153                     }else{
6154                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6155                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
6156                     }
6157                 }
6158             }
6159         }
6160     }
6161
6162    if( IS_INTER( mb_type ) ) {
6163         h->chroma_pred_mode_table[mb_xy] = 0;
6164         write_back_motion( h, mb_type );
6165    }
6166
6167     if( !IS_INTRA16x16( mb_type ) ) {
6168         cbp  = decode_cabac_mb_cbp_luma( h );
6169         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
6170     }
6171
6172     h->cbp_table[mb_xy] = cbp;
6173
6174     if( dct8x8_allowed && (cbp&15) && !IS_INTRA( mb_type ) ) {
6175         if( decode_cabac_mb_transform_size( h ) )
6176             mb_type |= MB_TYPE_8x8DCT;
6177     }
6178     s->current_picture.mb_type[mb_xy]= mb_type;
6179
6180     if( cbp || IS_INTRA16x16( mb_type ) ) {
6181         const uint8_t *scan, *scan8x8, *dc_scan;
6182         int dqp;
6183
6184         if(IS_INTERLACED(mb_type)){
6185             scan= s->qscale ? h->field_scan : h->field_scan_q0;
6186             dc_scan= luma_dc_field_scan;
6187         }else{
6188             scan= s->qscale ? h->zigzag_scan : h->zigzag_scan_q0;
6189             dc_scan= luma_dc_zigzag_scan;
6190         }
6191         scan8x8= s->qscale ? h->zigzag_scan8x8 : h->zigzag_scan8x8_q0;
6192
6193         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
6194         if( dqp == INT_MIN ){
6195             av_log(h->s.avctx, AV_LOG_ERROR, "cabac decode of qscale diff failed at %d %d\n", s->mb_x, s->mb_y);
6196             return -1;
6197         }
6198         s->qscale += dqp;
6199         if(((unsigned)s->qscale) > 51){
6200             if(s->qscale<0) s->qscale+= 52;
6201             else            s->qscale-= 52;
6202         }
6203         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
6204
6205         if( IS_INTRA16x16( mb_type ) ) {
6206             int i;
6207             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
6208             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, NULL, 16) < 0)
6209                 return -1;
6210             if( cbp&15 ) {
6211                 for( i = 0; i < 16; i++ ) {
6212                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
6213                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, h->dequant4_coeff[0][s->qscale], 15) < 0 )
6214                         return -1;
6215                 }
6216             } else {
6217                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
6218             }
6219         } else {
6220             int i8x8, i4x4;
6221             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
6222                 if( cbp & (1<<i8x8) ) {
6223                     if( IS_8x8DCT(mb_type) ) {
6224                         if( decode_cabac_residual(h, h->mb + 64*i8x8, 5, 4*i8x8,
6225                             scan8x8, h->dequant8_coeff[IS_INTRA( mb_type ) ? 0:1][s->qscale], 64) < 0 )
6226                             return -1;
6227                     } else
6228                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
6229                         const int index = 4*i8x8 + i4x4;
6230                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
6231                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, h->dequant4_coeff[IS_INTRA( mb_type ) ? 0:3][s->qscale], 16) < 0 )
6232                             return -1;
6233                     }
6234                 } else {
6235                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
6236                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
6237                 }
6238             }
6239         }
6240
6241         if( cbp&0x30 ){
6242             int c;
6243             for( c = 0; c < 2; c++ ) {
6244                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
6245                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, NULL, 4) < 0)
6246                     return -1;
6247             }
6248         }
6249
6250         if( cbp&0x20 ) {
6251             int c, i;
6252             for( c = 0; c < 2; c++ ) {
6253                 for( i = 0; i < 4; i++ ) {
6254                     const int index = 16 + 4 * c + i;
6255                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
6256                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->dequant4_coeff[c+1+(IS_INTRA( mb_type ) ? 0:3)][h->chroma_qp], 15) < 0)
6257                         return -1;
6258                 }
6259             }
6260         } else {
6261             uint8_t * const nnz= &h->non_zero_count_cache[0];
6262             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6263             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6264         }
6265     } else {
6266         uint8_t * const nnz= &h->non_zero_count_cache[0];
6267         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
6268         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
6269         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
6270     }
6271
6272     s->current_picture.qscale_table[mb_xy]= s->qscale;
6273     write_back_non_zero_count(h);
6274
6275     return 0;
6276 }
6277
6278
6279 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6280     int i, d;
6281     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6282     const int alpha = alpha_table[index_a];
6283     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6284
6285     if( bS[0] < 4 ) {
6286         int8_t tc[4];
6287         for(i=0; i<4; i++)
6288             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6289         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
6290     } else {
6291         /* 16px edge length, because bS=4 is triggered by being at
6292          * the edge of an intra MB, so all 4 bS are the same */
6293             for( d = 0; d < 16; d++ ) {
6294                 const int p0 = pix[-1];
6295                 const int p1 = pix[-2];
6296                 const int p2 = pix[-3];
6297
6298                 const int q0 = pix[0];
6299                 const int q1 = pix[1];
6300                 const int q2 = pix[2];
6301
6302                 if( ABS( p0 - q0 ) < alpha &&
6303                     ABS( p1 - p0 ) < beta &&
6304                     ABS( q1 - q0 ) < beta ) {
6305
6306                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6307                         if( ABS( p2 - p0 ) < beta)
6308                         {
6309                             const int p3 = pix[-4];
6310                             /* p0', p1', p2' */
6311                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6312                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6313                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6314                         } else {
6315                             /* p0' */
6316                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6317                         }
6318                         if( ABS( q2 - q0 ) < beta)
6319                         {
6320                             const int q3 = pix[3];
6321                             /* q0', q1', q2' */
6322                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6323                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6324                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6325                         } else {
6326                             /* q0' */
6327                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6328                         }
6329                     }else{
6330                         /* p0', q0' */
6331                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6332                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6333                     }
6334                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
6335                 }
6336                 pix += stride;
6337             }
6338     }
6339 }
6340 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6341     int i;
6342     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6343     const int alpha = alpha_table[index_a];
6344     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6345
6346     if( bS[0] < 4 ) {
6347         int8_t tc[4];
6348         for(i=0; i<4; i++)
6349             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6350         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
6351     } else {
6352         h->s.dsp.h264_h_loop_filter_chroma_intra(pix, stride, alpha, beta);
6353     }
6354 }
6355
6356 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
6357     int i;
6358     for( i = 0; i < 16; i++, pix += stride) {
6359         int index_a;
6360         int alpha;
6361         int beta;
6362
6363         int qp_index;
6364         int bS_index = (i >> 1);
6365         if (h->mb_field_decoding_flag) {
6366             bS_index &= ~1;
6367             bS_index |= (i & 1);
6368         }
6369
6370         if( bS[bS_index] == 0 ) {
6371             continue;
6372         }
6373
6374         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
6375         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6376         alpha = alpha_table[index_a];
6377         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6378
6379
6380         if( bS[bS_index] < 4 ) {
6381             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
6382             /* 4px edge length */
6383             const int p0 = pix[-1];
6384             const int p1 = pix[-2];
6385             const int p2 = pix[-3];
6386             const int q0 = pix[0];
6387             const int q1 = pix[1];
6388             const int q2 = pix[2];
6389
6390             if( ABS( p0 - q0 ) < alpha &&
6391                 ABS( p1 - p0 ) < beta &&
6392                 ABS( q1 - q0 ) < beta ) {
6393                 int tc = tc0;
6394                 int i_delta;
6395
6396                 if( ABS( p2 - p0 ) < beta ) {
6397                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
6398                     tc++;
6399                 }
6400                 if( ABS( q2 - q0 ) < beta ) {
6401                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
6402                     tc++;
6403                 }
6404
6405                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6406                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6407                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6408                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6409             }
6410         }else{
6411             /* 4px edge length */
6412             const int p0 = pix[-1];
6413             const int p1 = pix[-2];
6414             const int p2 = pix[-3];
6415
6416             const int q0 = pix[0];
6417             const int q1 = pix[1];
6418             const int q2 = pix[2];
6419
6420             if( ABS( p0 - q0 ) < alpha &&
6421                 ABS( p1 - p0 ) < beta &&
6422                 ABS( q1 - q0 ) < beta ) {
6423
6424                 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6425                     if( ABS( p2 - p0 ) < beta)
6426                     {
6427                         const int p3 = pix[-4];
6428                         /* p0', p1', p2' */
6429                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6430                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6431                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6432                     } else {
6433                         /* p0' */
6434                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6435                     }
6436                     if( ABS( q2 - q0 ) < beta)
6437                     {
6438                         const int q3 = pix[3];
6439                         /* q0', q1', q2' */
6440                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6441                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6442                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6443                     } else {
6444                         /* q0' */
6445                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6446                     }
6447                 }else{
6448                     /* p0', q0' */
6449                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6450                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6451                 }
6452                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6453             }
6454         }
6455     }
6456 }
6457 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp[2] ) {
6458     int i;
6459     for( i = 0; i < 8; i++, pix += stride) {
6460         int index_a;
6461         int alpha;
6462         int beta;
6463
6464         int qp_index;
6465         int bS_index = i;
6466
6467         if( bS[bS_index] == 0 ) {
6468             continue;
6469         }
6470
6471         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
6472         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
6473         alpha = alpha_table[index_a];
6474         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
6475         if( bS[bS_index] < 4 ) {
6476             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
6477             /* 2px edge length (because we use same bS than the one for luma) */
6478             const int p0 = pix[-1];
6479             const int p1 = pix[-2];
6480             const int q0 = pix[0];
6481             const int q1 = pix[1];
6482
6483             if( ABS( p0 - q0 ) < alpha &&
6484                 ABS( p1 - p0 ) < beta &&
6485                 ABS( q1 - q0 ) < beta ) {
6486                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
6487
6488                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
6489                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
6490                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
6491             }
6492         }else{
6493             const int p0 = pix[-1];
6494             const int p1 = pix[-2];
6495             const int q0 = pix[0];
6496             const int q1 = pix[1];
6497
6498             if( ABS( p0 - q0 ) < alpha &&
6499                 ABS( p1 - p0 ) < beta &&
6500                 ABS( q1 - q0 ) < beta ) {
6501
6502                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6503                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6504                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
6505             }
6506         }
6507     }
6508 }
6509
6510 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6511     int i, d;
6512     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6513     const int alpha = alpha_table[index_a];
6514     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6515     const int pix_next  = stride;
6516
6517     if( bS[0] < 4 ) {
6518         int8_t tc[4];
6519         for(i=0; i<4; i++)
6520             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
6521         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
6522     } else {
6523         /* 16px edge length, see filter_mb_edgev */
6524             for( d = 0; d < 16; d++ ) {
6525                 const int p0 = pix[-1*pix_next];
6526                 const int p1 = pix[-2*pix_next];
6527                 const int p2 = pix[-3*pix_next];
6528                 const int q0 = pix[0];
6529                 const int q1 = pix[1*pix_next];
6530                 const int q2 = pix[2*pix_next];
6531
6532                 if( ABS( p0 - q0 ) < alpha &&
6533                     ABS( p1 - p0 ) < beta &&
6534                     ABS( q1 - q0 ) < beta ) {
6535
6536                     const int p3 = pix[-4*pix_next];
6537                     const int q3 = pix[ 3*pix_next];
6538
6539                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
6540                         if( ABS( p2 - p0 ) < beta) {
6541                             /* p0', p1', p2' */
6542                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
6543                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
6544                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
6545                         } else {
6546                             /* p0' */
6547                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6548                         }
6549                         if( ABS( q2 - q0 ) < beta) {
6550                             /* q0', q1', q2' */
6551                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
6552                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
6553                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
6554                         } else {
6555                             /* q0' */
6556                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6557                         }
6558                     }else{
6559                         /* p0', q0' */
6560                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
6561                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
6562                     }
6563                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6564                 }
6565                 pix++;
6566             }
6567     }
6568 }
6569
6570 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
6571     int i;
6572     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
6573     const int alpha = alpha_table[index_a];
6574     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
6575
6576     if( bS[0] < 4 ) {
6577         int8_t tc[4];
6578         for(i=0; i<4; i++)
6579             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
6580         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
6581     } else {
6582         h->s.dsp.h264_v_loop_filter_chroma_intra(pix, stride, alpha, beta);
6583     }
6584 }
6585
6586 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6587     MpegEncContext * const s = &h->s;
6588     const int mb_xy= mb_x + mb_y*s->mb_stride;
6589     int first_vertical_edge_done = 0;
6590     int dir;
6591     /* FIXME: A given frame may occupy more than one position in
6592      * the reference list. So ref2frm should be populated with
6593      * frame numbers, not indices. */
6594     static const int ref2frm[18] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
6595
6596     //for sufficiently low qp, filtering wouldn't do anything
6597     //this is a conservative estimate: could also check beta_offset and more accurate chroma_qp
6598     if(!h->mb_aff_frame){
6599         int qp_thresh = 15 - h->slice_alpha_c0_offset - FFMAX(0, h->pps.chroma_qp_index_offset);
6600         int qp = s->current_picture.qscale_table[mb_xy];
6601         if(qp <= qp_thresh
6602            && (mb_x == 0 || ((qp + s->current_picture.qscale_table[mb_xy-1] + 1)>>1) <= qp_thresh)
6603            && (mb_y == 0 || ((qp + s->current_picture.qscale_table[h->top_mb_xy] + 1)>>1) <= qp_thresh)){
6604             return;
6605         }
6606     }
6607
6608     if (h->mb_aff_frame
6609             // left mb is in picture
6610             && h->slice_table[mb_xy-1] != 255
6611             // and current and left pair do not have the same interlaced type
6612             && (IS_INTERLACED(s->current_picture.mb_type[mb_xy]) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6613             // and left mb is in the same slice if deblocking_filter == 2
6614             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6615         /* First vertical edge is different in MBAFF frames
6616          * There are 8 different bS to compute and 2 different Qp
6617          */
6618         int bS[8];
6619         int qp[2];
6620         int chroma_qp[2];
6621
6622         int i;
6623         first_vertical_edge_done = 1;
6624         for( i = 0; i < 8; i++ ) {
6625             int y = i>>1;
6626             int b_idx= 8 + 4 + 8*y;
6627             int bn_idx= b_idx - 1;
6628
6629             int mbn_xy = h->mb_field_decoding_flag ? h->left_mb_xy[i>>2] : h->left_mb_xy[i&1];
6630
6631             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6632                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6633                 bS[i] = 4;
6634             } else if( h->non_zero_count_cache[b_idx] != 0 ||
6635                 /* FIXME: with 8x8dct + cavlc, should check cbp instead of nnz */
6636                 h->non_zero_count_cache[bn_idx] != 0 ) {
6637                 bS[i] = 2;
6638             } else {
6639                 int l;
6640                 bS[i] = 0;
6641                 for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6642                     if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6643                         ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6644                         ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6645                         bS[i] = 1;
6646                         break;
6647                     }
6648                 }
6649             }
6650         }
6651         if(bS[0]+bS[1]+bS[2]+bS[3] != 0) {
6652             // Do not use s->qscale as luma quantizer because it has not the same
6653             // value in IPCM macroblocks.
6654             qp[0] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[0]] + 1 ) >> 1;
6655             chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6656                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[0]] ) + 1 ) >> 1;
6657             qp[1] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[1]] + 1 ) >> 1;
6658             chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6659                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[1]] ) + 1 ) >> 1;
6660
6661             /* Filter edge */
6662             tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
6663             { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6664             filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6665             filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
6666             filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
6667         }
6668     }
6669     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6670     for( dir = 0; dir < 2; dir++ )
6671     {
6672         int edge;
6673         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6674         const int mb_type = s->current_picture.mb_type[mb_xy];
6675         const int mbm_type = s->current_picture.mb_type[mbm_xy];
6676         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6677
6678         const int edges = (mb_type & (MB_TYPE_16x16|MB_TYPE_SKIP))
6679                                   == (MB_TYPE_16x16|MB_TYPE_SKIP) ? 1 : 4;
6680         // how often to recheck mv-based bS when iterating between edges
6681         const int mask_edge = (mb_type & (MB_TYPE_16x16 | (MB_TYPE_16x8 << dir))) ? 3 :
6682                               (mb_type & (MB_TYPE_8x16 >> dir)) ? 1 : 0;
6683         // how often to recheck mv-based bS when iterating along each edge
6684         const int mask_par0 = mb_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir));
6685
6686         if (first_vertical_edge_done) {
6687             start = 1;
6688             first_vertical_edge_done = 0;
6689         }
6690
6691         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6692             start = 1;
6693
6694         /* Calculate bS */
6695         for( edge = start; edge < edges; edge++ ) {
6696             /* mbn_xy: neighbor macroblock */
6697             const int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6698             const int mbn_type = s->current_picture.mb_type[mbn_xy];
6699             int bS[4];
6700             int qp;
6701
6702             if( (edge&1) && IS_8x8DCT(mb_type) )
6703                 continue;
6704
6705             if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
6706                 && !IS_INTERLACED(mb_type)
6707                 && IS_INTERLACED(mbn_type)
6708                 ) {
6709                 // This is a special case in the norm where the filtering must
6710                 // be done twice (one each of the field) even if we are in a
6711                 // frame macroblock.
6712                 //
6713                 unsigned int tmp_linesize   = 2 *   linesize;
6714                 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6715                 int mbn_xy = mb_xy - 2 * s->mb_stride;
6716                 int qp, chroma_qp;
6717
6718                 // first filtering
6719                 if( IS_INTRA(mb_type) ||
6720                     IS_INTRA(s->current_picture.mb_type[mbn_xy]) ) {
6721                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6722                 } else {
6723                     // TODO
6724                     av_log(h->s.avctx, AV_LOG_ERROR, "both non intra (TODO)\n");
6725                 }
6726                 /* Filter edge */
6727                 // Do not use s->qscale as luma quantizer because it has not the same
6728                 // value in IPCM macroblocks.
6729                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6730                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6731                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6732                 filter_mb_edgeh( h, &img_y[0], tmp_linesize, bS, qp );
6733                 chroma_qp = ( h->chroma_qp +
6734                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6735                 filter_mb_edgech( h, &img_cb[0], tmp_uvlinesize, bS, chroma_qp );
6736                 filter_mb_edgech( h, &img_cr[0], tmp_uvlinesize, bS, chroma_qp );
6737
6738                 // second filtering
6739                 mbn_xy += s->mb_stride;
6740                 if( IS_INTRA(mb_type) ||
6741                     IS_INTRA(mbn_type) ) {
6742                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6743                 } else {
6744                     // TODO
6745                     av_log(h->s.avctx, AV_LOG_ERROR, "both non intra (TODO)\n");
6746                 }
6747                 /* Filter edge */
6748                 // Do not use s->qscale as luma quantizer because it has not the same
6749                 // value in IPCM macroblocks.
6750                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6751                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6752                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6753                 filter_mb_edgeh( h, &img_y[linesize], tmp_linesize, bS, qp );
6754                 chroma_qp = ( h->chroma_qp +
6755                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6756                 filter_mb_edgech( h, &img_cb[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6757                 filter_mb_edgech( h, &img_cr[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6758                 continue;
6759             }
6760             if( IS_INTRA(mb_type) ||
6761                 IS_INTRA(mbn_type) ) {
6762                 int value;
6763                 if (edge == 0) {
6764                     if (   (!IS_INTERLACED(mb_type) && !IS_INTERLACED(mbm_type))
6765                         || ((h->mb_aff_frame || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6766                     ) {
6767                         value = 4;
6768                     } else {
6769                         value = 3;
6770                     }
6771                 } else {
6772                     value = 3;
6773                 }
6774                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6775             } else {
6776                 int i, l;
6777                 int mv_done;
6778
6779                 if( edge & mask_edge ) {
6780                     bS[0] = bS[1] = bS[2] = bS[3] = 0;
6781                     mv_done = 1;
6782                 }
6783                 else if( mask_par0 && (edge || (mbn_type & (MB_TYPE_16x16 | (MB_TYPE_8x16 >> dir)))) ) {
6784                     int b_idx= 8 + 4 + edge * (dir ? 8:1);
6785                     int bn_idx= b_idx - (dir ? 8:1);
6786                     int v = 0;
6787                     for( l = 0; !v && l < 1 + (h->slice_type == B_TYPE); l++ ) {
6788                         v |= ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6789                              ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6790                              ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4;
6791                     }
6792                     bS[0] = bS[1] = bS[2] = bS[3] = v;
6793                     mv_done = 1;
6794                 }
6795                 else
6796                     mv_done = 0;
6797
6798                 for( i = 0; i < 4; i++ ) {
6799                     int x = dir == 0 ? edge : i;
6800                     int y = dir == 0 ? i    : edge;
6801                     int b_idx= 8 + 4 + x + 8*y;
6802                     int bn_idx= b_idx - (dir ? 8:1);
6803
6804                     if( h->non_zero_count_cache[b_idx] != 0 ||
6805                         h->non_zero_count_cache[bn_idx] != 0 ) {
6806                         bS[i] = 2;
6807                     }
6808                     else if(!mv_done)
6809                     {
6810                         bS[i] = 0;
6811                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6812                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6813                                 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6814                                 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6815                                 bS[i] = 1;
6816                                 break;
6817                             }
6818                         }
6819                     }
6820                 }
6821
6822                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6823                     continue;
6824             }
6825
6826             /* Filter edge */
6827             // Do not use s->qscale as luma quantizer because it has not the same
6828             // value in IPCM macroblocks.
6829             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6830             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6831             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6832             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6833             if( dir == 0 ) {
6834                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6835                 if( (edge&1) == 0 ) {
6836                     int chroma_qp = ( h->chroma_qp +
6837                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6838                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
6839                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
6840                 }
6841             } else {
6842                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6843                 if( (edge&1) == 0 ) {
6844                     int chroma_qp = ( h->chroma_qp +
6845                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6846                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6847                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6848                 }
6849             }
6850         }
6851     }
6852 }
6853
6854 static int decode_slice(H264Context *h){
6855     MpegEncContext * const s = &h->s;
6856     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6857
6858     s->mb_skip_run= -1;
6859
6860     if( h->pps.cabac ) {
6861         int i;
6862
6863         /* realign */
6864         align_get_bits( &s->gb );
6865
6866         /* init cabac */
6867         ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
6868         ff_init_cabac_decoder( &h->cabac,
6869                                s->gb.buffer + get_bits_count(&s->gb)/8,
6870                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6871         /* calculate pre-state */
6872         for( i= 0; i < 460; i++ ) {
6873             int pre;
6874             if( h->slice_type == I_TYPE )
6875                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6876             else
6877                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6878
6879             if( pre <= 63 )
6880                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6881             else
6882                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6883         }
6884
6885         for(;;){
6886             int ret = decode_mb_cabac(h);
6887             int eos;
6888
6889             if(ret>=0) hl_decode_mb(h);
6890
6891             /* XXX: useless as decode_mb_cabac it doesn't support that ... */
6892             if( ret >= 0 && h->mb_aff_frame ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6893                 s->mb_y++;
6894
6895                 if(ret>=0) ret = decode_mb_cabac(h);
6896
6897                 if(ret>=0) hl_decode_mb(h);
6898                 s->mb_y--;
6899             }
6900             eos = get_cabac_terminate( &h->cabac );
6901
6902             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
6903                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6904                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6905                 return -1;
6906             }
6907
6908             if( ++s->mb_x >= s->mb_width ) {
6909                 s->mb_x = 0;
6910                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6911                 ++s->mb_y;
6912                 if(h->mb_aff_frame) {
6913                     ++s->mb_y;
6914                 }
6915             }
6916
6917             if( eos || s->mb_y >= s->mb_height ) {
6918                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6919                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6920                 return 0;
6921             }
6922         }
6923
6924     } else {
6925         for(;;){
6926             int ret = decode_mb_cavlc(h);
6927
6928             if(ret>=0) hl_decode_mb(h);
6929
6930             if(ret>=0 && h->mb_aff_frame){ //FIXME optimal? or let mb_decode decode 16x32 ?
6931                 s->mb_y++;
6932                 ret = decode_mb_cavlc(h);
6933
6934                 if(ret>=0) hl_decode_mb(h);
6935                 s->mb_y--;
6936             }
6937
6938             if(ret<0){
6939                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6940                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6941
6942                 return -1;
6943             }
6944
6945             if(++s->mb_x >= s->mb_width){
6946                 s->mb_x=0;
6947                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6948                 ++s->mb_y;
6949                 if(h->mb_aff_frame) {
6950                     ++s->mb_y;
6951                 }
6952                 if(s->mb_y >= s->mb_height){
6953                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6954
6955                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6956                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6957
6958                         return 0;
6959                     }else{
6960                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6961
6962                         return -1;
6963                     }
6964                 }
6965             }
6966
6967             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6968                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6969                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6970                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6971
6972                     return 0;
6973                 }else{
6974                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6975
6976                     return -1;
6977                 }
6978             }
6979         }
6980     }
6981
6982 #if 0
6983     for(;s->mb_y < s->mb_height; s->mb_y++){
6984         for(;s->mb_x < s->mb_width; s->mb_x++){
6985             int ret= decode_mb(h);
6986
6987             hl_decode_mb(h);
6988
6989             if(ret<0){
6990                 av_log(s->avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6991                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6992
6993                 return -1;
6994             }
6995
6996             if(++s->mb_x >= s->mb_width){
6997                 s->mb_x=0;
6998                 if(++s->mb_y >= s->mb_height){
6999                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
7000                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7001
7002                         return 0;
7003                     }else{
7004                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7005
7006                         return -1;
7007                     }
7008                 }
7009             }
7010
7011             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
7012                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
7013                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
7014
7015                     return 0;
7016                 }else{
7017                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
7018
7019                     return -1;
7020                 }
7021             }
7022         }
7023         s->mb_x=0;
7024         ff_draw_horiz_band(s, 16*s->mb_y, 16);
7025     }
7026 #endif
7027     return -1; //not reached
7028 }
7029
7030 static int decode_unregistered_user_data(H264Context *h, int size){
7031     MpegEncContext * const s = &h->s;
7032     uint8_t user_data[16+256];
7033     int e, build, i;
7034
7035     if(size<16)
7036         return -1;
7037
7038     for(i=0; i<sizeof(user_data)-1 && i<size; i++){
7039         user_data[i]= get_bits(&s->gb, 8);
7040     }
7041
7042     user_data[i]= 0;
7043     e= sscanf(user_data+16, "x264 - core %d"/*%s - H.264/MPEG-4 AVC codec - Copyleft 2005 - http://www.videolan.org/x264.html*/, &build);
7044     if(e==1 && build>=0)
7045         h->x264_build= build;
7046
7047     if(s->avctx->debug & FF_DEBUG_BUGS)
7048         av_log(s->avctx, AV_LOG_DEBUG, "user data:\"%s\"\n", user_data+16);
7049
7050     for(; i<size; i++)
7051         skip_bits(&s->gb, 8);
7052
7053     return 0;
7054 }
7055
7056 static int decode_sei(H264Context *h){
7057     MpegEncContext * const s = &h->s;
7058
7059     while(get_bits_count(&s->gb) + 16 < s->gb.size_in_bits){
7060         int size, type;
7061
7062         type=0;
7063         do{
7064             type+= show_bits(&s->gb, 8);
7065         }while(get_bits(&s->gb, 8) == 255);
7066
7067         size=0;
7068         do{
7069             size+= show_bits(&s->gb, 8);
7070         }while(get_bits(&s->gb, 8) == 255);
7071
7072         switch(type){
7073         case 5:
7074             if(decode_unregistered_user_data(h, size) < 0);
7075                 return -1;
7076             break;
7077         default:
7078             skip_bits(&s->gb, 8*size);
7079         }
7080
7081         //FIXME check bits here
7082         align_get_bits(&s->gb);
7083     }
7084
7085     return 0;
7086 }
7087
7088 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
7089     MpegEncContext * const s = &h->s;
7090     int cpb_count, i;
7091     cpb_count = get_ue_golomb(&s->gb) + 1;
7092     get_bits(&s->gb, 4); /* bit_rate_scale */
7093     get_bits(&s->gb, 4); /* cpb_size_scale */
7094     for(i=0; i<cpb_count; i++){
7095         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
7096         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
7097         get_bits1(&s->gb);     /* cbr_flag */
7098     }
7099     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
7100     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
7101     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
7102     get_bits(&s->gb, 5); /* time_offset_length */
7103 }
7104
7105 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
7106     MpegEncContext * const s = &h->s;
7107     int aspect_ratio_info_present_flag, aspect_ratio_idc;
7108     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
7109
7110     aspect_ratio_info_present_flag= get_bits1(&s->gb);
7111
7112     if( aspect_ratio_info_present_flag ) {
7113         aspect_ratio_idc= get_bits(&s->gb, 8);
7114         if( aspect_ratio_idc == EXTENDED_SAR ) {
7115             sps->sar.num= get_bits(&s->gb, 16);
7116             sps->sar.den= get_bits(&s->gb, 16);
7117         }else if(aspect_ratio_idc < 14){
7118             sps->sar=  pixel_aspect[aspect_ratio_idc];
7119         }else{
7120             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
7121             return -1;
7122         }
7123     }else{
7124         sps->sar.num=
7125         sps->sar.den= 0;
7126     }
7127 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
7128
7129     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
7130         get_bits1(&s->gb);      /* overscan_appropriate_flag */
7131     }
7132
7133     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
7134         get_bits(&s->gb, 3);    /* video_format */
7135         get_bits1(&s->gb);      /* video_full_range_flag */
7136         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
7137             get_bits(&s->gb, 8); /* colour_primaries */
7138             get_bits(&s->gb, 8); /* transfer_characteristics */
7139             get_bits(&s->gb, 8); /* matrix_coefficients */
7140         }
7141     }
7142
7143     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
7144         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
7145         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
7146     }
7147
7148     sps->timing_info_present_flag = get_bits1(&s->gb);
7149     if(sps->timing_info_present_flag){
7150         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
7151         sps->time_scale = get_bits_long(&s->gb, 32);
7152         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
7153     }
7154
7155     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
7156     if(nal_hrd_parameters_present_flag)
7157         decode_hrd_parameters(h, sps);
7158     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
7159     if(vcl_hrd_parameters_present_flag)
7160         decode_hrd_parameters(h, sps);
7161     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
7162         get_bits1(&s->gb);     /* low_delay_hrd_flag */
7163     get_bits1(&s->gb);         /* pic_struct_present_flag */
7164
7165     sps->bitstream_restriction_flag = get_bits1(&s->gb);
7166     if(sps->bitstream_restriction_flag){
7167         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
7168         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
7169         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
7170         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
7171         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
7172         sps->num_reorder_frames = get_ue_golomb(&s->gb);
7173         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
7174     }
7175
7176     return 0;
7177 }
7178
7179 static void decode_scaling_list(H264Context *h, uint8_t *factors, int size,
7180                                 const uint8_t *jvt_list, const uint8_t *fallback_list){
7181     MpegEncContext * const s = &h->s;
7182     int i, last = 8, next = 8;
7183     const uint8_t *scan = size == 16 ? zigzag_scan : zigzag_scan8x8;
7184     if(!get_bits1(&s->gb)) /* matrix not written, we use the predicted one */
7185         memcpy(factors, fallback_list, size*sizeof(uint8_t));
7186     else
7187     for(i=0;i<size;i++){
7188         if(next)
7189             next = (last + get_se_golomb(&s->gb)) & 0xff;
7190         if(!i && !next){ /* matrix not written, we use the preset one */
7191             memcpy(factors, jvt_list, size*sizeof(uint8_t));
7192             break;
7193         }
7194         last = factors[scan[i]] = next ? next : last;
7195     }
7196 }
7197
7198 static void decode_scaling_matrices(H264Context *h, SPS *sps, PPS *pps, int is_sps,
7199                                    uint8_t (*scaling_matrix4)[16], uint8_t (*scaling_matrix8)[64]){
7200     MpegEncContext * const s = &h->s;
7201     int fallback_sps = !is_sps && sps->scaling_matrix_present;
7202     const uint8_t *fallback[4] = {
7203         fallback_sps ? sps->scaling_matrix4[0] : default_scaling4[0],
7204         fallback_sps ? sps->scaling_matrix4[3] : default_scaling4[1],
7205         fallback_sps ? sps->scaling_matrix8[0] : default_scaling8[0],
7206         fallback_sps ? sps->scaling_matrix8[1] : default_scaling8[1]
7207     };
7208     if(get_bits1(&s->gb)){
7209         sps->scaling_matrix_present |= is_sps;
7210         decode_scaling_list(h,scaling_matrix4[0],16,default_scaling4[0],fallback[0]); // Intra, Y
7211         decode_scaling_list(h,scaling_matrix4[1],16,default_scaling4[0],scaling_matrix4[0]); // Intra, Cr
7212         decode_scaling_list(h,scaling_matrix4[2],16,default_scaling4[0],scaling_matrix4[1]); // Intra, Cb
7213         decode_scaling_list(h,scaling_matrix4[3],16,default_scaling4[1],fallback[1]); // Inter, Y
7214         decode_scaling_list(h,scaling_matrix4[4],16,default_scaling4[1],scaling_matrix4[3]); // Inter, Cr
7215         decode_scaling_list(h,scaling_matrix4[5],16,default_scaling4[1],scaling_matrix4[4]); // Inter, Cb
7216         if(is_sps || pps->transform_8x8_mode){
7217             decode_scaling_list(h,scaling_matrix8[0],64,default_scaling8[0],fallback[2]);  // Intra, Y
7218             decode_scaling_list(h,scaling_matrix8[1],64,default_scaling8[1],fallback[3]);  // Inter, Y
7219         }
7220     } else if(fallback_sps) {
7221         memcpy(scaling_matrix4, sps->scaling_matrix4, 6*16*sizeof(uint8_t));
7222         memcpy(scaling_matrix8, sps->scaling_matrix8, 2*64*sizeof(uint8_t));
7223     }
7224 }
7225
7226 static inline int decode_seq_parameter_set(H264Context *h){
7227     MpegEncContext * const s = &h->s;
7228     int profile_idc, level_idc;
7229     int sps_id, i;
7230     SPS *sps;
7231
7232     profile_idc= get_bits(&s->gb, 8);
7233     get_bits1(&s->gb);   //constraint_set0_flag
7234     get_bits1(&s->gb);   //constraint_set1_flag
7235     get_bits1(&s->gb);   //constraint_set2_flag
7236     get_bits1(&s->gb);   //constraint_set3_flag
7237     get_bits(&s->gb, 4); // reserved
7238     level_idc= get_bits(&s->gb, 8);
7239     sps_id= get_ue_golomb(&s->gb);
7240
7241     sps= &h->sps_buffer[ sps_id ];
7242     sps->profile_idc= profile_idc;
7243     sps->level_idc= level_idc;
7244
7245     if(sps->profile_idc >= 100){ //high profile
7246         if(get_ue_golomb(&s->gb) == 3) //chroma_format_idc
7247             get_bits1(&s->gb);  //residual_color_transform_flag
7248         get_ue_golomb(&s->gb);  //bit_depth_luma_minus8
7249         get_ue_golomb(&s->gb);  //bit_depth_chroma_minus8
7250         sps->transform_bypass = get_bits1(&s->gb);
7251         decode_scaling_matrices(h, sps, NULL, 1, sps->scaling_matrix4, sps->scaling_matrix8);
7252     }else
7253         sps->scaling_matrix_present = 0;
7254
7255     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
7256     sps->poc_type= get_ue_golomb(&s->gb);
7257
7258     if(sps->poc_type == 0){ //FIXME #define
7259         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
7260     } else if(sps->poc_type == 1){//FIXME #define
7261         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
7262         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
7263         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
7264         sps->poc_cycle_length= get_ue_golomb(&s->gb);
7265
7266         for(i=0; i<sps->poc_cycle_length; i++)
7267             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
7268     }
7269     if(sps->poc_type > 2){
7270         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
7271         return -1;
7272     }
7273
7274     sps->ref_frame_count= get_ue_golomb(&s->gb);
7275     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
7276         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
7277     }
7278     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
7279     sps->mb_width= get_ue_golomb(&s->gb) + 1;
7280     sps->mb_height= get_ue_golomb(&s->gb) + 1;
7281     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
7282        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
7283         return -1;
7284
7285     sps->frame_mbs_only_flag= get_bits1(&s->gb);
7286     if(!sps->frame_mbs_only_flag)
7287         av_log(h->s.avctx, AV_LOG_ERROR, "interlacing is not supported, picture will probably be garbage\n");
7288     if(!sps->frame_mbs_only_flag)
7289         sps->mb_aff= get_bits1(&s->gb);
7290     else
7291         sps->mb_aff= 0;
7292
7293     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
7294
7295     sps->crop= get_bits1(&s->gb);
7296     if(sps->crop){
7297         sps->crop_left  = get_ue_golomb(&s->gb);
7298         sps->crop_right = get_ue_golomb(&s->gb);
7299         sps->crop_top   = get_ue_golomb(&s->gb);
7300         sps->crop_bottom= get_ue_golomb(&s->gb);
7301         if(sps->crop_left || sps->crop_top){
7302             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
7303         }
7304     }else{
7305         sps->crop_left  =
7306         sps->crop_right =
7307         sps->crop_top   =
7308         sps->crop_bottom= 0;
7309     }
7310
7311     sps->vui_parameters_present_flag= get_bits1(&s->gb);
7312     if( sps->vui_parameters_present_flag )
7313         decode_vui_parameters(h, sps);
7314
7315     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7316         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
7317                sps_id, sps->profile_idc, sps->level_idc,
7318                sps->poc_type,
7319                sps->ref_frame_count,
7320                sps->mb_width, sps->mb_height,
7321                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
7322                sps->direct_8x8_inference_flag ? "8B8" : "",
7323                sps->crop_left, sps->crop_right,
7324                sps->crop_top, sps->crop_bottom,
7325                sps->vui_parameters_present_flag ? "VUI" : ""
7326                );
7327     }
7328     return 0;
7329 }
7330
7331 static inline int decode_picture_parameter_set(H264Context *h, int bit_length){
7332     MpegEncContext * const s = &h->s;
7333     int pps_id= get_ue_golomb(&s->gb);
7334     PPS *pps= &h->pps_buffer[pps_id];
7335
7336     pps->sps_id= get_ue_golomb(&s->gb);
7337     pps->cabac= get_bits1(&s->gb);
7338     pps->pic_order_present= get_bits1(&s->gb);
7339     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
7340     if(pps->slice_group_count > 1 ){
7341         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
7342         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
7343         switch(pps->mb_slice_group_map_type){
7344         case 0:
7345 #if 0
7346 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
7347 |    run_length[ i ]                                |1  |ue(v)   |
7348 #endif
7349             break;
7350         case 2:
7351 #if 0
7352 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
7353 |{                                                  |   |        |
7354 |    top_left_mb[ i ]                               |1  |ue(v)   |
7355 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
7356 |   }                                               |   |        |
7357 #endif
7358             break;
7359         case 3:
7360         case 4:
7361         case 5:
7362 #if 0
7363 |   slice_group_change_direction_flag               |1  |u(1)    |
7364 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
7365 #endif
7366             break;
7367         case 6:
7368 #if 0
7369 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
7370 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
7371 |)                                                  |   |        |
7372 |    slice_group_id[ i ]                            |1  |u(v)    |
7373 #endif
7374             break;
7375         }
7376     }
7377     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
7378     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
7379     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
7380         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
7381         return -1;
7382     }
7383
7384     pps->weighted_pred= get_bits1(&s->gb);
7385     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
7386     pps->init_qp= get_se_golomb(&s->gb) + 26;
7387     pps->init_qs= get_se_golomb(&s->gb) + 26;
7388     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
7389     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
7390     pps->constrained_intra_pred= get_bits1(&s->gb);
7391     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
7392
7393     pps->transform_8x8_mode= 0;
7394     h->dequant_coeff_pps= -1; //contents of sps/pps can change even if id doesn't, so reinit
7395     memset(pps->scaling_matrix4, 16, 6*16*sizeof(uint8_t));
7396     memset(pps->scaling_matrix8, 16, 2*64*sizeof(uint8_t));
7397
7398     if(get_bits_count(&s->gb) < bit_length){
7399         pps->transform_8x8_mode= get_bits1(&s->gb);
7400         decode_scaling_matrices(h, &h->sps_buffer[pps->sps_id], pps, 0, pps->scaling_matrix4, pps->scaling_matrix8);
7401         get_se_golomb(&s->gb);  //second_chroma_qp_index_offset
7402     }
7403
7404     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
7405         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s %s\n",
7406                pps_id, pps->sps_id,
7407                pps->cabac ? "CABAC" : "CAVLC",
7408                pps->slice_group_count,
7409                pps->ref_count[0], pps->ref_count[1],
7410                pps->weighted_pred ? "weighted" : "",
7411                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
7412                pps->deblocking_filter_parameters_present ? "LPAR" : "",
7413                pps->constrained_intra_pred ? "CONSTR" : "",
7414                pps->redundant_pic_cnt_present ? "REDU" : "",
7415                pps->transform_8x8_mode ? "8x8DCT" : ""
7416                );
7417     }
7418
7419     return 0;
7420 }
7421
7422 /**
7423  * finds the end of the current frame in the bitstream.
7424  * @return the position of the first byte of the next frame, or -1
7425  */
7426 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
7427     int i;
7428     uint32_t state;
7429     ParseContext *pc = &(h->s.parse_context);
7430 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
7431 //    mb_addr= pc->mb_addr - 1;
7432     state= pc->state;
7433     for(i=0; i<=buf_size; i++){
7434         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7435             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
7436             if(pc->frame_start_found){
7437                 // If there isn't one more byte in the buffer
7438                 // the test on first_mb_in_slice cannot be done yet
7439                 // do it at next call.
7440                 if (i >= buf_size) break;
7441                 if (buf[i] & 0x80) {
7442                     // first_mb_in_slice is 0, probably the first nal of a new
7443                     // slice
7444                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
7445                     pc->state=-1;
7446                     pc->frame_start_found= 0;
7447                     return i-4;
7448                 }
7449             }
7450             pc->frame_start_found = 1;
7451         }
7452         if((state&0xFFFFFF1F) == 0x107 || (state&0xFFFFFF1F) == 0x108 || (state&0xFFFFFF1F) == 0x109){
7453            if(pc->frame_start_found){
7454                 pc->state=-1;
7455                 pc->frame_start_found= 0;
7456                 return i-4;
7457            }
7458         }
7459         if (i<buf_size)
7460             state= (state<<8) | buf[i];
7461     }
7462
7463     pc->state= state;
7464     return END_NOT_FOUND;
7465 }
7466
7467 static int h264_parse(AVCodecParserContext *s,
7468                       AVCodecContext *avctx,
7469                       uint8_t **poutbuf, int *poutbuf_size,
7470                       const uint8_t *buf, int buf_size)
7471 {
7472     H264Context *h = s->priv_data;
7473     ParseContext *pc = &h->s.parse_context;
7474     int next;
7475
7476     next= find_frame_end(h, buf, buf_size);
7477
7478     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
7479         *poutbuf = NULL;
7480         *poutbuf_size = 0;
7481         return buf_size;
7482     }
7483
7484     *poutbuf = (uint8_t *)buf;
7485     *poutbuf_size = buf_size;
7486     return next;
7487 }
7488
7489 static int h264_split(AVCodecContext *avctx,
7490                       const uint8_t *buf, int buf_size)
7491 {
7492     int i;
7493     uint32_t state = -1;
7494     int has_sps= 0;
7495
7496     for(i=0; i<=buf_size; i++){
7497         if((state&0xFFFFFF1F) == 0x107)
7498             has_sps=1;
7499 /*        if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
7500         }*/
7501         if((state&0xFFFFFF00) == 0x100 && (state&0xFFFFFF1F) != 0x107 && (state&0xFFFFFF1F) != 0x108 && (state&0xFFFFFF1F) != 0x109){
7502             if(has_sps){
7503                 while(i>4 && buf[i-5]==0) i--;
7504                 return i-4;
7505             }
7506         }
7507         if (i<buf_size)
7508             state= (state<<8) | buf[i];
7509     }
7510     return 0;
7511 }
7512
7513
7514 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
7515     MpegEncContext * const s = &h->s;
7516     AVCodecContext * const avctx= s->avctx;
7517     int buf_index=0;
7518 #if 0
7519     int i;
7520     for(i=0; i<50; i++){
7521         av_log(NULL, AV_LOG_ERROR,"%02X ", buf[i]);
7522     }
7523 #endif
7524     h->slice_num = 0;
7525     s->current_picture_ptr= NULL;
7526     for(;;){
7527         int consumed;
7528         int dst_length;
7529         int bit_length;
7530         uint8_t *ptr;
7531         int i, nalsize = 0;
7532
7533       if(h->is_avc) {
7534         if(buf_index >= buf_size) break;
7535         nalsize = 0;
7536         for(i = 0; i < h->nal_length_size; i++)
7537             nalsize = (nalsize << 8) | buf[buf_index++];
7538         if(nalsize <= 1){
7539             if(nalsize == 1){
7540                 buf_index++;
7541                 continue;
7542             }else{
7543                 av_log(h->s.avctx, AV_LOG_ERROR, "AVC: nal size %d\n", nalsize);
7544                 break;
7545             }
7546         }
7547       } else {
7548         // start code prefix search
7549         for(; buf_index + 3 < buf_size; buf_index++){
7550             // this should allways succeed in the first iteration
7551             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
7552                 break;
7553         }
7554
7555         if(buf_index+3 >= buf_size) break;
7556
7557         buf_index+=3;
7558       }
7559
7560         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
7561         if(ptr[dst_length - 1] == 0) dst_length--;
7562         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
7563
7564         if(s->avctx->debug&FF_DEBUG_STARTCODE){
7565             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
7566         }
7567
7568         if (h->is_avc && (nalsize != consumed))
7569             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
7570
7571         buf_index += consumed;
7572
7573         if(  (s->hurry_up == 1 && h->nal_ref_idc  == 0) //FIXME dont discard SEI id
7574            ||(avctx->skip_frame >= AVDISCARD_NONREF && h->nal_ref_idc  == 0))
7575             continue;
7576
7577         switch(h->nal_unit_type){
7578         case NAL_IDR_SLICE:
7579             idr(h); //FIXME ensure we don't loose some frames if there is reordering
7580         case NAL_SLICE:
7581             init_get_bits(&s->gb, ptr, bit_length);
7582             h->intra_gb_ptr=
7583             h->inter_gb_ptr= &s->gb;
7584             s->data_partitioning = 0;
7585
7586             if(decode_slice_header(h) < 0){
7587                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7588                 break;
7589             }
7590             s->current_picture_ptr->key_frame= (h->nal_unit_type == NAL_IDR_SLICE);
7591             if(h->redundant_pic_count==0 && s->hurry_up < 5
7592                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7593                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7594                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7595                && avctx->skip_frame < AVDISCARD_ALL)
7596                 decode_slice(h);
7597             break;
7598         case NAL_DPA:
7599             init_get_bits(&s->gb, ptr, bit_length);
7600             h->intra_gb_ptr=
7601             h->inter_gb_ptr= NULL;
7602             s->data_partitioning = 1;
7603
7604             if(decode_slice_header(h) < 0){
7605                 av_log(h->s.avctx, AV_LOG_ERROR, "decode_slice_header error\n");
7606             }
7607             break;
7608         case NAL_DPB:
7609             init_get_bits(&h->intra_gb, ptr, bit_length);
7610             h->intra_gb_ptr= &h->intra_gb;
7611             break;
7612         case NAL_DPC:
7613             init_get_bits(&h->inter_gb, ptr, bit_length);
7614             h->inter_gb_ptr= &h->inter_gb;
7615
7616             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning
7617                && s->hurry_up < 5
7618                && (avctx->skip_frame < AVDISCARD_NONREF || h->nal_ref_idc)
7619                && (avctx->skip_frame < AVDISCARD_BIDIR  || h->slice_type!=B_TYPE)
7620                && (avctx->skip_frame < AVDISCARD_NONKEY || h->slice_type==I_TYPE)
7621                && avctx->skip_frame < AVDISCARD_ALL)
7622                 decode_slice(h);
7623             break;
7624         case NAL_SEI:
7625             init_get_bits(&s->gb, ptr, bit_length);
7626             decode_sei(h);
7627             break;
7628         case NAL_SPS:
7629             init_get_bits(&s->gb, ptr, bit_length);
7630             decode_seq_parameter_set(h);
7631
7632             if(s->flags& CODEC_FLAG_LOW_DELAY)
7633                 s->low_delay=1;
7634
7635             if(avctx->has_b_frames < 2)
7636                 avctx->has_b_frames= !s->low_delay;
7637             break;
7638         case NAL_PPS:
7639             init_get_bits(&s->gb, ptr, bit_length);
7640
7641             decode_picture_parameter_set(h, bit_length);
7642
7643             break;
7644         case NAL_AUD:
7645         case NAL_END_SEQUENCE:
7646         case NAL_END_STREAM:
7647         case NAL_FILLER_DATA:
7648         case NAL_SPS_EXT:
7649         case NAL_AUXILIARY_SLICE:
7650             break;
7651         default:
7652             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
7653         }
7654     }
7655
7656     if(!s->current_picture_ptr) return buf_index; //no frame
7657
7658     s->current_picture_ptr->qscale_type= FF_QSCALE_TYPE_H264;
7659     s->current_picture_ptr->pict_type= s->pict_type;
7660
7661     h->prev_frame_num_offset= h->frame_num_offset;
7662     h->prev_frame_num= h->frame_num;
7663     if(s->current_picture_ptr->reference){
7664         h->prev_poc_msb= h->poc_msb;
7665         h->prev_poc_lsb= h->poc_lsb;
7666     }
7667     if(s->current_picture_ptr->reference)
7668         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
7669
7670     ff_er_frame_end(s);
7671
7672     MPV_frame_end(s);
7673
7674     return buf_index;
7675 }
7676
7677 /**
7678  * returns the number of bytes consumed for building the current frame
7679  */
7680 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
7681     if(s->flags&CODEC_FLAG_TRUNCATED){
7682         pos -= s->parse_context.last_index;
7683         if(pos<0) pos=0; // FIXME remove (unneeded?)
7684
7685         return pos;
7686     }else{
7687         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
7688         if(pos+10>buf_size) pos=buf_size; // oops ;)
7689
7690         return pos;
7691     }
7692 }
7693
7694 static int decode_frame(AVCodecContext *avctx,
7695                              void *data, int *data_size,
7696                              uint8_t *buf, int buf_size)
7697 {
7698     H264Context *h = avctx->priv_data;
7699     MpegEncContext *s = &h->s;
7700     AVFrame *pict = data;
7701     int buf_index;
7702
7703     s->flags= avctx->flags;
7704     s->flags2= avctx->flags2;
7705
7706    /* no supplementary picture */
7707     if (buf_size == 0) {
7708         return 0;
7709     }
7710
7711     if(s->flags&CODEC_FLAG_TRUNCATED){
7712         int next= find_frame_end(h, buf, buf_size);
7713
7714         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
7715             return buf_size;
7716 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
7717     }
7718
7719     if(h->is_avc && !h->got_avcC) {
7720         int i, cnt, nalsize;
7721         unsigned char *p = avctx->extradata;
7722         if(avctx->extradata_size < 7) {
7723             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
7724             return -1;
7725         }
7726         if(*p != 1) {
7727             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
7728             return -1;
7729         }
7730         /* sps and pps in the avcC always have length coded with 2 bytes,
7731            so put a fake nal_length_size = 2 while parsing them */
7732         h->nal_length_size = 2;
7733         // Decode sps from avcC
7734         cnt = *(p+5) & 0x1f; // Number of sps
7735         p += 6;
7736         for (i = 0; i < cnt; i++) {
7737             nalsize = BE_16(p) + 2;
7738             if(decode_nal_units(h, p, nalsize) < 0) {
7739                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
7740                 return -1;
7741             }
7742             p += nalsize;
7743         }
7744         // Decode pps from avcC
7745         cnt = *(p++); // Number of pps
7746         for (i = 0; i < cnt; i++) {
7747             nalsize = BE_16(p) + 2;
7748             if(decode_nal_units(h, p, nalsize)  != nalsize) {
7749                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
7750                 return -1;
7751             }
7752             p += nalsize;
7753         }
7754         // Now store right nal length size, that will be use to parse all other nals
7755         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
7756         // Do not reparse avcC
7757         h->got_avcC = 1;
7758     }
7759
7760     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
7761         if(decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) < 0)
7762             return -1;
7763     }
7764
7765     buf_index=decode_nal_units(h, buf, buf_size);
7766     if(buf_index < 0)
7767         return -1;
7768
7769     //FIXME do something with unavailable reference frames
7770
7771 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
7772     if(!s->current_picture_ptr){
7773         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
7774         return -1;
7775     }
7776
7777     {
7778         Picture *out = s->current_picture_ptr;
7779 #if 0 //decode order
7780         *data_size = sizeof(AVFrame);
7781 #else
7782         /* Sort B-frames into display order */
7783         Picture *cur = s->current_picture_ptr;
7784         Picture *prev = h->delayed_output_pic;
7785         int out_idx = 0;
7786         int pics = 0;
7787         int out_of_order;
7788         int cross_idr = 0;
7789         int dropped_frame = 0;
7790         int i;
7791
7792         if(h->sps.bitstream_restriction_flag
7793            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
7794             s->avctx->has_b_frames = h->sps.num_reorder_frames;
7795             s->low_delay = 0;
7796         }
7797
7798         while(h->delayed_pic[pics]) pics++;
7799         h->delayed_pic[pics++] = cur;
7800         if(cur->reference == 0)
7801             cur->reference = 1;
7802
7803         for(i=0; h->delayed_pic[i]; i++)
7804             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
7805                 cross_idr = 1;
7806
7807         out = h->delayed_pic[0];
7808         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7809             if(h->delayed_pic[i]->poc < out->poc){
7810                 out = h->delayed_pic[i];
7811                 out_idx = i;
7812             }
7813
7814         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7815         if(h->sps.bitstream_restriction_flag && s->avctx->has_b_frames >= h->sps.num_reorder_frames)
7816             { }
7817         else if(prev && pics <= s->avctx->has_b_frames)
7818             out = prev;
7819         else if((out_of_order && pics-1 == s->avctx->has_b_frames && pics < 15)
7820            || (s->low_delay &&
7821             ((!cross_idr && prev && out->poc > prev->poc + 2)
7822              || cur->pict_type == B_TYPE)))
7823         {
7824             s->low_delay = 0;
7825             s->avctx->has_b_frames++;
7826             out = prev;
7827         }
7828         else if(out_of_order)
7829             out = prev;
7830
7831         if(out_of_order || pics > s->avctx->has_b_frames){
7832             dropped_frame = (out != h->delayed_pic[out_idx]);
7833             for(i=out_idx; h->delayed_pic[i]; i++)
7834                 h->delayed_pic[i] = h->delayed_pic[i+1];
7835         }
7836
7837         if(prev == out && !dropped_frame)
7838             *data_size = 0;
7839         else
7840             *data_size = sizeof(AVFrame);
7841         if(prev && prev != out && prev->reference == 1)
7842             prev->reference = 0;
7843         h->delayed_output_pic = out;
7844 #endif
7845
7846         if(out)
7847             *pict= *(AVFrame*)out;
7848         else
7849             av_log(avctx, AV_LOG_DEBUG, "no picture\n");
7850     }
7851
7852     assert(pict->data[0] || !*data_size);
7853     ff_print_debug_info(s, pict);
7854 //printf("out %d\n", (int)pict->data[0]);
7855 #if 0 //?
7856
7857     /* Return the Picture timestamp as the frame number */
7858     /* we substract 1 because it is added on utils.c    */
7859     avctx->frame_number = s->picture_number - 1;
7860 #endif
7861     return get_consumed_bytes(s, buf_index, buf_size);
7862 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock (currently compiled out).
 * Entries 0..2 are the top-left, top and top-right neighbours, entry 3 is
 * the left neighbour; a neighbour counts as available if it lies inside the
 * picture and its slice_table entry equals h->slice_num.
 * Entries 4 and 5 are set to the constants 1 and 0.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy= mb_xy - s->mb_stride;

    if(!s->mb_y){
        // first row: nothing above
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7882
7883 #if 0 //selftest
     /*
      * Stand-alone self test, compiled out by the surrounding #if 0.
      * It runs, in this order:
      *  - a write/read round trip of COUNT unsigned exp-golomb codes,
      *  - a write/read round trip of COUNT signed exp-golomb codes,
      *  - an error measurement of the 4x4 forward DCT followed by the IDCT,
      *  - an encode_nal()/decode_nal() round trip on random data.
      * NOTE(review): the code does not look compilable as it stands, see the
      * notes further down.
      */
7884 #define COUNT 8000
7885 #define SIZE (COUNT*40)
7886 int main(){
7887     int i;
7888     uint8_t temp[SIZE];
7889     PutBitContext pb;
7890     GetBitContext gb;
7891 //    int int_temp[10000];
7892     DSPContext dsp;
7893     AVCodecContext avctx;
7894
7895     dsputil_init(&dsp, &avctx);
7896
         /* unsigned exp-golomb: write 0..COUNT-1, then read the values back */
7897     init_put_bits(&pb, temp, SIZE);
7898     printf("testing unsigned exp golomb\n");
7899     for(i=0; i<COUNT; i++){
7900         START_TIMER
7901         set_ue_golomb(&pb, i);
7902         STOP_TIMER("set_ue_golomb");
7903     }
7904     flush_put_bits(&pb);
7905
7906     init_get_bits(&gb, temp, 8*SIZE);
7907     for(i=0; i<COUNT; i++){
7908         int j, s;
7909
             /* peek at the next 24 bits, only used for the mismatch message */
7910         s= show_bits(&gb, 24);
7911
7912         START_TIMER
7913         j= get_ue_golomb(&gb);
7914         if(j != i){
7915             printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7916 //            return -1;
7917         }
7918         STOP_TIMER("get_ue_golomb");
7919     }
7920
7921
         /* signed exp-golomb: write -COUNT/2..COUNT/2-1, then read them back */
7922     init_put_bits(&pb, temp, SIZE);
7923     printf("testing signed exp golomb\n");
7924     for(i=0; i<COUNT; i++){
7925         START_TIMER
7926         set_se_golomb(&pb, i - COUNT/2);
7927         STOP_TIMER("set_se_golomb");
7928     }
7929     flush_put_bits(&pb);
7930
7931     init_get_bits(&gb, temp, 8*SIZE);
7932     for(i=0; i<COUNT; i++){
7933         int j, s;
7934
7935         s= show_bits(&gb, 24);
7936
7937         START_TIMER
7938         j= get_se_golomb(&gb);
7939         if(j != i - COUNT/2){
                 /* NOTE(review): the value compared against is i - COUNT/2,
                    but the message prints i as the expected value */
7940             printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
7941 //            return -1;
7942         }
7943         STOP_TIMER("get_se_golomb");
7944     }
7945
7946     printf("testing 4x4 (I)DCT\n");
7947
7948     DCTELEM block[16];
7949     uint8_t src[16], ref[16];
7950     uint64_t error= 0, max_error=0;
7951
7952     for(i=0; i<COUNT; i++){
7953         int j;
7954 //        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
             /* random()%255 gives sample values in 0..254 */
7955         for(j=0; j<16; j++){
7956             ref[j]= random()%255;
7957             src[j]= random()%255;
7958         }
7959
7960         h264_diff_dct_c(block, src, ref, 4);
7961
7962         //normalize
7963         for(j=0; j<16; j++){
7964 //            printf("%d ", block[j]);
7965             block[j]= block[j]*4;
7966             if(j&1) block[j]= (block[j]*4 + 2)/5;
7967             if(j&4) block[j]= (block[j]*4 + 2)/5;
7968         }
7969 //        printf("\n");
7970
             /* NOTE(review): no pointer named s is declared in main(); the only
                s are the ints local to the golomb loops above. The DSPContext
                initialized at the top of main() is the local variable dsp. */
7971         s->dsp.h264_idct_add(ref, block, 4);
7972 /*        for(j=0; j<16; j++){
7973             printf("%d ", ref[j]);
7974         }
7975         printf("\n");*/
7976
             /* accumulate the squared and the maximum absolute difference */
7977         for(j=0; j<16; j++){
7978             int diff= ABS(src[j] - ref[j]);
7979
7980             error+= diff*diff;
7981             max_error= FFMAX(max_error, diff);
7982         }
7983     }
7984     printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
7985 #if 0
7986     printf("testing quantizer\n");
7987     for(qp=0; qp<52; qp++){
7988         for(i=0; i<16; i++)
7989             src1_block[i]= src2_block[i]= random()%255;
7990
7991     }
7992 #endif
7993     printf("Testing NAL layer\n");
7994
7995     uint8_t bitstream[COUNT];
7996     uint8_t nal[COUNT*2];
7997     H264Context h;
7998     memset(&h, 0, sizeof(H264Context));
7999
         /* iteration i round-trips a random payload containing i zero bytes */
8000     for(i=0; i<COUNT; i++){
8001         int zeros= i;
8002         int nal_length;
8003         int consumed;
8004         int out_length;
8005         uint8_t *out;
8006         int j;
8007
             /* fill with non-zero bytes (1..255) */
8008         for(j=0; j<COUNT; j++){
8009             bitstream[j]= (random() % 255) + 1;
8010         }
8011
             /* clear `zeros` distinct bytes; a position that is already zero is
                skipped by stepping forward (with wrap-around) to the next
                non-zero byte */
8012         for(j=0; j<zeros; j++){
8013             int pos= random() % COUNT;
8014             while(bitstream[pos] == 0){
8015                 pos++;
8016                 pos %= COUNT;
8017             }
8018             bitstream[pos]=0;
8019         }
8020
8021         START_TIMER
8022
8023         nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
8024         if(nal_length<0){
8025             printf("encoding failed\n");
8026             return -1;
8027         }
8028
8029         out= decode_nal(&h, nal, &out_length, &consumed, nal_length);
8030
8031         STOP_TIMER("NAL")
8032
             /* the decoded payload must match the input in length and content,
                and decode_nal() must have consumed the whole encoded NAL */
8033         if(out_length != COUNT){
8034             printf("incorrect length %d %d\n", out_length, COUNT);
8035             return -1;
8036         }
8037
8038         if(consumed != nal_length){
8039             printf("incorrect consumed length %d %d\n", nal_length, consumed);
8040             return -1;
8041         }
8042
8043         if(memcmp(bitstream, out, COUNT)){
8044             printf("missmatch\n");
8045             return -1;
8046         }
8047     }
8048
         /* only the banner is printed, no RBSP test follows */
8049     printf("Testing RBSP\n");
8050
8051
8052     return 0;
8053 }
8054 #endif
8055
8056
8057 static int decode_end(AVCodecContext *avctx)
8058 {
8059     H264Context *h = avctx->priv_data;
8060     MpegEncContext *s = &h->s;
8061
8062     av_freep(&h->rbsp_buffer);
8063     free_tables(h); //FIXME cleanup init stuff perhaps
8064     MPV_common_end(s);
8065
8066 //    memset(h, 0, sizeof(H264Context));
8067
8068     return 0;
8069 }
8070
8071
8072 AVCodec h264_decoder = {
     /* Decoder registration entry. The initializers are positional, so their
        meaning depends on the field order of AVCodec, which is declared
        outside of this file - the slot names below are assumptions, TODO
        confirm against that declaration. */
8073     "h264",                 /* presumably the codec name */
8074     CODEC_TYPE_VIDEO,
8075     CODEC_ID_H264,
8076     sizeof(H264Context),    /* presumably the size of priv_data */
8077     decode_init,            /* presumably the init callback */
8078     NULL,                   /* presumably the encode callback, unused by a decoder */
8079     decode_end,             /* presumably the close callback */
8080     decode_frame,           /* presumably the decode callback */
8081     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
8082     .flush= flush_dpb,
8083 };
8084
8085 AVCodecParser h264_parser = {
     /* Parser registration entry. The initializers are positional; the slot
        names below are assumptions based on the values, TODO confirm against
        the AVCodecParser declaration. */
8086     { CODEC_ID_H264 },      /* codec id(s) handled by this parser */
8087     sizeof(H264Context),    /* presumably the size of priv_data */
8088     NULL,                   /* presumably the init callback, none needed */
8089     h264_parse,             /* frame splitting, see h264_parse() */
8090     ff_parse_close,
8091     h264_split,             /* header/data split, see h264_split() */
8092 };
8093
8094 #include "svq3.c"