git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  */
  20
  21 /**
  22  * @file h264.c
  23  * H.264 / AVC / MPEG4 part10 codec.
  24  * @author Michael Niedermayer <michaelni@gmx.at>
  25  */
  26
  27 #include "common.h"
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264data.h"
  32 #include "golomb.h"
  33
  34 #include "cabac.h"
  35
  36 #undef NDEBUG
  37 #include <assert.h>
  38
  39 #define interlaced_dct interlaced_dct_is_a_bad_name
  40 #define mb_intra mb_intra_isnt_initalized_see_mb_type
  41
  42 #define LUMA_DC_BLOCK_INDEX   25
  43 #define CHROMA_DC_BLOCK_INDEX 26
  44
  45 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  46 #define COEFF_TOKEN_VLC_BITS           8
  47 #define TOTAL_ZEROS_VLC_BITS           9
  48 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  49 #define RUN_VLC_BITS                   3
  50 #define RUN7_VLC_BITS                  6
  51
  52 #define MAX_SPS_COUNT 32
  53 #define MAX_PPS_COUNT 256
  54
  55 #define MAX_MMCO_COUNT 66
  56
  57 /**
  58  * Sequence parameter set
  59  */
  60 typedef struct SPS{
  61
  62     int profile_idc;
  63     int level_idc;
  64     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  65     int poc_type;                      ///< pic_order_cnt_type
  66     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
  67     int delta_pic_order_always_zero_flag;
  68     int offset_for_non_ref_pic;
  69     int offset_for_top_to_bottom_field;
  70     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  71     int ref_frame_count;               ///< num_ref_frames
  72     int gaps_in_frame_num_allowed_flag;
  73     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  74     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  75     int frame_mbs_only_flag;
  76     int mb_aff;                        ///<mb_adaptive_frame_field_flag
  77     int direct_8x8_inference_flag;
  78     int crop;                   ///< frame_cropping_flag
  79     int crop_left;              ///< frame_cropping_rect_left_offset
  80     int crop_right;             ///< frame_cropping_rect_right_offset
  81     int crop_top;               ///< frame_cropping_rect_top_offset
  82     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
  83     int vui_parameters_present_flag;
  84     AVRational sar;
  85     int timing_info_present_flag;
  86     uint32_t num_units_in_tick;
  87     uint32_t time_scale;
  88     int fixed_frame_rate_flag;
  89     short offset_for_ref_frame[256]; //FIXME dyn aloc?
  90     int bitstream_restriction_flag;
  91     int num_reorder_frames;
  92 }SPS;
  93
  94 /**
  95  * Picture parameter set
  96  */
  97 typedef struct PPS{
  98     int sps_id;
  99     int cabac;                  ///< entropy_coding_mode_flag
 100     int pic_order_present;      ///< pic_order_present_flag
 101     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 102     int mb_slice_group_map_type;
 103     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 104     int weighted_pred;          ///< weighted_pred_flag
 105     int weighted_bipred_idc;
 106     int init_qp;                ///< pic_init_qp_minus26 + 26
 107     int init_qs;                ///< pic_init_qs_minus26 + 26
 108     int chroma_qp_index_offset;
 109     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 110     int constrained_intra_pred; ///< constrained_intra_pred_flag
 111     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 112 }PPS;
 113
 114 /**
 115  * Memory management control operation opcode.
 116  */
 117 typedef enum MMCOOpcode{
 118     MMCO_END=0,
 119     MMCO_SHORT2UNUSED,
 120     MMCO_LONG2UNUSED,
 121     MMCO_SHORT2LONG,
 122     MMCO_SET_MAX_LONG,
 123     MMCO_RESET,
 124     MMCO_LONG,
 125 } MMCOOpcode;
 126
 127 /**
 128  * Memory management control operation.
 129  */
 130 typedef struct MMCO{
 131     MMCOOpcode opcode;
 132     int short_frame_num;
 133     int long_index;
 134 } MMCO;
 135
 136 /**
 137  * H264Context
 138  */
 139 typedef struct H264Context{
 140     MpegEncContext s;
 141     int nal_ref_idc;
 142     int nal_unit_type;
 143 #define NAL_SLICE               1
 144 #define NAL_DPA                 2
 145 #define NAL_DPB                 3
 146 #define NAL_DPC                 4
 147 #define NAL_IDR_SLICE           5
 148 #define NAL_SEI                 6
 149 #define NAL_SPS                 7
 150 #define NAL_PPS                 8
 151 #define NAL_PICTURE_DELIMITER   9
 152 #define NAL_FILTER_DATA         10
 153     uint8_t *rbsp_buffer;
 154     int rbsp_buffer_size;
 155
 156     /**
 157       * Used to parse AVC variant of h264
 158       */
 159     int is_avc; ///< this flag is != 0 if codec is avc1
 160     int got_avcC; ///< flag used to parse avcC data only once
 161     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 162
 163     int chroma_qp; //QPc
 164
 165     int prev_mb_skiped; //FIXME remove (IMHO not used)
 166
 167     //prediction stuff
 168     int chroma_pred_mode;
 169     int intra16x16_pred_mode;
 170
 171     int top_mb_xy;
 172     int left_mb_xy[2];
 173
 174     int8_t intra4x4_pred_mode_cache[5*8];
 175     int8_t (*intra4x4_pred_mode)[8];
 176     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 177     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 178     void (*pred16x16[4+3])(uint8_t *src, int stride);
 179     unsigned int topleft_samples_available;
 180     unsigned int top_samples_available;
 181     unsigned int topright_samples_available;
 182     unsigned int left_samples_available;
 183     uint8_t (*top_borders[2])[16+2*8];
 184     uint8_t left_border[2*(17+2*9)];
 185
 186     /**
 187      * non zero coeff count cache.
 188      * is 64 if not available.
 189      */
 190     uint8_t non_zero_count_cache[6*8] __align8;
 191     uint8_t (*non_zero_count)[16];
 192
 193     /**
 194      * Motion vector cache.
 195      */
 196     int16_t mv_cache[2][5*8][2] __align8;
 197     int8_t ref_cache[2][5*8] __align8;
 198 #define LIST_NOT_USED -1 //FIXME rename?
 199 #define PART_NOT_AVAILABLE -2
 200
 201     /**
 202      * is 1 if the specific list MV&references are set to 0,0,-2.
 203      */
 204     int mv_cache_clean[2];
 205
 206     /**
 207      * block_offset[ 0..23] for frame macroblocks
 208      * block_offset[24..47] for field macroblocks
 209      */
 210     int block_offset[2*(16+8)];
 211
 212     uint16_t *mb2b_xy; //FIXME are these 4 a good idea?
 213     uint16_t *mb2b8_xy;
 214     int b_stride; //FIXME use s->b4_stride
 215     int b8_stride;
 216
 217     int halfpel_flag;
 218     int thirdpel_flag;
 219
 220     int unknown_svq3_flag;
 221     int next_slice_index;
 222
 223     SPS sps_buffer[MAX_SPS_COUNT];
 224     SPS sps; ///< current sps
 225
 226     PPS pps_buffer[MAX_PPS_COUNT];
 227     /**
 228      * current pps
 229      */
 230     PPS pps; //FIXME move tp Picture perhaps? (->no) do we need that?
 231
 232     int slice_num;
 233     uint8_t *slice_table_base;
 234     uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1
 235     int slice_type;
 236     int slice_type_fixed;
 237
 238     //interlacing specific flags
 239     int mb_aff_frame;
 240     int mb_field_decoding_flag;
 241
 242     int sub_mb_type[4];
 243
 244     //POC stuff
 245     int poc_lsb;
 246     int poc_msb;
 247     int delta_poc_bottom;
 248     int delta_poc[2];
 249     int frame_num;
 250     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 251     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 252     int frame_num_offset;         ///< for POC type 2
 253     int prev_frame_num_offset;    ///< for POC type 2
 254     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 255
 256     /**
 257      * frame_num for frames or 2*frame_num for field pics.
 258      */
 259     int curr_pic_num;
 260
 261     /**
 262      * max_frame_num or 2*max_frame_num for field pics.
 263      */
 264     int max_pic_num;
 265
 266     //Weighted pred stuff
 267     int use_weight;
 268     int use_weight_chroma;
 269     int luma_log2_weight_denom;
 270     int chroma_log2_weight_denom;
 271     int luma_weight[2][16];
 272     int luma_offset[2][16];
 273     int chroma_weight[2][16][2];
 274     int chroma_offset[2][16][2];
 275     int implicit_weight[16][16];
 276
 277     //deblock
 278     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 279     int slice_alpha_c0_offset;
 280     int slice_beta_offset;
 281
 282     int redundant_pic_count;
 283
 284     int direct_spatial_mv_pred;
 285     int dist_scale_factor[16];
 286     int map_col_to_list0[2][16];
 287
 288     /**
 289      * num_ref_idx_l0/1_active_minus1 + 1
 290      */
 291     int ref_count[2];// FIXME split for AFF
 292     Picture *short_ref[32];
 293     Picture *long_ref[32];
 294     Picture default_ref_list[2][32];
 295     Picture ref_list[2][32]; //FIXME size?
 296     Picture field_ref_list[2][32]; //FIXME size?
 297     Picture *delayed_pic[16]; //FIXME size?
 298     Picture *delayed_output_pic;
 299
 300     /**
 301      * memory management control operations buffer.
 302      */
 303     MMCO mmco[MAX_MMCO_COUNT];
 304     int mmco_index;
 305
 306     int long_ref_count;  ///< number of actual long term references
 307     int short_ref_count; ///< number of actual short term references
 308
 309     //data partitioning
 310     GetBitContext intra_gb;
 311     GetBitContext inter_gb;
 312     GetBitContext *intra_gb_ptr;
 313     GetBitContext *inter_gb_ptr;
 314
 315     DCTELEM mb[16*24] __align8;
 316
 317     /**
 318      * Cabac
 319      */
 320     CABACContext cabac;
 321     uint8_t      cabac_state[399];
 322     int          cabac_init_idc;
 323
 324     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 325     uint16_t     *cbp_table;
 326     int top_cbp;
 327     int left_cbp;
 328     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 329     uint8_t     *chroma_pred_mode_table;
 330     int         last_qscale_diff;
 331     int16_t     (*mvd_table[2])[2];
 332     int16_t     mvd_cache[2][5*8][2] __align8;
 333     uint8_t     *direct_table;
 334     uint8_t     direct_cache[5*8];
 335
 336 }H264Context;
 337
 338 static VLC coeff_token_vlc[4];
 339 static VLC chroma_dc_coeff_token_vlc;
 340
 341 static VLC total_zeros_vlc[15];
 342 static VLC chroma_dc_total_zeros_vlc[3];
 343
 344 static VLC run_vlc[6];
 345 static VLC run7_vlc;
 346
 347 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 348 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 349 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 350
 351 static inline uint32_t pack16to32(int a, int b){
 352 #ifdef WORDS_BIGENDIAN
 353    return (b&0xFFFF) + (a<<16);
 354 #else
 355    return (a&0xFFFF) + (b<<16);
 356 #endif
 357 }
 358
 359 /**
 360  * fill a rectangle.
 361  * @param h height of the rectangle, should be a constant
 362  * @param w width of the rectangle, should be a constant
 363  * @param size the size of val (1 or 4), should be a constant
 364  */
 365 static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ //FIXME ensure this IS inlined
 366     uint8_t *p= (uint8_t*)vp;
 367     assert(size==1 || size==4);
 368
 369     w      *= size;
 370     stride *= size;
 371
 372     assert((((int)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 373 //FIXME check what gcc generates for 64 bit on x86 and possible write a 32 bit ver of it
 374     if(w==2 && h==2){
 375         *(uint16_t*)(p + 0)=
 376         *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101;
 377     }else if(w==2 && h==4){
 378         *(uint16_t*)(p + 0*stride)=
 379         *(uint16_t*)(p + 1*stride)=
 380         *(uint16_t*)(p + 2*stride)=
 381         *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101;
 382     }else if(w==4 && h==1){
 383         *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101;
 384     }else if(w==4 && h==2){
 385         *(uint32_t*)(p + 0*stride)=
 386         *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101;
 387     }else if(w==4 && h==4){
 388         *(uint32_t*)(p + 0*stride)=
 389         *(uint32_t*)(p + 1*stride)=
 390         *(uint32_t*)(p + 2*stride)=
 391         *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101;
 392     }else if(w==8 && h==1){
 393         *(uint32_t*)(p + 0)=
 394         *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101;
 395     }else if(w==8 && h==2){
 396         *(uint32_t*)(p + 0 + 0*stride)=
 397         *(uint32_t*)(p + 4 + 0*stride)=
 398         *(uint32_t*)(p + 0 + 1*stride)=
 399         *(uint32_t*)(p + 4 + 1*stride)=  size==4 ? val : val*0x01010101;
 400     }else if(w==8 && h==4){
 401         *(uint64_t*)(p + 0*stride)=
 402         *(uint64_t*)(p + 1*stride)=
 403         *(uint64_t*)(p + 2*stride)=
 404         *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 405     }else if(w==16 && h==2){
 406         *(uint64_t*)(p + 0+0*stride)=
 407         *(uint64_t*)(p + 8+0*stride)=
 408         *(uint64_t*)(p + 0+1*stride)=
 409         *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 410     }else if(w==16 && h==4){
 411         *(uint64_t*)(p + 0+0*stride)=
 412         *(uint64_t*)(p + 8+0*stride)=
 413         *(uint64_t*)(p + 0+1*stride)=
 414         *(uint64_t*)(p + 8+1*stride)=
 415         *(uint64_t*)(p + 0+2*stride)=
 416         *(uint64_t*)(p + 8+2*stride)=
 417         *(uint64_t*)(p + 0+3*stride)=
 418         *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 419     }else
 420         assert(0);
 421 }
 422
 423 static inline void fill_caches(H264Context *h, int mb_type, int for_deblock){
 424     MpegEncContext * const s = &h->s;
 425     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 426     int topleft_xy, top_xy, topright_xy, left_xy[2];
 427     int topleft_type, top_type, topright_type, left_type[2];
 428     int left_block[8];
 429     int i;
 430
 431     //wow what a mess, why didnt they simplify the interlacing&intra stuff, i cant imagine that these complex rules are worth it
 432
 433     top_xy     = mb_xy  - s->mb_stride;
 434     topleft_xy = top_xy - 1;
 435     topright_xy= top_xy + 1;
 436     left_xy[1] = left_xy[0] = mb_xy-1;
 437     left_block[0]= 0;
 438     left_block[1]= 1;
 439     left_block[2]= 2;
 440     left_block[3]= 3;
 441     left_block[4]= 7;
 442     left_block[5]= 10;
 443     left_block[6]= 8;
 444     left_block[7]= 11;
 445     if(h->mb_aff_frame){
 446         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 447         const int top_pair_xy      = pair_xy     - s->mb_stride;
 448         const int topleft_pair_xy  = top_pair_xy - 1;
 449         const int topright_pair_xy = top_pair_xy + 1;
 450         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 451         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 452         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 453         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 454         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 455         const int bottom = (s->mb_y & 1);
 456         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 457         if (bottom
 458                 ? !curr_mb_frame_flag // bottom macroblock
 459                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 460                 ) {
 461             top_xy -= s->mb_stride;
 462         }
 463         if (bottom
 464                 ? !curr_mb_frame_flag // bottom macroblock
 465                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 466                 ) {
 467             topleft_xy -= s->mb_stride;
 468         }
 469         if (bottom
 470                 ? !curr_mb_frame_flag // bottom macroblock
 471                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 472                 ) {
 473             topright_xy -= s->mb_stride;
 474         }
 475         if (left_mb_frame_flag != curr_mb_frame_flag) {
 476             left_xy[1] = left_xy[0] = pair_xy - 1;
 477             if (curr_mb_frame_flag) {
 478                 if (bottom) {
 479                     left_block[0]= 2;
 480                     left_block[1]= 2;
 481                     left_block[2]= 3;
 482                     left_block[3]= 3;
 483                     left_block[4]= 8;
 484                     left_block[5]= 11;
 485                     left_block[6]= 8;
 486                     left_block[7]= 11;
 487                 } else {
 488                     left_block[0]= 0;
 489                     left_block[1]= 0;
 490                     left_block[2]= 1;
 491                     left_block[3]= 1;
 492                     left_block[4]= 7;
 493                     left_block[5]= 10;
 494                     left_block[6]= 7;
 495                     left_block[7]= 10;
 496                 }
 497             } else {
 498                 left_xy[1] += s->mb_stride;
 499                 //left_block[0]= 0;
 500                 left_block[1]= 2;
 501                 left_block[2]= 0;
 502                 left_block[3]= 2;
 503                 //left_block[4]= 7;
 504                 left_block[5]= 10;
 505                 left_block[6]= 7;
 506                 left_block[7]= 10;
 507             }
 508         }
 509     }
 510
 511     if(for_deblock){
 512         h->top_mb_xy = top_xy;
 513         h->left_mb_xy[0] = left_xy[0];
 514         h->left_mb_xy[1] = left_xy[1];
 515     }
 516     if(for_deblock){
 517         topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
 518         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 519         topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
 520         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 521         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 522     }else{
 523         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 524         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 525         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 526         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 527         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 528     }
 529
 530     if(IS_INTRA(mb_type)){
 531         h->topleft_samples_available=
 532         h->top_samples_available=
 533         h->left_samples_available= 0xFFFF;
 534         h->topright_samples_available= 0xEEEA;
 535
 536         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 537             h->topleft_samples_available= 0xB3FF;
 538             h->top_samples_available= 0x33FF;
 539             h->topright_samples_available= 0x26EA;
 540         }
 541         for(i=0; i<2; i++){
 542             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 543                 h->topleft_samples_available&= 0xDF5F;
 544                 h->left_samples_available&= 0x5F5F;
 545             }
 546         }
 547
 548         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 549             h->topleft_samples_available&= 0x7FFF;
 550
 551         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 552             h->topright_samples_available&= 0xFBFF;
 553
 554         if(IS_INTRA4x4(mb_type)){
 555             if(IS_INTRA4x4(top_type)){
 556                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 557                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 558                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 559                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 560             }else{
 561                 int pred;
 562                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 563                     pred= -1;
 564                 else{
 565                     pred= 2;
 566                 }
 567                 h->intra4x4_pred_mode_cache[4+8*0]=
 568                 h->intra4x4_pred_mode_cache[5+8*0]=
 569                 h->intra4x4_pred_mode_cache[6+8*0]=
 570                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 571             }
 572             for(i=0; i<2; i++){
 573                 if(IS_INTRA4x4(left_type[i])){
 574                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 575                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 576                 }else{
 577                     int pred;
 578                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 579                         pred= -1;
 580                     else{
 581                         pred= 2;
 582                     }
 583                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 584                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 585                 }
 586             }
 587         }
 588     }
 589
 590
 591 /*
 592 0 . T T. T T T T
 593 1 L . .L . . . .
 594 2 L . .L . . . .
 595 3 . T TL . . . .
 596 4 L . .L . . . .
 597 5 L . .. . . . .
 598 */
 599 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 600     if(top_type){
 601         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 602         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 603         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 604         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 605
 606         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 607         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 608
 609         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 610         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 611
 612         h->top_cbp= h->cbp_table[top_xy];
 613     }else{
 614         h->non_zero_count_cache[4+8*0]=
 615         h->non_zero_count_cache[5+8*0]=
 616         h->non_zero_count_cache[6+8*0]=
 617         h->non_zero_count_cache[7+8*0]=
 618
 619         h->non_zero_count_cache[1+8*0]=
 620         h->non_zero_count_cache[2+8*0]=
 621
 622         h->non_zero_count_cache[1+8*3]=
 623         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 624
 625         if(IS_INTRA(mb_type)) h->top_cbp= 0x1C0;
 626         else                  h->top_cbp= 0;
 627     }
 628
 629     for (i=0; i<2; i++) {
 630         if(left_type[i]){
 631             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 632             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 633             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 634             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 635             h->left_cbp= h->cbp_table[left_xy[i]]; //FIXME interlacing
 636         }else{
 637             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 638             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 639             h->non_zero_count_cache[0+8*1 +   8*i]=
 640             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 641
 642             if(IS_INTRA(mb_type)) h->left_cbp= 0x1C0;//FIXME interlacing
 643             else                  h->left_cbp= 0;
 644         }
 645     }
 646
 647 #if 1
 648     //FIXME direct mb can skip much of this
 649     if(IS_INTER(mb_type) || (IS_DIRECT(mb_type) && h->direct_spatial_mv_pred)){
 650         int list;
 651         for(list=0; list<2; list++){
 652             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !for_deblock){
 653                 /*if(!h->mv_cache_clean[list]){
 654                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 655                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 656                     h->mv_cache_clean[list]= 1;
 657                 }*/
 658                 continue;
 659             }
 660             h->mv_cache_clean[list]= 0;
 661
 662             if(IS_INTER(topleft_type)){
 663                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 664                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 665                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 666                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 667             }else{
 668                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 669                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 670             }
 671
 672             if(IS_INTER(top_type)){
 673                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 674                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 675                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 676                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 677                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 678                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 679                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 680                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 681                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 682                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 683             }else{
 684                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 685                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 686                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 687                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 688                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 689             }
 690
 691             if(IS_INTER(topright_type)){
 692                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 693                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 694                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 695                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 696             }else{
 697                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 698                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 699             }
 700
 701             //FIXME unify cleanup or sth
 702             if(IS_INTER(left_type[0])){
 703                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 704                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 705                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 706                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 707                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 708                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 709             }else{
 710                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 711                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 712                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 713                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 714             }
 715
 716             if(IS_INTER(left_type[1])){
 717                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 718                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 719                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 720                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 721                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 722                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 723             }else{
 724                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 725                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 726                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 727                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 728             }
 729
 730             if(for_deblock)
 731                 continue;
 732
 733             h->ref_cache[list][scan8[5 ]+1] =
 734             h->ref_cache[list][scan8[7 ]+1] =
 735             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewher else)
 736             h->ref_cache[list][scan8[4 ]] =
 737             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 738             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 739             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 740             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
 741             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 742             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 743
 744             if( h->pps.cabac ) {
 745                 /* XXX beurk, Load mvd */
 746                 if(IS_INTER(topleft_type)){
 747                     const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 748                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
 749                 }else{
 750                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
 751                 }
 752
 753                 if(IS_INTER(top_type)){
 754                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 755                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 756                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 757                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 758                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 759                 }else{
 760                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 761                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 762                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 763                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 764                 }
 765                 if(IS_INTER(left_type[0])){
 766                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 767                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 768                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 769                 }else{
 770                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 771                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 772                 }
 773                 if(IS_INTER(left_type[1])){
 774                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 775                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 776                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 777                 }else{
 778                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 779                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 780                 }
 781                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 782                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 783                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewher else)
 784                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 785                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 786
 787                 if(h->slice_type == B_TYPE){
 788                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 789
 790                     if(IS_DIRECT(top_type)){
 791                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 792                     }else if(IS_8X8(top_type)){
 793                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 794                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 795                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 796                     }else{
 797                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 798                     }
 799
 800                     //FIXME interlacing
 801                     if(IS_DIRECT(left_type[0])){
 802                         h->direct_cache[scan8[0] - 1 + 0*8]=
 803                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 804                     }else if(IS_8X8(left_type[0])){
 805                         int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
 806                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
 807                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
 808                     }else{
 809                         h->direct_cache[scan8[0] - 1 + 0*8]=
 810                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 811                     }
 812                 }
 813             }
 814         }
 815     }
 816 #endif
 817 }
 818
 819 static inline void write_back_intra_pred_mode(H264Context *h){
 820     MpegEncContext * const s = &h->s;
 821     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 822
 823     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 824     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 825     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 826     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 827     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 828     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 829     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 830 }
 831
 832 /**
 833  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 834  */
 835 static inline int check_intra4x4_pred_mode(H264Context *h){
 836     MpegEncContext * const s = &h->s;
 837     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 838     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 839     int i;
 840
 841     if(!(h->top_samples_available&0x8000)){
 842         for(i=0; i<4; i++){
 843             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 844             if(status<0){
 845                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 846                 return -1;
 847             } else if(status){
 848                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 849             }
 850         }
 851     }
 852
 853     if(!(h->left_samples_available&0x8000)){
 854         for(i=0; i<4; i++){
 855             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 856             if(status<0){
 857                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 858                 return -1;
 859             } else if(status){
 860                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 861             }
 862         }
 863     }
 864
 865     return 0;
 866 } //FIXME cleanup like next
 867
 868 /**
 869  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 870  */
 871 static inline int check_intra_pred_mode(H264Context *h, int mode){
 872     MpegEncContext * const s = &h->s;
 873     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 874     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 875
 876     if(mode < 0 || mode > 6) {
 877         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 878         return -1;
 879     }
 880
 881     if(!(h->top_samples_available&0x8000)){
 882         mode= top[ mode ];
 883         if(mode<0){
 884             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 885             return -1;
 886         }
 887     }
 888
 889     if(!(h->left_samples_available&0x8000)){
 890         mode= left[ mode ];
 891         if(mode<0){
 892             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 893             return -1;
 894         }
 895     }
 896
 897     return mode;
 898 }
 899
 900 /**
 901  * gets the predicted intra4x4 prediction mode.
 902  */
 903 static inline int pred_intra_mode(H264Context *h, int n){
 904     const int index8= scan8[n];
 905     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 906     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 907     const int min= FFMIN(left, top);
 908
 909     tprintf("mode:%d %d min:%d\n", left ,top, min);
 910
 911     if(min<0) return DC_PRED;
 912     else      return min;
 913 }
 914
 915 static inline void write_back_non_zero_count(H264Context *h){
 916     MpegEncContext * const s = &h->s;
 917     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 918
 919     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 920     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 921     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 922     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 923     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 924     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 925     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 926
 927     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 928     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 929     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 930
 931     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 932     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 933     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 934 }
 935
 936 /**
 937  * gets the predicted number of non zero coefficients.
 938  * @param n block index
 939  */
 940 static inline int pred_non_zero_count(H264Context *h, int n){
 941     const int index8= scan8[n];
 942     const int left= h->non_zero_count_cache[index8 - 1];
 943     const int top = h->non_zero_count_cache[index8 - 8];
 944     int i= left + top;
 945
 946     if(i<64) i= (i+1)>>1;
 947
 948     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 949
 950     return i&31;
 951 }
 952
 953 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 954     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 955
 956     if(topright_ref != PART_NOT_AVAILABLE){
 957         *C= h->mv_cache[list][ i - 8 + part_width ];
 958         return topright_ref;
 959     }else{
 960         tprintf("topright MV not available\n");
 961
 962         *C= h->mv_cache[list][ i - 8 - 1 ];
 963         return h->ref_cache[list][ i - 8 - 1 ];
 964     }
 965 }
 966
 967 /**
 968  * gets the predicted MV.
 969  * @param n the block index
 970  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 971  * @param mx the x component of the predicted motion vector
 972  * @param my the y component of the predicted motion vector
 973  */
 974 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 975     const int index8= scan8[n];
 976     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
 977     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
 978     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
 979     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
 980     const int16_t * C;
 981     int diagonal_ref, match_count;
 982
 983     assert(part_width==1 || part_width==2 || part_width==4);
 984
 985 /* mv_cache
 986   B . . A T T T T
 987   U . . L . . , .
 988   U . . L . . . .
 989   U . . L . . , .
 990   . . . L . . . .
 991 */
 992
 993     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
 994     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
 995     tprintf("pred_motion match_count=%d\n", match_count);
 996     if(match_count > 1){ //most common
 997         *mx= mid_pred(A[0], B[0], C[0]);
 998         *my= mid_pred(A[1], B[1], C[1]);
 999     }else if(match_count==1){
1000         if(left_ref==ref){
1001             *mx= A[0];
1002             *my= A[1];
1003         }else if(top_ref==ref){
1004             *mx= B[0];
1005             *my= B[1];
1006         }else{
1007             *mx= C[0];
1008             *my= C[1];
1009         }
1010     }else{
1011         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1012             *mx= A[0];
1013             *my= A[1];
1014         }else{
1015             *mx= mid_pred(A[0], B[0], C[0]);
1016             *my= mid_pred(A[1], B[1], C[1]);
1017         }
1018     }
1019
1020     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1021 }
1022
1023 /**
1024  * gets the directionally predicted 16x8 MV.
1025  * @param n the block index
1026  * @param mx the x component of the predicted motion vector
1027  * @param my the y component of the predicted motion vector
1028  */
1029 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1030     if(n==0){
1031         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1032         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1033
1034         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1035
1036         if(top_ref == ref){
1037             *mx= B[0];
1038             *my= B[1];
1039             return;
1040         }
1041     }else{
1042         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1043         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1044
1045         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1046
1047         if(left_ref == ref){
1048             *mx= A[0];
1049             *my= A[1];
1050             return;
1051         }
1052     }
1053
1054     //RARE
1055     pred_motion(h, n, 4, list, ref, mx, my);
1056 }
1057
1058 /**
1059  * gets the directionally predicted 8x16 MV.
1060  * @param n the block index
1061  * @param mx the x component of the predicted motion vector
1062  * @param my the y component of the predicted motion vector
1063  */
1064 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1065     if(n==0){
1066         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1067         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1068
1069         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1070
1071         if(left_ref == ref){
1072             *mx= A[0];
1073             *my= A[1];
1074             return;
1075         }
1076     }else{
1077         const int16_t * C;
1078         int diagonal_ref;
1079
1080         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1081
1082         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1083
1084         if(diagonal_ref == ref){
1085             *mx= C[0];
1086             *my= C[1];
1087             return;
1088         }
1089     }
1090
1091     //RARE
1092     pred_motion(h, n, 2, list, ref, mx, my);
1093 }
1094
1095 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1096     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1097     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1098
1099     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1100
1101     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1102        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1103        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1104
1105         *mx = *my = 0;
1106         return;
1107     }
1108
1109     pred_motion(h, 0, 4, 0, 0, mx, my);
1110
1111     return;
1112 }
1113
1114 static inline void direct_dist_scale_factor(H264Context * const h){
1115     const int poc = h->s.current_picture_ptr->poc;
1116     const int poc1 = h->ref_list[1][0].poc;
1117     int i;
1118     for(i=0; i<h->ref_count[0]; i++){
1119         int poc0 = h->ref_list[0][i].poc;
1120         int td = clip(poc1 - poc0, -128, 127);
1121         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1122             h->dist_scale_factor[i] = 256;
1123         }else{
1124             int tb = clip(poc - poc0, -128, 127);
1125             int tx = (16384 + (ABS(td) >> 1)) / td;
1126             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1127         }
1128     }
1129 }
1130 static inline void direct_ref_list_init(H264Context * const h){
1131     MpegEncContext * const s = &h->s;
1132     Picture * const ref1 = &h->ref_list[1][0];
1133     Picture * const cur = s->current_picture_ptr;
1134     int list, i, j;
1135     if(cur->pict_type == I_TYPE)
1136         cur->ref_count[0] = 0;
1137     if(cur->pict_type != B_TYPE)
1138         cur->ref_count[1] = 0;
1139     for(list=0; list<2; list++){
1140         cur->ref_count[list] = h->ref_count[list];
1141         for(j=0; j<h->ref_count[list]; j++)
1142             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1143     }
1144     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1145         return;
1146     for(list=0; list<2; list++){
1147         for(i=0; i<ref1->ref_count[list]; i++){
1148             const int poc = ref1->ref_poc[list][i];
1149             h->map_col_to_list0[list][i] = PART_NOT_AVAILABLE;
1150             for(j=0; j<h->ref_count[list]; j++)
1151                 if(h->ref_list[list][j].poc == poc){
1152                     h->map_col_to_list0[list][i] = j;
1153                     break;
1154                 }
1155         }
1156     }
1157 }
1158
/**
 * Fills the mv_cache and ref_cache of the direct predicted parts of the
 * current macroblock, using spatial or temporal direct prediction depending
 * on h->direct_spatial_mv_pred. The colocated data is read from the first
 * list 1 reference (h->ref_list[1][0]) at the position of the current
 * macroblock.
 * @param mb_type the type of the current macroblock, it is replaced by the
 *                type which results from the prediction; if it is not a 8x8
 *                type on entry then MB_TYPE_DIRECT2 is set in the result.
 *                For a 8x8 type only the sub macroblocks whose
 *                h->sub_mb_type[] is direct are filled.
 */
static inline void pred_direct_motion(H264Context * const h, int *mb_type){
    MpegEncContext * const s = &h->s;
    const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
    const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
    const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
    /* type, list 0 MVs and reference indexes of the colocated macroblock */
    const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
    const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
    const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
    const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
    const int is_b8x8 = IS_8X8(*mb_type);
    int sub_mb_type;
    int i8, i4;

    /* select the macroblock and sub macroblock types from the colocated type */
    if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
        /* FIXME save sub mb types from previous frames (or derive from MVs)
         * so we know exactly what block size to use */
        sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
    }else{
        sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
        *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
    }
    if(!is_b8x8)
        *mb_type |= MB_TYPE_DIRECT2;

    tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);

    if(h->direct_spatial_mv_pred){
        int ref[2];
        int mv[2][2];
        int list;

        /* ref = min(neighbors) */
        /* negative (unused / unavailable) neighbours are ignored, the result
         * is -1 if no neighbour has a reference in this list */
        for(list=0; list<2; list++){
            int refa = h->ref_cache[list][scan8[0] - 1];
            int refb = h->ref_cache[list][scan8[0] - 8];
            int refc = h->ref_cache[list][scan8[0] - 8 + 4];
            /* top right replaced by top left; -2 is presumably PART_NOT_AVAILABLE - TODO confirm */
            if(refc == -2)
                refc = h->ref_cache[list][scan8[0] - 8 - 1];
            ref[list] = refa;
            if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
                ref[list] = refb;
            if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
                ref[list] = refc;
            if(ref[list] < 0)
                ref[list] = -1;
        }

        if(ref[0] < 0 && ref[1] < 0){
            /* no usable neighbour in either list: reference 0 with zero MVs in both lists */
            ref[0] = ref[1] = 0;
            mv[0][0] = mv[0][1] =
            mv[1][0] = mv[1][1] = 0;
        }else{
            for(list=0; list<2; list++){
                if(ref[list] >= 0)
                    pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
                else
                    mv[list][0] = mv[list][1] = 0;
            }
        }

        /* drop the list which has no reference from the types */
        if(ref[1] < 0){
            *mb_type &= ~MB_TYPE_P0L1;
            sub_mb_type &= ~MB_TYPE_P0L1;
        }else if(ref[0] < 0){
            *mb_type &= ~MB_TYPE_P0L0;
            sub_mb_type &= ~MB_TYPE_P0L0;
        }

        if(IS_16X16(*mb_type)){
            fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref[0], 1);
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, ref[1], 1);
            /* colocated block uses reference 0 with both MV components within +-1:
             * only the lists with a reference > 0 keep the predicted MV */
            if(!IS_INTRA(mb_type_col) && l1ref0[0] == 0 &&
                ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1){
                if(ref[0] > 0)
                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                else
                    fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
                if(ref[1] > 0)
                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                else
                    fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
            }else{
                fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
            }
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;

                /* in a 8x8 macroblock only the direct sub macroblocks are predicted */
                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;

                fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
                fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref[0], 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, ref[1], 1);

                /* col_zero_flag */
                /* checked per 4x4 block: the MV of a list with reference 0 is
                 * zeroed where the colocated MV is within +-1 */
                if(!IS_INTRA(mb_type_col) && l1ref0[x8 + y8*h->b8_stride] == 0){
                    for(i4=0; i4<4; i4++){
                        const int16_t *mv_col = l1mv0[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                        if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
                            if(ref[0] == 0)
                                *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
                            if(ref[1] == 0)
                                *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
                        }
                    }
                }
            }
        }
    }else{ /* direct temporal mv pred */
        if(IS_16X16(*mb_type)){
            /* list 1 always uses reference 0 */
            fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
            if(IS_INTRA(mb_type_col)){
                /* intra colocated macroblock: reference 0 and zero MVs in both lists */
                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
            }else{
                /* NOTE(review): map_col_to_list0[] entries can be PART_NOT_AVAILABLE
                 * (see direct_ref_list_init()) and l1ref1[0] is not checked for being
                 * negative, so ref0 / the table index may be negative here - confirm
                 * that this cannot happen or add a check */
                const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]]
                                                : h->map_col_to_list0[1][l1ref1[0]];
                const int dist_scale_factor = h->dist_scale_factor[ref0];
                const int16_t *mv_col = l1mv0[0];
                int mv_l0[2];
                /* list 0 MV = scaled colocated MV, list 1 MV = list 0 MV - colocated MV */
                mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
                mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
                fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
                fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
                fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
            }
        }else{
            for(i8=0; i8<4; i8++){
                const int x8 = i8&1;
                const int y8 = i8>>1;
                int ref0, dist_scale_factor;

                /* in a 8x8 macroblock only the direct sub macroblocks are predicted */
                if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
                    continue;
                h->sub_mb_type[i8] = sub_mb_type;
                if(IS_INTRA(mb_type_col)){
                    fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                    fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
                    fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
                    continue;
                }

                /* NOTE(review): same unchecked mapping as in the 16x16 case above */
                ref0 = l1ref0[x8 + y8*h->b8_stride];
                if(ref0 >= 0)
                    ref0 = h->map_col_to_list0[0][ref0];
                else
                    ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
                dist_scale_factor = h->dist_scale_factor[ref0];

                fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
                fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
                /* the MVs are scaled separately for each 4x4 block */
                for(i4=0; i4<4; i4++){
                    const int16_t *mv_col = l1mv0[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
                    int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
                    mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
                    mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
                    *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
                        pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
                }
            }
        }
    }
}
1333
1334 static inline void write_back_motion(H264Context *h, int mb_type){
1335     MpegEncContext * const s = &h->s;
1336     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1337     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1338     int list;
1339
1340     for(list=0; list<2; list++){
1341         int y;
1342         if(!USES_LIST(mb_type, list)){
1343             if(1){ //FIXME skip or never read if mb_type doesnt use it
1344                 for(y=0; y<4; y++){
1345                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
1346                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
1347                 }
1348                 if( h->pps.cabac ) {
1349                     /* FIXME needed ? */
1350                     for(y=0; y<4; y++){
1351                         *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
1352                         *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
1353                     }
1354                 }
1355                 for(y=0; y<2; y++){
1356                     *(uint16_t*)&s->current_picture.ref_index[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101;
1357                 }
1358             }
1359             continue;
1360         }
1361
1362         for(y=0; y<4; y++){
1363             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1364             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1365         }
1366         if( h->pps.cabac ) {
1367             for(y=0; y<4; y++){
1368                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1369                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1370             }
1371         }
1372         for(y=0; y<2; y++){
1373             s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
1374             s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
1375         }
1376     }
1377
1378     if(h->slice_type == B_TYPE && h->pps.cabac){
1379         if(IS_8X8(mb_type)){
1380             h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1381             h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1382             h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1383         }
1384     }
1385 }
1386
1387 /**
1388  * Decodes a network abstraction layer unit.
1389  * @param consumed is the number of bytes used as input
1390  * @param length is the length of the array
1391  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp ttailing?
1392  * @returns decoded bytes, might be src+1 if no escapes
1393  */
1394 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1395     int i, si, di;
1396     uint8_t *dst;
1397
1398 //    src[0]&0x80;              //forbidden bit
1399     h->nal_ref_idc= src[0]>>5;
1400     h->nal_unit_type= src[0]&0x1F;
1401
1402     src++; length--;
1403 #if 0
1404     for(i=0; i<length; i++)
1405         printf("%2X ", src[i]);
1406 #endif
1407     for(i=0; i+1<length; i+=2){
1408         if(src[i]) continue;
1409         if(i>0 && src[i-1]==0) i--;
1410         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1411             if(src[i+2]!=3){
1412                 /* startcode, so we must be past the end */
1413                 length=i;
1414             }
1415             break;
1416         }
1417     }
1418
1419     if(i>=length-1){ //no escaped 0
1420         *dst_length= length;
1421         *consumed= length+1; //+1 for the header
1422         return src;
1423     }
1424
1425     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1426     dst= h->rbsp_buffer;
1427
1428 //printf("deoding esc\n");
1429     si=di=0;
1430     while(si<length){
1431         //remove escapes (very rare 1:2^22)
1432         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1433             if(src[si+2]==3){ //escape
1434                 dst[di++]= 0;
1435                 dst[di++]= 0;
1436                 si+=3;
1437                 continue;
1438             }else //next start code
1439                 break;
1440         }
1441
1442         dst[di++]= src[si++];
1443     }
1444
1445     *dst_length= di;
1446     *consumed= si + 1;//+1 for the header
1447 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1448     return dst;
1449 }
1450
1451 #if 0
1452 /**
1453  * @param src the data which should be escaped
1454  * @param dst the target buffer, dst+1 == src is allowed as a special case
1455  * @param length the length of the src data
1456  * @param dst_length the length of the dst array
1457  * @returns length of escaped data in bytes or -1 if an error occured
1458  */
1459 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1460     int i, escape_count, si, di;
1461     uint8_t *temp;
1462
1463     assert(length>=0);
1464     assert(dst_length>0);
1465
1466     dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1467
1468     if(length==0) return 1;
1469
1470     escape_count= 0;
1471     for(i=0; i<length; i+=2){
1472         if(src[i]) continue;
1473         if(i>0 && src[i-1]==0)
1474             i--;
1475         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1476             escape_count++;
1477             i+=2;
1478         }
1479     }
1480
1481     if(escape_count==0){
1482         if(dst+1 != src)
1483             memcpy(dst+1, src, length);
1484         return length + 1;
1485     }
1486
1487     if(length + escape_count + 1> dst_length)
1488         return -1;
1489
1490     //this should be damn rare (hopefully)
1491
1492     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1493     temp= h->rbsp_buffer;
1494 //printf("encoding esc\n");
1495
1496     si= 0;
1497     di= 0;
1498     while(si < length){
1499         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1500             temp[di++]= 0; si++;
1501             temp[di++]= 0; si++;
1502             temp[di++]= 3;
1503             temp[di++]= src[si++];
1504         }
1505         else
1506             temp[di++]= src[si++];
1507     }
1508     memcpy(dst+1, temp, length+escape_count);
1509
1510     assert(di == length+escape_count);
1511
1512     return di + 1;
1513 }
1514
1515 /**
1516  * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1517  */
1518 static void encode_rbsp_trailing(PutBitContext *pb){
1519     int length;
1520     put_bits(pb, 1, 1);
1521     length= (-put_bits_count(pb))&7;
1522     if(length) put_bits(pb, length, 0);
1523 }
1524 #endif
1525
/**
 * Identifies the exact end of the bitstream by locating the lowest set bit
 * of the byte pointed to by src.
 * @return the length of the trailing (1 if bit 0 is set ... 8 if only bit 7
 *         is set), or 0 if damaged (the byte is zero)
 */
static int decode_rbsp_trailing(uint8_t *src){
    int bits= *src;
    int pos= 1;

    tprintf("rbsp trailing %X\n", bits);

    while(pos < 9){
        if(bits&1)
            return pos;
        bits>>=1;
        pos++;
    }
    return 0;
}
1542
1543 /**
1544  * idct tranforms the 16 dc values and dequantize them.
1545  * @param qp quantization parameter
1546  */
1547 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
1548     const int qmul= dequant_coeff[qp][0];
1549 #define stride 16
1550     int i;
1551     int temp[16]; //FIXME check if this is a good idea
1552     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1553     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1554
1555 //memset(block, 64, 2*256);
1556 //return;
1557     for(i=0; i<4; i++){
1558         const int offset= y_offset[i];
1559         const int z0= block[offset+stride*0] + block[offset+stride*4];
1560         const int z1= block[offset+stride*0] - block[offset+stride*4];
1561         const int z2= block[offset+stride*1] - block[offset+stride*5];
1562         const int z3= block[offset+stride*1] + block[offset+stride*5];
1563
1564         temp[4*i+0]= z0+z3;
1565         temp[4*i+1]= z1+z2;
1566         temp[4*i+2]= z1-z2;
1567         temp[4*i+3]= z0-z3;
1568     }
1569
1570     for(i=0; i<4; i++){
1571         const int offset= x_offset[i];
1572         const int z0= temp[4*0+i] + temp[4*2+i];
1573         const int z1= temp[4*0+i] - temp[4*2+i];
1574         const int z2= temp[4*1+i] - temp[4*3+i];
1575         const int z3= temp[4*1+i] + temp[4*3+i];
1576
1577         block[stride*0 +offset]= ((z0 + z3)*qmul + 2)>>2; //FIXME think about merging this into decode_resdual
1578         block[stride*2 +offset]= ((z1 + z2)*qmul + 2)>>2;
1579         block[stride*8 +offset]= ((z1 - z2)*qmul + 2)>>2;
1580         block[stride*10+offset]= ((z0 - z3)*qmul + 2)>>2;
1581     }
1582 }
1583
1584 #if 0
1585 /**
1586  * dct tranforms the 16 dc values.
1587  * @param qp quantization parameter ??? FIXME
1588  */
1589 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1590 //    const int qmul= dequant_coeff[qp][0];
1591     int i;
1592     int temp[16]; //FIXME check if this is a good idea
1593     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1594     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1595
1596     for(i=0; i<4; i++){
1597         const int offset= y_offset[i];
1598         const int z0= block[offset+stride*0] + block[offset+stride*4];
1599         const int z1= block[offset+stride*0] - block[offset+stride*4];
1600         const int z2= block[offset+stride*1] - block[offset+stride*5];
1601         const int z3= block[offset+stride*1] + block[offset+stride*5];
1602
1603         temp[4*i+0]= z0+z3;
1604         temp[4*i+1]= z1+z2;
1605         temp[4*i+2]= z1-z2;
1606         temp[4*i+3]= z0-z3;
1607     }
1608
1609     for(i=0; i<4; i++){
1610         const int offset= x_offset[i];
1611         const int z0= temp[4*0+i] + temp[4*2+i];
1612         const int z1= temp[4*0+i] - temp[4*2+i];
1613         const int z2= temp[4*1+i] - temp[4*3+i];
1614         const int z3= temp[4*1+i] + temp[4*3+i];
1615
1616         block[stride*0 +offset]= (z0 + z3)>>1;
1617         block[stride*2 +offset]= (z1 + z2)>>1;
1618         block[stride*8 +offset]= (z1 - z2)>>1;
1619         block[stride*10+offset]= (z0 - z3)>>1;
1620     }
1621 }
1622 #endif
1623
1624 #undef xStride
1625 #undef stride
1626
1627 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){
1628     const int qmul= dequant_coeff[qp][0];
1629     const int stride= 16*2;
1630     const int xStride= 16;
1631     int a,b,c,d,e;
1632
1633     a= block[stride*0 + xStride*0];
1634     b= block[stride*0 + xStride*1];
1635     c= block[stride*1 + xStride*0];
1636     d= block[stride*1 + xStride*1];
1637
1638     e= a-b;
1639     a= a+b;
1640     b= c-d;
1641     c= c+d;
1642
1643     block[stride*0 + xStride*0]= ((a+c)*qmul + 0)>>1;
1644     block[stride*0 + xStride*1]= ((e+b)*qmul + 0)>>1;
1645     block[stride*1 + xStride*0]= ((a-c)*qmul + 0)>>1;
1646     block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1;
1647 }
1648
1649 #if 0
1650 static void chroma_dc_dct_c(DCTELEM *block){
1651     const int stride= 16*2;
1652     const int xStride= 16;
1653     int a,b,c,d,e;
1654
1655     a= block[stride*0 + xStride*0];
1656     b= block[stride*0 + xStride*1];
1657     c= block[stride*1 + xStride*0];
1658     d= block[stride*1 + xStride*1];
1659
1660     e= a-b;
1661     a= a+b;
1662     b= c-d;
1663     c= c+d;
1664
1665     block[stride*0 + xStride*0]= (a+c);
1666     block[stride*0 + xStride*1]= (e+b);
1667     block[stride*1 + xStride*0]= (a-c);
1668     block[stride*1 + xStride*1]= (e-b);
1669 }
1670 #endif
1671
1672 /**
1673  * gets the chroma qp.
1674  */
1675 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1676
1677     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1678 }
1679
1680
1681 #if 0
1682 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
1683     int i;
1684     //FIXME try int temp instead of block
1685
1686     for(i=0; i<4; i++){
1687         const int d0= src1[0 + i*stride] - src2[0 + i*stride];
1688         const int d1= src1[1 + i*stride] - src2[1 + i*stride];
1689         const int d2= src1[2 + i*stride] - src2[2 + i*stride];
1690         const int d3= src1[3 + i*stride] - src2[3 + i*stride];
1691         const int z0= d0 + d3;
1692         const int z3= d0 - d3;
1693         const int z1= d1 + d2;
1694         const int z2= d1 - d2;
1695
1696         block[0 + 4*i]=   z0 +   z1;
1697         block[1 + 4*i]= 2*z3 +   z2;
1698         block[2 + 4*i]=   z0 -   z1;
1699         block[3 + 4*i]=   z3 - 2*z2;
1700     }
1701
1702     for(i=0; i<4; i++){
1703         const int z0= block[0*4 + i] + block[3*4 + i];
1704         const int z3= block[0*4 + i] - block[3*4 + i];
1705         const int z1= block[1*4 + i] + block[2*4 + i];
1706         const int z2= block[1*4 + i] - block[2*4 + i];
1707
1708         block[0*4 + i]=   z0 +   z1;
1709         block[1*4 + i]= 2*z3 +   z2;
1710         block[2*4 + i]=   z0 -   z1;
1711         block[3*4 + i]=   z3 - 2*z2;
1712     }
1713 }
1714 #endif
1715
1716 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, iam not sure, its very close
1717 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1718 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1719     int i;
1720     const int * const quant_table= quant_coeff[qscale];
1721     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1722     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1723     const unsigned int threshold2= (threshold1<<1);
1724     int last_non_zero;
1725
1726     if(seperate_dc){
1727         if(qscale<=18){
1728             //avoid overflows
1729             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1730             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1731             const unsigned int dc_threshold2= (dc_threshold1<<1);
1732
1733             int level= block[0]*quant_coeff[qscale+18][0];
1734             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1735                 if(level>0){
1736                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1737                     block[0]= level;
1738                 }else{
1739                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1740                     block[0]= -level;
1741                 }
1742 //                last_non_zero = i;
1743             }else{
1744                 block[0]=0;
1745             }
1746         }else{
1747             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1748             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1749             const unsigned int dc_threshold2= (dc_threshold1<<1);
1750
1751             int level= block[0]*quant_table[0];
1752             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1753                 if(level>0){
1754                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1755                     block[0]= level;
1756                 }else{
1757                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1758                     block[0]= -level;
1759                 }
1760 //                last_non_zero = i;
1761             }else{
1762                 block[0]=0;
1763             }
1764         }
1765         last_non_zero= 0;
1766         i=1;
1767     }else{
1768         last_non_zero= -1;
1769         i=0;
1770     }
1771
1772     for(; i<16; i++){
1773         const int j= scantable[i];
1774         int level= block[j]*quant_table[j];
1775
1776 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1777 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1778         if(((unsigned)(level+threshold1))>threshold2){
1779             if(level>0){
1780                 level= (bias + level)>>QUANT_SHIFT;
1781                 block[j]= level;
1782             }else{
1783                 level= (bias - level)>>QUANT_SHIFT;
1784                 block[j]= -level;
1785             }
1786             last_non_zero = i;
1787         }else{
1788             block[j]=0;
1789         }
1790     }
1791
1792     return last_non_zero;
1793 }
1794
/**
 * 4x4 vertical prediction: copies the 4 samples above the block into each
 * of its 4 rows. topright is unused.
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t top_row= *(uint32_t*)(src-stride);
    int row;

    for(row=0; row<4; row++)
        *(uint32_t*)(src + row*stride)= top_row;
}
1802
/**
 * 4x4 horizontal prediction: fills each row with the sample left of it.
 * topright is unused.
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int row;

    for(row=0; row<4; row++){
        //unsigned multiply, a signed one overflows for samples >= 128
        ((uint32_t*)(src+row*stride))[0]= src[-1+row*stride]*0x01010101U;
    }
}
1809
/**
 * 4x4 dc prediction: fills the block with the rounded mean of the 4 samples
 * above and the 4 samples left of it. topright is unused.
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    //unsigned multiply, a signed one overflows for dc >= 128
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
1819
/**
 * 4x4 dc prediction from the left edge only: fills the block with the
 * rounded mean of the 4 samples left of it. topright is unused.
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    //unsigned multiply, a signed one overflows for dc >= 128
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
1828
/**
 * 4x4 dc prediction from the top edge only: fills the block with the
 * rounded mean of the 4 samples above it. topright is unused.
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    //unsigned multiply, a signed one overflows for dc >= 128
    const uint32_t fill= dc*0x01010101U;

    ((uint32_t*)(src+0*stride))[0]=
    ((uint32_t*)(src+1*stride))[0]=
    ((uint32_t*)(src+2*stride))[0]=
    ((uint32_t*)(src+3*stride))[0]= fill;
}
1837
/**
 * 4x4 dc prediction without any neighbours: fills the block with 128.
 * topright is unused.
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t fill= 128U*0x01010101U;
    int row;

    for(row=0; row<4; row++)
        *(uint32_t*)(src + row*stride)= fill;
}
1844
1845
/* Declares t4..t7 holding the 4 samples topright points to.
 * Expects a pointer named topright in scope. */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];\

/* Declares l0..l3 holding the samples left of rows 0..3 of the block.
 * Expects src and stride in scope. */
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];\

/* Declares t0..t3 holding the samples above columns 0..3 of the block.
 * Expects src and stride in scope. */
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];\
1863
/**
 * 4x4 diagonal down right prediction.
 * All samples with the same x-y share one value, a (1,2,1)/4 filter over
 * the edge running from the bottom of the left column over the top left
 * corner to the end of the top row. topright is unused.
 */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    int edge[9];  //l3 l2 l1 l0 lt t0 t1 t2 t3
    int diag[7];  //value for x-y = -3 .. 3
    int x, y, k;

    for(k=0; k<4; k++){
        edge[3-k]= src[-1+k*stride];
        edge[5+k]= src[ k-1*stride];
    }
    edge[4]= src[-1-1*stride];

    for(k=0; k<7; k++)
        diag[k]= (edge[k] + 2*edge[k+1] + edge[k+2] + 2)>>2;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x+y*stride]= diag[3 + x - y];
}
1886
/**
 * 4x4 diagonal down left prediction.
 * All samples with the same x+y share one value, a (1,2,1)/4 filter over
 * the 4 samples above the block followed by the 4 samples at topright;
 * the last diagonal uses the final sample twice.
 */
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    int t[8];     //t0..t3 from above, t4..t7 from topright
    int diag[7];  //value for x+y = 0 .. 6
    int x, y, k;

    for(k=0; k<4; k++){
        t[k  ]= src[k-1*stride];
        t[k+4]= topright[k];
    }

    for(k=0; k<6; k++)
        diag[k]= (t[k] + t[k+2] + 2*t[k+1] + 2)>>2;
    diag[6]= (t[6] + 3*t[7] + 2)>>2;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x+y*stride]= diag[x + y];
}
1909
/**
 * 4x4 vertical right prediction.
 * Rows 0 and 2 use 2 tap averages, rows 1 and 3 use (1,2,1)/4 filtered
 * values of the top edge (including the top left corner); rows 2 and 3 are
 * rows 0 and 1 shifted right by one with a value from the left edge shifted
 * in. topright is unused.
 */
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    int top[5];    //lt t0 t1 t2 t3
    int avg[4];    //row 0
    int filt[4];   //row 1
    int x;

    top[0]= lt;
    for(x=0; x<4; x++)
        top[x+1]= src[x-1*stride];

    avg [0]= (top[0] + top[1] + 1)>>1;
    filt[0]= (l0 + 2*lt + top[1] + 2)>>2;
    for(x=1; x<4; x++){
        avg [x]= (top[x] + top[x+1] + 1)>>1;
        filt[x]= (top[x-1] + 2*top[x] + top[x+1] + 2)>>2;
    }

    for(x=0; x<4; x++){
        src[x+0*stride]= avg [x];
        src[x+1*stride]= filt[x];
    }
    src[0+2*stride]=(lt + 2*l0 + l1 + 2)>>2;
    src[0+3*stride]=(l0 + 2*l1 + l2 + 2)>>2;
    for(x=1; x<4; x++){
        src[x+2*stride]= avg [x-1];
        src[x+3*stride]= filt[x-1];
    }
}
1933
/**
 * 4x4 vertical left prediction.
 * Rows 0 and 2 use 2 tap averages, rows 1 and 3 use (1,2,1)/4 filtered
 * values of the samples above the block followed by those at topright;
 * rows 2 and 3 start one sample further right than rows 0 and 1.
 */
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    int t[7];      //t0..t3 from above, t4..t6 from topright
    int avg[5];
    int filt[5];
    int x, y, k;

    for(k=0; k<4; k++)
        t[k]= src[k-1*stride];
    for(k=0; k<3; k++)
        t[k+4]= topright[k];

    for(k=0; k<5; k++){
        avg [k]= (t[k] + t[k+1] + 1)>>1;
        filt[k]= (t[k] + 2*t[k+1] + t[k+2] + 2)>>2;
    }

    for(y=0; y<4; y++){
        const int *line= (y&1) ? filt : avg;
        const int shift= y>>1;
        for(x=0; x<4; x++)
            src[x+y*stride]= line[x + shift];
    }
}
1956
/**
 * 4x4 horizontal up prediction.
 * The sample at (x,y) takes entry x+2*y of a sequence that alternates
 * 2 tap averages and (1,2,1)/4 filtered values of the left edge and ends
 * in copies of the lowest left sample. topright is unused.
 */
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int seq[10];
    int x, y;

    seq[0]= (l0 + l1 + 1)>>1;
    seq[1]= (l0 + 2*l1 + l2 + 2)>>2;
    seq[2]= (l1 + l2 + 1)>>1;
    seq[3]= (l1 + 2*l2 + l3 + 2)>>2;
    seq[4]= (l2 + l3 + 1)>>1;
    seq[5]= (l2 + 2*l3 + l3 + 2)>>2;
    seq[6]= seq[7]= seq[8]= seq[9]= l3;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x+y*stride]= seq[x + 2*y];
}
1977
/**
 * 4x4 horizontal down prediction.
 * The sample at (x,y) takes entry 2*y-x+3 of a sequence built from 2 tap
 * averages and (1,2,1)/4 filtered values running from the top edge over
 * the top left corner down the left edge. topright is unused.
 */
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
    const int lt= src[-1-1*stride];
    const int t0= src[ 0-1*stride];
    const int t1= src[ 1-1*stride];
    const int t2= src[ 2-1*stride];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int seq[10];
    int x, y;

    seq[0]= (t0 + 2*t1 + t2 + 2)>>2;
    seq[1]= (lt + 2*t0 + t1 + 2)>>2;
    seq[2]= (l0 + 2*lt + t0 + 2)>>2;
    seq[3]= (lt + l0 + 1)>>1;
    seq[4]= (lt + 2*l0 + l1 + 2)>>2;
    seq[5]= (l0 + l1 + 1)>>1;
    seq[6]= (l0 + 2*l1 + l2 + 2)>>2;
    seq[7]= (l1 + l2+ 1)>>1;
    seq[8]= (l1 + 2*l2 + l3 + 2)>>2;
    seq[9]= (l2 + l3 + 1)>>1;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x+y*stride]= seq[2*y - x + 3];
}
2001
/**
 * 16x16 vertical prediction: copies the 16 samples above the block into
 * each of its 16 rows.
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t *top= (const uint32_t*)(src-stride);
    const uint32_t w0= top[0];
    const uint32_t w1= top[1];
    const uint32_t w2= top[2];
    const uint32_t w3= top[3];
    int row;

    for(row=0; row<16; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        line[0]= w0;
        line[1]= w1;
        line[2]= w2;
        line[3]= w3;
    }
}
2016
/**
 * 16x16 horizontal prediction: fills each row with the sample left of it.
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        //unsigned multiply, a signed one overflows for samples >= 128
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= src[-1+i*stride]*0x01010101U;
    }
}
2027
/**
 * 16x16 dc prediction: fills the block with the rounded mean of the 16
 * samples left of and the 16 samples above it.
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    //unsigned multiply, a signed one overflows for a mean >= 128
    dc= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2048
/**
 * 16x16 dc prediction from the left edge only: fills the block with the
 * rounded mean of the 16 samples left of it.
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    //unsigned multiply, a signed one overflows for a mean >= 128
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2065
/**
 * 16x16 dc prediction from the top edge only: fills the block with the
 * rounded mean of the 16 samples above it.
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }
    //unsigned multiply, a signed one overflows for a mean >= 128
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2081
/**
 * 16x16 dc prediction without any neighbours: fills the block with 128.
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t fill= 0x01010101U*128U;
    int row, word;

    for(row=0; row<16; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        for(word=0; word<4; word++)
            line[word]= fill;
    }
}
2092
2093 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2094   int i, j, k;
2095   int a;
2096   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2097   const uint8_t * const src0 = src+7-stride;
2098   const uint8_t *src1 = src+8*stride-1;
2099   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2100   int H = src0[1] - src0[-1];
2101   int V = src1[0] - src2[ 0];
2102   for(k=2; k<=8; ++k) {
2103     src1 += stride; src2 -= stride;
2104     H += k*(src0[k] - src0[-k]);
2105     V += k*(src1[0] - src2[ 0]);
2106   }
2107   if(svq3){
2108     H = ( 5*(H/4) ) / 16;
2109     V = ( 5*(V/4) ) / 16;
2110
2111     /* required for 100% accuracy */
2112     i = H; H = V; V = i;
2113   }else{
2114     H = ( 5*H+32 ) >> 6;
2115     V = ( 5*V+32 ) >> 6;
2116   }
2117
2118   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2119   for(j=16; j>0; --j) {
2120     int b = a;
2121     a += V;
2122     for(i=-16; i<0; i+=4) {
2123       src[16+i] = cm[ (b    ) >> 5 ];
2124       src[17+i] = cm[ (b+  H) >> 5 ];
2125       src[18+i] = cm[ (b+2*H) >> 5 ];
2126       src[19+i] = cm[ (b+3*H) >> 5 ];
2127       b += 4*H;
2128     }
2129     src += stride;
2130   }
2131 }
2132
/**
 * 16x16 plane prediction, the non svq3 variant of pred16x16_plane_compat_c.
 */
static void pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
2136
/**
 * 8x8 vertical prediction: copies the 8 samples above the block into each
 * of its 8 rows.
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t *top= (const uint32_t*)(src-stride);
    const uint32_t w0= top[0];
    const uint32_t w1= top[1];
    int row;

    for(row=0; row<8; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        line[0]= w0;
        line[1]= w1;
    }
}
2147
/**
 * 8x8 horizontal prediction: fills each row with the sample left of it.
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        //unsigned multiply, a signed one overflows for samples >= 128
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= src[-1+i*stride]*0x01010101U;
    }
}
2156
/**
 * 8x8 dc prediction without any neighbours: fills the block with 128.
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t fill= 0x01010101U*128U;
    int row;

    for(row=0; row<8; row++){
        uint32_t *line= (uint32_t*)(src+row*stride);
        line[0]= fill;
        line[1]= fill;
    }
}
2169
/**
 * 8x8 dc prediction from the left edge only.
 * Rows 0..3 are filled with the rounded mean of the 4 samples left of them,
 * rows 4..7 with the rounded mean of the 4 samples left of those rows.
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum2;
    uint32_t dc0, dc2;

    sum0=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride];
        sum2+= src[-1+(i+4)*stride];
    }
    //unsigned multiply, a signed one overflows for a mean >= 128
    dc0= 0x01010101U*((sum0 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc0;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= dc2;
    }
}
2191
/**
 * 8x8 dc prediction from the top edge only.
 * Columns 0..3 are filled with the rounded mean of the 4 samples above
 * them, columns 4..7 with the rounded mean of the 4 samples above those.
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1;
    uint32_t dc0, dc1;

    sum0=sum1=0;
    for(i=0;i<4; i++){
        sum0+= src[i-stride];
        sum1+= src[4+i-stride];
    }
    //unsigned multiply, a signed one overflows for a mean >= 128
    dc0= 0x01010101U*((sum0 + 2)>>2);
    dc1= 0x01010101U*((sum1 + 2)>>2);

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2213
2214
/**
 * 8x8 dc prediction, done separately for each 4x4 quadrant.
 * Top left uses the 4 left and 4 top samples next to it, top right the 4
 * samples above it, bottom left the 4 samples left of it and bottom right
 * the samples used by the top right and bottom left quadrants together.
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0, sum1, sum2;
    uint32_t dc0, dc1, dc2, dc3;

    sum0=sum1=sum2=0;
    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride] + src[i-stride];
        sum1+= src[4+i-stride];
        sum2+= src[-1+(i+4)*stride];
    }
    //unsigned multiplies, signed ones overflow for a mean >= 128
    dc3= 0x01010101U*((sum1 + sum2 + 4)>>3);
    dc0= 0x01010101U*((sum0 + 4)>>3);
    dc1= 0x01010101U*((sum1 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2239
2240 static void pred8x8_plane_c(uint8_t *src, int stride){
2241   int j, k;
2242   int a;
2243   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2244   const uint8_t * const src0 = src+3-stride;
2245   const uint8_t *src1 = src+4*stride-1;
2246   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2247   int H = src0[1] - src0[-1];
2248   int V = src1[0] - src2[ 0];
2249   for(k=2; k<=4; ++k) {
2250     src1 += stride; src2 -= stride;
2251     H += k*(src0[k] - src0[-k]);
2252     V += k*(src1[0] - src2[ 0]);
2253   }
2254   H = ( 17*H+16 ) >> 5;
2255   V = ( 17*V+16 ) >> 5;
2256
2257   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2258   for(j=8; j>0; --j) {
2259     int b = a;
2260     a += V;
2261     src[0] = cm[ (b    ) >> 5 ];
2262     src[1] = cm[ (b+  H) >> 5 ];
2263     src[2] = cm[ (b+2*H) >> 5 ];
2264     src[3] = cm[ (b+3*H) >> 5 ];
2265     src[4] = cm[ (b+4*H) >> 5 ];
2266     src[5] = cm[ (b+5*H) >> 5 ];
2267     src[6] = cm[ (b+6*H) >> 5 ];
2268     src[7] = cm[ (b+7*H) >> 5 ];
2269     src += stride;
2270   }
2271 }
2272
2273 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2274                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2275                            int src_x_offset, int src_y_offset,
2276                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2277     MpegEncContext * const s = &h->s;
2278     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2279     const int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2280     const int luma_xy= (mx&3) + ((my&3)<<2);
2281     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*s->linesize;
2282     uint8_t * src_cb= pic->data[1] + (mx>>3) + (my>>3)*s->uvlinesize;
2283     uint8_t * src_cr= pic->data[2] + (mx>>3) + (my>>3)*s->uvlinesize;
2284     int extra_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; //FIXME increase edge?, IMHO not worth it
2285     int extra_height= extra_width;
2286     int emu=0;
2287     const int full_mx= mx>>2;
2288     const int full_my= my>>2;
2289
2290     assert(pic->data[0]);
2291
2292     if(mx&7) extra_width -= 3;
2293     if(my&7) extra_height -= 3;
2294
2295     if(   full_mx < 0-extra_width
2296        || full_my < 0-extra_height
2297        || full_mx + 16/*FIXME*/ > s->width + extra_width
2298        || full_my + 16/*FIXME*/ > s->height + extra_height){
2299         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*s->linesize, s->linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, s->width, s->height);
2300             src_y= s->edge_emu_buffer + 2 + 2*s->linesize;
2301         emu=1;
2302     }
2303
2304     qpix_op[luma_xy](dest_y, src_y, s->linesize); //FIXME try variable height perhaps?
2305     if(!square){
2306         qpix_op[luma_xy](dest_y + delta, src_y + delta, s->linesize);
2307     }
2308
2309     if(s->flags&CODEC_FLAG_GRAY) return;
2310
2311     if(emu){
2312         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
2313             src_cb= s->edge_emu_buffer;
2314     }
2315     chroma_op(dest_cb, src_cb, s->uvlinesize, chroma_height, mx&7, my&7);
2316
2317     if(emu){
2318         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
2319             src_cr= s->edge_emu_buffer;
2320     }
2321     chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
2322 }
2323
2324 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2325                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2326                            int x_offset, int y_offset,
2327                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2328                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2329                            int list0, int list1){
2330     MpegEncContext * const s = &h->s;
2331     qpel_mc_func *qpix_op=  qpix_put;
2332     h264_chroma_mc_func chroma_op= chroma_put;
2333
2334     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2335     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2336     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2337     x_offset += 8*s->mb_x;
2338     y_offset += 8*s->mb_y;
2339
2340     if(list0){
2341         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2342         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2343                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2344                            qpix_op, chroma_op);
2345
2346         qpix_op=  qpix_avg;
2347         chroma_op= chroma_avg;
2348     }
2349
2350     if(list1){
2351         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2352         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2353                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2354                            qpix_op, chroma_op);
2355     }
2356 }
2357
2358 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2359                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2360                            int x_offset, int y_offset,
2361                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2362                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2363                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2364                            int list0, int list1){
2365     MpegEncContext * const s = &h->s;
2366
2367     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2368     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2369     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2370     x_offset += 8*s->mb_x;
2371     y_offset += 8*s->mb_y;
2372
2373     if(list0 && list1){
2374         /* don't optimize for luma-only case, since B-frames usually
2375          * use implicit weights => chroma too. */
2376         uint8_t *tmp_cb = s->obmc_scratchpad;
2377         uint8_t *tmp_cr = tmp_cb + 8*s->uvlinesize;
2378         uint8_t *tmp_y  = tmp_cr + 8*s->uvlinesize;
2379         int refn0 = h->ref_cache[0][ scan8[n] ];
2380         int refn1 = h->ref_cache[1][ scan8[n] ];
2381
2382         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2383                     dest_y, dest_cb, dest_cr,
2384                     x_offset, y_offset, qpix_put, chroma_put);
2385         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2386                     tmp_y, tmp_cb, tmp_cr,
2387                     x_offset, y_offset, qpix_put, chroma_put);
2388
2389         if(h->use_weight == 2){
2390             int weight0 = h->implicit_weight[refn0][refn1];
2391             int weight1 = 64 - weight0;
2392             luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0, 0);
2393             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0, 0);
2394             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0, 0);
2395         }else{
2396             luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
2397                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2398                             h->luma_offset[0][refn0], h->luma_offset[1][refn1]);
2399             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2400                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2401                             h->chroma_offset[0][refn0][0], h->chroma_offset[1][refn1][0]);
2402             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2403                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2404                             h->chroma_offset[0][refn0][1], h->chroma_offset[1][refn1][1]);
2405         }
2406     }else{
2407         int list = list1 ? 1 : 0;
2408         int refn = h->ref_cache[list][ scan8[n] ];
2409         Picture *ref= &h->ref_list[list][refn];
2410         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2411                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2412                     qpix_put, chroma_put);
2413
2414         luma_weight_op(dest_y, s->linesize, h->luma_log2_weight_denom,
2415                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2416         if(h->use_weight_chroma){
2417             chroma_weight_op(dest_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2418                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2419             chroma_weight_op(dest_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2420                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2421         }
2422     }
2423 }
2424
2425 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2426                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2427                            int x_offset, int y_offset,
2428                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2429                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2430                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2431                            int list0, int list1){
2432     if((h->use_weight==2 && list0 && list1
2433         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2434        || h->use_weight==1)
2435         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2436                          x_offset, y_offset, qpix_put, chroma_put,
2437                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2438     else
2439         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2440                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2441 }
2442
2443 static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2444                       qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
2445                       qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
2446                       h264_weight_func *weight_op, h264_biweight_func *weight_avg){
2447     MpegEncContext * const s = &h->s;
2448     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
2449     const int mb_type= s->current_picture.mb_type[mb_xy];
2450
2451     assert(IS_INTER(mb_type));
2452
2453     if(IS_16X16(mb_type)){
2454         mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
2455                 qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
2456                 &weight_op[0], &weight_avg[0],
2457                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2458     }else if(IS_16X8(mb_type)){
2459         mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
2460                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2461                 &weight_op[1], &weight_avg[1],
2462                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2463         mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
2464                 qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
2465                 &weight_op[1], &weight_avg[1],
2466                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2467     }else if(IS_8X16(mb_type)){
2468         mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
2469                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2470                 &weight_op[2], &weight_avg[2],
2471                 IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
2472         mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
2473                 qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2474                 &weight_op[2], &weight_avg[2],
2475                 IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
2476     }else{
2477         int i;
2478
2479         assert(IS_8X8(mb_type));
2480
2481         for(i=0; i<4; i++){
2482             const int sub_mb_type= h->sub_mb_type[i];
2483             const int n= 4*i;
2484             int x_offset= (i&1)<<2;
2485             int y_offset= (i&2)<<1;
2486
2487             if(IS_SUB_8X8(sub_mb_type)){
2488                 mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2489                     qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
2490                     &weight_op[3], &weight_avg[3],
2491                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2492             }else if(IS_SUB_8X4(sub_mb_type)){
2493                 mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2494                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2495                     &weight_op[4], &weight_avg[4],
2496                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2497                 mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
2498                     qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
2499                     &weight_op[4], &weight_avg[4],
2500                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2501             }else if(IS_SUB_4X8(sub_mb_type)){
2502                 mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
2503                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2504                     &weight_op[5], &weight_avg[5],
2505                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2506                 mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
2507                     qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2508                     &weight_op[5], &weight_avg[5],
2509                     IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2510             }else{
2511                 int j;
2512                 assert(IS_SUB_4X4(sub_mb_type));
2513                 for(j=0; j<4; j++){
2514                     int sub_x_offset= x_offset + 2*(j&1);
2515                     int sub_y_offset= y_offset +   (j&2);
2516                     mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
2517                         qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
2518                         &weight_op[6], &weight_avg[6],
2519                         IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
2520                 }
2521             }
2522         }
2523     }
2524 }
2525
2526 static void decode_init_vlc(H264Context *h){
2527     static int done = 0;
2528
2529     if (!done) {
2530         int i;
2531         done = 1;
2532
2533         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2534                  &chroma_dc_coeff_token_len [0], 1, 1,
2535                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2536
2537         for(i=0; i<4; i++){
2538             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2539                      &coeff_token_len [i][0], 1, 1,
2540                      &coeff_token_bits[i][0], 1, 1, 1);
2541         }
2542
2543         for(i=0; i<3; i++){
2544             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2545                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2546                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2547         }
2548         for(i=0; i<15; i++){
2549             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2550                      &total_zeros_len [i][0], 1, 1,
2551                      &total_zeros_bits[i][0], 1, 1, 1);
2552         }
2553
2554         for(i=0; i<6; i++){
2555             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2556                      &run_len [i][0], 1, 1,
2557                      &run_bits[i][0], 1, 1, 1);
2558         }
2559         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2560                  &run_len [6][0], 1, 1,
2561                  &run_bits[6][0], 1, 1, 1);
2562     }
2563 }
2564
2565 /**
2566  * Sets the intra prediction function pointers.
2567  */
2568 static void init_pred_ptrs(H264Context *h){
2569 //    MpegEncContext * const s = &h->s;
2570
2571     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2572     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2573     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2574     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2575     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2576     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2577     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2578     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2579     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2580     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2581     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2582     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2583
2584     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
2585     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
2586     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
2587     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
2588     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
2589     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
2590     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
2591
2592     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
2593     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
2594     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
2595     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
2596     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
2597     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
2598     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
2599 }
2600
2601 static void free_tables(H264Context *h){
2602     av_freep(&h->intra4x4_pred_mode);
2603     av_freep(&h->chroma_pred_mode_table);
2604     av_freep(&h->cbp_table);
2605     av_freep(&h->mvd_table[0]);
2606     av_freep(&h->mvd_table[1]);
2607     av_freep(&h->direct_table);
2608     av_freep(&h->non_zero_count);
2609     av_freep(&h->slice_table_base);
2610     av_freep(&h->top_borders[1]);
2611     av_freep(&h->top_borders[0]);
2612     h->slice_table= NULL;
2613
2614     av_freep(&h->mb2b_xy);
2615     av_freep(&h->mb2b8_xy);
2616
2617     av_freep(&h->s.obmc_scratchpad);
2618 }
2619
2620 /**
2621  * allocates tables.
2622  * needs widzh/height
2623  */
2624 static int alloc_tables(H264Context *h){
2625     MpegEncContext * const s = &h->s;
2626     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2627     int x,y;
2628
2629     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2630
2631     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2632     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
2633     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2634     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2635     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2636
2637     if( h->pps.cabac ) {
2638         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2639         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2640         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2641         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2642     }
2643
2644     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
2645     h->slice_table= h->slice_table_base + s->mb_stride + 1;
2646
2647     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint16_t));
2648     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint16_t));
2649     for(y=0; y<s->mb_height; y++){
2650         for(x=0; x<s->mb_width; x++){
2651             const int mb_xy= x + y*s->mb_stride;
2652             const int b_xy = 4*x + 4*y*h->b_stride;
2653             const int b8_xy= 2*x + 2*y*h->b8_stride;
2654
2655             h->mb2b_xy [mb_xy]= b_xy;
2656             h->mb2b8_xy[mb_xy]= b8_xy;
2657         }
2658     }
2659
2660     s->obmc_scratchpad = NULL;
2661
2662     return 0;
2663 fail:
2664     free_tables(h);
2665     return -1;
2666 }
2667
2668 static void common_init(H264Context *h){
2669     MpegEncContext * const s = &h->s;
2670
2671     s->width = s->avctx->width;
2672     s->height = s->avctx->height;
2673     s->codec_id= s->avctx->codec->id;
2674
2675     init_pred_ptrs(h);
2676
2677     s->unrestricted_mv=1;
2678     s->decode=1; //FIXME
2679 }
2680
2681 static int decode_init(AVCodecContext *avctx){
2682     H264Context *h= avctx->priv_data;
2683     MpegEncContext * const s = &h->s;
2684
2685     MPV_decode_defaults(s);
2686
2687     s->avctx = avctx;
2688     common_init(h);
2689
2690     s->out_format = FMT_H264;
2691     s->workaround_bugs= avctx->workaround_bugs;
2692
2693     // set defaults
2694 //    s->decode_mb= ff_h263_decode_mb;
2695     s->low_delay= 1;
2696     avctx->pix_fmt= PIX_FMT_YUV420P;
2697
2698     decode_init_vlc(h);
2699
2700     if(avctx->extradata_size > 0 && avctx->extradata &&
2701        *(char *)avctx->extradata == 1){
2702         h->is_avc = 1;
2703         h->got_avcC = 0;
2704     } else {
2705         h->is_avc = 0;
2706     }
2707
2708     return 0;
2709 }
2710
2711 static void frame_start(H264Context *h){
2712     MpegEncContext * const s = &h->s;
2713     int i;
2714
2715     MPV_frame_start(s, s->avctx);
2716     ff_er_frame_start(s);
2717
2718     assert(s->linesize && s->uvlinesize);
2719
2720     for(i=0; i<16; i++){
2721         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2722         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2723     }
2724     for(i=0; i<4; i++){
2725         h->block_offset[16+i]=
2726         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2727         h->block_offset[24+16+i]=
2728         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2729     }
2730
2731     /* can't be in alloc_tables because linesize isn't known there.
2732      * FIXME: redo bipred weight to not require extra buffer? */
2733     if(!s->obmc_scratchpad)
2734         s->obmc_scratchpad = av_malloc(16*s->linesize + 2*8*s->uvlinesize);
2735
2736 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2737 }
2738
2739 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2740     MpegEncContext * const s = &h->s;
2741     int i;
2742
2743     src_y  -=   linesize;
2744     src_cb -= uvlinesize;
2745     src_cr -= uvlinesize;
2746
2747     // There is two lines saved, the line above the the top macroblock of a pair,
2748     // and the line above the bottom macroblock
2749     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2750     for(i=1; i<17; i++){
2751         h->left_border[i]= src_y[15+i*  linesize];
2752     }
2753
2754     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2755     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2756
2757     if(!(s->flags&CODEC_FLAG_GRAY)){
2758         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2759         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2760         for(i=1; i<9; i++){
2761             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2762             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2763         }
2764         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2765         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2766     }
2767 }
2768
2769 static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2770     MpegEncContext * const s = &h->s;
2771     int temp8, i;
2772     uint64_t temp64;
2773     int deblock_left = (s->mb_x > 0);
2774     int deblock_top  = (s->mb_y > 0);
2775
2776     src_y  -=   linesize + 1;
2777     src_cb -= uvlinesize + 1;
2778     src_cr -= uvlinesize + 1;
2779
2780 #define XCHG(a,b,t,xchg)\
2781 t= a;\
2782 if(xchg)\
2783     a= b;\
2784 b= t;
2785
2786     if(deblock_left){
2787         for(i = !deblock_top; i<17; i++){
2788             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2789         }
2790     }
2791
2792     if(deblock_top){
2793         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2794         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2795     }
2796
2797     if(!(s->flags&CODEC_FLAG_GRAY)){
2798         if(deblock_left){
2799             for(i = !deblock_top; i<9; i++){
2800                 XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
2801                 XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
2802             }
2803         }
2804         if(deblock_top){
2805             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2806             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2807         }
2808     }
2809 }
2810
2811 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2812     MpegEncContext * const s = &h->s;
2813     int i;
2814
2815     src_y  -= 2 *   linesize;
2816     src_cb -= 2 * uvlinesize;
2817     src_cr -= 2 * uvlinesize;
2818
2819     // There is two lines saved, the line above the the top macroblock of a pair,
2820     // and the line above the bottom macroblock
2821     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2822     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2823     for(i=2; i<34; i++){
2824         h->left_border[i]= src_y[15+i*  linesize];
2825     }
2826
2827     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2828     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2829     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2830     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2831
2832     if(!(s->flags&CODEC_FLAG_GRAY)){
2833         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2834         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2835         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2836         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2837         for(i=2; i<18; i++){
2838             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2839             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2840         }
2841         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2842         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2843         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2844         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2845     }
2846 }
2847
2848 static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
2849     MpegEncContext * const s = &h->s;
2850     int temp8, i;
2851     uint64_t temp64;
2852     int deblock_left = (s->mb_x > 0);
2853     int deblock_top  = (s->mb_y > 0);
2854
2855     tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);
2856
2857     src_y  -= 2 *   linesize + 1;
2858     src_cb -= 2 * uvlinesize + 1;
2859     src_cr -= 2 * uvlinesize + 1;
2860
2861 #define XCHG(a,b,t,xchg)\
2862 t= a;\
2863 if(xchg)\
2864     a= b;\
2865 b= t;
2866
2867     if(deblock_left){
2868         for(i = (!deblock_top)<<1; i<34; i++){
2869             XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
2870         }
2871     }
2872
2873     if(deblock_top){
2874         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
2875         XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
2876         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
2877         XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
2878     }
2879
2880     if(!(s->flags&CODEC_FLAG_GRAY)){
2881         if(deblock_left){
2882             for(i = (!deblock_top) << 1; i<18; i++){
2883                 XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
2884                 XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
2885             }
2886         }
2887         if(deblock_top){
2888             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
2889             XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
2890             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
2891             XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
2892         }
2893     }
2894 }
2895
2896 static void hl_decode_mb(H264Context *h){
2897     MpegEncContext * const s = &h->s;
2898     const int mb_x= s->mb_x;
2899     const int mb_y= s->mb_y;
2900     const int mb_xy= mb_x + mb_y*s->mb_stride;
2901     const int mb_type= s->current_picture.mb_type[mb_xy];
2902     uint8_t  *dest_y, *dest_cb, *dest_cr;
2903     int linesize, uvlinesize /*dct_offset*/;
2904     int i;
2905     int *block_offset = &h->block_offset[0];
2906     const unsigned int bottom = mb_y & 1;
2907
2908     if(!s->decode)
2909         return;
2910
2911     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2912     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2913     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2914
2915     if (h->mb_field_decoding_flag) {
2916         linesize = s->linesize * 2;
2917         uvlinesize = s->uvlinesize * 2;
2918         block_offset = &h->block_offset[24];
2919         if(mb_y&1){ //FIXME move out of this func?
2920             dest_y -= s->linesize*15;
2921             dest_cb-= s->uvlinesize*7;
2922             dest_cr-= s->uvlinesize*7;
2923         }
2924     } else {
2925         linesize = s->linesize;
2926         uvlinesize = s->uvlinesize;
2927 //        dct_offset = s->linesize * 16;
2928     }
2929
2930     if (IS_INTRA_PCM(mb_type)) {
2931         unsigned int x, y;
2932
2933         // The pixels are stored in h->mb array in the same order as levels,
2934         // copy them in output in the correct order.
2935         for(i=0; i<16; i++) {
2936             for (y=0; y<4; y++) {
2937                 for (x=0; x<4; x++) {
2938                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2939                 }
2940             }
2941         }
2942         for(i=16; i<16+4; i++) {
2943             for (y=0; y<4; y++) {
2944                 for (x=0; x<4; x++) {
2945                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2946                 }
2947             }
2948         }
2949         for(i=20; i<20+4; i++) {
2950             for (y=0; y<4; y++) {
2951                 for (x=0; x<4; x++) {
2952                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2953                 }
2954             }
2955         }
2956     } else {
2957         if(IS_INTRA(mb_type)){
2958             if(h->deblocking_filter) {
2959                 if (h->mb_aff_frame) {
2960                     if (!bottom)
2961                         xchg_pair_border(h, dest_y, dest_cb, dest_cr, s->linesize, s->uvlinesize, 1);
2962                 } else {
2963                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
2964                 }
2965             }
2966
2967             if(!(s->flags&CODEC_FLAG_GRAY)){
2968                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2969                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2970             }
2971
2972             if(IS_INTRA4x4(mb_type)){
2973                 if(!s->encoding){
2974                     for(i=0; i<16; i++){
2975                         uint8_t * const ptr= dest_y + block_offset[i];
2976                         uint8_t *topright;
2977                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
2978                         int tr;
2979
2980                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
2981                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
2982                             assert(mb_y || linesize <= block_offset[i]);
2983                             if(!topright_avail){
2984                                 tr= ptr[3 - linesize]*0x01010101;
2985                                 topright= (uint8_t*) &tr;
2986                             }else if(i==5 && h->deblocking_filter){
2987                                 tr= *(uint32_t*)h->top_borders[h->mb_aff_frame ? IS_INTERLACED(mb_type) ? bottom : 1 : 0][mb_x+1];
2988                                 topright= (uint8_t*) &tr;
2989                             }else
2990                                 topright= ptr + 4 - linesize;
2991                         }else
2992                             topright= NULL;
2993
2994                         h->pred4x4[ dir ](ptr, topright, linesize);
2995                         if(h->non_zero_count_cache[ scan8[i] ]){
2996                             if(s->codec_id == CODEC_ID_H264)
2997                                 s->dsp.h264_idct_add(ptr, h->mb + i*16, linesize);
2998                             else
2999                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3000                         }
3001                     }
3002                 }
3003             }else{
3004                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3005                 if(s->codec_id == CODEC_ID_H264)
3006                     h264_luma_dc_dequant_idct_c(h->mb, s->qscale);
3007                 else
3008                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3009             }
3010             if(h->deblocking_filter) {
3011                 if (h->mb_aff_frame) {
3012                     if (bottom) {
3013                         uint8_t *pair_dest_y  = s->current_picture.data[0] + ((mb_y-1) * 16* s->linesize  ) + mb_x * 16;
3014                         uint8_t *pair_dest_cb = s->current_picture.data[1] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3015                         uint8_t *pair_dest_cr = s->current_picture.data[2] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3016                         s->mb_y--;
3017                         xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3018                         s->mb_y++;
3019                     }
3020                 } else {
3021                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3022                 }
3023             }
3024         }else if(s->codec_id == CODEC_ID_H264){
3025             hl_motion(h, dest_y, dest_cb, dest_cr,
3026                       s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
3027                       s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
3028                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3029         }
3030
3031
3032         if(!IS_INTRA4x4(mb_type)){
3033             if(s->codec_id == CODEC_ID_H264){
3034                 for(i=0; i<16; i++){
3035                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3036                         uint8_t * const ptr= dest_y + block_offset[i];
3037                         s->dsp.h264_idct_add(ptr, h->mb + i*16, linesize);
3038                     }
3039                 }
3040             }else{
3041                 for(i=0; i<16; i++){
3042                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3043                         uint8_t * const ptr= dest_y + block_offset[i];
3044                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3045                     }
3046                 }
3047             }
3048         }
3049
3050         if(!(s->flags&CODEC_FLAG_GRAY)){
3051             chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp);
3052             chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp);
3053             if(s->codec_id == CODEC_ID_H264){
3054                 for(i=16; i<16+4; i++){
3055                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3056                         uint8_t * const ptr= dest_cb + block_offset[i];
3057                         s->dsp.h264_idct_add(ptr, h->mb + i*16, uvlinesize);
3058                     }
3059                 }
3060                 for(i=20; i<20+4; i++){
3061                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3062                         uint8_t * const ptr= dest_cr + block_offset[i];
3063                         s->dsp.h264_idct_add(ptr, h->mb + i*16, uvlinesize);
3064                     }
3065                 }
3066             }else{
3067                 for(i=16; i<16+4; i++){
3068                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3069                         uint8_t * const ptr= dest_cb + block_offset[i];
3070                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3071                     }
3072                 }
3073                 for(i=20; i<20+4; i++){
3074                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3075                         uint8_t * const ptr= dest_cr + block_offset[i];
3076                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3077                     }
3078                 }
3079             }
3080         }
3081     }
3082     if(h->deblocking_filter) {
3083         if (h->mb_aff_frame) {
3084             const int mb_y = s->mb_y - 1;
3085             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3086             const int mb_xy= mb_x + mb_y*s->mb_stride;
3087             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3088             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3089             uint8_t tmp = s->current_picture.data[1][384];
3090             if (!bottom) return;
3091             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3092             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3093             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3094
3095             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3096             // TODO deblock a pair
3097             // top
3098             s->mb_y--;
3099             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3100             fill_caches(h, mb_type_top, 1); //FIXME dont fill stuff which isnt used by filter_mb
3101             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3102             if (tmp != s->current_picture.data[1][384]) {
3103                 tprintf("modified pixel 8,1 (1)\n");
3104             }
3105             // bottom
3106             s->mb_y++;
3107             tprintf("call mbaff filter_mb\n");
3108             fill_caches(h, mb_type_bottom, 1); //FIXME dont fill stuff which isnt used by filter_mb
3109             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3110             if (tmp != s->current_picture.data[1][384]) {
3111                 tprintf("modified pixel 8,1 (2)\n");
3112             }
3113         } else {
3114             tprintf("call filter_mb\n");
3115             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3116             fill_caches(h, mb_type, 1); //FIXME dont fill stuff which isnt used by filter_mb
3117             filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3118         }
3119     }
3120 }
3121
3122 /**
3123  * fills the default_ref_list.
3124  */
3125 static int fill_default_ref_list(H264Context *h){
3126     MpegEncContext * const s = &h->s;
3127     int i;
3128     int smallest_poc_greater_than_current = -1;
3129     Picture sorted_short_ref[32];
3130
3131     if(h->slice_type==B_TYPE){
3132         int out_i;
3133         int limit= -1;
3134
3135         /* sort frame according to poc in B slice */
3136         for(out_i=0; out_i<h->short_ref_count; out_i++){
3137             int best_i=-1;
3138             int best_poc=INT_MAX;
3139
3140             for(i=0; i<h->short_ref_count; i++){
3141                 const int poc= h->short_ref[i]->poc;
3142                 if(poc > limit && poc < best_poc){
3143                     best_poc= poc;
3144                     best_i= i;
3145                 }
3146             }
3147
3148             assert(best_i != -1);
3149
3150             limit= best_poc;
3151             sorted_short_ref[out_i]= *h->short_ref[best_i];
3152             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3153             if (-1 == smallest_poc_greater_than_current) {
3154                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3155                     smallest_poc_greater_than_current = out_i;
3156                 }
3157             }
3158         }
3159     }
3160
3161     if(s->picture_structure == PICT_FRAME){
3162         if(h->slice_type==B_TYPE){
3163             int list;
3164             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3165
3166             // find the largest poc
3167             for(list=0; list<2; list++){
3168                 int index = 0;
3169                 int j= -99;
3170                 int step= list ? -1 : 1;
3171
3172                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3173                     while(j<0 || j>= h->short_ref_count){
3174                         step = -step;
3175                         j= smallest_poc_greater_than_current + (step>>1);
3176                     }
3177                     if(sorted_short_ref[j].reference != 3) continue;
3178                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3179                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3180                 }
3181
3182                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3183                     if(h->long_ref[i] == NULL) continue;
3184                     if(h->long_ref[i]->reference != 3) continue;
3185
3186                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3187                     h->default_ref_list[ list ][index++].pic_id= i;;
3188                 }
3189
3190                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3191                     // swap the two first elements of L1 when
3192                     // L0 and L1 are identical
3193                     Picture temp= h->default_ref_list[1][0];
3194                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3195                     h->default_ref_list[1][0] = temp;
3196                 }
3197
3198                 if(index < h->ref_count[ list ])
3199                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3200             }
3201         }else{
3202             int index=0;
3203             for(i=0; i<h->short_ref_count; i++){
3204                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3205                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3206                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3207             }
3208             for(i = 0; i < 16; i++){
3209                 if(h->long_ref[i] == NULL) continue;
3210                 if(h->long_ref[i]->reference != 3) continue;
3211                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3212                 h->default_ref_list[0][index++].pic_id= i;;
3213             }
3214             if(index < h->ref_count[0])
3215                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3216         }
3217     }else{ //FIELD
3218         if(h->slice_type==B_TYPE){
3219         }else{
3220             //FIXME second field balh
3221         }
3222     }
3223 #ifdef TRACE
3224     for (i=0; i<h->ref_count[0]; i++) {
3225         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3226     }
3227     if(h->slice_type==B_TYPE){
3228         for (i=0; i<h->ref_count[1]; i++) {
3229             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3230         }
3231     }
3232 #endif
3233     return 0;
3234 }
3235
/* Forward declarations: both debug printers are defined further down in this
 * file but are already called by decode_ref_pic_list_reordering(). */
static void print_short_term(H264Context *h);
static void print_long_term(H264Context *h);
3238
3239 static int decode_ref_pic_list_reordering(H264Context *h){
3240     MpegEncContext * const s = &h->s;
3241     int list;
3242
3243     print_short_term(h);
3244     print_long_term(h);
3245     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move beofre func
3246
3247     for(list=0; list<2; list++){
3248         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3249
3250         if(get_bits1(&s->gb)){
3251             int pred= h->curr_pic_num;
3252             int index;
3253
3254             for(index=0; ; index++){
3255                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3256                 int pic_id;
3257                 int i;
3258                 Picture *ref = NULL;
3259
3260                 if(reordering_of_pic_nums_idc==3)
3261                     break;
3262
3263                 if(index >= h->ref_count[list]){
3264                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3265                     return -1;
3266                 }
3267
3268                 if(reordering_of_pic_nums_idc<3){
3269                     if(reordering_of_pic_nums_idc<2){
3270                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3271
3272                         if(abs_diff_pic_num >= h->max_pic_num){
3273                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3274                             return -1;
3275                         }
3276
3277                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3278                         else                                pred+= abs_diff_pic_num;
3279                         pred &= h->max_pic_num - 1;
3280
3281                         for(i= h->short_ref_count-1; i>=0; i--){
3282                             ref = h->short_ref[i];
3283                             if(ref->data[0] != NULL && ref->frame_num == pred && ref->long_ref == 0) // ignore non existing pictures by testing data[0] pointer
3284                                 break;
3285                         }
3286                     }else{
3287                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3288                         ref = h->long_ref[pic_id];
3289                     }
3290
3291                     if (i < 0) {
3292                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3293                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3294                     } else {
3295                         h->ref_list[list][index]= *ref;
3296                     }
3297                 }else{
3298                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3299                     return -1;
3300                 }
3301             }
3302         }
3303
3304         if(h->slice_type!=B_TYPE) break;
3305     }
3306
3307     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3308         direct_dist_scale_factor(h);
3309     direct_ref_list_init(h);
3310     return 0;
3311 }
3312
3313 static int pred_weight_table(H264Context *h){
3314     MpegEncContext * const s = &h->s;
3315     int list, i;
3316     int luma_def, chroma_def;
3317
3318     h->use_weight= 0;
3319     h->use_weight_chroma= 0;
3320     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3321     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3322     luma_def = 1<<h->luma_log2_weight_denom;
3323     chroma_def = 1<<h->chroma_log2_weight_denom;
3324
3325     for(list=0; list<2; list++){
3326         for(i=0; i<h->ref_count[list]; i++){
3327             int luma_weight_flag, chroma_weight_flag;
3328
3329             luma_weight_flag= get_bits1(&s->gb);
3330             if(luma_weight_flag){
3331                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3332                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3333                 if(   h->luma_weight[list][i] != luma_def
3334                    || h->luma_offset[list][i] != 0)
3335                     h->use_weight= 1;
3336             }else{
3337                 h->luma_weight[list][i]= luma_def;
3338                 h->luma_offset[list][i]= 0;
3339             }
3340
3341             chroma_weight_flag= get_bits1(&s->gb);
3342             if(chroma_weight_flag){
3343                 int j;
3344                 for(j=0; j<2; j++){
3345                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3346                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3347                     if(   h->chroma_weight[list][i][j] != chroma_def
3348                        || h->chroma_offset[list][i][j] != 0)
3349                         h->use_weight_chroma= 1;
3350                 }
3351             }else{
3352                 int j;
3353                 for(j=0; j<2; j++){
3354                     h->chroma_weight[list][i][j]= chroma_def;
3355                     h->chroma_offset[list][i][j]= 0;
3356                 }
3357             }
3358         }
3359         if(h->slice_type != B_TYPE) break;
3360     }
3361     h->use_weight= h->use_weight || h->use_weight_chroma;
3362     return 0;
3363 }
3364
3365 static void implicit_weight_table(H264Context *h){
3366     MpegEncContext * const s = &h->s;
3367     int ref0, ref1;
3368     int cur_poc = s->current_picture_ptr->poc;
3369
3370     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3371        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3372         h->use_weight= 0;
3373         h->use_weight_chroma= 0;
3374         return;
3375     }
3376
3377     h->use_weight= 2;
3378     h->use_weight_chroma= 2;
3379     h->luma_log2_weight_denom= 5;
3380     h->chroma_log2_weight_denom= 5;
3381
3382     /* FIXME: MBAFF */
3383     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3384         int poc0 = h->ref_list[0][ref0].poc;
3385         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3386             int poc1 = h->ref_list[1][ref1].poc;
3387             int td = clip(poc1 - poc0, -128, 127);
3388             if(td){
3389                 int tb = clip(cur_poc - poc0, -128, 127);
3390                 int tx = (16384 + (ABS(td) >> 1)) / td;
3391                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3392                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3393                     h->implicit_weight[ref0][ref1] = 32;
3394                 else
3395                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3396             }else
3397                 h->implicit_weight[ref0][ref1] = 32;
3398         }
3399     }
3400 }
3401
3402 static inline void unreference_pic(H264Context *h, Picture *pic){
3403     int i;
3404     pic->reference=0;
3405     if(pic == h->delayed_output_pic)
3406         pic->reference=1;
3407     else{
3408         for(i = 0; h->delayed_pic[i]; i++)
3409             if(pic == h->delayed_pic[i]){
3410                 pic->reference=1;
3411                 break;
3412             }
3413     }
3414 }
3415
3416 /**
3417  * instantaneous decoder refresh.
3418  */
3419 static void idr(H264Context *h){
3420     int i;
3421
3422     for(i=0; i<16; i++){
3423         if (h->long_ref[i] != NULL) {
3424             unreference_pic(h, h->long_ref[i]);
3425             h->long_ref[i]= NULL;
3426         }
3427     }
3428     h->long_ref_count=0;
3429
3430     for(i=0; i<h->short_ref_count; i++){
3431         unreference_pic(h, h->short_ref[i]);
3432         h->short_ref[i]= NULL;
3433     }
3434     h->short_ref_count=0;
3435 }
3436
3437 /**
3438  *
3439  * @return the removed picture or NULL if an error occures
3440  */
3441 static Picture * remove_short(H264Context *h, int frame_num){
3442     MpegEncContext * const s = &h->s;
3443     int i;
3444
3445     if(s->avctx->debug&FF_DEBUG_MMCO)
3446         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3447
3448     for(i=0; i<h->short_ref_count; i++){
3449         Picture *pic= h->short_ref[i];
3450         if(s->avctx->debug&FF_DEBUG_MMCO)
3451             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3452         if(pic->frame_num == frame_num){
3453             h->short_ref[i]= NULL;
3454             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3455             h->short_ref_count--;
3456             return pic;
3457         }
3458     }
3459     return NULL;
3460 }
3461
3462 /**
3463  *
3464  * @return the removed picture or NULL if an error occures
3465  */
3466 static Picture * remove_long(H264Context *h, int i){
3467     Picture *pic;
3468
3469     pic= h->long_ref[i];
3470     h->long_ref[i]= NULL;
3471     if(pic) h->long_ref_count--;
3472
3473     return pic;
3474 }
3475
3476 /**
3477  * print short term list
3478  */
3479 static void print_short_term(H264Context *h) {
3480     uint32_t i;
3481     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3482         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3483         for(i=0; i<h->short_ref_count; i++){
3484             Picture *pic= h->short_ref[i];
3485             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3486         }
3487     }
3488 }
3489
3490 /**
3491  * print long term list
3492  */
3493 static void print_long_term(H264Context *h) {
3494     uint32_t i;
3495     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3496         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3497         for(i = 0; i < 16; i++){
3498             Picture *pic= h->long_ref[i];
3499             if (pic) {
3500                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3501             }
3502         }
3503     }
3504 }
3505
3506 /**
3507  * Executes the reference picture marking (memory management control operations).
3508  */
3509 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3510     MpegEncContext * const s = &h->s;
3511     int i, j;
3512     int current_is_long=0;
3513     Picture *pic;
3514
3515     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3516         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3517
3518     for(i=0; i<mmco_count; i++){
3519         if(s->avctx->debug&FF_DEBUG_MMCO)
3520             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3521
3522         switch(mmco[i].opcode){
3523         case MMCO_SHORT2UNUSED:
3524             pic= remove_short(h, mmco[i].short_frame_num);
3525             if(pic==NULL) return -1;
3526             unreference_pic(h, pic);
3527             break;
3528         case MMCO_SHORT2LONG:
3529             pic= remove_long(h, mmco[i].long_index);
3530             if(pic) unreference_pic(h, pic);
3531
3532             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3533             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3534             h->long_ref_count++;
3535             break;
3536         case MMCO_LONG2UNUSED:
3537             pic= remove_long(h, mmco[i].long_index);
3538             if(pic==NULL) return -1;
3539             unreference_pic(h, pic);
3540             break;
3541         case MMCO_LONG:
3542             pic= remove_long(h, mmco[i].long_index);
3543             if(pic) unreference_pic(h, pic);
3544
3545             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3546             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3547             h->long_ref_count++;
3548
3549             current_is_long=1;
3550             break;
3551         case MMCO_SET_MAX_LONG:
3552             assert(mmco[i].long_index <= 16);
3553             // just remove the long term which index is greater than new max
3554             for(j = mmco[i].long_index; j<16; j++){
3555                 pic = remove_long(h, j);
3556                 if (pic) unreference_pic(h, pic);
3557             }
3558             break;
3559         case MMCO_RESET:
3560             while(h->short_ref_count){
3561                 pic= remove_short(h, h->short_ref[0]->frame_num);
3562                 unreference_pic(h, pic);
3563             }
3564             for(j = 0; j < 16; j++) {
3565                 pic= remove_long(h, j);
3566                 if(pic) unreference_pic(h, pic);
3567             }
3568             break;
3569         default: assert(0);
3570         }
3571     }
3572
3573     if(!current_is_long){
3574         pic= remove_short(h, s->current_picture_ptr->frame_num);
3575         if(pic){
3576             unreference_pic(h, pic);
3577             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3578         }
3579
3580         if(h->short_ref_count)
3581             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3582
3583         h->short_ref[0]= s->current_picture_ptr;
3584         h->short_ref[0]->long_ref=0;
3585         h->short_ref_count++;
3586     }
3587
3588     print_short_term(h);
3589     print_long_term(h);
3590     return 0;
3591 }
3592
3593 static int decode_ref_pic_marking(H264Context *h){
3594     MpegEncContext * const s = &h->s;
3595     int i;
3596
3597     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3598         s->broken_link= get_bits1(&s->gb) -1;
3599         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
3600         if(h->mmco[0].long_index == -1)
3601             h->mmco_index= 0;
3602         else{
3603             h->mmco[0].opcode= MMCO_LONG;
3604             h->mmco_index= 1;
3605         }
3606     }else{
3607         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
3608             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3609                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
3610
3611                 h->mmco[i].opcode= opcode;
3612                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3613                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
3614 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
3615                         fprintf(stderr, "illegal short ref in memory management control operation %d\n", mmco);
3616                         return -1;
3617                     }*/
3618                 }
3619                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3620                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
3621                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
3622                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3623                         return -1;
3624                     }
3625                 }
3626
3627                 if(opcode > MMCO_LONG){
3628                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3629                     return -1;
3630                 }
3631                 if(opcode == MMCO_END)
3632                     break;
3633             }
3634             h->mmco_index= i;
3635         }else{
3636             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3637
3638             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
3639                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3640                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3641                 h->mmco_index= 1;
3642             }else
3643                 h->mmco_index= 0;
3644         }
3645     }
3646
3647     return 0;
3648 }
3649
3650 static int init_poc(H264Context *h){
3651     MpegEncContext * const s = &h->s;
3652     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3653     int field_poc[2];
3654
3655     if(h->nal_unit_type == NAL_IDR_SLICE){
3656         h->frame_num_offset= 0;
3657     }else{
3658         if(h->frame_num < h->prev_frame_num)
3659             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3660         else
3661             h->frame_num_offset= h->prev_frame_num_offset;
3662     }
3663
3664     if(h->sps.poc_type==0){
3665         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3666
3667         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3668             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3669         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3670             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3671         else
3672             h->poc_msb = h->prev_poc_msb;
3673 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3674         field_poc[0] =
3675         field_poc[1] = h->poc_msb + h->poc_lsb;
3676         if(s->picture_structure == PICT_FRAME)
3677             field_poc[1] += h->delta_poc_bottom;
3678     }else if(h->sps.poc_type==1){
3679         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3680         int i;
3681
3682         if(h->sps.poc_cycle_length != 0)
3683             abs_frame_num = h->frame_num_offset + h->frame_num;
3684         else
3685             abs_frame_num = 0;
3686
3687         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3688             abs_frame_num--;
3689
3690         expected_delta_per_poc_cycle = 0;
3691         for(i=0; i < h->sps.poc_cycle_length; i++)
3692             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3693
3694         if(abs_frame_num > 0){
3695             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3696             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3697
3698             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3699             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3700                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3701         } else
3702             expectedpoc = 0;
3703
3704         if(h->nal_ref_idc == 0)
3705             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3706
3707         field_poc[0] = expectedpoc + h->delta_poc[0];
3708         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3709
3710         if(s->picture_structure == PICT_FRAME)
3711             field_poc[1] += h->delta_poc[1];
3712     }else{
3713         int poc;
3714         if(h->nal_unit_type == NAL_IDR_SLICE){
3715             poc= 0;
3716         }else{
3717             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3718             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3719         }
3720         field_poc[0]= poc;
3721         field_poc[1]= poc;
3722     }
3723
3724     if(s->picture_structure != PICT_BOTTOM_FIELD)
3725         s->current_picture_ptr->field_poc[0]= field_poc[0];
3726     if(s->picture_structure != PICT_TOP_FIELD)
3727         s->current_picture_ptr->field_poc[1]= field_poc[1];
3728     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
3729         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
3730
3731     return 0;
3732 }
3733
/**
 * decodes a slice header.
 * this will also call MPV_common_init() and frame_start() as needed
 */
3738 static int decode_slice_header(H264Context *h){
3739     MpegEncContext * const s = &h->s;
3740     int first_mb_in_slice, pps_id;
3741     int num_ref_idx_active_override_flag;
3742     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3743     int slice_type;
3744     int default_ref_list_done = 0;
3745
3746     s->current_picture.reference= h->nal_ref_idc != 0;
3747     s->dropable= h->nal_ref_idc == 0;
3748
3749     first_mb_in_slice= get_ue_golomb(&s->gb);
3750
3751     slice_type= get_ue_golomb(&s->gb);
3752     if(slice_type > 9){
3753         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3754         return -1;
3755     }
3756     if(slice_type > 4){
3757         slice_type -= 5;
3758         h->slice_type_fixed=1;
3759     }else
3760         h->slice_type_fixed=0;
3761
3762     slice_type= slice_type_map[ slice_type ];
3763     if (slice_type == I_TYPE
3764         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
3765         default_ref_list_done = 1;
3766     }
3767     h->slice_type= slice_type;
3768
3769     s->pict_type= h->slice_type; // to make a few old func happy, its wrong though
3770
3771     pps_id= get_ue_golomb(&s->gb);
3772     if(pps_id>255){
3773         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3774         return -1;
3775     }
3776     h->pps= h->pps_buffer[pps_id];
3777     if(h->pps.slice_group_count == 0){
3778         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3779         return -1;
3780     }
3781
3782     h->sps= h->sps_buffer[ h->pps.sps_id ];
3783     if(h->sps.log2_max_frame_num == 0){
3784         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3785         return -1;
3786     }
3787
3788     s->mb_width= h->sps.mb_width;
3789     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3790
3791     h->b_stride=  s->mb_width*4 + 1;
3792     h->b8_stride= s->mb_width*2 + 1;
3793
3794     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3795     if(h->sps.frame_mbs_only_flag)
3796         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3797     else
3798         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3799
3800     if (s->context_initialized
3801         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3802         free_tables(h);
3803         MPV_common_end(s);
3804     }
3805     if (!s->context_initialized) {
3806         if (MPV_common_init(s) < 0)
3807             return -1;
3808
3809         alloc_tables(h);
3810
3811         s->avctx->width = s->width;
3812         s->avctx->height = s->height;
3813         s->avctx->sample_aspect_ratio= h->sps.sar;
3814         if(!s->avctx->sample_aspect_ratio.den)
3815             s->avctx->sample_aspect_ratio.den = 1;
3816
3817         if(h->sps.timing_info_present_flag && h->sps.fixed_frame_rate_flag){
3818             s->avctx->frame_rate = h->sps.time_scale;
3819             s->avctx->frame_rate_base = h->sps.num_units_in_tick;
3820         }
3821     }
3822
3823     if(h->slice_num == 0){
3824         frame_start(h);
3825     }
3826
3827     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
3828     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3829
3830     h->mb_aff_frame = 0;
3831     if(h->sps.frame_mbs_only_flag){
3832         s->picture_structure= PICT_FRAME;
3833     }else{
3834         if(get_bits1(&s->gb)) { //field_pic_flag
3835             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3836         } else {
3837             s->picture_structure= PICT_FRAME;
3838             first_mb_in_slice <<= 1;
3839             h->mb_aff_frame = h->sps.mb_aff;
3840         }
3841     }
3842
3843     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3844     s->resync_mb_y = s->mb_y = first_mb_in_slice / s->mb_width;
3845
3846     if(s->picture_structure==PICT_FRAME){
3847         h->curr_pic_num=   h->frame_num;
3848         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3849     }else{
3850         h->curr_pic_num= 2*h->frame_num;
3851         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3852     }
3853
3854     if(h->nal_unit_type == NAL_IDR_SLICE){
3855         get_ue_golomb(&s->gb); /* idr_pic_id */
3856     }
3857
3858     if(h->sps.poc_type==0){
3859         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3860
3861         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3862             h->delta_poc_bottom= get_se_golomb(&s->gb);
3863         }
3864     }
3865
3866     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3867         h->delta_poc[0]= get_se_golomb(&s->gb);
3868
3869         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3870             h->delta_poc[1]= get_se_golomb(&s->gb);
3871     }
3872
3873     init_poc(h);
3874
3875     if(h->pps.redundant_pic_cnt_present){
3876         h->redundant_pic_count= get_ue_golomb(&s->gb);
3877     }
3878
3879     //set defaults, might be overriden a few line later
3880     h->ref_count[0]= h->pps.ref_count[0];
3881     h->ref_count[1]= h->pps.ref_count[1];
3882
3883     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
3884         if(h->slice_type == B_TYPE){
3885             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3886         }
3887         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3888
3889         if(num_ref_idx_active_override_flag){
3890             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3891             if(h->slice_type==B_TYPE)
3892                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3893
3894             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
3895                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3896                 return -1;
3897             }
3898         }
3899     }
3900
3901     if(!default_ref_list_done){
3902         fill_default_ref_list(h);
3903     }
3904
3905     decode_ref_pic_list_reordering(h);
3906
3907     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
3908        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
3909         pred_weight_table(h);
3910     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
3911         implicit_weight_table(h);
3912     else
3913         h->use_weight = 0;
3914
3915     if(s->current_picture.reference)
3916         decode_ref_pic_marking(h);
3917
3918     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
3919         h->cabac_init_idc = get_ue_golomb(&s->gb);
3920
3921     h->last_qscale_diff = 0;
3922     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
3923     if(s->qscale<0 || s->qscale>51){
3924         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
3925         return -1;
3926     }
3927     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
3928     //FIXME qscale / qp ... stuff
3929     if(h->slice_type == SP_TYPE){
3930         get_bits1(&s->gb); /* sp_for_switch_flag */
3931     }
3932     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
3933         get_se_golomb(&s->gb); /* slice_qs_delta */
3934     }
3935
3936     h->deblocking_filter = 1;
3937     h->slice_alpha_c0_offset = 0;
3938     h->slice_beta_offset = 0;
3939     if( h->pps.deblocking_filter_parameters_present ) {
3940         h->deblocking_filter= get_ue_golomb(&s->gb);
3941         if(h->deblocking_filter < 2)
3942             h->deblocking_filter^= 1; // 1<->0
3943
3944         if( h->deblocking_filter ) {
3945             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3946             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3947         }
3948     }
3949
3950 #if 0 //FMO
3951     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3952         slice_group_change_cycle= get_bits(&s->gb, ?);
3953 #endif
3954
3955     h->slice_num++;
3956
3957     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
3958         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d weight:%d%s\n",
3959                h->slice_num,
3960                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
3961                first_mb_in_slice,
3962                av_get_pict_type_char(h->slice_type),
3963                pps_id, h->frame_num,
3964                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
3965                h->ref_count[0], h->ref_count[1],
3966                s->qscale,
3967                h->deblocking_filter,
3968                h->use_weight,
3969                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
3970                );
3971     }
3972
3973     return 0;
3974 }
3975
3976 /**
3977  *
3978  */
3979 static inline int get_level_prefix(GetBitContext *gb){
3980     unsigned int buf;
3981     int log;
3982
3983     OPEN_READER(re, gb);
3984     UPDATE_CACHE(re, gb);
3985     buf=GET_CACHE(re, gb);
3986
3987     log= 32 - av_log2(buf);
3988 #ifdef TRACE
3989     print_bin(buf>>(32-log), log);
3990     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
3991 #endif
3992
3993     LAST_SKIP_BITS(re, gb, log);
3994     CLOSE_READER(re, gb);
3995
3996     return log-1;
3997 }
3998
3999 /**
4000  * decodes a residual block.
4001  * @param n block index
4002  * @param scantable scantable
4003  * @param max_coeff number of coefficients in the block
4004  * @return <0 if an error occured
4005  */
4006 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, int qp, int max_coeff){
4007     MpegEncContext * const s = &h->s;
4008     const uint16_t *qmul= dequant_coeff[qp];
4009     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4010     int level[16], run[16];
4011     int suffix_length, zeros_left, coeff_num, coeff_token, total_coeff, i, trailing_ones;
4012
4013     //FIXME put trailing_onex into the context
4014
4015     if(n == CHROMA_DC_BLOCK_INDEX){
4016         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4017         total_coeff= coeff_token>>2;
4018     }else{
4019         if(n == LUMA_DC_BLOCK_INDEX){
4020             total_coeff= pred_non_zero_count(h, 0);
4021             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4022             total_coeff= coeff_token>>2;
4023         }else{
4024             total_coeff= pred_non_zero_count(h, n);
4025             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4026             total_coeff= coeff_token>>2;
4027             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4028         }
4029     }
4030
4031     //FIXME set last_non_zero?
4032
4033     if(total_coeff==0)
4034         return 0;
4035
4036     trailing_ones= coeff_token&3;
4037     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4038     assert(total_coeff<=16);
4039
4040     for(i=0; i<trailing_ones; i++){
4041         level[i]= 1 - 2*get_bits1(gb);
4042     }
4043
4044     suffix_length= total_coeff > 10 && trailing_ones < 3;
4045
4046     for(; i<total_coeff; i++){
4047         const int prefix= get_level_prefix(gb);
4048         int level_code, mask;
4049
4050         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4051             if(suffix_length)
4052                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4053             else
4054                 level_code= (prefix<<suffix_length); //part
4055         }else if(prefix==14){
4056             if(suffix_length)
4057                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4058             else
4059                 level_code= prefix + get_bits(gb, 4); //part
4060         }else if(prefix==15){
4061             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4062             if(suffix_length==0) level_code+=15; //FIXME doesnt make (much)sense
4063         }else{
4064             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4065             return -1;
4066         }
4067
4068         if(i==trailing_ones && i<3) level_code+= 2; //FIXME split first iteration
4069
4070         mask= -(level_code&1);
4071         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4072
4073         if(suffix_length==0) suffix_length=1; //FIXME split first iteration
4074
4075 #if 1
4076         if(ABS(level[i]) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
4077 #else
4078         if((2+level_code)>>1) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
4079         /* ? == prefix > 2 or sth */
4080 #endif
4081         tprintf("level: %d suffix_length:%d\n", level[i], suffix_length);
4082     }
4083
4084     if(total_coeff == max_coeff)
4085         zeros_left=0;
4086     else{
4087         if(n == CHROMA_DC_BLOCK_INDEX)
4088             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4089         else
4090             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4091     }
4092
4093     for(i=0; i<total_coeff-1; i++){
4094         if(zeros_left <=0)
4095             break;
4096         else if(zeros_left < 7){
4097             run[i]= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4098         }else{
4099             run[i]= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4100         }
4101         zeros_left -= run[i];
4102     }
4103
4104     if(zeros_left<0){
4105         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4106         return -1;
4107     }
4108
4109     for(; i<total_coeff-1; i++){
4110         run[i]= 0;
4111     }
4112
4113     run[i]= zeros_left;
4114
4115     coeff_num=-1;
4116     if(n > 24){
4117         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into rundecode?
4118             int j;
4119
4120             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
4121             j= scantable[ coeff_num ];
4122
4123             block[j]= level[i];
4124         }
4125     }else{
4126         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into  rundecode?
4127             int j;
4128
4129             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
4130             j= scantable[ coeff_num ];
4131
4132             block[j]= level[i] * qmul[j];
4133 //            printf("%d %d  ", block[j], qmul[j]);
4134         }
4135     }
4136     return 0;
4137 }
4138
4139 /**
4140  * decodes a P_SKIP or B_SKIP macroblock
4141  */
4142 static void decode_mb_skip(H264Context *h){
4143     MpegEncContext * const s = &h->s;
4144     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4145     int mb_type;
4146
4147     memset(h->non_zero_count[mb_xy], 0, 16);
4148     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4149
4150     if(h->mb_aff_frame && s->mb_skip_run==0 && (s->mb_y&1)==0){
4151         h->mb_field_decoding_flag= get_bits1(&s->gb);
4152     }
4153     if(h->mb_field_decoding_flag)
4154         mb_type|= MB_TYPE_INTERLACED;
4155
4156     if( h->slice_type == B_TYPE )
4157     {
4158         // just for fill_caches. pred_direct_motion will set the real mb_type
4159         mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4160
4161         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4162         pred_direct_motion(h, &mb_type);
4163         if(h->pps.cabac){
4164             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4165             fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
4166         }
4167     }
4168     else
4169     {
4170         int mx, my;
4171         mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4172
4173         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4174         pred_pskip_motion(h, &mx, &my);
4175         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4176         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4177         if(h->pps.cabac)
4178             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4179     }
4180
4181     write_back_motion(h, mb_type);
4182     s->current_picture.mb_type[mb_xy]= mb_type|MB_TYPE_SKIP;
4183     s->current_picture.qscale_table[mb_xy]= s->qscale;
4184     h->slice_table[ mb_xy ]= h->slice_num;
4185     h->prev_mb_skiped= 1;
4186 }
4187
4188 /**
4189  * decodes a macroblock
4190  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4191  */
4192 static int decode_mb_cavlc(H264Context *h){
4193     MpegEncContext * const s = &h->s;
4194     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4195     int mb_type, partition_count, cbp;
4196
4197     s->dsp.clear_blocks(h->mb); //FIXME avoid if allready clear (move after skip handlong?
4198
4199     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4200     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4201                 down the code */
4202     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4203         if(s->mb_skip_run==-1)
4204             s->mb_skip_run= get_ue_golomb(&s->gb);
4205
4206         if (s->mb_skip_run--) {
4207             decode_mb_skip(h);
4208             return 0;
4209         }
4210     }
4211     if(h->mb_aff_frame){
4212         if ( ((s->mb_y&1) == 0) || h->prev_mb_skiped)
4213             h->mb_field_decoding_flag = get_bits1(&s->gb);
4214     }else
4215         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4216
4217     h->prev_mb_skiped= 0;
4218
4219     mb_type= get_ue_golomb(&s->gb);
4220     if(h->slice_type == B_TYPE){
4221         if(mb_type < 23){
4222             partition_count= b_mb_type_info[mb_type].partition_count;
4223             mb_type=         b_mb_type_info[mb_type].type;
4224         }else{
4225             mb_type -= 23;
4226             goto decode_intra_mb;
4227         }
4228     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4229         if(mb_type < 5){
4230             partition_count= p_mb_type_info[mb_type].partition_count;
4231             mb_type=         p_mb_type_info[mb_type].type;
4232         }else{
4233             mb_type -= 5;
4234             goto decode_intra_mb;
4235         }
4236     }else{
4237        assert(h->slice_type == I_TYPE);
4238 decode_intra_mb:
4239         if(mb_type > 25){
4240             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4241             return -1;
4242         }
4243         partition_count=0;
4244         cbp= i_mb_type_info[mb_type].cbp;
4245         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4246         mb_type= i_mb_type_info[mb_type].type;
4247     }
4248
4249     if(h->mb_field_decoding_flag)
4250         mb_type |= MB_TYPE_INTERLACED;
4251
4252     s->current_picture.mb_type[mb_xy]= mb_type;
4253     h->slice_table[ mb_xy ]= h->slice_num;
4254
4255     if(IS_INTRA_PCM(mb_type)){
4256         unsigned int x, y;
4257
4258         // we assume these blocks are very rare so we dont optimize it
4259         align_get_bits(&s->gb);
4260
4261         // The pixels are stored in the same order as levels in h->mb array.
4262         for(y=0; y<16; y++){
4263             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4264             for(x=0; x<16; x++){
4265                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4266                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4267             }
4268         }
4269         for(y=0; y<8; y++){
4270             const int index= 256 + 4*(y&3) + 32*(y>>2);
4271             for(x=0; x<8; x++){
4272                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4273                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4274             }
4275         }
4276         for(y=0; y<8; y++){
4277             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4278             for(x=0; x<8; x++){
4279                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4280                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4281             }
4282         }
4283
4284         // In deblocking, the quantiser is 0
4285         s->current_picture.qscale_table[mb_xy]= 0;
4286         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
4287         // All coeffs are presents
4288         memset(h->non_zero_count[mb_xy], 16, 16);
4289
4290         return 0;
4291     }
4292
4293     fill_caches(h, mb_type, 0);
4294
4295     //mb_pred
4296     if(IS_INTRA(mb_type)){
4297 //            init_top_left_availability(h);
4298             if(IS_INTRA4x4(mb_type)){
4299                 int i;
4300
4301 //                fill_intra4x4_pred_table(h);
4302                 for(i=0; i<16; i++){
4303                     const int mode_coded= !get_bits1(&s->gb);
4304                     const int predicted_mode=  pred_intra_mode(h, i);
4305                     int mode;
4306
4307                     if(mode_coded){
4308                         const int rem_mode= get_bits(&s->gb, 3);
4309                         if(rem_mode<predicted_mode)
4310                             mode= rem_mode;
4311                         else
4312                             mode= rem_mode + 1;
4313                     }else{
4314                         mode= predicted_mode;
4315                     }
4316
4317                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4318                 }
4319                 write_back_intra_pred_mode(h);
4320                 if( check_intra4x4_pred_mode(h) < 0)
4321                     return -1;
4322             }else{
4323                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4324                 if(h->intra16x16_pred_mode < 0)
4325                     return -1;
4326             }
4327             h->chroma_pred_mode= get_ue_golomb(&s->gb);
4328
4329             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
4330             if(h->chroma_pred_mode < 0)
4331                 return -1;
4332     }else if(partition_count==4){
4333         int i, j, sub_partition_count[4], list, ref[2][4];
4334
4335         if(h->slice_type == B_TYPE){
4336             for(i=0; i<4; i++){
4337                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4338                 if(h->sub_mb_type[i] >=13){
4339                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4340                     return -1;
4341                 }
4342                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4343                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4344             }
4345             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4346                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3]))
4347                 pred_direct_motion(h, &mb_type);
4348         }else{
4349             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4350             for(i=0; i<4; i++){
4351                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4352                 if(h->sub_mb_type[i] >=4){
4353                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4354                     return -1;
4355                 }
4356                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4357                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4358             }
4359         }
4360
4361         for(list=0; list<2; list++){
4362             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4363             if(ref_count == 0) continue;
4364             for(i=0; i<4; i++){
4365                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4366                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4367                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4368                 }else{
4369                  //FIXME
4370                     ref[list][i] = -1;
4371                 }
4372             }
4373         }
4374
4375         for(list=0; list<2; list++){
4376             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4377             if(ref_count == 0) continue;
4378
4379             for(i=0; i<4; i++){
4380                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4381                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4382                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4383
4384                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4385                     const int sub_mb_type= h->sub_mb_type[i];
4386                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4387                     for(j=0; j<sub_partition_count[i]; j++){
4388                         int mx, my;
4389                         const int index= 4*i + block_width*j;
4390                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4391                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4392                         mx += get_se_golomb(&s->gb);
4393                         my += get_se_golomb(&s->gb);
4394                         tprintf("final mv:%d %d\n", mx, my);
4395
4396                         if(IS_SUB_8X8(sub_mb_type)){
4397                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
4398                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4399                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
4400                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4401                         }else if(IS_SUB_8X4(sub_mb_type)){
4402                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
4403                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
4404                         }else if(IS_SUB_4X8(sub_mb_type)){
4405                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
4406                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
4407                         }else{
4408                             assert(IS_SUB_4X4(sub_mb_type));
4409                             mv_cache[ 0 ][0]= mx;
4410                             mv_cache[ 0 ][1]= my;
4411                         }
4412                     }
4413                 }else{
4414                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4415                     p[0] = p[1]=
4416                     p[8] = p[9]= 0;
4417                 }
4418             }
4419         }
4420     }else if(IS_DIRECT(mb_type)){
4421         pred_direct_motion(h, &mb_type);
4422         s->current_picture.mb_type[mb_xy]= mb_type;
4423     }else{
4424         int list, mx, my, i;
4425          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4426         if(IS_16X16(mb_type)){
4427             for(list=0; list<2; list++){
4428                 if(h->ref_count[list]>0){
4429                     if(IS_DIR(mb_type, 0, list)){
4430                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4431                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4432                     }else
4433                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
4434                 }
4435             }
4436             for(list=0; list<2; list++){
4437                 if(IS_DIR(mb_type, 0, list)){
4438                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4439                     mx += get_se_golomb(&s->gb);
4440                     my += get_se_golomb(&s->gb);
4441                     tprintf("final mv:%d %d\n", mx, my);
4442
4443                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
4444                 }else
4445                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
4446             }
4447         }
4448         else if(IS_16X8(mb_type)){
4449             for(list=0; list<2; list++){
4450                 if(h->ref_count[list]>0){
4451                     for(i=0; i<2; i++){
4452                         if(IS_DIR(mb_type, i, list)){
4453                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4454                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4455                         }else
4456                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
4457                     }
4458                 }
4459             }
4460             for(list=0; list<2; list++){
4461                 for(i=0; i<2; i++){
4462                     if(IS_DIR(mb_type, i, list)){
4463                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4464                         mx += get_se_golomb(&s->gb);
4465                         my += get_se_golomb(&s->gb);
4466                         tprintf("final mv:%d %d\n", mx, my);
4467
4468                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
4469                     }else
4470                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
4471                 }
4472             }
4473         }else{
4474             assert(IS_8X16(mb_type));
4475             for(list=0; list<2; list++){
4476                 if(h->ref_count[list]>0){
4477                     for(i=0; i<2; i++){
4478                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4479                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4480                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4481                         }else
4482                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
4483                     }
4484                 }
4485             }
4486             for(list=0; list<2; list++){
4487                 for(i=0; i<2; i++){
4488                     if(IS_DIR(mb_type, i, list)){
4489                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4490                         mx += get_se_golomb(&s->gb);
4491                         my += get_se_golomb(&s->gb);
4492                         tprintf("final mv:%d %d\n", mx, my);
4493
4494                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
4495                     }else
4496                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
4497                 }
4498             }
4499         }
4500     }
4501
4502     if(IS_INTER(mb_type))
4503         write_back_motion(h, mb_type);
4504
4505     if(!IS_INTRA16x16(mb_type)){
4506         cbp= get_ue_golomb(&s->gb);
4507         if(cbp > 47){
4508             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
4509             return -1;
4510         }
4511
4512         if(IS_INTRA4x4(mb_type))
4513             cbp= golomb_to_intra4x4_cbp[cbp];
4514         else
4515             cbp= golomb_to_inter_cbp[cbp];
4516     }
4517
4518     if(cbp || IS_INTRA16x16(mb_type)){
4519         int i8x8, i4x4, chroma_idx;
4520         int chroma_qp, dquant;
4521         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4522         const uint8_t *scan, *dc_scan;
4523
4524 //        fill_non_zero_count_cache(h);
4525
4526         if(IS_INTERLACED(mb_type)){
4527             scan= field_scan;
4528             dc_scan= luma_dc_field_scan;
4529         }else{
4530             scan= zigzag_scan;
4531             dc_scan= luma_dc_zigzag_scan;
4532         }
4533
4534         dquant= get_se_golomb(&s->gb);
4535
4536         if( dquant > 25 || dquant < -26 ){
4537             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4538             return -1;
4539         }
4540
4541         s->qscale += dquant;
4542         if(((unsigned)s->qscale) > 51){
4543             if(s->qscale<0) s->qscale+= 52;
4544             else            s->qscale-= 52;
4545         }
4546
4547         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4548         if(IS_INTRA16x16(mb_type)){
4549             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, s->qscale, 16) < 0){
4550                 return -1; //FIXME continue if partotioned and other retirn -1 too
4551             }
4552
4553             assert((cbp&15) == 0 || (cbp&15) == 15);
4554
4555             if(cbp&15){
4556                 for(i8x8=0; i8x8<4; i8x8++){
4557                     for(i4x4=0; i4x4<4; i4x4++){
4558                         const int index= i4x4 + 4*i8x8;
4559                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, s->qscale, 15) < 0 ){
4560                             return -1;
4561                         }
4562                     }
4563                 }
4564             }else{
4565                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4566             }
4567         }else{
4568             for(i8x8=0; i8x8<4; i8x8++){
4569                 if(cbp & (1<<i8x8)){
4570                     for(i4x4=0; i4x4<4; i4x4++){
4571                         const int index= i4x4 + 4*i8x8;
4572
4573                         if( decode_residual(h, gb, h->mb + 16*index, index, scan, s->qscale, 16) <0 ){
4574                             return -1;
4575                         }
4576                     }
4577                 }else{
4578                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4579                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4580                 }
4581             }
4582         }
4583
4584         if(cbp&0x30){
4585             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4586                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, chroma_qp, 4) < 0){
4587                     return -1;
4588                 }
4589         }
4590
4591         if(cbp&0x20){
4592             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4593                 for(i4x4=0; i4x4<4; i4x4++){
4594                     const int index= 16 + 4*chroma_idx + i4x4;
4595                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, chroma_qp, 15) < 0){
4596                         return -1;
4597                     }
4598                 }
4599             }
4600         }else{
4601             uint8_t * const nnz= &h->non_zero_count_cache[0];
4602             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4603             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4604         }
4605     }else{
4606         uint8_t * const nnz= &h->non_zero_count_cache[0];
4607         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4608         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4609         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4610     }
4611     s->current_picture.qscale_table[mb_xy]= s->qscale;
4612     write_back_non_zero_count(h);
4613
4614     return 0;
4615 }
4616
4617 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4618     uint8_t *state= &h->cabac_state[ctx_base];
4619     int mb_type;
4620
4621     if(intra_slice){
4622         MpegEncContext * const s = &h->s;
4623         const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4624         const int mba_xy = mb_xy - 1;
4625         const int mbb_xy = mb_xy - s->mb_stride;
4626         int ctx=0;
4627         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4628             ctx++;
4629         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4630             ctx++;
4631         if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
4632             return 0;   /* I4x4 */
4633         state += 2;
4634     }else{
4635         if( get_cabac( &h->cabac, &state[0] ) == 0 )
4636             return 0;   /* I4x4 */
4637     }
4638
4639     if( get_cabac_terminate( &h->cabac ) )
4640         return 25;  /* PCM */
4641
4642     mb_type = 1; /* I16x16 */
4643     if( get_cabac( &h->cabac, &state[1] ) )
4644         mb_type += 12;  /* cbp_luma != 0 */
4645
4646     if( get_cabac( &h->cabac, &state[2] ) ) {
4647         if( get_cabac( &h->cabac, &state[2+intra_slice] ) )
4648             mb_type += 4 * 2;   /* cbp_chroma == 2 */
4649         else
4650             mb_type += 4 * 1;   /* cbp_chroma == 1 */
4651     }
4652     if( get_cabac( &h->cabac, &state[3+intra_slice] ) )
4653         mb_type += 2;
4654     if( get_cabac( &h->cabac, &state[3+2*intra_slice] ) )
4655         mb_type += 1;
4656     return mb_type;
4657 }
4658
4659 static int decode_cabac_mb_type( H264Context *h ) {
4660     MpegEncContext * const s = &h->s;
4661
4662     if( h->slice_type == I_TYPE ) {
4663         return decode_cabac_intra_mb_type(h, 3, 1);
4664     } else if( h->slice_type == P_TYPE ) {
4665         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4666             /* P-type */
4667             if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4668                 if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 )
4669                     return 0; /* P_L0_D16x16; */
4670                 else
4671                     return 3; /* P_8x8; */
4672             } else {
4673                 if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
4674                     return 2; /* P_L0_D8x16; */
4675                 else
4676                     return 1; /* P_L0_D16x8; */
4677             }
4678         } else {
4679             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4680         }
4681     } else if( h->slice_type == B_TYPE ) {
4682         const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4683         const int mba_xy = mb_xy - 1;
4684         const int mbb_xy = mb_xy - s->mb_stride;
4685         int ctx = 0;
4686         int bits;
4687
4688         if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] )
4689                       && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4690             ctx++;
4691         if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] )
4692                       && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4693             ctx++;
4694
4695         if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
4696             return 0; /* B_Direct_16x16 */
4697
4698         if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
4699             return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4700         }
4701
4702         bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
4703         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
4704         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
4705         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
4706         if( bits < 8 )
4707             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4708         else if( bits == 13 ) {
4709             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4710         } else if( bits == 14 )
4711             return 11; /* B_L1_L0_8x16 */
4712         else if( bits == 15 )
4713             return 22; /* B_8x8 */
4714
4715         bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
4716         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4717     } else {
4718         /* TODO SI/SP frames? */
4719         return -1;
4720     }
4721 }
4722
4723 static int decode_cabac_mb_skip( H264Context *h) {
4724     MpegEncContext * const s = &h->s;
4725     const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4726     const int mba_xy = mb_xy - 1;
4727     const int mbb_xy = mb_xy - s->mb_stride;
4728     int ctx = 0;
4729
4730     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4731         ctx++;
4732     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4733         ctx++;
4734
4735     if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
4736         return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
4737     else /* B-frame */
4738         return get_cabac( &h->cabac, &h->cabac_state[24+ctx] );
4739 }
4740
4741 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4742     int mode = 0;
4743
4744     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4745         return pred_mode;
4746
4747     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
4748         mode += 1;
4749     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
4750         mode += 2;
4751     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
4752         mode += 4;
4753     if( mode >= pred_mode )
4754         return mode + 1;
4755     else
4756         return mode;
4757 }
4758
4759 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4760     MpegEncContext * const s = &h->s;
4761     const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4762     const int mba_xy = mb_xy - 1;
4763     const int mbb_xy = mb_xy - s->mb_stride;
4764
4765     int ctx = 0;
4766
4767     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4768     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4769         ctx++;
4770
4771     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4772         ctx++;
4773
4774     if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4775         return 0;
4776
4777     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4778         return 1;
4779     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4780         return 2;
4781     else
4782         return 3;
4783 }
4784
/** x coordinate (in 4x4 block units) of each of the 16 luma blocks, by block index */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3,
};

/** y coordinate (in 4x4 block units) of each of the 16 luma blocks, by block index */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3,
};

/** inverse of the two tables above: block index for a given [x][y] position */
static const uint8_t block_idx_xy[4][4] = {
    /* x=0 */ {  0,  2,  8, 10 },
    /* x=1 */ {  1,  3,  9, 11 },
    /* x=2 */ {  4,  6, 12, 14 },
    /* x=3 */ {  5,  7, 13, 15 },
};
4797
4798 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4799     MpegEncContext * const s = &h->s;
4800     const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4801
4802     int cbp = 0;
4803     int i8x8;
4804
4805     h->cbp_table[mb_xy] = 0;  /* FIXME aaahahahah beurk */
4806
4807     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
4808         int mba_xy = -1;
4809         int mbb_xy = -1;
4810         int x, y;
4811         int ctx = 0;
4812
4813         x = block_idx_x[4*i8x8];
4814         y = block_idx_y[4*i8x8];
4815
4816         if( x > 0 )
4817             mba_xy = mb_xy;
4818         else if( s->mb_x > 0 ) {
4819             mba_xy = mb_xy - 1;
4820             if (h->slice_table[mba_xy] != h->slice_num) {
4821                 mba_xy = -1;
4822             }
4823         }
4824
4825         if( y > 0 )
4826             mbb_xy = mb_xy;
4827         else if( s->mb_y > 0 ) {
4828             mbb_xy = mb_xy - s->mb_stride;
4829             if (h->slice_table[mbb_xy] != h->slice_num) {
4830                 mbb_xy = -1;
4831             }
4832         }
4833
4834         /* No need to test for skip as we put 0 for skip block */
4835         /* No need to test for IPCM as we put 1 for IPCM block */
4836         if( mba_xy >= 0 ) {
4837             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
4838             if( ((h->cbp_table[mba_xy] >> i8x8a)&0x01) == 0 )
4839                 ctx++;
4840         }
4841
4842         if( mbb_xy >= 0 ) {
4843             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
4844             if( ((h->cbp_table[mbb_xy] >> i8x8b)&0x01) == 0 )
4845                 ctx += 2;
4846         }
4847
4848         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
4849             cbp |= 1 << i8x8;
4850             h->cbp_table[mb_xy] = cbp;  /* FIXME aaahahahah beurk */
4851         }
4852     }
4853     return cbp;
4854 }
4855 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4856     int ctx;
4857     int cbp_a, cbp_b;
4858
4859     cbp_a = (h->left_cbp>>4)&0x03;
4860     cbp_b = (h-> top_cbp>>4)&0x03;
4861
4862     ctx = 0;
4863     if( cbp_a > 0 ) ctx++;
4864     if( cbp_b > 0 ) ctx += 2;
4865     if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4866         return 0;
4867
4868     ctx = 4;
4869     if( cbp_a == 2 ) ctx++;
4870     if( cbp_b == 2 ) ctx += 2;
4871     return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
4872 }
4873 static int decode_cabac_mb_dqp( H264Context *h) {
4874     MpegEncContext * const s = &h->s;
4875     int mbn_xy;
4876     int   ctx = 0;
4877     int   val = 0;
4878
4879     if( s->mb_x > 0 )
4880         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
4881     else
4882         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
4883
4884     if( h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
4885         ctx++;
4886
4887     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4888         if( ctx < 2 )
4889             ctx = 2;
4890         else
4891             ctx = 3;
4892         val++;
4893     }
4894
4895     if( val&0x01 )
4896         return (val + 1)/2;
4897     else
4898         return -(val + 1)/2;
4899 }
4900 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4901     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4902         return 0;   /* 8x8 */
4903     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4904         return 1;   /* 8x4 */
4905     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4906         return 2;   /* 4x8 */
4907     return 3;       /* 4x4 */
4908 }
4909 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4910     int type;
4911     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4912         return 0;   /* B_Direct_8x8 */
4913     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4914         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4915     type = 3;
4916     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4917         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4918             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4919         type += 4;
4920     }
4921     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4922     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4923     return type;
4924 }
4925
4926 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
4927     int refa = h->ref_cache[list][scan8[n] - 1];
4928     int refb = h->ref_cache[list][scan8[n] - 8];
4929     int ref  = 0;
4930     int ctx  = 0;
4931
4932     if( h->slice_type == B_TYPE) {
4933         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
4934             ctx++;
4935         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
4936             ctx += 2;
4937     } else {
4938         if( refa > 0 )
4939             ctx++;
4940         if( refb > 0 )
4941             ctx += 2;
4942     }
4943
4944     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
4945         ref++;
4946         if( ctx < 4 )
4947             ctx = 4;
4948         else
4949             ctx = 5;
4950     }
4951     return ref;
4952 }
4953
4954 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
4955     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
4956                abs( h->mvd_cache[list][scan8[n] - 8][l] );
4957     int ctxbase = (l == 0) ? 40 : 47;
4958     int ctx, mvd;
4959
4960     if( amvd < 3 )
4961         ctx = 0;
4962     else if( amvd > 32 )
4963         ctx = 2;
4964     else
4965         ctx = 1;
4966
4967     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
4968         return 0;
4969
4970     mvd= 1;
4971     ctx= 3;
4972     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
4973         mvd++;
4974         if( ctx < 6 )
4975             ctx++;
4976     }
4977
4978     if( mvd >= 9 ) {
4979         int k = 3;
4980         while( get_cabac_bypass( &h->cabac ) ) {
4981             mvd += 1 << k;
4982             k++;
4983         }
4984         while( k-- ) {
4985             if( get_cabac_bypass( &h->cabac ) )
4986                 mvd += 1 << k;
4987         }
4988     }
4989     if( get_cabac_bypass( &h->cabac ) )  return -mvd;
4990     else                                 return  mvd;
4991 }
4992
4993 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
4994     int nza, nzb;
4995     int ctx = 0;
4996
4997     if( cat == 0 ) {
4998         nza = h->left_cbp&0x100;
4999         nzb = h-> top_cbp&0x100;
5000     } else if( cat == 1 || cat == 2 ) {
5001         nza = h->non_zero_count_cache[scan8[idx] - 1];
5002         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5003     } else if( cat == 3 ) {
5004         nza = (h->left_cbp>>(6+idx))&0x01;
5005         nzb = (h-> top_cbp>>(6+idx))&0x01;
5006     } else {
5007         assert(cat == 4);
5008         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5009         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5010     }
5011
5012     if( nza > 0 )
5013         ctx++;
5014
5015     if( nzb > 0 )
5016         ctx += 2;
5017
5018     return ctx + 4 * cat;
5019 }
5020
5021 static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) {
5022     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5023     const uint16_t *qmul= dequant_coeff[qp];
5024     static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
5025     static const int coeff_abs_level_m1_offset[5] = {227+ 0, 227+10, 227+20, 227+30, 227+39 };
5026
5027     int index[16];
5028
5029     int i, last;
5030     int coeff_count = 0;
5031
5032     int abslevel1 = 1;
5033     int abslevelgt1 = 0;
5034
5035     /* cat: 0-> DC 16x16  n = 0
5036      *      1-> AC 16x16  n = luma4x4idx
5037      *      2-> Luma4x4   n = luma4x4idx
5038      *      3-> DC Chroma n = iCbCr
5039      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5040      */
5041
5042     /* read coded block flag */
5043     if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5044         if( cat == 1 || cat == 2 )
5045             h->non_zero_count_cache[scan8[n]] = 0;
5046         else if( cat == 4 )
5047             h->non_zero_count_cache[scan8[16+n]] = 0;
5048
5049         return 0;
5050     }
5051
5052     for(last= 0; last < max_coeff - 1; last++) {
5053         if( get_cabac( &h->cabac, &h->cabac_state[105+significant_coeff_flag_offset[cat]+last] )) {
5054             index[coeff_count++] = last;
5055             if( get_cabac( &h->cabac, &h->cabac_state[166+significant_coeff_flag_offset[cat]+last] ) ) {
5056                 last= max_coeff;
5057                 break;
5058             }
5059         }
5060     }
5061     if( last == max_coeff -1 ) {
5062         index[coeff_count++] = last;
5063     }
5064     assert(coeff_count > 0);
5065
5066     if( cat == 0 )
5067         h->cbp_table[mb_xy] |= 0x100;
5068     else if( cat == 1 || cat == 2 )
5069         h->non_zero_count_cache[scan8[n]] = coeff_count;
5070     else if( cat == 3 )
5071         h->cbp_table[mb_xy] |= 0x40 << n;
5072     else {
5073         assert( cat == 4 );
5074         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5075     }
5076
5077     for( i = coeff_count - 1; i >= 0; i-- ) {
5078         int ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + coeff_abs_level_m1_offset[cat];
5079         int j= scantable[index[i]];
5080
5081         if( get_cabac( &h->cabac, &h->cabac_state[ctx] ) == 0 ) {
5082             if( cat == 0 || cat == 3 ) {
5083                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
5084                 else                                block[j] =  1;
5085             }else{
5086                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -qmul[j];
5087                 else                                block[j] =  qmul[j];
5088             }
5089
5090             abslevel1++;
5091         } else {
5092             int coeff_abs = 2;
5093             ctx = 5 + FFMIN( 4, abslevelgt1 ) + coeff_abs_level_m1_offset[cat];
5094             while( coeff_abs < 15 && get_cabac( &h->cabac, &h->cabac_state[ctx] ) ) {
5095                 coeff_abs++;
5096             }
5097
5098             if( coeff_abs >= 15 ) {
5099                 int j = 0;
5100                 while( get_cabac_bypass( &h->cabac ) ) {
5101                     coeff_abs += 1 << j;
5102                     j++;
5103                 }
5104
5105                 while( j-- ) {
5106                     if( get_cabac_bypass( &h->cabac ) )
5107                         coeff_abs += 1 << j ;
5108                 }
5109             }
5110
5111             if( cat == 0 || cat == 3 ) {
5112                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
5113                 else                                block[j] =  coeff_abs;
5114             }else{
5115                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs * qmul[j];
5116                 else                                block[j] =  coeff_abs * qmul[j];
5117             }
5118
5119             abslevelgt1++;
5120         }
5121     }
5122     return 0;
5123 }
5124
5125 /**
5126  * decodes a macroblock
5127  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5128  */
5129 static int decode_mb_cabac(H264Context *h) {
5130     MpegEncContext * const s = &h->s;
5131     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5132     int mb_type, partition_count, cbp = 0;
5133
5134     s->dsp.clear_blocks(h->mb); //FIXME avoid if allready clear (move after skip handlong?)
5135
5136     if( h->sps.mb_aff ) {
5137         av_log( h->s.avctx, AV_LOG_ERROR, "Fields not supported with CABAC\n" );
5138         return -1;
5139     }
5140
5141     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5142     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5143         /* read skip flags */
5144         if( decode_cabac_mb_skip( h ) ) {
5145             decode_mb_skip(h);
5146
5147             h->cbp_table[mb_xy] = 0;
5148             h->chroma_pred_mode_table[mb_xy] = 0;
5149             h->last_qscale_diff = 0;
5150
5151             return 0;
5152
5153         }
5154     }
5155     h->prev_mb_skiped = 0;
5156
5157     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5158         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5159         return -1;
5160     }
5161
5162     if( h->slice_type == B_TYPE ) {
5163         if( mb_type < 23 ){
5164             partition_count= b_mb_type_info[mb_type].partition_count;
5165             mb_type=         b_mb_type_info[mb_type].type;
5166         }else{
5167             mb_type -= 23;
5168             goto decode_intra_mb;
5169         }
5170     } else if( h->slice_type == P_TYPE ) {
5171         if( mb_type < 5) {
5172             partition_count= p_mb_type_info[mb_type].partition_count;
5173             mb_type=         p_mb_type_info[mb_type].type;
5174         } else {
5175             mb_type -= 5;
5176             goto decode_intra_mb;
5177         }
5178     } else {
5179        assert(h->slice_type == I_TYPE);
5180 decode_intra_mb:
5181         partition_count = 0;
5182         cbp= i_mb_type_info[mb_type].cbp;
5183         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5184         mb_type= i_mb_type_info[mb_type].type;
5185     }
5186 #if 0
5187     if(h->mb_field_decoding_flag)
5188         mb_type |= MB_TYPE_INTERLACED;
5189 #endif
5190
5191     s->current_picture.mb_type[mb_xy]= mb_type;
5192     h->slice_table[ mb_xy ]= h->slice_num;
5193
5194     if(IS_INTRA_PCM(mb_type)) {
5195         const uint8_t *ptr;
5196         unsigned int x, y;
5197
5198         // We assume these blocks are very rare so we dont optimize it.
5199         // FIXME The two following lines get the bitstream position in the cabac
5200         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5201         ptr= h->cabac.bytestream;
5202         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
5203
5204         // The pixels are stored in the same order as levels in h->mb array.
5205         for(y=0; y<16; y++){
5206             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5207             for(x=0; x<16; x++){
5208                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
5209                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5210             }
5211         }
5212         for(y=0; y<8; y++){
5213             const int index= 256 + 4*(y&3) + 32*(y>>2);
5214             for(x=0; x<8; x++){
5215                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5216                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5217             }
5218         }
5219         for(y=0; y<8; y++){
5220             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5221             for(x=0; x<8; x++){
5222                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5223                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5224             }
5225         }
5226
5227         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5228
5229         // All blocks are presents
5230         h->cbp_table[mb_xy] = 0x1ef;
5231         h->chroma_pred_mode_table[mb_xy] = 0;
5232         // In deblocking, the quantiser is 0
5233         s->current_picture.qscale_table[mb_xy]= 0;
5234         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5235         // All coeffs are presents
5236         memset(h->non_zero_count[mb_xy], 16, 16);
5237         return 0;
5238     }
5239
5240     fill_caches(h, mb_type, 0);
5241
5242     if( IS_INTRA( mb_type ) ) {
5243         if( IS_INTRA4x4( mb_type ) ) {
5244             int i;
5245             for( i = 0; i < 16; i++ ) {
5246                 int pred = pred_intra_mode( h, i );
5247                 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5248
5249                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5250             }
5251             write_back_intra_pred_mode(h);
5252             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5253         } else {
5254             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5255             if( h->intra16x16_pred_mode < 0 ) return -1;
5256         }
5257         h->chroma_pred_mode_table[mb_xy] =
5258             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
5259
5260         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
5261         if( h->chroma_pred_mode < 0 ) return -1;
5262     } else if( partition_count == 4 ) {
5263         int i, j, sub_partition_count[4], list, ref[2][4];
5264
5265         if( h->slice_type == B_TYPE ) {
5266             for( i = 0; i < 4; i++ ) {
5267                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5268                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5269                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5270             }
5271             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5272                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5273                 pred_direct_motion(h, &mb_type);
5274                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5275                     for( i = 0; i < 4; i++ )
5276                         if( IS_DIRECT(h->sub_mb_type[i]) )
5277                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5278                 }
5279             }
5280         } else {
5281             for( i = 0; i < 4; i++ ) {
5282                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5283                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5284                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5285             }
5286         }
5287
5288         for( list = 0; list < 2; list++ ) {
5289             if( h->ref_count[list] > 0 ) {
5290                 for( i = 0; i < 4; i++ ) {
5291                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5292                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5293                         if( h->ref_count[list] > 1 )
5294                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5295                         else
5296                             ref[list][i] = 0;
5297                     } else {
5298                         ref[list][i] = -1;
5299                     }
5300                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5301                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5302                 }
5303             }
5304         }
5305
5306         for(list=0; list<2; list++){
5307             for(i=0; i<4; i++){
5308                 if(IS_DIRECT(h->sub_mb_type[i])){
5309                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5310                     continue;
5311                 }
5312                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5313
5314                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5315                     const int sub_mb_type= h->sub_mb_type[i];
5316                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5317                     for(j=0; j<sub_partition_count[i]; j++){
5318                         int mpx, mpy;
5319                         int mx, my;
5320                         const int index= 4*i + block_width*j;
5321                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5322                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5323                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5324
5325                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5326                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5327                         tprintf("final mv:%d %d\n", mx, my);
5328
5329                         if(IS_SUB_8X8(sub_mb_type)){
5330                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5331                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5332                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5333                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5334
5335                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
5336                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5337                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
5338                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5339                         }else if(IS_SUB_8X4(sub_mb_type)){
5340                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5341                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5342
5343                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
5344                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
5345                         }else if(IS_SUB_4X8(sub_mb_type)){
5346                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5347                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5348
5349                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
5350                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
5351                         }else{
5352                             assert(IS_SUB_4X4(sub_mb_type));
5353                             mv_cache[ 0 ][0]= mx;
5354                             mv_cache[ 0 ][1]= my;
5355
5356                             mvd_cache[ 0 ][0]= mx - mpx;
5357                             mvd_cache[ 0 ][1]= my - mpy;
5358                         }
5359                     }
5360                 }else{
5361                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5362                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5363                     p[0] = p[1] = p[8] = p[9] = 0;
5364                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5365                 }
5366             }
5367         }
5368     } else if( IS_DIRECT(mb_type) ) {
5369         pred_direct_motion(h, &mb_type);
5370         s->current_picture.mb_type[mb_xy]= mb_type;
5371         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5372         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5373     } else {
5374         int list, mx, my, i, mpx, mpy;
5375         if(IS_16X16(mb_type)){
5376             for(list=0; list<2; list++){
5377                 if(IS_DIR(mb_type, 0, list)){
5378                     if(h->ref_count[list] > 0 ){
5379                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5380                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5381                     }
5382                 }else
5383                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
5384             }
5385             for(list=0; list<2; list++){
5386                 if(IS_DIR(mb_type, 0, list)){
5387                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5388
5389                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5390                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5391                     tprintf("final mv:%d %d\n", mx, my);
5392
5393                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5394                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5395                 }else
5396                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5397             }
5398         }
5399         else if(IS_16X8(mb_type)){
5400             for(list=0; list<2; list++){
5401                 if(h->ref_count[list]>0){
5402                     for(i=0; i<2; i++){
5403                         if(IS_DIR(mb_type, i, list)){
5404                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5405                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5406                         }else
5407                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5408                     }
5409                 }
5410             }
5411             for(list=0; list<2; list++){
5412                 for(i=0; i<2; i++){
5413                     if(IS_DIR(mb_type, i, list)){
5414                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5415                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5416                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5417                         tprintf("final mv:%d %d\n", mx, my);
5418
5419                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5420                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5421                     }else{
5422                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5423                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5424                     }
5425                 }
5426             }
5427         }else{
5428             assert(IS_8X16(mb_type));
5429             for(list=0; list<2; list++){
5430                 if(h->ref_count[list]>0){
5431                     for(i=0; i<2; i++){
5432                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5433                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5434                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5435                         }else
5436                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5437                     }
5438                 }
5439             }
5440             for(list=0; list<2; list++){
5441                 for(i=0; i<2; i++){
5442                     if(IS_DIR(mb_type, i, list)){
5443                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5444                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5445                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5446
5447                         tprintf("final mv:%d %d\n", mx, my);
5448                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5449                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5450                     }else{
5451                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5452                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5453                     }
5454                 }
5455             }
5456         }
5457     }
5458
5459    if( IS_INTER( mb_type ) ) {
5460         h->chroma_pred_mode_table[mb_xy] = 0;
5461         write_back_motion( h, mb_type );
5462    }
5463
5464     if( !IS_INTRA16x16( mb_type ) ) {
5465         cbp  = decode_cabac_mb_cbp_luma( h );
5466         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5467     }
5468
5469     h->cbp_table[mb_xy] = cbp;
5470
5471     if( cbp || IS_INTRA16x16( mb_type ) ) {
5472         const uint8_t *scan, *dc_scan;
5473         int dqp;
5474
5475         if(IS_INTERLACED(mb_type)){
5476             scan= field_scan;
5477             dc_scan= luma_dc_field_scan;
5478         }else{
5479             scan= zigzag_scan;
5480             dc_scan= luma_dc_zigzag_scan;
5481         }
5482
5483         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5484         s->qscale += dqp;
5485         if(((unsigned)s->qscale) > 51){
5486             if(s->qscale<0) s->qscale+= 52;
5487             else            s->qscale-= 52;
5488         }
5489         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5490
5491         if( IS_INTRA16x16( mb_type ) ) {
5492             int i;
5493             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5494             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, s->qscale, 16) < 0)
5495                 return -1;
5496             if( cbp&15 ) {
5497                 for( i = 0; i < 16; i++ ) {
5498                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5499                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, s->qscale, 15) < 0 )
5500                         return -1;
5501                 }
5502             } else {
5503                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5504             }
5505         } else {
5506             int i8x8, i4x4;
5507             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5508                 if( cbp & (1<<i8x8) ) {
5509                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5510                         const int index = 4*i8x8 + i4x4;
5511                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5512                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, s->qscale, 16) < 0 )
5513                             return -1;
5514                     }
5515                 } else {
5516                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5517                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5518                 }
5519             }
5520         }
5521
5522         if( cbp&0x30 ){
5523             int c;
5524             for( c = 0; c < 2; c++ ) {
5525                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5526                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, h->chroma_qp, 4) < 0)
5527                     return -1;
5528             }
5529         }
5530
5531         if( cbp&0x20 ) {
5532             int c, i;
5533             for( c = 0; c < 2; c++ ) {
5534                 for( i = 0; i < 4; i++ ) {
5535                     const int index = 16 + 4 * c + i;
5536                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5537                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->chroma_qp, 15) < 0)
5538                         return -1;
5539                 }
5540             }
5541         } else {
5542             uint8_t * const nnz= &h->non_zero_count_cache[0];
5543             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5544             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5545         }
5546     } else {
5547         uint8_t * const nnz= &h->non_zero_count_cache[0];
5548         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5549         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5550         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5551     }
5552
5553     s->current_picture.qscale_table[mb_xy]= s->qscale;
5554     write_back_non_zero_count(h);
5555
5556     return 0;
5557 }
5558
5559
5560 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5561     int i, d;
5562     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5563     const int alpha = alpha_table[index_a];
5564     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5565
5566     for( i = 0; i < 4; i++ ) {
5567         if( bS[i] == 0 ) {
5568             pix += 4 * stride;
5569             continue;
5570         }
5571
5572         if( bS[i] < 4 ) {
5573             const int tc0 = tc0_table[index_a][bS[i] - 1];
5574             /* 4px edge length */
5575             for( d = 0; d < 4; d++ ) {
5576                 const int p0 = pix[-1];
5577                 const int p1 = pix[-2];
5578                 const int p2 = pix[-3];
5579                 const int q0 = pix[0];
5580                 const int q1 = pix[1];
5581                 const int q2 = pix[2];
5582
5583                 if( ABS( p0 - q0 ) < alpha &&
5584                     ABS( p1 - p0 ) < beta &&
5585                     ABS( q1 - q0 ) < beta ) {
5586                     int tc = tc0;
5587                     int i_delta;
5588
5589                     if( ABS( p2 - p0 ) < beta ) {
5590                         pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5591                         tc++;
5592                     }
5593                     if( ABS( q2 - q0 ) < beta ) {
5594                         pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5595                         tc++;
5596                     }
5597
5598                     i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5599                     pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
5600                     pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
5601                     tprintf("filter_mb_edgev i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], pix[-3], p1, p0, q0, q1, pix[2], pix[-2], pix[-1], pix[0], pix[1]);
5602                 }
5603                 pix += stride;
5604             }
5605         }else{
5606             /* 4px edge length */
5607             for( d = 0; d < 4; d++ ) {
5608                 const int p0 = pix[-1];
5609                 const int p1 = pix[-2];
5610                 const int p2 = pix[-3];
5611
5612                 const int q0 = pix[0];
5613                 const int q1 = pix[1];
5614                 const int q2 = pix[2];
5615
5616                 if( ABS( p0 - q0 ) < alpha &&
5617                     ABS( p1 - p0 ) < beta &&
5618                     ABS( q1 - q0 ) < beta ) {
5619
5620                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5621                         if( ABS( p2 - p0 ) < beta)
5622                         {
5623                             const int p3 = pix[-4];
5624                             /* p0', p1', p2' */
5625                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5626                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5627                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5628                         } else {
5629                             /* p0' */
5630                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5631                         }
5632                         if( ABS( q2 - q0 ) < beta)
5633                         {
5634                             const int q3 = pix[3];
5635                             /* q0', q1', q2' */
5636                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5637                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5638                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5639                         } else {
5640                             /* q0' */
5641                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5642                         }
5643                     }else{
5644                         /* p0', q0' */
5645                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5646                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5647                     }
5648                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5649                 }
5650                 pix += stride;
5651             }
5652         }
5653     }
5654 }
5655 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5656     int i, d;
5657     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5658     const int alpha = alpha_table[index_a];
5659     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5660
5661     for( i = 0; i < 4; i++ ) {
5662         if( bS[i] == 0 ) {
5663             pix += 2 * stride;
5664             continue;
5665         }
5666
5667         if( bS[i] < 4 ) {
5668             const int tc = tc0_table[index_a][bS[i] - 1] + 1;
5669             /* 2px edge length (because we use same bS than the one for luma) */
5670             for( d = 0; d < 2; d++ ){
5671                 const int p0 = pix[-1];
5672                 const int p1 = pix[-2];
5673                 const int q0 = pix[0];
5674                 const int q1 = pix[1];
5675
5676                 if( ABS( p0 - q0 ) < alpha &&
5677                     ABS( p1 - p0 ) < beta &&
5678                     ABS( q1 - q0 ) < beta ) {
5679                     const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5680
5681                     pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
5682                     pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
5683                     tprintf("filter_mb_edgecv i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5684                 }
5685                 pix += stride;
5686             }
5687         }else{
5688             /* 2px edge length (because we use same bS than the one for luma) */
5689             for( d = 0; d < 2; d++ ){
5690                 const int p0 = pix[-1];
5691                 const int p1 = pix[-2];
5692                 const int q0 = pix[0];
5693                 const int q1 = pix[1];
5694
5695                 if( ABS( p0 - q0 ) < alpha &&
5696                     ABS( p1 - p0 ) < beta &&
5697                     ABS( q1 - q0 ) < beta ) {
5698
5699                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
5700                     pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
5701                     tprintf("filter_mb_edgecv i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5702                 }
5703                 pix += stride;
5704             }
5705         }
5706     }
5707 }
5708
5709 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
5710     int i;
5711     for( i = 0; i < 16; i++, pix += stride) {
5712         int index_a;
5713         int alpha;
5714         int beta;
5715
5716         int qp_index;
5717         int bS_index = (i >> 1);
5718         if (h->mb_field_decoding_flag) {
5719             bS_index &= ~1;
5720             bS_index |= (i & 1);
5721         }
5722
5723         if( bS[bS_index] == 0 ) {
5724             continue;
5725         }
5726
5727         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
5728         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
5729         alpha = alpha_table[index_a];
5730         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
5731
5732
5733         if( bS[bS_index] < 4 ) {
5734             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
5735             /* 4px edge length */
5736             const int p0 = pix[-1];
5737             const int p1 = pix[-2];
5738             const int p2 = pix[-3];
5739             const int q0 = pix[0];
5740             const int q1 = pix[1];
5741             const int q2 = pix[2];
5742
5743             if( ABS( p0 - q0 ) < alpha &&
5744                 ABS( p1 - p0 ) < beta &&
5745                 ABS( q1 - q0 ) < beta ) {
5746                 int tc = tc0;
5747                 int i_delta;
5748
5749                 if( ABS( p2 - p0 ) < beta ) {
5750                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5751                     tc++;
5752                 }
5753                 if( ABS( q2 - q0 ) < beta ) {
5754                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5755                     tc++;
5756                 }
5757
5758                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5759                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
5760                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
5761                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5762             }
5763         }else{
5764             /* 4px edge length */
5765             const int p0 = pix[-1];
5766             const int p1 = pix[-2];
5767             const int p2 = pix[-3];
5768
5769             const int q0 = pix[0];
5770             const int q1 = pix[1];
5771             const int q2 = pix[2];
5772
5773             if( ABS( p0 - q0 ) < alpha &&
5774                 ABS( p1 - p0 ) < beta &&
5775                 ABS( q1 - q0 ) < beta ) {
5776
5777                 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5778                     if( ABS( p2 - p0 ) < beta)
5779                     {
5780                         const int p3 = pix[-4];
5781                         /* p0', p1', p2' */
5782                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5783                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5784                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5785                     } else {
5786                         /* p0' */
5787                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5788                     }
5789                     if( ABS( q2 - q0 ) < beta)
5790                     {
5791                         const int q3 = pix[3];
5792                         /* q0', q1', q2' */
5793                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5794                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5795                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5796                     } else {
5797                         /* q0' */
5798                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5799                     }
5800                 }else{
5801                     /* p0', q0' */
5802                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5803                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5804                 }
5805                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5806             }
5807         }
5808     }
5809 }
5810 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp[2] ) {
5811     int i;
5812     for( i = 0; i < 8; i++, pix += stride) {
5813         int index_a;
5814         int alpha;
5815         int beta;
5816
5817         int qp_index;
5818         int bS_index = i;
5819
5820         if( bS[bS_index] == 0 ) {
5821             continue;
5822         }
5823
5824         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
5825         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
5826         alpha = alpha_table[index_a];
5827         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
5828         if( bS[bS_index] < 4 ) {
5829             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
5830             /* 2px edge length (because we use same bS than the one for luma) */
5831             const int p0 = pix[-1];
5832             const int p1 = pix[-2];
5833             const int q0 = pix[0];
5834             const int q1 = pix[1];
5835
5836             if( ABS( p0 - q0 ) < alpha &&
5837                 ABS( p1 - p0 ) < beta &&
5838                 ABS( q1 - q0 ) < beta ) {
5839                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5840
5841                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
5842                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
5843                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5844             }
5845         }else{
5846             const int p0 = pix[-1];
5847             const int p1 = pix[-2];
5848             const int q0 = pix[0];
5849             const int q1 = pix[1];
5850
5851             if( ABS( p0 - q0 ) < alpha &&
5852                 ABS( p1 - p0 ) < beta &&
5853                 ABS( q1 - q0 ) < beta ) {
5854
5855                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
5856                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
5857                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5858             }
5859         }
5860     }
5861 }
5862
5863 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5864     int i, d;
5865     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5866     const int alpha = alpha_table[index_a];
5867     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5868     const int pix_next  = stride;
5869
5870     for( i = 0; i < 4; i++ ) {
5871         if( bS[i] == 0 ) {
5872             pix += 4;
5873             continue;
5874         }
5875
5876         if( bS[i] < 4 ) {
5877             const int tc0 = tc0_table[index_a][bS[i] - 1];
5878             /* 4px edge length */
5879             for( d = 0; d < 4; d++ ) {
5880                 const int p0 = pix[-1*pix_next];
5881                 const int p1 = pix[-2*pix_next];
5882                 const int p2 = pix[-3*pix_next];
5883                 const int q0 = pix[0];
5884                 const int q1 = pix[1*pix_next];
5885                 const int q2 = pix[2*pix_next];
5886
5887                 if( ABS( p0 - q0 ) < alpha &&
5888                     ABS( p1 - p0 ) < beta &&
5889                     ABS( q1 - q0 ) < beta ) {
5890
5891                     int tc = tc0;
5892                     int i_delta;
5893
5894                     if( ABS( p2 - p0 ) < beta ) {
5895                         pix[-2*pix_next] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5896                         tc++;
5897                     }
5898                     if( ABS( q2 - q0 ) < beta ) {
5899                         pix[pix_next] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5900                         tc++;
5901                     }
5902
5903                     i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5904                     pix[-pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
5905                     pix[0]         = clip_uint8( q0 - i_delta );    /* q0' */
5906                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
5907                 }
5908                 pix++;
5909             }
5910         }else{
5911             /* 4px edge length */
5912             for( d = 0; d < 4; d++ ) {
5913                 const int p0 = pix[-1*pix_next];
5914                 const int p1 = pix[-2*pix_next];
5915                 const int p2 = pix[-3*pix_next];
5916                 const int q0 = pix[0];
5917                 const int q1 = pix[1*pix_next];
5918                 const int q2 = pix[2*pix_next];
5919
5920                 if( ABS( p0 - q0 ) < alpha &&
5921                     ABS( p1 - p0 ) < beta &&
5922                     ABS( q1 - q0 ) < beta ) {
5923
5924                     const int p3 = pix[-4*pix_next];
5925                     const int q3 = pix[ 3*pix_next];
5926
5927                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5928                         if( ABS( p2 - p0 ) < beta) {
5929                             /* p0', p1', p2' */
5930                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5931                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5932                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5933                         } else {
5934                             /* p0' */
5935                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5936                         }
5937                         if( ABS( q2 - q0 ) < beta) {
5938                             /* q0', q1', q2' */
5939                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5940                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5941                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5942                         } else {
5943                             /* q0' */
5944                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5945                         }
5946                     }else{
5947                         /* p0', q0' */
5948                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5949                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5950                     }
5951                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
5952                 }
5953                 pix++;
5954             }
5955         }
5956     }
5957 }
5958
5959 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5960     int i, d;
5961     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5962     const int alpha = alpha_table[index_a];
5963     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5964     const int pix_next  = stride;
5965
5966     for( i = 0; i < 4; i++ )
5967     {
5968         if( bS[i] == 0 ) {
5969             pix += 2;
5970             continue;
5971         }
5972
5973         if( bS[i] < 4 ) {
5974             int tc = tc0_table[index_a][bS[i] - 1] + 1;
5975             /* 2px edge length (see deblocking_filter_edgecv) */
5976             for( d = 0; d < 2; d++ ) {
5977                 const int p0 = pix[-1*pix_next];
5978                 const int p1 = pix[-2*pix_next];
5979                 const int q0 = pix[0];
5980                 const int q1 = pix[1*pix_next];
5981
5982                 if( ABS( p0 - q0 ) < alpha &&
5983                     ABS( p1 - p0 ) < beta &&
5984                     ABS( q1 - q0 ) < beta ) {
5985
5986                     int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5987
5988                     pix[-pix_next] = clip_uint8( p0 + i_delta );    /* p0' */
5989                     pix[0]         = clip_uint8( q0 - i_delta );    /* q0' */
5990                     tprintf("filter_mb_edgech i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, tc, bS[i], pix[-3*pix_next], p1, p0, q0, q1, pix[2*pix_next], pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
5991                 }
5992                 pix++;
5993             }
5994         }else{
5995             /* 2px edge length (see deblocking_filter_edgecv) */
5996             for( d = 0; d < 2; d++ ) {
5997                 const int p0 = pix[-1*pix_next];
5998                 const int p1 = pix[-2*pix_next];
5999                 const int q0 = pix[0];
6000                 const int q1 = pix[1*pix_next];
6001
6002                 if( ABS( p0 - q0 ) < alpha &&
6003                     ABS( p1 - p0 ) < beta &&
6004                     ABS( q1 - q0 ) < beta ) {
6005
6006                     pix[-pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
6007                     pix[0]         = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
6008                     tprintf("filter_mb_edgech i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], pix[-3*pix_next], p1, p0, q0, q1, pix[2*pix_next], pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
6009                 }
6010                 pix++;
6011             }
6012         }
6013     }
6014 }
6015
6016 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
6017     MpegEncContext * const s = &h->s;
6018     const int mb_xy= mb_x + mb_y*s->mb_stride;
6019     int first_vertical_edge_done = 0;
6020     int dir;
6021
6022     if (h->mb_aff_frame
6023             // left mb is in picture
6024             && h->slice_table[mb_xy-1] != 255
6025             // and current and left pair do not have the same interlaced type
6026             && (IS_INTERLACED(s->current_picture.mb_type[mb_xy]) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6027             // and left mb is in the same slice if deblocking_filter == 2
6028             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6029         /* First vertical edge is different in MBAFF frames
6030          * There are 8 differents bS to compute and 2 differents Qp
6031          */
6032         int bS[8];
6033         int qp[2];
6034         int chroma_qp[2];
6035
6036         int i;
6037         first_vertical_edge_done = 1;
6038         for( i = 0; i < 8; i++ ) {
6039             int y = i>>1;
6040             int b_idx= 8 + 4 + 8*y;
6041             int bn_idx= b_idx - 1;
6042
6043             int mbn_xy = h->mb_field_decoding_flag ? h->left_mb_xy[i>>2] : h->left_mb_xy[i&1];
6044
6045             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6046                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6047                 bS[i] = 4;
6048             } else if( h->non_zero_count_cache[b_idx] != 0 ||
6049                 h->non_zero_count_cache[bn_idx] != 0 ) {
6050                 bS[i] = 2;
6051             } else {
6052                 /* FIXME: A given frame may occupy more than one position in
6053                  * the reference list. So we should compare the frame numbers,
6054                  * not the indices in the ref list. */
6055                 int l;
6056                 bS[i] = 0;
6057                 for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6058                     if( h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] ||
6059                         ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6060                         ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6061                         bS[i] = 1;
6062                         break;
6063                     }
6064                 }
6065             }
6066         }
6067         if(bS[0]+bS[1]+bS[2]+bS[3] != 0) {
6068             // Do not use s->qscale as luma quantiser because it has not the same
6069             // value in IPCM macroblocks.
6070             qp[0] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[0]] + 1 ) >> 1;
6071             chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6072                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[0]] ) + 1 ) >> 1;
6073             qp[1] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[1]] + 1 ) >> 1;
6074             chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6075                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[1]] ) + 1 ) >> 1;
6076
6077             /* Filter edge */
6078             tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
6079             { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6080             filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6081             filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
6082             filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
6083         }
6084     }
6085     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6086     for( dir = 0; dir < 2; dir++ )
6087     {
6088         int edge;
6089         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
6090         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6091
6092         if (first_vertical_edge_done) {
6093             start = 1;
6094             first_vertical_edge_done = 0;
6095         }
6096
6097         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6098             start = 1;
6099
6100         /* Calculate bS */
6101         for( edge = start; edge < 4; edge++ ) {
6102             /* mbn_xy: neighbour macroblock */
6103             int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6104             int bS[4];
6105             int qp;
6106
6107             if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
6108                 && !IS_INTERLACED(s->current_picture.mb_type[mb_xy])
6109                 && IS_INTERLACED(s->current_picture.mb_type[mbn_xy])
6110                 ) {
6111                 // This is a special case in the norm where the filtering must
6112                 // be done twice (one each of the field) even if we are in a
6113                 // frame macroblock.
6114                 //
6115                 unsigned int tmp_linesize   = 2 *   linesize;
6116                 unsigned int tmp_uvlinesize = 2 * uvlinesize;
6117                 int mbn_xy = mb_xy - 2 * s->mb_stride;
6118                 int qp, chroma_qp;
6119
6120                 // first filtering
6121                 if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6122                     IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6123                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6124                 } else {
6125                     // TODO
6126                     assert(0);
6127                 }
6128                 /* Filter edge */
6129                 // Do not use s->qscale as luma quantiser because it has not the same
6130                 // value in IPCM macroblocks.
6131                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6132                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6133                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6134                 filter_mb_edgeh( h, &img_y[0], tmp_linesize, bS, qp );
6135                 chroma_qp = ( h->chroma_qp +
6136                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6137                 filter_mb_edgech( h, &img_cb[0], tmp_uvlinesize, bS, chroma_qp );
6138                 filter_mb_edgech( h, &img_cr[0], tmp_uvlinesize, bS, chroma_qp );
6139
6140                 // second filtering
6141                 mbn_xy += s->mb_stride;
6142                 if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6143                     IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6144                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6145                 } else {
6146                     // TODO
6147                     assert(0);
6148                 }
6149                 /* Filter edge */
6150                 // Do not use s->qscale as luma quantiser because it has not the same
6151                 // value in IPCM macroblocks.
6152                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6153                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6154                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6155                 filter_mb_edgeh( h, &img_y[linesize], tmp_linesize, bS, qp );
6156                 chroma_qp = ( h->chroma_qp +
6157                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6158                 filter_mb_edgech( h, &img_cb[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6159                 filter_mb_edgech( h, &img_cr[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6160                 continue;
6161             }
6162             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6163                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6164                 int value;
6165                 if (edge == 0) {
6166                     if (   (!IS_INTERLACED(s->current_picture.mb_type[mb_xy]) && !IS_INTERLACED(s->current_picture.mb_type[mbm_xy]))
6167                         || ((h->mb_aff_frame || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6168                     ) {
6169                         value = 4;
6170                     } else {
6171                         value = 3;
6172                     }
6173                 } else {
6174                     value = 3;
6175                 }
6176                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6177             } else {
6178                 int i;
6179                 for( i = 0; i < 4; i++ ) {
6180                     int x = dir == 0 ? edge : i;
6181                     int y = dir == 0 ? i    : edge;
6182                     int b_idx= 8 + 4 + x + 8*y;
6183                     int bn_idx= b_idx - (dir ? 8:1);
6184
6185                     if( h->non_zero_count_cache[b_idx] != 0 ||
6186                         h->non_zero_count_cache[bn_idx] != 0 ) {
6187                         bS[i] = 2;
6188                     }
6189                     else
6190                     {
6191                         /* FIXME: A given frame may occupy more than one position in
6192                          * the reference list. So we should compare the frame numbers,
6193                          * not the indices in the ref list. */
6194                         int l;
6195                         bS[i] = 0;
6196                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6197                             if( h->ref_cache[l][b_idx] != h->ref_cache[l][bn_idx] ||
6198                                 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6199                                 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6200                                 bS[i] = 1;
6201                                 break;
6202                             }
6203                         }
6204                     }
6205                 }
6206
6207                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6208                     continue;
6209             }
6210
6211             /* Filter edge */
6212             // Do not use s->qscale as luma quantiser because it has not the same
6213             // value in IPCM macroblocks.
6214             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6215             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6216             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6217             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6218             if( dir == 0 ) {
6219                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6220                 if( (edge&1) == 0 ) {
6221                     int chroma_qp = ( h->chroma_qp +
6222                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6223                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
6224                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
6225                 }
6226             } else {
6227                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6228                 if( (edge&1) == 0 ) {
6229                     int chroma_qp = ( h->chroma_qp +
6230                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6231                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6232                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6233                 }
6234             }
6235         }
6236     }
6237 }
6238
6239 static int decode_slice(H264Context *h){
6240     MpegEncContext * const s = &h->s;
6241     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6242
6243     s->mb_skip_run= -1;
6244
6245     if( h->pps.cabac ) {
6246         int i;
6247
6248         /* realign */
6249         align_get_bits( &s->gb );
6250
6251         /* init cabac */
6252         ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
6253         ff_init_cabac_decoder( &h->cabac,
6254                                s->gb.buffer + get_bits_count(&s->gb)/8,
6255                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6256         /* calculate pre-state */
6257         for( i= 0; i < 399; i++ ) {
6258             int pre;
6259             if( h->slice_type == I_TYPE )
6260                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6261             else
6262                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6263
6264             if( pre <= 63 )
6265                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6266             else
6267                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6268         }
6269
6270         for(;;){
6271             int ret = decode_mb_cabac(h);
6272             int eos;
6273
6274             if(ret>=0) hl_decode_mb(h);
6275
6276             /* XXX: useless as decode_mb_cabac it doesn't support that ... */
6277             if( ret >= 0 && h->mb_aff_frame ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6278                 s->mb_y++;
6279
6280                 if(ret>=0) ret = decode_mb_cabac(h);
6281
6282                 hl_decode_mb(h);
6283                 s->mb_y--;
6284             }
6285             eos = get_cabac_terminate( &h->cabac );
6286
6287             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
6288                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6289                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6290                 return -1;
6291             }
6292
6293             if( ++s->mb_x >= s->mb_width ) {
6294                 s->mb_x = 0;
6295                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6296                 ++s->mb_y;
6297                 if(h->mb_aff_frame) {
6298                     ++s->mb_y;
6299                 }
6300             }
6301
6302             if( eos || s->mb_y >= s->mb_height ) {
6303                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6304                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6305                 return 0;
6306             }
6307         }
6308
6309     } else {
6310         for(;;){
6311             int ret = decode_mb_cavlc(h);
6312
6313             if(ret>=0) hl_decode_mb(h);
6314
6315             if(ret>=0 && h->mb_aff_frame){ //FIXME optimal? or let mb_decode decode 16x32 ?
6316                 s->mb_y++;
6317                 ret = decode_mb_cavlc(h);
6318
6319                 if(ret>=0) hl_decode_mb(h);
6320                 s->mb_y--;
6321             }
6322
6323             if(ret<0){
6324                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6325                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6326
6327                 return -1;
6328             }
6329
6330             if(++s->mb_x >= s->mb_width){
6331                 s->mb_x=0;
6332                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6333                 ++s->mb_y;
6334                 if(h->mb_aff_frame) {
6335                     ++s->mb_y;
6336                 }
6337                 if(s->mb_y >= s->mb_height){
6338                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6339
6340                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6341                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6342
6343                         return 0;
6344                     }else{
6345                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6346
6347                         return -1;
6348                     }
6349                 }
6350             }
6351
6352             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6353                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6354                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6355                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6356
6357                     return 0;
6358                 }else{
6359                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6360
6361                     return -1;
6362                 }
6363             }
6364         }
6365     }
6366
6367 #if 0
6368     for(;s->mb_y < s->mb_height; s->mb_y++){
6369         for(;s->mb_x < s->mb_width; s->mb_x++){
6370             int ret= decode_mb(h);
6371
6372             hl_decode_mb(h);
6373
6374             if(ret<0){
6375                 fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6376                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6377
6378                 return -1;
6379             }
6380
6381             if(++s->mb_x >= s->mb_width){
6382                 s->mb_x=0;
6383                 if(++s->mb_y >= s->mb_height){
6384                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6385                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6386
6387                         return 0;
6388                     }else{
6389                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6390
6391                         return -1;
6392                     }
6393                 }
6394             }
6395
6396             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6397                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6398                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6399
6400                     return 0;
6401                 }else{
6402                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6403
6404                     return -1;
6405                 }
6406             }
6407         }
6408         s->mb_x=0;
6409         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6410     }
6411 #endif
6412     return -1; //not reached
6413 }
6414
6415 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6416     MpegEncContext * const s = &h->s;
6417     int cpb_count, i;
6418     cpb_count = get_ue_golomb(&s->gb) + 1;
6419     get_bits(&s->gb, 4); /* bit_rate_scale */
6420     get_bits(&s->gb, 4); /* cpb_size_scale */
6421     for(i=0; i<cpb_count; i++){
6422         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6423         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6424         get_bits1(&s->gb);     /* cbr_flag */
6425     }
6426     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6427     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6428     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6429     get_bits(&s->gb, 5); /* time_offset_length */
6430 }
6431
6432 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6433     MpegEncContext * const s = &h->s;
6434     int aspect_ratio_info_present_flag, aspect_ratio_idc;
6435     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6436
6437     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6438
6439     if( aspect_ratio_info_present_flag ) {
6440         aspect_ratio_idc= get_bits(&s->gb, 8);
6441         if( aspect_ratio_idc == EXTENDED_SAR ) {
6442             sps->sar.num= get_bits(&s->gb, 16);
6443             sps->sar.den= get_bits(&s->gb, 16);
6444         }else if(aspect_ratio_idc < 16){
6445             sps->sar=  pixel_aspect[aspect_ratio_idc];
6446         }else{
6447             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6448             return -1;
6449         }
6450     }else{
6451         sps->sar.num=
6452         sps->sar.den= 0;
6453     }
6454 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6455
6456     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6457         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6458     }
6459
6460     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6461         get_bits(&s->gb, 3);    /* video_format */
6462         get_bits1(&s->gb);      /* video_full_range_flag */
6463         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6464             get_bits(&s->gb, 8); /* colour_primaries */
6465             get_bits(&s->gb, 8); /* transfer_characteristics */
6466             get_bits(&s->gb, 8); /* matrix_coefficients */
6467         }
6468     }
6469
6470     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6471         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6472         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6473     }
6474
6475     sps->timing_info_present_flag = get_bits1(&s->gb);
6476     if(sps->timing_info_present_flag){
6477         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6478         sps->time_scale = get_bits_long(&s->gb, 32);
6479         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6480     }
6481
6482     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6483     if(nal_hrd_parameters_present_flag)
6484         decode_hrd_parameters(h, sps);
6485     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6486     if(vcl_hrd_parameters_present_flag)
6487         decode_hrd_parameters(h, sps);
6488     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6489         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6490     get_bits1(&s->gb);         /* pic_struct_present_flag */
6491
6492     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6493     if(sps->bitstream_restriction_flag){
6494         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6495         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6496         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6497         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6498         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6499         sps->num_reorder_frames = get_ue_golomb(&s->gb);
6500         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
6501     }
6502
6503     return 0;
6504 }
6505
6506 static inline int decode_seq_parameter_set(H264Context *h){
6507     MpegEncContext * const s = &h->s;
6508     int profile_idc, level_idc;
6509     int sps_id, i;
6510     SPS *sps;
6511
6512     profile_idc= get_bits(&s->gb, 8);
6513     get_bits1(&s->gb);   //constraint_set0_flag
6514     get_bits1(&s->gb);   //constraint_set1_flag
6515     get_bits1(&s->gb);   //constraint_set2_flag
6516     get_bits1(&s->gb);   //constraint_set3_flag
6517     get_bits(&s->gb, 4); // reserved
6518     level_idc= get_bits(&s->gb, 8);
6519     sps_id= get_ue_golomb(&s->gb);
6520
6521     sps= &h->sps_buffer[ sps_id ];
6522     sps->profile_idc= profile_idc;
6523     sps->level_idc= level_idc;
6524
6525     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6526     sps->poc_type= get_ue_golomb(&s->gb);
6527
6528     if(sps->poc_type == 0){ //FIXME #define
6529         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6530     } else if(sps->poc_type == 1){//FIXME #define
6531         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6532         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6533         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6534         sps->poc_cycle_length= get_ue_golomb(&s->gb);
6535
6536         for(i=0; i<sps->poc_cycle_length; i++)
6537             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
6538     }
6539     if(sps->poc_type > 2){
6540         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
6541         return -1;
6542     }
6543
6544     sps->ref_frame_count= get_ue_golomb(&s->gb);
6545     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
6546         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
6547     }
6548     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
6549     sps->mb_width= get_ue_golomb(&s->gb) + 1;
6550     sps->mb_height= get_ue_golomb(&s->gb) + 1;
6551     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
6552        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
6553         return -1;
6554
6555     sps->frame_mbs_only_flag= get_bits1(&s->gb);
6556     if(!sps->frame_mbs_only_flag)
6557         sps->mb_aff= get_bits1(&s->gb);
6558     else
6559         sps->mb_aff= 0;
6560
6561     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
6562
6563     sps->crop= get_bits1(&s->gb);
6564     if(sps->crop){
6565         sps->crop_left  = get_ue_golomb(&s->gb);
6566         sps->crop_right = get_ue_golomb(&s->gb);
6567         sps->crop_top   = get_ue_golomb(&s->gb);
6568         sps->crop_bottom= get_ue_golomb(&s->gb);
6569         if(sps->crop_left || sps->crop_top){
6570             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completly supported, this could look slightly wrong ...\n");
6571         }
6572     }else{
6573         sps->crop_left  =
6574         sps->crop_right =
6575         sps->crop_top   =
6576         sps->crop_bottom= 0;
6577     }
6578
6579     sps->vui_parameters_present_flag= get_bits1(&s->gb);
6580     if( sps->vui_parameters_present_flag )
6581         decode_vui_parameters(h, sps);
6582
6583     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
6584         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
6585                sps_id, sps->profile_idc, sps->level_idc,
6586                sps->poc_type,
6587                sps->ref_frame_count,
6588                sps->mb_width, sps->mb_height,
6589                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
6590                sps->direct_8x8_inference_flag ? "8B8" : "",
6591                sps->crop_left, sps->crop_right,
6592                sps->crop_top, sps->crop_bottom,
6593                sps->vui_parameters_present_flag ? "VUI" : ""
6594                );
6595     }
6596     return 0;
6597 }
6598
6599 static inline int decode_picture_parameter_set(H264Context *h){
6600     MpegEncContext * const s = &h->s;
6601     int pps_id= get_ue_golomb(&s->gb);
6602     PPS *pps= &h->pps_buffer[pps_id];
6603
6604     pps->sps_id= get_ue_golomb(&s->gb);
6605     pps->cabac= get_bits1(&s->gb);
6606     pps->pic_order_present= get_bits1(&s->gb);
6607     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
6608     if(pps->slice_group_count > 1 ){
6609         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
6610         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
6611         switch(pps->mb_slice_group_map_type){
6612         case 0:
6613 #if 0
6614 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
6615 |    run_length[ i ]                                |1  |ue(v)   |
6616 #endif
6617             break;
6618         case 2:
6619 #if 0
6620 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
6621 |{                                                  |   |        |
6622 |    top_left_mb[ i ]                               |1  |ue(v)   |
6623 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
6624 |   }                                               |   |        |
6625 #endif
6626             break;
6627         case 3:
6628         case 4:
6629         case 5:
6630 #if 0
6631 |   slice_group_change_direction_flag               |1  |u(1)    |
6632 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
6633 #endif
6634             break;
6635         case 6:
6636 #if 0
6637 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
6638 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
6639 |)                                                  |   |        |
6640 |    slice_group_id[ i ]                            |1  |u(v)    |
6641 #endif
6642             break;
6643         }
6644     }
6645     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
6646     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
6647     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
6648         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
6649         return -1;
6650     }
6651
6652     pps->weighted_pred= get_bits1(&s->gb);
6653     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
6654     pps->init_qp= get_se_golomb(&s->gb) + 26;
6655     pps->init_qs= get_se_golomb(&s->gb) + 26;
6656     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
6657     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
6658     pps->constrained_intra_pred= get_bits1(&s->gb);
6659     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
6660
6661     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
6662         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s\n",
6663                pps_id, pps->sps_id,
6664                pps->cabac ? "CABAC" : "CAVLC",
6665                pps->slice_group_count,
6666                pps->ref_count[0], pps->ref_count[1],
6667                pps->weighted_pred ? "weighted" : "",
6668                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
6669                pps->deblocking_filter_parameters_present ? "LPAR" : "",
6670                pps->constrained_intra_pred ? "CONSTR" : "",
6671                pps->redundant_pic_cnt_present ? "REDU" : ""
6672                );
6673     }
6674
6675     return 0;
6676 }
6677
6678 /**
6679  * finds the end of the current frame in the bitstream.
6680  * @return the position of the first byte of the next frame, or -1
6681  */
6682 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
6683     int i;
6684     uint32_t state;
6685     ParseContext *pc = &(h->s.parse_context);
6686 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
6687 //    mb_addr= pc->mb_addr - 1;
6688     state= pc->state;
6689     for(i=0; i<=buf_size; i++){
6690         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
6691             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
6692             if(pc->frame_start_found){
6693                 // If there isn't one more byte in the buffer
6694                 // the test on first_mb_in_slice cannot be done yet
6695                 // do it at next call.
6696                 if (i >= buf_size) break;
6697                 if (buf[i] & 0x80) {
6698                     // first_mb_in_slice is 0, probably the first nal of a new
6699                     // slice
6700                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
6701                     pc->state=-1;
6702                     pc->frame_start_found= 0;
6703                     return i-4;
6704                 }
6705             }
6706             pc->frame_start_found = 1;
6707         }
6708         if (i<buf_size)
6709             state= (state<<8) | buf[i];
6710     }
6711
6712     pc->state= state;
6713     return END_NOT_FOUND;
6714 }
6715
6716 static int h264_parse(AVCodecParserContext *s,
6717                       AVCodecContext *avctx,
6718                       uint8_t **poutbuf, int *poutbuf_size,
6719                       const uint8_t *buf, int buf_size)
6720 {
6721     H264Context *h = s->priv_data;
6722     ParseContext *pc = &h->s.parse_context;
6723     int next;
6724
6725     next= find_frame_end(h, buf, buf_size);
6726
6727     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
6728         *poutbuf = NULL;
6729         *poutbuf_size = 0;
6730         return buf_size;
6731     }
6732
6733     *poutbuf = (uint8_t *)buf;
6734     *poutbuf_size = buf_size;
6735     return next;
6736 }
6737
6738 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
6739     MpegEncContext * const s = &h->s;
6740     AVCodecContext * const avctx= s->avctx;
6741     int buf_index=0;
6742 #if 0
6743     int i;
6744     for(i=0; i<32; i++){
6745         printf("%X ", buf[i]);
6746     }
6747 #endif
6748     h->slice_num = 0;
6749     for(;;){
6750         int consumed;
6751         int dst_length;
6752         int bit_length;
6753         uint8_t *ptr;
6754         int i, nalsize = 0;
6755
6756       if(h->is_avc) {
6757         if(buf_index >= buf_size) break;
6758         nalsize = 0;
6759         for(i = 0; i < h->nal_length_size; i++)
6760             nalsize = (nalsize << 8) | buf[buf_index++];
6761       } else {
6762         // start code prefix search
6763         for(; buf_index + 3 < buf_size; buf_index++){
6764             // this should allways succeed in the first iteration
6765             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
6766                 break;
6767         }
6768
6769         if(buf_index+3 >= buf_size) break;
6770
6771         buf_index+=3;
6772       }
6773
6774         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
6775         if(ptr[dst_length - 1] == 0) dst_length--;
6776         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
6777
6778         if(s->avctx->debug&FF_DEBUG_STARTCODE){
6779             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
6780         }
6781
6782         if (h->is_avc && (nalsize != consumed))
6783             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
6784
6785         buf_index += consumed;
6786
6787         if( s->hurry_up == 1 && h->nal_ref_idc  == 0 )
6788             continue;
6789
6790         switch(h->nal_unit_type){
6791         case NAL_IDR_SLICE:
6792             idr(h); //FIXME ensure we dont loose some frames if there is reordering
6793         case NAL_SLICE:
6794             init_get_bits(&s->gb, ptr, bit_length);
6795             h->intra_gb_ptr=
6796             h->inter_gb_ptr= &s->gb;
6797             s->data_partitioning = 0;
6798
6799             if(decode_slice_header(h) < 0) return -1;
6800             if(h->redundant_pic_count==0 && s->hurry_up < 5 )
6801                 decode_slice(h);
6802             break;
6803         case NAL_DPA:
6804             init_get_bits(&s->gb, ptr, bit_length);
6805             h->intra_gb_ptr=
6806             h->inter_gb_ptr= NULL;
6807             s->data_partitioning = 1;
6808
6809             if(decode_slice_header(h) < 0) return -1;
6810             break;
6811         case NAL_DPB:
6812             init_get_bits(&h->intra_gb, ptr, bit_length);
6813             h->intra_gb_ptr= &h->intra_gb;
6814             break;
6815         case NAL_DPC:
6816             init_get_bits(&h->inter_gb, ptr, bit_length);
6817             h->inter_gb_ptr= &h->inter_gb;
6818
6819             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning && s->hurry_up < 5 )
6820                 decode_slice(h);
6821             break;
6822         case NAL_SEI:
6823             break;
6824         case NAL_SPS:
6825             init_get_bits(&s->gb, ptr, bit_length);
6826             decode_seq_parameter_set(h);
6827
6828             if(s->flags& CODEC_FLAG_LOW_DELAY)
6829                 s->low_delay=1;
6830
6831             if(avctx->has_b_frames < 2)
6832                 avctx->has_b_frames= !s->low_delay;
6833             break;
6834         case NAL_PPS:
6835             init_get_bits(&s->gb, ptr, bit_length);
6836
6837             decode_picture_parameter_set(h);
6838
6839             break;
6840         case NAL_PICTURE_DELIMITER:
6841             break;
6842         case NAL_FILTER_DATA:
6843             break;
6844         default:
6845             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
6846         }
6847     }
6848
6849     if(!s->current_picture_ptr) return buf_index; //no frame
6850
6851     s->current_picture_ptr->pict_type= s->pict_type;
6852     s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
6853
6854     h->prev_frame_num_offset= h->frame_num_offset;
6855     h->prev_frame_num= h->frame_num;
6856     if(s->current_picture_ptr->reference){
6857         h->prev_poc_msb= h->poc_msb;
6858         h->prev_poc_lsb= h->poc_lsb;
6859     }
6860     if(s->current_picture_ptr->reference)
6861         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
6862
6863     ff_er_frame_end(s);
6864
6865     MPV_frame_end(s);
6866
6867     return buf_index;
6868 }
6869
6870 /**
6871  * retunrs the number of bytes consumed for building the current frame
6872  */
6873 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
6874     if(s->flags&CODEC_FLAG_TRUNCATED){
6875         pos -= s->parse_context.last_index;
6876         if(pos<0) pos=0; // FIXME remove (uneeded?)
6877
6878         return pos;
6879     }else{
6880         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
6881         if(pos+10>buf_size) pos=buf_size; // oops ;)
6882
6883         return pos;
6884     }
6885 }
6886
6887 static int decode_frame(AVCodecContext *avctx,
6888                              void *data, int *data_size,
6889                              uint8_t *buf, int buf_size)
6890 {
6891     H264Context *h = avctx->priv_data;
6892     MpegEncContext *s = &h->s;
6893     AVFrame *pict = data;
6894     int buf_index;
6895
6896     s->flags= avctx->flags;
6897     s->flags2= avctx->flags2;
6898
6899    /* no supplementary picture */
6900     if (buf_size == 0) {
6901         return 0;
6902     }
6903
6904     if(s->flags&CODEC_FLAG_TRUNCATED){
6905         int next= find_frame_end(h, buf, buf_size);
6906
6907         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
6908             return buf_size;
6909 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
6910     }
6911
6912     if(h->is_avc && !h->got_avcC) {
6913         int i, cnt, nalsize;
6914         unsigned char *p = avctx->extradata;
6915         if(avctx->extradata_size < 7) {
6916             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
6917             return -1;
6918         }
6919         if(*p != 1) {
6920             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
6921             return -1;
6922         }
6923         /* sps and pps in the avcC always have length coded with 2 bytes,
6924            so put a fake nal_length_size = 2 while parsing them */
6925         h->nal_length_size = 2;
6926         // Decode sps from avcC
6927         cnt = *(p+5) & 0x1f; // Number of sps
6928         p += 6;
6929         for (i = 0; i < cnt; i++) {
6930             nalsize = BE_16(p) + 2;
6931             if(decode_nal_units(h, p, nalsize) != nalsize) {
6932                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
6933                 return -1;
6934             }
6935             p += nalsize;
6936         }
6937         // Decode pps from avcC
6938         cnt = *(p++); // Number of pps
6939         for (i = 0; i < cnt; i++) {
6940             nalsize = BE_16(p) + 2;
6941             if(decode_nal_units(h, p, nalsize)  != nalsize) {
6942                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
6943                 return -1;
6944             }
6945             p += nalsize;
6946         }
6947         // Now store right nal length size, that will be use to parse all other nals
6948         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
6949         // Do not reparse avcC
6950         h->got_avcC = 1;
6951     }
6952
6953     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
6954         if(0 < decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) )
6955             return -1;
6956     }
6957
6958     buf_index=decode_nal_units(h, buf, buf_size);
6959     if(buf_index < 0)
6960         return -1;
6961
6962     //FIXME do something with unavailable reference frames
6963
6964 //    if(ret==FRAME_SKIPED) return get_consumed_bytes(s, buf_index, buf_size);
6965     if(!s->current_picture_ptr){
6966         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
6967         return -1;
6968     }
6969
6970     {
6971         Picture *out = s->current_picture_ptr;
6972 #if 0 //decode order
6973         *data_size = sizeof(AVFrame);
6974 #else
6975         /* Sort B-frames into display order */
6976         Picture *cur = s->current_picture_ptr;
6977         Picture *prev = h->delayed_output_pic;
6978         int out_idx = 0;
6979         int pics = 0;
6980         int out_of_order;
6981         int cross_idr = 0;
6982         int dropped_frame = 0;
6983         int i;
6984
6985         if(h->sps.bitstream_restriction_flag
6986            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
6987             s->avctx->has_b_frames = h->sps.num_reorder_frames;
6988             s->low_delay = 0;
6989         }
6990
6991         while(h->delayed_pic[pics]) pics++;
6992         h->delayed_pic[pics++] = cur;
6993         if(cur->reference == 0)
6994             cur->reference = 1;
6995
6996         for(i=0; h->delayed_pic[i]; i++)
6997             if(h->delayed_pic[i]->key_frame)
6998                 cross_idr = 1;
6999
7000         out = h->delayed_pic[0];
7001         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
7002             if(h->delayed_pic[i]->poc < out->poc){
7003                 out = h->delayed_pic[i];
7004                 out_idx = i;
7005             }
7006
7007         out_of_order = !cross_idr && prev && out->poc < prev->poc;
7008         if(prev && pics <= s->avctx->has_b_frames)
7009             out = prev;
7010         else if((out_of_order && pics-1 == s->avctx->has_b_frames)
7011            || (s->low_delay &&
7012             ((!cross_idr && prev && out->poc > prev->poc + 2)
7013              || cur->pict_type == B_TYPE)))
7014         {
7015             s->low_delay = 0;
7016             s->avctx->has_b_frames++;
7017             out = prev;
7018         }
7019         else if(out_of_order)
7020             out = prev;
7021
7022         if(out_of_order || pics > s->avctx->has_b_frames){
7023             dropped_frame = (out != h->delayed_pic[out_idx]);
7024             for(i=out_idx; h->delayed_pic[i]; i++)
7025                 h->delayed_pic[i] = h->delayed_pic[i+1];
7026         }
7027
7028         if(prev == out && !dropped_frame)
7029             *data_size = 0;
7030         else
7031             *data_size = sizeof(AVFrame);
7032         if(prev && prev != out && prev->reference == 1)
7033             prev->reference = 0;
7034         h->delayed_output_pic = out;
7035 #endif
7036
7037         *pict= *(AVFrame*)out;
7038     }
7039
7040     assert(pict->data[0]);
7041     ff_print_debug_info(s, pict);
7042 //printf("out %d\n", (int)pict->data[0]);
7043 #if 0 //?
7044
7045     /* Return the Picture timestamp as the frame number */
7046     /* we substract 1 because it is added on utils.c    */
7047     avctx->frame_number = s->picture_number - 1;
7048 #endif
7049     return get_consumed_bytes(s, buf_index, buf_size);
7050 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock (disabled code).
 * Entries 0..2 are the top-left, top and top-right neighbours, entry 3 is
 * the left neighbour; a neighbour counts as available when it lies inside
 * the picture and its slice_table entry equals the current slice number.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy = mb_xy - s->mb_stride;

    if(!s->mb_y){
        /* first macroblock row: nothing above */
        h->mb_avail[0]= 0;
        h->mb_avail[1]= 0;
        h->mb_avail[2]= 0;
    }else{
        h->mb_avail[0]= s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
        h->mb_avail[1]=                            h->slice_table[top_xy    ] == h->slice_num;
        h->mb_avail[2]= s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    }
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7070
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Disabled self test: round-trips unsigned and signed exp-golomb codes,
 * measures the error of the 4x4 DCT/IDCT pair and round-trips random
 * payloads through encode_nal()/decode_nal().
 * @return 0 on success, -1 if a NAL layer check fails
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
//    int int_temp[10000];
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
//            return -1;
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            /* report the value that was actually written, not the index */
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
//            return -1;
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
//        printf("%d %d %d\n", r1, r2, (r2-r1)*16);
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
//            printf("%d ", block[j]);
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }
//        printf("\n");

        /* use the local DSPContext set up by dsputil_init() above; there
         * is no variable named s in scope here */
        dsp.h264_idct_add(ref, block, 4);
/*        for(j=0; j<16; j++){
            printf("%d ", ref[j]);
        }
        printf("\n");*/

        for(j=0; j<16; j++){
            int diff= ABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        /* start from a payload without any zero byte ... */
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        /* ... then zero exactly `zeros` distinct positions */
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
7243
7244
7245 static int decode_end(AVCodecContext *avctx)
7246 {
7247     H264Context *h = avctx->priv_data;
7248     MpegEncContext *s = &h->s;
7249
7250     free_tables(h); //FIXME cleanup init stuff perhaps
7251     MPV_common_end(s);
7252
7253 //    memset(h, 0, sizeof(H264Context));
7254
7255     return 0;
7256 }
7257
7258
/**
 * Codec registration entry for the H.264 decoder.
 * The initializer is positional, so the order of the entries must match
 * the AVCodec declaration (not visible in this file).
 */
AVCodec h264_decoder = {
    "h264",
    CODEC_TYPE_VIDEO,
    CODEC_ID_H264,
    sizeof(H264Context), /* decode_frame()/decode_end() read avctx->priv_data as an H264Context */
    decode_init,
    NULL, /* NOTE(review): presumably the encode callback slot, unused by a decoder - confirm against AVCodec */
    decode_end,
    decode_frame,
    /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
};
7270
/**
 * Parser registration entry for H.264 byte streams.
 * The initializer is positional, so the order of the entries must match
 * the AVCodecParser declaration (not visible in this file).
 */
AVCodecParser h264_parser = {
    { CODEC_ID_H264 },
    sizeof(H264Context), /* h264_parse() reads s->priv_data as an H264Context */
    NULL, /* NOTE(review): presumably the optional init callback - confirm against AVCodecParser */
    h264_parse,
    ff_parse_close,
};
7278
7279 #include "svq3.c"