git.sesse.net Git - ffmpeg/blob - libavcodec/h264.c

   1 /*
   2  * H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
   3  * Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
   4  *
   5  * This library is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU Lesser General Public
   7  * License as published by the Free Software Foundation; either
   8  * version 2 of the License, or (at your option) any later version.
   9  *
  10  * This library is distributed in the hope that it will be useful,
  11  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  13  * Lesser General Public License for more details.
  14  *
  15  * You should have received a copy of the GNU Lesser General Public
  16  * License along with this library; if not, write to the Free Software
  17  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
  18  *
  19  */
  20
  21 /**
  22  * @file h264.c
  23  * H.264 / AVC / MPEG4 part10 codec.
  24  * @author Michael Niedermayer <michaelni@gmx.at>
  25  */
  26
  27 #include "common.h"
  28 #include "dsputil.h"
  29 #include "avcodec.h"
  30 #include "mpegvideo.h"
  31 #include "h264data.h"
  32 #include "golomb.h"
  33
  34 #include "cabac.h"
  35
  36 #undef NDEBUG
  37 #include <assert.h>
  38
  // Rename two identifiers for the rest of this file: every later use of
  // interlaced_dct or mb_intra is rewritten to the long name on the right.
  // NOTE(review): presumably this makes accidental use of those MpegEncContext
  // fields fail to compile -- confirm against mpegvideo.h.
  39 #define interlaced_dct interlaced_dct_is_a_bad_name
  40 #define mb_intra mb_intra_isnt_initalized_see_mb_type
  41
  // NOTE(review): presumably indices of the DC blocks in per-macroblock block
  // tables; their users are not in the visible part of the file.
  42 #define LUMA_DC_BLOCK_INDEX   25
  43 #define CHROMA_DC_BLOCK_INDEX 26
  44
  // Bit widths, one per static VLC table declared further down
  // (NOTE(review): presumably the first-level lookup size -- confirm at the init_vlc() calls).
  45 #define CHROMA_DC_COEFF_TOKEN_VLC_BITS 8
  46 #define COEFF_TOKEN_VLC_BITS           8
  47 #define TOTAL_ZEROS_VLC_BITS           9
  48 #define CHROMA_DC_TOTAL_ZEROS_VLC_BITS 3
  49 #define RUN_VLC_BITS                   3
  50 #define RUN7_VLC_BITS                  6
  51
  // Sizes of H264Context.sps_buffer[] and H264Context.pps_buffer[].
  52 #define MAX_SPS_COUNT 32
  53 #define MAX_PPS_COUNT 256
  54
  // Size of H264Context.mmco[].
  55 #define MAX_MMCO_COUNT 66
  56
  57 /**
  58  * Sequence parameter set
      *
      * Where a field does not simply mirror a bitstream syntax element, the
      * trailing comment names the element it is derived from and the
      * adjustment applied to it.
  59  */
  60 typedef struct SPS{
  61
  62     int profile_idc;
  63     int level_idc;
  64     int log2_max_frame_num;            ///< log2_max_frame_num_minus4 + 4
  65     int poc_type;                      ///< pic_order_cnt_type
      // NOTE(review): unlike log2_max_frame_num above, the comment below names the
      // _minus4 element without a "+ 4" -- confirm which value is actually stored.
  66     int log2_max_poc_lsb;              ///< log2_max_pic_order_cnt_lsb_minus4
  67     int delta_pic_order_always_zero_flag;
  68     int offset_for_non_ref_pic;
  69     int offset_for_top_to_bottom_field;
  70     int poc_cycle_length;              ///< num_ref_frames_in_pic_order_cnt_cycle
  71     int ref_frame_count;               ///< num_ref_frames
  72     int gaps_in_frame_num_allowed_flag;
  73     int mb_width;                      ///< frame_width_in_mbs_minus1 + 1
  74     int mb_height;                     ///< frame_height_in_mbs_minus1 + 1
  75     int frame_mbs_only_flag;
  76     int mb_aff;                        ///< mb_adaptive_frame_field_flag
  77     int direct_8x8_inference_flag;
  78     int crop;                   ///< frame_cropping_flag
  79     int crop_left;              ///< frame_cropping_rect_left_offset
  80     int crop_right;             ///< frame_cropping_rect_right_offset
  81     int crop_top;               ///< frame_cropping_rect_top_offset
  82     int crop_bottom;            ///< frame_cropping_rect_bottom_offset
  83     int vui_parameters_present_flag;
  84     AVRational sar;             // NOTE(review): presumably the sample aspect ratio -- confirm
  85     int timing_info_present_flag;
  86     uint32_t num_units_in_tick;
  87     uint32_t time_scale;
  88     int fixed_frame_rate_flag;
  89     short offset_for_ref_frame[256]; //FIXME allocate dynamically?
  90     int bitstream_restriction_flag;
  91     int num_reorder_frames;
  92 }SPS;
  93
  94 /**
  95  * Picture parameter set
      *
      * Trailing comments name the bitstream syntax element a field is derived
      * from, including any constant added to it.
  96  */
  97 typedef struct PPS{
  98     int sps_id;                 // NOTE(review): presumably the id of the SPS this PPS refers to -- confirm
  99     int cabac;                  ///< entropy_coding_mode_flag
 100     int pic_order_present;      ///< pic_order_present_flag
 101     int slice_group_count;      ///< num_slice_groups_minus1 + 1
 102     int mb_slice_group_map_type;
 103     int ref_count[2];           ///< num_ref_idx_l0/1_active_minus1 + 1
 104     int weighted_pred;          ///< weighted_pred_flag
 105     int weighted_bipred_idc;
 106     int init_qp;                ///< pic_init_qp_minus26 + 26
 107     int init_qs;                ///< pic_init_qs_minus26 + 26
 108     int chroma_qp_index_offset;
 109     int deblocking_filter_parameters_present; ///< deblocking_filter_parameters_present_flag
 110     int constrained_intra_pred; ///< constrained_intra_pred_flag
 111     int redundant_pic_cnt_present; ///< redundant_pic_cnt_present_flag
 112 }PPS;
 113
 114 /**
 115  * Memory management control operation opcode.
      *
      * Only MMCO_END has an explicit value (0); the remaining enumerators take
      * the consecutive values 1..6 from their declaration order, so the order
      * must not be changed.
 116  */
 117 typedef enum MMCOOpcode{
 118     MMCO_END=0,
 119     MMCO_SHORT2UNUSED,
 120     MMCO_LONG2UNUSED,
 121     MMCO_SHORT2LONG,
 122     MMCO_SET_MAX_LONG,
 123     MMCO_RESET,
 124     MMCO_LONG,
 125 } MMCOOpcode;
 126
 127 /**
 128  * Memory management control operation.
      *
      * One entry of the H264Context.mmco[] buffer: an opcode plus its two
      * possible operands.
 129  */
 130 typedef struct MMCO{
 131     MMCOOpcode opcode;
 132     int short_frame_num;   // NOTE(review): presumably only meaningful for opcodes acting on a short term reference -- confirm
 133     int long_index;        // NOTE(review): presumably only meaningful for opcodes acting on a long term reference -- confirm
 134 } MMCO;
 135
 136 /**
 137  * H264Context
      *
      * Codec state used throughout this file. The generic MpegEncContext is
      * embedded as the first member (s); the rest is H.264 specific: NAL
      * parsing state, parameter sets, per-macroblock neighbour caches,
      * reference picture lists, weighted prediction, deblocking parameters
      * and CABAC state.
 138  */
 139 typedef struct H264Context{
 140     MpegEncContext s;
 141     int nal_ref_idc;
 142     int nal_unit_type;
     // NOTE(review): presumably the values nal_unit_type can take -- confirm at the NAL parser.
 143 #define NAL_SLICE               1
 144 #define NAL_DPA                 2
 145 #define NAL_DPB                 3
 146 #define NAL_DPC                 4
 147 #define NAL_IDR_SLICE           5
 148 #define NAL_SEI                 6
 149 #define NAL_SPS                 7
 150 #define NAL_PPS                 8
 151 #define NAL_PICTURE_DELIMITER   9
 152 #define NAL_FILTER_DATA         10
 153     uint8_t *rbsp_buffer;
 154     int rbsp_buffer_size;
 155
 156     /**
 157       * Used to parse AVC variant of h264
 158       */
 159     int is_avc; ///< this flag is != 0 if codec is avc1
 160     int got_avcC; ///< flag used to parse avcC data only once
 161     int nal_length_size; ///< Number of bytes used for nal length (1, 2 or 4)
 162
 163     int chroma_qp; //QPc
 164
 165     int prev_mb_skipped; //FIXME remove (IMHO not used)
 166
 167     //prediction stuff
 168     int chroma_pred_mode;
 169     int intra16x16_pred_mode;
 170
     // Neighbour macroblock addresses; written by fill_caches().
 171     int top_mb_xy;
 172     int left_mb_xy[2];
 173
 174     int8_t intra4x4_pred_mode_cache[5*8];
 175     int8_t (*intra4x4_pred_mode)[8];
 176     void (*pred4x4  [9+3])(uint8_t *src, uint8_t *topright, int stride);//FIXME move to dsp?
 177     void (*pred8x8  [4+3])(uint8_t *src, int stride);
 178     void (*pred16x16[4+3])(uint8_t *src, int stride);
     // Availability bit masks; fill_caches() sets them for intra macroblocks.
 179     unsigned int topleft_samples_available;
 180     unsigned int top_samples_available;
 181     unsigned int topright_samples_available;
 182     unsigned int left_samples_available;
 183     uint8_t (*top_borders[2])[16+2*8];
 184     uint8_t left_border[2*(17+2*9)];
 185
 186     /**
 187      * non zero coeff count cache.
 188      * is 64 if not available.
 189      */
 190     uint8_t non_zero_count_cache[6*8] __align8;
 191     uint8_t (*non_zero_count)[16];
 192
 193     /**
 194      * Motion vector cache.
 195      */
 196     int16_t mv_cache[2][5*8][2] __align8;
 197     int8_t ref_cache[2][5*8] __align8;
     // Special values stored in ref_cache[] by fill_caches().
 198 #define LIST_NOT_USED -1 //FIXME rename?
 199 #define PART_NOT_AVAILABLE -2
 200
 201     /**
 202      * is 1 if the specific list MV&references are set to 0,0,-2.
 203      */
 204     int mv_cache_clean[2];
 205
 206     /**
 207      * block_offset[ 0..23] for frame macroblocks
 208      * block_offset[24..47] for field macroblocks
 209      */
 210     int block_offset[2*(16+8)];
 211
 212     uint32_t *mb2b_xy; //FIXME are these 4 a good idea?
 213     uint32_t *mb2b8_xy;
 214     int b_stride; //FIXME use s->b4_stride
 215     int b8_stride;
 216
 217     int halfpel_flag;
 218     int thirdpel_flag;
 219
 220     int unknown_svq3_flag;
 221     int next_slice_index;
 222
 223     SPS sps_buffer[MAX_SPS_COUNT];
 224     SPS sps; ///< current sps
 225
 226     PPS pps_buffer[MAX_PPS_COUNT];
 227     /**
 228      * current pps
 229      */
 230     PPS pps; //FIXME move to Picture perhaps? (->no) do we need that?
 231
 232     int slice_num;
 233     uint8_t *slice_table_base;
 234     uint8_t *slice_table;      ///< slice_table_base + mb_stride + 1
 235     int slice_type;
 236     int slice_type_fixed;
 237
 238     //interlacing specific flags
 239     int mb_aff_frame;
 240     int mb_field_decoding_flag;
 241
 242     int sub_mb_type[4];
 243
 244     //POC stuff
 245     int poc_lsb;
 246     int poc_msb;
 247     int delta_poc_bottom;
 248     int delta_poc[2];
 249     int frame_num;
 250     int prev_poc_msb;             ///< poc_msb of the last reference pic for POC type 0
 251     int prev_poc_lsb;             ///< poc_lsb of the last reference pic for POC type 0
 252     int frame_num_offset;         ///< for POC type 2
 253     int prev_frame_num_offset;    ///< for POC type 2
 254     int prev_frame_num;           ///< frame_num of the last pic for POC type 1/2
 255
 256     /**
 257      * frame_num for frames or 2*frame_num for field pics.
 258      */
 259     int curr_pic_num;
 260
 261     /**
 262      * max_frame_num or 2*max_frame_num for field pics.
 263      */
 264     int max_pic_num;
 265
 266     //Weighted pred stuff
 267     int use_weight;
 268     int use_weight_chroma;
 269     int luma_log2_weight_denom;
 270     int chroma_log2_weight_denom;
 271     int luma_weight[2][16];
 272     int luma_offset[2][16];
 273     int chroma_weight[2][16][2];
 274     int chroma_offset[2][16][2];
 275     int implicit_weight[16][16];
 276
 277     //deblock
 278     int deblocking_filter;         ///< disable_deblocking_filter_idc with 1<->0
 279     int slice_alpha_c0_offset;
 280     int slice_beta_offset;
 281
 282     int redundant_pic_count;
 283
 284     int direct_spatial_mv_pred;
 285     int dist_scale_factor[16];
 286     int map_col_to_list0[2][16];
 287
 288     /**
 289      * num_ref_idx_l0/1_active_minus1 + 1
 290      */
 291     int ref_count[2];// FIXME split for AFF
 292     Picture *short_ref[32];
 293     Picture *long_ref[32];
 294     Picture default_ref_list[2][32];
 295     Picture ref_list[2][32]; //FIXME size?
 296     Picture field_ref_list[2][32]; //FIXME size?
 297     Picture *delayed_pic[16]; //FIXME size?
 298     Picture *delayed_output_pic;
 299
 300     /**
 301      * memory management control operations buffer.
      * Holds at most MAX_MMCO_COUNT entries.
 302      */
 303     MMCO mmco[MAX_MMCO_COUNT];
 304     int mmco_index;
 305
 306     int long_ref_count;  ///< number of actual long term references
 307     int short_ref_count; ///< number of actual short term references
 308
 309     //data partitioning
 310     GetBitContext intra_gb;
 311     GetBitContext inter_gb;
 312     GetBitContext *intra_gb_ptr;
 313     GetBitContext *inter_gb_ptr;
 314
 315     DCTELEM mb[16*24] __align8;
 316
 317     /**
 318      * Cabac
 319      */
 320     CABACContext cabac;
 321     uint8_t      cabac_state[399];
 322     int          cabac_init_idc;
 323
 324     /* 0x100 -> non null luma_dc, 0x80/0x40 -> non null chroma_dc (cb/cr), 0x?0 -> chroma_cbp(0,1,2), 0x0? luma_cbp */
 325     uint16_t     *cbp_table;
     // cbp of the top / left neighbour; fill_caches() sets these only when pps.cabac is set.
 326     int top_cbp;
 327     int left_cbp;
 328     /* chroma_pred_mode for i4x4 or i16x16, else 0 */
 329     uint8_t     *chroma_pred_mode_table;
 330     int         last_qscale_diff;
 331     int16_t     (*mvd_table[2])[2];
 332     int16_t     mvd_cache[2][5*8][2] __align8;
 333     uint8_t     *direct_table;
 334     uint8_t     direct_cache[5*8];
 335
 336 }H264Context;
 337
 // File-local VLC tables. Each has a matching *_VLC_BITS constant defined
 // near the top of the file; they are filled in elsewhere in the file
 // (the initialisation is not part of the visible section).
 338 static VLC coeff_token_vlc[4];
 339 static VLC chroma_dc_coeff_token_vlc;
 340
 341 static VLC total_zeros_vlc[15];
 342 static VLC chroma_dc_total_zeros_vlc[3];
 343
 344 static VLC run_vlc[6];
 345 static VLC run7_vlc;
 346
 // Forward declarations of file-local functions that are used before their
 // definitions appear.
 347 static void svq3_luma_dc_dequant_idct_c(DCTELEM *block, int qp);
 348 static void svq3_add_idct_c(uint8_t *dst, DCTELEM *block, int stride, int qp, int dc);
 349 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize);
 350
 /**
  * Pack two 16 bit values into one 32 bit word.
  *
  * Without WORDS_BIGENDIAN, a ends up in the low 16 bits and b in the high
  * 16 bits; with WORDS_BIGENDIAN the two halves are swapped.
  * Only the low 16 bits of each argument are used. The arithmetic is done in
  * uint32_t so that negative arguments (and values >= 0x8000 in the upper
  * half) wrap in the usual two's complement way instead of left-shifting a
  * signed int, which is undefined.
  *
  * @param a first value
  * @param b second value
  * @return the packed 32 bit word
  */
 static inline uint32_t pack16to32(int a, int b){
 #ifdef WORDS_BIGENDIAN
    return ((uint32_t)b&0xFFFF) + ((uint32_t)a<<16);
 #else
    return ((uint32_t)a&0xFFFF) + ((uint32_t)b<<16);
 #endif
 }
 358
 359 /**
 360  * fill a rectangle.
 361  * @param h height of the rectangle, should be a constant
 362  * @param w width of the rectangle, should be a constant
 363  * @param size the size of val (1 or 4), should be a constant
 364  */
 365 static inline void fill_rectangle(void *vp, int w, int h, int stride, uint32_t val, int size){ //FIXME ensure this IS inlined
 366     uint8_t *p= (uint8_t*)vp;
 367     assert(size==1 || size==4);
 368
 369     w      *= size;
 370     stride *= size;
 371
 372     assert((((int)vp)&(FFMIN(w, STRIDE_ALIGN)-1)) == 0);
 373 //FIXME check what gcc generates for 64 bit on x86 and possibly write a 32 bit ver of it
 374     if(w==2 && h==2){
 375         *(uint16_t*)(p + 0)=
 376         *(uint16_t*)(p + stride)= size==4 ? val : val*0x0101;
 377     }else if(w==2 && h==4){
 378         *(uint16_t*)(p + 0*stride)=
 379         *(uint16_t*)(p + 1*stride)=
 380         *(uint16_t*)(p + 2*stride)=
 381         *(uint16_t*)(p + 3*stride)= size==4 ? val : val*0x0101;
 382     }else if(w==4 && h==1){
 383         *(uint32_t*)(p + 0*stride)= size==4 ? val : val*0x01010101;
 384     }else if(w==4 && h==2){
 385         *(uint32_t*)(p + 0*stride)=
 386         *(uint32_t*)(p + 1*stride)= size==4 ? val : val*0x01010101;
 387     }else if(w==4 && h==4){
 388         *(uint32_t*)(p + 0*stride)=
 389         *(uint32_t*)(p + 1*stride)=
 390         *(uint32_t*)(p + 2*stride)=
 391         *(uint32_t*)(p + 3*stride)= size==4 ? val : val*0x01010101;
 392     }else if(w==8 && h==1){
 393         *(uint32_t*)(p + 0)=
 394         *(uint32_t*)(p + 4)= size==4 ? val : val*0x01010101;
 395     }else if(w==8 && h==2){
 396         *(uint32_t*)(p + 0 + 0*stride)=
 397         *(uint32_t*)(p + 4 + 0*stride)=
 398         *(uint32_t*)(p + 0 + 1*stride)=
 399         *(uint32_t*)(p + 4 + 1*stride)=  size==4 ? val : val*0x01010101;
 400     }else if(w==8 && h==4){
 401         *(uint64_t*)(p + 0*stride)=
 402         *(uint64_t*)(p + 1*stride)=
 403         *(uint64_t*)(p + 2*stride)=
 404         *(uint64_t*)(p + 3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 405     }else if(w==16 && h==2){
 406         *(uint64_t*)(p + 0+0*stride)=
 407         *(uint64_t*)(p + 8+0*stride)=
 408         *(uint64_t*)(p + 0+1*stride)=
 409         *(uint64_t*)(p + 8+1*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 410     }else if(w==16 && h==4){
 411         *(uint64_t*)(p + 0+0*stride)=
 412         *(uint64_t*)(p + 8+0*stride)=
 413         *(uint64_t*)(p + 0+1*stride)=
 414         *(uint64_t*)(p + 8+1*stride)=
 415         *(uint64_t*)(p + 0+2*stride)=
 416         *(uint64_t*)(p + 8+2*stride)=
 417         *(uint64_t*)(p + 0+3*stride)=
 418         *(uint64_t*)(p + 8+3*stride)= size==4 ? val*0x0100000001ULL : val*0x0101010101010101ULL;
 419     }else
 420         assert(0);
 421 }
 422
 423 static inline void fill_caches(H264Context *h, int mb_type, int for_deblock){
 424     MpegEncContext * const s = &h->s;
 425     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 426     int topleft_xy, top_xy, topright_xy, left_xy[2];
 427     int topleft_type, top_type, topright_type, left_type[2];
 428     int left_block[8];
 429     int i;
 430
 431     //FIXME deblocking can skip fill_caches much of the time with multiple slices too.
 432     // the actual condition is whether we're on the edge of a slice,
 433     // and even then the intra and nnz parts are unnecessary.
 434     if(for_deblock && h->slice_num == 1)
 435         return;
 436
 437     //wow what a mess, why didn't they simplify the interlacing&intra stuff, i can't imagine that these complex rules are worth it
 438
 439     top_xy     = mb_xy  - s->mb_stride;
 440     topleft_xy = top_xy - 1;
 441     topright_xy= top_xy + 1;
 442     left_xy[1] = left_xy[0] = mb_xy-1;
 443     left_block[0]= 0;
 444     left_block[1]= 1;
 445     left_block[2]= 2;
 446     left_block[3]= 3;
 447     left_block[4]= 7;
 448     left_block[5]= 10;
 449     left_block[6]= 8;
 450     left_block[7]= 11;
 451     if(h->mb_aff_frame){
 452         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
 453         const int top_pair_xy      = pair_xy     - s->mb_stride;
 454         const int topleft_pair_xy  = top_pair_xy - 1;
 455         const int topright_pair_xy = top_pair_xy + 1;
 456         const int topleft_mb_frame_flag  = !IS_INTERLACED(s->current_picture.mb_type[topleft_pair_xy]);
 457         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
 458         const int topright_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[topright_pair_xy]);
 459         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
 460         const int curr_mb_frame_flag = !IS_INTERLACED(mb_type);
 461         const int bottom = (s->mb_y & 1);
 462         tprintf("fill_caches: curr_mb_frame_flag:%d, left_mb_frame_flag:%d, topleft_mb_frame_flag:%d, top_mb_frame_flag:%d, topright_mb_frame_flag:%d\n", curr_mb_frame_flag, left_mb_frame_flag, topleft_mb_frame_flag, top_mb_frame_flag, topright_mb_frame_flag);
 463         if (bottom
 464                 ? !curr_mb_frame_flag // bottom macroblock
 465                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
 466                 ) {
 467             top_xy -= s->mb_stride;
 468         }
 469         if (bottom
 470                 ? !curr_mb_frame_flag // bottom macroblock
 471                 : (!curr_mb_frame_flag && !topleft_mb_frame_flag) // top macroblock
 472                 ) {
 473             topleft_xy -= s->mb_stride;
 474         }
 475         if (bottom
 476                 ? !curr_mb_frame_flag // bottom macroblock
 477                 : (!curr_mb_frame_flag && !topright_mb_frame_flag) // top macroblock
 478                 ) {
 479             topright_xy -= s->mb_stride;
 480         }
 481         if (left_mb_frame_flag != curr_mb_frame_flag) {
 482             left_xy[1] = left_xy[0] = pair_xy - 1;
 483             if (curr_mb_frame_flag) {
 484                 if (bottom) {
 485                     left_block[0]= 2;
 486                     left_block[1]= 2;
 487                     left_block[2]= 3;
 488                     left_block[3]= 3;
 489                     left_block[4]= 8;
 490                     left_block[5]= 11;
 491                     left_block[6]= 8;
 492                     left_block[7]= 11;
 493                 } else {
 494                     left_block[0]= 0;
 495                     left_block[1]= 0;
 496                     left_block[2]= 1;
 497                     left_block[3]= 1;
 498                     left_block[4]= 7;
 499                     left_block[5]= 10;
 500                     left_block[6]= 7;
 501                     left_block[7]= 10;
 502                 }
 503             } else {
 504                 left_xy[1] += s->mb_stride;
 505                 //left_block[0]= 0;
 506                 left_block[1]= 2;
 507                 left_block[2]= 0;
 508                 left_block[3]= 2;
 509                 //left_block[4]= 7;
 510                 left_block[5]= 10;
 511                 left_block[6]= 7;
 512                 left_block[7]= 10;
 513             }
 514         }
 515     }
 516
 517     h->top_mb_xy = top_xy;
 518     h->left_mb_xy[0] = left_xy[0];
 519     h->left_mb_xy[1] = left_xy[1];
 520     if(for_deblock){
 521         topleft_type = h->slice_table[topleft_xy ] < 255 ? s->current_picture.mb_type[topleft_xy] : 0;
 522         top_type     = h->slice_table[top_xy     ] < 255 ? s->current_picture.mb_type[top_xy]     : 0;
 523         topright_type= h->slice_table[topright_xy] < 255 ? s->current_picture.mb_type[topright_xy]: 0;
 524         left_type[0] = h->slice_table[left_xy[0] ] < 255 ? s->current_picture.mb_type[left_xy[0]] : 0;
 525         left_type[1] = h->slice_table[left_xy[1] ] < 255 ? s->current_picture.mb_type[left_xy[1]] : 0;
 526     }else{
 527         topleft_type = h->slice_table[topleft_xy ] == h->slice_num ? s->current_picture.mb_type[topleft_xy] : 0;
 528         top_type     = h->slice_table[top_xy     ] == h->slice_num ? s->current_picture.mb_type[top_xy]     : 0;
 529         topright_type= h->slice_table[topright_xy] == h->slice_num ? s->current_picture.mb_type[topright_xy]: 0;
 530         left_type[0] = h->slice_table[left_xy[0] ] == h->slice_num ? s->current_picture.mb_type[left_xy[0]] : 0;
 531         left_type[1] = h->slice_table[left_xy[1] ] == h->slice_num ? s->current_picture.mb_type[left_xy[1]] : 0;
 532     }
 533
 534     if(IS_INTRA(mb_type)){
 535         h->topleft_samples_available=
 536         h->top_samples_available=
 537         h->left_samples_available= 0xFFFF;
 538         h->topright_samples_available= 0xEEEA;
 539
 540         if(!IS_INTRA(top_type) && (top_type==0 || h->pps.constrained_intra_pred)){
 541             h->topleft_samples_available= 0xB3FF;
 542             h->top_samples_available= 0x33FF;
 543             h->topright_samples_available= 0x26EA;
 544         }
 545         for(i=0; i<2; i++){
 546             if(!IS_INTRA(left_type[i]) && (left_type[i]==0 || h->pps.constrained_intra_pred)){
 547                 h->topleft_samples_available&= 0xDF5F;
 548                 h->left_samples_available&= 0x5F5F;
 549             }
 550         }
 551
 552         if(!IS_INTRA(topleft_type) && (topleft_type==0 || h->pps.constrained_intra_pred))
 553             h->topleft_samples_available&= 0x7FFF;
 554
 555         if(!IS_INTRA(topright_type) && (topright_type==0 || h->pps.constrained_intra_pred))
 556             h->topright_samples_available&= 0xFBFF;
 557
 558         if(IS_INTRA4x4(mb_type)){
 559             if(IS_INTRA4x4(top_type)){
 560                 h->intra4x4_pred_mode_cache[4+8*0]= h->intra4x4_pred_mode[top_xy][4];
 561                 h->intra4x4_pred_mode_cache[5+8*0]= h->intra4x4_pred_mode[top_xy][5];
 562                 h->intra4x4_pred_mode_cache[6+8*0]= h->intra4x4_pred_mode[top_xy][6];
 563                 h->intra4x4_pred_mode_cache[7+8*0]= h->intra4x4_pred_mode[top_xy][3];
 564             }else{
 565                 int pred;
 566                 if(!top_type || (IS_INTER(top_type) && h->pps.constrained_intra_pred))
 567                     pred= -1;
 568                 else{
 569                     pred= 2;
 570                 }
 571                 h->intra4x4_pred_mode_cache[4+8*0]=
 572                 h->intra4x4_pred_mode_cache[5+8*0]=
 573                 h->intra4x4_pred_mode_cache[6+8*0]=
 574                 h->intra4x4_pred_mode_cache[7+8*0]= pred;
 575             }
 576             for(i=0; i<2; i++){
 577                 if(IS_INTRA4x4(left_type[i])){
 578                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[0+2*i]];
 579                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= h->intra4x4_pred_mode[left_xy[i]][left_block[1+2*i]];
 580                 }else{
 581                     int pred;
 582                     if(!left_type[i] || (IS_INTER(left_type[i]) && h->pps.constrained_intra_pred))
 583                         pred= -1;
 584                     else{
 585                         pred= 2;
 586                     }
 587                     h->intra4x4_pred_mode_cache[3+8*1 + 2*8*i]=
 588                     h->intra4x4_pred_mode_cache[3+8*2 + 2*8*i]= pred;
 589                 }
 590             }
 591         }
 592     }
 593
 594
 595 /*
 596 0 . T T. T T T T
 597 1 L . .L . . . .
 598 2 L . .L . . . .
 599 3 . T TL . . . .
 600 4 L . .L . . . .
 601 5 L . .. . . . .
 602 */
 603 //FIXME constraint_intra_pred & partitioning & nnz (lets hope this is just a typo in the spec)
 604     if(top_type){
 605         h->non_zero_count_cache[4+8*0]= h->non_zero_count[top_xy][4];
 606         h->non_zero_count_cache[5+8*0]= h->non_zero_count[top_xy][5];
 607         h->non_zero_count_cache[6+8*0]= h->non_zero_count[top_xy][6];
 608         h->non_zero_count_cache[7+8*0]= h->non_zero_count[top_xy][3];
 609
 610         h->non_zero_count_cache[1+8*0]= h->non_zero_count[top_xy][9];
 611         h->non_zero_count_cache[2+8*0]= h->non_zero_count[top_xy][8];
 612
 613         h->non_zero_count_cache[1+8*3]= h->non_zero_count[top_xy][12];
 614         h->non_zero_count_cache[2+8*3]= h->non_zero_count[top_xy][11];
 615
 616     }else{
 617         h->non_zero_count_cache[4+8*0]=
 618         h->non_zero_count_cache[5+8*0]=
 619         h->non_zero_count_cache[6+8*0]=
 620         h->non_zero_count_cache[7+8*0]=
 621
 622         h->non_zero_count_cache[1+8*0]=
 623         h->non_zero_count_cache[2+8*0]=
 624
 625         h->non_zero_count_cache[1+8*3]=
 626         h->non_zero_count_cache[2+8*3]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 627
 628     }
 629
 630     for (i=0; i<2; i++) {
 631         if(left_type[i]){
 632             h->non_zero_count_cache[3+8*1 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[0+2*i]];
 633             h->non_zero_count_cache[3+8*2 + 2*8*i]= h->non_zero_count[left_xy[i]][left_block[1+2*i]];
 634             h->non_zero_count_cache[0+8*1 +   8*i]= h->non_zero_count[left_xy[i]][left_block[4+2*i]];
 635             h->non_zero_count_cache[0+8*4 +   8*i]= h->non_zero_count[left_xy[i]][left_block[5+2*i]];
 636         }else{
 637             h->non_zero_count_cache[3+8*1 + 2*8*i]=
 638             h->non_zero_count_cache[3+8*2 + 2*8*i]=
 639             h->non_zero_count_cache[0+8*1 +   8*i]=
 640             h->non_zero_count_cache[0+8*4 +   8*i]= h->pps.cabac && !IS_INTRA(mb_type) ? 0 : 64;
 641         }
 642     }
 643
 644     if( h->pps.cabac ) {
 645         // top_cbp
 646         if(top_type) {
 647             h->top_cbp = h->cbp_table[top_xy];
 648         } else if(IS_INTRA(mb_type)) {
 649             h->top_cbp = 0x1C0;
 650         } else {
 651             h->top_cbp = 0;
 652         }
 653         // left_cbp
 654         if (left_type[0]) {
 655             h->left_cbp = h->cbp_table[left_xy[0]] & 0x1f0;
 656         } else if(IS_INTRA(mb_type)) {
 657             h->left_cbp = 0x1C0;
 658         } else {
 659             h->left_cbp = 0;
 660         }
 661         if (left_type[0]) {
 662             h->left_cbp |= ((h->cbp_table[left_xy[0]]>>((left_block[0]&(~1))+1))&0x1) << 1;
 663         }
 664         if (left_type[1]) {
 665             h->left_cbp |= ((h->cbp_table[left_xy[1]]>>((left_block[2]&(~1))+1))&0x1) << 3;
 666         }
 667     }
 668
 669 #if 1
 670     //FIXME direct mb can skip much of this
 671     if(IS_INTER(mb_type) || IS_DIRECT(mb_type)){
 672         int list;
 673         for(list=0; list<1+(h->slice_type==B_TYPE); list++){
 674             if(!USES_LIST(mb_type, list) && !IS_DIRECT(mb_type) && !h->deblocking_filter){
 675                 /*if(!h->mv_cache_clean[list]){
 676                     memset(h->mv_cache [list],  0, 8*5*2*sizeof(int16_t)); //FIXME clean only input? clean at all?
 677                     memset(h->ref_cache[list], PART_NOT_AVAILABLE, 8*5*sizeof(int8_t));
 678                     h->mv_cache_clean[list]= 1;
 679                 }*/
 680                 continue;
 681             }
 682             h->mv_cache_clean[list]= 0;
 683
 684             if(IS_INTER(top_type)){
 685                 const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 686                 const int b8_xy= h->mb2b8_xy[top_xy] + h->b8_stride;
 687                 *(uint32_t*)h->mv_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 0];
 688                 *(uint32_t*)h->mv_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 1];
 689                 *(uint32_t*)h->mv_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 2];
 690                 *(uint32_t*)h->mv_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + 3];
 691                 h->ref_cache[list][scan8[0] + 0 - 1*8]=
 692                 h->ref_cache[list][scan8[0] + 1 - 1*8]= s->current_picture.ref_index[list][b8_xy + 0];
 693                 h->ref_cache[list][scan8[0] + 2 - 1*8]=
 694                 h->ref_cache[list][scan8[0] + 3 - 1*8]= s->current_picture.ref_index[list][b8_xy + 1];
 695             }else{
 696                 *(uint32_t*)h->mv_cache [list][scan8[0] + 0 - 1*8]=
 697                 *(uint32_t*)h->mv_cache [list][scan8[0] + 1 - 1*8]=
 698                 *(uint32_t*)h->mv_cache [list][scan8[0] + 2 - 1*8]=
 699                 *(uint32_t*)h->mv_cache [list][scan8[0] + 3 - 1*8]= 0;
 700                 *(uint32_t*)&h->ref_cache[list][scan8[0] + 0 - 1*8]= ((top_type ? LIST_NOT_USED : PART_NOT_AVAILABLE)&0xFF)*0x01010101;
 701             }
 702
 703             //FIXME unify cleanup or sth
 704             if(IS_INTER(left_type[0])){
 705                 const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 706                 const int b8_xy= h->mb2b8_xy[left_xy[0]] + 1;
 707                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[0]];
 708                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[1]];
 709                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 710                 h->ref_cache[list][scan8[0] - 1 + 1*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[0]>>1)];
 711             }else{
 712                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 0*8]=
 713                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 1*8]= 0;
 714                 h->ref_cache[list][scan8[0] - 1 + 0*8]=
 715                 h->ref_cache[list][scan8[0] - 1 + 1*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 716             }
 717
 718             if(IS_INTER(left_type[1])){
 719                 const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 720                 const int b8_xy= h->mb2b8_xy[left_xy[1]] + 1;
 721                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[2]];
 722                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy + h->b_stride*left_block[3]];
 723                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 724                 h->ref_cache[list][scan8[0] - 1 + 3*8]= s->current_picture.ref_index[list][b8_xy + h->b8_stride*(left_block[2]>>1)];
 725             }else{
 726                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 2*8]=
 727                 *(uint32_t*)h->mv_cache [list][scan8[0] - 1 + 3*8]= 0;
 728                 h->ref_cache[list][scan8[0] - 1 + 2*8]=
 729                 h->ref_cache[list][scan8[0] - 1 + 3*8]= left_type[0] ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 730             }
 731
 732             if(for_deblock || (IS_DIRECT(mb_type) && !h->direct_spatial_mv_pred))
 733                 continue;
 734
 735             if(IS_INTER(topleft_type)){
 736                 const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 737                 const int b8_xy= h->mb2b8_xy[topleft_xy] + 1 + h->b8_stride;
 738                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 739                 h->ref_cache[list][scan8[0] - 1 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 740             }else{
 741                 *(uint32_t*)h->mv_cache[list][scan8[0] - 1 - 1*8]= 0;
 742                 h->ref_cache[list][scan8[0] - 1 - 1*8]= topleft_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 743             }
 744
 745             if(IS_INTER(topright_type)){
 746                 const int b_xy= h->mb2b_xy[topright_xy] + 3*h->b_stride;
 747                 const int b8_xy= h->mb2b8_xy[topright_xy] + h->b8_stride;
 748                 *(uint32_t*)h->mv_cache[list][scan8[0] + 4 - 1*8]= *(uint32_t*)s->current_picture.motion_val[list][b_xy];
 749                 h->ref_cache[list][scan8[0] + 4 - 1*8]= s->current_picture.ref_index[list][b8_xy];
 750             }else{
 751                 *(uint32_t*)h->mv_cache [list][scan8[0] + 4 - 1*8]= 0;
 752                 h->ref_cache[list][scan8[0] + 4 - 1*8]= topright_type ? LIST_NOT_USED : PART_NOT_AVAILABLE;
 753             }
 754
 755
 756             h->ref_cache[list][scan8[5 ]+1] =
 757             h->ref_cache[list][scan8[7 ]+1] =
 758             h->ref_cache[list][scan8[13]+1] =  //FIXME remove past 3 (init somewhere else)
 759             h->ref_cache[list][scan8[4 ]] =
 760             h->ref_cache[list][scan8[12]] = PART_NOT_AVAILABLE;
 761             *(uint32_t*)h->mv_cache [list][scan8[5 ]+1]=
 762             *(uint32_t*)h->mv_cache [list][scan8[7 ]+1]=
 763             *(uint32_t*)h->mv_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 764             *(uint32_t*)h->mv_cache [list][scan8[4 ]]=
 765             *(uint32_t*)h->mv_cache [list][scan8[12]]= 0;
 766
 767             if( h->pps.cabac ) {
 768                 /* XXX beurk, Load mvd */
 769                 if(IS_INTER(topleft_type)){
 770                     const int b_xy = h->mb2b_xy[topleft_xy] + 3 + 3*h->b_stride;
 771                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy];
 772                 }else{
 773                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 - 1*8]= 0;
 774                 }
 775
 776                 if(IS_INTER(top_type)){
 777                     const int b_xy= h->mb2b_xy[top_xy] + 3*h->b_stride;
 778                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 0 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 0];
 779                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 1 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 1];
 780                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 2 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 2];
 781                     *(uint32_t*)h->mvd_cache[list][scan8[0] + 3 - 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + 3];
 782                 }else{
 783                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 0 - 1*8]=
 784                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 1 - 1*8]=
 785                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 2 - 1*8]=
 786                     *(uint32_t*)h->mvd_cache [list][scan8[0] + 3 - 1*8]= 0;
 787                 }
 788                 if(IS_INTER(left_type[0])){
 789                     const int b_xy= h->mb2b_xy[left_xy[0]] + 3;
 790                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 0*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[0]];
 791                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 1*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[1]];
 792                 }else{
 793                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 0*8]=
 794                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 1*8]= 0;
 795                 }
 796                 if(IS_INTER(left_type[1])){
 797                     const int b_xy= h->mb2b_xy[left_xy[1]] + 3;
 798                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 2*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[2]];
 799                     *(uint32_t*)h->mvd_cache[list][scan8[0] - 1 + 3*8]= *(uint32_t*)h->mvd_table[list][b_xy + h->b_stride*left_block[3]];
 800                 }else{
 801                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 2*8]=
 802                     *(uint32_t*)h->mvd_cache [list][scan8[0] - 1 + 3*8]= 0;
 803                 }
 804                 *(uint32_t*)h->mvd_cache [list][scan8[5 ]+1]=
 805                 *(uint32_t*)h->mvd_cache [list][scan8[7 ]+1]=
 806                 *(uint32_t*)h->mvd_cache [list][scan8[13]+1]= //FIXME remove past 3 (init somewhere else)
 807                 *(uint32_t*)h->mvd_cache [list][scan8[4 ]]=
 808                 *(uint32_t*)h->mvd_cache [list][scan8[12]]= 0;
 809
 810                 if(h->slice_type == B_TYPE){
 811                     fill_rectangle(&h->direct_cache[scan8[0]], 4, 4, 8, 0, 1);
 812
 813                     if(IS_DIRECT(top_type)){
 814                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0x01010101;
 815                     }else if(IS_8X8(top_type)){
 816                         int b8_xy = h->mb2b8_xy[top_xy] + h->b8_stride;
 817                         h->direct_cache[scan8[0] + 0 - 1*8]= h->direct_table[b8_xy];
 818                         h->direct_cache[scan8[0] + 2 - 1*8]= h->direct_table[b8_xy + 1];
 819                     }else{
 820                         *(uint32_t*)&h->direct_cache[scan8[0] - 1*8]= 0;
 821                     }
 822
 823                     //FIXME interlacing
 824                     if(IS_DIRECT(left_type[0])){
 825                         h->direct_cache[scan8[0] - 1 + 0*8]=
 826                         h->direct_cache[scan8[0] - 1 + 2*8]= 1;
 827                     }else if(IS_8X8(left_type[0])){
 828                         int b8_xy = h->mb2b8_xy[left_xy[0]] + 1;
 829                         h->direct_cache[scan8[0] - 1 + 0*8]= h->direct_table[b8_xy];
 830                         h->direct_cache[scan8[0] - 1 + 2*8]= h->direct_table[b8_xy + h->b8_stride];
 831                     }else{
 832                         h->direct_cache[scan8[0] - 1 + 0*8]=
 833                         h->direct_cache[scan8[0] - 1 + 2*8]= 0;
 834                     }
 835                 }
 836             }
 837         }
 838     }
 839 #endif
 840 }
 841
 842 static inline void write_back_intra_pred_mode(H264Context *h){
 843     MpegEncContext * const s = &h->s;
 844     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 845
 846     h->intra4x4_pred_mode[mb_xy][0]= h->intra4x4_pred_mode_cache[7+8*1];
 847     h->intra4x4_pred_mode[mb_xy][1]= h->intra4x4_pred_mode_cache[7+8*2];
 848     h->intra4x4_pred_mode[mb_xy][2]= h->intra4x4_pred_mode_cache[7+8*3];
 849     h->intra4x4_pred_mode[mb_xy][3]= h->intra4x4_pred_mode_cache[7+8*4];
 850     h->intra4x4_pred_mode[mb_xy][4]= h->intra4x4_pred_mode_cache[4+8*4];
 851     h->intra4x4_pred_mode[mb_xy][5]= h->intra4x4_pred_mode_cache[5+8*4];
 852     h->intra4x4_pred_mode[mb_xy][6]= h->intra4x4_pred_mode_cache[6+8*4];
 853 }
 854
 855 /**
 856  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 857  */
 858 static inline int check_intra4x4_pred_mode(H264Context *h){
 859     MpegEncContext * const s = &h->s;
 860     static const int8_t top [12]= {-1, 0,LEFT_DC_PRED,-1,-1,-1,-1,-1, 0};
 861     static const int8_t left[12]= { 0,-1, TOP_DC_PRED, 0,-1,-1,-1, 0,-1,DC_128_PRED};
 862     int i;
 863
 864     if(!(h->top_samples_available&0x8000)){
 865         for(i=0; i<4; i++){
 866             int status= top[ h->intra4x4_pred_mode_cache[scan8[0] + i] ];
 867             if(status<0){
 868                 av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 869                 return -1;
 870             } else if(status){
 871                 h->intra4x4_pred_mode_cache[scan8[0] + i]= status;
 872             }
 873         }
 874     }
 875
 876     if(!(h->left_samples_available&0x8000)){
 877         for(i=0; i<4; i++){
 878             int status= left[ h->intra4x4_pred_mode_cache[scan8[0] + 8*i] ];
 879             if(status<0){
 880                 av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra4x4 mode %d at %d %d\n", status, s->mb_x, s->mb_y);
 881                 return -1;
 882             } else if(status){
 883                 h->intra4x4_pred_mode_cache[scan8[0] + 8*i]= status;
 884             }
 885         }
 886     }
 887
 888     return 0;
 889 } //FIXME cleanup like next
 890
 891 /**
 892  * checks if the top & left blocks are available if needed & changes the dc mode so it only uses the available blocks.
 893  */
 894 static inline int check_intra_pred_mode(H264Context *h, int mode){
 895     MpegEncContext * const s = &h->s;
 896     static const int8_t top [7]= {LEFT_DC_PRED8x8, 1,-1,-1};
 897     static const int8_t left[7]= { TOP_DC_PRED8x8,-1, 2,-1,DC_128_PRED8x8};
 898
 899     if(mode < 0 || mode > 6) {
 900         av_log(h->s.avctx, AV_LOG_ERROR, "out of range intra chroma pred mode at %d %d\n", s->mb_x, s->mb_y);
 901         return -1;
 902     }
 903
 904     if(!(h->top_samples_available&0x8000)){
 905         mode= top[ mode ];
 906         if(mode<0){
 907             av_log(h->s.avctx, AV_LOG_ERROR, "top block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 908             return -1;
 909         }
 910     }
 911
 912     if(!(h->left_samples_available&0x8000)){
 913         mode= left[ mode ];
 914         if(mode<0){
 915             av_log(h->s.avctx, AV_LOG_ERROR, "left block unavailable for requested intra mode at %d %d\n", s->mb_x, s->mb_y);
 916             return -1;
 917         }
 918     }
 919
 920     return mode;
 921 }
 922
 923 /**
 924  * gets the predicted intra4x4 prediction mode.
 925  */
 926 static inline int pred_intra_mode(H264Context *h, int n){
 927     const int index8= scan8[n];
 928     const int left= h->intra4x4_pred_mode_cache[index8 - 1];
 929     const int top = h->intra4x4_pred_mode_cache[index8 - 8];
 930     const int min= FFMIN(left, top);
 931
 932     tprintf("mode:%d %d min:%d\n", left ,top, min);
 933
 934     if(min<0) return DC_PRED;
 935     else      return min;
 936 }
 937
 938 static inline void write_back_non_zero_count(H264Context *h){
 939     MpegEncContext * const s = &h->s;
 940     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
 941
 942     h->non_zero_count[mb_xy][0]= h->non_zero_count_cache[7+8*1];
 943     h->non_zero_count[mb_xy][1]= h->non_zero_count_cache[7+8*2];
 944     h->non_zero_count[mb_xy][2]= h->non_zero_count_cache[7+8*3];
 945     h->non_zero_count[mb_xy][3]= h->non_zero_count_cache[7+8*4];
 946     h->non_zero_count[mb_xy][4]= h->non_zero_count_cache[4+8*4];
 947     h->non_zero_count[mb_xy][5]= h->non_zero_count_cache[5+8*4];
 948     h->non_zero_count[mb_xy][6]= h->non_zero_count_cache[6+8*4];
 949
 950     h->non_zero_count[mb_xy][9]= h->non_zero_count_cache[1+8*2];
 951     h->non_zero_count[mb_xy][8]= h->non_zero_count_cache[2+8*2];
 952     h->non_zero_count[mb_xy][7]= h->non_zero_count_cache[2+8*1];
 953
 954     h->non_zero_count[mb_xy][12]=h->non_zero_count_cache[1+8*5];
 955     h->non_zero_count[mb_xy][11]=h->non_zero_count_cache[2+8*5];
 956     h->non_zero_count[mb_xy][10]=h->non_zero_count_cache[2+8*4];
 957 }
 958
 959 /**
 960  * gets the predicted number of non zero coefficients.
 961  * @param n block index
 962  */
 963 static inline int pred_non_zero_count(H264Context *h, int n){
 964     const int index8= scan8[n];
 965     const int left= h->non_zero_count_cache[index8 - 1];
 966     const int top = h->non_zero_count_cache[index8 - 8];
 967     int i= left + top;
 968
 969     if(i<64) i= (i+1)>>1;
 970
 971     tprintf("pred_nnz L%X T%X n%d s%d P%X\n", left, top, n, scan8[n], i&31);
 972
 973     return i&31;
 974 }
 975
 976 static inline int fetch_diagonal_mv(H264Context *h, const int16_t **C, int i, int list, int part_width){
 977     const int topright_ref= h->ref_cache[list][ i - 8 + part_width ];
 978
 979     if(topright_ref != PART_NOT_AVAILABLE){
 980         *C= h->mv_cache[list][ i - 8 + part_width ];
 981         return topright_ref;
 982     }else{
 983         tprintf("topright MV not available\n");
 984
 985         *C= h->mv_cache[list][ i - 8 - 1 ];
 986         return h->ref_cache[list][ i - 8 - 1 ];
 987     }
 988 }
 989
 990 /**
 991  * gets the predicted MV.
 992  * @param n the block index
 993  * @param part_width the width of the partition (4, 8,16) -> (1, 2, 4)
 994  * @param mx the x component of the predicted motion vector
 995  * @param my the y component of the predicted motion vector
 996  */
 997 static inline void pred_motion(H264Context * const h, int n, int part_width, int list, int ref, int * const mx, int * const my){
 998     const int index8= scan8[n];
 999     const int top_ref=      h->ref_cache[list][ index8 - 8 ];
1000     const int left_ref=     h->ref_cache[list][ index8 - 1 ];
1001     const int16_t * const A= h->mv_cache[list][ index8 - 1 ];
1002     const int16_t * const B= h->mv_cache[list][ index8 - 8 ];
1003     const int16_t * C;
1004     int diagonal_ref, match_count;
1005
1006     assert(part_width==1 || part_width==2 || part_width==4);
1007
1008 /* mv_cache
1009   B . . A T T T T
1010   U . . L . . , .
1011   U . . L . . . .
1012   U . . L . . , .
1013   . . . L . . . .
1014 */
1015
1016     diagonal_ref= fetch_diagonal_mv(h, &C, index8, list, part_width);
1017     match_count= (diagonal_ref==ref) + (top_ref==ref) + (left_ref==ref);
1018     tprintf("pred_motion match_count=%d\n", match_count);
1019     if(match_count > 1){ //most common
1020         *mx= mid_pred(A[0], B[0], C[0]);
1021         *my= mid_pred(A[1], B[1], C[1]);
1022     }else if(match_count==1){
1023         if(left_ref==ref){
1024             *mx= A[0];
1025             *my= A[1];
1026         }else if(top_ref==ref){
1027             *mx= B[0];
1028             *my= B[1];
1029         }else{
1030             *mx= C[0];
1031             *my= C[1];
1032         }
1033     }else{
1034         if(top_ref == PART_NOT_AVAILABLE && diagonal_ref == PART_NOT_AVAILABLE && left_ref != PART_NOT_AVAILABLE){
1035             *mx= A[0];
1036             *my= A[1];
1037         }else{
1038             *mx= mid_pred(A[0], B[0], C[0]);
1039             *my= mid_pred(A[1], B[1], C[1]);
1040         }
1041     }
1042
1043     tprintf("pred_motion (%2d %2d %2d) (%2d %2d %2d) (%2d %2d %2d) -> (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1],                    diagonal_ref, C[0], C[1], left_ref, A[0], A[1], ref, *mx, *my, h->s.mb_x, h->s.mb_y, n, list);
1044 }
1045
1046 /**
1047  * gets the directionally predicted 16x8 MV.
1048  * @param n the block index
1049  * @param mx the x component of the predicted motion vector
1050  * @param my the y component of the predicted motion vector
1051  */
1052 static inline void pred_16x8_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1053     if(n==0){
1054         const int top_ref=      h->ref_cache[list][ scan8[0] - 8 ];
1055         const int16_t * const B= h->mv_cache[list][ scan8[0] - 8 ];
1056
1057         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", top_ref, B[0], B[1], h->s.mb_x, h->s.mb_y, n, list);
1058
1059         if(top_ref == ref){
1060             *mx= B[0];
1061             *my= B[1];
1062             return;
1063         }
1064     }else{
1065         const int left_ref=     h->ref_cache[list][ scan8[8] - 1 ];
1066         const int16_t * const A= h->mv_cache[list][ scan8[8] - 1 ];
1067
1068         tprintf("pred_16x8: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1069
1070         if(left_ref == ref){
1071             *mx= A[0];
1072             *my= A[1];
1073             return;
1074         }
1075     }
1076
1077     //RARE
1078     pred_motion(h, n, 4, list, ref, mx, my);
1079 }
1080
1081 /**
1082  * gets the directionally predicted 8x16 MV.
1083  * @param n the block index
1084  * @param mx the x component of the predicted motion vector
1085  * @param my the y component of the predicted motion vector
1086  */
1087 static inline void pred_8x16_motion(H264Context * const h, int n, int list, int ref, int * const mx, int * const my){
1088     if(n==0){
1089         const int left_ref=      h->ref_cache[list][ scan8[0] - 1 ];
1090         const int16_t * const A=  h->mv_cache[list][ scan8[0] - 1 ];
1091
1092         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", left_ref, A[0], A[1], h->s.mb_x, h->s.mb_y, n, list);
1093
1094         if(left_ref == ref){
1095             *mx= A[0];
1096             *my= A[1];
1097             return;
1098         }
1099     }else{
1100         const int16_t * C;
1101         int diagonal_ref;
1102
1103         diagonal_ref= fetch_diagonal_mv(h, &C, scan8[4], list, 2);
1104
1105         tprintf("pred_8x16: (%2d %2d %2d) at %2d %2d %d list %d\n", diagonal_ref, C[0], C[1], h->s.mb_x, h->s.mb_y, n, list);
1106
1107         if(diagonal_ref == ref){
1108             *mx= C[0];
1109             *my= C[1];
1110             return;
1111         }
1112     }
1113
1114     //RARE
1115     pred_motion(h, n, 2, list, ref, mx, my);
1116 }
1117
1118 static inline void pred_pskip_motion(H264Context * const h, int * const mx, int * const my){
1119     const int top_ref = h->ref_cache[0][ scan8[0] - 8 ];
1120     const int left_ref= h->ref_cache[0][ scan8[0] - 1 ];
1121
1122     tprintf("pred_pskip: (%d) (%d) at %2d %2d\n", top_ref, left_ref, h->s.mb_x, h->s.mb_y);
1123
1124     if(top_ref == PART_NOT_AVAILABLE || left_ref == PART_NOT_AVAILABLE
1125        || (top_ref == 0  && *(uint32_t*)h->mv_cache[0][ scan8[0] - 8 ] == 0)
1126        || (left_ref == 0 && *(uint32_t*)h->mv_cache[0][ scan8[0] - 1 ] == 0)){
1127
1128         *mx = *my = 0;
1129         return;
1130     }
1131
1132     pred_motion(h, 0, 4, 0, 0, mx, my);
1133
1134     return;
1135 }
1136
1137 static inline void direct_dist_scale_factor(H264Context * const h){
1138     const int poc = h->s.current_picture_ptr->poc;
1139     const int poc1 = h->ref_list[1][0].poc;
1140     int i;
1141     for(i=0; i<h->ref_count[0]; i++){
1142         int poc0 = h->ref_list[0][i].poc;
1143         int td = clip(poc1 - poc0, -128, 127);
1144         if(td == 0 /* FIXME || pic0 is a long-term ref */){
1145             h->dist_scale_factor[i] = 256;
1146         }else{
1147             int tb = clip(poc - poc0, -128, 127);
1148             int tx = (16384 + (ABS(td) >> 1)) / td;
1149             h->dist_scale_factor[i] = clip((tb*tx + 32) >> 6, -1024, 1023);
1150         }
1151     }
1152 }
1153 static inline void direct_ref_list_init(H264Context * const h){
1154     MpegEncContext * const s = &h->s;
1155     Picture * const ref1 = &h->ref_list[1][0];
1156     Picture * const cur = s->current_picture_ptr;
1157     int list, i, j;
1158     if(cur->pict_type == I_TYPE)
1159         cur->ref_count[0] = 0;
1160     if(cur->pict_type != B_TYPE)
1161         cur->ref_count[1] = 0;
1162     for(list=0; list<2; list++){
1163         cur->ref_count[list] = h->ref_count[list];
1164         for(j=0; j<h->ref_count[list]; j++)
1165             cur->ref_poc[list][j] = h->ref_list[list][j].poc;
1166     }
1167     if(cur->pict_type != B_TYPE || h->direct_spatial_mv_pred)
1168         return;
1169     for(list=0; list<2; list++){
1170         for(i=0; i<ref1->ref_count[list]; i++){
1171             const int poc = ref1->ref_poc[list][i];
1172             h->map_col_to_list0[list][i] = PART_NOT_AVAILABLE;
1173             for(j=0; j<h->ref_count[list]; j++)
1174                 if(h->ref_list[list][j].poc == poc){
1175                     h->map_col_to_list0[list][i] = j;
1176                     break;
1177                 }
1178         }
1179     }
1180 }
1181
1182 static inline void pred_direct_motion(H264Context * const h, int *mb_type){
1183     MpegEncContext * const s = &h->s;
1184     const int mb_xy =   s->mb_x +   s->mb_y*s->mb_stride;
1185     const int b8_xy = 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1186     const int b4_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1187     const int mb_type_col = h->ref_list[1][0].mb_type[mb_xy];
1188     const int16_t (*l1mv0)[2] = (const int16_t (*)[2]) &h->ref_list[1][0].motion_val[0][b4_xy];
1189     const int8_t *l1ref0 = &h->ref_list[1][0].ref_index[0][b8_xy];
1190     const int8_t *l1ref1 = &h->ref_list[1][0].ref_index[1][b8_xy];
1191     const int is_b8x8 = IS_8X8(*mb_type);
1192     int sub_mb_type;
1193     int i8, i4;
1194
1195     if(IS_8X8(mb_type_col) && !h->sps.direct_8x8_inference_flag){
1196         /* FIXME save sub mb types from previous frames (or derive from MVs)
1197          * so we know exactly what block size to use */
1198         sub_mb_type = MB_TYPE_8x8|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_4x4 */
1199         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1200     }else if(!is_b8x8 && (IS_16X16(mb_type_col) || IS_INTRA(mb_type_col))){
1201         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1202         *mb_type =    MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_16x16 */
1203     }else{
1204         sub_mb_type = MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2; /* B_SUB_8x8 */
1205         *mb_type =    MB_TYPE_8x8|MB_TYPE_L0L1;
1206     }
1207     if(!is_b8x8)
1208         *mb_type |= MB_TYPE_DIRECT2;
1209
1210     tprintf("mb_type = %08x, sub_mb_type = %08x, is_b8x8 = %d, mb_type_col = %08x\n", *mb_type, sub_mb_type, is_b8x8, mb_type_col);
1211
1212     if(h->direct_spatial_mv_pred){
1213         int ref[2];
1214         int mv[2][2];
1215         int list;
1216
1217         /* ref = min(neighbors) */
1218         for(list=0; list<2; list++){
1219             int refa = h->ref_cache[list][scan8[0] - 1];
1220             int refb = h->ref_cache[list][scan8[0] - 8];
1221             int refc = h->ref_cache[list][scan8[0] - 8 + 4];
1222             if(refc == -2)
1223                 refc = h->ref_cache[list][scan8[0] - 8 - 1];
1224             ref[list] = refa;
1225             if(ref[list] < 0 || (refb < ref[list] && refb >= 0))
1226                 ref[list] = refb;
1227             if(ref[list] < 0 || (refc < ref[list] && refc >= 0))
1228                 ref[list] = refc;
1229             if(ref[list] < 0)
1230                 ref[list] = -1;
1231         }
1232
1233         if(ref[0] < 0 && ref[1] < 0){
1234             ref[0] = ref[1] = 0;
1235             mv[0][0] = mv[0][1] =
1236             mv[1][0] = mv[1][1] = 0;
1237         }else{
1238             for(list=0; list<2; list++){
1239                 if(ref[list] >= 0)
1240                     pred_motion(h, 0, 4, list, ref[list], &mv[list][0], &mv[list][1]);
1241                 else
1242                     mv[list][0] = mv[list][1] = 0;
1243             }
1244         }
1245
1246         if(ref[1] < 0){
1247             *mb_type &= ~MB_TYPE_P0L1;
1248             sub_mb_type &= ~MB_TYPE_P0L1;
1249         }else if(ref[0] < 0){
1250             *mb_type &= ~MB_TYPE_P0L0;
1251             sub_mb_type &= ~MB_TYPE_P0L0;
1252         }
1253
1254         if(IS_16X16(*mb_type)){
1255             fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref[0], 1);
1256             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, ref[1], 1);
1257             if(!IS_INTRA(mb_type_col) && l1ref0[0] == 0 &&
1258                 ABS(l1mv0[0][0]) <= 1 && ABS(l1mv0[0][1]) <= 1){
1259                 if(ref[0] > 0)
1260                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1261                 else
1262                     fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1263                 if(ref[1] > 0)
1264                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1265                 else
1266                     fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1267             }else{
1268                 fill_rectangle(&h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1269                 fill_rectangle(&h->mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1270             }
1271         }else{
1272             for(i8=0; i8<4; i8++){
1273                 const int x8 = i8&1;
1274                 const int y8 = i8>>1;
1275
1276                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1277                     continue;
1278                 h->sub_mb_type[i8] = sub_mb_type;
1279
1280                 fill_rectangle(&h->mv_cache[0][scan8[i8*4]], 2, 2, 8, pack16to32(mv[0][0],mv[0][1]), 4);
1281                 fill_rectangle(&h->mv_cache[1][scan8[i8*4]], 2, 2, 8, pack16to32(mv[1][0],mv[1][1]), 4);
1282                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref[0], 1);
1283                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, ref[1], 1);
1284
1285                 /* col_zero_flag */
1286                 if(!IS_INTRA(mb_type_col) && l1ref0[x8 + y8*h->b8_stride] == 0){
1287                     for(i4=0; i4<4; i4++){
1288                         const int16_t *mv_col = l1mv0[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1289                         if(ABS(mv_col[0]) <= 1 && ABS(mv_col[1]) <= 1){
1290                             if(ref[0] == 0)
1291                                 *(uint32_t*)h->mv_cache[0][scan8[i8*4+i4]] = 0;
1292                             if(ref[1] == 0)
1293                                 *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] = 0;
1294                         }
1295                     }
1296                 }
1297             }
1298         }
1299     }else{ /* direct temporal mv pred */
1300         if(IS_16X16(*mb_type)){
1301             fill_rectangle(&h->ref_cache[1][scan8[0]], 4, 4, 8, 0, 1);
1302             if(IS_INTRA(mb_type_col)){
1303                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
1304                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, 0, 4);
1305                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, 0, 4);
1306             }else{
1307                 const int ref0 = l1ref0[0] >= 0 ? h->map_col_to_list0[0][l1ref0[0]]
1308                                                 : h->map_col_to_list0[1][l1ref1[0]];
1309                 const int dist_scale_factor = h->dist_scale_factor[ref0];
1310                 const int16_t *mv_col = l1mv0[0];
1311                 int mv_l0[2];
1312                 mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1313                 mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1314                 fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, ref0, 1);
1315                 fill_rectangle(&h-> mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0],mv_l0[1]), 4);
1316                 fill_rectangle(&h-> mv_cache[1][scan8[0]], 4, 4, 8, pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]), 4);
1317             }
1318         }else{
1319             for(i8=0; i8<4; i8++){
1320                 const int x8 = i8&1;
1321                 const int y8 = i8>>1;
1322                 int ref0, dist_scale_factor;
1323
1324                 if(is_b8x8 && !IS_DIRECT(h->sub_mb_type[i8]))
1325                     continue;
1326                 h->sub_mb_type[i8] = sub_mb_type;
1327                 if(IS_INTRA(mb_type_col)){
1328                     fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, 0, 1);
1329                     fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1330                     fill_rectangle(&h-> mv_cache[0][scan8[i8*4]], 2, 2, 8, 0, 4);
1331                     fill_rectangle(&h-> mv_cache[1][scan8[i8*4]], 2, 2, 8, 0, 4);
1332                     continue;
1333                 }
1334
1335                 ref0 = l1ref0[x8 + y8*h->b8_stride];
1336                 if(ref0 >= 0)
1337                     ref0 = h->map_col_to_list0[0][ref0];
1338                 else
1339                     ref0 = h->map_col_to_list0[1][l1ref1[x8 + y8*h->b8_stride]];
1340                 dist_scale_factor = h->dist_scale_factor[ref0];
1341
1342                 fill_rectangle(&h->ref_cache[0][scan8[i8*4]], 2, 2, 8, ref0, 1);
1343                 fill_rectangle(&h->ref_cache[1][scan8[i8*4]], 2, 2, 8, 0, 1);
1344                 for(i4=0; i4<4; i4++){
1345                     const int16_t *mv_col = l1mv0[x8*2 + (i4&1) + (y8*2 + (i4>>1))*h->b_stride];
1346                     int16_t *mv_l0 = h->mv_cache[0][scan8[i8*4+i4]];
1347                     mv_l0[0] = (dist_scale_factor * mv_col[0] + 128) >> 8;
1348                     mv_l0[1] = (dist_scale_factor * mv_col[1] + 128) >> 8;
1349                     *(uint32_t*)h->mv_cache[1][scan8[i8*4+i4]] =
1350                         pack16to32(mv_l0[0]-mv_col[0],mv_l0[1]-mv_col[1]);
1351                 }
1352             }
1353         }
1354     }
1355 }
1356
1357 static inline void write_back_motion(H264Context *h, int mb_type){
1358     MpegEncContext * const s = &h->s;
1359     const int b_xy = 4*s->mb_x + 4*s->mb_y*h->b_stride;
1360     const int b8_xy= 2*s->mb_x + 2*s->mb_y*h->b8_stride;
1361     int list;
1362
1363     for(list=0; list<2; list++){
1364         int y;
1365         if(!USES_LIST(mb_type, list)){
1366             if(1){ //FIXME skip or never read if mb_type doesn't use it
1367                 for(y=0; y<4; y++){
1368                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]=
1369                     *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= 0;
1370                 }
1371                 if( h->pps.cabac ) {
1372                     /* FIXME needed ? */
1373                     for(y=0; y<4; y++){
1374                         *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]=
1375                         *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= 0;
1376                     }
1377                 }
1378                 for(y=0; y<2; y++){
1379                     *(uint16_t*)&s->current_picture.ref_index[list][b8_xy + y*h->b8_stride]= (LIST_NOT_USED&0xFF)*0x0101;
1380                 }
1381             }
1382             continue;
1383         }
1384
1385         for(y=0; y<4; y++){
1386             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+0 + 8*y];
1387             *(uint64_t*)s->current_picture.motion_val[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mv_cache[list][scan8[0]+2 + 8*y];
1388         }
1389         if( h->pps.cabac ) {
1390             for(y=0; y<4; y++){
1391                 *(uint64_t*)h->mvd_table[list][b_xy + 0 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+0 + 8*y];
1392                 *(uint64_t*)h->mvd_table[list][b_xy + 2 + y*h->b_stride]= *(uint64_t*)h->mvd_cache[list][scan8[0]+2 + 8*y];
1393             }
1394         }
1395         for(y=0; y<2; y++){
1396             s->current_picture.ref_index[list][b8_xy + 0 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+0 + 16*y];
1397             s->current_picture.ref_index[list][b8_xy + 1 + y*h->b8_stride]= h->ref_cache[list][scan8[0]+2 + 16*y];
1398         }
1399     }
1400
1401     if(h->slice_type == B_TYPE && h->pps.cabac){
1402         if(IS_8X8(mb_type)){
1403             h->direct_table[b8_xy+1+0*h->b8_stride] = IS_DIRECT(h->sub_mb_type[1]) ? 1 : 0;
1404             h->direct_table[b8_xy+0+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[2]) ? 1 : 0;
1405             h->direct_table[b8_xy+1+1*h->b8_stride] = IS_DIRECT(h->sub_mb_type[3]) ? 1 : 0;
1406         }
1407     }
1408 }
1409
1410 /**
1411  * Decodes a network abstraction layer unit.
1412  * @param consumed is the number of bytes used as input
1413  * @param length is the length of the array
1414  * @param dst_length is the number of decoded bytes FIXME here or a decode rbsp tailing?
1415  * @returns decoded bytes, might be src+1 if no escapes
1416  */
1417 static uint8_t *decode_nal(H264Context *h, uint8_t *src, int *dst_length, int *consumed, int length){
1418     int i, si, di;
1419     uint8_t *dst;
1420
1421 //    src[0]&0x80;              //forbidden bit
1422     h->nal_ref_idc= src[0]>>5;
1423     h->nal_unit_type= src[0]&0x1F;
1424
1425     src++; length--;
1426 #if 0
1427     for(i=0; i<length; i++)
1428         printf("%2X ", src[i]);
1429 #endif
1430     for(i=0; i+1<length; i+=2){
1431         if(src[i]) continue;
1432         if(i>0 && src[i-1]==0) i--;
1433         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1434             if(src[i+2]!=3){
1435                 /* startcode, so we must be past the end */
1436                 length=i;
1437             }
1438             break;
1439         }
1440     }
1441
1442     if(i>=length-1){ //no escaped 0
1443         *dst_length= length;
1444         *consumed= length+1; //+1 for the header
1445         return src;
1446     }
1447
1448     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length);
1449     dst= h->rbsp_buffer;
1450
1451 //printf("decoding esc\n");
1452     si=di=0;
1453     while(si<length){
1454         //remove escapes (very rare 1:2^22)
1455         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1456             if(src[si+2]==3){ //escape
1457                 dst[di++]= 0;
1458                 dst[di++]= 0;
1459                 si+=3;
1460                 continue;
1461             }else //next start code
1462                 break;
1463         }
1464
1465         dst[di++]= src[si++];
1466     }
1467
1468     *dst_length= di;
1469     *consumed= si + 1;//+1 for the header
1470 //FIXME store exact number of bits in the getbitcontext (its needed for decoding)
1471     return dst;
1472 }
1473
1474 #if 0
1475 /**
1476  * @param src the data which should be escaped
1477  * @param dst the target buffer, dst+1 == src is allowed as a special case
1478  * @param length the length of the src data
1479  * @param dst_length the length of the dst array
1480  * @returns length of escaped data in bytes or -1 if an error occured
1481  */
1482 static int encode_nal(H264Context *h, uint8_t *dst, uint8_t *src, int length, int dst_length){
1483     int i, escape_count, si, di;
1484     uint8_t *temp;
1485
1486     assert(length>=0);
1487     assert(dst_length>0);
1488
1489     dst[0]= (h->nal_ref_idc<<5) + h->nal_unit_type;
1490
1491     if(length==0) return 1;
1492
1493     escape_count= 0;
1494     for(i=0; i<length; i+=2){
1495         if(src[i]) continue;
1496         if(i>0 && src[i-1]==0)
1497             i--;
1498         if(i+2<length && src[i+1]==0 && src[i+2]<=3){
1499             escape_count++;
1500             i+=2;
1501         }
1502     }
1503
1504     if(escape_count==0){
1505         if(dst+1 != src)
1506             memcpy(dst+1, src, length);
1507         return length + 1;
1508     }
1509
1510     if(length + escape_count + 1> dst_length)
1511         return -1;
1512
1513     //this should be damn rare (hopefully)
1514
1515     h->rbsp_buffer= av_fast_realloc(h->rbsp_buffer, &h->rbsp_buffer_size, length + escape_count);
1516     temp= h->rbsp_buffer;
1517 //printf("encoding esc\n");
1518
1519     si= 0;
1520     di= 0;
1521     while(si < length){
1522         if(si+2<length && src[si]==0 && src[si+1]==0 && src[si+2]<=3){
1523             temp[di++]= 0; si++;
1524             temp[di++]= 0; si++;
1525             temp[di++]= 3;
1526             temp[di++]= src[si++];
1527         }
1528         else
1529             temp[di++]= src[si++];
1530     }
1531     memcpy(dst+1, temp, length+escape_count);
1532
1533     assert(di == length+escape_count);
1534
1535     return di + 1;
1536 }
1537
1538 /**
1539  * write 1,10,100,1000,... for alignment, yes its exactly inverse to mpeg4
1540  */
1541 static void encode_rbsp_trailing(PutBitContext *pb){
1542     int length;
1543     put_bits(pb, 1, 1);
1544     length= (-put_bits_count(pb))&7;
1545     if(length) put_bits(pb, length, 0);
1546 }
1547 #endif
1548
/**
 * Identifies the exact end of the bitstream by locating the lowest set bit
 * of the byte at src.
 * @param src the byte to inspect
 * @return the length of the trailing (1 when bit 0 is set, up to 8 when only
 *         bit 7 is set), or 0 if damaged (the byte is zero)
 */
static int decode_rbsp_trailing(uint8_t *src){
    int bits= *src;
    int len= 1;

    tprintf("rbsp trailing %X\n", bits);

    while(len <= 8){
        if(bits & 1)
            return len;
        bits >>= 1;
        len++;
    }
    return 0;
}
1565
1566 /**
1567  * idct tranforms the 16 dc values and dequantize them.
1568  * @param qp quantization parameter
1569  */
1570 static void h264_luma_dc_dequant_idct_c(DCTELEM *block, int qp){
1571     const int qmul= dequant_coeff[qp][0];
1572 #define stride 16
1573     int i;
1574     int temp[16]; //FIXME check if this is a good idea
1575     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1576     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1577
1578 //memset(block, 64, 2*256);
1579 //return;
1580     for(i=0; i<4; i++){
1581         const int offset= y_offset[i];
1582         const int z0= block[offset+stride*0] + block[offset+stride*4];
1583         const int z1= block[offset+stride*0] - block[offset+stride*4];
1584         const int z2= block[offset+stride*1] - block[offset+stride*5];
1585         const int z3= block[offset+stride*1] + block[offset+stride*5];
1586
1587         temp[4*i+0]= z0+z3;
1588         temp[4*i+1]= z1+z2;
1589         temp[4*i+2]= z1-z2;
1590         temp[4*i+3]= z0-z3;
1591     }
1592
1593     for(i=0; i<4; i++){
1594         const int offset= x_offset[i];
1595         const int z0= temp[4*0+i] + temp[4*2+i];
1596         const int z1= temp[4*0+i] - temp[4*2+i];
1597         const int z2= temp[4*1+i] - temp[4*3+i];
1598         const int z3= temp[4*1+i] + temp[4*3+i];
1599
1600         block[stride*0 +offset]= ((z0 + z3)*qmul + 2)>>2; //FIXME think about merging this into decode_resdual
1601         block[stride*2 +offset]= ((z1 + z2)*qmul + 2)>>2;
1602         block[stride*8 +offset]= ((z1 - z2)*qmul + 2)>>2;
1603         block[stride*10+offset]= ((z0 - z3)*qmul + 2)>>2;
1604     }
1605 }
1606
1607 #if 0
1608 /**
1609  * dct tranforms the 16 dc values.
1610  * @param qp quantization parameter ??? FIXME
1611  */
1612 static void h264_luma_dc_dct_c(DCTELEM *block/*, int qp*/){
1613 //    const int qmul= dequant_coeff[qp][0];
1614     int i;
1615     int temp[16]; //FIXME check if this is a good idea
1616     static const int x_offset[4]={0, 1*stride, 4* stride,  5*stride};
1617     static const int y_offset[4]={0, 2*stride, 8* stride, 10*stride};
1618
1619     for(i=0; i<4; i++){
1620         const int offset= y_offset[i];
1621         const int z0= block[offset+stride*0] + block[offset+stride*4];
1622         const int z1= block[offset+stride*0] - block[offset+stride*4];
1623         const int z2= block[offset+stride*1] - block[offset+stride*5];
1624         const int z3= block[offset+stride*1] + block[offset+stride*5];
1625
1626         temp[4*i+0]= z0+z3;
1627         temp[4*i+1]= z1+z2;
1628         temp[4*i+2]= z1-z2;
1629         temp[4*i+3]= z0-z3;
1630     }
1631
1632     for(i=0; i<4; i++){
1633         const int offset= x_offset[i];
1634         const int z0= temp[4*0+i] + temp[4*2+i];
1635         const int z1= temp[4*0+i] - temp[4*2+i];
1636         const int z2= temp[4*1+i] - temp[4*3+i];
1637         const int z3= temp[4*1+i] + temp[4*3+i];
1638
1639         block[stride*0 +offset]= (z0 + z3)>>1;
1640         block[stride*2 +offset]= (z1 + z2)>>1;
1641         block[stride*8 +offset]= (z1 - z2)>>1;
1642         block[stride*10+offset]= (z0 - z3)>>1;
1643     }
1644 }
1645 #endif
1646
1647 #undef xStride
1648 #undef stride
1649
1650 static void chroma_dc_dequant_idct_c(DCTELEM *block, int qp){
1651     const int qmul= dequant_coeff[qp][0];
1652     const int stride= 16*2;
1653     const int xStride= 16;
1654     int a,b,c,d,e;
1655
1656     a= block[stride*0 + xStride*0];
1657     b= block[stride*0 + xStride*1];
1658     c= block[stride*1 + xStride*0];
1659     d= block[stride*1 + xStride*1];
1660
1661     e= a-b;
1662     a= a+b;
1663     b= c-d;
1664     c= c+d;
1665
1666     block[stride*0 + xStride*0]= ((a+c)*qmul + 0)>>1;
1667     block[stride*0 + xStride*1]= ((e+b)*qmul + 0)>>1;
1668     block[stride*1 + xStride*0]= ((a-c)*qmul + 0)>>1;
1669     block[stride*1 + xStride*1]= ((e-b)*qmul + 0)>>1;
1670 }
1671
1672 #if 0
1673 static void chroma_dc_dct_c(DCTELEM *block){
1674     const int stride= 16*2;
1675     const int xStride= 16;
1676     int a,b,c,d,e;
1677
1678     a= block[stride*0 + xStride*0];
1679     b= block[stride*0 + xStride*1];
1680     c= block[stride*1 + xStride*0];
1681     d= block[stride*1 + xStride*1];
1682
1683     e= a-b;
1684     a= a+b;
1685     b= c-d;
1686     c= c+d;
1687
1688     block[stride*0 + xStride*0]= (a+c);
1689     block[stride*0 + xStride*1]= (e+b);
1690     block[stride*1 + xStride*0]= (a-c);
1691     block[stride*1 + xStride*1]= (e-b);
1692 }
1693 #endif
1694
1695 /**
1696  * gets the chroma qp.
1697  */
1698 static inline int get_chroma_qp(int chroma_qp_index_offset, int qscale){
1699
1700     return chroma_qp[clip(qscale + chroma_qp_index_offset, 0, 51)];
1701 }
1702
1703
1704 #if 0
1705 static void h264_diff_dct_c(DCTELEM *block, uint8_t *src1, uint8_t *src2, int stride){
1706     int i;
1707     //FIXME try int temp instead of block
1708
1709     for(i=0; i<4; i++){
1710         const int d0= src1[0 + i*stride] - src2[0 + i*stride];
1711         const int d1= src1[1 + i*stride] - src2[1 + i*stride];
1712         const int d2= src1[2 + i*stride] - src2[2 + i*stride];
1713         const int d3= src1[3 + i*stride] - src2[3 + i*stride];
1714         const int z0= d0 + d3;
1715         const int z3= d0 - d3;
1716         const int z1= d1 + d2;
1717         const int z2= d1 - d2;
1718
1719         block[0 + 4*i]=   z0 +   z1;
1720         block[1 + 4*i]= 2*z3 +   z2;
1721         block[2 + 4*i]=   z0 -   z1;
1722         block[3 + 4*i]=   z3 - 2*z2;
1723     }
1724
1725     for(i=0; i<4; i++){
1726         const int z0= block[0*4 + i] + block[3*4 + i];
1727         const int z3= block[0*4 + i] - block[3*4 + i];
1728         const int z1= block[1*4 + i] + block[2*4 + i];
1729         const int z2= block[1*4 + i] - block[2*4 + i];
1730
1731         block[0*4 + i]=   z0 +   z1;
1732         block[1*4 + i]= 2*z3 +   z2;
1733         block[2*4 + i]=   z0 -   z1;
1734         block[3*4 + i]=   z3 - 2*z2;
1735     }
1736 }
1737 #endif
1738
1739 //FIXME need to check that this doesnt overflow signed 32 bit for low qp, i am not sure, it's very close
1740 //FIXME check that gcc inlines this (and optimizes intra & seperate_dc stuff away)
1741 static inline int quantize_c(DCTELEM *block, uint8_t *scantable, int qscale, int intra, int seperate_dc){
1742     int i;
1743     const int * const quant_table= quant_coeff[qscale];
1744     const int bias= intra ? (1<<QUANT_SHIFT)/3 : (1<<QUANT_SHIFT)/6;
1745     const unsigned int threshold1= (1<<QUANT_SHIFT) - bias - 1;
1746     const unsigned int threshold2= (threshold1<<1);
1747     int last_non_zero;
1748
1749     if(seperate_dc){
1750         if(qscale<=18){
1751             //avoid overflows
1752             const int dc_bias= intra ? (1<<(QUANT_SHIFT-2))/3 : (1<<(QUANT_SHIFT-2))/6;
1753             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT-2)) - dc_bias - 1;
1754             const unsigned int dc_threshold2= (dc_threshold1<<1);
1755
1756             int level= block[0]*quant_coeff[qscale+18][0];
1757             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1758                 if(level>0){
1759                     level= (dc_bias + level)>>(QUANT_SHIFT-2);
1760                     block[0]= level;
1761                 }else{
1762                     level= (dc_bias - level)>>(QUANT_SHIFT-2);
1763                     block[0]= -level;
1764                 }
1765 //                last_non_zero = i;
1766             }else{
1767                 block[0]=0;
1768             }
1769         }else{
1770             const int dc_bias= intra ? (1<<(QUANT_SHIFT+1))/3 : (1<<(QUANT_SHIFT+1))/6;
1771             const unsigned int dc_threshold1= (1<<(QUANT_SHIFT+1)) - dc_bias - 1;
1772             const unsigned int dc_threshold2= (dc_threshold1<<1);
1773
1774             int level= block[0]*quant_table[0];
1775             if(((unsigned)(level+dc_threshold1))>dc_threshold2){
1776                 if(level>0){
1777                     level= (dc_bias + level)>>(QUANT_SHIFT+1);
1778                     block[0]= level;
1779                 }else{
1780                     level= (dc_bias - level)>>(QUANT_SHIFT+1);
1781                     block[0]= -level;
1782                 }
1783 //                last_non_zero = i;
1784             }else{
1785                 block[0]=0;
1786             }
1787         }
1788         last_non_zero= 0;
1789         i=1;
1790     }else{
1791         last_non_zero= -1;
1792         i=0;
1793     }
1794
1795     for(; i<16; i++){
1796         const int j= scantable[i];
1797         int level= block[j]*quant_table[j];
1798
1799 //        if(   bias+level >= (1<<(QMAT_SHIFT - 3))
1800 //           || bias-level >= (1<<(QMAT_SHIFT - 3))){
1801         if(((unsigned)(level+threshold1))>threshold2){
1802             if(level>0){
1803                 level= (bias + level)>>QUANT_SHIFT;
1804                 block[j]= level;
1805             }else{
1806                 level= (bias - level)>>QUANT_SHIFT;
1807                 block[j]= -level;
1808             }
1809             last_non_zero = i;
1810         }else{
1811             block[j]=0;
1812         }
1813     }
1814
1815     return last_non_zero;
1816 }
1817
/**
 * 4x4 vertical intra prediction: the 4 pixels above the block are copied
 * into each of its 4 rows.
 * @param src top-left pixel of the block; rows are accessed as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_vertical_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t above= *(uint32_t*)(src - stride);
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= above;
}
1825
/**
 * 4x4 horizontal intra prediction: every row is filled with the pixel
 * directly left of it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_horizontal_c(uint8_t *src, uint8_t *topright, int stride){
    int y;

    for(y=0; y<4; y++){
        uint8_t * const row= src + y*stride;

        //unsigned constant: pixel*0x01010101 overflows a signed int for pixels >= 128
        ((uint32_t*)row)[0]= row[-1]*0x01010101U;
    }
}
1832
/**
 * 4x4 dc intra prediction: the block is filled with the rounded mean of the
 * 4 pixels above and the 4 pixels left of it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride]
                   + src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 4) >>3;
    //unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= fill;
}
1842
/**
 * 4x4 dc intra prediction from the left edge only: the block is filled with
 * the rounded mean of the 4 pixels left of it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_left_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-1+0*stride] + src[-1+1*stride] + src[-1+2*stride] + src[-1+3*stride] + 2) >>2;
    //unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= fill;
}
1851
/**
 * 4x4 dc intra prediction from the top edge only: the block is filled with
 * the rounded mean of the 4 pixels above it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_top_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const int dc= (  src[-stride] + src[1-stride] + src[2-stride] + src[3-stride] + 2) >>2;
    //unsigned constant: dc*0x01010101 overflows a signed int for dc >= 128
    const uint32_t fill= dc*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        ((uint32_t*)(src+y*stride))[0]= fill;
}
1860
/**
 * 4x4 dc intra prediction without any neighbours: the block is filled with
 * the constant 128.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_128_dc_c(uint8_t *src, uint8_t *topright, int stride){
    const uint32_t fill= 128U*0x01010101U;
    int y;

    for(y=0; y<4; y++)
        *(uint32_t*)(src + y*stride)= fill;
}
1867
1868
/* Edge loaders for the 4x4 predictors. Each macro declares four const int
 * locals and expects src, stride and (for the top right one) topright to be
 * in scope at the point of use. */

/* t4..t7: the 4 pixels at topright[0..3] */
#define LOAD_TOP_RIGHT_EDGE\
    const int t4= topright[0];\
    const int t5= topright[1];\
    const int t6= topright[2];\
    const int t7= topright[3];

/* l0..l3: the pixel left of each of the 4 rows */
#define LOAD_LEFT_EDGE\
    const int l0= src[-1+0*stride];\
    const int l1= src[-1+1*stride];\
    const int l2= src[-1+2*stride];\
    const int l3= src[-1+3*stride];

/* t0..t3: the 4 pixels in the row above the block */
#define LOAD_TOP_EDGE\
    const int t0= src[ 0-1*stride];\
    const int t1= src[ 1-1*stride];\
    const int t2= src[ 2-1*stride];\
    const int t3= src[ 3-1*stride];
1886
/**
 * 4x4 diagonal down-right intra prediction.
 * The 9 neighbours l3..l0, top-left, t0..t3 are smoothed with a (1,2,1)/4
 * filter; all pixels with the same x-y get the same filtered value.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_down_right_c(uint8_t *src, uint8_t *topright, int stride){
    int edge[9];
    int filtered[7];
    int k, x, y;

    //neighbours in order from bottom left over the corner to top right
    for(k=0; k<4; k++)
        edge[k]= src[-1 + (3-k)*stride];
    edge[4]= src[-1 - stride];
    for(k=0; k<4; k++)
        edge[5+k]= src[k - stride];

    for(k=0; k<7; k++)
        filtered[k]= (edge[k] + 2*edge[k+1] + edge[k+2] + 2)>>2;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= filtered[3 + x - y];
}
1909
/**
 * 4x4 diagonal down-left intra prediction.
 * The 8 pixels above and above-right of the block are smoothed with a
 * (1,2,1)/4 filter (the last pixel counts twice at the end); all pixels with
 * the same x+y get the same filtered value.
 * @param src top-left pixel of the block
 * @param topright the 4 pixels above and right of the block
 * @param stride distance in bytes between two rows
 */
static void pred4x4_down_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[8];
    int filtered[7];
    int k, x, y;

    for(k=0; k<4; k++){
        top[k  ]= src[k - stride];
        top[k+4]= topright[k];
    }

    for(k=0; k<6; k++)
        filtered[k]= (top[k] + top[k+2] + 2*top[k+1] + 2)>>2;
    filtered[6]= (top[6] + 3*top[7] + 2)>>2;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= filtered[x + y];
}
1932
/**
 * 4x4 vertical-right intra prediction.
 * Row 0 holds 2-tap averages and row 1 holds (1,2,1)/4 filtered values of
 * the top-left and top pixels; rows 2 and 3 repeat rows 0 and 1 shifted one
 * pixel to the right, with a value filtered from the left edge in column 0.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_vertical_right_c(uint8_t *src, uint8_t *topright, int stride){
    const uint8_t * const top= src - stride;
    const int lt= top[-1];
    const int t0= top[0], t1= top[1], t2= top[2], t3= top[3];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    uint8_t * const row0= src;
    uint8_t * const row1= src + 1*stride;
    uint8_t * const row2= src + 2*stride;
    uint8_t * const row3= src + 3*stride;
    int x;

    row0[0]= (lt + t0 + 1)>>1;
    row0[1]= (t0 + t1 + 1)>>1;
    row0[2]= (t1 + t2 + 1)>>1;
    row0[3]= (t2 + t3 + 1)>>1;

    row1[0]= (l0 + 2*lt + t0 + 2)>>2;
    row1[1]= (lt + 2*t0 + t1 + 2)>>2;
    row1[2]= (t0 + 2*t1 + t2 + 2)>>2;
    row1[3]= (t1 + 2*t2 + t3 + 2)>>2;

    row2[0]= (lt + 2*l0 + l1 + 2)>>2;
    row3[0]= (l0 + 2*l1 + l2 + 2)>>2;
    for(x=1; x<4; x++){
        row2[x]= row0[x-1];
        row3[x]= row1[x-1];
    }
}
1956
/**
 * 4x4 vertical-left intra prediction.
 * Even rows hold 2-tap averages and odd rows hold (1,2,1)/4 filtered values
 * of the top and top-right pixels; rows 2 and 3 start one pixel further
 * right than rows 0 and 1.
 * @param src top-left pixel of the block
 * @param topright the pixels above and right of the block, the first 3 are read
 * @param stride distance in bytes between two rows
 */
static void pred4x4_vertical_left_c(uint8_t *src, uint8_t *topright, int stride){
    int top[7];
    int avg2[5];
    int avg3[5];
    int k, x;

    for(k=0; k<4; k++)
        top[k]= src[k - stride];
    for(k=0; k<3; k++)
        top[4+k]= topright[k];

    for(k=0; k<5; k++){
        avg2[k]= (top[k] + top[k+1] + 1)>>1;
        avg3[k]= (top[k] + 2*top[k+1] + top[k+2] + 2)>>2;
    }

    for(x=0; x<4; x++){
        src[x + 0*stride]= avg2[x];
        src[x + 1*stride]= avg3[x];
        src[x + 2*stride]= avg2[x+1];
        src[x + 3*stride]= avg3[x+1];
    }
}
1979
/**
 * 4x4 horizontal-up intra prediction.
 * 2-tap averages and (1,2,1)/4 filtered values of the left edge alternate
 * along a sequence which ends in the bottom left pixel l3; the pixel at
 * (x,y) takes entry x+2*y of that sequence.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_horizontal_up_c(uint8_t *src, uint8_t *topright, int stride){
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int seq[10];
    int x, y;

    seq[0]= (l0 + l1 + 1)>>1;
    seq[1]= (l0 + 2*l1 + l2 + 2)>>2;
    seq[2]= (l1 + l2 + 1)>>1;
    seq[3]= (l1 + 2*l2 + l3 + 2)>>2;
    seq[4]= (l2 + l3 + 1)>>1;
    seq[5]= (l2 + 2*l3 + l3 + 2)>>2;
    seq[6]= seq[7]= seq[8]= seq[9]= l3;

    for(y=0; y<4; y++)
        for(x=0; x<4; x++)
            src[x + y*stride]= seq[x + 2*y];
}
2000
/**
 * 4x4 horizontal-down intra prediction.
 * Columns 0 and 1 of each row are a 2-tap average and a (1,2,1)/4 filtered
 * value taken from the left edge; columns 2 and 3 repeat columns 0 and 1 of
 * the row above. The top row takes its last two pixels from the top edge.
 * @param src top-left pixel of the block
 * @param topright unused
 * @param stride distance in bytes between two rows
 */
static void pred4x4_horizontal_down_c(uint8_t *src, uint8_t *topright, int stride){
    const uint8_t * const top= src - stride;
    const int lt= top[-1];
    const int t0= top[0], t1= top[1], t2= top[2];
    const int l0= src[-1+0*stride];
    const int l1= src[-1+1*stride];
    const int l2= src[-1+2*stride];
    const int l3= src[-1+3*stride];
    int y;

    src[0+0*stride]= (lt + l0 + 1)>>1;
    src[1+0*stride]= (l0 + 2*lt + t0 + 2)>>2;
    src[2+0*stride]= (lt + 2*t0 + t1 + 2)>>2;
    src[3+0*stride]= (t0 + 2*t1 + t2 + 2)>>2;

    src[0+1*stride]= (l0 + l1 + 1)>>1;
    src[1+1*stride]= (lt + 2*l0 + l1 + 2)>>2;

    src[0+2*stride]= (l1 + l2 + 1)>>1;
    src[1+2*stride]= (l0 + 2*l1 + l2 + 2)>>2;

    src[0+3*stride]= (l2 + l3 + 1)>>1;
    src[1+3*stride]= (l1 + 2*l2 + l3 + 2)>>2;

    for(y=1; y<4; y++){
        src[2+y*stride]= src[0+(y-1)*stride];
        src[3+y*stride]= src[1+(y-1)*stride];
    }
}
2024
/**
 * 16x16 vertical intra prediction: the 16 pixels above the block are copied
 * into each of its 16 rows.
 * @param src top-left pixel of the block; rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (uint32_t*)(src - stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    const uint32_t w2= above[2];
    const uint32_t w3= above[3];
    int y;

    for(y=0; y<16; y++){
        uint32_t * const row= (uint32_t*)(src + y*stride);

        row[0]= w0;
        row[1]= w1;
        row[2]= w2;
        row[3]= w3;
    }
}
2039
/**
 * 16x16 horizontal intra prediction: every row is filled with the pixel
 * directly left of it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<16; i++){
        //unsigned constant: pixel*0x01010101 overflows a signed int for pixels >= 128
        const uint32_t fill= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= fill;
    }
}
2050
/**
 * 16x16 dc intra prediction: the block is filled with the rounded mean of
 * the 16 pixels left of it and the 16 pixels above it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    //unsigned constant: the splat overflows a signed int for means >= 128
    dc= 0x01010101U*((sum + 16)>>5);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2071
/**
 * 16x16 dc intra prediction from the left edge only: the block is filled
 * with the rounded mean of the 16 pixels left of it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_left_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[-1+i*stride];
    }

    //unsigned constant: the splat overflows a signed int for means >= 128
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2088
/**
 * 16x16 dc intra prediction from the top edge only: the block is filled
 * with the rounded mean of the 16 pixels above it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_top_dc_c(uint8_t *src, int stride){
    int i, sum=0;
    uint32_t dc;

    for(i=0;i<16; i++){
        sum+= src[i-stride];
    }

    //unsigned constant: the splat overflows a signed int for means >= 128
    dc= 0x01010101U*((sum + 8)>>4);

    for(i=0; i<16; i++){
        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]=
        ((uint32_t*)(src+i*stride))[2]=
        ((uint32_t*)(src+i*stride))[3]= dc;
    }
}
2104
/**
 * 16x16 dc intra prediction without any neighbours: the block is filled
 * with the constant 128.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred16x16_128_dc_c(uint8_t *src, int stride){
    const uint32_t fill= 0x01010101U*128U;
    int y, w;

    for(y=0; y<16; y++){
        uint32_t * const row= (uint32_t*)(src + y*stride);

        for(w=0; w<4; w++)
            row[w]= fill;
    }
}
2115
2116 static inline void pred16x16_plane_compat_c(uint8_t *src, int stride, const int svq3){
2117   int i, j, k;
2118   int a;
2119   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2120   const uint8_t * const src0 = src+7-stride;
2121   const uint8_t *src1 = src+8*stride-1;
2122   const uint8_t *src2 = src1-2*stride;      // == src+6*stride-1;
2123   int H = src0[1] - src0[-1];
2124   int V = src1[0] - src2[ 0];
2125   for(k=2; k<=8; ++k) {
2126     src1 += stride; src2 -= stride;
2127     H += k*(src0[k] - src0[-k]);
2128     V += k*(src1[0] - src2[ 0]);
2129   }
2130   if(svq3){
2131     H = ( 5*(H/4) ) / 16;
2132     V = ( 5*(V/4) ) / 16;
2133
2134     /* required for 100% accuracy */
2135     i = H; H = V; V = i;
2136   }else{
2137     H = ( 5*H+32 ) >> 6;
2138     V = ( 5*V+32 ) >> 6;
2139   }
2140
2141   a = 16*(src1[0] + src2[16] + 1) - 7*(V+H);
2142   for(j=16; j>0; --j) {
2143     int b = a;
2144     a += V;
2145     for(i=-16; i<0; i+=4) {
2146       src[16+i] = cm[ (b    ) >> 5 ];
2147       src[17+i] = cm[ (b+  H) >> 5 ];
2148       src[18+i] = cm[ (b+2*H) >> 5 ];
2149       src[19+i] = cm[ (b+3*H) >> 5 ];
2150       b += 4*H;
2151     }
2152     src += stride;
2153   }
2154 }
2155
/**
 * 16x16 plane intra prediction, forwards to pred16x16_plane_compat_c() with
 * the svq3 flag cleared.
 * @param src top-left pixel of the block
 * @param stride distance in bytes between two rows
 */
static void pred16x16_plane_c(uint8_t *src, int stride){
    const int svq3= 0;

    pred16x16_plane_compat_c(src, stride, svq3);
}
2159
/**
 * 8x8 vertical intra prediction: the 8 pixels above the block are copied
 * into each of its 8 rows.
 * @param src top-left pixel of the block; rows are accessed as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_vertical_c(uint8_t *src, int stride){
    const uint32_t * const above= (uint32_t*)(src - stride);
    const uint32_t w0= above[0];
    const uint32_t w1= above[1];
    int y;

    for(y=0; y<8; y++){
        uint32_t * const row= (uint32_t*)(src + y*stride);

        row[0]= w0;
        row[1]= w1;
    }
}
2170
/**
 * 8x8 horizontal intra prediction: every row is filled with the pixel
 * directly left of it.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_horizontal_c(uint8_t *src, int stride){
    int i;

    for(i=0; i<8; i++){
        //unsigned constant: pixel*0x01010101 overflows a signed int for pixels >= 128
        const uint32_t fill= src[-1+i*stride]*0x01010101U;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= fill;
    }
}
2179
/**
 * 8x8 dc intra prediction without any neighbours: the block is filled with
 * the constant 128.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_128_dc_c(uint8_t *src, int stride){
    const uint32_t fill= 0x01010101U*128U;
    int y;

    for(y=0; y<8; y++){
        uint32_t * const row= (uint32_t*)(src + y*stride);

        row[0]= fill;
        row[1]= fill;
    }
}
2192
/**
 * 8x8 dc intra prediction from the left edge only: rows 0-3 are filled with
 * the rounded mean of the 4 pixels left of them, rows 4-7 with the rounded
 * mean of the 4 pixels left of them.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_left_dc_c(uint8_t *src, int stride){
    int i;
    int sum_upper= 0, sum_lower= 0;
    uint32_t dc0, dc2;

    for(i=0;i<4; i++){
        sum_upper+= src[-1+i*stride];
        sum_lower+= src[-1+(i+4)*stride];
    }
    //unsigned constant: the splat overflows a signed int for means >= 128
    dc0= 0x01010101U*((sum_upper + 2)>>2);
    dc2= 0x01010101U*((sum_lower + 2)>>2);

    for(i=0; i<8; i++){
        const uint32_t fill= i<4 ? dc0 : dc2;

        ((uint32_t*)(src+i*stride))[0]=
        ((uint32_t*)(src+i*stride))[1]= fill;
    }
}
2214
/**
 * 8x8 dc intra prediction from the top edge only: columns 0-3 are filled
 * with the rounded mean of the 4 pixels above them, columns 4-7 with the
 * rounded mean of the 4 pixels above them.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_top_dc_c(uint8_t *src, int stride){
    int i;
    int sum_left= 0, sum_right= 0;
    uint32_t dc0, dc1;

    for(i=0;i<4; i++){
        sum_left += src[i-stride];
        sum_right+= src[4+i-stride];
    }
    //unsigned constant: the splat overflows a signed int for means >= 128
    dc0= 0x01010101U*((sum_left  + 2)>>2);
    dc1= 0x01010101U*((sum_right + 2)>>2);

    for(i=0; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
}
2236
2237
/**
 * 8x8 dc intra prediction, done separately for the four 4x4 quadrants:
 * top-left uses the 4 left and 4 top pixels next to it, top-right the 4 top
 * pixels above it, bottom-left the 4 left pixels next to it and bottom-right
 * the 4 top pixels above the right half plus the 4 left pixels next to the
 * lower half.
 * @param src top-left pixel of the block; rows are written as uint32_t
 * @param stride distance in bytes between two rows
 */
static void pred8x8_dc_c(uint8_t *src, int stride){
    int i;
    int sum0= 0, sum1= 0, sum2= 0;
    uint32_t dc0, dc1, dc2, dc3;

    for(i=0;i<4; i++){
        sum0+= src[-1+i*stride] + src[i-stride];
        sum1+= src[4+i-stride];
        sum2+= src[-1+(i+4)*stride];
    }
    //unsigned constant: the splat overflows a signed int for means >= 128
    dc3= 0x01010101U*((sum1 + sum2 + 4)>>3);
    dc0= 0x01010101U*((sum0 + 4)>>3);
    dc1= 0x01010101U*((sum1 + 2)>>2);
    dc2= 0x01010101U*((sum2 + 2)>>2);

    for(i=0; i<4; i++){
        ((uint32_t*)(src+i*stride))[0]= dc0;
        ((uint32_t*)(src+i*stride))[1]= dc1;
    }
    for(i=4; i<8; i++){
        ((uint32_t*)(src+i*stride))[0]= dc2;
        ((uint32_t*)(src+i*stride))[1]= dc3;
    }
}
2262
2263 static void pred8x8_plane_c(uint8_t *src, int stride){
2264   int j, k;
2265   int a;
2266   uint8_t *cm = cropTbl + MAX_NEG_CROP;
2267   const uint8_t * const src0 = src+3-stride;
2268   const uint8_t *src1 = src+4*stride-1;
2269   const uint8_t *src2 = src1-2*stride;      // == src+2*stride-1;
2270   int H = src0[1] - src0[-1];
2271   int V = src1[0] - src2[ 0];
2272   for(k=2; k<=4; ++k) {
2273     src1 += stride; src2 -= stride;
2274     H += k*(src0[k] - src0[-k]);
2275     V += k*(src1[0] - src2[ 0]);
2276   }
2277   H = ( 17*H+16 ) >> 5;
2278   V = ( 17*V+16 ) >> 5;
2279
2280   a = 16*(src1[0] + src2[8]+1) - 3*(V+H);
2281   for(j=8; j>0; --j) {
2282     int b = a;
2283     a += V;
2284     src[0] = cm[ (b    ) >> 5 ];
2285     src[1] = cm[ (b+  H) >> 5 ];
2286     src[2] = cm[ (b+2*H) >> 5 ];
2287     src[3] = cm[ (b+3*H) >> 5 ];
2288     src[4] = cm[ (b+4*H) >> 5 ];
2289     src[5] = cm[ (b+5*H) >> 5 ];
2290     src[6] = cm[ (b+6*H) >> 5 ];
2291     src[7] = cm[ (b+7*H) >> 5 ];
2292     src += stride;
2293   }
2294 }
2295
2296 static inline void mc_dir_part(H264Context *h, Picture *pic, int n, int square, int chroma_height, int delta, int list,
2297                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2298                            int src_x_offset, int src_y_offset,
2299                            qpel_mc_func *qpix_op, h264_chroma_mc_func chroma_op){
2300     MpegEncContext * const s = &h->s;
2301     const int mx= h->mv_cache[list][ scan8[n] ][0] + src_x_offset*8;
2302     const int my= h->mv_cache[list][ scan8[n] ][1] + src_y_offset*8;
2303     const int luma_xy= (mx&3) + ((my&3)<<2);
2304     uint8_t * src_y = pic->data[0] + (mx>>2) + (my>>2)*s->linesize;
2305     uint8_t * src_cb= pic->data[1] + (mx>>3) + (my>>3)*s->uvlinesize;
2306     uint8_t * src_cr= pic->data[2] + (mx>>3) + (my>>3)*s->uvlinesize;
2307     int extra_width= (s->flags&CODEC_FLAG_EMU_EDGE) ? 0 : 16; //FIXME increase edge?, IMHO not worth it
2308     int extra_height= extra_width;
2309     int emu=0;
2310     const int full_mx= mx>>2;
2311     const int full_my= my>>2;
2312
2313     assert(pic->data[0]);
2314
2315     if(mx&7) extra_width -= 3;
2316     if(my&7) extra_height -= 3;
2317
2318     if(   full_mx < 0-extra_width
2319        || full_my < 0-extra_height
2320        || full_mx + 16/*FIXME*/ > s->width + extra_width
2321        || full_my + 16/*FIXME*/ > s->height + extra_height){
2322         ff_emulated_edge_mc(s->edge_emu_buffer, src_y - 2 - 2*s->linesize, s->linesize, 16+5, 16+5/*FIXME*/, full_mx-2, full_my-2, s->width, s->height);
2323             src_y= s->edge_emu_buffer + 2 + 2*s->linesize;
2324         emu=1;
2325     }
2326
2327     qpix_op[luma_xy](dest_y, src_y, s->linesize); //FIXME try variable height perhaps?
2328     if(!square){
2329         qpix_op[luma_xy](dest_y + delta, src_y + delta, s->linesize);
2330     }
2331
2332     if(s->flags&CODEC_FLAG_GRAY) return;
2333
2334     if(emu){
2335         ff_emulated_edge_mc(s->edge_emu_buffer, src_cb, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
2336             src_cb= s->edge_emu_buffer;
2337     }
2338     chroma_op(dest_cb, src_cb, s->uvlinesize, chroma_height, mx&7, my&7);
2339
2340     if(emu){
2341         ff_emulated_edge_mc(s->edge_emu_buffer, src_cr, s->uvlinesize, 9, 9/*FIXME*/, (mx>>3), (my>>3), s->width>>1, s->height>>1);
2342             src_cr= s->edge_emu_buffer;
2343     }
2344     chroma_op(dest_cr, src_cr, s->uvlinesize, chroma_height, mx&7, my&7);
2345 }
2346
2347 static inline void mc_part_std(H264Context *h, int n, int square, int chroma_height, int delta,
2348                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2349                            int x_offset, int y_offset,
2350                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2351                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2352                            int list0, int list1){
2353     MpegEncContext * const s = &h->s;
2354     qpel_mc_func *qpix_op=  qpix_put;
2355     h264_chroma_mc_func chroma_op= chroma_put;
2356
2357     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2358     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2359     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2360     x_offset += 8*s->mb_x;
2361     y_offset += 8*s->mb_y;
2362
2363     if(list0){
2364         Picture *ref= &h->ref_list[0][ h->ref_cache[0][ scan8[n] ] ];
2365         mc_dir_part(h, ref, n, square, chroma_height, delta, 0,
2366                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2367                            qpix_op, chroma_op);
2368
2369         qpix_op=  qpix_avg;
2370         chroma_op= chroma_avg;
2371     }
2372
2373     if(list1){
2374         Picture *ref= &h->ref_list[1][ h->ref_cache[1][ scan8[n] ] ];
2375         mc_dir_part(h, ref, n, square, chroma_height, delta, 1,
2376                            dest_y, dest_cb, dest_cr, x_offset, y_offset,
2377                            qpix_op, chroma_op);
2378     }
2379 }
2380
2381 static inline void mc_part_weighted(H264Context *h, int n, int square, int chroma_height, int delta,
2382                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2383                            int x_offset, int y_offset,
2384                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2385                            h264_weight_func luma_weight_op, h264_weight_func chroma_weight_op,
2386                            h264_biweight_func luma_weight_avg, h264_biweight_func chroma_weight_avg,
2387                            int list0, int list1){
2388     MpegEncContext * const s = &h->s;
2389
2390     dest_y  += 2*x_offset + 2*y_offset*s->  linesize;
2391     dest_cb +=   x_offset +   y_offset*s->uvlinesize;
2392     dest_cr +=   x_offset +   y_offset*s->uvlinesize;
2393     x_offset += 8*s->mb_x;
2394     y_offset += 8*s->mb_y;
2395
2396     if(list0 && list1){
2397         /* don't optimize for luma-only case, since B-frames usually
2398          * use implicit weights => chroma too. */
2399         uint8_t *tmp_cb = s->obmc_scratchpad;
2400         uint8_t *tmp_cr = tmp_cb + 8*s->uvlinesize;
2401         uint8_t *tmp_y  = tmp_cr + 8*s->uvlinesize;
2402         int refn0 = h->ref_cache[0][ scan8[n] ];
2403         int refn1 = h->ref_cache[1][ scan8[n] ];
2404
2405         mc_dir_part(h, &h->ref_list[0][refn0], n, square, chroma_height, delta, 0,
2406                     dest_y, dest_cb, dest_cr,
2407                     x_offset, y_offset, qpix_put, chroma_put);
2408         mc_dir_part(h, &h->ref_list[1][refn1], n, square, chroma_height, delta, 1,
2409                     tmp_y, tmp_cb, tmp_cr,
2410                     x_offset, y_offset, qpix_put, chroma_put);
2411
2412         if(h->use_weight == 2){
2413             int weight0 = h->implicit_weight[refn0][refn1];
2414             int weight1 = 64 - weight0;
2415             luma_weight_avg(  dest_y,  tmp_y,  s->  linesize, 5, weight0, weight1, 0, 0);
2416             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, 5, weight0, weight1, 0, 0);
2417             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, 5, weight0, weight1, 0, 0);
2418         }else{
2419             luma_weight_avg(dest_y, tmp_y, s->linesize, h->luma_log2_weight_denom,
2420                             h->luma_weight[0][refn0], h->luma_weight[1][refn1],
2421                             h->luma_offset[0][refn0], h->luma_offset[1][refn1]);
2422             chroma_weight_avg(dest_cb, tmp_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2423                             h->chroma_weight[0][refn0][0], h->chroma_weight[1][refn1][0],
2424                             h->chroma_offset[0][refn0][0], h->chroma_offset[1][refn1][0]);
2425             chroma_weight_avg(dest_cr, tmp_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2426                             h->chroma_weight[0][refn0][1], h->chroma_weight[1][refn1][1],
2427                             h->chroma_offset[0][refn0][1], h->chroma_offset[1][refn1][1]);
2428         }
2429     }else{
2430         int list = list1 ? 1 : 0;
2431         int refn = h->ref_cache[list][ scan8[n] ];
2432         Picture *ref= &h->ref_list[list][refn];
2433         mc_dir_part(h, ref, n, square, chroma_height, delta, list,
2434                     dest_y, dest_cb, dest_cr, x_offset, y_offset,
2435                     qpix_put, chroma_put);
2436
2437         luma_weight_op(dest_y, s->linesize, h->luma_log2_weight_denom,
2438                        h->luma_weight[list][refn], h->luma_offset[list][refn]);
2439         if(h->use_weight_chroma){
2440             chroma_weight_op(dest_cb, s->uvlinesize, h->chroma_log2_weight_denom,
2441                              h->chroma_weight[list][refn][0], h->chroma_offset[list][refn][0]);
2442             chroma_weight_op(dest_cr, s->uvlinesize, h->chroma_log2_weight_denom,
2443                              h->chroma_weight[list][refn][1], h->chroma_offset[list][refn][1]);
2444         }
2445     }
2446 }
2447
2448 static inline void mc_part(H264Context *h, int n, int square, int chroma_height, int delta,
2449                            uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
2450                            int x_offset, int y_offset,
2451                            qpel_mc_func *qpix_put, h264_chroma_mc_func chroma_put,
2452                            qpel_mc_func *qpix_avg, h264_chroma_mc_func chroma_avg,
2453                            h264_weight_func *weight_op, h264_biweight_func *weight_avg,
2454                            int list0, int list1){
2455     if((h->use_weight==2 && list0 && list1
2456         && (h->implicit_weight[ h->ref_cache[0][scan8[n]] ][ h->ref_cache[1][scan8[n]] ] != 32))
2457        || h->use_weight==1)
2458         mc_part_weighted(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2459                          x_offset, y_offset, qpix_put, chroma_put,
2460                          weight_op[0], weight_op[3], weight_avg[0], weight_avg[3], list0, list1);
2461     else
2462         mc_part_std(h, n, square, chroma_height, delta, dest_y, dest_cb, dest_cr,
2463                     x_offset, y_offset, qpix_put, chroma_put, qpix_avg, chroma_avg, list0, list1);
2464 }
2465
/**
 * Performs inter prediction for the current macroblock.
 *
 * The macroblock type stored for (s->mb_x, s->mb_y) in the current picture
 * selects the partitioning; mc_part() is then called once per partition (or
 * sub-partition) with the function tables matching its size.
 *
 * @param dest_y     luma destination pointer handed on to mc_part()
 * @param dest_cb    Cb destination pointer handed on to mc_part()
 * @param dest_cr    Cr destination pointer handed on to mc_part()
 * @param qpix_put   luma "put" tables: [0] is used for 16x16, [1] for
 *                   16x8, 8x16 and 8x8, [2] for the sub-8x8 partitions
 * @param chroma_put chroma "put" functions, indexed 0..2 per partition shape
 * @param qpix_avg   luma "avg" tables, indexed like qpix_put
 * @param chroma_avg chroma "avg" functions, indexed like chroma_put
 * @param weight_op  weight functions; &weight_op[k] is passed with k = 0..6
 *                   depending on the partition shape
 * @param weight_avg bi-weight functions, indexed like weight_op
 */
static void hl_motion(H264Context *h, uint8_t *dest_y, uint8_t *dest_cb, uint8_t *dest_cr,
                      qpel_mc_func (*qpix_put)[16], h264_chroma_mc_func (*chroma_put),
                      qpel_mc_func (*qpix_avg)[16], h264_chroma_mc_func (*chroma_avg),
                      h264_weight_func *weight_op, h264_biweight_func *weight_avg){
    MpegEncContext * const s = &h->s;
    const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
    const int mb_type= s->current_picture.mb_type[mb_xy];

    assert(IS_INTER(mb_type));

    if(IS_16X16(mb_type)){
        /* a single square partition, block index 0 */
        mc_part(h, 0, 1, 8, 0, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[0], chroma_put[0], qpix_avg[0], chroma_avg[0],
                &weight_op[0], &weight_avg[0],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
    }else if(IS_16X8(mb_type)){
        /* two partitions: block 0 at y_offset 0 and block 8 at y_offset 4;
         * delta 8 with square == 0 */
        mc_part(h, 0, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 8, 0, 4, 8, dest_y, dest_cb, dest_cr, 0, 4,
                qpix_put[1], chroma_put[0], qpix_avg[1], chroma_avg[0],
                &weight_op[1], &weight_avg[1],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else if(IS_8X16(mb_type)){
        /* two partitions: block 0 at x_offset 0 and block 4 at x_offset 4;
         * delta is 8 luma lines */
        mc_part(h, 0, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 0, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 0, 0), IS_DIR(mb_type, 0, 1));
        mc_part(h, 4, 0, 8, 8*s->linesize, dest_y, dest_cb, dest_cr, 4, 0,
                qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                &weight_op[2], &weight_avg[2],
                IS_DIR(mb_type, 1, 0), IS_DIR(mb_type, 1, 1));
    }else{
        int i;

        assert(IS_8X8(mb_type));

        /* four 8x8 quadrants, each with its own sub-macroblock type */
        for(i=0; i<4; i++){
            const int sub_mb_type= h->sub_mb_type[i];
            const int n= 4*i;
            /* quadrant origin: 0 or 4 in each direction */
            int x_offset= (i&1)<<2;
            int y_offset= (i&2)<<1;

            if(IS_SUB_8X8(sub_mb_type)){
                mc_part(h, n, 1, 4, 0, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[1], chroma_put[1], qpix_avg[1], chroma_avg[1],
                    &weight_op[3], &weight_avg[3],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_8X4(sub_mb_type)){
                /* blocks n and n+2, the second one 2 offset units lower */
                mc_part(h, n  , 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+2, 0, 2, 4, dest_y, dest_cb, dest_cr, x_offset, y_offset+2,
                    qpix_put[2], chroma_put[1], qpix_avg[2], chroma_avg[1],
                    &weight_op[4], &weight_avg[4],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else if(IS_SUB_4X8(sub_mb_type)){
                /* blocks n and n+1, the second one 2 offset units to the right */
                mc_part(h, n  , 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                mc_part(h, n+1, 0, 4, 4*s->linesize, dest_y, dest_cb, dest_cr, x_offset+2, y_offset,
                    qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                    &weight_op[5], &weight_avg[5],
                    IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
            }else{
                int j;
                assert(IS_SUB_4X4(sub_mb_type));
                /* four square sub-blocks n..n+3 inside the quadrant */
                for(j=0; j<4; j++){
                    int sub_x_offset= x_offset + 2*(j&1);
                    int sub_y_offset= y_offset +   (j&2);
                    mc_part(h, n+j, 1, 2, 0, dest_y, dest_cb, dest_cr, sub_x_offset, sub_y_offset,
                        qpix_put[2], chroma_put[2], qpix_avg[2], chroma_avg[2],
                        &weight_op[6], &weight_avg[6],
                        IS_DIR(sub_mb_type, 0, 0), IS_DIR(sub_mb_type, 0, 1));
                }
            }
        }
    }
}
2548
2549 static void decode_init_vlc(H264Context *h){
2550     static int done = 0;
2551
2552     if (!done) {
2553         int i;
2554         done = 1;
2555
2556         init_vlc(&chroma_dc_coeff_token_vlc, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 4*5,
2557                  &chroma_dc_coeff_token_len [0], 1, 1,
2558                  &chroma_dc_coeff_token_bits[0], 1, 1, 1);
2559
2560         for(i=0; i<4; i++){
2561             init_vlc(&coeff_token_vlc[i], COEFF_TOKEN_VLC_BITS, 4*17,
2562                      &coeff_token_len [i][0], 1, 1,
2563                      &coeff_token_bits[i][0], 1, 1, 1);
2564         }
2565
2566         for(i=0; i<3; i++){
2567             init_vlc(&chroma_dc_total_zeros_vlc[i], CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 4,
2568                      &chroma_dc_total_zeros_len [i][0], 1, 1,
2569                      &chroma_dc_total_zeros_bits[i][0], 1, 1, 1);
2570         }
2571         for(i=0; i<15; i++){
2572             init_vlc(&total_zeros_vlc[i], TOTAL_ZEROS_VLC_BITS, 16,
2573                      &total_zeros_len [i][0], 1, 1,
2574                      &total_zeros_bits[i][0], 1, 1, 1);
2575         }
2576
2577         for(i=0; i<6; i++){
2578             init_vlc(&run_vlc[i], RUN_VLC_BITS, 7,
2579                      &run_len [i][0], 1, 1,
2580                      &run_bits[i][0], 1, 1, 1);
2581         }
2582         init_vlc(&run7_vlc, RUN7_VLC_BITS, 16,
2583                  &run_len [6][0], 1, 1,
2584                  &run_bits[6][0], 1, 1, 1);
2585     }
2586 }
2587
2588 /**
2589  * Sets the intra prediction function pointers.
2590  */
2591 static void init_pred_ptrs(H264Context *h){
2592 //    MpegEncContext * const s = &h->s;
2593
2594     h->pred4x4[VERT_PRED           ]= pred4x4_vertical_c;
2595     h->pred4x4[HOR_PRED            ]= pred4x4_horizontal_c;
2596     h->pred4x4[DC_PRED             ]= pred4x4_dc_c;
2597     h->pred4x4[DIAG_DOWN_LEFT_PRED ]= pred4x4_down_left_c;
2598     h->pred4x4[DIAG_DOWN_RIGHT_PRED]= pred4x4_down_right_c;
2599     h->pred4x4[VERT_RIGHT_PRED     ]= pred4x4_vertical_right_c;
2600     h->pred4x4[HOR_DOWN_PRED       ]= pred4x4_horizontal_down_c;
2601     h->pred4x4[VERT_LEFT_PRED      ]= pred4x4_vertical_left_c;
2602     h->pred4x4[HOR_UP_PRED         ]= pred4x4_horizontal_up_c;
2603     h->pred4x4[LEFT_DC_PRED        ]= pred4x4_left_dc_c;
2604     h->pred4x4[TOP_DC_PRED         ]= pred4x4_top_dc_c;
2605     h->pred4x4[DC_128_PRED         ]= pred4x4_128_dc_c;
2606
2607     h->pred8x8[DC_PRED8x8     ]= pred8x8_dc_c;
2608     h->pred8x8[VERT_PRED8x8   ]= pred8x8_vertical_c;
2609     h->pred8x8[HOR_PRED8x8    ]= pred8x8_horizontal_c;
2610     h->pred8x8[PLANE_PRED8x8  ]= pred8x8_plane_c;
2611     h->pred8x8[LEFT_DC_PRED8x8]= pred8x8_left_dc_c;
2612     h->pred8x8[TOP_DC_PRED8x8 ]= pred8x8_top_dc_c;
2613     h->pred8x8[DC_128_PRED8x8 ]= pred8x8_128_dc_c;
2614
2615     h->pred16x16[DC_PRED8x8     ]= pred16x16_dc_c;
2616     h->pred16x16[VERT_PRED8x8   ]= pred16x16_vertical_c;
2617     h->pred16x16[HOR_PRED8x8    ]= pred16x16_horizontal_c;
2618     h->pred16x16[PLANE_PRED8x8  ]= pred16x16_plane_c;
2619     h->pred16x16[LEFT_DC_PRED8x8]= pred16x16_left_dc_c;
2620     h->pred16x16[TOP_DC_PRED8x8 ]= pred16x16_top_dc_c;
2621     h->pred16x16[DC_128_PRED8x8 ]= pred16x16_128_dc_c;
2622 }
2623
2624 static void free_tables(H264Context *h){
2625     av_freep(&h->intra4x4_pred_mode);
2626     av_freep(&h->chroma_pred_mode_table);
2627     av_freep(&h->cbp_table);
2628     av_freep(&h->mvd_table[0]);
2629     av_freep(&h->mvd_table[1]);
2630     av_freep(&h->direct_table);
2631     av_freep(&h->non_zero_count);
2632     av_freep(&h->slice_table_base);
2633     av_freep(&h->top_borders[1]);
2634     av_freep(&h->top_borders[0]);
2635     h->slice_table= NULL;
2636
2637     av_freep(&h->mb2b_xy);
2638     av_freep(&h->mb2b8_xy);
2639
2640     av_freep(&h->s.obmc_scratchpad);
2641 }
2642
2643 /**
2644  * allocates tables.
2645  * needs width/height
2646  */
2647 static int alloc_tables(H264Context *h){
2648     MpegEncContext * const s = &h->s;
2649     const int big_mb_num= s->mb_stride * (s->mb_height+1);
2650     int x,y;
2651
2652     CHECKED_ALLOCZ(h->intra4x4_pred_mode, big_mb_num * 8  * sizeof(uint8_t))
2653
2654     CHECKED_ALLOCZ(h->non_zero_count    , big_mb_num * 16 * sizeof(uint8_t))
2655     CHECKED_ALLOCZ(h->slice_table_base  , big_mb_num * sizeof(uint8_t))
2656     CHECKED_ALLOCZ(h->top_borders[0]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2657     CHECKED_ALLOCZ(h->top_borders[1]    , s->mb_width * (16+8+8) * sizeof(uint8_t))
2658     CHECKED_ALLOCZ(h->cbp_table, big_mb_num * sizeof(uint16_t))
2659
2660     if( h->pps.cabac ) {
2661         CHECKED_ALLOCZ(h->chroma_pred_mode_table, big_mb_num * sizeof(uint8_t))
2662         CHECKED_ALLOCZ(h->mvd_table[0], 32*big_mb_num * sizeof(uint16_t));
2663         CHECKED_ALLOCZ(h->mvd_table[1], 32*big_mb_num * sizeof(uint16_t));
2664         CHECKED_ALLOCZ(h->direct_table, 32*big_mb_num * sizeof(uint8_t));
2665     }
2666
2667     memset(h->slice_table_base, -1, big_mb_num  * sizeof(uint8_t));
2668     h->slice_table= h->slice_table_base + s->mb_stride + 1;
2669
2670     CHECKED_ALLOCZ(h->mb2b_xy  , big_mb_num * sizeof(uint32_t));
2671     CHECKED_ALLOCZ(h->mb2b8_xy , big_mb_num * sizeof(uint32_t));
2672     for(y=0; y<s->mb_height; y++){
2673         for(x=0; x<s->mb_width; x++){
2674             const int mb_xy= x + y*s->mb_stride;
2675             const int b_xy = 4*x + 4*y*h->b_stride;
2676             const int b8_xy= 2*x + 2*y*h->b8_stride;
2677
2678             h->mb2b_xy [mb_xy]= b_xy;
2679             h->mb2b8_xy[mb_xy]= b8_xy;
2680         }
2681     }
2682
2683     s->obmc_scratchpad = NULL;
2684
2685     return 0;
2686 fail:
2687     free_tables(h);
2688     return -1;
2689 }
2690
2691 static void common_init(H264Context *h){
2692     MpegEncContext * const s = &h->s;
2693
2694     s->width = s->avctx->width;
2695     s->height = s->avctx->height;
2696     s->codec_id= s->avctx->codec->id;
2697
2698     init_pred_ptrs(h);
2699
2700     s->unrestricted_mv=1;
2701     s->decode=1; //FIXME
2702 }
2703
2704 static int decode_init(AVCodecContext *avctx){
2705     H264Context *h= avctx->priv_data;
2706     MpegEncContext * const s = &h->s;
2707
2708     MPV_decode_defaults(s);
2709
2710     s->avctx = avctx;
2711     common_init(h);
2712
2713     s->out_format = FMT_H264;
2714     s->workaround_bugs= avctx->workaround_bugs;
2715
2716     // set defaults
2717 //    s->decode_mb= ff_h263_decode_mb;
2718     s->low_delay= 1;
2719     avctx->pix_fmt= PIX_FMT_YUV420P;
2720
2721     decode_init_vlc(h);
2722
2723     if(avctx->extradata_size > 0 && avctx->extradata &&
2724        *(char *)avctx->extradata == 1){
2725         h->is_avc = 1;
2726         h->got_avcC = 0;
2727     } else {
2728         h->is_avc = 0;
2729     }
2730
2731     return 0;
2732 }
2733
2734 static void frame_start(H264Context *h){
2735     MpegEncContext * const s = &h->s;
2736     int i;
2737
2738     MPV_frame_start(s, s->avctx);
2739     ff_er_frame_start(s);
2740
2741     assert(s->linesize && s->uvlinesize);
2742
2743     for(i=0; i<16; i++){
2744         h->block_offset[i]= 4*((scan8[i] - scan8[0])&7) + 4*s->linesize*((scan8[i] - scan8[0])>>3);
2745         h->block_offset[24+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->linesize*((scan8[i] - scan8[0])>>3);
2746     }
2747     for(i=0; i<4; i++){
2748         h->block_offset[16+i]=
2749         h->block_offset[20+i]= 4*((scan8[i] - scan8[0])&7) + 4*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2750         h->block_offset[24+16+i]=
2751         h->block_offset[24+20+i]= 4*((scan8[i] - scan8[0])&7) + 8*s->uvlinesize*((scan8[i] - scan8[0])>>3);
2752     }
2753
2754     /* can't be in alloc_tables because linesize isn't known there.
2755      * FIXME: redo bipred weight to not require extra buffer? */
2756     if(!s->obmc_scratchpad)
2757         s->obmc_scratchpad = av_malloc(16*s->linesize + 2*8*s->uvlinesize);
2758
2759 //    s->decode= (s->flags&CODEC_FLAG_PSNR) || !s->encoding || s->current_picture.reference /*|| h->contains_intra*/ || 1;
2760 }
2761
2762 static inline void backup_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2763     MpegEncContext * const s = &h->s;
2764     int i;
2765
2766     src_y  -=   linesize;
2767     src_cb -= uvlinesize;
2768     src_cr -= uvlinesize;
2769
2770     // There are two lines saved, the line above the the top macroblock of a pair,
2771     // and the line above the bottom macroblock
2772     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2773     for(i=1; i<17; i++){
2774         h->left_border[i]= src_y[15+i*  linesize];
2775     }
2776
2777     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  16*linesize);
2778     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+16*linesize);
2779
2780     if(!(s->flags&CODEC_FLAG_GRAY)){
2781         h->left_border[17  ]= h->top_borders[0][s->mb_x][16+7];
2782         h->left_border[17+9]= h->top_borders[0][s->mb_x][24+7];
2783         for(i=1; i<9; i++){
2784             h->left_border[i+17  ]= src_cb[7+i*uvlinesize];
2785             h->left_border[i+17+9]= src_cr[7+i*uvlinesize];
2786         }
2787         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+8*uvlinesize);
2788         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+8*uvlinesize);
2789     }
2790 }
2791
/**
 * Exchanges the samples bordering the current macroblock (left column,
 * top-left corner and the row above) with the copies kept in h->left_border
 * and h->top_borders[0][s->mb_x].
 *
 * hl_decode_mb() calls this around intra prediction when h->deblocking_filter
 * is set: with xchg=1 before predicting and with xchg=0 afterwards.
 *
 * The left column is only touched if s->mb_x > 0 and the top row only if
 * s->mb_y > 0; without a row above, the corner entry (index 0) is skipped.
 *
 * @param src_y      first luma sample of the macroblock
 * @param src_cb     first Cb sample of the macroblock
 * @param src_cr     first Cr sample of the macroblock
 * @param linesize   distance between luma rows
 * @param uvlinesize distance between chroma rows
 * @param xchg       nonzero: swap saved and picture samples;
 *                   zero: only write the saved samples into the picture.
 *                   The right half of the luma top row and both chroma top
 *                   rows are always swapped, whatever this value is.
 */
static inline void xchg_mb_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left = (s->mb_x > 0);
    int deblock_top  = (s->mb_y > 0);

    // step back to the sample above and to the left of the macroblock
    src_y  -=   linesize + 1;
    src_cb -= uvlinesize + 1;
    src_cr -= uvlinesize + 1;

// XCHG(a,b,t,xchg): b always receives the old value of a; a receives the old
// value of b only if xchg is nonzero. t is a temporary of a matching type.
// xchg_pair_border() defines the same macro with identical text.
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // index 0 is the corner sample, indices 1..16 the column left of rows 0..15
        for(i = !deblock_top; i<17; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        // row above the macroblock, handled as two 8-byte words
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
    }

    if(!(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            // Cb entries start at index 17, Cr entries at index 17+9
            for(i = !deblock_top; i<9; i++){
                XCHG(h->left_border[i+17  ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+17+9], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
        }
    }
}
2833
2834 static inline void backup_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize){
2835     MpegEncContext * const s = &h->s;
2836     int i;
2837
2838     src_y  -= 2 *   linesize;
2839     src_cb -= 2 * uvlinesize;
2840     src_cr -= 2 * uvlinesize;
2841
2842     // There are two lines saved, the line above the the top macroblock of a pair,
2843     // and the line above the bottom macroblock
2844     h->left_border[0]= h->top_borders[0][s->mb_x][15];
2845     h->left_border[1]= h->top_borders[1][s->mb_x][15];
2846     for(i=2; i<34; i++){
2847         h->left_border[i]= src_y[15+i*  linesize];
2848     }
2849
2850     *(uint64_t*)(h->top_borders[0][s->mb_x]+0)= *(uint64_t*)(src_y +  32*linesize);
2851     *(uint64_t*)(h->top_borders[0][s->mb_x]+8)= *(uint64_t*)(src_y +8+32*linesize);
2852     *(uint64_t*)(h->top_borders[1][s->mb_x]+0)= *(uint64_t*)(src_y +  33*linesize);
2853     *(uint64_t*)(h->top_borders[1][s->mb_x]+8)= *(uint64_t*)(src_y +8+33*linesize);
2854
2855     if(!(s->flags&CODEC_FLAG_GRAY)){
2856         h->left_border[34     ]= h->top_borders[0][s->mb_x][16+7];
2857         h->left_border[34+   1]= h->top_borders[1][s->mb_x][16+7];
2858         h->left_border[34+18  ]= h->top_borders[0][s->mb_x][24+7];
2859         h->left_border[34+18+1]= h->top_borders[1][s->mb_x][24+7];
2860         for(i=2; i<18; i++){
2861             h->left_border[i+34   ]= src_cb[7+i*uvlinesize];
2862             h->left_border[i+34+18]= src_cr[7+i*uvlinesize];
2863         }
2864         *(uint64_t*)(h->top_borders[0][s->mb_x]+16)= *(uint64_t*)(src_cb+16*uvlinesize);
2865         *(uint64_t*)(h->top_borders[0][s->mb_x]+24)= *(uint64_t*)(src_cr+16*uvlinesize);
2866         *(uint64_t*)(h->top_borders[1][s->mb_x]+16)= *(uint64_t*)(src_cb+17*uvlinesize);
2867         *(uint64_t*)(h->top_borders[1][s->mb_x]+24)= *(uint64_t*)(src_cr+17*uvlinesize);
2868     }
2869 }
2870
/**
 * Macroblock-pair variant of xchg_mb_border(): exchanges the samples
 * bordering a vertical macroblock pair (left column, two corner samples and
 * the two rows above) with the copies kept in h->left_border,
 * h->top_borders[0][s->mb_x] and h->top_borders[1][s->mb_x].
 *
 * hl_decode_mb() calls this for h->mb_aff_frame when h->deblocking_filter is
 * set: with xchg=1 before intra prediction of the top macroblock and with
 * xchg=0 after the bottom one.
 *
 * The left column is only touched if s->mb_x > 0 and the top rows only if
 * s->mb_y > 0; without rows above, the two corner entries are skipped.
 *
 * @param src_y      first luma sample of the pair
 * @param src_cb     first Cb sample of the pair
 * @param src_cr     first Cr sample of the pair
 * @param linesize   distance between luma rows
 * @param uvlinesize distance between chroma rows
 * @param xchg       nonzero: swap saved and picture samples;
 *                   zero: only write the saved samples into the picture.
 *                   The right halves of the luma top rows and all chroma top
 *                   rows are always swapped, whatever this value is.
 */
static inline void xchg_pair_border(H264Context *h, uint8_t *src_y, uint8_t *src_cb, uint8_t *src_cr, int linesize, int uvlinesize, int xchg){
    MpegEncContext * const s = &h->s;
    int temp8, i;
    uint64_t temp64;
    int deblock_left = (s->mb_x > 0);
    int deblock_top  = (s->mb_y > 0);

    tprintf("xchg_pair_border: src_y:%p src_cb:%p src_cr:%p ls:%d uvls:%d\n", src_y, src_cb, src_cr, linesize, uvlinesize);

    // step back two rows and one column from the first sample of the pair
    src_y  -= 2 *   linesize + 1;
    src_cb -= 2 * uvlinesize + 1;
    src_cr -= 2 * uvlinesize + 1;

// XCHG(a,b,t,xchg): b always receives the old value of a; a receives the old
// value of b only if xchg is nonzero. t is a temporary of a matching type.
// xchg_mb_border() defines the same macro with identical text.
#define XCHG(a,b,t,xchg)\
t= a;\
if(xchg)\
    a= b;\
b= t;

    if(deblock_left){
        // indices 0 and 1 are the corner samples, 2..33 the column left of rows 0..31
        for(i = (!deblock_top)<<1; i<34; i++){
            XCHG(h->left_border[i     ], src_y [i*  linesize], temp8, xchg);
        }
    }

    if(deblock_top){
        // the two rows above the pair: top_borders[0] is the upper one
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+0), *(uint64_t*)(src_y +1), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+8), *(uint64_t*)(src_y +9), temp64, 1);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+0), *(uint64_t*)(src_y +1 +linesize), temp64, xchg);
        XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+8), *(uint64_t*)(src_y +9 +linesize), temp64, 1);
    }

    if(!(s->flags&CODEC_FLAG_GRAY)){
        if(deblock_left){
            // Cb entries start at index 34, Cr entries at index 34+18
            for(i = (!deblock_top) << 1; i<18; i++){
                XCHG(h->left_border[i+34   ], src_cb[i*uvlinesize], temp8, xchg);
                XCHG(h->left_border[i+34+18], src_cr[i*uvlinesize], temp8, xchg);
            }
        }
        if(deblock_top){
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+16), *(uint64_t*)(src_cb+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[0][s->mb_x]+24), *(uint64_t*)(src_cr+1), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+16), *(uint64_t*)(src_cb+1 +uvlinesize), temp64, 1);
            XCHG(*(uint64_t*)(h->top_borders[1][s->mb_x]+24), *(uint64_t*)(src_cr+1 +uvlinesize), temp64, 1);
        }
    }
}
2918
2919 static void hl_decode_mb(H264Context *h){
2920     MpegEncContext * const s = &h->s;
2921     const int mb_x= s->mb_x;
2922     const int mb_y= s->mb_y;
2923     const int mb_xy= mb_x + mb_y*s->mb_stride;
2924     const int mb_type= s->current_picture.mb_type[mb_xy];
2925     uint8_t  *dest_y, *dest_cb, *dest_cr;
2926     int linesize, uvlinesize /*dct_offset*/;
2927     int i;
2928     int *block_offset = &h->block_offset[0];
2929     const unsigned int bottom = mb_y & 1;
2930
2931     if(!s->decode)
2932         return;
2933
2934     dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
2935     dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2936     dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
2937
2938     if (h->mb_field_decoding_flag) {
2939         linesize = s->linesize * 2;
2940         uvlinesize = s->uvlinesize * 2;
2941         block_offset = &h->block_offset[24];
2942         if(mb_y&1){ //FIXME move out of this func?
2943             dest_y -= s->linesize*15;
2944             dest_cb-= s->uvlinesize*7;
2945             dest_cr-= s->uvlinesize*7;
2946         }
2947     } else {
2948         linesize = s->linesize;
2949         uvlinesize = s->uvlinesize;
2950 //        dct_offset = s->linesize * 16;
2951     }
2952
2953     if (IS_INTRA_PCM(mb_type)) {
2954         unsigned int x, y;
2955
2956         // The pixels are stored in h->mb array in the same order as levels,
2957         // copy them in output in the correct order.
2958         for(i=0; i<16; i++) {
2959             for (y=0; y<4; y++) {
2960                 for (x=0; x<4; x++) {
2961                     *(dest_y + block_offset[i] + y*linesize + x) = h->mb[i*16+y*4+x];
2962                 }
2963             }
2964         }
2965         for(i=16; i<16+4; i++) {
2966             for (y=0; y<4; y++) {
2967                 for (x=0; x<4; x++) {
2968                     *(dest_cb + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2969                 }
2970             }
2971         }
2972         for(i=20; i<20+4; i++) {
2973             for (y=0; y<4; y++) {
2974                 for (x=0; x<4; x++) {
2975                     *(dest_cr + block_offset[i] + y*uvlinesize + x) = h->mb[i*16+y*4+x];
2976                 }
2977             }
2978         }
2979     } else {
2980         if(IS_INTRA(mb_type)){
2981             if(h->deblocking_filter) {
2982                 if (h->mb_aff_frame) {
2983                     if (!bottom)
2984                         xchg_pair_border(h, dest_y, dest_cb, dest_cr, s->linesize, s->uvlinesize, 1);
2985                 } else {
2986                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 1);
2987                 }
2988             }
2989
2990             if(!(s->flags&CODEC_FLAG_GRAY)){
2991                 h->pred8x8[ h->chroma_pred_mode ](dest_cb, uvlinesize);
2992                 h->pred8x8[ h->chroma_pred_mode ](dest_cr, uvlinesize);
2993             }
2994
2995             if(IS_INTRA4x4(mb_type)){
2996                 if(!s->encoding){
2997                     for(i=0; i<16; i++){
2998                         uint8_t * const ptr= dest_y + block_offset[i];
2999                         uint8_t *topright;
3000                         const int dir= h->intra4x4_pred_mode_cache[ scan8[i] ];
3001                         int tr;
3002
3003                         if(dir == DIAG_DOWN_LEFT_PRED || dir == VERT_LEFT_PRED){
3004                             const int topright_avail= (h->topright_samples_available<<i)&0x8000;
3005                             assert(mb_y || linesize <= block_offset[i]);
3006                             if(!topright_avail){
3007                                 tr= ptr[3 - linesize]*0x01010101;
3008                                 topright= (uint8_t*) &tr;
3009                             }else if(i==5 && h->deblocking_filter){
3010                                 tr= *(uint32_t*)h->top_borders[h->mb_aff_frame ? IS_INTERLACED(mb_type) ? bottom : 1 : 0][mb_x+1];
3011                                 topright= (uint8_t*) &tr;
3012                             }else
3013                                 topright= ptr + 4 - linesize;
3014                         }else
3015                             topright= NULL;
3016
3017                         h->pred4x4[ dir ](ptr, topright, linesize);
3018                         if(h->non_zero_count_cache[ scan8[i] ]){
3019                             if(s->codec_id == CODEC_ID_H264)
3020                                 s->dsp.h264_idct_add(ptr, h->mb + i*16, linesize);
3021                             else
3022                                 svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, 0);
3023                         }
3024                     }
3025                 }
3026             }else{
3027                 h->pred16x16[ h->intra16x16_pred_mode ](dest_y , linesize);
3028                 if(s->codec_id == CODEC_ID_H264)
3029                     h264_luma_dc_dequant_idct_c(h->mb, s->qscale);
3030                 else
3031                     svq3_luma_dc_dequant_idct_c(h->mb, s->qscale);
3032             }
3033             if(h->deblocking_filter) {
3034                 if (h->mb_aff_frame) {
3035                     if (bottom) {
3036                         uint8_t *pair_dest_y  = s->current_picture.data[0] + ((mb_y-1) * 16* s->linesize  ) + mb_x * 16;
3037                         uint8_t *pair_dest_cb = s->current_picture.data[1] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3038                         uint8_t *pair_dest_cr = s->current_picture.data[2] + ((mb_y-1) * 8 * s->uvlinesize) + mb_x * 8;
3039                         s->mb_y--;
3040                         xchg_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize, 0);
3041                         s->mb_y++;
3042                     }
3043                 } else {
3044                     xchg_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize, 0);
3045                 }
3046             }
3047         }else if(s->codec_id == CODEC_ID_H264){
3048             hl_motion(h, dest_y, dest_cb, dest_cr,
3049                       s->dsp.put_h264_qpel_pixels_tab, s->dsp.put_h264_chroma_pixels_tab,
3050                       s->dsp.avg_h264_qpel_pixels_tab, s->dsp.avg_h264_chroma_pixels_tab,
3051                       s->dsp.weight_h264_pixels_tab, s->dsp.biweight_h264_pixels_tab);
3052         }
3053
3054
3055         if(!IS_INTRA4x4(mb_type)){
3056             if(s->codec_id == CODEC_ID_H264){
3057                 for(i=0; i<16; i++){
3058                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3059                         uint8_t * const ptr= dest_y + block_offset[i];
3060                         s->dsp.h264_idct_add(ptr, h->mb + i*16, linesize);
3061                     }
3062                 }
3063             }else{
3064                 for(i=0; i<16; i++){
3065                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){ //FIXME benchmark weird rule, & below
3066                         uint8_t * const ptr= dest_y + block_offset[i];
3067                         svq3_add_idct_c(ptr, h->mb + i*16, linesize, s->qscale, IS_INTRA(mb_type) ? 1 : 0);
3068                     }
3069                 }
3070             }
3071         }
3072
3073         if(!(s->flags&CODEC_FLAG_GRAY)){
3074             chroma_dc_dequant_idct_c(h->mb + 16*16, h->chroma_qp);
3075             chroma_dc_dequant_idct_c(h->mb + 16*16+4*16, h->chroma_qp);
3076             if(s->codec_id == CODEC_ID_H264){
3077                 for(i=16; i<16+4; i++){
3078                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3079                         uint8_t * const ptr= dest_cb + block_offset[i];
3080                         s->dsp.h264_idct_add(ptr, h->mb + i*16, uvlinesize);
3081                     }
3082                 }
3083                 for(i=20; i<20+4; i++){
3084                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3085                         uint8_t * const ptr= dest_cr + block_offset[i];
3086                         s->dsp.h264_idct_add(ptr, h->mb + i*16, uvlinesize);
3087                     }
3088                 }
3089             }else{
3090                 for(i=16; i<16+4; i++){
3091                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3092                         uint8_t * const ptr= dest_cb + block_offset[i];
3093                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3094                     }
3095                 }
3096                 for(i=20; i<20+4; i++){
3097                     if(h->non_zero_count_cache[ scan8[i] ] || h->mb[i*16]){
3098                         uint8_t * const ptr= dest_cr + block_offset[i];
3099                         svq3_add_idct_c(ptr, h->mb + i*16, uvlinesize, chroma_qp[s->qscale + 12] - 12, 2);
3100                     }
3101                 }
3102             }
3103         }
3104     }
3105     if(h->deblocking_filter) {
3106         if (h->mb_aff_frame) {
3107             const int mb_y = s->mb_y - 1;
3108             uint8_t  *pair_dest_y, *pair_dest_cb, *pair_dest_cr;
3109             const int mb_xy= mb_x + mb_y*s->mb_stride;
3110             const int mb_type_top   = s->current_picture.mb_type[mb_xy];
3111             const int mb_type_bottom= s->current_picture.mb_type[mb_xy+s->mb_stride];
3112             uint8_t tmp = s->current_picture.data[1][384];
3113             if (!bottom) return;
3114             pair_dest_y  = s->current_picture.data[0] + (mb_y * 16* s->linesize  ) + mb_x * 16;
3115             pair_dest_cb = s->current_picture.data[1] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3116             pair_dest_cr = s->current_picture.data[2] + (mb_y * 8 * s->uvlinesize) + mb_x * 8;
3117
3118             backup_pair_border(h, pair_dest_y, pair_dest_cb, pair_dest_cr, s->linesize, s->uvlinesize);
3119             // TODO deblock a pair
3120             // top
3121             s->mb_y--;
3122             tprintf("call mbaff filter_mb mb_x:%d mb_y:%d pair_dest_y = %p, dest_y = %p\n", mb_x, mb_y, pair_dest_y, dest_y);
3123             fill_caches(h, mb_type_top, 1); //FIXME don't fill stuff which isn't used by filter_mb
3124             filter_mb(h, mb_x, mb_y, pair_dest_y, pair_dest_cb, pair_dest_cr, linesize, uvlinesize);
3125             if (tmp != s->current_picture.data[1][384]) {
3126                 tprintf("modified pixel 8,1 (1)\n");
3127             }
3128             // bottom
3129             s->mb_y++;
3130             tprintf("call mbaff filter_mb\n");
3131             fill_caches(h, mb_type_bottom, 1); //FIXME don't fill stuff which isn't used by filter_mb
3132             filter_mb(h, mb_x, mb_y+1, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3133             if (tmp != s->current_picture.data[1][384]) {
3134                 tprintf("modified pixel 8,1 (2)\n");
3135             }
3136         } else {
3137             tprintf("call filter_mb\n");
3138             backup_mb_border(h, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3139             fill_caches(h, mb_type, 1); //FIXME don't fill stuff which isn't used by filter_mb
3140             filter_mb(h, mb_x, mb_y, dest_y, dest_cb, dest_cr, linesize, uvlinesize);
3141         }
3142     }
3143 }
3144
3145 /**
3146  * fills the default_ref_list.
3147  */
3148 static int fill_default_ref_list(H264Context *h){
3149     MpegEncContext * const s = &h->s;
3150     int i;
3151     int smallest_poc_greater_than_current = -1;
3152     Picture sorted_short_ref[32];
3153
3154     if(h->slice_type==B_TYPE){
3155         int out_i;
3156         int limit= -1;
3157
3158         /* sort frame according to poc in B slice */
3159         for(out_i=0; out_i<h->short_ref_count; out_i++){
3160             int best_i=-1;
3161             int best_poc=INT_MAX;
3162
3163             for(i=0; i<h->short_ref_count; i++){
3164                 const int poc= h->short_ref[i]->poc;
3165                 if(poc > limit && poc < best_poc){
3166                     best_poc= poc;
3167                     best_i= i;
3168                 }
3169             }
3170
3171             assert(best_i != -1);
3172
3173             limit= best_poc;
3174             sorted_short_ref[out_i]= *h->short_ref[best_i];
3175             tprintf("sorted poc: %d->%d poc:%d fn:%d\n", best_i, out_i, sorted_short_ref[out_i].poc, sorted_short_ref[out_i].frame_num);
3176             if (-1 == smallest_poc_greater_than_current) {
3177                 if (h->short_ref[best_i]->poc >= s->current_picture_ptr->poc) {
3178                     smallest_poc_greater_than_current = out_i;
3179                 }
3180             }
3181         }
3182     }
3183
3184     if(s->picture_structure == PICT_FRAME){
3185         if(h->slice_type==B_TYPE){
3186             int list;
3187             tprintf("current poc: %d, smallest_poc_greater_than_current: %d\n", s->current_picture_ptr->poc, smallest_poc_greater_than_current);
3188
3189             // find the largest poc
3190             for(list=0; list<2; list++){
3191                 int index = 0;
3192                 int j= -99;
3193                 int step= list ? -1 : 1;
3194
3195                 for(i=0; i<h->short_ref_count && index < h->ref_count[list]; i++, j+=step) {
3196                     while(j<0 || j>= h->short_ref_count){
3197                         step = -step;
3198                         j= smallest_poc_greater_than_current + (step>>1);
3199                     }
3200                     if(sorted_short_ref[j].reference != 3) continue;
3201                     h->default_ref_list[list][index  ]= sorted_short_ref[j];
3202                     h->default_ref_list[list][index++].pic_id= sorted_short_ref[j].frame_num;
3203                 }
3204
3205                 for(i = 0; i < 16 && index < h->ref_count[ list ]; i++){
3206                     if(h->long_ref[i] == NULL) continue;
3207                     if(h->long_ref[i]->reference != 3) continue;
3208
3209                     h->default_ref_list[ list ][index  ]= *h->long_ref[i];
3210                     h->default_ref_list[ list ][index++].pic_id= i;;
3211                 }
3212
3213                 if(list && (smallest_poc_greater_than_current<=0 || smallest_poc_greater_than_current>=h->short_ref_count) && (1 < index)){
3214                     // swap the two first elements of L1 when
3215                     // L0 and L1 are identical
3216                     Picture temp= h->default_ref_list[1][0];
3217                     h->default_ref_list[1][0] = h->default_ref_list[1][1];
3218                     h->default_ref_list[1][0] = temp;
3219                 }
3220
3221                 if(index < h->ref_count[ list ])
3222                     memset(&h->default_ref_list[list][index], 0, sizeof(Picture)*(h->ref_count[ list ] - index));
3223             }
3224         }else{
3225             int index=0;
3226             for(i=0; i<h->short_ref_count; i++){
3227                 if(h->short_ref[i]->reference != 3) continue; //FIXME refernce field shit
3228                 h->default_ref_list[0][index  ]= *h->short_ref[i];
3229                 h->default_ref_list[0][index++].pic_id= h->short_ref[i]->frame_num;
3230             }
3231             for(i = 0; i < 16; i++){
3232                 if(h->long_ref[i] == NULL) continue;
3233                 if(h->long_ref[i]->reference != 3) continue;
3234                 h->default_ref_list[0][index  ]= *h->long_ref[i];
3235                 h->default_ref_list[0][index++].pic_id= i;;
3236             }
3237             if(index < h->ref_count[0])
3238                 memset(&h->default_ref_list[0][index], 0, sizeof(Picture)*(h->ref_count[0] - index));
3239         }
3240     }else{ //FIELD
3241         if(h->slice_type==B_TYPE){
3242         }else{
3243             //FIXME second field balh
3244         }
3245     }
3246 #ifdef TRACE
3247     for (i=0; i<h->ref_count[0]; i++) {
3248         tprintf("List0: %s fn:%d 0x%p\n", (h->default_ref_list[0][i].long_ref ? "LT" : "ST"), h->default_ref_list[0][i].pic_id, h->default_ref_list[0][i].data[0]);
3249     }
3250     if(h->slice_type==B_TYPE){
3251         for (i=0; i<h->ref_count[1]; i++) {
3252             tprintf("List1: %s fn:%d 0x%p\n", (h->default_ref_list[1][i].long_ref ? "LT" : "ST"), h->default_ref_list[1][i].pic_id, h->default_ref_list[0][i].data[0]);
3253         }
3254     }
3255 #endif
3256     return 0;
3257 }
3258
3259 static void print_short_term(H264Context *h);
3260 static void print_long_term(H264Context *h);
3261
3262 static int decode_ref_pic_list_reordering(H264Context *h){
3263     MpegEncContext * const s = &h->s;
3264     int list;
3265
3266     print_short_term(h);
3267     print_long_term(h);
3268     if(h->slice_type==I_TYPE || h->slice_type==SI_TYPE) return 0; //FIXME move before func
3269
3270     for(list=0; list<2; list++){
3271         memcpy(h->ref_list[list], h->default_ref_list[list], sizeof(Picture)*h->ref_count[list]);
3272
3273         if(get_bits1(&s->gb)){
3274             int pred= h->curr_pic_num;
3275             int index;
3276
3277             for(index=0; ; index++){
3278                 int reordering_of_pic_nums_idc= get_ue_golomb(&s->gb);
3279                 int pic_id;
3280                 int i;
3281                 Picture *ref = NULL;
3282
3283                 if(reordering_of_pic_nums_idc==3)
3284                     break;
3285
3286                 if(index >= h->ref_count[list]){
3287                     av_log(h->s.avctx, AV_LOG_ERROR, "reference count overflow\n");
3288                     return -1;
3289                 }
3290
3291                 if(reordering_of_pic_nums_idc<3){
3292                     if(reordering_of_pic_nums_idc<2){
3293                         const int abs_diff_pic_num= get_ue_golomb(&s->gb) + 1;
3294
3295                         if(abs_diff_pic_num >= h->max_pic_num){
3296                             av_log(h->s.avctx, AV_LOG_ERROR, "abs_diff_pic_num overflow\n");
3297                             return -1;
3298                         }
3299
3300                         if(reordering_of_pic_nums_idc == 0) pred-= abs_diff_pic_num;
3301                         else                                pred+= abs_diff_pic_num;
3302                         pred &= h->max_pic_num - 1;
3303
3304                         for(i= h->ref_count[list]-1; i>=0; i--){
3305                             if(h->ref_list[list][i].data[0] != NULL && h->ref_list[list][i].pic_id == pred && h->ref_list[list][i].long_ref==0) // ignore non existing pictures by testing data[0] pointer
3306                                 break;
3307                         }
3308                     }else{
3309                         pic_id= get_ue_golomb(&s->gb); //long_term_pic_idx
3310
3311                         for(i= h->ref_count[list]-1; i>=0; i--){
3312                             if(h->ref_list[list][i].pic_id == pic_id && h->ref_list[list][i].long_ref==1) // no need to ignore non existing pictures as non existing pictures have long_ref==0
3313                                 break;
3314                         }
3315                     }
3316
3317                     if (i < 0) {
3318                         av_log(h->s.avctx, AV_LOG_ERROR, "reference picture missing during reorder\n");
3319                         memset(&h->ref_list[list][index], 0, sizeof(Picture)); //FIXME
3320                     } else if (i != index) /* this test is not necessary, it is only an optimisation to skip double copy of Picture structure in this case */ {
3321                         Picture tmp= h->ref_list[list][i];
3322                         if (i < index) {
3323                             i = h->ref_count[list];
3324                         }
3325                         for(; i > index; i--){
3326                             h->ref_list[list][i]= h->ref_list[list][i-1];
3327                         }
3328                         h->ref_list[list][index]= tmp;
3329                     }
3330                 }else{
3331                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal reordering_of_pic_nums_idc\n");
3332                     return -1;
3333                 }
3334             }
3335         }
3336
3337         if(h->slice_type!=B_TYPE) break;
3338     }
3339
3340     if(h->slice_type==B_TYPE && !h->direct_spatial_mv_pred)
3341         direct_dist_scale_factor(h);
3342     direct_ref_list_init(h);
3343     return 0;
3344 }
3345
3346 static int pred_weight_table(H264Context *h){
3347     MpegEncContext * const s = &h->s;
3348     int list, i;
3349     int luma_def, chroma_def;
3350
3351     h->use_weight= 0;
3352     h->use_weight_chroma= 0;
3353     h->luma_log2_weight_denom= get_ue_golomb(&s->gb);
3354     h->chroma_log2_weight_denom= get_ue_golomb(&s->gb);
3355     luma_def = 1<<h->luma_log2_weight_denom;
3356     chroma_def = 1<<h->chroma_log2_weight_denom;
3357
3358     for(list=0; list<2; list++){
3359         for(i=0; i<h->ref_count[list]; i++){
3360             int luma_weight_flag, chroma_weight_flag;
3361
3362             luma_weight_flag= get_bits1(&s->gb);
3363             if(luma_weight_flag){
3364                 h->luma_weight[list][i]= get_se_golomb(&s->gb);
3365                 h->luma_offset[list][i]= get_se_golomb(&s->gb);
3366                 if(   h->luma_weight[list][i] != luma_def
3367                    || h->luma_offset[list][i] != 0)
3368                     h->use_weight= 1;
3369             }else{
3370                 h->luma_weight[list][i]= luma_def;
3371                 h->luma_offset[list][i]= 0;
3372             }
3373
3374             chroma_weight_flag= get_bits1(&s->gb);
3375             if(chroma_weight_flag){
3376                 int j;
3377                 for(j=0; j<2; j++){
3378                     h->chroma_weight[list][i][j]= get_se_golomb(&s->gb);
3379                     h->chroma_offset[list][i][j]= get_se_golomb(&s->gb);
3380                     if(   h->chroma_weight[list][i][j] != chroma_def
3381                        || h->chroma_offset[list][i][j] != 0)
3382                         h->use_weight_chroma= 1;
3383                 }
3384             }else{
3385                 int j;
3386                 for(j=0; j<2; j++){
3387                     h->chroma_weight[list][i][j]= chroma_def;
3388                     h->chroma_offset[list][i][j]= 0;
3389                 }
3390             }
3391         }
3392         if(h->slice_type != B_TYPE) break;
3393     }
3394     h->use_weight= h->use_weight || h->use_weight_chroma;
3395     return 0;
3396 }
3397
3398 static void implicit_weight_table(H264Context *h){
3399     MpegEncContext * const s = &h->s;
3400     int ref0, ref1;
3401     int cur_poc = s->current_picture_ptr->poc;
3402
3403     if(   h->ref_count[0] == 1 && h->ref_count[1] == 1
3404        && h->ref_list[0][0].poc + h->ref_list[1][0].poc == 2*cur_poc){
3405         h->use_weight= 0;
3406         h->use_weight_chroma= 0;
3407         return;
3408     }
3409
3410     h->use_weight= 2;
3411     h->use_weight_chroma= 2;
3412     h->luma_log2_weight_denom= 5;
3413     h->chroma_log2_weight_denom= 5;
3414
3415     /* FIXME: MBAFF */
3416     for(ref0=0; ref0 < h->ref_count[0]; ref0++){
3417         int poc0 = h->ref_list[0][ref0].poc;
3418         for(ref1=0; ref1 < h->ref_count[1]; ref1++){
3419             int poc1 = h->ref_list[1][ref1].poc;
3420             int td = clip(poc1 - poc0, -128, 127);
3421             if(td){
3422                 int tb = clip(cur_poc - poc0, -128, 127);
3423                 int tx = (16384 + (ABS(td) >> 1)) / td;
3424                 int dist_scale_factor = clip((tb*tx + 32) >> 6, -1024, 1023) >> 2;
3425                 if(dist_scale_factor < -64 || dist_scale_factor > 128)
3426                     h->implicit_weight[ref0][ref1] = 32;
3427                 else
3428                     h->implicit_weight[ref0][ref1] = 64 - dist_scale_factor;
3429             }else
3430                 h->implicit_weight[ref0][ref1] = 32;
3431         }
3432     }
3433 }
3434
3435 static inline void unreference_pic(H264Context *h, Picture *pic){
3436     int i;
3437     pic->reference=0;
3438     if(pic == h->delayed_output_pic)
3439         pic->reference=1;
3440     else{
3441         for(i = 0; h->delayed_pic[i]; i++)
3442             if(pic == h->delayed_pic[i]){
3443                 pic->reference=1;
3444                 break;
3445             }
3446     }
3447 }
3448
3449 /**
3450  * instantaneous decoder refresh.
3451  */
3452 static void idr(H264Context *h){
3453     int i;
3454
3455     for(i=0; i<16; i++){
3456         if (h->long_ref[i] != NULL) {
3457             unreference_pic(h, h->long_ref[i]);
3458             h->long_ref[i]= NULL;
3459         }
3460     }
3461     h->long_ref_count=0;
3462
3463     for(i=0; i<h->short_ref_count; i++){
3464         unreference_pic(h, h->short_ref[i]);
3465         h->short_ref[i]= NULL;
3466     }
3467     h->short_ref_count=0;
3468 }
3469
3470 /* forget old pics after a seek */
3471 static void flush_dpb(AVCodecContext *avctx){
3472     H264Context *h= avctx->priv_data;
3473     int i;
3474     for(i=0; i<16; i++)
3475         h->delayed_pic[i]= NULL;
3476     h->delayed_output_pic= NULL;
3477     idr(h);
3478 }
3479
3480 /**
3481  *
3482  * @return the removed picture or NULL if an error occurs
3483  */
3484 static Picture * remove_short(H264Context *h, int frame_num){
3485     MpegEncContext * const s = &h->s;
3486     int i;
3487
3488     if(s->avctx->debug&FF_DEBUG_MMCO)
3489         av_log(h->s.avctx, AV_LOG_DEBUG, "remove short %d count %d\n", frame_num, h->short_ref_count);
3490
3491     for(i=0; i<h->short_ref_count; i++){
3492         Picture *pic= h->short_ref[i];
3493         if(s->avctx->debug&FF_DEBUG_MMCO)
3494             av_log(h->s.avctx, AV_LOG_DEBUG, "%d %d %p\n", i, pic->frame_num, pic);
3495         if(pic->frame_num == frame_num){
3496             h->short_ref[i]= NULL;
3497             memmove(&h->short_ref[i], &h->short_ref[i+1], (h->short_ref_count - i - 1)*sizeof(Picture*));
3498             h->short_ref_count--;
3499             return pic;
3500         }
3501     }
3502     return NULL;
3503 }
3504
3505 /**
3506  *
3507  * @return the removed picture or NULL if an error occurs
3508  */
3509 static Picture * remove_long(H264Context *h, int i){
3510     Picture *pic;
3511
3512     pic= h->long_ref[i];
3513     h->long_ref[i]= NULL;
3514     if(pic) h->long_ref_count--;
3515
3516     return pic;
3517 }
3518
3519 /**
3520  * print short term list
3521  */
3522 static void print_short_term(H264Context *h) {
3523     uint32_t i;
3524     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3525         av_log(h->s.avctx, AV_LOG_DEBUG, "short term list:\n");
3526         for(i=0; i<h->short_ref_count; i++){
3527             Picture *pic= h->short_ref[i];
3528             av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3529         }
3530     }
3531 }
3532
3533 /**
3534  * print long term list
3535  */
3536 static void print_long_term(H264Context *h) {
3537     uint32_t i;
3538     if(h->s.avctx->debug&FF_DEBUG_MMCO) {
3539         av_log(h->s.avctx, AV_LOG_DEBUG, "long term list:\n");
3540         for(i = 0; i < 16; i++){
3541             Picture *pic= h->long_ref[i];
3542             if (pic) {
3543                 av_log(h->s.avctx, AV_LOG_DEBUG, "%d fn:%d poc:%d %p\n", i, pic->frame_num, pic->poc, pic->data[0]);
3544             }
3545         }
3546     }
3547 }
3548
3549 /**
3550  * Executes the reference picture marking (memory management control operations).
3551  */
3552 static int execute_ref_pic_marking(H264Context *h, MMCO *mmco, int mmco_count){
3553     MpegEncContext * const s = &h->s;
3554     int i, j;
3555     int current_is_long=0;
3556     Picture *pic;
3557
3558     if((s->avctx->debug&FF_DEBUG_MMCO) && mmco_count==0)
3559         av_log(h->s.avctx, AV_LOG_DEBUG, "no mmco here\n");
3560
3561     for(i=0; i<mmco_count; i++){
3562         if(s->avctx->debug&FF_DEBUG_MMCO)
3563             av_log(h->s.avctx, AV_LOG_DEBUG, "mmco:%d %d %d\n", h->mmco[i].opcode, h->mmco[i].short_frame_num, h->mmco[i].long_index);
3564
3565         switch(mmco[i].opcode){
3566         case MMCO_SHORT2UNUSED:
3567             pic= remove_short(h, mmco[i].short_frame_num);
3568             if(pic==NULL) return -1;
3569             unreference_pic(h, pic);
3570             break;
3571         case MMCO_SHORT2LONG:
3572             pic= remove_long(h, mmco[i].long_index);
3573             if(pic) unreference_pic(h, pic);
3574
3575             h->long_ref[ mmco[i].long_index ]= remove_short(h, mmco[i].short_frame_num);
3576             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3577             h->long_ref_count++;
3578             break;
3579         case MMCO_LONG2UNUSED:
3580             pic= remove_long(h, mmco[i].long_index);
3581             if(pic==NULL) return -1;
3582             unreference_pic(h, pic);
3583             break;
3584         case MMCO_LONG:
3585             pic= remove_long(h, mmco[i].long_index);
3586             if(pic) unreference_pic(h, pic);
3587
3588             h->long_ref[ mmco[i].long_index ]= s->current_picture_ptr;
3589             h->long_ref[ mmco[i].long_index ]->long_ref=1;
3590             h->long_ref_count++;
3591
3592             current_is_long=1;
3593             break;
3594         case MMCO_SET_MAX_LONG:
3595             assert(mmco[i].long_index <= 16);
3596             // just remove the long term which index is greater than new max
3597             for(j = mmco[i].long_index; j<16; j++){
3598                 pic = remove_long(h, j);
3599                 if (pic) unreference_pic(h, pic);
3600             }
3601             break;
3602         case MMCO_RESET:
3603             while(h->short_ref_count){
3604                 pic= remove_short(h, h->short_ref[0]->frame_num);
3605                 unreference_pic(h, pic);
3606             }
3607             for(j = 0; j < 16; j++) {
3608                 pic= remove_long(h, j);
3609                 if(pic) unreference_pic(h, pic);
3610             }
3611             break;
3612         default: assert(0);
3613         }
3614     }
3615
3616     if(!current_is_long){
3617         pic= remove_short(h, s->current_picture_ptr->frame_num);
3618         if(pic){
3619             unreference_pic(h, pic);
3620             av_log(h->s.avctx, AV_LOG_ERROR, "illegal short term buffer state detected\n");
3621         }
3622
3623         if(h->short_ref_count)
3624             memmove(&h->short_ref[1], &h->short_ref[0], h->short_ref_count*sizeof(Picture*));
3625
3626         h->short_ref[0]= s->current_picture_ptr;
3627         h->short_ref[0]->long_ref=0;
3628         h->short_ref_count++;
3629     }
3630
3631     print_short_term(h);
3632     print_long_term(h);
3633     return 0;
3634 }
3635
3636 static int decode_ref_pic_marking(H264Context *h){
3637     MpegEncContext * const s = &h->s;
3638     int i;
3639
3640     if(h->nal_unit_type == NAL_IDR_SLICE){ //FIXME fields
3641         s->broken_link= get_bits1(&s->gb) -1;
3642         h->mmco[0].long_index= get_bits1(&s->gb) - 1; // current_long_term_idx
3643         if(h->mmco[0].long_index == -1)
3644             h->mmco_index= 0;
3645         else{
3646             h->mmco[0].opcode= MMCO_LONG;
3647             h->mmco_index= 1;
3648         }
3649     }else{
3650         if(get_bits1(&s->gb)){ // adaptive_ref_pic_marking_mode_flag
3651             for(i= 0; i<MAX_MMCO_COUNT; i++) {
3652                 MMCOOpcode opcode= get_ue_golomb(&s->gb);;
3653
3654                 h->mmco[i].opcode= opcode;
3655                 if(opcode==MMCO_SHORT2UNUSED || opcode==MMCO_SHORT2LONG){
3656                     h->mmco[i].short_frame_num= (h->frame_num - get_ue_golomb(&s->gb) - 1) & ((1<<h->sps.log2_max_frame_num)-1); //FIXME fields
3657 /*                    if(h->mmco[i].short_frame_num >= h->short_ref_count || h->short_ref[ h->mmco[i].short_frame_num ] == NULL){
3658                         fprintf(stderr, "illegal short ref in memory management control operation %d\n", mmco);
3659                         return -1;
3660                     }*/
3661                 }
3662                 if(opcode==MMCO_SHORT2LONG || opcode==MMCO_LONG2UNUSED || opcode==MMCO_LONG || opcode==MMCO_SET_MAX_LONG){
3663                     h->mmco[i].long_index= get_ue_golomb(&s->gb);
3664                     if(/*h->mmco[i].long_index >= h->long_ref_count || h->long_ref[ h->mmco[i].long_index ] == NULL*/ h->mmco[i].long_index >= 16){
3665                         av_log(h->s.avctx, AV_LOG_ERROR, "illegal long ref in memory management control operation %d\n", opcode);
3666                         return -1;
3667                     }
3668                 }
3669
3670                 if(opcode > MMCO_LONG){
3671                     av_log(h->s.avctx, AV_LOG_ERROR, "illegal memory management control operation %d\n", opcode);
3672                     return -1;
3673                 }
3674                 if(opcode == MMCO_END)
3675                     break;
3676             }
3677             h->mmco_index= i;
3678         }else{
3679             assert(h->long_ref_count + h->short_ref_count <= h->sps.ref_frame_count);
3680
3681             if(h->long_ref_count + h->short_ref_count == h->sps.ref_frame_count){ //FIXME fields
3682                 h->mmco[0].opcode= MMCO_SHORT2UNUSED;
3683                 h->mmco[0].short_frame_num= h->short_ref[ h->short_ref_count - 1 ]->frame_num;
3684                 h->mmco_index= 1;
3685             }else
3686                 h->mmco_index= 0;
3687         }
3688     }
3689
3690     return 0;
3691 }
3692
3693 static int init_poc(H264Context *h){
3694     MpegEncContext * const s = &h->s;
3695     const int max_frame_num= 1<<h->sps.log2_max_frame_num;
3696     int field_poc[2];
3697
3698     if(h->nal_unit_type == NAL_IDR_SLICE){
3699         h->frame_num_offset= 0;
3700     }else{
3701         if(h->frame_num < h->prev_frame_num)
3702             h->frame_num_offset= h->prev_frame_num_offset + max_frame_num;
3703         else
3704             h->frame_num_offset= h->prev_frame_num_offset;
3705     }
3706
3707     if(h->sps.poc_type==0){
3708         const int max_poc_lsb= 1<<h->sps.log2_max_poc_lsb;
3709
3710         if(h->nal_unit_type == NAL_IDR_SLICE){
3711              h->prev_poc_msb=
3712              h->prev_poc_lsb= 0;
3713         }
3714
3715         if     (h->poc_lsb < h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb >= max_poc_lsb/2)
3716             h->poc_msb = h->prev_poc_msb + max_poc_lsb;
3717         else if(h->poc_lsb > h->prev_poc_lsb && h->prev_poc_lsb - h->poc_lsb < -max_poc_lsb/2)
3718             h->poc_msb = h->prev_poc_msb - max_poc_lsb;
3719         else
3720             h->poc_msb = h->prev_poc_msb;
3721 //printf("poc: %d %d\n", h->poc_msb, h->poc_lsb);
3722         field_poc[0] =
3723         field_poc[1] = h->poc_msb + h->poc_lsb;
3724         if(s->picture_structure == PICT_FRAME)
3725             field_poc[1] += h->delta_poc_bottom;
3726     }else if(h->sps.poc_type==1){
3727         int abs_frame_num, expected_delta_per_poc_cycle, expectedpoc;
3728         int i;
3729
3730         if(h->sps.poc_cycle_length != 0)
3731             abs_frame_num = h->frame_num_offset + h->frame_num;
3732         else
3733             abs_frame_num = 0;
3734
3735         if(h->nal_ref_idc==0 && abs_frame_num > 0)
3736             abs_frame_num--;
3737
3738         expected_delta_per_poc_cycle = 0;
3739         for(i=0; i < h->sps.poc_cycle_length; i++)
3740             expected_delta_per_poc_cycle += h->sps.offset_for_ref_frame[ i ]; //FIXME integrate during sps parse
3741
3742         if(abs_frame_num > 0){
3743             int poc_cycle_cnt          = (abs_frame_num - 1) / h->sps.poc_cycle_length;
3744             int frame_num_in_poc_cycle = (abs_frame_num - 1) % h->sps.poc_cycle_length;
3745
3746             expectedpoc = poc_cycle_cnt * expected_delta_per_poc_cycle;
3747             for(i = 0; i <= frame_num_in_poc_cycle; i++)
3748                 expectedpoc = expectedpoc + h->sps.offset_for_ref_frame[ i ];
3749         } else
3750             expectedpoc = 0;
3751
3752         if(h->nal_ref_idc == 0)
3753             expectedpoc = expectedpoc + h->sps.offset_for_non_ref_pic;
3754
3755         field_poc[0] = expectedpoc + h->delta_poc[0];
3756         field_poc[1] = field_poc[0] + h->sps.offset_for_top_to_bottom_field;
3757
3758         if(s->picture_structure == PICT_FRAME)
3759             field_poc[1] += h->delta_poc[1];
3760     }else{
3761         int poc;
3762         if(h->nal_unit_type == NAL_IDR_SLICE){
3763             poc= 0;
3764         }else{
3765             if(h->nal_ref_idc) poc= 2*(h->frame_num_offset + h->frame_num);
3766             else               poc= 2*(h->frame_num_offset + h->frame_num) - 1;
3767         }
3768         field_poc[0]= poc;
3769         field_poc[1]= poc;
3770     }
3771
3772     if(s->picture_structure != PICT_BOTTOM_FIELD)
3773         s->current_picture_ptr->field_poc[0]= field_poc[0];
3774     if(s->picture_structure != PICT_TOP_FIELD)
3775         s->current_picture_ptr->field_poc[1]= field_poc[1];
3776     if(s->picture_structure == PICT_FRAME) // FIXME field pix?
3777         s->current_picture_ptr->poc= FFMIN(field_poc[0], field_poc[1]);
3778
3779     return 0;
3780 }
3781
3782 /**
3783  * decodes a slice header.
3784  * this will also call MPV_common_init() and frame_start() as needed
3785  */
3786 static int decode_slice_header(H264Context *h){
3787     MpegEncContext * const s = &h->s;
3788     int first_mb_in_slice, pps_id;
3789     int num_ref_idx_active_override_flag;
3790     static const uint8_t slice_type_map[5]= {P_TYPE, B_TYPE, I_TYPE, SP_TYPE, SI_TYPE};
3791     int slice_type;
3792     int default_ref_list_done = 0;
3793
3794     s->current_picture.reference= h->nal_ref_idc != 0;
3795     s->dropable= h->nal_ref_idc == 0;
3796
3797     first_mb_in_slice= get_ue_golomb(&s->gb);
3798
3799     slice_type= get_ue_golomb(&s->gb);
3800     if(slice_type > 9){
3801         av_log(h->s.avctx, AV_LOG_ERROR, "slice type too large (%d) at %d %d\n", h->slice_type, s->mb_x, s->mb_y);
3802         return -1;
3803     }
3804     if(slice_type > 4){
3805         slice_type -= 5;
3806         h->slice_type_fixed=1;
3807     }else
3808         h->slice_type_fixed=0;
3809
3810     slice_type= slice_type_map[ slice_type ];
3811     if (slice_type == I_TYPE
3812         || (h->slice_num != 0 && slice_type == h->slice_type) ) {
3813         default_ref_list_done = 1;
3814     }
3815     h->slice_type= slice_type;
3816
3817     s->pict_type= h->slice_type; // to make a few old func happy, it's wrong though
3818
3819     pps_id= get_ue_golomb(&s->gb);
3820     if(pps_id>255){
3821         av_log(h->s.avctx, AV_LOG_ERROR, "pps_id out of range\n");
3822         return -1;
3823     }
3824     h->pps= h->pps_buffer[pps_id];
3825     if(h->pps.slice_group_count == 0){
3826         av_log(h->s.avctx, AV_LOG_ERROR, "non existing PPS referenced\n");
3827         return -1;
3828     }
3829
3830     h->sps= h->sps_buffer[ h->pps.sps_id ];
3831     if(h->sps.log2_max_frame_num == 0){
3832         av_log(h->s.avctx, AV_LOG_ERROR, "non existing SPS referenced\n");
3833         return -1;
3834     }
3835
3836     s->mb_width= h->sps.mb_width;
3837     s->mb_height= h->sps.mb_height * (2 - h->sps.frame_mbs_only_flag);
3838
3839     h->b_stride=  s->mb_width*4 + 1;
3840     h->b8_stride= s->mb_width*2 + 1;
3841
3842     s->width = 16*s->mb_width - 2*(h->sps.crop_left + h->sps.crop_right );
3843     if(h->sps.frame_mbs_only_flag)
3844         s->height= 16*s->mb_height - 2*(h->sps.crop_top  + h->sps.crop_bottom);
3845     else
3846         s->height= 16*s->mb_height - 4*(h->sps.crop_top  + h->sps.crop_bottom); //FIXME recheck
3847
3848     if (s->context_initialized
3849         && (   s->width != s->avctx->width || s->height != s->avctx->height)) {
3850         free_tables(h);
3851         MPV_common_end(s);
3852     }
3853     if (!s->context_initialized) {
3854         if (MPV_common_init(s) < 0)
3855             return -1;
3856
3857         alloc_tables(h);
3858
3859         s->avctx->width = s->width;
3860         s->avctx->height = s->height;
3861         s->avctx->sample_aspect_ratio= h->sps.sar;
3862         if(!s->avctx->sample_aspect_ratio.den)
3863             s->avctx->sample_aspect_ratio.den = 1;
3864
3865         if(h->sps.timing_info_present_flag){
3866             s->avctx->time_base= (AVRational){h->sps.num_units_in_tick, h->sps.time_scale};
3867         }
3868     }
3869
3870     if(h->slice_num == 0){
3871         frame_start(h);
3872     }
3873
3874     s->current_picture_ptr->frame_num= //FIXME frame_num cleanup
3875     h->frame_num= get_bits(&s->gb, h->sps.log2_max_frame_num);
3876
3877     h->mb_aff_frame = 0;
3878     if(h->sps.frame_mbs_only_flag){
3879         s->picture_structure= PICT_FRAME;
3880     }else{
3881         if(get_bits1(&s->gb)) { //field_pic_flag
3882             s->picture_structure= PICT_TOP_FIELD + get_bits1(&s->gb); //bottom_field_flag
3883         } else {
3884             s->picture_structure= PICT_FRAME;
3885             first_mb_in_slice <<= 1;
3886             h->mb_aff_frame = h->sps.mb_aff;
3887         }
3888     }
3889
3890     s->resync_mb_x = s->mb_x = first_mb_in_slice % s->mb_width;
3891     s->resync_mb_y = s->mb_y = first_mb_in_slice / s->mb_width;
3892
3893     if(s->picture_structure==PICT_FRAME){
3894         h->curr_pic_num=   h->frame_num;
3895         h->max_pic_num= 1<< h->sps.log2_max_frame_num;
3896     }else{
3897         h->curr_pic_num= 2*h->frame_num;
3898         h->max_pic_num= 1<<(h->sps.log2_max_frame_num + 1);
3899     }
3900
3901     if(h->nal_unit_type == NAL_IDR_SLICE){
3902         get_ue_golomb(&s->gb); /* idr_pic_id */
3903     }
3904
3905     if(h->sps.poc_type==0){
3906         h->poc_lsb= get_bits(&s->gb, h->sps.log2_max_poc_lsb);
3907
3908         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME){
3909             h->delta_poc_bottom= get_se_golomb(&s->gb);
3910         }
3911     }
3912
3913     if(h->sps.poc_type==1 && !h->sps.delta_pic_order_always_zero_flag){
3914         h->delta_poc[0]= get_se_golomb(&s->gb);
3915
3916         if(h->pps.pic_order_present==1 && s->picture_structure==PICT_FRAME)
3917             h->delta_poc[1]= get_se_golomb(&s->gb);
3918     }
3919
3920     init_poc(h);
3921
3922     if(h->pps.redundant_pic_cnt_present){
3923         h->redundant_pic_count= get_ue_golomb(&s->gb);
3924     }
3925
3926     //set defaults, might be overriden a few line later
3927     h->ref_count[0]= h->pps.ref_count[0];
3928     h->ref_count[1]= h->pps.ref_count[1];
3929
3930     if(h->slice_type == P_TYPE || h->slice_type == SP_TYPE || h->slice_type == B_TYPE){
3931         if(h->slice_type == B_TYPE){
3932             h->direct_spatial_mv_pred= get_bits1(&s->gb);
3933         }
3934         num_ref_idx_active_override_flag= get_bits1(&s->gb);
3935
3936         if(num_ref_idx_active_override_flag){
3937             h->ref_count[0]= get_ue_golomb(&s->gb) + 1;
3938             if(h->slice_type==B_TYPE)
3939                 h->ref_count[1]= get_ue_golomb(&s->gb) + 1;
3940
3941             if(h->ref_count[0] > 32 || h->ref_count[1] > 32){
3942                 av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow\n");
3943                 return -1;
3944             }
3945         }
3946     }
3947
3948     if(!default_ref_list_done){
3949         fill_default_ref_list(h);
3950     }
3951
3952     decode_ref_pic_list_reordering(h);
3953
3954     if(   (h->pps.weighted_pred          && (h->slice_type == P_TYPE || h->slice_type == SP_TYPE ))
3955        || (h->pps.weighted_bipred_idc==1 && h->slice_type==B_TYPE ) )
3956         pred_weight_table(h);
3957     else if(h->pps.weighted_bipred_idc==2 && h->slice_type==B_TYPE)
3958         implicit_weight_table(h);
3959     else
3960         h->use_weight = 0;
3961
3962     if(s->current_picture.reference)
3963         decode_ref_pic_marking(h);
3964
3965     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE && h->pps.cabac )
3966         h->cabac_init_idc = get_ue_golomb(&s->gb);
3967
3968     h->last_qscale_diff = 0;
3969     s->qscale = h->pps.init_qp + get_se_golomb(&s->gb);
3970     if(s->qscale<0 || s->qscale>51){
3971         av_log(s->avctx, AV_LOG_ERROR, "QP %d out of range\n", s->qscale);
3972         return -1;
3973     }
3974     h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
3975     //FIXME qscale / qp ... stuff
3976     if(h->slice_type == SP_TYPE){
3977         get_bits1(&s->gb); /* sp_for_switch_flag */
3978     }
3979     if(h->slice_type==SP_TYPE || h->slice_type == SI_TYPE){
3980         get_se_golomb(&s->gb); /* slice_qs_delta */
3981     }
3982
3983     h->deblocking_filter = 1;
3984     h->slice_alpha_c0_offset = 0;
3985     h->slice_beta_offset = 0;
3986     if( h->pps.deblocking_filter_parameters_present ) {
3987         h->deblocking_filter= get_ue_golomb(&s->gb);
3988         if(h->deblocking_filter < 2)
3989             h->deblocking_filter^= 1; // 1<->0
3990
3991         if( h->deblocking_filter ) {
3992             h->slice_alpha_c0_offset = get_se_golomb(&s->gb) << 1;
3993             h->slice_beta_offset = get_se_golomb(&s->gb) << 1;
3994         }
3995     }
3996
3997 #if 0 //FMO
3998     if( h->pps.num_slice_groups > 1  && h->pps.mb_slice_group_map_type >= 3 && h->pps.mb_slice_group_map_type <= 5)
3999         slice_group_change_cycle= get_bits(&s->gb, ?);
4000 #endif
4001
4002     h->slice_num++;
4003
4004     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
4005         av_log(h->s.avctx, AV_LOG_DEBUG, "slice:%d %s mb:%d %c pps:%d frame:%d poc:%d/%d ref:%d/%d qp:%d loop:%d:%d:%d weight:%d%s\n",
4006                h->slice_num,
4007                (s->picture_structure==PICT_FRAME ? "F" : s->picture_structure==PICT_TOP_FIELD ? "T" : "B"),
4008                first_mb_in_slice,
4009                av_get_pict_type_char(h->slice_type),
4010                pps_id, h->frame_num,
4011                s->current_picture_ptr->field_poc[0], s->current_picture_ptr->field_poc[1],
4012                h->ref_count[0], h->ref_count[1],
4013                s->qscale,
4014                h->deblocking_filter, h->slice_alpha_c0_offset/2, h->slice_beta_offset/2,
4015                h->use_weight,
4016                h->use_weight==1 && h->use_weight_chroma ? "c" : ""
4017                );
4018     }
4019
4020     return 0;
4021 }
4022
4023 /**
4024  *
4025  */
4026 static inline int get_level_prefix(GetBitContext *gb){
4027     unsigned int buf;
4028     int log;
4029
4030     OPEN_READER(re, gb);
4031     UPDATE_CACHE(re, gb);
4032     buf=GET_CACHE(re, gb);
4033
4034     log= 32 - av_log2(buf);
4035 #ifdef TRACE
4036     print_bin(buf>>(32-log), log);
4037     av_log(NULL, AV_LOG_DEBUG, "%5d %2d %3d lpr @%5d in %s get_level_prefix\n", buf>>(32-log), log, log-1, get_bits_count(gb), __FILE__);
4038 #endif
4039
4040     LAST_SKIP_BITS(re, gb, log);
4041     CLOSE_READER(re, gb);
4042
4043     return log-1;
4044 }
4045
4046 /**
4047  * decodes a residual block.
4048  * @param n block index
4049  * @param scantable scantable
4050  * @param max_coeff number of coefficients in the block
4051  * @return <0 if an error occured
4052  */
4053 static int decode_residual(H264Context *h, GetBitContext *gb, DCTELEM *block, int n, const uint8_t *scantable, int qp, int max_coeff){
4054     MpegEncContext * const s = &h->s;
4055     const uint16_t *qmul= dequant_coeff[qp];
4056     static const int coeff_token_table_index[17]= {0, 0, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3};
4057     int level[16], run[16];
4058     int suffix_length, zeros_left, coeff_num, coeff_token, total_coeff, i, trailing_ones;
4059
4060     //FIXME put trailing_onex into the context
4061
4062     if(n == CHROMA_DC_BLOCK_INDEX){
4063         coeff_token= get_vlc2(gb, chroma_dc_coeff_token_vlc.table, CHROMA_DC_COEFF_TOKEN_VLC_BITS, 1);
4064         total_coeff= coeff_token>>2;
4065     }else{
4066         if(n == LUMA_DC_BLOCK_INDEX){
4067             total_coeff= pred_non_zero_count(h, 0);
4068             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4069             total_coeff= coeff_token>>2;
4070         }else{
4071             total_coeff= pred_non_zero_count(h, n);
4072             coeff_token= get_vlc2(gb, coeff_token_vlc[ coeff_token_table_index[total_coeff] ].table, COEFF_TOKEN_VLC_BITS, 2);
4073             total_coeff= coeff_token>>2;
4074             h->non_zero_count_cache[ scan8[n] ]= total_coeff;
4075         }
4076     }
4077
4078     //FIXME set last_non_zero?
4079
4080     if(total_coeff==0)
4081         return 0;
4082
4083     trailing_ones= coeff_token&3;
4084     tprintf("trailing:%d, total:%d\n", trailing_ones, total_coeff);
4085     assert(total_coeff<=16);
4086
4087     for(i=0; i<trailing_ones; i++){
4088         level[i]= 1 - 2*get_bits1(gb);
4089     }
4090
4091     suffix_length= total_coeff > 10 && trailing_ones < 3;
4092
4093     for(; i<total_coeff; i++){
4094         const int prefix= get_level_prefix(gb);
4095         int level_code, mask;
4096
4097         if(prefix<14){ //FIXME try to build a large unified VLC table for all this
4098             if(suffix_length)
4099                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4100             else
4101                 level_code= (prefix<<suffix_length); //part
4102         }else if(prefix==14){
4103             if(suffix_length)
4104                 level_code= (prefix<<suffix_length) + get_bits(gb, suffix_length); //part
4105             else
4106                 level_code= prefix + get_bits(gb, 4); //part
4107         }else if(prefix==15){
4108             level_code= (prefix<<suffix_length) + get_bits(gb, 12); //part
4109             if(suffix_length==0) level_code+=15; //FIXME doesn't make (much)sense
4110         }else{
4111             av_log(h->s.avctx, AV_LOG_ERROR, "prefix too large at %d %d\n", s->mb_x, s->mb_y);
4112             return -1;
4113         }
4114
4115         if(i==trailing_ones && i<3) level_code+= 2; //FIXME split first iteration
4116
4117         mask= -(level_code&1);
4118         level[i]= (((2+level_code)>>1) ^ mask) - mask;
4119
4120         if(suffix_length==0) suffix_length=1; //FIXME split first iteration
4121
4122 #if 1
4123         if(ABS(level[i]) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
4124 #else
4125         if((2+level_code)>>1) > (3<<(suffix_length-1)) && suffix_length<6) suffix_length++;
4126         /* ? == prefix > 2 or sth */
4127 #endif
4128         tprintf("level: %d suffix_length:%d\n", level[i], suffix_length);
4129     }
4130
4131     if(total_coeff == max_coeff)
4132         zeros_left=0;
4133     else{
4134         if(n == CHROMA_DC_BLOCK_INDEX)
4135             zeros_left= get_vlc2(gb, chroma_dc_total_zeros_vlc[ total_coeff-1 ].table, CHROMA_DC_TOTAL_ZEROS_VLC_BITS, 1);
4136         else
4137             zeros_left= get_vlc2(gb, total_zeros_vlc[ total_coeff-1 ].table, TOTAL_ZEROS_VLC_BITS, 1);
4138     }
4139
4140     for(i=0; i<total_coeff-1; i++){
4141         if(zeros_left <=0)
4142             break;
4143         else if(zeros_left < 7){
4144             run[i]= get_vlc2(gb, run_vlc[zeros_left-1].table, RUN_VLC_BITS, 1);
4145         }else{
4146             run[i]= get_vlc2(gb, run7_vlc.table, RUN7_VLC_BITS, 2);
4147         }
4148         zeros_left -= run[i];
4149     }
4150
4151     if(zeros_left<0){
4152         av_log(h->s.avctx, AV_LOG_ERROR, "negative number of zero coeffs at %d %d\n", s->mb_x, s->mb_y);
4153         return -1;
4154     }
4155
4156     for(; i<total_coeff-1; i++){
4157         run[i]= 0;
4158     }
4159
4160     run[i]= zeros_left;
4161
4162     coeff_num=-1;
4163     if(n > 24){
4164         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into rundecode?
4165             int j;
4166
4167             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
4168             j= scantable[ coeff_num ];
4169
4170             block[j]= level[i];
4171         }
4172     }else{
4173         for(i=total_coeff-1; i>=0; i--){ //FIXME merge into  rundecode?
4174             int j;
4175
4176             coeff_num += run[i] + 1; //FIXME add 1 earlier ?
4177             j= scantable[ coeff_num ];
4178
4179             block[j]= level[i] * qmul[j];
4180 //            printf("%d %d  ", block[j], qmul[j]);
4181         }
4182     }
4183     return 0;
4184 }
4185
4186 /**
4187  * decodes a P_SKIP or B_SKIP macroblock
4188  */
4189 static void decode_mb_skip(H264Context *h){
4190     MpegEncContext * const s = &h->s;
4191     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4192     int mb_type;
4193
4194     memset(h->non_zero_count[mb_xy], 0, 16);
4195     memset(h->non_zero_count_cache + 8, 0, 8*5); //FIXME ugly, remove pfui
4196
4197     if(h->mb_aff_frame && s->mb_skip_run==0 && (s->mb_y&1)==0){
4198         h->mb_field_decoding_flag= get_bits1(&s->gb);
4199     }
4200     if(h->mb_field_decoding_flag)
4201         mb_type|= MB_TYPE_INTERLACED;
4202
4203     if( h->slice_type == B_TYPE )
4204     {
4205         // just for fill_caches. pred_direct_motion will set the real mb_type
4206         mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P0L1|MB_TYPE_DIRECT2|MB_TYPE_SKIP;
4207
4208         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4209         pred_direct_motion(h, &mb_type);
4210         if(h->pps.cabac){
4211             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4212             fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
4213         }
4214     }
4215     else
4216     {
4217         int mx, my;
4218         mb_type= MB_TYPE_16x16|MB_TYPE_P0L0|MB_TYPE_P1L0|MB_TYPE_SKIP;
4219
4220         fill_caches(h, mb_type, 0); //FIXME check what is needed and what not ...
4221         pred_pskip_motion(h, &mx, &my);
4222         fill_rectangle(&h->ref_cache[0][scan8[0]], 4, 4, 8, 0, 1);
4223         fill_rectangle(  h->mv_cache[0][scan8[0]], 4, 4, 8, pack16to32(mx,my), 4);
4224         if(h->pps.cabac)
4225             fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
4226     }
4227
4228     write_back_motion(h, mb_type);
4229     s->current_picture.mb_type[mb_xy]= mb_type|MB_TYPE_SKIP;
4230     s->current_picture.qscale_table[mb_xy]= s->qscale;
4231     h->slice_table[ mb_xy ]= h->slice_num;
4232     h->prev_mb_skipped= 1;
4233 }
4234
4235 /**
4236  * decodes a macroblock
4237  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
4238  */
4239 static int decode_mb_cavlc(H264Context *h){
4240     MpegEncContext * const s = &h->s;
4241     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
4242     int mb_type, partition_count, cbp;
4243
4244     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?
4245
4246     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
4247     cbp = 0; /* avoid warning. FIXME: find a solution without slowing
4248                 down the code */
4249     if(h->slice_type != I_TYPE && h->slice_type != SI_TYPE){
4250         if(s->mb_skip_run==-1)
4251             s->mb_skip_run= get_ue_golomb(&s->gb);
4252
4253         if (s->mb_skip_run--) {
4254             decode_mb_skip(h);
4255             return 0;
4256         }
4257     }
4258     if(h->mb_aff_frame){
4259         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
4260             h->mb_field_decoding_flag = get_bits1(&s->gb);
4261     }else
4262         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
4263
4264     h->prev_mb_skipped= 0;
4265
4266     mb_type= get_ue_golomb(&s->gb);
4267     if(h->slice_type == B_TYPE){
4268         if(mb_type < 23){
4269             partition_count= b_mb_type_info[mb_type].partition_count;
4270             mb_type=         b_mb_type_info[mb_type].type;
4271         }else{
4272             mb_type -= 23;
4273             goto decode_intra_mb;
4274         }
4275     }else if(h->slice_type == P_TYPE /*|| h->slice_type == SP_TYPE */){
4276         if(mb_type < 5){
4277             partition_count= p_mb_type_info[mb_type].partition_count;
4278             mb_type=         p_mb_type_info[mb_type].type;
4279         }else{
4280             mb_type -= 5;
4281             goto decode_intra_mb;
4282         }
4283     }else{
4284        assert(h->slice_type == I_TYPE);
4285 decode_intra_mb:
4286         if(mb_type > 25){
4287             av_log(h->s.avctx, AV_LOG_ERROR, "mb_type %d in %c slice to large at %d %d\n", mb_type, av_get_pict_type_char(h->slice_type), s->mb_x, s->mb_y);
4288             return -1;
4289         }
4290         partition_count=0;
4291         cbp= i_mb_type_info[mb_type].cbp;
4292         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
4293         mb_type= i_mb_type_info[mb_type].type;
4294     }
4295
4296     if(h->mb_field_decoding_flag)
4297         mb_type |= MB_TYPE_INTERLACED;
4298
4299     s->current_picture.mb_type[mb_xy]= mb_type;
4300     h->slice_table[ mb_xy ]= h->slice_num;
4301
4302     if(IS_INTRA_PCM(mb_type)){
4303         unsigned int x, y;
4304
4305         // we assume these blocks are very rare so we dont optimize it
4306         align_get_bits(&s->gb);
4307
4308         // The pixels are stored in the same order as levels in h->mb array.
4309         for(y=0; y<16; y++){
4310             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
4311             for(x=0; x<16; x++){
4312                 tprintf("LUMA ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4313                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= get_bits(&s->gb, 8);
4314             }
4315         }
4316         for(y=0; y<8; y++){
4317             const int index= 256 + 4*(y&3) + 32*(y>>2);
4318             for(x=0; x<8; x++){
4319                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4320                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4321             }
4322         }
4323         for(y=0; y<8; y++){
4324             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
4325             for(x=0; x<8; x++){
4326                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", show_bits(&s->gb, 8));
4327                 h->mb[index + (x&3) + 16*(x>>2)]= get_bits(&s->gb, 8);
4328             }
4329         }
4330
4331         // In deblocking, the quantizer is 0
4332         s->current_picture.qscale_table[mb_xy]= 0;
4333         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
4334         // All coeffs are present
4335         memset(h->non_zero_count[mb_xy], 16, 16);
4336
4337         return 0;
4338     }
4339
4340     fill_caches(h, mb_type, 0);
4341
4342     //mb_pred
4343     if(IS_INTRA(mb_type)){
4344 //            init_top_left_availability(h);
4345             if(IS_INTRA4x4(mb_type)){
4346                 int i;
4347
4348 //                fill_intra4x4_pred_table(h);
4349                 for(i=0; i<16; i++){
4350                     const int mode_coded= !get_bits1(&s->gb);
4351                     const int predicted_mode=  pred_intra_mode(h, i);
4352                     int mode;
4353
4354                     if(mode_coded){
4355                         const int rem_mode= get_bits(&s->gb, 3);
4356                         if(rem_mode<predicted_mode)
4357                             mode= rem_mode;
4358                         else
4359                             mode= rem_mode + 1;
4360                     }else{
4361                         mode= predicted_mode;
4362                     }
4363
4364                     h->intra4x4_pred_mode_cache[ scan8[i] ] = mode;
4365                 }
4366                 write_back_intra_pred_mode(h);
4367                 if( check_intra4x4_pred_mode(h) < 0)
4368                     return -1;
4369             }else{
4370                 h->intra16x16_pred_mode= check_intra_pred_mode(h, h->intra16x16_pred_mode);
4371                 if(h->intra16x16_pred_mode < 0)
4372                     return -1;
4373             }
4374             h->chroma_pred_mode= get_ue_golomb(&s->gb);
4375
4376             h->chroma_pred_mode= check_intra_pred_mode(h, h->chroma_pred_mode);
4377             if(h->chroma_pred_mode < 0)
4378                 return -1;
4379     }else if(partition_count==4){
4380         int i, j, sub_partition_count[4], list, ref[2][4];
4381
4382         if(h->slice_type == B_TYPE){
4383             for(i=0; i<4; i++){
4384                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4385                 if(h->sub_mb_type[i] >=13){
4386                     av_log(h->s.avctx, AV_LOG_ERROR, "B sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4387                     return -1;
4388                 }
4389                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4390                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4391             }
4392             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
4393                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3]))
4394                 pred_direct_motion(h, &mb_type);
4395         }else{
4396             assert(h->slice_type == P_TYPE || h->slice_type == SP_TYPE); //FIXME SP correct ?
4397             for(i=0; i<4; i++){
4398                 h->sub_mb_type[i]= get_ue_golomb(&s->gb);
4399                 if(h->sub_mb_type[i] >=4){
4400                     av_log(h->s.avctx, AV_LOG_ERROR, "P sub_mb_type %d out of range at %d %d\n", h->sub_mb_type[i], s->mb_x, s->mb_y);
4401                     return -1;
4402                 }
4403                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
4404                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
4405             }
4406         }
4407
4408         for(list=0; list<2; list++){
4409             int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4410             if(ref_count == 0) continue;
4411             if (h->mb_aff_frame && h->mb_field_decoding_flag) {
4412                 ref_count <<= 1;
4413             }
4414             for(i=0; i<4; i++){
4415                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4416                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4417                     ref[list][i] = get_te0_golomb(&s->gb, ref_count); //FIXME init to 0 before and skip?
4418                 }else{
4419                  //FIXME
4420                     ref[list][i] = -1;
4421                 }
4422             }
4423         }
4424
4425         for(list=0; list<2; list++){
4426             const int ref_count= IS_REF0(mb_type) ? 1 : h->ref_count[list];
4427             if(ref_count == 0) continue;
4428
4429             for(i=0; i<4; i++){
4430                 if(IS_DIRECT(h->sub_mb_type[i])) continue;
4431                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ]=
4432                 h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
4433
4434                 if(IS_DIR(h->sub_mb_type[i], 0, list)){
4435                     const int sub_mb_type= h->sub_mb_type[i];
4436                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
4437                     for(j=0; j<sub_partition_count[i]; j++){
4438                         int mx, my;
4439                         const int index= 4*i + block_width*j;
4440                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
4441                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mx, &my);
4442                         mx += get_se_golomb(&s->gb);
4443                         my += get_se_golomb(&s->gb);
4444                         tprintf("final mv:%d %d\n", mx, my);
4445
4446                         if(IS_SUB_8X8(sub_mb_type)){
4447                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
4448                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
4449                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
4450                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
4451                         }else if(IS_SUB_8X4(sub_mb_type)){
4452                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
4453                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
4454                         }else if(IS_SUB_4X8(sub_mb_type)){
4455                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
4456                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
4457                         }else{
4458                             assert(IS_SUB_4X4(sub_mb_type));
4459                             mv_cache[ 0 ][0]= mx;
4460                             mv_cache[ 0 ][1]= my;
4461                         }
4462                     }
4463                 }else{
4464                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
4465                     p[0] = p[1]=
4466                     p[8] = p[9]= 0;
4467                 }
4468             }
4469         }
4470     }else if(IS_DIRECT(mb_type)){
4471         pred_direct_motion(h, &mb_type);
4472         s->current_picture.mb_type[mb_xy]= mb_type;
4473     }else{
4474         int list, mx, my, i;
4475          //FIXME we should set ref_idx_l? to 0 if we use that later ...
4476         if(IS_16X16(mb_type)){
4477             for(list=0; list<2; list++){
4478                 if(h->ref_count[list]>0){
4479                     if(IS_DIR(mb_type, 0, list)){
4480                         const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4481                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, val, 1);
4482                     }else
4483                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (LIST_NOT_USED&0xFF), 1);
4484                 }
4485             }
4486             for(list=0; list<2; list++){
4487                 if(IS_DIR(mb_type, 0, list)){
4488                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mx, &my);
4489                     mx += get_se_golomb(&s->gb);
4490                     my += get_se_golomb(&s->gb);
4491                     tprintf("final mv:%d %d\n", mx, my);
4492
4493                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
4494                 }else
4495                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
4496             }
4497         }
4498         else if(IS_16X8(mb_type)){
4499             for(list=0; list<2; list++){
4500                 if(h->ref_count[list]>0){
4501                     for(i=0; i<2; i++){
4502                         if(IS_DIR(mb_type, i, list)){
4503                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4504                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, val, 1);
4505                         }else
4506                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
4507                     }
4508                 }
4509             }
4510             for(list=0; list<2; list++){
4511                 for(i=0; i<2; i++){
4512                     if(IS_DIR(mb_type, i, list)){
4513                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mx, &my);
4514                         mx += get_se_golomb(&s->gb);
4515                         my += get_se_golomb(&s->gb);
4516                         tprintf("final mv:%d %d\n", mx, my);
4517
4518                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
4519                     }else
4520                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
4521                 }
4522             }
4523         }else{
4524             assert(IS_8X16(mb_type));
4525             for(list=0; list<2; list++){
4526                 if(h->ref_count[list]>0){
4527                     for(i=0; i<2; i++){
4528                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
4529                             const int val= get_te0_golomb(&s->gb, h->ref_count[list]);
4530                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, val, 1);
4531                         }else
4532                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
4533                     }
4534                 }
4535             }
4536             for(list=0; list<2; list++){
4537                 for(i=0; i<2; i++){
4538                     if(IS_DIR(mb_type, i, list)){
4539                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mx, &my);
4540                         mx += get_se_golomb(&s->gb);
4541                         my += get_se_golomb(&s->gb);
4542                         tprintf("final mv:%d %d\n", mx, my);
4543
4544                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
4545                     }else
4546                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
4547                 }
4548             }
4549         }
4550     }
4551
4552     if(IS_INTER(mb_type))
4553         write_back_motion(h, mb_type);
4554
4555     if(!IS_INTRA16x16(mb_type)){
4556         cbp= get_ue_golomb(&s->gb);
4557         if(cbp > 47){
4558             av_log(h->s.avctx, AV_LOG_ERROR, "cbp too large (%d) at %d %d\n", cbp, s->mb_x, s->mb_y);
4559             return -1;
4560         }
4561
4562         if(IS_INTRA4x4(mb_type))
4563             cbp= golomb_to_intra4x4_cbp[cbp];
4564         else
4565             cbp= golomb_to_inter_cbp[cbp];
4566     }
4567
4568     if(cbp || IS_INTRA16x16(mb_type)){
4569         int i8x8, i4x4, chroma_idx;
4570         int chroma_qp, dquant;
4571         GetBitContext *gb= IS_INTRA(mb_type) ? h->intra_gb_ptr : h->inter_gb_ptr;
4572         const uint8_t *scan, *dc_scan;
4573
4574 //        fill_non_zero_count_cache(h);
4575
4576         if(IS_INTERLACED(mb_type)){
4577             scan= field_scan;
4578             dc_scan= luma_dc_field_scan;
4579         }else{
4580             scan= zigzag_scan;
4581             dc_scan= luma_dc_zigzag_scan;
4582         }
4583
4584         dquant= get_se_golomb(&s->gb);
4585
4586         if( dquant > 25 || dquant < -26 ){
4587             av_log(h->s.avctx, AV_LOG_ERROR, "dquant out of range (%d) at %d %d\n", dquant, s->mb_x, s->mb_y);
4588             return -1;
4589         }
4590
4591         s->qscale += dquant;
4592         if(((unsigned)s->qscale) > 51){
4593             if(s->qscale<0) s->qscale+= 52;
4594             else            s->qscale-= 52;
4595         }
4596
4597         h->chroma_qp= chroma_qp= get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
4598         if(IS_INTRA16x16(mb_type)){
4599             if( decode_residual(h, h->intra_gb_ptr, h->mb, LUMA_DC_BLOCK_INDEX, dc_scan, s->qscale, 16) < 0){
4600                 return -1; //FIXME continue if partitioned and other return -1 too
4601             }
4602
4603             assert((cbp&15) == 0 || (cbp&15) == 15);
4604
4605             if(cbp&15){
4606                 for(i8x8=0; i8x8<4; i8x8++){
4607                     for(i4x4=0; i4x4<4; i4x4++){
4608                         const int index= i4x4 + 4*i8x8;
4609                         if( decode_residual(h, h->intra_gb_ptr, h->mb + 16*index, index, scan + 1, s->qscale, 15) < 0 ){
4610                             return -1;
4611                         }
4612                     }
4613                 }
4614             }else{
4615                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
4616             }
4617         }else{
4618             for(i8x8=0; i8x8<4; i8x8++){
4619                 if(cbp & (1<<i8x8)){
4620                     for(i4x4=0; i4x4<4; i4x4++){
4621                         const int index= i4x4 + 4*i8x8;
4622
4623                         if( decode_residual(h, gb, h->mb + 16*index, index, scan, s->qscale, 16) <0 ){
4624                             return -1;
4625                         }
4626                     }
4627                 }else{
4628                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
4629                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
4630                 }
4631             }
4632         }
4633
4634         if(cbp&0x30){
4635             for(chroma_idx=0; chroma_idx<2; chroma_idx++)
4636                 if( decode_residual(h, gb, h->mb + 256 + 16*4*chroma_idx, CHROMA_DC_BLOCK_INDEX, chroma_dc_scan, chroma_qp, 4) < 0){
4637                     return -1;
4638                 }
4639         }
4640
4641         if(cbp&0x20){
4642             for(chroma_idx=0; chroma_idx<2; chroma_idx++){
4643                 for(i4x4=0; i4x4<4; i4x4++){
4644                     const int index= 16 + 4*chroma_idx + i4x4;
4645                     if( decode_residual(h, gb, h->mb + 16*index, index, scan + 1, chroma_qp, 15) < 0){
4646                         return -1;
4647                     }
4648                 }
4649             }
4650         }else{
4651             uint8_t * const nnz= &h->non_zero_count_cache[0];
4652             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4653             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4654         }
4655     }else{
4656         uint8_t * const nnz= &h->non_zero_count_cache[0];
4657         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
4658         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
4659         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
4660     }
4661     s->current_picture.qscale_table[mb_xy]= s->qscale;
4662     write_back_non_zero_count(h);
4663
4664     return 0;
4665 }
4666
4667 static int decode_cabac_field_decoding_flag(H264Context *h) {
4668     MpegEncContext * const s = &h->s;
4669     const int mb_x = s->mb_x;
4670     const int mb_y = s->mb_y & ~1;
4671     const int mba_xy = mb_x - 1 +  mb_y   *s->mb_stride;
4672     const int mbb_xy = mb_x     + (mb_y-2)*s->mb_stride;
4673
4674     unsigned int ctx = 0;
4675
4676     if( h->slice_table[mba_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mba_xy] ) ) {
4677         ctx += 1;
4678     }
4679     if( h->slice_table[mbb_xy] == h->slice_num && IS_INTERLACED( s->current_picture.mb_type[mbb_xy] ) ) {
4680         ctx += 1;
4681     }
4682
4683     return get_cabac( &h->cabac, &h->cabac_state[70 + ctx] );
4684 }
4685
4686 static int decode_cabac_intra_mb_type(H264Context *h, int ctx_base, int intra_slice) {
4687     uint8_t *state= &h->cabac_state[ctx_base];
4688     int mb_type;
4689
4690     if(intra_slice){
4691         MpegEncContext * const s = &h->s;
4692         const int mba_xy = h->left_mb_xy[0];
4693         const int mbb_xy = h->top_mb_xy;
4694         int ctx=0;
4695         if( h->slice_table[mba_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mba_xy] ) )
4696             ctx++;
4697         if( h->slice_table[mbb_xy] == h->slice_num && !IS_INTRA4x4( s->current_picture.mb_type[mbb_xy] ) )
4698             ctx++;
4699         if( get_cabac( &h->cabac, &state[ctx] ) == 0 )
4700             return 0;   /* I4x4 */
4701         state += 2;
4702     }else{
4703         if( get_cabac( &h->cabac, &state[0] ) == 0 )
4704             return 0;   /* I4x4 */
4705     }
4706
4707     if( get_cabac_terminate( &h->cabac ) )
4708         return 25;  /* PCM */
4709
4710     mb_type = 1; /* I16x16 */
4711     if( get_cabac( &h->cabac, &state[1] ) )
4712         mb_type += 12;  /* cbp_luma != 0 */
4713
4714     if( get_cabac( &h->cabac, &state[2] ) ) {
4715         if( get_cabac( &h->cabac, &state[2+intra_slice] ) )
4716             mb_type += 4 * 2;   /* cbp_chroma == 2 */
4717         else
4718             mb_type += 4 * 1;   /* cbp_chroma == 1 */
4719     }
4720     if( get_cabac( &h->cabac, &state[3+intra_slice] ) )
4721         mb_type += 2;
4722     if( get_cabac( &h->cabac, &state[3+2*intra_slice] ) )
4723         mb_type += 1;
4724     return mb_type;
4725 }
4726
4727 static int decode_cabac_mb_type( H264Context *h ) {
4728     MpegEncContext * const s = &h->s;
4729
4730     if( h->slice_type == I_TYPE ) {
4731         return decode_cabac_intra_mb_type(h, 3, 1);
4732     } else if( h->slice_type == P_TYPE ) {
4733         if( get_cabac( &h->cabac, &h->cabac_state[14] ) == 0 ) {
4734             /* P-type */
4735             if( get_cabac( &h->cabac, &h->cabac_state[15] ) == 0 ) {
4736                 if( get_cabac( &h->cabac, &h->cabac_state[16] ) == 0 )
4737                     return 0; /* P_L0_D16x16; */
4738                 else
4739                     return 3; /* P_8x8; */
4740             } else {
4741                 if( get_cabac( &h->cabac, &h->cabac_state[17] ) == 0 )
4742                     return 2; /* P_L0_D8x16; */
4743                 else
4744                     return 1; /* P_L0_D16x8; */
4745             }
4746         } else {
4747             return decode_cabac_intra_mb_type(h, 17, 0) + 5;
4748         }
4749     } else if( h->slice_type == B_TYPE ) {
4750         const int mba_xy = h->left_mb_xy[0];
4751         const int mbb_xy = h->top_mb_xy;
4752         int ctx = 0;
4753         int bits;
4754
4755         if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] )
4756                       && !IS_DIRECT( s->current_picture.mb_type[mba_xy] ) )
4757             ctx++;
4758         if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] )
4759                       && !IS_DIRECT( s->current_picture.mb_type[mbb_xy] ) )
4760             ctx++;
4761
4762         if( !get_cabac( &h->cabac, &h->cabac_state[27+ctx] ) )
4763             return 0; /* B_Direct_16x16 */
4764
4765         if( !get_cabac( &h->cabac, &h->cabac_state[27+3] ) ) {
4766             return 1 + get_cabac( &h->cabac, &h->cabac_state[27+5] ); /* B_L[01]_16x16 */
4767         }
4768
4769         bits = get_cabac( &h->cabac, &h->cabac_state[27+4] ) << 3;
4770         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 2;
4771         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] ) << 1;
4772         bits|= get_cabac( &h->cabac, &h->cabac_state[27+5] );
4773         if( bits < 8 )
4774             return bits + 3; /* B_Bi_16x16 through B_L1_L0_16x8 */
4775         else if( bits == 13 ) {
4776             return decode_cabac_intra_mb_type(h, 32, 0) + 23;
4777         } else if( bits == 14 )
4778             return 11; /* B_L1_L0_8x16 */
4779         else if( bits == 15 )
4780             return 22; /* B_8x8 */
4781
4782         bits= ( bits<<1 ) | get_cabac( &h->cabac, &h->cabac_state[27+5] );
4783         return bits - 4; /* B_L0_Bi_* through B_Bi_Bi_* */
4784     } else {
4785         /* TODO SI/SP frames? */
4786         return -1;
4787     }
4788 }
4789
4790 static int decode_cabac_mb_skip( H264Context *h) {
4791     MpegEncContext * const s = &h->s;
4792     const int mb_xy = s->mb_x + s->mb_y*s->mb_stride;
4793     const int mba_xy = mb_xy - 1;
4794     const int mbb_xy = mb_xy - s->mb_stride;
4795     int ctx = 0;
4796
4797     if( h->slice_table[mba_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mba_xy] ))
4798         ctx++;
4799     if( h->slice_table[mbb_xy] == h->slice_num && !IS_SKIP( s->current_picture.mb_type[mbb_xy] ))
4800         ctx++;
4801
4802     if( h->slice_type == P_TYPE || h->slice_type == SP_TYPE)
4803         return get_cabac( &h->cabac, &h->cabac_state[11+ctx] );
4804     else /* B-frame */
4805         return get_cabac( &h->cabac, &h->cabac_state[24+ctx] );
4806 }
4807
4808 static int decode_cabac_mb_intra4x4_pred_mode( H264Context *h, int pred_mode ) {
4809     int mode = 0;
4810
4811     if( get_cabac( &h->cabac, &h->cabac_state[68] ) )
4812         return pred_mode;
4813
4814     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
4815         mode += 1;
4816     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
4817         mode += 2;
4818     if( get_cabac( &h->cabac, &h->cabac_state[69] ) )
4819         mode += 4;
4820     if( mode >= pred_mode )
4821         return mode + 1;
4822     else
4823         return mode;
4824 }
4825
4826 static int decode_cabac_mb_chroma_pre_mode( H264Context *h) {
4827     const int mba_xy = h->left_mb_xy[0];
4828     const int mbb_xy = h->top_mb_xy;
4829
4830     int ctx = 0;
4831
4832     /* No need to test for IS_INTRA4x4 and IS_INTRA16x16, as we set chroma_pred_mode_table to 0 */
4833     if( h->slice_table[mba_xy] == h->slice_num && h->chroma_pred_mode_table[mba_xy] != 0 )
4834         ctx++;
4835
4836     if( h->slice_table[mbb_xy] == h->slice_num && h->chroma_pred_mode_table[mbb_xy] != 0 )
4837         ctx++;
4838
4839     if( get_cabac( &h->cabac, &h->cabac_state[64+ctx] ) == 0 )
4840         return 0;
4841
4842     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4843         return 1;
4844     if( get_cabac( &h->cabac, &h->cabac_state[64+3] ) == 0 )
4845         return 2;
4846     else
4847         return 3;
4848 }
4849
/* x coordinate of each of the 16 block indices */
static const uint8_t block_idx_x[16] = {
    0, 1, 0, 1,
    2, 3, 2, 3,
    0, 1, 0, 1,
    2, 3, 2, 3
};

/* y coordinate of each of the 16 block indices */
static const uint8_t block_idx_y[16] = {
    0, 0, 1, 1,
    0, 0, 1, 1,
    2, 2, 3, 3,
    2, 2, 3, 3
};

/* inverse mapping: block index stored at position [x][y] */
static const uint8_t block_idx_xy[4][4] = {
    {  0,  2,  8, 10 },
    {  1,  3,  9, 11 },
    {  4,  6, 12, 14 },
    {  5,  7, 13, 15 }
};
4862
4863 static int decode_cabac_mb_cbp_luma( H264Context *h) {
4864     MpegEncContext * const s = &h->s;
4865
4866     int cbp = 0;
4867     int i8x8;
4868
4869     for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
4870         int cbp_a = -1;
4871         int cbp_b = -1;
4872         int x, y;
4873         int ctx = 0;
4874
4875         x = block_idx_x[4*i8x8];
4876         y = block_idx_y[4*i8x8];
4877
4878         if( x > 0 )
4879             cbp_a = cbp;
4880         else if( s->mb_x > 0 && (h->slice_table[h->left_mb_xy[0]] == h->slice_num)) {
4881             cbp_a = h->left_cbp;
4882             tprintf("cbp_a = left_cbp = %x\n", cbp_a);
4883         }
4884
4885         if( y > 0 )
4886             cbp_b = cbp;
4887         else if( s->mb_y > 0 && (h->slice_table[h->top_mb_xy] == h->slice_num)) {
4888             cbp_b = h->top_cbp;
4889             tprintf("cbp_b = top_cbp = %x\n", cbp_b);
4890         }
4891
4892         /* No need to test for skip as we put 0 for skip block */
4893         /* No need to test for IPCM as we put 1 for IPCM block */
4894         if( cbp_a >= 0 ) {
4895             int i8x8a = block_idx_xy[(x-1)&0x03][y]/4;
4896             if( ((cbp_a >> i8x8a)&0x01) == 0 )
4897                 ctx++;
4898         }
4899
4900         if( cbp_b >= 0 ) {
4901             int i8x8b = block_idx_xy[x][(y-1)&0x03]/4;
4902             if( ((cbp_b >> i8x8b)&0x01) == 0 )
4903                 ctx += 2;
4904         }
4905
4906         if( get_cabac( &h->cabac, &h->cabac_state[73 + ctx] ) ) {
4907             cbp |= 1 << i8x8;
4908         }
4909     }
4910     return cbp;
4911 }
4912 static int decode_cabac_mb_cbp_chroma( H264Context *h) {
4913     int ctx;
4914     int cbp_a, cbp_b;
4915
4916     cbp_a = (h->left_cbp>>4)&0x03;
4917     cbp_b = (h-> top_cbp>>4)&0x03;
4918
4919     ctx = 0;
4920     if( cbp_a > 0 ) ctx++;
4921     if( cbp_b > 0 ) ctx += 2;
4922     if( get_cabac( &h->cabac, &h->cabac_state[77 + ctx] ) == 0 )
4923         return 0;
4924
4925     ctx = 4;
4926     if( cbp_a == 2 ) ctx++;
4927     if( cbp_b == 2 ) ctx += 2;
4928     return 1 + get_cabac( &h->cabac, &h->cabac_state[77 + ctx] );
4929 }
4930 static int decode_cabac_mb_dqp( H264Context *h) {
4931     MpegEncContext * const s = &h->s;
4932     int mbn_xy;
4933     int   ctx = 0;
4934     int   val = 0;
4935
4936     if( s->mb_x > 0 )
4937         mbn_xy = s->mb_x + s->mb_y*s->mb_stride - 1;
4938     else
4939         mbn_xy = s->mb_width - 1 + (s->mb_y-1)*s->mb_stride;
4940
4941     if( h->last_qscale_diff != 0 && ( IS_INTRA16x16(s->current_picture.mb_type[mbn_xy] ) || (h->cbp_table[mbn_xy]&0x3f) ) )
4942         ctx++;
4943
4944     while( get_cabac( &h->cabac, &h->cabac_state[60 + ctx] ) ) {
4945         if( ctx < 2 )
4946             ctx = 2;
4947         else
4948             ctx = 3;
4949         val++;
4950     }
4951
4952     if( val&0x01 )
4953         return (val + 1)/2;
4954     else
4955         return -(val + 1)/2;
4956 }
4957 static int decode_cabac_p_mb_sub_type( H264Context *h ) {
4958     if( get_cabac( &h->cabac, &h->cabac_state[21] ) )
4959         return 0;   /* 8x8 */
4960     if( !get_cabac( &h->cabac, &h->cabac_state[22] ) )
4961         return 1;   /* 8x4 */
4962     if( get_cabac( &h->cabac, &h->cabac_state[23] ) )
4963         return 2;   /* 4x8 */
4964     return 3;       /* 4x4 */
4965 }
4966 static int decode_cabac_b_mb_sub_type( H264Context *h ) {
4967     int type;
4968     if( !get_cabac( &h->cabac, &h->cabac_state[36] ) )
4969         return 0;   /* B_Direct_8x8 */
4970     if( !get_cabac( &h->cabac, &h->cabac_state[37] ) )
4971         return 1 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L0_8x8, B_L1_8x8 */
4972     type = 3;
4973     if( get_cabac( &h->cabac, &h->cabac_state[38] ) ) {
4974         if( get_cabac( &h->cabac, &h->cabac_state[39] ) )
4975             return 11 + get_cabac( &h->cabac, &h->cabac_state[39] ); /* B_L1_4x4, B_Bi_4x4 */
4976         type += 4;
4977     }
4978     type += 2*get_cabac( &h->cabac, &h->cabac_state[39] );
4979     type +=   get_cabac( &h->cabac, &h->cabac_state[39] );
4980     return type;
4981 }
4982
4983 static int decode_cabac_mb_ref( H264Context *h, int list, int n ) {
4984     int refa = h->ref_cache[list][scan8[n] - 1];
4985     int refb = h->ref_cache[list][scan8[n] - 8];
4986     int ref  = 0;
4987     int ctx  = 0;
4988
4989     if( h->slice_type == B_TYPE) {
4990         if( refa > 0 && !h->direct_cache[scan8[n] - 1] )
4991             ctx++;
4992         if( refb > 0 && !h->direct_cache[scan8[n] - 8] )
4993             ctx += 2;
4994     } else {
4995         if( refa > 0 )
4996             ctx++;
4997         if( refb > 0 )
4998             ctx += 2;
4999     }
5000
5001     while( get_cabac( &h->cabac, &h->cabac_state[54+ctx] ) ) {
5002         ref++;
5003         if( ctx < 4 )
5004             ctx = 4;
5005         else
5006             ctx = 5;
5007     }
5008     return ref;
5009 }
5010
5011 static int decode_cabac_mb_mvd( H264Context *h, int list, int n, int l ) {
5012     int amvd = abs( h->mvd_cache[list][scan8[n] - 1][l] ) +
5013                abs( h->mvd_cache[list][scan8[n] - 8][l] );
5014     int ctxbase = (l == 0) ? 40 : 47;
5015     int ctx, mvd;
5016
5017     if( amvd < 3 )
5018         ctx = 0;
5019     else if( amvd > 32 )
5020         ctx = 2;
5021     else
5022         ctx = 1;
5023
5024     if(!get_cabac(&h->cabac, &h->cabac_state[ctxbase+ctx]))
5025         return 0;
5026
5027     mvd= 1;
5028     ctx= 3;
5029     while( mvd < 9 && get_cabac( &h->cabac, &h->cabac_state[ctxbase+ctx] ) ) {
5030         mvd++;
5031         if( ctx < 6 )
5032             ctx++;
5033     }
5034
5035     if( mvd >= 9 ) {
5036         int k = 3;
5037         while( get_cabac_bypass( &h->cabac ) ) {
5038             mvd += 1 << k;
5039             k++;
5040         }
5041         while( k-- ) {
5042             if( get_cabac_bypass( &h->cabac ) )
5043                 mvd += 1 << k;
5044         }
5045     }
5046     if( get_cabac_bypass( &h->cabac ) )  return -mvd;
5047     else                                 return  mvd;
5048 }
5049
5050 static int inline get_cabac_cbf_ctx( H264Context *h, int cat, int idx ) {
5051     int nza, nzb;
5052     int ctx = 0;
5053
5054     if( cat == 0 ) {
5055         nza = h->left_cbp&0x100;
5056         nzb = h-> top_cbp&0x100;
5057     } else if( cat == 1 || cat == 2 ) {
5058         nza = h->non_zero_count_cache[scan8[idx] - 1];
5059         nzb = h->non_zero_count_cache[scan8[idx] - 8];
5060     } else if( cat == 3 ) {
5061         nza = (h->left_cbp>>(6+idx))&0x01;
5062         nzb = (h-> top_cbp>>(6+idx))&0x01;
5063     } else {
5064         assert(cat == 4);
5065         nza = h->non_zero_count_cache[scan8[16+idx] - 1];
5066         nzb = h->non_zero_count_cache[scan8[16+idx] - 8];
5067     }
5068
5069     if( nza > 0 )
5070         ctx++;
5071
5072     if( nzb > 0 )
5073         ctx += 2;
5074
5075     return ctx + 4 * cat;
5076 }
5077
5078 static int inline decode_cabac_residual( H264Context *h, DCTELEM *block, int cat, int n, const uint8_t *scantable, int qp, int max_coeff) {
5079     const int mb_xy  = h->s.mb_x + h->s.mb_y*h->s.mb_stride;
5080     const uint16_t *qmul= dequant_coeff[qp];
5081     static const int significant_coeff_flag_field_offset[2] = { 105, 277 };
5082     static const int last_significant_coeff_flag_field_offset[2] = { 166, 338 };
5083     static const int significant_coeff_flag_offset[5] = { 0, 15, 29, 44, 47 };
5084     static const int coeff_abs_level_m1_offset[5] = {227+ 0, 227+10, 227+20, 227+30, 227+39 };
5085
5086     int index[16];
5087
5088     int i, last;
5089     int coeff_count = 0;
5090
5091     int abslevel1 = 1;
5092     int abslevelgt1 = 0;
5093
5094     /* cat: 0-> DC 16x16  n = 0
5095      *      1-> AC 16x16  n = luma4x4idx
5096      *      2-> Luma4x4   n = luma4x4idx
5097      *      3-> DC Chroma n = iCbCr
5098      *      4-> AC Chroma n = 4 * iCbCr + chroma4x4idx
5099      */
5100
5101     /* read coded block flag */
5102     if( get_cabac( &h->cabac, &h->cabac_state[85 + get_cabac_cbf_ctx( h, cat, n ) ] ) == 0 ) {
5103         if( cat == 1 || cat == 2 )
5104             h->non_zero_count_cache[scan8[n]] = 0;
5105         else if( cat == 4 )
5106             h->non_zero_count_cache[scan8[16+n]] = 0;
5107
5108         return 0;
5109     }
5110
5111     for(last= 0; last < max_coeff - 1; last++) {
5112         if( get_cabac( &h->cabac, &h->cabac_state[significant_coeff_flag_field_offset[h->mb_field_decoding_flag]+significant_coeff_flag_offset[cat]+last] )) {
5113             index[coeff_count++] = last;
5114             if( get_cabac( &h->cabac, &h->cabac_state[last_significant_coeff_flag_field_offset[h->mb_field_decoding_flag]+significant_coeff_flag_offset[cat]+last] ) ) {
5115                 last= max_coeff;
5116                 break;
5117             }
5118         }
5119     }
5120     if( last == max_coeff -1 ) {
5121         index[coeff_count++] = last;
5122     }
5123     assert(coeff_count > 0);
5124
5125     if( cat == 0 )
5126         h->cbp_table[mb_xy] |= 0x100;
5127     else if( cat == 1 || cat == 2 )
5128         h->non_zero_count_cache[scan8[n]] = coeff_count;
5129     else if( cat == 3 )
5130         h->cbp_table[mb_xy] |= 0x40 << n;
5131     else {
5132         assert( cat == 4 );
5133         h->non_zero_count_cache[scan8[16+n]] = coeff_count;
5134     }
5135
5136     for( i = coeff_count - 1; i >= 0; i-- ) {
5137         int ctx = (abslevelgt1 != 0 ? 0 : FFMIN( 4, abslevel1 )) + coeff_abs_level_m1_offset[cat];
5138         int j= scantable[index[i]];
5139
5140         if( get_cabac( &h->cabac, &h->cabac_state[ctx] ) == 0 ) {
5141             if( cat == 0 || cat == 3 ) {
5142                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -1;
5143                 else                                block[j] =  1;
5144             }else{
5145                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -qmul[j];
5146                 else                                block[j] =  qmul[j];
5147             }
5148
5149             abslevel1++;
5150         } else {
5151             int coeff_abs = 2;
5152             ctx = 5 + FFMIN( 4, abslevelgt1 ) + coeff_abs_level_m1_offset[cat];
5153             while( coeff_abs < 15 && get_cabac( &h->cabac, &h->cabac_state[ctx] ) ) {
5154                 coeff_abs++;
5155             }
5156
5157             if( coeff_abs >= 15 ) {
5158                 int j = 0;
5159                 while( get_cabac_bypass( &h->cabac ) ) {
5160                     coeff_abs += 1 << j;
5161                     j++;
5162                 }
5163
5164                 while( j-- ) {
5165                     if( get_cabac_bypass( &h->cabac ) )
5166                         coeff_abs += 1 << j ;
5167                 }
5168             }
5169
5170             if( cat == 0 || cat == 3 ) {
5171                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs;
5172                 else                                block[j] =  coeff_abs;
5173             }else{
5174                 if( get_cabac_bypass( &h->cabac ) ) block[j] = -coeff_abs * qmul[j];
5175                 else                                block[j] =  coeff_abs * qmul[j];
5176             }
5177
5178             abslevelgt1++;
5179         }
5180     }
5181     return 0;
5182 }
5183
5184 void inline compute_mb_neighboors(H264Context *h)
5185 {
5186     MpegEncContext * const s = &h->s;
5187     const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
5188     h->top_mb_xy     = mb_xy - s->mb_stride;
5189     h->left_mb_xy[0] = mb_xy - 1;
5190     if(h->mb_aff_frame){
5191         const int pair_xy          = s->mb_x     + (s->mb_y & ~1)*s->mb_stride;
5192         const int top_pair_xy      = pair_xy     - s->mb_stride;
5193         const int top_mb_frame_flag      = !IS_INTERLACED(s->current_picture.mb_type[top_pair_xy]);
5194         const int left_mb_frame_flag = !IS_INTERLACED(s->current_picture.mb_type[pair_xy-1]);
5195         const int curr_mb_frame_flag = !h->mb_field_decoding_flag;
5196         const int bottom = (s->mb_y & 1);
5197         if (bottom
5198                 ? !curr_mb_frame_flag // bottom macroblock
5199                 : (!curr_mb_frame_flag && !top_mb_frame_flag) // top macroblock
5200                 ) {
5201             h->top_mb_xy -= s->mb_stride;
5202         }
5203         if (left_mb_frame_flag != curr_mb_frame_flag) {
5204             h->left_mb_xy[0] = pair_xy - 1;
5205         }
5206     }
5207     return;
5208 }
5209
5210 /**
5211  * decodes a macroblock
5212  * @returns 0 if ok, AC_ERROR / DC_ERROR / MV_ERROR if an error is noticed
5213  */
5214 static int decode_mb_cabac(H264Context *h) {
5215     MpegEncContext * const s = &h->s;
5216     const int mb_xy= s->mb_x + s->mb_y*s->mb_stride;
5217     int mb_type, partition_count, cbp = 0;
5218
5219     s->dsp.clear_blocks(h->mb); //FIXME avoid if already clear (move after skip handlong?)
5220
5221     tprintf("pic:%d mb:%d/%d\n", h->frame_num, s->mb_x, s->mb_y);
5222     if( h->slice_type != I_TYPE && h->slice_type != SI_TYPE ) {
5223         /* read skip flags */
5224         if( decode_cabac_mb_skip( h ) ) {
5225             decode_mb_skip(h);
5226
5227             h->cbp_table[mb_xy] = 0;
5228             h->chroma_pred_mode_table[mb_xy] = 0;
5229             h->last_qscale_diff = 0;
5230
5231             return 0;
5232
5233         }
5234     }
5235     if(h->mb_aff_frame){
5236         if ( ((s->mb_y&1) == 0) || h->prev_mb_skipped)
5237             h->mb_field_decoding_flag = decode_cabac_field_decoding_flag(h);
5238     }else
5239         h->mb_field_decoding_flag= (s->picture_structure!=PICT_FRAME);
5240
5241     h->prev_mb_skipped = 0;
5242
5243     compute_mb_neighboors(h);
5244     if( ( mb_type = decode_cabac_mb_type( h ) ) < 0 ) {
5245         av_log( h->s.avctx, AV_LOG_ERROR, "decode_cabac_mb_type failed\n" );
5246         return -1;
5247     }
5248
5249     if( h->slice_type == B_TYPE ) {
5250         if( mb_type < 23 ){
5251             partition_count= b_mb_type_info[mb_type].partition_count;
5252             mb_type=         b_mb_type_info[mb_type].type;
5253         }else{
5254             mb_type -= 23;
5255             goto decode_intra_mb;
5256         }
5257     } else if( h->slice_type == P_TYPE ) {
5258         if( mb_type < 5) {
5259             partition_count= p_mb_type_info[mb_type].partition_count;
5260             mb_type=         p_mb_type_info[mb_type].type;
5261         } else {
5262             mb_type -= 5;
5263             goto decode_intra_mb;
5264         }
5265     } else {
5266        assert(h->slice_type == I_TYPE);
5267 decode_intra_mb:
5268         partition_count = 0;
5269         cbp= i_mb_type_info[mb_type].cbp;
5270         h->intra16x16_pred_mode= i_mb_type_info[mb_type].pred_mode;
5271         mb_type= i_mb_type_info[mb_type].type;
5272     }
5273     if(h->mb_field_decoding_flag)
5274         mb_type |= MB_TYPE_INTERLACED;
5275
5276     s->current_picture.mb_type[mb_xy]= mb_type;
5277     h->slice_table[ mb_xy ]= h->slice_num;
5278
5279     if(IS_INTRA_PCM(mb_type)) {
5280         const uint8_t *ptr;
5281         unsigned int x, y;
5282
5283         // We assume these blocks are very rare so we dont optimize it.
5284         // FIXME The two following lines get the bitstream position in the cabac
5285         // decode, I think it should be done by a function in cabac.h (or cabac.c).
5286         ptr= h->cabac.bytestream;
5287         if (h->cabac.low&0x1) ptr-=CABAC_BITS/8;
5288
5289         // The pixels are stored in the same order as levels in h->mb array.
5290         for(y=0; y<16; y++){
5291             const int index= 4*(y&3) + 32*((y>>2)&1) + 128*(y>>3);
5292             for(x=0; x<16; x++){
5293                 tprintf("LUMA ICPM LEVEL (%3d)\n", *ptr);
5294                 h->mb[index + (x&3) + 16*((x>>2)&1) + 64*(x>>3)]= *ptr++;
5295             }
5296         }
5297         for(y=0; y<8; y++){
5298             const int index= 256 + 4*(y&3) + 32*(y>>2);
5299             for(x=0; x<8; x++){
5300                 tprintf("CHROMA U ICPM LEVEL (%3d)\n", *ptr);
5301                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5302             }
5303         }
5304         for(y=0; y<8; y++){
5305             const int index= 256 + 64 + 4*(y&3) + 32*(y>>2);
5306             for(x=0; x<8; x++){
5307                 tprintf("CHROMA V ICPM LEVEL (%3d)\n", *ptr);
5308                 h->mb[index + (x&3) + 16*(x>>2)]= *ptr++;
5309             }
5310         }
5311
5312         ff_init_cabac_decoder(&h->cabac, ptr, h->cabac.bytestream_end - ptr);
5313
5314         // All blocks are present
5315         h->cbp_table[mb_xy] = 0x1ef;
5316         h->chroma_pred_mode_table[mb_xy] = 0;
5317         // In deblocking, the quantizer is 0
5318         s->current_picture.qscale_table[mb_xy]= 0;
5319         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, 0);
5320         // All coeffs are present
5321         memset(h->non_zero_count[mb_xy], 16, 16);
5322         return 0;
5323     }
5324
5325     fill_caches(h, mb_type, 0);
5326
5327     if( IS_INTRA( mb_type ) ) {
5328         if( IS_INTRA4x4( mb_type ) ) {
5329             int i;
5330             for( i = 0; i < 16; i++ ) {
5331                 int pred = pred_intra_mode( h, i );
5332                 h->intra4x4_pred_mode_cache[ scan8[i] ] = decode_cabac_mb_intra4x4_pred_mode( h, pred );
5333
5334                 //av_log( s->avctx, AV_LOG_ERROR, "i4x4 pred=%d mode=%d\n", pred, h->intra4x4_pred_mode_cache[ scan8[i] ] );
5335             }
5336             write_back_intra_pred_mode(h);
5337             if( check_intra4x4_pred_mode(h) < 0 ) return -1;
5338         } else {
5339             h->intra16x16_pred_mode= check_intra_pred_mode( h, h->intra16x16_pred_mode );
5340             if( h->intra16x16_pred_mode < 0 ) return -1;
5341         }
5342         h->chroma_pred_mode_table[mb_xy] =
5343             h->chroma_pred_mode          = decode_cabac_mb_chroma_pre_mode( h );
5344
5345         h->chroma_pred_mode= check_intra_pred_mode( h, h->chroma_pred_mode );
5346         if( h->chroma_pred_mode < 0 ) return -1;
5347     } else if( partition_count == 4 ) {
5348         int i, j, sub_partition_count[4], list, ref[2][4];
5349
5350         if( h->slice_type == B_TYPE ) {
5351             for( i = 0; i < 4; i++ ) {
5352                 h->sub_mb_type[i] = decode_cabac_b_mb_sub_type( h );
5353                 sub_partition_count[i]= b_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5354                 h->sub_mb_type[i]=      b_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5355             }
5356             if(   IS_DIRECT(h->sub_mb_type[0]) || IS_DIRECT(h->sub_mb_type[1])
5357                || IS_DIRECT(h->sub_mb_type[2]) || IS_DIRECT(h->sub_mb_type[3])) {
5358                 pred_direct_motion(h, &mb_type);
5359                 if( h->ref_count[0] > 1 || h->ref_count[1] > 1 ) {
5360                     for( i = 0; i < 4; i++ )
5361                         if( IS_DIRECT(h->sub_mb_type[i]) )
5362                             fill_rectangle( &h->direct_cache[scan8[4*i]], 2, 2, 8, 1, 1 );
5363                 }
5364             }
5365         } else {
5366             for( i = 0; i < 4; i++ ) {
5367                 h->sub_mb_type[i] = decode_cabac_p_mb_sub_type( h );
5368                 sub_partition_count[i]= p_sub_mb_type_info[ h->sub_mb_type[i] ].partition_count;
5369                 h->sub_mb_type[i]=      p_sub_mb_type_info[ h->sub_mb_type[i] ].type;
5370             }
5371         }
5372
5373         for( list = 0; list < 2; list++ ) {
5374             if( h->ref_count[list] > 0 ) {
5375                 for( i = 0; i < 4; i++ ) {
5376                     if(IS_DIRECT(h->sub_mb_type[i])) continue;
5377                     if(IS_DIR(h->sub_mb_type[i], 0, list)){
5378                         if( h->ref_count[list] > 1 )
5379                             ref[list][i] = decode_cabac_mb_ref( h, list, 4*i );
5380                         else
5381                             ref[list][i] = 0;
5382                     } else {
5383                         ref[list][i] = -1;
5384                     }
5385                                                        h->ref_cache[list][ scan8[4*i]+1 ]=
5386                     h->ref_cache[list][ scan8[4*i]+8 ]=h->ref_cache[list][ scan8[4*i]+9 ]= ref[list][i];
5387                 }
5388             }
5389         }
5390
5391         for(list=0; list<2; list++){
5392             for(i=0; i<4; i++){
5393                 if(IS_DIRECT(h->sub_mb_type[i])){
5394                     fill_rectangle(h->mvd_cache[list][scan8[4*i]], 2, 2, 8, 0, 4);
5395                     continue;
5396                 }
5397                 h->ref_cache[list][ scan8[4*i]   ]=h->ref_cache[list][ scan8[4*i]+1 ];
5398
5399                 if(IS_DIR(h->sub_mb_type[i], 0, list) && !IS_DIRECT(h->sub_mb_type[i])){
5400                     const int sub_mb_type= h->sub_mb_type[i];
5401                     const int block_width= (sub_mb_type & (MB_TYPE_16x16|MB_TYPE_16x8)) ? 2 : 1;
5402                     for(j=0; j<sub_partition_count[i]; j++){
5403                         int mpx, mpy;
5404                         int mx, my;
5405                         const int index= 4*i + block_width*j;
5406                         int16_t (* mv_cache)[2]= &h->mv_cache[list][ scan8[index] ];
5407                         int16_t (* mvd_cache)[2]= &h->mvd_cache[list][ scan8[index] ];
5408                         pred_motion(h, index, block_width, list, h->ref_cache[list][ scan8[index] ], &mpx, &mpy);
5409
5410                         mx = mpx + decode_cabac_mb_mvd( h, list, index, 0 );
5411                         my = mpy + decode_cabac_mb_mvd( h, list, index, 1 );
5412                         tprintf("final mv:%d %d\n", mx, my);
5413
5414                         if(IS_SUB_8X8(sub_mb_type)){
5415                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]=
5416                             mv_cache[ 8 ][0]= mv_cache[ 9 ][0]= mx;
5417                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]=
5418                             mv_cache[ 8 ][1]= mv_cache[ 9 ][1]= my;
5419
5420                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]=
5421                             mvd_cache[ 8 ][0]= mvd_cache[ 9 ][0]= mx - mpx;
5422                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]=
5423                             mvd_cache[ 8 ][1]= mvd_cache[ 9 ][1]= my - mpy;
5424                         }else if(IS_SUB_8X4(sub_mb_type)){
5425                             mv_cache[ 0 ][0]= mv_cache[ 1 ][0]= mx;
5426                             mv_cache[ 0 ][1]= mv_cache[ 1 ][1]= my;
5427
5428                             mvd_cache[ 0 ][0]= mvd_cache[ 1 ][0]= mx- mpx;
5429                             mvd_cache[ 0 ][1]= mvd_cache[ 1 ][1]= my - mpy;
5430                         }else if(IS_SUB_4X8(sub_mb_type)){
5431                             mv_cache[ 0 ][0]= mv_cache[ 8 ][0]= mx;
5432                             mv_cache[ 0 ][1]= mv_cache[ 8 ][1]= my;
5433
5434                             mvd_cache[ 0 ][0]= mvd_cache[ 8 ][0]= mx - mpx;
5435                             mvd_cache[ 0 ][1]= mvd_cache[ 8 ][1]= my - mpy;
5436                         }else{
5437                             assert(IS_SUB_4X4(sub_mb_type));
5438                             mv_cache[ 0 ][0]= mx;
5439                             mv_cache[ 0 ][1]= my;
5440
5441                             mvd_cache[ 0 ][0]= mx - mpx;
5442                             mvd_cache[ 0 ][1]= my - mpy;
5443                         }
5444                     }
5445                 }else{
5446                     uint32_t *p= (uint32_t *)&h->mv_cache[list][ scan8[4*i] ][0];
5447                     uint32_t *pd= (uint32_t *)&h->mvd_cache[list][ scan8[4*i] ][0];
5448                     p[0] = p[1] = p[8] = p[9] = 0;
5449                     pd[0]= pd[1]= pd[8]= pd[9]= 0;
5450                 }
5451             }
5452         }
5453     } else if( IS_DIRECT(mb_type) ) {
5454         pred_direct_motion(h, &mb_type);
5455         s->current_picture.mb_type[mb_xy]= mb_type;
5456         fill_rectangle(h->mvd_cache[0][scan8[0]], 4, 4, 8, 0, 4);
5457         fill_rectangle(h->mvd_cache[1][scan8[0]], 4, 4, 8, 0, 4);
5458     } else {
5459         int list, mx, my, i, mpx, mpy;
5460         if(IS_16X16(mb_type)){
5461             for(list=0; list<2; list++){
5462                 if(IS_DIR(mb_type, 0, list)){
5463                     if(h->ref_count[list] > 0 ){
5464                         const int ref = h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 0 ) : 0;
5465                         fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, ref, 1);
5466                     }
5467                 }else
5468                     fill_rectangle(&h->ref_cache[list][ scan8[0] ], 4, 4, 8, (uint8_t)LIST_NOT_USED, 1);
5469             }
5470             for(list=0; list<2; list++){
5471                 if(IS_DIR(mb_type, 0, list)){
5472                     pred_motion(h, 0, 4, list, h->ref_cache[list][ scan8[0] ], &mpx, &mpy);
5473
5474                     mx = mpx + decode_cabac_mb_mvd( h, list, 0, 0 );
5475                     my = mpy + decode_cabac_mb_mvd( h, list, 0, 1 );
5476                     tprintf("final mv:%d %d\n", mx, my);
5477
5478                     fill_rectangle(h->mvd_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5479                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, pack16to32(mx,my), 4);
5480                 }else
5481                     fill_rectangle(h->mv_cache[list][ scan8[0] ], 4, 4, 8, 0, 4);
5482             }
5483         }
5484         else if(IS_16X8(mb_type)){
5485             for(list=0; list<2; list++){
5486                 if(h->ref_count[list]>0){
5487                     for(i=0; i<2; i++){
5488                         if(IS_DIR(mb_type, i, list)){
5489                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 8*i ) : 0;
5490                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, ref, 1);
5491                         }else
5492                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 16*i ], 4, 2, 8, (LIST_NOT_USED&0xFF), 1);
5493                     }
5494                 }
5495             }
5496             for(list=0; list<2; list++){
5497                 for(i=0; i<2; i++){
5498                     if(IS_DIR(mb_type, i, list)){
5499                         pred_16x8_motion(h, 8*i, list, h->ref_cache[list][scan8[0] + 16*i], &mpx, &mpy);
5500                         mx = mpx + decode_cabac_mb_mvd( h, list, 8*i, 0 );
5501                         my = mpy + decode_cabac_mb_mvd( h, list, 8*i, 1 );
5502                         tprintf("final mv:%d %d\n", mx, my);
5503
5504                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx-mpx,my-mpy), 4);
5505                         fill_rectangle(h->mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, pack16to32(mx,my), 4);
5506                     }else{
5507                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5508                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 16*i ], 4, 2, 8, 0, 4);
5509                     }
5510                 }
5511             }
5512         }else{
5513             assert(IS_8X16(mb_type));
5514             for(list=0; list<2; list++){
5515                 if(h->ref_count[list]>0){
5516                     for(i=0; i<2; i++){
5517                         if(IS_DIR(mb_type, i, list)){ //FIXME optimize
5518                             const int ref= h->ref_count[list] > 1 ? decode_cabac_mb_ref( h, list, 4*i ) : 0;
5519                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, ref, 1);
5520                         }else
5521                             fill_rectangle(&h->ref_cache[list][ scan8[0] + 2*i ], 2, 4, 8, (LIST_NOT_USED&0xFF), 1);
5522                     }
5523                 }
5524             }
5525             for(list=0; list<2; list++){
5526                 for(i=0; i<2; i++){
5527                     if(IS_DIR(mb_type, i, list)){
5528                         pred_8x16_motion(h, i*4, list, h->ref_cache[list][ scan8[0] + 2*i ], &mpx, &mpy);
5529                         mx = mpx + decode_cabac_mb_mvd( h, list, 4*i, 0 );
5530                         my = mpy + decode_cabac_mb_mvd( h, list, 4*i, 1 );
5531
5532                         tprintf("final mv:%d %d\n", mx, my);
5533                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx-mpx,my-mpy), 4);
5534                         fill_rectangle(h->mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, pack16to32(mx,my), 4);
5535                     }else{
5536                         fill_rectangle(h->mvd_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5537                         fill_rectangle(h-> mv_cache[list][ scan8[0] + 2*i ], 2, 4, 8, 0, 4);
5538                     }
5539                 }
5540             }
5541         }
5542     }
5543
5544    if( IS_INTER( mb_type ) ) {
5545         h->chroma_pred_mode_table[mb_xy] = 0;
5546         write_back_motion( h, mb_type );
5547    }
5548
5549     if( !IS_INTRA16x16( mb_type ) ) {
5550         cbp  = decode_cabac_mb_cbp_luma( h );
5551         cbp |= decode_cabac_mb_cbp_chroma( h ) << 4;
5552     }
5553
5554     h->cbp_table[mb_xy] = cbp;
5555
5556     if( cbp || IS_INTRA16x16( mb_type ) ) {
5557         const uint8_t *scan, *dc_scan;
5558         int dqp;
5559
5560         if(IS_INTERLACED(mb_type)){
5561             scan= field_scan;
5562             dc_scan= luma_dc_field_scan;
5563         }else{
5564             scan= zigzag_scan;
5565             dc_scan= luma_dc_zigzag_scan;
5566         }
5567
5568         h->last_qscale_diff = dqp = decode_cabac_mb_dqp( h );
5569         s->qscale += dqp;
5570         if(((unsigned)s->qscale) > 51){
5571             if(s->qscale<0) s->qscale+= 52;
5572             else            s->qscale-= 52;
5573         }
5574         h->chroma_qp = get_chroma_qp(h->pps.chroma_qp_index_offset, s->qscale);
5575
5576         if( IS_INTRA16x16( mb_type ) ) {
5577             int i;
5578             //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 DC\n" );
5579             if( decode_cabac_residual( h, h->mb, 0, 0, dc_scan, s->qscale, 16) < 0)
5580                 return -1;
5581             if( cbp&15 ) {
5582                 for( i = 0; i < 16; i++ ) {
5583                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA16x16 AC:%d\n", i );
5584                     if( decode_cabac_residual(h, h->mb + 16*i, 1, i, scan + 1, s->qscale, 15) < 0 )
5585                         return -1;
5586                 }
5587             } else {
5588                 fill_rectangle(&h->non_zero_count_cache[scan8[0]], 4, 4, 8, 0, 1);
5589             }
5590         } else {
5591             int i8x8, i4x4;
5592             for( i8x8 = 0; i8x8 < 4; i8x8++ ) {
5593                 if( cbp & (1<<i8x8) ) {
5594                     for( i4x4 = 0; i4x4 < 4; i4x4++ ) {
5595                         const int index = 4*i8x8 + i4x4;
5596                         //av_log( s->avctx, AV_LOG_ERROR, "Luma4x4: %d\n", index );
5597                         if( decode_cabac_residual(h, h->mb + 16*index, 2, index, scan, s->qscale, 16) < 0 )
5598                             return -1;
5599                     }
5600                 } else {
5601                     uint8_t * const nnz= &h->non_zero_count_cache[ scan8[4*i8x8] ];
5602                     nnz[0] = nnz[1] = nnz[8] = nnz[9] = 0;
5603                 }
5604             }
5605         }
5606
5607         if( cbp&0x30 ){
5608             int c;
5609             for( c = 0; c < 2; c++ ) {
5610                 //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-DC\n",c );
5611                 if( decode_cabac_residual(h, h->mb + 256 + 16*4*c, 3, c, chroma_dc_scan, h->chroma_qp, 4) < 0)
5612                     return -1;
5613             }
5614         }
5615
5616         if( cbp&0x20 ) {
5617             int c, i;
5618             for( c = 0; c < 2; c++ ) {
5619                 for( i = 0; i < 4; i++ ) {
5620                     const int index = 16 + 4 * c + i;
5621                     //av_log( s->avctx, AV_LOG_ERROR, "INTRA C%d-AC %d\n",c, index - 16 );
5622                     if( decode_cabac_residual(h, h->mb + 16*index, 4, index - 16, scan + 1, h->chroma_qp, 15) < 0)
5623                         return -1;
5624                 }
5625             }
5626         } else {
5627             uint8_t * const nnz= &h->non_zero_count_cache[0];
5628             nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5629             nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5630         }
5631     } else {
5632         uint8_t * const nnz= &h->non_zero_count_cache[0];
5633         fill_rectangle(&nnz[scan8[0]], 4, 4, 8, 0, 1);
5634         nnz[ scan8[16]+0 ] = nnz[ scan8[16]+1 ] =nnz[ scan8[16]+8 ] =nnz[ scan8[16]+9 ] =
5635         nnz[ scan8[20]+0 ] = nnz[ scan8[20]+1 ] =nnz[ scan8[20]+8 ] =nnz[ scan8[20]+9 ] = 0;
5636     }
5637
5638     s->current_picture.qscale_table[mb_xy]= s->qscale;
5639     write_back_non_zero_count(h);
5640
5641     return 0;
5642 }
5643
5644
5645 static void filter_mb_edgev( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5646     int i, d;
5647     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5648     const int alpha = alpha_table[index_a];
5649     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5650
5651     if( bS[0] < 4 ) {
5652         int tc[4];
5653         for(i=0; i<4; i++)
5654             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
5655         h->s.dsp.h264_h_loop_filter_luma(pix, stride, alpha, beta, tc);
5656     } else {
5657         /* 16px edge length, because bS=4 is triggered by being at
5658          * the edge of an intra MB, so all 4 bS are the same */
5659             for( d = 0; d < 16; d++ ) {
5660                 const int p0 = pix[-1];
5661                 const int p1 = pix[-2];
5662                 const int p2 = pix[-3];
5663
5664                 const int q0 = pix[0];
5665                 const int q1 = pix[1];
5666                 const int q2 = pix[2];
5667
5668                 if( ABS( p0 - q0 ) < alpha &&
5669                     ABS( p1 - p0 ) < beta &&
5670                     ABS( q1 - q0 ) < beta ) {
5671
5672                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5673                         if( ABS( p2 - p0 ) < beta)
5674                         {
5675                             const int p3 = pix[-4];
5676                             /* p0', p1', p2' */
5677                             pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5678                             pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5679                             pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5680                         } else {
5681                             /* p0' */
5682                             pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5683                         }
5684                         if( ABS( q2 - q0 ) < beta)
5685                         {
5686                             const int q3 = pix[3];
5687                             /* q0', q1', q2' */
5688                             pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5689                             pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5690                             pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5691                         } else {
5692                             /* q0' */
5693                             pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5694                         }
5695                     }else{
5696                         /* p0', q0' */
5697                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5698                         pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5699                     }
5700                     tprintf("filter_mb_edgev i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, p2, p1, p0, q0, q1, q2, pix[-2], pix[-1], pix[0], pix[1]);
5701                 }
5702                 pix += stride;
5703             }
5704     }
5705 }
5706 static void filter_mb_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5707     int i, d;
5708     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5709     const int alpha = alpha_table[index_a];
5710     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5711
5712     if( bS[0] < 4 ) {
5713         int tc[4];
5714         for(i=0; i<4; i++)
5715             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
5716         h->s.dsp.h264_h_loop_filter_chroma(pix, stride, alpha, beta, tc);
5717     } else {
5718         /* 8px edge length, see filter_mb_edgev */
5719             for( d = 0; d < 8; d++ ){
5720                 const int p0 = pix[-1];
5721                 const int p1 = pix[-2];
5722                 const int q0 = pix[0];
5723                 const int q1 = pix[1];
5724
5725                 if( ABS( p0 - q0 ) < alpha &&
5726                     ABS( p1 - p0 ) < beta &&
5727                     ABS( q1 - q0 ) < beta ) {
5728
5729                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
5730                     pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
5731                     tprintf("filter_mb_edgecv i:%d d:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5732                 }
5733                 pix += stride;
5734             }
5735     }
5736 }
5737
5738 static void filter_mb_mbaff_edgev( H264Context *h, uint8_t *pix, int stride, int bS[8], int qp[2] ) {
5739     int i;
5740     for( i = 0; i < 16; i++, pix += stride) {
5741         int index_a;
5742         int alpha;
5743         int beta;
5744
5745         int qp_index;
5746         int bS_index = (i >> 1);
5747         if (h->mb_field_decoding_flag) {
5748             bS_index &= ~1;
5749             bS_index |= (i & 1);
5750         }
5751
5752         if( bS[bS_index] == 0 ) {
5753             continue;
5754         }
5755
5756         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
5757         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
5758         alpha = alpha_table[index_a];
5759         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
5760
5761
5762         if( bS[bS_index] < 4 ) {
5763             const int tc0 = tc0_table[index_a][bS[bS_index] - 1];
5764             /* 4px edge length */
5765             const int p0 = pix[-1];
5766             const int p1 = pix[-2];
5767             const int p2 = pix[-3];
5768             const int q0 = pix[0];
5769             const int q1 = pix[1];
5770             const int q2 = pix[2];
5771
5772             if( ABS( p0 - q0 ) < alpha &&
5773                 ABS( p1 - p0 ) < beta &&
5774                 ABS( q1 - q0 ) < beta ) {
5775                 int tc = tc0;
5776                 int i_delta;
5777
5778                 if( ABS( p2 - p0 ) < beta ) {
5779                     pix[-2] = p1 + clip( ( p2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( p1 << 1 ) ) >> 1, -tc0, tc0 );
5780                     tc++;
5781                 }
5782                 if( ABS( q2 - q0 ) < beta ) {
5783                     pix[1] = q1 + clip( ( q2 + ( ( p0 + q0 + 1 ) >> 1 ) - ( q1 << 1 ) ) >> 1, -tc0, tc0 );
5784                     tc++;
5785                 }
5786
5787                 i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5788                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
5789                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
5790                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5791             }
5792         }else{
5793             /* 4px edge length */
5794             const int p0 = pix[-1];
5795             const int p1 = pix[-2];
5796             const int p2 = pix[-3];
5797
5798             const int q0 = pix[0];
5799             const int q1 = pix[1];
5800             const int q2 = pix[2];
5801
5802             if( ABS( p0 - q0 ) < alpha &&
5803                 ABS( p1 - p0 ) < beta &&
5804                 ABS( q1 - q0 ) < beta ) {
5805
5806                 if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5807                     if( ABS( p2 - p0 ) < beta)
5808                     {
5809                         const int p3 = pix[-4];
5810                         /* p0', p1', p2' */
5811                         pix[-1] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5812                         pix[-2] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5813                         pix[-3] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5814                     } else {
5815                         /* p0' */
5816                         pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5817                     }
5818                     if( ABS( q2 - q0 ) < beta)
5819                     {
5820                         const int q3 = pix[3];
5821                         /* q0', q1', q2' */
5822                         pix[0] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5823                         pix[1] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5824                         pix[2] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5825                     } else {
5826                         /* q0' */
5827                         pix[0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5828                     }
5829                 }else{
5830                     /* p0', q0' */
5831                     pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5832                     pix[ 0] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5833                 }
5834                 tprintf("filter_mb_mbaff_edgev i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, p2, p1, p0, q0, q1, q2, pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5835             }
5836         }
5837     }
5838 }
5839 static void filter_mb_mbaff_edgecv( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp[2] ) {
5840     int i;
5841     for( i = 0; i < 8; i++, pix += stride) {
5842         int index_a;
5843         int alpha;
5844         int beta;
5845
5846         int qp_index;
5847         int bS_index = i;
5848
5849         if( bS[bS_index] == 0 ) {
5850             continue;
5851         }
5852
5853         qp_index = h->mb_field_decoding_flag ? (i & 1) : (i >> 3);
5854         index_a = clip( qp[qp_index] + h->slice_alpha_c0_offset, 0, 51 );
5855         alpha = alpha_table[index_a];
5856         beta  = beta_table[clip( qp[qp_index] + h->slice_beta_offset, 0, 51 )];
5857         if( bS[bS_index] < 4 ) {
5858             const int tc = tc0_table[index_a][bS[bS_index] - 1] + 1;
5859             /* 2px edge length (because we use same bS than the one for luma) */
5860             const int p0 = pix[-1];
5861             const int p1 = pix[-2];
5862             const int q0 = pix[0];
5863             const int q1 = pix[1];
5864
5865             if( ABS( p0 - q0 ) < alpha &&
5866                 ABS( p1 - p0 ) < beta &&
5867                 ABS( q1 - q0 ) < beta ) {
5868                 const int i_delta = clip( (((q0 - p0 ) << 2) + (p1 - q1) + 4) >> 3, -tc, tc );
5869
5870                 pix[-1] = clip_uint8( p0 + i_delta );    /* p0' */
5871                 pix[0]  = clip_uint8( q0 - i_delta );    /* q0' */
5872                 tprintf("filter_mb_mbaff_edgecv i:%d, qp:%d, indexA:%d, alpha:%d, beta:%d, tc:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, qp[qp_index], index_a, alpha, beta, tc, bS[bS_index], pix[-3], p1, p0, q0, q1, pix[2], p1, pix[-1], pix[0], q1);
5873             }
5874         }else{
5875             const int p0 = pix[-1];
5876             const int p1 = pix[-2];
5877             const int q0 = pix[0];
5878             const int q1 = pix[1];
5879
5880             if( ABS( p0 - q0 ) < alpha &&
5881                 ABS( p1 - p0 ) < beta &&
5882                 ABS( q1 - q0 ) < beta ) {
5883
5884                 pix[-1] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
5885                 pix[0]  = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
5886                 tprintf("filter_mb_mbaff_edgecv i:%d\n# bS:4 -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x, %02x, %02x]\n", i, pix[-3], p1, p0, q0, q1, pix[2], pix[-3], pix[-2], pix[-1], pix[0], pix[1], pix[2]);
5887             }
5888         }
5889     }
5890 }
5891
5892 static void filter_mb_edgeh( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5893     int i, d;
5894     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5895     const int alpha = alpha_table[index_a];
5896     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5897     const int pix_next  = stride;
5898
5899     if( bS[0] < 4 ) {
5900         int tc[4];
5901         for(i=0; i<4; i++)
5902             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] : -1;
5903         h->s.dsp.h264_v_loop_filter_luma(pix, stride, alpha, beta, tc);
5904     } else {
5905         /* 16px edge length, see filter_mb_edgev */
5906             for( d = 0; d < 16; d++ ) {
5907                 const int p0 = pix[-1*pix_next];
5908                 const int p1 = pix[-2*pix_next];
5909                 const int p2 = pix[-3*pix_next];
5910                 const int q0 = pix[0];
5911                 const int q1 = pix[1*pix_next];
5912                 const int q2 = pix[2*pix_next];
5913
5914                 if( ABS( p0 - q0 ) < alpha &&
5915                     ABS( p1 - p0 ) < beta &&
5916                     ABS( q1 - q0 ) < beta ) {
5917
5918                     const int p3 = pix[-4*pix_next];
5919                     const int q3 = pix[ 3*pix_next];
5920
5921                     if(ABS( p0 - q0 ) < (( alpha >> 2 ) + 2 )){
5922                         if( ABS( p2 - p0 ) < beta) {
5923                             /* p0', p1', p2' */
5924                             pix[-1*pix_next] = ( p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4 ) >> 3;
5925                             pix[-2*pix_next] = ( p2 + p1 + p0 + q0 + 2 ) >> 2;
5926                             pix[-3*pix_next] = ( 2*p3 + 3*p2 + p1 + p0 + q0 + 4 ) >> 3;
5927                         } else {
5928                             /* p0' */
5929                             pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5930                         }
5931                         if( ABS( q2 - q0 ) < beta) {
5932                             /* q0', q1', q2' */
5933                             pix[0*pix_next] = ( p1 + 2*p0 + 2*q0 + 2*q1 + q2 + 4 ) >> 3;
5934                             pix[1*pix_next] = ( p0 + q0 + q1 + q2 + 2 ) >> 2;
5935                             pix[2*pix_next] = ( 2*q3 + 3*q2 + q1 + q0 + p0 + 4 ) >> 3;
5936                         } else {
5937                             /* q0' */
5938                             pix[0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5939                         }
5940                     }else{
5941                         /* p0', q0' */
5942                         pix[-1*pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;
5943                         pix[ 0*pix_next] = ( 2*q1 + q0 + p1 + 2 ) >> 2;
5944                     }
5945                     tprintf("filter_mb_edgeh i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], p2, p1, p0, q0, q1, q2, pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
5946                 }
5947                 pix++;
5948             }
5949     }
5950 }
5951
5952 static void filter_mb_edgech( H264Context *h, uint8_t *pix, int stride, int bS[4], int qp ) {
5953     int i, d;
5954     const int index_a = clip( qp + h->slice_alpha_c0_offset, 0, 51 );
5955     const int alpha = alpha_table[index_a];
5956     const int beta  = beta_table[clip( qp + h->slice_beta_offset, 0, 51 )];
5957     const int pix_next  = stride;
5958
5959     if( bS[0] < 4 ) {
5960         int tc[4];
5961         for(i=0; i<4; i++)
5962             tc[i] = bS[i] ? tc0_table[index_a][bS[i] - 1] + 1 : 0;
5963         h->s.dsp.h264_v_loop_filter_chroma(pix, stride, alpha, beta, tc);
5964     } else {
5965         /* 8px edge length, see filter_mb_edgev */
5966             for( d = 0; d < 8; d++ ) {
5967                 const int p0 = pix[-1*pix_next];
5968                 const int p1 = pix[-2*pix_next];
5969                 const int q0 = pix[0];
5970                 const int q1 = pix[1*pix_next];
5971
5972                 if( ABS( p0 - q0 ) < alpha &&
5973                     ABS( p1 - p0 ) < beta &&
5974                     ABS( q1 - q0 ) < beta ) {
5975
5976                     pix[-pix_next] = ( 2*p1 + p0 + q1 + 2 ) >> 2;   /* p0' */
5977                     pix[0]         = ( 2*q1 + q0 + p1 + 2 ) >> 2;   /* q0' */
5978                     tprintf("filter_mb_edgech i:%d d:%d, qp:%d, indexA:%d, alpha:%d, beta:%d\n# bS:%d -> [%02x, %02x, %02x, %02x, %02x, %02x] =>[%02x, %02x, %02x, %02x]\n", i, d, qp, index_a, alpha, beta, bS[i], pix[-3*pix_next], p1, p0, q0, q1, pix[2*pix_next], pix[-2*pix_next], pix[-pix_next], pix[0], pix[pix_next]);
5979                 }
5980                 pix++;
5981             }
5982     }
5983 }
5984
/**
 * Runs the deblocking filter over the edges of one macroblock.
 *
 * Two stages:
 *  1. In MBAFF frames, when the left neighbour exists and differs from the
 *     current macroblock in interlaced type, the left macroblock edge is
 *     filtered with 8 boundary strengths and 2 QP values via
 *     filter_mb_mbaff_edgev() / filter_mb_mbaff_edgecv().
 *  2. For dir 0 (vertical edges) and dir 1 (horizontal edges), up to 4 edges
 *     are processed. For each edge 4 boundary strengths (bS) are computed and,
 *     unless all are 0, luma is filtered with filter_mb_edgev()/_edgeh() and,
 *     on even edges only, chroma with filter_mb_edgecv()/_edgech().
 *
 * Boundary strength as computed here: 4 or 3 if either side is intra,
 * 2 if either side has non-zero coefficients (non_zero_count_cache),
 * 1 if the references differ or a motion vector component differs by >= 4,
 * 0 otherwise.
 *
 * @param h          decoder context
 * @param mb_x       macroblock column
 * @param mb_y       macroblock row
 * @param img_y      luma samples of this macroblock (presumably its top-left sample)
 * @param img_cb     Cb samples of this macroblock
 * @param img_cr     Cr samples of this macroblock
 * @param linesize   luma stride passed on to the edge filters
 * @param uvlinesize chroma stride passed on to the edge filters
 */
5985 static void filter_mb( H264Context *h, int mb_x, int mb_y, uint8_t *img_y, uint8_t *img_cb, uint8_t *img_cr, unsigned int linesize, unsigned int uvlinesize) {
5986     MpegEncContext * const s = &h->s;
5987     const int mb_xy= mb_x + mb_y*s->mb_stride;
5988     int first_vertical_edge_done = 0;
5989     int dir;
5990     /* FIXME: A given frame may occupy more than one position in
5991      * the reference list. So ref2frm should be populated with
5992      * frame numbers, not indices. */
         /* Indexed with ref_cache value + 2, so the two lowest ref_cache
          * values both map to -1. */
5993     static const int ref2frm[18] = {-1,-1,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15};
5994
5995     if (h->mb_aff_frame
5996             // left mb is in picture
5997             && h->slice_table[mb_xy-1] != 255
5998             // and current and left pair do not have the same interlaced type
5999             && (IS_INTERLACED(s->current_picture.mb_type[mb_xy]) != IS_INTERLACED(s->current_picture.mb_type[mb_xy-1]))
6000             // and left mb is in the same slice if deblocking_filter == 2
6001             && (h->deblocking_filter!=2 || h->slice_table[mb_xy-1] == h->slice_table[mb_xy])) {
6002         /* First vertical edge is different in MBAFF frames
6003          * There are 8 different bS to compute and 2 different Qp
6004          */
6005         int bS[8];
6006         int qp[2];
6007         int chroma_qp[2];
6008
6009         int i;
             /* Makes the dir == 0 loop below start at edge 1. */
6010         first_vertical_edge_done = 1;
6011         for( i = 0; i < 8; i++ ) {
6012             int y = i>>1;
                 /* Cache index of the leftmost block in row y (cache rows are
                  * 8 entries apart); bn_idx is the entry directly to its left. */
6013             int b_idx= 8 + 4 + 8*y;
6014             int bn_idx= b_idx - 1;
6015
                 /* Which of the two left macroblocks faces this part of the edge. */
6016             int mbn_xy = h->mb_field_decoding_flag ? h->left_mb_xy[i>>2] : h->left_mb_xy[i&1];
6017
6018             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6019                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6020                 bS[i] = 4;
6021             } else if( h->non_zero_count_cache[b_idx] != 0 ||
6022                 h->non_zero_count_cache[bn_idx] != 0 ) {
6023                 bS[i] = 2;
6024             } else {
6025                 int l;
6026                 bS[i] = 0;
                     /* One reference list, or two in B slices. */
6027                 for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6028                     if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6029                         ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6030                         ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6031                         bS[i] = 1;
6032                         break;
6033                     }
6034                 }
6035             }
6036         }
             /* NOTE(review): eight bS values are computed above but only the
              * first four are summed here, so the edge is skipped whenever
              * bS[0..3] are all 0 even if bS[4..7] are not - confirm intent. */
6037         if(bS[0]+bS[1]+bS[2]+bS[3] != 0) {
6038             // Do not use s->qscale as luma quantizer because it has not the same
6039             // value in IPCM macroblocks.
                 /* qp[n] / chroma_qp[n]: rounded average of this macroblock and
                  * left neighbour n. */
6040             qp[0] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[0]] + 1 ) >> 1;
6041             chroma_qp[0] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6042                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[0]] ) + 1 ) >> 1;
6043             qp[1] = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[h->left_mb_xy[1]] + 1 ) >> 1;
6044             chroma_qp[1] = ( get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mb_xy] ) +
6045                              get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[h->left_mb_xy[1]] ) + 1 ) >> 1;
6046
6047             /* Filter edge */
6048             tprintf("filter mb:%d/%d MBAFF, QPy:%d/%d, QPc:%d/%d ls:%d uvls:%d", mb_x, mb_y, qp[0], qp[1], chroma_qp[0], chroma_qp[1], linesize, uvlinesize);
6049             { int i; for (i = 0; i < 8; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6050             filter_mb_mbaff_edgev ( h, &img_y [0], linesize,   bS, qp );
6051             filter_mb_mbaff_edgecv( h, &img_cb[0], uvlinesize, bS, chroma_qp );
6052             filter_mb_mbaff_edgecv( h, &img_cr[0], uvlinesize, bS, chroma_qp );
6053         }
6054     }
6055     /* dir : 0 -> vertical edge, 1 -> horizontal edge */
6056     for( dir = 0; dir < 2; dir++ )
6057     {
6058         int edge;
             /* Neighbour across edge 0: left macroblock for dir 0, top for dir 1. */
6059         const int mbm_xy = dir == 0 ? mb_xy -1 : h->top_mb_xy;
             /* Skip edge 0 when the neighbour's slice_table entry is 255. */
6060         int start = h->slice_table[mbm_xy] == 255 ? 1 : 0;
6061
             /* Edge 0 of dir 0 was already handled by the MBAFF code above. */
6062         if (first_vertical_edge_done) {
6063             start = 1;
6064             first_vertical_edge_done = 0;
6065         }
6066
             /* deblocking_filter == 2: do not filter across slice boundaries. */
6067         if (h->deblocking_filter==2 && h->slice_table[mbm_xy] != h->slice_table[mb_xy])
6068             start = 1;
6069
6070         /* Calculate bS */
6071         for( edge = start; edge < 4; edge++ ) {
6072             /* mbn_xy: neighbor macroblock */
6073             int mbn_xy = edge > 0 ? mb_xy : mbm_xy;
6074             int bS[4];
6075             int qp;
6076
6077             if (h->mb_aff_frame && (dir == 1) && (edge == 0) && ((mb_y & 1) == 0)
6078                 && !IS_INTERLACED(s->current_picture.mb_type[mb_xy])
6079                 && IS_INTERLACED(s->current_picture.mb_type[mbn_xy])
6080                 ) {
6081                 // This is a special case in the norm where the filtering must
6082                 // be done twice (one each of the field) even if we are in a
6083                 // frame macroblock.
6084                 //
                     /* Doubled strides, so each pass touches every second line. */
6085                 unsigned int tmp_linesize   = 2 *   linesize;
6086                 unsigned int tmp_uvlinesize = 2 * uvlinesize;
                     /* These declarations shadow the mbn_xy and qp of the
                      * enclosing edge loop. */
6087                 int mbn_xy = mb_xy - 2 * s->mb_stride;
6088                 int qp, chroma_qp;
6089
6090                 // first filtering
6091                 if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6092                     IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6093                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6094                 } else {
6095                     // TODO
                         /* Not implemented. NDEBUG is #undef'd at the top of this
                          * file, so this assert is active and aborts when neither
                          * macroblock is intra. */
6096                     assert(0);
6097                 }
6098                 /* Filter edge */
6099                 // Do not use s->qscale as luma quantizer because it has not the same
6100                 // value in IPCM macroblocks.
6101                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6102                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6103                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6104                 filter_mb_edgeh( h, &img_y[0], tmp_linesize, bS, qp );
6105                 chroma_qp = ( h->chroma_qp +
6106                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6107                 filter_mb_edgech( h, &img_cb[0], tmp_uvlinesize, bS, chroma_qp );
6108                 filter_mb_edgech( h, &img_cr[0], tmp_uvlinesize, bS, chroma_qp );
6109
6110                 // second filtering
                     /* Same again against the macroblock one row further down,
                      * starting one line lower in the picture. */
6111                 mbn_xy += s->mb_stride;
6112                 if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6113                     IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
6114                     bS[0] = bS[1] = bS[2] = bS[3] = 3;
6115                 } else {
6116                     // TODO
6117                     assert(0);
6118                 }
6119                 /* Filter edge */
6120                 // Do not use s->qscale as luma quantizer because it has not the same
6121                 // value in IPCM macroblocks.
6122                 qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6123                 tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, tmp_linesize, tmp_uvlinesize);
6124                 { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
6125                 filter_mb_edgeh( h, &img_y[linesize], tmp_linesize, bS, qp );
6126                 chroma_qp = ( h->chroma_qp +
6127                               get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6128                 filter_mb_edgech( h, &img_cb[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6129                 filter_mb_edgech( h, &img_cr[uvlinesize], tmp_uvlinesize, bS, chroma_qp );
6130                 continue;
6131             }
6132             if( IS_INTRA( s->current_picture.mb_type[mb_xy] ) ||
6133                 IS_INTRA( s->current_picture.mb_type[mbn_xy] ) ) {
                     /* Intra: bS 4 only on a macroblock edge (edge 0) that meets
                      * the condition below, 3 everywhere else. */
6134                 int value;
6135                 if (edge == 0) {
6136                     if (   (!IS_INTERLACED(s->current_picture.mb_type[mb_xy]) && !IS_INTERLACED(s->current_picture.mb_type[mbm_xy]))
6137                         || ((h->mb_aff_frame || (s->picture_structure != PICT_FRAME)) && (dir == 0))
6138                     ) {
6139                         value = 4;
6140                     } else {
6141                         value = 3;
6142                     }
6143                 } else {
6144                     value = 3;
6145                 }
6146                 bS[0] = bS[1] = bS[2] = bS[3] = value;
6147             } else {
6148                 int i;
6149                 for( i = 0; i < 4; i++ ) {
                         /* (x, y): block on the current side of the edge;
                          * bn_idx: the cache entry to its left (dir 0) or
                          * above it (dir 1). */
6150                     int x = dir == 0 ? edge : i;
6151                     int y = dir == 0 ? i    : edge;
6152                     int b_idx= 8 + 4 + x + 8*y;
6153                     int bn_idx= b_idx - (dir ? 8:1);
6154
6155                     if( h->non_zero_count_cache[b_idx] != 0 ||
6156                         h->non_zero_count_cache[bn_idx] != 0 ) {
6157                         bS[i] = 2;
6158                     }
6159                     else
6160                     {
6161                         int l;
6162                         bS[i] = 0;
6163                         for( l = 0; l < 1 + (h->slice_type == B_TYPE); l++ ) {
6164                             if( ref2frm[h->ref_cache[l][b_idx]+2] != ref2frm[h->ref_cache[l][bn_idx]+2] ||
6165                                 ABS( h->mv_cache[l][b_idx][0] - h->mv_cache[l][bn_idx][0] ) >= 4 ||
6166                                 ABS( h->mv_cache[l][b_idx][1] - h->mv_cache[l][bn_idx][1] ) >= 4 ) {
6167                                 bS[i] = 1;
6168                                 break;
6169                             }
6170                         }
6171                     }
6172                 }
6173
                     /* Nothing to filter on this edge. */
6174                 if(bS[0]+bS[1]+bS[2]+bS[3] == 0)
6175                     continue;
6176             }
6177
6178             /* Filter edge */
6179             // Do not use s->qscale as luma quantizer because it has not the same
6180             // value in IPCM macroblocks.
6181             qp = ( s->current_picture.qscale_table[mb_xy] + s->current_picture.qscale_table[mbn_xy] + 1 ) >> 1;
6182             //tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d, QPc:%d, QPcn:%d\n", mb_x, mb_y, dir, edge, qp, h->chroma_qp, s->current_picture.qscale_table[mbn_xy]);
6183             tprintf("filter mb:%d/%d dir:%d edge:%d, QPy:%d ls:%d uvls:%d", mb_x, mb_y, dir, edge, qp, linesize, uvlinesize);
6184             { int i; for (i = 0; i < 4; i++) tprintf(" bS[%d]:%d", i, bS[i]); tprintf("\n"); }
                 /* Luma edges are 4 samples apart; chroma is filtered on even
                  * edges only, at an offset of 2*edge samples. */
6185             if( dir == 0 ) {
6186                 filter_mb_edgev( h, &img_y[4*edge], linesize, bS, qp );
6187                 if( (edge&1) == 0 ) {
6188                     int chroma_qp = ( h->chroma_qp +
6189                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6190                     filter_mb_edgecv( h, &img_cb[2*edge], uvlinesize, bS, chroma_qp );
6191                     filter_mb_edgecv( h, &img_cr[2*edge], uvlinesize, bS, chroma_qp );
6192                 }
6193             } else {
6194                 filter_mb_edgeh( h, &img_y[4*edge*linesize], linesize, bS, qp );
6195                 if( (edge&1) == 0 ) {
6196                     int chroma_qp = ( h->chroma_qp +
6197                                       get_chroma_qp( h->pps.chroma_qp_index_offset, s->current_picture.qscale_table[mbn_xy] ) + 1 ) >> 1;
6198                     filter_mb_edgech( h, &img_cb[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6199                     filter_mb_edgech( h, &img_cr[2*edge*uvlinesize], uvlinesize, bS, chroma_qp );
6200                 }
6201             }
6202         }
6203     }
6204 }
6205
6206 static int decode_slice(H264Context *h){
6207     MpegEncContext * const s = &h->s;
6208     const int part_mask= s->partitioned_frame ? (AC_END|AC_ERROR) : 0x7F;
6209
6210     s->mb_skip_run= -1;
6211
6212     if( h->pps.cabac ) {
6213         int i;
6214
6215         /* realign */
6216         align_get_bits( &s->gb );
6217
6218         /* init cabac */
6219         ff_init_cabac_states( &h->cabac, ff_h264_lps_range, ff_h264_mps_state, ff_h264_lps_state, 64 );
6220         ff_init_cabac_decoder( &h->cabac,
6221                                s->gb.buffer + get_bits_count(&s->gb)/8,
6222                                ( s->gb.size_in_bits - get_bits_count(&s->gb) + 7)/8);
6223         /* calculate pre-state */
6224         for( i= 0; i < 399; i++ ) {
6225             int pre;
6226             if( h->slice_type == I_TYPE )
6227                 pre = clip( ((cabac_context_init_I[i][0] * s->qscale) >>4 ) + cabac_context_init_I[i][1], 1, 126 );
6228             else
6229                 pre = clip( ((cabac_context_init_PB[h->cabac_init_idc][i][0] * s->qscale) >>4 ) + cabac_context_init_PB[h->cabac_init_idc][i][1], 1, 126 );
6230
6231             if( pre <= 63 )
6232                 h->cabac_state[i] = 2 * ( 63 - pre ) + 0;
6233             else
6234                 h->cabac_state[i] = 2 * ( pre - 64 ) + 1;
6235         }
6236
6237         for(;;){
6238             int ret = decode_mb_cabac(h);
6239             int eos;
6240
6241             if(ret>=0) hl_decode_mb(h);
6242
6243             /* XXX: useless as decode_mb_cabac it doesn't support that ... */
6244             if( ret >= 0 && h->mb_aff_frame ) { //FIXME optimal? or let mb_decode decode 16x32 ?
6245                 s->mb_y++;
6246
6247                 if(ret>=0) ret = decode_mb_cabac(h);
6248
6249                 hl_decode_mb(h);
6250                 s->mb_y--;
6251             }
6252             eos = get_cabac_terminate( &h->cabac );
6253
6254             if( ret < 0 || h->cabac.bytestream > h->cabac.bytestream_end + 1) {
6255                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6256                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6257                 return -1;
6258             }
6259
6260             if( ++s->mb_x >= s->mb_width ) {
6261                 s->mb_x = 0;
6262                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6263                 ++s->mb_y;
6264                 if(h->mb_aff_frame) {
6265                     ++s->mb_y;
6266                 }
6267             }
6268
6269             if( eos || s->mb_y >= s->mb_height ) {
6270                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6271                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6272                 return 0;
6273             }
6274         }
6275
6276     } else {
6277         for(;;){
6278             int ret = decode_mb_cavlc(h);
6279
6280             if(ret>=0) hl_decode_mb(h);
6281
6282             if(ret>=0 && h->mb_aff_frame){ //FIXME optimal? or let mb_decode decode 16x32 ?
6283                 s->mb_y++;
6284                 ret = decode_mb_cavlc(h);
6285
6286                 if(ret>=0) hl_decode_mb(h);
6287                 s->mb_y--;
6288             }
6289
6290             if(ret<0){
6291                 av_log(h->s.avctx, AV_LOG_ERROR, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6292                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6293
6294                 return -1;
6295             }
6296
6297             if(++s->mb_x >= s->mb_width){
6298                 s->mb_x=0;
6299                 ff_draw_horiz_band(s, 16*s->mb_y, 16);
6300                 ++s->mb_y;
6301                 if(h->mb_aff_frame) {
6302                     ++s->mb_y;
6303                 }
6304                 if(s->mb_y >= s->mb_height){
6305                     tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6306
6307                     if(get_bits_count(&s->gb) == s->gb.size_in_bits ) {
6308                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6309
6310                         return 0;
6311                     }else{
6312                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6313
6314                         return -1;
6315                     }
6316                 }
6317             }
6318
6319             if(get_bits_count(&s->gb) >= s->gb.size_in_bits && s->mb_skip_run<=0){
6320                 tprintf("slice end %d %d\n", get_bits_count(&s->gb), s->gb.size_in_bits);
6321                 if(get_bits_count(&s->gb) == s->gb.size_in_bits ){
6322                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6323
6324                     return 0;
6325                 }else{
6326                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6327
6328                     return -1;
6329                 }
6330             }
6331         }
6332     }
6333
6334 #if 0
6335     for(;s->mb_y < s->mb_height; s->mb_y++){
6336         for(;s->mb_x < s->mb_width; s->mb_x++){
6337             int ret= decode_mb(h);
6338
6339             hl_decode_mb(h);
6340
6341             if(ret<0){
6342                 fprintf(stderr, "error while decoding MB %d %d\n", s->mb_x, s->mb_y);
6343                 ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6344
6345                 return -1;
6346             }
6347
6348             if(++s->mb_x >= s->mb_width){
6349                 s->mb_x=0;
6350                 if(++s->mb_y >= s->mb_height){
6351                     if(get_bits_count(s->gb) == s->gb.size_in_bits){
6352                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6353
6354                         return 0;
6355                     }else{
6356                         ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6357
6358                         return -1;
6359                     }
6360                 }
6361             }
6362
6363             if(get_bits_count(s->?gb) >= s->gb?.size_in_bits){
6364                 if(get_bits_count(s->gb) == s->gb.size_in_bits){
6365                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x-1, s->mb_y, (AC_END|DC_END|MV_END)&part_mask);
6366
6367                     return 0;
6368                 }else{
6369                     ff_er_add_slice(s, s->resync_mb_x, s->resync_mb_y, s->mb_x, s->mb_y, (AC_ERROR|DC_ERROR|MV_ERROR)&part_mask);
6370
6371                     return -1;
6372                 }
6373             }
6374         }
6375         s->mb_x=0;
6376         ff_draw_horiz_band(s, 16*s->mb_y, 16);
6377     }
6378 #endif
6379     return -1; //not reached
6380 }
6381
6382 static inline void decode_hrd_parameters(H264Context *h, SPS *sps){
6383     MpegEncContext * const s = &h->s;
6384     int cpb_count, i;
6385     cpb_count = get_ue_golomb(&s->gb) + 1;
6386     get_bits(&s->gb, 4); /* bit_rate_scale */
6387     get_bits(&s->gb, 4); /* cpb_size_scale */
6388     for(i=0; i<cpb_count; i++){
6389         get_ue_golomb(&s->gb); /* bit_rate_value_minus1 */
6390         get_ue_golomb(&s->gb); /* cpb_size_value_minus1 */
6391         get_bits1(&s->gb);     /* cbr_flag */
6392     }
6393     get_bits(&s->gb, 5); /* initial_cpb_removal_delay_length_minus1 */
6394     get_bits(&s->gb, 5); /* cpb_removal_delay_length_minus1 */
6395     get_bits(&s->gb, 5); /* dpb_output_delay_length_minus1 */
6396     get_bits(&s->gb, 5); /* time_offset_length */
6397 }
6398
6399 static inline int decode_vui_parameters(H264Context *h, SPS *sps){
6400     MpegEncContext * const s = &h->s;
6401     int aspect_ratio_info_present_flag, aspect_ratio_idc;
6402     int nal_hrd_parameters_present_flag, vcl_hrd_parameters_present_flag;
6403
6404     aspect_ratio_info_present_flag= get_bits1(&s->gb);
6405
6406     if( aspect_ratio_info_present_flag ) {
6407         aspect_ratio_idc= get_bits(&s->gb, 8);
6408         if( aspect_ratio_idc == EXTENDED_SAR ) {
6409             sps->sar.num= get_bits(&s->gb, 16);
6410             sps->sar.den= get_bits(&s->gb, 16);
6411         }else if(aspect_ratio_idc < 16){
6412             sps->sar=  pixel_aspect[aspect_ratio_idc];
6413         }else{
6414             av_log(h->s.avctx, AV_LOG_ERROR, "illegal aspect ratio\n");
6415             return -1;
6416         }
6417     }else{
6418         sps->sar.num=
6419         sps->sar.den= 0;
6420     }
6421 //            s->avctx->aspect_ratio= sar_width*s->width / (float)(s->height*sar_height);
6422
6423     if(get_bits1(&s->gb)){      /* overscan_info_present_flag */
6424         get_bits1(&s->gb);      /* overscan_appropriate_flag */
6425     }
6426
6427     if(get_bits1(&s->gb)){      /* video_signal_type_present_flag */
6428         get_bits(&s->gb, 3);    /* video_format */
6429         get_bits1(&s->gb);      /* video_full_range_flag */
6430         if(get_bits1(&s->gb)){  /* colour_description_present_flag */
6431             get_bits(&s->gb, 8); /* colour_primaries */
6432             get_bits(&s->gb, 8); /* transfer_characteristics */
6433             get_bits(&s->gb, 8); /* matrix_coefficients */
6434         }
6435     }
6436
6437     if(get_bits1(&s->gb)){      /* chroma_location_info_present_flag */
6438         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_top_field */
6439         get_ue_golomb(&s->gb);  /* chroma_sample_location_type_bottom_field */
6440     }
6441
6442     sps->timing_info_present_flag = get_bits1(&s->gb);
6443     if(sps->timing_info_present_flag){
6444         sps->num_units_in_tick = get_bits_long(&s->gb, 32);
6445         sps->time_scale = get_bits_long(&s->gb, 32);
6446         sps->fixed_frame_rate_flag = get_bits1(&s->gb);
6447     }
6448
6449     nal_hrd_parameters_present_flag = get_bits1(&s->gb);
6450     if(nal_hrd_parameters_present_flag)
6451         decode_hrd_parameters(h, sps);
6452     vcl_hrd_parameters_present_flag = get_bits1(&s->gb);
6453     if(vcl_hrd_parameters_present_flag)
6454         decode_hrd_parameters(h, sps);
6455     if(nal_hrd_parameters_present_flag || vcl_hrd_parameters_present_flag)
6456         get_bits1(&s->gb);     /* low_delay_hrd_flag */
6457     get_bits1(&s->gb);         /* pic_struct_present_flag */
6458
6459     sps->bitstream_restriction_flag = get_bits1(&s->gb);
6460     if(sps->bitstream_restriction_flag){
6461         get_bits1(&s->gb);     /* motion_vectors_over_pic_boundaries_flag */
6462         get_ue_golomb(&s->gb); /* max_bytes_per_pic_denom */
6463         get_ue_golomb(&s->gb); /* max_bits_per_mb_denom */
6464         get_ue_golomb(&s->gb); /* log2_max_mv_length_horizontal */
6465         get_ue_golomb(&s->gb); /* log2_max_mv_length_vertical */
6466         sps->num_reorder_frames = get_ue_golomb(&s->gb);
6467         get_ue_golomb(&s->gb); /* max_dec_frame_buffering */
6468     }
6469
6470     return 0;
6471 }
6472
6473 static inline int decode_seq_parameter_set(H264Context *h){
6474     MpegEncContext * const s = &h->s;
6475     int profile_idc, level_idc;
6476     int sps_id, i;
6477     SPS *sps;
6478
6479     profile_idc= get_bits(&s->gb, 8);
6480     get_bits1(&s->gb);   //constraint_set0_flag
6481     get_bits1(&s->gb);   //constraint_set1_flag
6482     get_bits1(&s->gb);   //constraint_set2_flag
6483     get_bits1(&s->gb);   //constraint_set3_flag
6484     get_bits(&s->gb, 4); // reserved
6485     level_idc= get_bits(&s->gb, 8);
6486     sps_id= get_ue_golomb(&s->gb);
6487
6488     sps= &h->sps_buffer[ sps_id ];
6489     sps->profile_idc= profile_idc;
6490     sps->level_idc= level_idc;
6491
6492     sps->log2_max_frame_num= get_ue_golomb(&s->gb) + 4;
6493     sps->poc_type= get_ue_golomb(&s->gb);
6494
6495     if(sps->poc_type == 0){ //FIXME #define
6496         sps->log2_max_poc_lsb= get_ue_golomb(&s->gb) + 4;
6497     } else if(sps->poc_type == 1){//FIXME #define
6498         sps->delta_pic_order_always_zero_flag= get_bits1(&s->gb);
6499         sps->offset_for_non_ref_pic= get_se_golomb(&s->gb);
6500         sps->offset_for_top_to_bottom_field= get_se_golomb(&s->gb);
6501         sps->poc_cycle_length= get_ue_golomb(&s->gb);
6502
6503         for(i=0; i<sps->poc_cycle_length; i++)
6504             sps->offset_for_ref_frame[i]= get_se_golomb(&s->gb);
6505     }
6506     if(sps->poc_type > 2){
6507         av_log(h->s.avctx, AV_LOG_ERROR, "illegal POC type %d\n", sps->poc_type);
6508         return -1;
6509     }
6510
6511     sps->ref_frame_count= get_ue_golomb(&s->gb);
6512     if(sps->ref_frame_count > MAX_PICTURE_COUNT-2){
6513         av_log(h->s.avctx, AV_LOG_ERROR, "too many reference frames\n");
6514     }
6515     sps->gaps_in_frame_num_allowed_flag= get_bits1(&s->gb);
6516     sps->mb_width= get_ue_golomb(&s->gb) + 1;
6517     sps->mb_height= get_ue_golomb(&s->gb) + 1;
6518     if((unsigned)sps->mb_width >= INT_MAX/16 || (unsigned)sps->mb_height >= INT_MAX/16 ||
6519        avcodec_check_dimensions(NULL, 16*sps->mb_width, 16*sps->mb_height))
6520         return -1;
6521
6522     sps->frame_mbs_only_flag= get_bits1(&s->gb);
6523     if(!sps->frame_mbs_only_flag)
6524         sps->mb_aff= get_bits1(&s->gb);
6525     else
6526         sps->mb_aff= 0;
6527
6528     sps->direct_8x8_inference_flag= get_bits1(&s->gb);
6529
6530     sps->crop= get_bits1(&s->gb);
6531     if(sps->crop){
6532         sps->crop_left  = get_ue_golomb(&s->gb);
6533         sps->crop_right = get_ue_golomb(&s->gb);
6534         sps->crop_top   = get_ue_golomb(&s->gb);
6535         sps->crop_bottom= get_ue_golomb(&s->gb);
6536         if(sps->crop_left || sps->crop_top){
6537             av_log(h->s.avctx, AV_LOG_ERROR, "insane cropping not completely supported, this could look slightly wrong ...\n");
6538         }
6539     }else{
6540         sps->crop_left  =
6541         sps->crop_right =
6542         sps->crop_top   =
6543         sps->crop_bottom= 0;
6544     }
6545
6546     sps->vui_parameters_present_flag= get_bits1(&s->gb);
6547     if( sps->vui_parameters_present_flag )
6548         decode_vui_parameters(h, sps);
6549
6550     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
6551         av_log(h->s.avctx, AV_LOG_DEBUG, "sps:%d profile:%d/%d poc:%d ref:%d %dx%d %s %s crop:%d/%d/%d/%d %s\n",
6552                sps_id, sps->profile_idc, sps->level_idc,
6553                sps->poc_type,
6554                sps->ref_frame_count,
6555                sps->mb_width, sps->mb_height,
6556                sps->frame_mbs_only_flag ? "FRM" : (sps->mb_aff ? "MB-AFF" : "PIC-AFF"),
6557                sps->direct_8x8_inference_flag ? "8B8" : "",
6558                sps->crop_left, sps->crop_right,
6559                sps->crop_top, sps->crop_bottom,
6560                sps->vui_parameters_present_flag ? "VUI" : ""
6561                );
6562     }
6563     return 0;
6564 }
6565
6566 static inline int decode_picture_parameter_set(H264Context *h){
6567     MpegEncContext * const s = &h->s;
6568     int pps_id= get_ue_golomb(&s->gb);
6569     PPS *pps= &h->pps_buffer[pps_id];
6570
6571     pps->sps_id= get_ue_golomb(&s->gb);
6572     pps->cabac= get_bits1(&s->gb);
6573     pps->pic_order_present= get_bits1(&s->gb);
6574     pps->slice_group_count= get_ue_golomb(&s->gb) + 1;
6575     if(pps->slice_group_count > 1 ){
6576         pps->mb_slice_group_map_type= get_ue_golomb(&s->gb);
6577         av_log(h->s.avctx, AV_LOG_ERROR, "FMO not supported\n");
6578         switch(pps->mb_slice_group_map_type){
6579         case 0:
6580 #if 0
6581 |   for( i = 0; i <= num_slice_groups_minus1; i++ ) |   |        |
6582 |    run_length[ i ]                                |1  |ue(v)   |
6583 #endif
6584             break;
6585         case 2:
6586 #if 0
6587 |   for( i = 0; i < num_slice_groups_minus1; i++ )  |   |        |
6588 |{                                                  |   |        |
6589 |    top_left_mb[ i ]                               |1  |ue(v)   |
6590 |    bottom_right_mb[ i ]                           |1  |ue(v)   |
6591 |   }                                               |   |        |
6592 #endif
6593             break;
6594         case 3:
6595         case 4:
6596         case 5:
6597 #if 0
6598 |   slice_group_change_direction_flag               |1  |u(1)    |
6599 |   slice_group_change_rate_minus1                  |1  |ue(v)   |
6600 #endif
6601             break;
6602         case 6:
6603 #if 0
6604 |   slice_group_id_cnt_minus1                       |1  |ue(v)   |
6605 |   for( i = 0; i <= slice_group_id_cnt_minus1; i++ |   |        |
6606 |)                                                  |   |        |
6607 |    slice_group_id[ i ]                            |1  |u(v)    |
6608 #endif
6609             break;
6610         }
6611     }
6612     pps->ref_count[0]= get_ue_golomb(&s->gb) + 1;
6613     pps->ref_count[1]= get_ue_golomb(&s->gb) + 1;
6614     if(pps->ref_count[0] > 32 || pps->ref_count[1] > 32){
6615         av_log(h->s.avctx, AV_LOG_ERROR, "reference overflow (pps)\n");
6616         return -1;
6617     }
6618
6619     pps->weighted_pred= get_bits1(&s->gb);
6620     pps->weighted_bipred_idc= get_bits(&s->gb, 2);
6621     pps->init_qp= get_se_golomb(&s->gb) + 26;
6622     pps->init_qs= get_se_golomb(&s->gb) + 26;
6623     pps->chroma_qp_index_offset= get_se_golomb(&s->gb);
6624     pps->deblocking_filter_parameters_present= get_bits1(&s->gb);
6625     pps->constrained_intra_pred= get_bits1(&s->gb);
6626     pps->redundant_pic_cnt_present = get_bits1(&s->gb);
6627
6628     if(s->avctx->debug&FF_DEBUG_PICT_INFO){
6629         av_log(h->s.avctx, AV_LOG_DEBUG, "pps:%d sps:%d %s slice_groups:%d ref:%d/%d %s qp:%d/%d/%d %s %s %s\n",
6630                pps_id, pps->sps_id,
6631                pps->cabac ? "CABAC" : "CAVLC",
6632                pps->slice_group_count,
6633                pps->ref_count[0], pps->ref_count[1],
6634                pps->weighted_pred ? "weighted" : "",
6635                pps->init_qp, pps->init_qs, pps->chroma_qp_index_offset,
6636                pps->deblocking_filter_parameters_present ? "LPAR" : "",
6637                pps->constrained_intra_pred ? "CONSTR" : "",
6638                pps->redundant_pic_cnt_present ? "REDU" : ""
6639                );
6640     }
6641
6642     return 0;
6643 }
6644
6645 /**
6646  * finds the end of the current frame in the bitstream.
6647  * @return the position of the first byte of the next frame, or -1
6648  */
6649 static int find_frame_end(H264Context *h, const uint8_t *buf, int buf_size){
6650     int i;
6651     uint32_t state;
6652     ParseContext *pc = &(h->s.parse_context);
6653 //printf("first %02X%02X%02X%02X\n", buf[0], buf[1],buf[2],buf[3]);
6654 //    mb_addr= pc->mb_addr - 1;
6655     state= pc->state;
6656     for(i=0; i<=buf_size; i++){
6657         if((state&0xFFFFFF1F) == 0x101 || (state&0xFFFFFF1F) == 0x102 || (state&0xFFFFFF1F) == 0x105){
6658             tprintf("find_frame_end new startcode = %08x, frame_start_found = %d, pos = %d\n", state, pc->frame_start_found, i);
6659             if(pc->frame_start_found){
6660                 // If there isn't one more byte in the buffer
6661                 // the test on first_mb_in_slice cannot be done yet
6662                 // do it at next call.
6663                 if (i >= buf_size) break;
6664                 if (buf[i] & 0x80) {
6665                     // first_mb_in_slice is 0, probably the first nal of a new
6666                     // slice
6667                     tprintf("find_frame_end frame_end_found, state = %08x, pos = %d\n", state, i);
6668                     pc->state=-1;
6669                     pc->frame_start_found= 0;
6670                     return i-4;
6671                 }
6672             }
6673             pc->frame_start_found = 1;
6674         }
6675         if (i<buf_size)
6676             state= (state<<8) | buf[i];
6677     }
6678
6679     pc->state= state;
6680     return END_NOT_FOUND;
6681 }
6682
6683 static int h264_parse(AVCodecParserContext *s,
6684                       AVCodecContext *avctx,
6685                       uint8_t **poutbuf, int *poutbuf_size,
6686                       const uint8_t *buf, int buf_size)
6687 {
6688     H264Context *h = s->priv_data;
6689     ParseContext *pc = &h->s.parse_context;
6690     int next;
6691
6692     next= find_frame_end(h, buf, buf_size);
6693
6694     if (ff_combine_frame(pc, next, (uint8_t **)&buf, &buf_size) < 0) {
6695         *poutbuf = NULL;
6696         *poutbuf_size = 0;
6697         return buf_size;
6698     }
6699
6700     *poutbuf = (uint8_t *)buf;
6701     *poutbuf_size = buf_size;
6702     return next;
6703 }
6704
6705 static int decode_nal_units(H264Context *h, uint8_t *buf, int buf_size){
6706     MpegEncContext * const s = &h->s;
6707     AVCodecContext * const avctx= s->avctx;
6708     int buf_index=0;
6709 #if 0
6710     int i;
6711     for(i=0; i<32; i++){
6712         printf("%X ", buf[i]);
6713     }
6714 #endif
6715     h->slice_num = 0;
6716     for(;;){
6717         int consumed;
6718         int dst_length;
6719         int bit_length;
6720         uint8_t *ptr;
6721         int i, nalsize = 0;
6722
6723       if(h->is_avc) {
6724         if(buf_index >= buf_size) break;
6725         nalsize = 0;
6726         for(i = 0; i < h->nal_length_size; i++)
6727             nalsize = (nalsize << 8) | buf[buf_index++];
6728       } else {
6729         // start code prefix search
6730         for(; buf_index + 3 < buf_size; buf_index++){
6731             // this should allways succeed in the first iteration
6732             if(buf[buf_index] == 0 && buf[buf_index+1] == 0 && buf[buf_index+2] == 1)
6733                 break;
6734         }
6735
6736         if(buf_index+3 >= buf_size) break;
6737
6738         buf_index+=3;
6739       }
6740
6741         ptr= decode_nal(h, buf + buf_index, &dst_length, &consumed, h->is_avc ? nalsize : buf_size - buf_index);
6742         if(ptr[dst_length - 1] == 0) dst_length--;
6743         bit_length= 8*dst_length - decode_rbsp_trailing(ptr + dst_length - 1);
6744
6745         if(s->avctx->debug&FF_DEBUG_STARTCODE){
6746             av_log(h->s.avctx, AV_LOG_DEBUG, "NAL %d at %d/%d length %d\n", h->nal_unit_type, buf_index, buf_size, dst_length);
6747         }
6748
6749         if (h->is_avc && (nalsize != consumed))
6750             av_log(h->s.avctx, AV_LOG_ERROR, "AVC: Consumed only %d bytes instead of %d\n", consumed, nalsize);
6751
6752         buf_index += consumed;
6753
6754         if( s->hurry_up == 1 && h->nal_ref_idc  == 0 )
6755             continue;
6756
6757         switch(h->nal_unit_type){
6758         case NAL_IDR_SLICE:
6759             idr(h); //FIXME ensure we don't loose some frames if there is reordering
6760         case NAL_SLICE:
6761             init_get_bits(&s->gb, ptr, bit_length);
6762             h->intra_gb_ptr=
6763             h->inter_gb_ptr= &s->gb;
6764             s->data_partitioning = 0;
6765
6766             if(decode_slice_header(h) < 0) return -1;
6767             if(h->redundant_pic_count==0 && s->hurry_up < 5 )
6768                 decode_slice(h);
6769             break;
6770         case NAL_DPA:
6771             init_get_bits(&s->gb, ptr, bit_length);
6772             h->intra_gb_ptr=
6773             h->inter_gb_ptr= NULL;
6774             s->data_partitioning = 1;
6775
6776             if(decode_slice_header(h) < 0) return -1;
6777             break;
6778         case NAL_DPB:
6779             init_get_bits(&h->intra_gb, ptr, bit_length);
6780             h->intra_gb_ptr= &h->intra_gb;
6781             break;
6782         case NAL_DPC:
6783             init_get_bits(&h->inter_gb, ptr, bit_length);
6784             h->inter_gb_ptr= &h->inter_gb;
6785
6786             if(h->redundant_pic_count==0 && h->intra_gb_ptr && s->data_partitioning && s->hurry_up < 5 )
6787                 decode_slice(h);
6788             break;
6789         case NAL_SEI:
6790             break;
6791         case NAL_SPS:
6792             init_get_bits(&s->gb, ptr, bit_length);
6793             decode_seq_parameter_set(h);
6794
6795             if(s->flags& CODEC_FLAG_LOW_DELAY)
6796                 s->low_delay=1;
6797
6798             if(avctx->has_b_frames < 2)
6799                 avctx->has_b_frames= !s->low_delay;
6800             break;
6801         case NAL_PPS:
6802             init_get_bits(&s->gb, ptr, bit_length);
6803
6804             decode_picture_parameter_set(h);
6805
6806             break;
6807         case NAL_PICTURE_DELIMITER:
6808             break;
6809         case NAL_FILTER_DATA:
6810             break;
6811         default:
6812             av_log(avctx, AV_LOG_ERROR, "Unknown NAL code: %d\n", h->nal_unit_type);
6813         }
6814     }
6815
6816     if(!s->current_picture_ptr) return buf_index; //no frame
6817
6818     s->current_picture_ptr->pict_type= s->pict_type;
6819     s->current_picture_ptr->key_frame= s->pict_type == I_TYPE && h->nal_unit_type == NAL_IDR_SLICE;
6820
6821     h->prev_frame_num_offset= h->frame_num_offset;
6822     h->prev_frame_num= h->frame_num;
6823     if(s->current_picture_ptr->reference){
6824         h->prev_poc_msb= h->poc_msb;
6825         h->prev_poc_lsb= h->poc_lsb;
6826     }
6827     if(s->current_picture_ptr->reference)
6828         execute_ref_pic_marking(h, h->mmco, h->mmco_index);
6829
6830     ff_er_frame_end(s);
6831
6832     MPV_frame_end(s);
6833
6834     return buf_index;
6835 }
6836
6837 /**
6838  * returns the number of bytes consumed for building the current frame
6839  */
6840 static int get_consumed_bytes(MpegEncContext *s, int pos, int buf_size){
6841     if(s->flags&CODEC_FLAG_TRUNCATED){
6842         pos -= s->parse_context.last_index;
6843         if(pos<0) pos=0; // FIXME remove (unneeded?)
6844
6845         return pos;
6846     }else{
6847         if(pos==0) pos=1; //avoid infinite loops (i doubt thats needed but ...)
6848         if(pos+10>buf_size) pos=buf_size; // oops ;)
6849
6850         return pos;
6851     }
6852 }
6853
6854 static int decode_frame(AVCodecContext *avctx,
6855                              void *data, int *data_size,
6856                              uint8_t *buf, int buf_size)
6857 {
6858     H264Context *h = avctx->priv_data;
6859     MpegEncContext *s = &h->s;
6860     AVFrame *pict = data;
6861     int buf_index;
6862
6863     s->flags= avctx->flags;
6864     s->flags2= avctx->flags2;
6865
6866    /* no supplementary picture */
6867     if (buf_size == 0) {
6868         return 0;
6869     }
6870
6871     if(s->flags&CODEC_FLAG_TRUNCATED){
6872         int next= find_frame_end(h, buf, buf_size);
6873
6874         if( ff_combine_frame(&s->parse_context, next, &buf, &buf_size) < 0 )
6875             return buf_size;
6876 //printf("next:%d buf_size:%d last_index:%d\n", next, buf_size, s->parse_context.last_index);
6877     }
6878
6879     if(h->is_avc && !h->got_avcC) {
6880         int i, cnt, nalsize;
6881         unsigned char *p = avctx->extradata;
6882         if(avctx->extradata_size < 7) {
6883             av_log(avctx, AV_LOG_ERROR, "avcC too short\n");
6884             return -1;
6885         }
6886         if(*p != 1) {
6887             av_log(avctx, AV_LOG_ERROR, "Unknown avcC version %d\n", *p);
6888             return -1;
6889         }
6890         /* sps and pps in the avcC always have length coded with 2 bytes,
6891            so put a fake nal_length_size = 2 while parsing them */
6892         h->nal_length_size = 2;
6893         // Decode sps from avcC
6894         cnt = *(p+5) & 0x1f; // Number of sps
6895         p += 6;
6896         for (i = 0; i < cnt; i++) {
6897             nalsize = BE_16(p) + 2;
6898             if(decode_nal_units(h, p, nalsize) != nalsize) {
6899                 av_log(avctx, AV_LOG_ERROR, "Decoding sps %d from avcC failed\n", i);
6900                 return -1;
6901             }
6902             p += nalsize;
6903         }
6904         // Decode pps from avcC
6905         cnt = *(p++); // Number of pps
6906         for (i = 0; i < cnt; i++) {
6907             nalsize = BE_16(p) + 2;
6908             if(decode_nal_units(h, p, nalsize)  != nalsize) {
6909                 av_log(avctx, AV_LOG_ERROR, "Decoding pps %d from avcC failed\n", i);
6910                 return -1;
6911             }
6912             p += nalsize;
6913         }
6914         // Now store right nal length size, that will be use to parse all other nals
6915         h->nal_length_size = ((*(((char*)(avctx->extradata))+4))&0x03)+1;
6916         // Do not reparse avcC
6917         h->got_avcC = 1;
6918     }
6919
6920     if(!h->is_avc && s->avctx->extradata_size && s->picture_number==0){
6921         if(0 < decode_nal_units(h, s->avctx->extradata, s->avctx->extradata_size) )
6922             return -1;
6923     }
6924
6925     buf_index=decode_nal_units(h, buf, buf_size);
6926     if(buf_index < 0)
6927         return -1;
6928
6929     //FIXME do something with unavailable reference frames
6930
6931 //    if(ret==FRAME_SKIPPED) return get_consumed_bytes(s, buf_index, buf_size);
6932     if(!s->current_picture_ptr){
6933         av_log(h->s.avctx, AV_LOG_DEBUG, "error, NO frame\n");
6934         return -1;
6935     }
6936
6937     {
6938         Picture *out = s->current_picture_ptr;
6939 #if 0 //decode order
6940         *data_size = sizeof(AVFrame);
6941 #else
6942         /* Sort B-frames into display order */
6943         Picture *cur = s->current_picture_ptr;
6944         Picture *prev = h->delayed_output_pic;
6945         int out_idx = 0;
6946         int pics = 0;
6947         int out_of_order;
6948         int cross_idr = 0;
6949         int dropped_frame = 0;
6950         int i;
6951
6952         if(h->sps.bitstream_restriction_flag
6953            && s->avctx->has_b_frames < h->sps.num_reorder_frames){
6954             s->avctx->has_b_frames = h->sps.num_reorder_frames;
6955             s->low_delay = 0;
6956         }
6957
6958         while(h->delayed_pic[pics]) pics++;
6959         h->delayed_pic[pics++] = cur;
6960         if(cur->reference == 0)
6961             cur->reference = 1;
6962
6963         for(i=0; h->delayed_pic[i]; i++)
6964             if(h->delayed_pic[i]->key_frame || h->delayed_pic[i]->poc==0)
6965                 cross_idr = 1;
6966
6967         out = h->delayed_pic[0];
6968         for(i=1; h->delayed_pic[i] && !h->delayed_pic[i]->key_frame; i++)
6969             if(h->delayed_pic[i]->poc < out->poc){
6970                 out = h->delayed_pic[i];
6971                 out_idx = i;
6972             }
6973
6974         out_of_order = !cross_idr && prev && out->poc < prev->poc;
6975         if(prev && pics <= s->avctx->has_b_frames)
6976             out = prev;
6977         else if((out_of_order && pics-1 == s->avctx->has_b_frames)
6978            || (s->low_delay &&
6979             ((!cross_idr && prev && out->poc > prev->poc + 2)
6980              || cur->pict_type == B_TYPE)))
6981         {
6982             s->low_delay = 0;
6983             s->avctx->has_b_frames++;
6984             out = prev;
6985         }
6986         else if(out_of_order)
6987             out = prev;
6988
6989         if(out_of_order || pics > s->avctx->has_b_frames){
6990             dropped_frame = (out != h->delayed_pic[out_idx]);
6991             for(i=out_idx; h->delayed_pic[i]; i++)
6992                 h->delayed_pic[i] = h->delayed_pic[i+1];
6993         }
6994
6995         if(prev == out && !dropped_frame)
6996             *data_size = 0;
6997         else
6998             *data_size = sizeof(AVFrame);
6999         if(prev && prev != out && prev->reference == 1)
7000             prev->reference = 0;
7001         h->delayed_output_pic = out;
7002 #endif
7003
7004         *pict= *(AVFrame*)out;
7005     }
7006
7007     assert(pict->data[0]);
7008     ff_print_debug_info(s, pict);
7009 //printf("out %d\n", (int)pict->data[0]);
7010 #if 0 //?
7011
7012     /* Return the Picture timestamp as the frame number */
7013     /* we substract 1 because it is added on utils.c    */
7014     avctx->frame_number = s->picture_number - 1;
7015 #endif
7016     return get_consumed_bytes(s, buf_index, buf_size);
7017 }
#if 0
/**
 * Fills h->mb_avail[] for the current macroblock (disabled code).
 * Entries 0..2 describe the top-left, top and top-right neighbours and
 * entry 3 the left one; a neighbour counts as available if it lies inside
 * the picture and its slice_table entry equals the current slice_num.
 */
static inline void fill_mb_avail(H264Context *h){
    MpegEncContext * const s = &h->s;
    const int mb_xy  = s->mb_x + s->mb_y*s->mb_stride;
    const int top_xy = mb_xy - s->mb_stride;
    const int has_top_row = s->mb_y != 0;

    // nothing above the first macroblock row
    h->mb_avail[0]= has_top_row && s->mb_x                 && h->slice_table[top_xy - 1] == h->slice_num;
    h->mb_avail[1]= has_top_row                            && h->slice_table[top_xy    ] == h->slice_num;
    h->mb_avail[2]= has_top_row && s->mb_x+1 < s->mb_width && h->slice_table[top_xy + 1] == h->slice_num;
    h->mb_avail[3]= s->mb_x && h->slice_table[mb_xy - 1] == h->slice_num;
    h->mb_avail[4]= 1; //FIXME move out
    h->mb_avail[5]= 0; //FIXME move out
}
#endif
7037
#if 0 //selftest
#define COUNT 8000
#define SIZE (COUNT*40)
/**
 * Self test (disabled): round trips unsigned and signed exp golomb codes,
 * measures the error of the 4x4 DCT/IDCT pair and round trips random data
 * through encode_nal()/decode_nal().
 * @return 0 on success, -1 if the NAL test fails
 */
int main(){
    int i;
    uint8_t temp[SIZE];
    PutBitContext pb;
    GetBitContext gb;
    DSPContext dsp;
    AVCodecContext avctx;

    dsputil_init(&dsp, &avctx);

    init_put_bits(&pb, temp, SIZE);
    printf("testing unsigned exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_ue_golomb(&pb, i);
        STOP_TIMER("set_ue_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        // keep the upcoming bits so that a mismatch can be reported with them
        s= show_bits(&gb, 24);

        START_TIMER
        j= get_ue_golomb(&gb);
        if(j != i){
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i, s);
        }
        STOP_TIMER("get_ue_golomb");
    }


    init_put_bits(&pb, temp, SIZE);
    printf("testing signed exp golomb\n");
    for(i=0; i<COUNT; i++){
        START_TIMER
        set_se_golomb(&pb, i - COUNT/2);
        STOP_TIMER("set_se_golomb");
    }
    flush_put_bits(&pb);

    init_get_bits(&gb, temp, 8*SIZE);
    for(i=0; i<COUNT; i++){
        int j, s;

        s= show_bits(&gb, 24);

        START_TIMER
        j= get_se_golomb(&gb);
        if(j != i - COUNT/2){
            // the value that was written is i - COUNT/2, report that one
            printf("missmatch! at %d (%d should be %d) bits:%6X\n", i, j, i - COUNT/2, s);
        }
        STOP_TIMER("get_se_golomb");
    }

    printf("testing 4x4 (I)DCT\n");

    DCTELEM block[16];
    uint8_t src[16], ref[16];
    uint64_t error= 0, max_error=0;

    for(i=0; i<COUNT; i++){
        int j;
        for(j=0; j<16; j++){
            ref[j]= random()%255;
            src[j]= random()%255;
        }

        h264_diff_dct_c(block, src, ref, 4);

        //normalize
        for(j=0; j<16; j++){
            block[j]= block[j]*4;
            if(j&1) block[j]= (block[j]*4 + 2)/5;
            if(j&4) block[j]= (block[j]*4 + 2)/5;
        }

        // use the DSPContext that was initialized above; there is no
        // codec context named s in this function
        dsp.h264_idct_add(ref, block, 4);

        for(j=0; j<16; j++){
            int diff= ABS(src[j] - ref[j]);

            error+= diff*diff;
            max_error= FFMAX(max_error, diff);
        }
    }
    printf("error=%f max_error=%d\n", ((float)error)/COUNT/16, (int)max_error );
#if 0
    printf("testing quantizer\n");
    for(qp=0; qp<52; qp++){
        for(i=0; i<16; i++)
            src1_block[i]= src2_block[i]= random()%255;

    }
#endif
    printf("Testing NAL layer\n");

    uint8_t bitstream[COUNT];
    uint8_t nal[COUNT*2];
    H264Context h;
    memset(&h, 0, sizeof(H264Context));

    for(i=0; i<COUNT; i++){
        int zeros= i;
        int nal_length;
        int consumed;
        int out_length;
        uint8_t *out;
        int j;

        // start with non zero bytes only ...
        for(j=0; j<COUNT; j++){
            bitstream[j]= (random() % 255) + 1;
        }

        // ... then clear exactly `zeros` distinct positions
        for(j=0; j<zeros; j++){
            int pos= random() % COUNT;
            while(bitstream[pos] == 0){
                pos++;
                pos %= COUNT;
            }
            bitstream[pos]=0;
        }

        START_TIMER

        nal_length= encode_nal(&h, nal, bitstream, COUNT, COUNT*2);
        if(nal_length<0){
            printf("encoding failed\n");
            return -1;
        }

        out= decode_nal(&h, nal, &out_length, &consumed, nal_length);

        STOP_TIMER("NAL")

        if(out_length != COUNT){
            printf("incorrect length %d %d\n", out_length, COUNT);
            return -1;
        }

        if(consumed != nal_length){
            printf("incorrect consumed length %d %d\n", nal_length, consumed);
            return -1;
        }

        if(memcmp(bitstream, out, COUNT)){
            printf("missmatch\n");
            return -1;
        }
    }

    printf("Testing RBSP\n");


    return 0;
}
#endif
7210
7211
7212 static int decode_end(AVCodecContext *avctx)
7213 {
7214     H264Context *h = avctx->priv_data;
7215     MpegEncContext *s = &h->s;
7216
7217     free_tables(h); //FIXME cleanup init stuff perhaps
7218     MPV_common_end(s);
7219
7220 //    memset(h, 0, sizeof(H264Context));
7221
7222     return 0;
7223 }
7224
7225
/*
 * Codec table entry for the H.264 decoder.
 * The initializer is positional: it names the codec "h264", gives its type
 * and id, reserves sizeof(H264Context) bytes of private data and hooks up
 * decode_init, decode_end and decode_frame; flush_dpb is installed as the
 * flush callback.
 * NOTE(review): the NULL entry between decode_init and decode_end is
 * presumably the unused encode callback - confirm against the AVCodec
 * definition in avcodec.h.
 */
7226 AVCodec h264_decoder = {
7227     "h264",
7228     CODEC_TYPE_VIDEO,
7229     CODEC_ID_H264,
7230     sizeof(H264Context),
7231     decode_init,
7232     NULL,
7233     decode_end,
7234     decode_frame,
7235     /*CODEC_CAP_DRAW_HORIZ_BAND |*/ CODEC_CAP_DR1 | CODEC_CAP_TRUNCATED | CODEC_CAP_DELAY,
7236     .flush= flush_dpb,
7237 };
7238
/*
 * Parser table entry for H.264.
 * The initializer is positional: the parser handles CODEC_ID_H264, uses a
 * H264Context as private data, splits frames with h264_parse and is closed
 * with ff_parse_close.
 * NOTE(review): the NULL entry is presumably the optional init callback -
 * confirm against the AVCodecParser definition in avcodec.h.
 */
7239 AVCodecParser h264_parser = {
7240     { CODEC_ID_H264 },
7241     sizeof(H264Context),
7242     NULL,
7243     h264_parse,
7244     ff_parse_close,
7245 };
7246
7247 #include "svq3.c"